Study_note(zb_data)/Machine Learning

์Šคํ„ฐ๋””๋…ธํŠธ (ML6_Wine)

KloudHyun 2023. 9. 22. 19:53

๐Ÿ“Œ plotly.express

# Load the red/white wine-quality datasets (semicolon-separated CSVs) and
# stack them into one frame, tagging each row with a binary color label
# (1.0 = red, 0.0 = white).
import pandas as pd

red_wine = pd.read_csv('../data/winequality-red.csv', sep=';')
white_wine = pd.read_csv('../data/winequality-white.csv', sep=';')

red_wine['color'] = 1.
white_wine['color'] = 0.

wine = pd.concat([red_wine, white_wine])
wine.info()
wine['quality'].unique()
>>>>
array([5, 6, 7, 4, 8, 3, 9], dtype=int64)
# Quality distribution for the combined data, then the same histogram
# broken down by wine color.
import plotly.express as px

for color_arg in (None, 'color'):
    fig = px.histogram(wine, x='quality', color=color_arg)
    fig.show()

# Predict wine color from every other column; hold out 20% for testing
# with a fixed seed, then check the class balance of the training labels.
from sklearn.model_selection import train_test_split
import numpy as np

y = wine['color']
X = wine.drop(['color'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13)
np.unique(y_train, return_counts=True)
>>>>
(array([0., 1.]), array([3913, 1284], dtype=int64))

๐Ÿ“Œ Train ๋ฐ์ดํ„ฐ์™€ Test ๋ฐ์ดํ„ฐ์˜ ๊ฐ ๋“ฑ๊ธ‰๋ณ„ ๊ฐœ์ˆ˜

import plotly.graph_objects as go

# Overlay the quality histograms of the train and test splits to eyeball
# whether the split preserved the distribution.
fig = go.Figure()
for label, frame in (('Train', X_train), ('Test', X_test)):
    fig.add_trace(go.Histogram(x=frame['quality'], name=label))

fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.7)
fig.show()

๐Ÿ“Œ ๋ฐ์ดํ„ฐ accuracy, ์˜ˆ์ธกํ•ด๋ณด์ž

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# A shallow (depth-2) tree with a fixed seed; report accuracy on both
# splits to compare train vs. test performance.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
>>>>
Train Acc :  0.9553588608812776
Test Acc :  0.9569230769230769
X.columns
>>>>
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

๐Ÿ“Œ boxplot์„ ๋งŒ๋“ค์–ด๋ณด์ž

# Boxplots of three raw features: their value ranges differ wildly,
# which motivates the scaling experiments below.
fig = go.Figure()
for column in ('fixed acidity', 'chlorides', 'quality'):
    fig.add_trace(go.Box(y=X[column], name=column))
fig.show()

- ์ปฌ๋Ÿผ ๊ฐ„์˜ ๋ฒ”์œ„ ๊ฒฉ์ฐจ๊ฐ€ ์‹ฌํ•  ๊ฒฝ์šฐ์— ์ œ๋Œ€๋กœ ํ•™์Šต์ด ์•ˆ๋  '์ˆ˜๋„' ์žˆ๋‹ค (์ปฌ๋Ÿผ์˜ ์ตœ๋Œ€/์ตœ์†Œ ๋ฒ”์œ„๊ฐ€ ๊ฐ๊ฐ ๋‹ค๋ฅด๊ณ , ํ‰๊ท ๊ณผ ๋ถ„์‚ฐ์ด ๊ฐ๊ฐ ๋‹ค๋ฅด๋‹ค)

- ํŠน์„ฑ์˜ ํŽธํ–ฅ ๋ฌธ์ œ๋Š” ์ตœ์ ์˜ ๋ชจ๋ธ์„ ์ฐพ๋Š”๋ฐ ๋ฐฉํ•ด๊ฐ€ ๋  ์ˆ˜ ์žˆ๋‹ค.

- ์ผ๋ฐ˜์ ์œผ๋กœ scaler๋ฅผ ์ ์šฉํ•˜๋Š” ๊ฒƒ์€ ํšจ๊ณผ์ ์ผ ์ˆ˜ ์žˆ๋‹ค 

- ๊ฒฐ๋ก ์€ .. ํ•ด๋ด์•ผ ์•ˆ๋‹ค..

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Fit both scalers on the full feature matrix and keep DataFrame copies
# so the scaled columns remain addressable by name.
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()

standard_scaler.fit(X)
min_max_scaler.fit(X)

X_ss = standard_scaler.transform(X)
X_mms = min_max_scaler.transform(X)

X_ss_pd = pd.DataFrame(X_ss, columns=X.columns)
X_mms_pd = pd.DataFrame(X_mms, columns=X.columns)
X_mms
>>>>
array([[0.29752066, 0.41333333, 0.        , ..., 0.19101124, 0.20289855,
        0.33333333],
       [0.33057851, 0.53333333, 0.        , ..., 0.25842697, 0.26086957,
        0.33333333],
       [0.33057851, 0.45333333, 0.02409639, ..., 0.24157303, 0.26086957,
        0.33333333],
       ...,
       [0.2231405 , 0.10666667, 0.11445783, ..., 0.13483146, 0.20289855,
        0.5       ],
       [0.14049587, 0.14      , 0.18072289, ..., 0.08988764, 0.69565217,
        0.66666667],
       [0.18181818, 0.08666667, 0.22891566, ..., 0.05617978, 0.55072464,
        0.5       ]])

๐Ÿ“Œmin_max scaler? -- ์ตœ๋Œ€ ์ตœ์†Ÿ๊ฐ’์„ 1๊ณผ 0์œผ๋กœ ๊ฐ•์ œ๋กœ ๋งž์ถ”๋Š” ๊ฒƒ

# Same three-feature boxplot after min-max scaling: every feature now
# lives in [0, 1], so the boxes are directly comparable.
fig = go.Figure()
for column in ('fixed acidity', 'chlorides', 'quality'):
    fig.add_trace(go.Box(y=X_mms_pd[column], name=column))
fig.show()

๐Ÿ“ŒStandard scaler? -- ํ‰๊ท ์„ 0์œผ๋กœ ํ‘œ์ค€ํŽธ์ฐจ๋ฅผ 1๋กœ ๋งž์ถ”๋Š” ๊ฒƒ

 

# Derive the binary 'taste' label from quality (good wine = quality > 5).
# BUG FIX: this column was never created in the original snippet, so
# wine.drop(['taste'], axis=1) would raise a KeyError. The prose below
# confirms taste is built from quality — TODO confirm the >5 threshold
# against the course material.
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]

X = wine.drop(['taste'], axis=1)
y = wine['taste']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

# BUG FIX: the original printed accuracies using y_pred_tr / y_pred_test
# left over from the previous (color) model — the predictions were never
# recomputed after fitting on the new target, so the reported numbers
# belonged to the wrong model. Predict with the freshly fitted tree.
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
>>>>
Train Acc :  0.9553588608812776
Test Acc :  0.9569230769230769
import matplotlib.pyplot as plt
import sklearn.tree as tree

# Render the fitted decision tree with feature names on the split nodes.
plt.figure(figsize=(12, 8))
tree.plot_tree(wine_tree, feature_names=X.columns)

๐Ÿ“ŒQuality๋ฅผ ๋นผ๊ณ  ๋‹ค์‹œ fit

- taste ๋ผ๋Š” ์ปฌ๋Ÿผ์ด ๊ฒฐ๊ตญ์—” Quality๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ๋งŒ๋“ค์–ด์ง„ ์ปฌ๋Ÿผ์ด๊ธฐ ๋•Œ๋ฌธ์— ๊ฒฐ๊ณผ ๊ฐ’์ด ๋†’๊ฒŒ ๋‚˜์˜จ๋‹ค

- Quality ์ปฌ๋Ÿผ์„ ๋นผ๊ณ  ์ง„ํ–‰

# Refit without 'quality': taste is derived from quality, so keeping it
# in X leaks the label and inflates accuracy. Dropping it yields a more
# honest estimate of predictive power.
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13)

wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)

y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)

print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
>>>>
Train Acc :  0.7294593034442948
Test Acc :  0.7161538461538461
# Draw the final tree with rounded, color-filled nodes for readability.
plt.figure(figsize=(12, 8))
tree.plot_tree(
    wine_tree,
    feature_names=X.columns,
    rounded=True,
    filled=True,
)
plt.show()