Study_note(zb_data)/Machine Learning
Study Note (ML6_Wine)
KloudHyun
2023. 9. 22. 19:53
📌 plotly.express
import pandas as pd
red = pd.read_csv('../data/winequality-red.csv', sep=';')
white = pd.read_csv('../data/winequality-white.csv', sep=';')
red['color'] = 1.
white['color'] = 0.
wine = pd.concat([red, white])
wine.info()
wine['quality'].unique()
>>>>
array([5, 6, 7, 4, 8, 3, 9], dtype=int64)
import plotly.express as px
fig = px.histogram(wine, x='quality')
fig.show()
fig = px.histogram(wine, x='quality', color='color')
fig.show()
X = wine.drop(['color'], axis=1)
y = wine['color']
from sklearn.model_selection import train_test_split
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
np.unique(y_train, return_counts=True)
>>>>
(array([0., 1.]), array([3913, 1284], dtype=int64))
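The split above is random, so the white/red ratio in y_train only roughly matches the full data. If you want that ratio preserved exactly in both splits, train_test_split accepts a stratify argument -- a small optional sketch (the _s variable names are just for this comparison, not from the original note):
# Optional: a stratified split keeps the white/red (0/1) ratio identical in train and test
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    X, y, test_size=0.2, random_state=13, stratify=y)
np.unique(y_tr_s, return_counts=True)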
📌 Number of samples per quality grade in the train and test data
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Histogram(x=X_train['quality'], name='Train'))
fig.add_trace(go.Histogram(x=X_test['quality'], name='Test'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.7)
fig.show()
📌 Let's predict and check the accuracy
from sklearn.tree import DecisionTreeClassifier
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
>>>>
Train Acc : 0.9553588608812776
Test Acc : 0.9569230769230769
X.columns
>>>>
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
📌 Let's make a boxplot
fig = go.Figure()
fig.add_trace(go.Box(y=X['fixed acidity'], name='fixed acidity'))
fig.add_trace(go.Box(y=X['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X['quality'], name='quality'))
fig.show()
- When the value ranges of the columns differ widely, training 'may' not work properly (each column has its own min/max range, and its own mean and variance) -- see the quick check below this list
- Skewed feature scales can get in the way of finding the optimal model.
- In general, applying a scaler can be effective.
- The conclusion: you only know once you try it.
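A quick way to see the scale gap described above -- a minimal sketch comparing a few of the columns shown in the boxplot:
# min/max/mean/std side by side: the ranges differ by orders of magnitude
X[['fixed acidity', 'chlorides', 'quality']].agg(['min', 'max', 'mean', 'std'])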
from sklearn.preprocessing import MinMaxScaler, StandardScaler
MMS = MinMaxScaler()
SS = StandardScaler()
SS.fit(X)
MMS.fit(X)
X_ss = SS.transform(X)
X_mms = MMS.transform(X)
X_ss_pd = pd.DataFrame(X_ss, columns=X.columns)
X_mms_pd = pd.DataFrame(X_mms, columns=X.columns)
X_mms
>>>>
array([[0.29752066, 0.41333333, 0. , ..., 0.19101124, 0.20289855,
0.33333333],
[0.33057851, 0.53333333, 0. , ..., 0.25842697, 0.26086957,
0.33333333],
[0.33057851, 0.45333333, 0.02409639, ..., 0.24157303, 0.26086957,
0.33333333],
...,
[0.2231405 , 0.10666667, 0.11445783, ..., 0.13483146, 0.20289855,
0.5 ],
[0.14049587, 0.14 , 0.18072289, ..., 0.08988764, 0.69565217,
0.66666667],
[0.18181818, 0.08666667, 0.22891566, ..., 0.05617978, 0.55072464,
0.5 ]])
📌 MinMaxScaler? -- forces the maximum and minimum of each column to 1 and 0
fig = go.Figure()
fig.add_trace(go.Box(y=X_mms_pd['fixed acidity'], name='fixed acidity'))
fig.add_trace(go.Box(y=X_mms_pd['chlorides'], name='chlorides'))
fig.add_trace(go.Box(y=X_mms_pd['quality'], name='quality'))
fig.show()
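As a sanity check on the MinMaxScaler definition above, the same scaling can be computed by hand and compared with sklearn's output -- a minimal sketch:
# (x - min) / (max - min), column-wise; should match MinMaxScaler's result
X_manual = (X - X.min()) / (X.max() - X.min())
np.allclose(X_manual.values, X_mms_pd.values)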
📌 StandardScaler? -- rescales each column to mean 0 and standard deviation 1
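Note: the next cell drops a taste column that is never created anywhere in this note. Going by the comment further down that taste is built from quality, a plausible reconstruction (an assumption, not shown in the original post) is a binary label on quality:
# Assumed reconstruction: 1. if the quality grade is above 5, else 0.
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]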
X = wine.drop(['taste'], axis=1)
y = wine['taste']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
>>>>
Train Acc : 0.9553588608812776
Test Acc : 0.9569230769230769
import matplotlib.pyplot as plt
import sklearn.tree as tree
plt.figure(figsize = (12, 8))
tree.plot_tree(wine_tree, feature_names=X.columns)
📌 Refit without quality
- The taste column is ultimately derived from quality, so keeping quality as a feature leaks the target and the score comes out high.
- Drop the quality column and run it again.
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)
wine_tree.fit(X_train, y_train)
y_pred_tr = wine_tree.predict(X_train)
y_pred_test = wine_tree.predict(X_test)
print('Train Acc : ', accuracy_score(y_train, y_pred_tr))
print('Test Acc : ', accuracy_score(y_test, y_pred_test))
>>>>
Train Acc : 0.7294593034442948
Test Acc : 0.7161538461538461
plt.figure(figsize=(12, 8))
tree.plot_tree(wine_tree, feature_names=X.columns,
rounded=True,
filled=True)
plt.show()
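To see which features the depth-2 tree falls back on once quality is gone, the fitted tree's feature_importances_ can be ranked -- a short follow-up sketch:
# Rank the features the refit tree actually used (most weight first)
pd.Series(wine_tree.feature_importances_, index=X.columns).sort_values(ascending=False)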