Study_note(zb_data)/Machine Learning
스터디노트 (통계적 회귀)
KloudHyun
2023. 9. 28. 21:37
📌 통계적 회귀
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the e-commerce table and discard the three columns that are dropped
# before any plotting or modelling (Email, Address, Avatar).
data = (
    pd.read_csv('../data/ecommerce.csv', sep=',')
    .drop(columns=['Email', 'Address', 'Avatar'])
)
# Preview the first rows of what is left.
data.head()
๐ปBoxplot์ ๊ทธ๋ ค๋ณด์
# Box plots of every column except the last one, on a dedicated 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data.iloc[:, :-1], ax=ax);

# 'Yearly Amount Spent' gets its own 12x8 figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data['Yearly Amount Spent'], ax=ax);
๐ปPairplot์ผ๋ก ๊ฒฝํฅ์ฑ์ ํ์ธํด๋ณด์
- ๋ฉค๋ฒ์ฝ๊ณผ ์ฐ๊ฐ ์ง๋ถ ๊ธ์ก์ ์์ด ์๊ด๊ด๊ณ๊ฐ ์๋ ๊ฒ์ ํ์ธํ ์ ์๋ค.
# Pairwise scatter plots / histograms for every numeric column.
# pairplot is a figure-level function: it builds its own grid figure, so a
# preceding plt.figure(figsize=...) would only leave an empty extra figure
# and its figsize would not apply to the grid. Use pairplot's own `height` /
# `aspect` arguments if the facets need resizing.
sns.pairplot(data=data);
🔻더 자세하게 확인해보기
- 상관관계가 있는 것으로 확신이 된다.
# Scatter plot with a fitted regression line for membership length vs. yearly
# spend. lmplot is figure-level and creates its own figure, so the size is
# given through height/aspect (width = height * aspect = 12) instead of a
# separate plt.figure(figsize=(12, 6)) call, which would only leave an empty
# figure behind.
sns.lmplot(
    x='Length of Membership',
    y='Yearly Amount Spent',
    data=data,
    height=6,
    aspect=2,
);
📌 상관성이 높은 것을 활용하여 회귀를 해보자
import statsmodels.api as sm
# Regress yearly spend on membership length alone. X is the single feature
# column passed as-is; no column of ones is included at this stage.
X = data.loc[:, 'Length of Membership']
y = data.loc[:, 'Yearly Amount Spent']
lm = sm.OLS(endog=y, exog=X).fit()
# Show the fit report.
lm.summary()
🔻더 자세하게 확인해보기
# In-sample predictions from the fitted model.
pred = lm.predict(X)

# Figure 1: raw data (feature vs. target) with the fitted line on top.
# A fresh figure is opened so the points are not drawn onto whichever axes
# happen to be current from an earlier plot.
plt.figure()
sns.scatterplot(x=X, y=y)
plt.plot(X, pred, 'r', ls='dashed', lw=3)

# Figure 2: actual vs. predicted target. This must not share axes with
# figure 1, whose x-axis is the feature rather than the target.
plt.figure()
sns.scatterplot(x=y, y=pred)
# Red: identity line over the observed range of y.
plt.plot([min(y), max(y)], [min(y), max(y)], 'r', ls='dashed', lw=3)
# Blue: identity line extended down to the origin.
plt.plot([0, max(y)], [0, max(y)], 'b', ls='dashed', lw=3)
🔻상수항 넣어주기
# Append a column of ones to the feature so the regression can fit an
# intercept; the result is a 2-column array [feature, 1].
X = np.column_stack((X, np.ones(len(X))))
# Peek at the first five rows.
X[:5]
>>>>
array([[4.08262063, 1. ],
[2.66403418, 1. ],
[4.1045432 , 1. ],
[3.12017878, 1. ],
[4.44630832, 1. ]])
# Refit using the design matrix that now carries the column of ones.
lm = sm.OLS(endog=y, exog=X).fit()
# Show the fit report for comparison with the earlier fit.
lm.summary()
🔻확인해보자
- R-squared
- AIC (만들어낸 모델이 데이터를 얼마나 잘 반영하는지 측정하는 도구)
📌 sklearn 해보기
from sklearn.model_selection import train_test_split
# Features: every remaining column except the target.
X = data.drop(columns='Yearly Amount Spent')
y = data['Yearly Amount Spent']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

# Fit on the training rows only.
# NOTE(review): X_train is passed as-is, so no column of ones is included in
# this fit, unlike the earlier single-feature refit -- confirm this is intended.
lm = sm.OLS(endog=y_train, exog=X_train).fit()
lm.summary()

# Actual vs. predicted on the held-out rows, with the identity line in red.
pred = lm.predict(X_test)
sns.scatterplot(x=y_test, y=pred)
diag = [min(y_test), max(y_test)]
plt.plot(diag, diag, 'r');