📌 Detecting Credit Card Fraud
💻 Data import
→ The rate of fraudulent credit card transactions is very low (0.17%). What does such a low rate mean? The data is highly imbalanced.
import pandas as pd
raw_data = pd.read_csv('../data/creditcard.csv')
raw_data.head()
raw_data['Class'].value_counts()
>>>>
Class
0 284315
1 492
Name: count, dtype: int64
# fraud rate as a percentage of all transactions
frauds_rate = round(raw_data['Class'].value_counts()[1] / len(raw_data) * 100, 2)
frauds_rate
>>>>
0.17
💻 Plotting the class distribution
import seaborn as sns
import matplotlib.pyplot as plt
sns.countplot(x='Class', data=raw_data)
plt.title('Class Distribution')
plt.show()
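With class 1 at only 0.17% of the rows, its bar is nearly invisible on a linear axis. A small optional tweak of my own (not in the original) is to put the count axis on a log scale:
ax = sns.countplot(x='Class', data=raw_data)
ax.set_yscale('log')  # log scale keeps the tiny fraud class visible
plt.title('Class Distribution (log scale)')
plt.show()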
💻 Checking the class ratio in the train data
from sklearn.model_selection import train_test_split
X = raw_data.iloc[:, 1:-1]  # columns V1 through Amount
y = raw_data.iloc[:, -1]    # select only the last column (Class)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13, stratify=y)
import numpy as np
np.unique(y_train, return_counts=True)
>>>>
(array([0, 1], dtype=int64), array([227451, 394], dtype=int64))
# check how imbalanced the training data is
tmp = np.unique(y_train, return_counts=True)[1]
tmp[1] / len(y_train) * 100
>>>>
0.17292457591783889
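Since stratify=y was passed, the test split should keep essentially the same fraud ratio. A quick sanity check (my addition, not in the original):
# the test split should also show roughly 0.17% fraud
tmp_test = np.unique(y_test, return_counts=True)[1]
print(tmp_test[1] / len(y_test) * 100)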
📌 Trying out several models
💻 Writing helper functions
# function that returns a classifier's performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
def get_clf_eval(y_test, pred):
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred)
    re = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, pred)
    return acc, pre, re, f1, auc
# function that prints the performance
from sklearn.metrics import confusion_matrix
def print_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    acc, pre, re, f1, auc = get_clf_eval(y_test, pred)
    print('confusion matrix')
    print(confusion)
    print('----------------')
    print('Accuracy : {0:.4f}, precision : {1:.4f}'.format(acc, pre))
    print('recall : {0:.4f}, f1_score : {1:.4f}, auc : {2:.4f}'.format(re, f1, auc))
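One caveat: roc_auc_score is fed hard 0/1 predictions here, which understates the AUC a model could achieve. For models that expose predict_proba, scoring the positive-class probability is more informative; a small variant of my own (not in the original notebook):
# AUC computed from predicted probabilities instead of hard labels
def get_auc_proba(model, X_test, y_test):
    proba = model.predict_proba(X_test)[:, 1]  # probability of class 1 (fraud)
    return roc_auc_score(y_test, proba)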
💻 Running logistic regression
→ Of the 56,864 class-0 (normal) samples, 8 are misclassified as fraud.
→ Of the 98 class-1 (fraud) samples, 40 are misclassified as normal.
→ Accuracy is 99.92%, but recall is only 59%, which is a poor result: of the actual fraud cases, only 59% are detected as fraud (see the threshold sketch after the output below).
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state=13, solver='liblinear')
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print_clf_eval(y_test, lr_pred)
>>>>
confusion matrix
[[56856 8]
[ 40 58]]
----------------
Accuracy : 0.9992, precision : 0.8788
recall : 0.5918, f1_score : 0.7073, auc : 0.7958
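Since recall is the weak point, one standard lever (not explored in the original) is to lower the decision threshold: flag a transaction as fraud whenever the predicted fraud probability exceeds, say, 0.3 instead of the default 0.5. A minimal sketch; the 0.3 cutoff is an arbitrary illustration:
# trade some precision for recall by lowering the decision threshold
lr_proba = lr_clf.predict_proba(X_test)[:, 1]  # probability of fraud
lr_pred_low = (lr_proba > 0.3).astype(int)     # hypothetical 0.3 cutoff
print_clf_eval(y_test, lr_pred_low)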
💻 Running a decision tree
→ Of the 56,864 class-0 (normal) samples, 8 are misclassified as fraud.
→ Of the 98 class-1 (fraud) samples, 33 are misclassified as normal (recall 66%).
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print_clf_eval(y_test, dt_pred)
>>>>
confusion matrix
[[56856 8]
[ 33 65]]
----------------
Accuracy : 0.9993, precision : 0.8904
recall : 0.6633, f1_score : 0.7602, auc : 0.8316
💻 Running a random forest
→ Of the 56,864 class-0 (normal) samples, 7 are misclassified as fraud.
→ Of the 98 class-1 (fraud) samples, 25 are misclassified as normal (recall 74%).
→ Performance seems to be getting steadily better.
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print_clf_eval(y_test, rf_pred)
>>>>
confusion matrix
[[56857 7]
[ 25 73]]
----------------
Accuracy : 0.9994, precision : 0.9125
recall : 0.7449, f1_score : 0.8202, auc : 0.8724
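For skewed data like this, scikit-learn's class_weight='balanced' option reweights classes inversely to their frequency, which often raises recall at some cost in precision. A sketch of that variant (my addition; results not verified here):
# same forest, with the loss reweighted toward the rare fraud class
rf_clf_w = RandomForestClassifier(random_state=13, n_jobs=-1, n_estimators=100, class_weight='balanced')
rf_clf_w.fit(X_train, y_train)
print_clf_eval(y_test, rf_clf_w.predict(X_test))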
💻 Running LightGBM
→ Of the 56,864 class-0 (normal) samples, 6 are misclassified as fraud.
→ Of the 98 class-1 (fraud) samples, 24 are misclassified as normal (recall 75%).
→ Not much different from the random forest.
from lightgbm import LGBMClassifier
lgbm_clf = LGBMClassifier(random_state=13, n_jobs=-1, n_estimators=1000, num_leaves=64, boost_from_average=False)
lgbm_clf.fit(X_train, y_train)
lgbm_pred = lgbm_clf.predict(X_test)
print_clf_eval(y_test, lgbm_pred)
>>>>
confusion matrix
[[56858 6]
[ 24 74]]
----------------
Accuracy : 0.9995, precision : 0.9250
recall : 0.7551, f1_score : 0.8315, auc : 0.8775
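LightGBM also has built-in handling for class imbalance: is_unbalance=True (or an explicit scale_pos_weight) upweights the positive class in the loss. A hedged sketch of my own, untested here:
# let LightGBM upweight the rare positive class automatically
lgbm_clf_u = LGBMClassifier(random_state=13, n_jobs=-1, n_estimators=1000, num_leaves=64, is_unbalance=True)
lgbm_clf_u.fit(X_train, y_train)
print_clf_eval(y_test, lgbm_clf_u.predict(X_test))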
💻 Turning the results into a DataFrame!
def get_result(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return get_clf_eval(y_test, pred)
import pandas as pd
def get_result_pd(models, model_names, X_train, y_train, X_test, y_test):
    col_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    tmp = []
    for model in models:
        tmp.append(get_result(model, X_train, y_train, X_test, y_test))
    return pd.DataFrame(tmp, columns=col_names, index=model_names)
💻 Comparing the results as a DataFrame
→ Accuracy is very high for every model.
→ For recall, RandomForest and LightGBM look best.
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['Logistic Regression', 'DecisionTree', 'RandomForest', 'LightGBM']
result = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)
result
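Since each model is refit with a fixed random_state, the frame should simply collect the numbers already printed above; reassembled by hand from those outputs (not a fresh run), roughly:
>>>>
                     accuracy  precision  recall      f1  roc_auc
Logistic Regression    0.9992     0.8788  0.5918  0.7073   0.7958
DecisionTree           0.9993     0.8904  0.6633  0.7602   0.8316
RandomForest           0.9994     0.9125  0.7449  0.8202   0.8724
LightGBM               0.9995     0.9250  0.7551  0.8315   0.8775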