본문 바로가기

Study_note(zb_data)/Machine Learning

스터디노트 (credit card data 2)

📌 신용카드 부정 사용자 검출해보기 2

🔻특정 데이터를 가지고 다시 도전 해보자.

# Inspect the distribution of the raw 'Amount' column.
# NOTE(review): seaborn has deprecated `distplot` (v0.11+); `histplot(..., kde=True)`
# is the documented replacement if the installed seaborn version supports it.
plt.figure(figsize=(12, 10))
sns.distplot(a=raw_data['Amount'], color='r')
plt.show()

🔻StandardScaler 적용

→ Amount 컬럼이 비대칭성이 너무 높다 (데이터가 한 쪽으로 쏠려 있다)

→ 따라서, 정규화를 시켜서 편향성을 바꿔보자.

→ 결과 값은 변동성이 없이 미미한 수준이다.

from sklearn.preprocessing import StandardScaler

# Standardize the 'Amount' column (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split further down, so test-set statistics leak into this feature. Fit on the
# training rows only if an unbiased evaluation is required.
scaler = StandardScaler()
amount_n = scaler.fit_transform(raw_data['Amount'].values.reshape(-1, 1))

# Drop the first column and the last two, and take an explicit copy: adding a
# column to a bare `iloc` slice triggers pandas' SettingWithCopyWarning.
raw_data_copy = raw_data.iloc[:, 1:-2].copy()
# fit_transform returns an (n, 1) array; flatten it into a 1-D column.
raw_data_copy['Amount_Scaled'] = amount_n.ravel()
raw_data_copy.head()

→ V1 ~ Amount_Scaled 까지 컬럼을 만든 것

from sklearn.model_selection import train_test_split

# Stratified 70/30 split on the label `y`, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    raw_data_copy,
    y,
    test_size=0.3,
    random_state=13,
    stratify=y,
)

# Evaluate the four classifiers on the same split.
# NOTE(review): presumably get_result_pd pairs names with models by position — confirm.
model_names = ['Logistic Regression', 'DecisionTree', 'RandomForest', 'LightGBM']
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
result = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)

result

๐Ÿ”ปroc_curve๋ฅผ ๊ทธ๋ ค๋ณด์ž

from sklearn.metrics import roc_curve

def draw_roc_curve(models, model_names, X_test, y_test):
    """Plot the ROC curve of every model on one shared figure.

    Args:
        models: Fitted classifiers exposing ``predict_proba``.
        model_names: Legend labels, matched to ``models`` by position.
        X_test: Features passed to each model's ``predict_proba``.
        y_test: True labels passed to ``roc_curve``.

    Raises:
        IndexError: If there are fewer names than models.
    """
    # Fail before opening a figure instead of part-way through plotting.
    if len(model_names) < len(models):
        raise IndexError(
            'got {} models but only {} model names'.format(
                len(models), len(model_names)
            )
        )

    plt.figure(figsize=(12, 10))

    for clf, name in zip(models, model_names):
        # Column 1 of predict_proba is used as the score for the ROC curve.
        scores = clf.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, scores)
        plt.plot(fpr, tpr, label=name)

    # Diagonal reference line, labelled as a random guess.
    plt.plot([0, 1], [0, 1], 'k--', label='random guess')
    plt.title('ROC')
    plt.legend()
    plt.grid()
    plt.show()

 

# Draw the ROC curves for the models evaluated above, on the held-out split.
draw_roc_curve(
    models=models,
    model_names=model_names,
    X_test=X_test,
    y_test=y_test,
)


🔻log 함수를 적용 시켜보자

→ 큰 값은 상대적으로 낮은 값으로 바꿔주고, 낮은 값은 그대로 사용하도록 설정. X가 커질수록, y 값을 억제해주는 기능을 가지고 있다 (log scale 적용)

# Overwrite 'Amount_Scaled' with log(1 + Amount) computed from the raw column.
amount_log = np.log1p(raw_data['Amount'])
raw_data_copy['Amount_Scaled'] = amount_log

# Plot the distribution of the log-transformed column.
# NOTE(review): seaborn has deprecated `distplot` (v0.11+); `histplot(..., kde=True)`
# is the documented replacement if the installed seaborn version supports it.
plt.figure(figsize=(12, 6))
sns.distplot(a=raw_data_copy['Amount_Scaled'], color='r')
plt.show()

from sklearn.model_selection import train_test_split

# Same stratified 70/30 split and seed as before, now on the current raw_data_copy.
X_train, X_test, y_train, y_test = train_test_split(
    raw_data_copy,
    y,
    test_size=0.3,
    random_state=13,
    stratify=y,
)

# Re-run the same four classifiers on the new split.
# NOTE(review): presumably get_result_pd pairs names with models by position — confirm.
model_names = ['Logistic Regression', 'DecisionTree', 'RandomForest', 'LightGBM']
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
result = get_result_pd(models, model_names, X_train, y_train, X_test, y_test)

result

# ROC curves for the same models on the new test split.
draw_roc_curve(models, model_names, X_test, y_test)