Study_note(zb_data)/Machine Learning
스터디노트 (ML5)
KloudHyun
2023. 9. 21. 23:08
## LabelEncoder
df = pd.DataFrame({
'A' : ['a', 'b', 'c', 'a', 'b'],
'B' : [1, 2, 3, 1, 0],
})
df
## Label_encoder
- fit -> transform
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['A'])
le.classes_
>>>>
array(['a', 'b', 'c'], dtype=object)
le.transform(df['A'])
>>>>
array([0, 1, 2, 0, 1])
le.fit_transform(df['A'])
>>>>
array([0, 1, 2, 0, 1])
df['le_A'] = le.transform(df['A'])
le.inverse_transform(df['le_A'])
>>>>
array(['a', 'b', 'c', 'a', 'b'], dtype=object)
df = pd.DataFrame( {
'A' : [10, 20, -10, 0, 25],
'B' : [1, 2, 3, 1, 0]
})
df
>>>>
## min-max scaling
- 데이터 정규화
-> df
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms.fit(df)
mms.data_max_, mms.data_min_, mms.data_range_
>>>>
(array([25., 3.]), array([-10., 0.]), array([35., 3.]))
mms.inverse_transform(mms.transform(df))
>>>>
array([[ 10., 1.],
[ 20., 2.],
[-10., 3.],
[ 0., 1.],
[ 25., 0.]])
df_mms = mms.transform(df)
df_mms
>>>>
array([[0.57142857, 0.33333333],
[0.85714286, 0.66666667],
[0. , 1. ],
[0.28571429, 0.33333333],
[1. , 0. ]])
# 각 열의 맥시멈(최댓값)이 1로 변환됨
## Standard Scaler
- 데이터 표준화
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(df)
# 평균과 표준편차
ss.mean_, ss.scale_
>>>>
(array([9. , 1.4]), array([12.80624847, 1.0198039 ]))
df_ss = ss.transform(df)
df_ss
>>>>
array([[ 0.07808688, -0.39223227],
[ 0.85895569, 0.58834841],
[-1.48365074, 1.56892908],
[-0.70278193, -0.39223227],
[ 1.2493901 , -1.37281295]])
# 기존 데이터를 평균이 0이고 표준편차가 1인 데이터로 교체
## Robust Scaler
df = pd.DataFrame({
'A' : [-0.1, 0., 0.1, 0.2, 0.3, 0.4, 1.0, 1.1, 5]
})
df
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
mm = MinMaxScaler()
ss = StandardScaler()
rs = RobustScaler()
df_scaler = df.copy()
df_scaler['MinMax'] = mm.fit_transform(df)
df_scaler['Standard'] = ss.fit_transform(df)
df_scaler['Robust'] = rs.fit_transform(df)
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style='whitegrid')
plt.figure(figsize=(16, 6))
sns.boxplot(data=df_scaler, orient='h')