본문 바로가기

Study_note(zb_data)/Machine Learning

스터디노트 (ML3)

📌 Titanic_EDA

🔻 Titanic 생존율 확인하기

import pandas as pd

# Load the Titanic passenger spreadsheet (relative path) into a DataFrame.
titanic = pd.read_excel('../data/titanic.xls')
# Peek at the first rows of the loaded frame.
titanic.head()
import matplotlib.pyplot as plt
import seaborn as sns

# Create a figure with two side-by-side axes.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

# Left panel: pie chart of the survived counts; autopct prints the
# percentage on each wedge.
survival_counts = titanic['survived'].value_counts()
survival_counts.plot.pie(
    ax=ax[0],
    autopct='%1.1f%%',
    shadow=True,
    explode=[0, 0.1],
)
ax[0].set(title='Pie plot - survived', ylabel='')

# Right panel: the same counts as a bar chart.
sns.countplot(data=titanic, x='survived', ax=ax[1])
ax[1].set_title('Count plot - survived')
plt.show()

🔻 성별을 놓고 bar 차트로 확인해보자

# Two side-by-side axes: passenger counts by sex, then the same counts
# split by the survived flag.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

sns.countplot(data=titanic, x='sex', ax=ax[0])
ax[0].set(title='Count of passengers of sex', ylabel='')

sns.countplot(data=titanic, x='sex', hue='survived', ax=ax[1])
ax[1].set_title('sex : survived and Unsurvived')
plt.show()

🔻 crosstab

# Passenger class (rows) against survived (columns); margins=True adds
# the row/column totals.
pd.crosstab(index=titanic['pclass'], columns=titanic['survived'], margins=True)

🔻 Class 별로 구분 해보기

- 3등실에 20대 남성이 많았다는 사실을 알 수 있다.

# One age histogram per (pclass row, sex column) facet.
hist_options = {'alpha': 0.8, 'bins': 20}
grid = sns.FacetGrid(data=titanic, row='pclass', col='sex', height=4, aspect=2)
grid.map(plt.hist, 'age', **hist_options)
grid.add_legend()

🔻나이별 승객 현황

- 아이와 2-30대가 많았다는 것을 알 수 있다.

# Plotly Express is distributed as part of the plotly package; the
# standalone `plotly_express` distribution is deprecated in favour of it.
import plotly.express as px

# Interactive histogram of the passenger age column.
fig = px.histogram(titanic, x='age')
fig.show()

🔻나이별 승객 현황

- 등실별 생존률을 연령별로 확인하기

- 선실 등급이 높을수록 생존률이 높은 듯 나온다.

# One age histogram per (pclass row, survived column) facet.
hist_options = {'alpha': 0.5, 'bins': 20}
grid = sns.FacetGrid(data=titanic, row='pclass', col='survived', height=4, aspect=2)
grid.map(plt.hist, 'age', **hist_options)
grid.add_legend()

๐Ÿ”ป๋‚˜์ด๋ฅผ ์ •ํ™•ํ•˜๊ฒŒ ๊ตฌ๋ถ„ํ•ด๋ณด์ž

- pandas์˜ cut์„ ํ™œ์šฉํ•˜์—ฌ label์„ ๋ถ™์—ฌ๋ณด์ž

# Bucket ages into five labelled bands. include_lowest=True makes the
# first band closed on the left, so an age of exactly 0 is kept.
age_bins = [0, 7, 15, 30, 60, 100]
age_labels = ['baby', 'teen', 'young', 'adult', 'old']
titanic['age_category'] = pd.cut(
    titanic['age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)
titanic.head()

- 어리고, 여성, 1등실일 수록 생존하기 유리했을까?

# Three bar plots in one row: survived (y) against pclass, age_category
# and sex, in that order.
plt.figure(figsize=(12, 4))
for position, column in enumerate(['pclass', 'age_category', 'sex'], start=1):
    plt.subplot(1, 3, position)
    sns.barplot(x=column, y='survived', data=titanic)
plt.show()

🔻남/여 나이별 생존 상황을 보다 더 들여다보자

# Age histograms of survivors vs. non-survivors, one panel per sex.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
women = titanic[titanic['sex'] == 'female']
men = titanic[titanic['sex'] == 'male']

# sns.distplot is deprecated (and removed in recent seaborn releases);
# histplot is its replacement for plain count histograms (kde=False).
# alpha=0.4 keeps the two overlaid histograms readable.
for ax, group, panel_title in ((axes[0], women, 'Female'), (axes[1], men, 'male')):
    sns.histplot(group.loc[group['survived'] == 1, 'age'],
                 bins=20, label='survived', ax=ax, alpha=0.4)
    sns.histplot(group.loc[group['survived'] == 0, 'age'],
                 bins=40, label='not survived', ax=ax, alpha=0.4)
    ax.legend()
    ax.set_title(panel_title)

plt.show()

🔻사회적 신분을 정리하고 그래프로 나타내보자

import re

# Pull the honorific out of each name: the text between ", " and the
# next "." (one word, optionally followed by a second word).
# A raw string is required here: '\,' in a normal string literal is an
# invalid escape sequence.
title_pattern = re.compile(r',\s(\w+(?:\s\w+)?)\.')

title = [title_pattern.search(name).group(1) for name in titanic['name']]

# Store the titles on the frame so that titanic['title'] exists for the
# crosstab / replace / groupby steps that read it.
titanic['title'] = title

title
>>>>
['Miss',
 'Master',
 'Miss',
 'Mr',
 'Mrs',
 'Mr',
...

# Cross-tabulate the title column (rows) against sex (columns).
pd.crosstab(index=titanic['title'], columns=titanic['sex'])

# Distinct values of the title column.
pd.unique(titanic['title'])
>>>>
array(['Miss', 'Master', 'Mr', 'Mrs', 'Col', 'Mme', 'Dr', 'Major', 'Capt',
       'Lady', 'Sir', 'Mlle', 'Dona', 'Jonkheer', 'the Countess', 'Don',
       'Rev', 'Ms'], dtype=object)

# Simplify the social-status titles: fold spelling variants into
# Miss/Mrs and collapse the remaining titles into two catch-all groups.
Rare_f = ['Dona', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Dr', 'Master', 'Jonkheer']

title_mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_mapping.update(dict.fromkeys(Rare_f, 'Rare_f'))
title_mapping.update(dict.fromkeys(Rare_m, 'Rare_m'))

# One pass over the column instead of one replace call per title.
titanic['title'] = titanic['title'].replace(title_mapping)

๐Ÿ”ปgroup by๋กœ ์‹ ๋ถ„์„ ๋ฌถ์–ด์„œ ํ‘œํ˜„ํ•ด๋ณด์ž

- ๊ท€์กฑ์˜ ๋‚จ์„ฑ --> ์ ˆ๋ฐ˜๋„ ์‚ด์•„๋‚จ์ง€ ๋ชปํ–ˆ๋‹ค.

# Mean of the survived column per title, with title kept as a regular column.
titanic.groupby(['title'], as_index=False)[['survived']].mean()