Titanic_EDA
🔻 Titanic 생존율 확인하기
import pandas as pd
# Read the Titanic spreadsheet into a DataFrame and preview its first rows.
data_path = '../data/titanic.xls'
titanic = pd.read_excel(data_path)
titanic.head()
import matplotlib.pyplot as plt
import seaborn as sns
# Two side-by-side panels for the `survived` column: a pie chart of its
# value counts on the left and a count plot on the right.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
pie_ax, count_ax = ax

# autopct renders each wedge share as a percentage with one decimal place;
# explode offsets the second wedge from the centre of the pie.
survived_counts = titanic['survived'].value_counts()
survived_counts.plot.pie(
    ax=pie_ax,
    autopct='%1.1f%%',
    shadow=True,
    explode=[0, 0.1],
)
pie_ax.set_title('Pie plot - survived')
pie_ax.set_ylabel('')

sns.countplot(x='survived', data=titanic, ax=count_ax)
count_ax.set_title('Count plot - survived')

plt.show()
🔻 성별을 두고 bar 차트로 확인해보자
# Two side-by-side count plots: passengers per sex on the left, and the same
# counts split by the `survived` value on the right.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
total_ax, split_ax = ax

sns.countplot(x='sex', data=titanic, ax=total_ax)
total_ax.set_title('Count of passengers of sex')
total_ax.set_ylabel('')

sns.countplot(x='sex', hue='survived', data=titanic, ax=split_ax)
split_ax.set_title('sex : survived and Unsurvived')

plt.show()
🔻 crosstab
# Cross-tabulate passenger class (rows) against survival (columns);
# margins=True appends the row and column totals.
pd.crosstab(index=titanic['pclass'], columns=titanic['survived'], margins=True)
🔻 Class 별로 구분 해보기
- 3등실에 20대 남성이 많았다는 사실을 알 수 있다.
# One age histogram per (pclass, sex) combination: a facet row for each
# pclass value and a facet column for each sex value.
facet_layout = dict(row='pclass', col='sex', height=4, aspect=2)
grid = sns.FacetGrid(titanic, **facet_layout)
grid.map(plt.hist, 'age', bins=20, alpha=0.8)
grid.add_legend()
🔻나이별 승객 현황
- 아이와 2-30대가 많았다는 것을 알 수 있다.
# Interactive histogram of passenger ages.
# Plotly Express ships inside the plotly package as `plotly.express`; the
# standalone `plotly_express` distribution is only a legacy wrapper around it.
import plotly.express as px

fig = px.histogram(titanic, x='age')
fig.show()
🔻나이별 승객 현황
- 등실별 생존률을 연령별로 확인하기
- 선실 등급이 높을수록 생존률이 높은 듯 나온다.
# One age histogram per (pclass, survived) combination: a facet row for each
# pclass value and a facet column for each survived value.
facet_layout = dict(row='pclass', col='survived', height=4, aspect=2)
grid = sns.FacetGrid(titanic, **facet_layout)
grid.map(plt.hist, 'age', bins=20, alpha=0.5)
grid.add_legend()
๐ป๋์ด๋ฅผ ์ ํํ๊ฒ ๊ตฌ๋ถํด๋ณด์
- pandas์ cut์ ํ์ฉํ์ฌ label์ ๋ถ์ฌ๋ณด์
# Bucket `age` into five labelled ranges. The six bin edges delimit five
# intervals, one per label; include_lowest=True makes the first interval
# include its left edge (0).
age_bins = [0, 7, 15, 30, 60, 100]
age_labels = ['baby', 'teen', 'young', 'adult', 'old']
titanic['age_category'] = pd.cut(
    titanic['age'],
    bins=age_bins,
    labels=age_labels,
    include_lowest=True,
)
titanic.head()
- 어리고, 여성, 1등실일 수록 생존하기 유리했을까?
# Three bar plots in one row, each with `survived` on the y axis, grouped by
# pclass, age_category and sex respectively.
plt.figure(figsize=(12, 4))
for position, column in enumerate(('pclass', 'age_category', 'sex'), start=131):
    # 131, 132, 133 -> 1 row, 3 columns, panel 1..3
    plt.subplot(position)
    sns.barplot(x=column, y='survived', data=titanic)
plt.show()
🔻남/여 나이별 생존 상황을 보다 더 들여다보자
# Overlay the age histograms of survivors and non-survivors, one panel per
# sex (female on the left, male on the right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
women = titanic[titanic['sex'] == 'female']
men = titanic[titanic['sex'] == 'male']

# sns.distplot is deprecated; sns.histplot is its histogram replacement and
# draws counts without a KDE curve by default. A low alpha keeps the two
# overlaid histograms readable where they overlap.
for ax, group, panel_title in ((axes[0], women, 'Female'), (axes[1], men, 'male')):
    survivors_age = group[group['survived'] == 1]['age']
    victims_age = group[group['survived'] == 0]['age']
    sns.histplot(survivors_age, bins=20, alpha=0.4, label='survived', ax=ax)
    sns.histplot(victims_age, bins=40, alpha=0.4, label='not survived', ax=ax)
    ax.legend()
    ax.set_title(panel_title)
🔻사회적 신분을 정리하고 그래프로 나타내보자
import re

# Extract the honorific from each passenger name.
# The pattern matches ", <Title>." where the title is one word optionally
# followed by a second word (e.g. "the Countess"). A raw string is used so
# the backslashes reach the regex engine without invalid-escape warnings.
title_pattern = re.compile(r'\,\s\w+(\s\w+)?\.')

# [2:-1] strips the leading ", " and the trailing "." from the match.
# NOTE: a name without a matching title makes search() return None, which
# raises AttributeError here.
title = [title_pattern.search(name).group()[2:-1] for name in titanic['name']]

# Store the titles as a column; the later cells read titanic['title'].
titanic['title'] = title
title
>>>>
['Miss',
'Master',
'Miss',
'Mr',
'Mrs',
'Mr',
...
# Count how often each title occurs per sex, then list the distinct titles.
pd.crosstab(index=titanic['title'], columns=titanic['sex'])
title_column = titanic['title']
title_column.unique()
>>>>
array(['Miss', 'Master', 'Mr', 'Mrs', 'Col', 'Mme', 'Dr', 'Major', 'Capt',
'Lady', 'Sir', 'Mlle', 'Dona', 'Jonkheer', 'the Countess', 'Don',
'Rev', 'Ms'], dtype=object)
# Simplify the social-status titles: fold spelling variants into Miss / Mrs
# and group the remaining uncommon titles into Rare_f / Rare_m.
Rare_f = ['Dona', 'Lady', 'the Countess']
Rare_m = ['Capt', 'Col', 'Don', 'Major', 'Rev', 'Sir', 'Dr', 'Master', 'Jonkheer']

title_mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_mapping.update(dict.fromkeys(Rare_f, 'Rare_f'))
title_mapping.update(dict.fromkeys(Rare_m, 'Rare_m'))

# No replacement value is itself a key, so a single mapping-based replace is
# equivalent to applying the replacements one after another.
titanic['title'] = titanic['title'].replace(title_mapping)
๐ปgroup by๋ก ์ ๋ถ์ ๋ฌถ์ด์ ํํํด๋ณด์
- ๊ท์กฑ์ ๋จ์ฑ --> ์ ๋ฐ๋ ์ด์๋จ์ง ๋ชปํ๋ค.
# Mean of `survived` for each title, with `title` kept as a regular column.
title_survival = titanic[['title', 'survived']]
title_survival.groupby(['title'], as_index=False).mean()
'Study_note(zb_data) > Machine Learning' 카테고리의 다른 글
스터디노트 (ML6_Wine) (0) | 2023.09.22 |
---|---|
스터디노트 (ML5) (0) | 2023.09.21 |
스터디노트 (ML4) (0) | 2023.09.21 |
스터디 노트 (ML2) (0) | 2023.09.19 |
스터디 노트 (ML) (0) | 2023.09.19 |