Study_note(zb_data)/EDA
스터디 노트 (인구 데이터 분석하기 2)
KloudHyun
2023. 8. 17. 18:30
๐ Population Data Analyst
🚩 목표 데이터
인구 소멸 위기 지역 파악 - 인구 소멸 위기 지역 지도 표현
- 지도 표현에 대한 카르토그램 표현
🚩 엑셀 맵 가져오기
# Read the raw Excel sheet into a DataFrame (presumably the grid layout of
# regions, judging by the file name — confirm against the workbook).
draw_korea_raw = pd.read_excel("../data/07_draw_korea_raw.xlsx")
# Bare expression: displays the frame when run as a notebook cell.
draw_korea_raw
๐ฉ stack์ผ๋ก ์ฌ ์ ๋น
- multi index๋ก ๊ฐ์ ์ ๋ฆฌ
# Collapse the wide sheet into one value per (row label, column label) pair,
# then turn that two-level index into ordinary columns
# ("level_0", "level_1") next to the value column (0).
draw_korea_raw_stacked = draw_korea_raw.stack().to_frame()
draw_korea_raw_stacked = draw_korea_raw_stacked.reset_index()
# Bare expression: displays the frame when run as a notebook cell.
draw_korea_raw_stacked
>>>>
# Give the stacked columns their map meaning: the former row label becomes
# "y", the former column label becomes "x", and the cell value becomes "ID".
draw_korea_raw_stacked.rename(
    columns={"level_0": "y", "level_1": "x", 0: "ID"},
    inplace=True,
)
# `draw_korea` is an alias of the same DataFrame object, not a copy.
draw_korea = draw_korea_raw_stacked
🚩 키스토그램 시험 그리기
# Border polylines drawn on top of the block map.  Each inner list is one
# polyline and each vertex is a (y, x) pair: the drawing code unpacks a path
# with `ys, xs = zip(*path)`.  Region labels are translated from the original
# comments; the last two paths carried no label.
BORDER_LINES = [
    [(5, 1), (5, 2), (7, 2), (7, 3), (11, 3), (11, 0)],  # Incheon
    [
        (5, 4), (5, 5), (2, 5), (2, 7), (4, 7), (4, 9), (7, 9), (7, 7),
        (9, 7), (9, 5), (10, 5), (10, 4), (5, 4),
    ],  # Seoul
    [
        (1, 7), (1, 8), (3, 8), (3, 10), (10, 10), (10, 7), (12, 7),
        (12, 6), (11, 6), (11, 5), (12, 5), (12, 4), (11, 4), (11, 3),
    ],  # Gyeonggi-do
    [(8, 10), (8, 11), (6, 11), (6, 12)],  # Gangwon-do
    [
        (12, 5), (13, 5), (13, 4), (14, 4), (14, 5), (15, 5), (15, 4),
        (16, 4), (16, 2),
    ],  # Chungcheongbuk-do
    [
        (16, 4), (17, 4), (17, 5), (16, 5), (16, 6), (19, 6), (19, 5),
        (20, 5), (20, 4), (21, 4), (21, 3), (19, 3), (19, 1),
    ],  # Jeollabuk-do
    [(13, 5), (13, 6), (16, 6)],  # Daejeon
    [(13, 5), (14, 5)],  # Sejong
    [(21, 2), (21, 3), (22, 3), (22, 4), (24, 4), (24, 2), (21, 2)],  # Gwangju
    [(20, 5), (21, 5), (21, 6), (23, 6)],  # Jeollanam-do
    [
        (10, 8), (12, 8), (12, 9), (14, 9), (14, 8), (16, 8), (16, 6),
    ],  # Chungcheongbuk-do (as labelled in the original comment)
    [(14, 9), (14, 11), (14, 12), (13, 12), (13, 13)],  # Gyeongsangbuk-do
    [(15, 8), (17, 8), (17, 10), (16, 10), (16, 11), (14, 11)],  # Daegu
    [
        (17, 9), (18, 9), (18, 8), (19, 8), (19, 9), (20, 9), (20, 10),
        (21, 10),
    ],  # Busan
    [(16, 11), (16, 13)],
    [(27, 5), (27, 6), (25, 6)],
]
🚩 지역 이름 함수
def plot_text_simple(draw_korea):
    """Write each region's name at the centre of its map cell.

    For every row of ``draw_korea`` the ``"ID"`` value is turned into a
    display label and passed to ``plt.annotate`` at
    ``(x + 0.5, y + 0.5)``.

    Args:
        draw_korea: DataFrame with the columns ``"ID"`` (region name
            string), ``"x"`` and ``"y"``.
    """
    for _, row in draw_korea.iterrows():
        region_id = row["ID"]
        parts = region_id.split()

        if len(parts) == 2:
            # Two-word names are shown on two lines.
            dispname = "{}\n{}".format(parts[0], parts[1])
        elif region_id[:2] == "고성":
            # Any ID starting with "고성" is labelled just "고성"; the cells
            # sit at different positions, so they stay distinguishable.
            dispname = "고성"
        else:
            dispname = region_id

        # A last line of three or more characters gets a smaller font and
        # wider line spacing than the default label style.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 9.5, 1.5
        else:
            fontsize, linespacing = 11, 1.2

        plt.annotate(
            dispname,
            # +0.5 moves the label off the grid line to the cell centre.
            (row["x"] + 0.5, row["y"] + 0.5),
            weight="bold",
            fontsize=fontsize,
            linespacing=linespacing,
            ha="center",  # horizontal alignment
            va="center",  # vertical alignment
        )
🚩 지도 그림 함수
def simpleDraw(draw_korea):
    """Show the uncoloured block map: region labels plus border lines.

    Args:
        draw_korea: DataFrame forwarded unchanged to ``plot_text_simple``.
    """
    plt.figure(figsize=(8, 11))
    plot_text_simple(draw_korea)

    for border in BORDER_LINES:
        # Vertices are stored as (y, x); split them into per-axis sequences.
        ys = [vertex[0] for vertex in border]
        xs = [vertex[1] for vertex in border]
        plt.plot(xs, ys, c="black", lw=1.5)

    # Flip the y axis so that y grows downwards.
    axes = plt.gca()
    axes.invert_yaxis()

    plt.axis("off")
    plt.tight_layout()
    plt.show()
🚩 데이터 검증 후 merge
set(draw_korea["ID"].unique()) - set(pop["ID"].unique())
>>>>
set()
set(pop["ID"].unique()) - set(draw_korea["ID"].unique())
>>>>
{'고양', '부천', '성남', '수원', '안산', '안양', '용인', '전주', '창원', '천안', '청주', '포항'}
# IDs present in the population table that have no cell on the block map.
tmp_list = list(set(pop["ID"].unique()) - set(draw_korea["ID"].unique()))
# Remove those rows with a single boolean mask.  Filtering on the "ID" values
# themselves (rather than dropping by index label, once per ID) cannot remove
# an unrelated row that happens to share an index label.
pop = pop[~pop["ID"].isin(tmp_list)]
# Sanity check: the difference printed here should now be empty.
print(set(pop["ID"].unique()) - set(draw_korea["ID"].unique()))
# Attach each region's map coordinates ("x", "y") to its population row.
pop = pd.merge(pop, draw_korea, how="left", on="ID")
pop.head()
🚩 카르토그램 시각화
def get_data_info(targetData, blockedMap):
    """Return the pivoted map grid and colour-scale limits for a column.

    Args:
        targetData: Name of the column of ``blockedMap`` to visualise.
        blockedMap: DataFrame with the columns ``"x"``, ``"y"`` and
            ``targetData``.

    Returns:
        A tuple ``(mapdata, vmax, vmin, whitelabelmin)``: ``mapdata`` is
        ``blockedMap`` pivoted with ``"y"`` as index and ``"x"`` as columns;
        ``vmin`` / ``vmax`` are the smallest / largest values of the column
        (NaN ignored); ``whitelabelmin`` lies 25 % of the way from ``vmin``
        to ``vmax``.

    Raises:
        ValueError: If ``blockedMap`` has no rows.
    """
    column = blockedMap[targetData]
    if len(column) == 0:
        raise ValueError("column {!r} has no values".format(targetData))

    # Series.min()/max() skip NaN; the builtin min()/max() return an
    # order-dependent result as soon as one NaN is present.
    vmin = column.min()
    vmax = column.max()
    whitelabelmin = (vmax - vmin) * 0.25 + vmin

    mapdata = blockedMap.pivot_table(index="y", columns="x", values=targetData)
    return mapdata, vmax, vmin, whitelabelmin
def get_data_info_for_zero_center(targetData, blockedMap):
    """Return the pivoted map grid and a colour scale symmetric around zero.

    Args:
        targetData: Name of the column of ``blockedMap`` to visualise.
        blockedMap: DataFrame with the columns ``"x"``, ``"y"`` and
            ``targetData``.

    Returns:
        A tuple ``(mapdata, vmax, vmin, whitelabelmin)``: ``mapdata`` is
        ``blockedMap`` pivoted with ``"y"`` as index and ``"x"`` as columns;
        ``vmax`` is the largest absolute value in the column (NaN ignored)
        and ``vmin`` is ``-vmax``; ``whitelabelmin`` is the fixed value 5.

    Raises:
        ValueError: If ``blockedMap`` has no rows.
    """
    whitelabelmin = 5

    column = blockedMap[targetData]
    if len(column) == 0:
        raise ValueError("column {!r} has no values".format(targetData))

    # Series.min()/max() skip NaN; the builtin min()/max() over the whole
    # Series return an order-dependent result when a NaN is present.
    tmp_max = max(abs(column.min()), abs(column.max()))
    vmin, vmax = -tmp_max, tmp_max

    mapdata = blockedMap.pivot_table(index="y", columns="x", values=targetData)
    return mapdata, vmax, vmin, whitelabelmin
def plot_text(targetData, blockedMap, whitelabelmin):
    """Write each region's name on its map cell, coloured for contrast.

    For every row of ``blockedMap`` the ``"ID"`` value is turned into a
    display label and passed to ``plt.annotate`` at
    ``(x + 0.5, y + 0.5)``.  The label is white when the absolute value of
    the row's ``targetData`` exceeds ``whitelabelmin`` and black otherwise.

    Args:
        targetData: Name of the column whose value picks the label colour.
        blockedMap: DataFrame with the columns ``"ID"`` (region name
            string), ``"x"``, ``"y"`` and ``targetData``.
        whitelabelmin: Threshold above which the label is drawn in white.
    """
    for _, row in blockedMap.iterrows():
        region_id = row["ID"]
        parts = region_id.split()

        if len(parts) == 2:
            # Two-word names are shown on two lines.
            dispname = "{}\n{}".format(parts[0], parts[1])
        elif region_id[:2] == "고성":
            # Any ID starting with "고성" is labelled just "고성".
            dispname = "고성"
        else:
            dispname = region_id

        # A last line of three or more characters gets a smaller font and
        # wider line spacing than the default label style.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 9.5, 1.5
        else:
            fontsize, linespacing = 11, 1.2

        annocolor = "white" if np.abs(row[targetData]) > whitelabelmin else "black"

        plt.annotate(
            dispname,
            # +0.5 moves the label off the grid line to the cell centre.
            (row["x"] + 0.5, row["y"] + 0.5),
            weight="bold",
            color=annocolor,
            fontsize=fontsize,
            linespacing=linespacing,
            ha="center",  # horizontal alignment
            va="center",  # vertical alignment
        )
def drawKorea(targetData, blockedMap, cmapname, zeroCenter=False):
    """Draw the block map coloured by one column and show it.

    Args:
        targetData: Name of the column of ``blockedMap`` to colour by; also
            used as the colour-bar label.
        blockedMap: DataFrame passed on to the data-info helper and to
            ``plot_text``.
        cmapname: Colormap handed to ``plt.pcolor``.
        zeroCenter: When true the limits come from
            ``get_data_info_for_zero_center``; otherwise from
            ``get_data_info``.
    """
    summarize = get_data_info_for_zero_center if zeroCenter else get_data_info
    masked_mapdata, vmax, vmin, whitelabelmin = summarize(targetData, blockedMap)

    plt.figure(figsize=(8, 11))
    plt.pcolor(
        masked_mapdata,
        vmin=vmin,
        vmax=vmax,
        cmap=cmapname,
        edgecolor="#aaaaaa",
        linewidth=0.5,
    )
    plot_text(targetData, blockedMap, whitelabelmin)

    for border in BORDER_LINES:
        # Vertices are stored as (y, x); split them into per-axis sequences.
        ys, xs = zip(*border)
        plt.plot(xs, ys, c="black", lw=1.5)

    # Flip the y axis so that y grows downwards.
    axes = plt.gca()
    axes.invert_yaxis()

    plt.axis("off")
    plt.tight_layout()

    colorbar = plt.colorbar(shrink=0.1, aspect=10)
    colorbar.set_label(targetData)
    plt.show()
🚩 지도 시각화
import folium
import json
# Index the population table by region ID for the choropleth lookups.
pop_folium = pop.set_index("ID")
pop_folium.head()
geo_path = "../data/07_skorea_municipalities_geo_simple.json"
# Use a context manager so the file handle is closed once the JSON is parsed.
with open(geo_path, encoding="utf-8") as geo_file:
    geo_str = json.load(geo_file)
# Choropleth of the total population ("인구수합계") per region.
# NOTE(review): Map.choropleth is deprecated in newer folium releases in
# favour of folium.Choropleth(...).add_to(mymap) — confirm against the
# installed folium version.
mymap = folium.Map(location=[36.2002, 127.054], zoom_start=7)
mymap.choropleth(
    geo_data=geo_str,
    data=pop_folium["인구수합계"],
    key_on="feature.id",
    columns=[pop_folium.index, pop_folium["인구수합계"]],
    fill_color="YlGnBu",
)
# Bare expression: renders the map when run as a notebook cell.
mymap
# Choropleth of the extinction-risk flag ("소멸위기지역") per region.
# NOTE(review): Map.choropleth is deprecated in newer folium releases in
# favour of folium.Choropleth(...).add_to(mymap) — confirm against the
# installed folium version.
mymap = folium.Map(location=[36.2002, 127.054], zoom_start=7)
mymap.choropleth(
    geo_data=geo_str,
    data=pop_folium["소멸위기지역"],
    key_on="feature.id",
    columns=[pop_folium.index, pop_folium["소멸위기지역"]],
    fill_color="PuRd",
)
# Bare expression: renders the map when run as a notebook cell.
mymap