Study_note(zb_data)/EDA

스터디 노트 (인구 데이터 분석하기 2)

KloudHyun 2023. 8. 17. 18:30

📌 Population Data Analyst

🚩 목표 데이터

  • 인구 소멸 위기 지역 파악
  • 인구 소멸 위기 지역 지도 표현
  • 지도 표현에 대한 카르토그램 표현

🚩 엑셀 맵 가져오기

# Read the Excel sheet that holds the hand-drawn block map.
# NOTE(review): presumably each cell of the sheet holds one region name -- confirm against the file.
draw_korea_raw = pd.read_excel(io="../data/07_draw_korea_raw.xlsx")
draw_korea_raw  # notebook display of the raw sheet

๐Ÿšฉ stack์œผ๋กœ ์žฌ ์ •๋น„

  • multi index๋กœ ๊ฐ’์„ ์ •๋ฆฌ
# Collapse the sheet into long form: stack() returns a Series keyed by a
# (row label, column label) MultiIndex, and to_frame() wraps it in a
# one-column DataFrame whose column is labelled 0.
draw_korea_raw_stacked = draw_korea_raw.stack().to_frame()
draw_korea_raw_stacked

# Turn the two index levels into ordinary columns ("level_0" and "level_1").
draw_korea_raw_stacked = draw_korea_raw_stacked.reset_index()
draw_korea_raw_stacked
>>>>

 

# draw_korea is a second name for the stacked frame (no copy is made), so the
# in-place rename below is visible through both names.
draw_korea = draw_korea_raw_stacked
draw_korea.rename(
    # former row label -> y, former column label -> x, stacked value -> ID
    columns={"level_0": "y", "level_1": "x", 0: "ID"},
    inplace=True,
)

🚩 카르토그램 샘플 그리기

# Border polylines drawn on top of the block map. Each inner list is one path
# of (y, x) grid points; the drawing functions unpack them with
# `ys, xs = zip(*path)`. The region labels are translated from the original
# comments. NOTE(review): two paths carry the label Chungcheongbuk-do and the
# last two paths were unlabelled in the original -- confirm which regions
# they outline.
BORDER_LINES = [
    [(5, 1), (5, 2), (7, 2),(7, 3),(11, 3),(11, 0)], # Incheon
    [(5, 4), (5, 5),(2, 5),(2, 7),(4, 7),(4, 9),(7, 9),(7, 7),
     (9, 7),(9, 5),(10, 5),(10, 4),(5, 4)], # Seoul
    [(1, 7), (1, 8), (3, 8), (3, 10), (10, 10),(10, 7), (12, 7),
     (12, 6),(11, 6),(11, 5),(12, 5),(12, 4),(11, 4),(11, 3)], # Gyeonggi-do
    [(8, 10), (8, 11), (6, 11), (6, 12)], # Gangwon-do
    [(12, 5), (13, 5), (13, 4),(14, 4), (14, 5), (15, 5), (15, 4), (16, 4), (16, 2)], # Chungcheongbuk-do
    [(16, 4), (17, 4), (17, 5),(16, 5), (16, 6), (19, 6), (19, 5),
     (20, 5), (20, 4), (21, 4), (21, 3),(19, 3), (19, 1)], # Jeollabuk-do
    [(13, 5), (13, 6), (16, 6)], # Daejeon-si
    [(13, 5), (14, 5)], # Sejong-si
    [(21, 2), (21, 3), (22, 3), (22, 4), (24, 4), (24, 2), (21, 2)], # Gwangju
    [(20, 5), (21, 5), (21, 6), (23, 6)], # Jeollanam-do
    [(10, 8), (12, 8), (12, 9), (14, 9), (14, 8), (16, 8), (16, 6)],  # Chungcheongbuk-do
    [(14, 9), (14, 11), (14, 12), (13, 12), (13, 13)], # Gyeongsangbuk-do
    [(15, 8), (17, 8), (17, 10), (16, 10), (16, 11), (14, 11)], # Daegu
    [(17, 9), (18, 9), (18, 8), (19, 8), (19, 9), (20, 9), (20, 10), (21, 10)], # Busan
    [(16, 11), (16, 13)],
    [(27, 5), (27, 6), (25, 6)]
]

🚩 지역 이름 함수

def plot_text_simple(draw_korea):
    """Write each region's name into its cell on the current Matplotlib axes.

    Args:
        draw_korea: DataFrame with one row per map cell, providing the
            columns "ID" (region name string), "x" and "y" (grid position).
    """
    for _, row in draw_korea.iterrows():
        region_id = row["ID"]
        name_parts = region_id.split()

        if len(name_parts) == 2:
            # Two-word names are shown on two lines.
            dispname = "\n".join(name_parts)
        elif region_id.startswith("고성"):
            # IDs that start with "고성" (Goseong) are shown as just "고성";
            # the cells sit in different places on the map, so they remain
            # distinguishable. The literal was mis-encoded before, which made
            # this branch unreachable.
            dispname = "고성"
        else:
            dispname = region_id

        # Shrink the font when the last line of the label has 3+ characters.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 9.5, 1.5
        else:
            fontsize, linespacing = 11, 1.2

        plt.annotate(
            dispname,
            # Offset by half a cell so the text does not sit on the border
            # lines, which run along integer grid coordinates.
            (row["x"] + 0.5, row["y"] + 0.5),
            weight="bold",
            fontsize=fontsize,
            linespacing=linespacing,
            ha="center",  # horizontal alignment
            va="center",  # vertical alignment
        )

๐Ÿšฉ ์ง€๋„ ๊ทธ๋ฆผ ํ•จ์ˆ˜

def simpleDraw(draw_korea):
    """Draw the uncoloured block map: region labels plus the border lines.

    Args:
        draw_korea: DataFrame passed unchanged to ``plot_text_simple``.
    """
    plt.figure(figsize=(8, 11))

    plot_text_simple(draw_korea)

    for border in BORDER_LINES:
        # Points are stored as (y, x); separate them into the two coordinate
        # sequences that plot() expects.
        row_coords = tuple(point[0] for point in border)
        col_coords = tuple(point[1] for point in border)
        plt.plot(col_coords, row_coords, c="black", lw=1.5)

    axes = plt.gca()
    axes.invert_yaxis()  # flip the y axis
    plt.axis("off")
    plt.tight_layout()
    plt.show()

๐Ÿšฉ ๋ฐ์ดํ„ฐ ๊ฒ€์ฆ ํ›„ merge

# Map IDs that have no matching row in the population table.
set(draw_korea["ID"].unique()) - set(pop["ID"].unique())
# >>>> set()

# Population IDs that have no cell on the map.
set(pop["ID"].unique()) - set(draw_korea["ID"].unique())
# >>>> {'고양', '부천', '성남', '수원', '안산', '안양', '용인', '전주', '창원', '천안', '청주', '포항'}
#      (Goyang, Bucheon, Seongnam, Suwon, Ansan, Anyang, Yongin, Jeonju,
#       Changwon, Cheonan, Cheongju, Pohang)

tmp_list = list(set(pop["ID"].unique()) - set(draw_korea["ID"].unique()))
# Remove those regions in one pass. A boolean mask removes exactly the
# matching rows, whereas dropping by index label would also remove unrelated
# rows if index labels were ever repeated.
pop = pop[~pop["ID"].isin(tmp_list)]
print(set(pop["ID"].unique()) - set(draw_korea["ID"].unique()))

# Attach the grid position (x, y) of every remaining region.
pop = pd.merge(pop, draw_korea, how="left", on="ID")
pop.head()

🚩 카르토그램 시각화

def get_data_info(targetData, blockedMap):
    """Build the value grid and colour-scale limits for one data column.

    Args:
        targetData: Name of the column in ``blockedMap`` to visualise.
        blockedMap: DataFrame containing the columns "x", "y" and
            ``targetData``.

    Returns:
        Tuple ``(mapdata, vmax, vmin, whitelabelmin)`` where ``mapdata`` is
        ``blockedMap`` pivoted to a y-by-x grid of ``targetData`` values,
        ``vmin``/``vmax`` are the smallest/largest value of the column, and
        ``whitelabelmin`` lies a quarter of the way from ``vmin`` to ``vmax``.
    """
    values = blockedMap[targetData]

    # Series.min()/max() skip NaN. The builtin min()/max() compare element by
    # element and return NaN or an order-dependent value once a NaN is present.
    vmin = values.min()
    vmax = values.max()
    whitelabelmin = (vmax - vmin) * 0.25 + vmin

    mapdata = blockedMap.pivot_table(index="y", columns="x", values=targetData)

    return mapdata, vmax, vmin, whitelabelmin
def get_data_info_for_zero_center(targetData, blockedMap):
    """Build the value grid and a colour scale that is symmetric around zero.

    Args:
        targetData: Name of the column in ``blockedMap`` to visualise.
        blockedMap: DataFrame containing the columns "x", "y" and
            ``targetData``.

    Returns:
        Tuple ``(mapdata, vmax, vmin, whitelabelmin)`` where ``mapdata`` is
        ``blockedMap`` pivoted to a y-by-x grid of ``targetData`` values,
        ``vmax`` is the largest absolute value of the column, ``vmin`` is
        ``-vmax`` and ``whitelabelmin`` is the constant 5.
    """
    # Fixed threshold. NOTE(review): presumably tuned for the data this
    # function was written for -- confirm before reusing with other columns.
    whitelabelmin = 5

    # Largest magnitude in the column. Series.abs().max() skips NaN, unlike
    # the builtin min()/max(), which can return NaN when one is present.
    tmp_max = blockedMap[targetData].abs().max()
    vmin, vmax = -tmp_max, tmp_max

    mapdata = blockedMap.pivot_table(index="y", columns="x", values=targetData)
    return mapdata, vmax, vmin, whitelabelmin
def plot_text(targetData, blockedMap, whitelabelmin):
    """Write each region's name into its cell, coloured by the cell's value.

    Args:
        targetData: Name of the column whose value decides the label colour.
        blockedMap: DataFrame with one row per map cell, providing the
            columns "ID" (region name string), "x", "y" and ``targetData``.
        whitelabelmin: Threshold; a label is drawn in white when the absolute
            value of ``targetData`` exceeds it, otherwise in black.
    """
    for _, row in blockedMap.iterrows():
        region_id = row["ID"]
        name_parts = region_id.split()

        if len(name_parts) == 2:
            # Two-word names are shown on two lines.
            dispname = "\n".join(name_parts)
        elif region_id.startswith("고성"):
            # IDs that start with "고성" (Goseong) are shown as just "고성".
            # The literal was mis-encoded before, which made this branch
            # unreachable.
            dispname = "고성"
        else:
            dispname = region_id

        # Shrink the font when the last line of the label has 3+ characters.
        if len(dispname.splitlines()[-1]) >= 3:
            fontsize, linespacing = 9.5, 1.5
        else:
            fontsize, linespacing = 11, 1.2

        annocolor = "white" if np.abs(row[targetData]) > whitelabelmin else "black"

        plt.annotate(
            dispname,
            # Offset by half a cell from the cell's grid coordinates.
            (row["x"] + 0.5, row["y"] + 0.5),
            weight="bold",
            color=annocolor,
            fontsize=fontsize,
            linespacing=linespacing,
            ha="center",  # horizontal alignment
            va="center",  # vertical alignment
        )
def drawKorea(targetData, blockedMap, cmapname, zeroCenter=False):
    """Draw the block map coloured by one data column and show it.

    Args:
        targetData: Name of the column in ``blockedMap`` to colour by; also
            used as the colour-bar label.
        blockedMap: DataFrame forwarded to the data-info helper and to
            ``plot_text``.
        cmapname: Colormap passed to ``plt.pcolor``.
        zeroCenter: When true the limits come from
            ``get_data_info_for_zero_center``, otherwise from
            ``get_data_info``.
    """
    # Choose the helper that computes the grid and the colour limits.
    info_fn = get_data_info_for_zero_center if zeroCenter else get_data_info
    grid, vmax, vmin, whitelabelmin = info_fn(targetData, blockedMap)

    plt.figure(figsize=(8, 11))
    plt.pcolor(
        grid,
        vmin=vmin,
        vmax=vmax,
        cmap=cmapname,
        edgecolor="#aaaaaa",
        linewidth=0.5,
    )

    plot_text(targetData, blockedMap, whitelabelmin)

    for border in BORDER_LINES:
        # Points are stored as (y, x); separate them for plot().
        row_coords = tuple(point[0] for point in border)
        col_coords = tuple(point[1] for point in border)
        plt.plot(col_coords, row_coords, c="black", lw=1.5)

    axes = plt.gca()
    axes.invert_yaxis()  # flip the y axis
    plt.axis("off")
    plt.tight_layout()

    colorbar = plt.colorbar(shrink=0.1, aspect=10)
    colorbar.set_label(targetData)
    plt.show()

์™ผ : ์ธ๊ตฌ์ˆ˜ ํ•ฉ๊ณ„ / ์˜ค : ์†Œ๋ฉธ์œ„๊ธฐ์ง€์—ญ

 

왼 : 여성비 / 오 : 2030 여성비

๐Ÿšฉ ์ง€๋„ ์‹œ๊ฐํ™”

import folium
import json

# Index the population table by region ID.
pop_folium = pop.set_index("ID")
pop_folium.head()

geo_path = "../data/07_skorea_municipalities_geo_simple.json"
# Read the GeoJSON inside a context manager so the file handle is closed as
# soon as parsing is done.
with open(geo_path, encoding="utf-8") as geo_file:
    geo_str = json.load(geo_file)  # parsed JSON object, despite the name

# Total population ("인구수합계")
mymap = folium.Map(location=[36.2002, 127.054], zoom_start=7)
# Map.choropleth() is deprecated in folium; the Choropleth class takes the
# same arguments and is attached with add_to().
folium.Choropleth(
    geo_data=geo_str,
    data=pop_folium["인구수합계"],
    key_on="feature.id",
    columns=[pop_folium.index, pop_folium["인구수합계"]],
    fill_color="YlGnBu",
).add_to(mymap)
mymap

# Regions at risk of extinction ("소멸위기지역")
mymap = folium.Map(location=[36.2002, 127.054], zoom_start=7)
folium.Choropleth(
    geo_data=geo_str,
    data=pop_folium["소멸위기지역"],
    key_on="feature.id",
    columns=[pop_folium.index, pop_folium["소멸위기지역"]],
    fill_color="PuRd",
).add_to(mymap)
mymap