Study_note(zb_data)/Machine Learning

스터디노트 (자연어 처리)

KloudHyun 2023. 10. 4. 00:02

📌 konlpy (한국어 자연어 처리)

-> Kkma, Hannanum, Okt 등의 패키지로 자연어 분석 처리가 가능하다.

→ 각각 처리하는 방식이 약간씩 차이가 있음

🔻Kkma

→ sentences, nouns, pos

from konlpy.tag import Kkma
kkma = Kkma()

# 문장 추출
kkma.sentences('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어 분석을 시작합니다', '재미있어요~~']

# 명사 추출
kkma.nouns('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석']

# 형태소 부착 (Tagging)
kkma.pos('한국어 분석을 시작합니다 재미있어요~~')
>>>>
[('한국어', 'NNG'),
 ('분석', 'NNG'),
 ('을', 'JKO'),
 ('시작하', 'VV'),
 ('ㅂ니다', 'EFN'),
 ('재미있', 'VA'),
 ('어요', 'EFN'),
 ('~~', 'SW')]

🔻Hannanum

→ nouns, morphs, pos

from konlpy.tag import Hannanum
hannanum = Hannanum()

# 명사 추출
hannanum.nouns('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '시작']

# 형태소 추출
hannanum.morphs('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '을', '시작', '하', 'ㅂ니다', '재미있', '어요', '~~']

# 품사 부착
hannanum.pos('한국어 분석을 시작합니다 재미있어요~~')
>>>>
[('한국어', 'N'),
 ('분석', 'N'),
 ('을', 'J'),
 ('시작', 'N'),
 ('하', 'X'),
 ('ㅂ니다', 'E'),
 ('재미있', 'P'),
 ('어요', 'E'),
 ('~~', 'S')]

🔻Okt

→ nouns, morphs, pos

from konlpy.tag import Okt
t = Okt()

# 명사 추출
t.nouns('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '시작']

# 형태소 추출
t.morphs('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '을', '시작', '합니다', '재미있어요', '~~']

# 품사 부착
t.pos('한국어 분석을 시작합니다 재미있어요~~')
>>>>
[('한국어', 'Noun'),
 ('분석', 'Noun'),
 ('을', 'Josa'),
 ('시작', 'Noun'),
 ('합니다', 'Verb'),
 ('재미있어요', 'Adjective'),
 ('~~', 'Punctuation')]

📌 Wordcloud

🔻워드클라우드 만들어보기

from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image

# Read the Alice novel text file. The context manager closes the file
# handle as soon as the contents have been read (the bare
# open(...).read() form left it open).
with open('../data/06_alice.txt') as alice_file:
    text = alice_file.read()

# Load the Alice image (PNG) as a NumPy array.
alice_mask = np.array(Image.open("../data/06_alice_mask.png"))

# Words excluded from the word cloud: the library's STOPWORDS plus "said".
stopwords = set(STOPWORDS)
stopwords.add("said")

🔻plot으로 그려보자

import matplotlib.pyplot as plt
from matplotlib import font_manager, rc

# Set matplotlib's font family to Malgun Gothic.
rc('font', family='Malgun Gothic')

plt.figure(figsize=(8, 8))

# Show the alice_mask image with the reversed grey colormap.
# `interpolation` decides how the picture is joined between blocks
# when it is drawn.
plt.imshow(alice_mask, interpolation='bilinear', cmap='Greys_r')

# Configure the word cloud and generate it from the text in one chained call:
#   background_color -> white background
#   max_words        -> at most 2000 words
#   mask             -> the image array
#   stopwords        -> words to exclude
wc = WordCloud(
    stopwords=stopwords,
    mask=alice_mask,
    max_words=2000,
    background_color="white",
).generate(text)
# Alice 단어를 1로 기준, 다른 단어의 비중

wc.words_
>>>>
{'Alice': 1.0,
 'little': 0.29508196721311475,
 'one': 0.27595628415300544,
 'know': 0.2459016393442623,
 'went': 0.226775956284153,
 .....
# Draw the generated word cloud on a 12x12 figure with the axes hidden.
plt.figure(figsize=(12, 12))
# Show the image.
# wc is the word cloud configured earlier.
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


📌 Wordcloud 2

# Read the a_new_hope text file. The context manager closes the file
# handle as soon as the contents have been read.
with open('../data/06_a_new_hope.txt') as script_file:
    text = script_file.read()

# Light pre-processing of names so they are not counted in all caps.
text = text.replace('HAN', 'Han')
# Fixed: the pattern used to be "LUKE's" (lowercase s). The upstream
# wordcloud example replaces "LUKE'S"; presumably the script writes the
# possessive in upper case, in which case the old pattern never matched.
# NOTE(review): confirm against the data file.
text = text.replace("LUKE'S", 'Luke')

# Image (PNG) that will serve as the background mask.
mask = np.array(Image.open("../data/06_stormtrooper_mask.png"))

# Stopwords: the library defaults plus "int" and "ext".
stopwords = set(STOPWORDS) | {"int", "ext"}

# At most 1000 words; mask, stopwords and margin are passed as given.
wc = WordCloud(
    max_words=1000, mask=mask, stopwords=stopwords, margin=10
).generate(text)
# 워드클라우드 안에 들어가는 글자 색을 지정
# 워드클라우드 홈페이지에 코드가 게재되어있으므로, 그대로 사용
import random

def grey_color_func(
        word, font_size, position, orientation, random_state=None, **kwargs
):
    """Return a random grey HSL color string for one word-cloud word.

    The lightness is drawn uniformly from 60..100 and formatted as
    ``"hsl(0, 0%, <lightness>%)"``.

    Args:
        word, font_size, position, orientation: Accepted for the
            color-function call signature; not used here.
        random_state: Optional generator exposing ``randint(a, b)``
            (presumably a ``random.Random`` supplied by
            ``WordCloud.recolor`` -- confirm). When given, the lightness
            is drawn from it so a fixed seed gives the same colors.
            Otherwise the module-level ``random`` generator is used.
        **kwargs: Ignored.

    Returns:
        The HSL color string.
    """
    # Previously the module-level generator was always used, so a seeded
    # random_state had no effect. Objects without `randint` (None, a plain
    # int, ...) still fall back to the old behavior.
    draw = getattr(random_state, "randint", random.randint)
    return "hsl(0, 0%%, %d%%)" % draw(60, 100)
# Draw the recolored word cloud on a 12x12 figure with the axes hidden.
plt.figure(figsize=(12, 12))

# Show the image.
# wc.recolor re-assigns the colors of the words in the word cloud, here
# with grey_color_func as the color function.
plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3), interpolation='bilinear')
plt.axis("off")
plt.show()