Study_note(zb_data)/Machine Learning
스터디노트 (자연어 처리)
KloudHyun
2023. 10. 4. 00:02
📌 konlpy (한국어 자연어 처리)
-> Kkma, Hannanum, Okt 등의 패키지로 자연어 분석 처리가 가능하다.
→ 각각 처리하는 방식이 약간씩 차이가 있음
🔻Kkma
→ sentences, nouns, pos
from konlpy.tag import Kkma
kkma = Kkma()
# 문장 추출
kkma.sentences('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어 분석을 시작합니다', '재미있어요~~']
# 명사 추출
kkma.nouns('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석']
# 형태소 부착 (Tagging)
kkma.pos('한국어 분석을 시작합니다 재미있어요~~')
>>>>
[('한국어', 'NNG'),
('분석', 'NNG'),
('을', 'JKO'),
('시작하', 'VV'),
('ㅂ니다', 'EFN'),
('재미있', 'VA'),
('어요', 'EFN'),
('~~', 'SW')]
🔻Hannanum
→ nouns, morphs, pos
from konlpy.tag import Hannanum
hannanum = Hannanum()
# 명사 추출
hannanum.nouns('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '시작']
# 형태소 추출
hannanum.morphs('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '을', '시작', '하', 'ㅂ니다', '재미있', '어요', '~~']
# 품사 부착
hannanum.pos('한국어 분석을 시작합니다 재미있어요~~')
>>>>
[('한국어', 'N'),
('분석', 'N'),
('을', 'J'),
('시작', 'N'),
('하', 'X'),
('ㅂ니다', 'E'),
('재미있', 'P'),
('어요', 'E'),
('~~', 'S')]
🔻Okt
→ nouns, morphs, pos
from konlpy.tag import Okt
t = Okt()
# 명사 추출
t.nouns('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '시작']
# 형태소 추출
t.morphs('한국어 분석을 시작합니다 재미있어요~~')
>>>>
['한국어', '분석', '을', '시작', '합니다', '재미있어요', '~~']
# 품사 부착
t.pos('한국어 분석을 시작합니다 재미있어요~~')
>>>>
[('한국어', 'Noun'),
('분석', 'Noun'),
('을', 'Josa'),
('시작', 'Noun'),
('합니다', 'Verb'),
('재미있어요', 'Adjective'),
('~~', 'Punctuation')]
📌 Wordcloud
🔻워드클라우드 만들어보기
from wordcloud import WordCloud, STOPWORDS
import numpy as np
from PIL import Image
# Text of the Alice novel. The context manager closes the file handle as
# soon as the contents have been read (the bare open(...).read() left it open).
with open('../data/06_alice.txt') as novel_file:
    text = novel_file.read()
# Alice image (PNG) loaded as a NumPy array; it is used below as the
# word-cloud mask.
alice_mask = np.array(Image.open("../data/06_alice_mask.png"))
# Words excluded from the word cloud: the library's default STOPWORDS plus
# the extra word "said".
stopwords = set(STOPWORDS)
stopwords.add("said")
π»plotμΌλ‘ κ·Έλ €λ³΄μ
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
# Use the 'Malgun Gothic' font family for matplotlib text.
rc('font', family='Malgun Gothic')
plt.figure(figsize=(8, 8))
# Show the mask image (alice_mask.png) itself.
# `interpolation` decides how the picture is filled in between pixels.
plt.imshow(alice_mask, cmap='Greys_r', interpolation='bilinear')
# Word-cloud settings:
#   background_color -> white background
#   max_words        -> at most 2000 words
#   mask             -> the picture array
#   stopwords        -> words to exclude
# The cloud is configured and generated from `text` in one chained expression.
wc = WordCloud(
    background_color="white",
    max_words=2000,
    mask=alice_mask,
    stopwords=stopwords,
).generate(text)
# Alice 단어를 1로 기준, 다른 단어의 비중
wc.words_
>>>>
{'Alice': 1.0,
'little': 0.29508196721311475,
'one': 0.27595628415300544,
'know': 0.2459016393442623,
'went': 0.226775956284153,
.....
# Render the previously generated word cloud (wc) on a 12x12 figure
# with the axes hidden.
cloud_figsize = (12, 12)
plt.figure(figsize=cloud_figsize)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
📌 Wordcloud 2
# Read the a_new_hope text file. The context manager closes the file handle
# as soon as the contents have been read (the bare open(...).read() left it
# open).
with open('../data/06_a_new_hope.txt') as script_file:
    text = script_file.read()
# Word clean-up: rewrite 'HAN' as 'Han' and "LUKE's" as 'Luke'.
text = text.replace('HAN', 'Han')
text = text.replace("LUKE's", 'Luke')
# Picture (PNG) that serves as the word-cloud mask.
mask = np.array(Image.open("../data/06_stormtrooper_mask.png"))
# Stop words: the library defaults plus "int" and "ext"
# (presumably screenplay scene markers -- confirm against the text file).
stopwords = set(STOPWORDS)
stopwords.update(("int", "ext"))
# At most 1000 words; mask, stop words and a margin of 10.
wc = WordCloud(
    max_words=1000, mask=mask, stopwords=stopwords, margin=10
).generate(text)
# 워드클라우드 안에 들어가는 글자 색상 지정
# 워드클라우드 홈페이지에 코드가 게재되어있으므로, 그대로 사용
import random
def grey_color_func(
    word, font_size, position, orientation, random_state=None, **kwargs
):
    """Return a random grey HSL color string for one word.

    The signature matches how this file uses the function: as the
    ``color_func`` callback of ``wc.recolor``. Only ``random_state`` is
    read; the other arguments are accepted and ignored.

    Args:
        word, font_size, position, orientation: Ignored.
        random_state: Source of randomness. ``None`` uses the module-level
            ``random`` generator; an ``int`` seeds a fresh ``random.Random``;
            any other object is used as-is and must provide ``randint``.
        **kwargs: Ignored.

    Returns:
        A string ``"hsl(0, 0%, L%)"`` with lightness ``L`` drawn uniformly
        from 60..100 inclusive.
    """
    # Honor the caller-supplied random state so a seeded recolor is
    # reproducible; previously the global generator was always used and
    # random_state was silently ignored.
    if random_state is None:
        rng = random
    elif isinstance(random_state, int):
        rng = random.Random(random_state)
    else:
        rng = random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(60, 100)
plt.figure(figsize=(12, 12))
# wc.recolor re-adjusts the colors of the words in the word cloud; here it
# is called with grey_color_func as the color function and random_state=3.
recolored = wc.recolor(color_func=grey_color_func, random_state=3)
# Show the recolored cloud with the axes hidden.
plt.imshow(recolored, interpolation='bilinear')
plt.axis("off")
plt.show()