📌 Naive Bayes Classifier — sentiment analysis (English)
💻 Uses tokenization.
→ This is supervised learning, so the training data must include the correct answers (labels).
→ First, build the full corpus.
from nltk.tokenize import word_tokenize
import nltk
# 1. Take each sentence from the train data.
# 2. Tokenize (split) each sentence.
# 3. Wrapping the result in set() removes duplicates.
# Labelled training examples: (sentence, sentiment) pairs.
_sentences = ['i like you', 'i hate you', 'you like me', 'i like her']
_labels = ['pos', 'neg', 'neg', 'pos']
train = list(zip(_sentences, _labels))
# Vocabulary: every distinct lower-cased token across the training sentences.
all_words = {
    token.lower()
    for sentence, _label in train
    for token in word_tokenize(sentence)
}
all_words
>>>>
{'hate', 'her', 'i', 'like', 'me', 'you'}
๐ป๋จ์ด์ ์ ๋ฌด ํ์ (๋ง๋ญ์น ๋๋น)
# Featurize each training pair:
#   1. take (sentence, label) from train, e.g. ('i like you', 'pos')
#   2. tokenize the sentence
#   3. for every vocabulary word, record whether it occurs in the sentence
# Result shape: ({'i': True, 'hate': False, ...}, 'pos')
t = []
for sentence, label in train:
    sentence_tokens = word_tokenize(sentence)
    features = {word: (word in sentence_tokens) for word in all_words}
    t.append((features, label))
t
>>>>
[({'me': False,
'like': True,
'her': False,
'you': True,
'hate': False,
'i': True},
'pos'),
...
๐ปtrain!
# Train an NLTK NaiveBayesClassifier on the feature dicts built above,
# then inspect which features carry the most class signal.
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()
>>>>
Most Informative Features
hate = False pos : neg = 1.7 : 1.0
her = False neg : pos = 1.7 : 1.0
i = True pos : neg = 1.7 : 1.0
like = True pos : neg = 1.7 : 1.0
me = False pos : neg = 1.7 : 1.0
you = True neg : pos = 1.7 : 1.0
๐ป์ด์ ํ ์คํธ ๋ฐ์ดํฐ๋ฅผ ๋ฃ์ด๋ณด์
# Build the feature dict for a test sentence: for every vocabulary word,
# record whether it occurs in the (lower-cased) tokenized sentence.
test_sentence = 'i like MeRui'
# Hoisted: the original re-ran word_tokenize() once per vocabulary word;
# tokenize once and use a set for O(1) membership tests. Keys are the
# vocabulary words themselves — all_words is already lower-cased, so the
# original's word.lower() was a no-op.
test_tokens = set(word_tokenize(test_sentence.lower()))
test_sent_features = {word: (word in test_tokens) for word in all_words}
test_sent_features
>>>>
{'me': False, 'like': True, 'her': False, 'you': False, 'hate': False, 'i': True}
# Feed the feature dict built above into the trained classifier.
classifier.classify(test_sent_features)
>>>>
'pos'
๐ Naive Bayes Classifier ๊ฐ์ฑ ๋ถ์ (kor)
๐ปtokenize๋ฅผ ํ์ฉํ๋ค.
→ ํ๊ธ์ด๊ธฐ ๋๋ฌธ์ konlpy ํ์ฉ
from konlpy.tag import Okt
pos_tagger = Okt()
# Korean training data: (sentence, sentiment-label) pairs.
# NOTE(review): the third sentence's literal is split across two physical
# lines — this looks like an extraction/encoding artifact of the original
# post (the source text is mojibake'd throughout); verify against the
# original notebook before running.
train = [
("๋ฉ๋ฆฌ๊ฐ ์ข์", "pos"),
("๊ณ ์์ด๋ ์ข์", "pos"),
("๋ ์์
์ด ์ง๋ฃจํด", "neg"),
("๋ฉ๋ฆฌ๋ ์ด์ ๊ณ ์์ด์ผ", "pos"),
("๋ ๋ง์น๊ณ ๋ฉ๋ฆฌ๋ ๋๊ฑฐ์ผ", "pos"),
]
# Build the vocabulary with NLTK's word_tokenize (whitespace/punctuation
# based), NOT a Korean morphological analyzer — deliberately naive; the
# section further below shows this fails for Korean until Okt morphemes
# are used instead.
all_words = set(
word for sentence in train for word in word_tokenize(sentence[0])
)
all_words
>>>>
{'๊ณ ์์ด๋', '๊ณ ์์ด์ผ', '๋', '๋๊ฑฐ์ผ', '๋ง์น๊ณ ', '๋ฉ๋ฆฌ๊ฐ', '๋ฉ๋ฆฌ๋', '๋ฉ๋ฆฌ๋', '์์
์ด', '์ด์', '์ข์', '์ง๋ฃจํด'}
๐ป๋จ์ด์ ์ ๋ฌด ํ์
# Presence/absence features for each Korean training pair: one dict per
# sentence mapping every vocabulary word to whether it occurs there.
t = [
    ({word: (word in word_tokenize(sentence)) for word in all_words}, label)
    for sentence, label in train
]
t
>>>
[({'๋ฉ๋ฆฌ๊ฐ': True,
'๋ง์น๊ณ ': False,
'์ง๋ฃจํด': False,
'๋๊ฑฐ์ผ': False,
'๋ฉ๋ฆฌ๋': False,
'๋': False,
'์ด์': False,
'๋ฉ๋ฆฌ๋': False,
'์์
์ด': False,
'๊ณ ์์ด๋': False,
'์ข์': True,
'๊ณ ์์ด์ผ': False},
'pos'),
....
๐ปํน์ฑ ํ์
# Train Naive Bayes on the naively-tokenized Korean features and show
# the most informative ones.
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features()
>>>>
Most Informative Features
๋ = True neg : pos = 2.5 : 1.0
์ข์ = False neg : pos = 1.5 : 1.0
๊ณ ์์ด๋ = False neg : pos = 1.1 : 1.0
๊ณ ์์ด์ผ = False neg : pos = 1.1 : 1.0
๋๊ฑฐ์ผ = False neg : pos = 1.1 : 1.0
๋ง์น๊ณ = False neg : pos = 1.1 : 1.0
๋ฉ๋ฆฌ๊ฐ = False neg : pos = 1.1 : 1.0
๋ฉ๋ฆฌ๋ = False neg : pos = 1.1 : 1.0
๋ฉ๋ฆฌ๋ = False neg : pos = 1.1 : 1.0
์ด์ = False neg : pos = 1.1 : 1.0
💻 Check with a test sentence
→ Testing confirms that morphological analysis is essential to get accurate results for Korean.
# Featurize the Korean test sentence against the naive vocabulary.
# NOTE(review): the literal below is split across two lines — an
# extraction artifact, presumably a single sentence; verify.
# .lower() is a no-op for Hangul; the sentence is also re-tokenized once
# per vocabulary word (fine at this size, wasteful in general).
test_sentence = '๋ ์์
์ด ๋ง์น๋ฉด ๋ฉ๋ฆฌ๋ ๋๊ฑฐ์ผ'
test_sent_features = {
word.lower() : (word in word_tokenize(test_sentence.lower())) for word in all_words
}
test_sent_features
>>>>
{'๋ฉ๋ฆฌ๊ฐ': False, '๋ง์น๊ณ ': False, '์ง๋ฃจํด': False, '๋๊ฑฐ์ผ': True, '๋ฉ๋ฆฌ๋': False, '๋': True, '์ด์': False, '๋ฉ๋ฆฌ๋': True,
'์์
์ด': True, '๊ณ ์์ด๋': False, '์ข์': False, '๊ณ ์์ด์ผ': False}
# Predict sentiment for the featurized test sentence ('pos' or 'neg').
classifier.classify(test_sent_features)
>>>>
'neg'
💻 Retry after morphological analysis
→ After analysis, attach each word's part-of-speech tag to the word itself.
def tokenize(doc):
    """Morph-analyse *doc* with Okt and return 'surface/POS' strings.

    norm=True normalises spelling variants and stem=True reduces words
    to their stems, so inflected forms collapse to a single token.
    """
    return [f"{word}/{tag}" for word, tag in pos_tagger.pos(doc, norm=True, stem=True)]
# Tokenize every training sentence into 'word/Tag' morphemes, keeping labels.
train_docs = [(tokenize(sentence), label) for sentence, label in train]
train_docs
>>>>
[(['๋ฉ๋ฆฌ/Noun', '๊ฐ/Josa', '์ข๋ค/Adjective'], 'pos'),
(['๊ณ ์์ด/Noun', '๋/Josa', '์ข๋ค/Adjective'], 'pos'),
(['๋/Noun', '์์
/Noun', '์ด/Josa', '์ง๋ฃจํ๋ค/Adjective'], 'neg'),
(['๋ฉ๋ฆฌ/Noun', '๋/Josa', '์ด์๋ค/Adjective', '๊ณ ์์ด/Noun', '์ผ/Josa'], 'pos'),
(['๋/Noun', '๋ง์น/Noun', '๊ณ /Josa', '๋ฉ๋ฆฌ/Noun', '๋/Josa', '๋๋ค/Verb'], 'pos')]
# Flatten all tokenized training documents into one corpus token list
# (order preserved; duplicates kept).
tokens = []
for doc_words, _label in train_docs:
    tokens.extend(doc_words)
tokens
>>>>
['๋ฉ๋ฆฌ/Noun', '๊ฐ/Josa', '์ข๋ค/Adjective', '๊ณ ์์ด/Noun', '๋/Josa', '์ข๋ค/Adjective', '๋/Noun', '์์
/Noun', '์ด/Josa',
'์ง๋ฃจํ๋ค/Adjective', '๋ฉ๋ฆฌ/Noun', '๋/Josa', '์ด์๋ค/Adjective', '๊ณ ์์ด/Noun', '์ผ/Josa', '๋/Noun', '๋ง์น/Noun', '๊ณ /Josa',
'๋ฉ๋ฆฌ/Noun', '๋/Josa', '๋๋ค/Verb']
๐ป๋จ์ด์ ์ ๋ฌด ํ์
# Presence/absence featurizer shared by training and test featurization.
def term_exists(doc, vocab=None):
    """Return {token: token in *doc*} for every token in *vocab*.

    Args:
        doc: iterable of tokens making up one document.
        vocab: vocabulary to featurize against; defaults to the
            module-level ``tokens`` corpus list (backward compatible
            with the original single-argument form).

    Returns:
        dict mapping each vocabulary token to True/False presence.
    """
    if vocab is None:
        vocab = tokens
    # Hoisted: the original rebuilt set(doc) once per vocabulary word.
    present = set(doc)
    return {word: (word in present) for word in vocab}
# Featurize each (tokenized-document, label) pair against the corpus
# vocabulary via term_exists, producing NLTK-ready training tuples.
train_xy = [(term_exists(doc_words), label) for doc_words, label in train_docs]
train_xy
>>>>
[({'๋ฉ๋ฆฌ/Noun': True,
'๊ฐ/Josa': True,
'์ข๋ค/Adjective': True,
'๊ณ ์์ด/Noun': False,
'๋/Josa': False,
'๋/Noun': False,
'์์
/Noun': False,
'์ด/Josa': False,
'์ง๋ฃจํ๋ค/Adjective': False,
'๋/Josa': False,
'์ด์๋ค/Adjective': False,
'์ผ/Josa': False,
'๋ง์น/Noun': False,
'๊ณ /Josa': False,
'๋/Josa': False,
'๋๋ค/Verb': False},
'pos'),
....
๐ป์ฃผ์ ํน์ฑ ํ์
# Retrain Naive Bayes on morpheme-level ('word/Tag') features and show
# the most informative ones.
classifier = nltk.NaiveBayesClassifier.train(train_xy)
classifier.show_most_informative_features()
>>>>
Most Informative Features
๋/Noun = True neg : pos = 2.5 : 1.0
๋ฉ๋ฆฌ/Noun = False neg : pos = 2.5 : 1.0
๊ณ ์์ด/Noun = False neg : pos = 1.5 : 1.0
์ข๋ค/Adjective = False neg : pos = 1.5 : 1.0
๊ฐ/Josa = False neg : pos = 1.1 : 1.0
๊ณ /Josa = False neg : pos = 1.1 : 1.0
๋๋ค/Verb = False neg : pos = 1.1 : 1.0
๋/Josa = False neg : pos = 1.1 : 1.0
๋/Josa = False neg : pos = 1.1 : 1.0
๋/Josa = False neg : pos = 1.1 : 1.0
๐ปํ ์คํธ ๋ฌธ์ฅ ํ์ธํ๊ธฐ
# BUG FIX: the original called pos_tagger.pos() directly — producing raw
# (word, tag) tuples — and used the TEST tokens as the dict keys. The
# classifier was trained on 'word/Tag' string features keyed by the
# corpus vocabulary, so nothing ever matched (the pasted output shows
# every feature False) and the prediction carried no signal. Featurize
# the test sentence exactly like the training data instead.
# NOTE(review): the sentence literal was split across two lines by
# extraction and is rejoined here — verify against the source post.
test_sentence = '๋ ์์์ด ๋ง์น๋ฉด ๋ฉ๋ฆฌ๋ ๋๊ฑฐ์ผ'
test_docs = tokenize(test_sentence)
test_docs
test_sent_features = term_exists(test_docs)
test_sent_features
>>>>
{('๋', 'Noun'): False,
('์์
', 'Noun'): False,
('์ด', 'Josa'): False,
('๋ง์น', 'Noun'): False,
('๋ฉด', 'Josa'): False,
('๋ฉ๋ฆฌ', 'Noun'): False,
('๋', 'Josa'): False,
('๋๊ฑฐ์ผ', 'Verb'): False}
# Predict sentiment with the morpheme-trained model.
classifier.classify(test_sent_features)
>>>>
'pos'
※ Notes
→ Because Korean attaches many particles (josa) and endings to words, morphological analysis is required to get accurate results.
→ Many for-comprehensions appear here; the parts that were hard to follow need review.
'Study_note(zb_data) > Machine Learning' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
์คํฐ๋๋ ธํธ (TF-IDF ํ์ฉ) (0) | 2023.10.06 |
---|---|
์คํฐ๋๋ ธํธ (๋ฌธ์ฅ์ ์ ์ฌ๋ vectorize ํ์ฉํ๊ธฐ) (0) | 2023.10.06 |
์คํฐ๋๋ ธํธ (๋ฒ๋ น ๊ด๋ จ ๋ถ์) (1) | 2023.10.04 |
์คํฐ๋๋ ธํธ (์์ฐ์ด ์ฒ๋ฆฌ) (0) | 2023.10.04 |
์คํฐ๋๋ ธํธ (credit card data 3, 4) (1) | 2023.10.02 |