RNN 2 (Natural Language and Distributed Representations of Words)
Natural Language and Distributed Representations of Words
1. Statistics-Based Methods
1.1 Preprocessing a Corpus in Python
import numpy as np
text = 'You say goodbye and I say hello.'
# (1) Split the text into words
text = text.lower()
text = text.replace('.', ' .')
words = text.split(' ')
words
# Another way to split text into words: the nltk package
# https://www.nltk.org/howto/tokenize.html
# import nltk
# nltk.download('punkt')
# text = 'You say goodbye and I say hello.'
# nltk.tokenize.word_tokenize(text)
# (2) Map each word to an id, and each id back to its word
word_to_id = {}
id_to_word = {}
for word in words:
    if word not in word_to_id:
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word
word_to_id, id_to_word
({'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6},
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'})
# (3) Build the corpus (an array of word ids)
corpus = np.array([word_to_id[word] for word in words])
corpus
array([0, 1, 2, 3, 4, 1, 5, 6])
def preprocess(text):
    # (1) Split the text into words
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')
    # (2) Map each word to an id, and each id back to its word
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    # (3) Build the corpus (an array of word ids)
    corpus = np.array([word_to_id[word] for word in words])
    return corpus, word_to_id, id_to_word
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
corpus, word_to_id, id_to_word
(array([0, 1, 2, 3, 4, 1, 5, 6]),
{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6},
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'})
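As a quick round-trip check (a sketch, not in the original notebook), mapping the ids in corpus back through id_to_word recovers the tokenized sentence:
' '.join(id_to_word[i] for i in corpus)  # -> 'you say goodbye and i say hello .'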
1.2 Distributed Representations of Words
A distributed representation expresses each word as a dense, fixed-length vector of real numbers, so that related words end up with similar vectors.
1.3 The Distributional Hypothesis
The distributional hypothesis states that the meaning of a word is formed by its context, i.e. the words surrounding it.
from IPython.display import Image
Image('./deep_learning_2_images/fig 2-3.png', width=500)
1.4 Co-occurrence Matrix
Image('./deep_learning_2_images/fig 2-4.png', width=320)
Image('./deep_learning_2_images/fig 2-5.png', width=400)
Image('./deep_learning_2_images/fig 2-6.png', width=400)
Image('./deep_learning_2_images/fig 2-7.png', width=400)
Implementing a function that builds the co-occurrence matrix
def create_co_matrix(corpus, vocab_size, window_size=1):
    '''Create a co-occurrence matrix.
    :param corpus: corpus (list of word ids)
    :param vocab_size: vocabulary size
    :param window_size: window size (a window size of 1 means one word on each side of the target counts as its context)
    :return: co-occurrence matrix
    '''
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)
    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i
            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1
            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1
    return co_matrix
corpus
array([0, 1, 2, 3, 4, 1, 5, 6])
word_to_id  # vocabulary (the set of unique words)
{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
vocab_size = len(word_to_id) # 7
create_co_matrix(corpus, vocab_size, window_size=1)
array([[0., 1., 0., 0., 0., 0., 0.],
[1., 0., 1., 0., 1., 1., 0.],
[0., 1., 0., 1., 0., 0., 0.],
[0., 0., 1., 0., 1., 0., 0.],
[0., 1., 0., 1., 0., 0., 0.],
[0., 1., 0., 0., 0., 0., 1.],
[0., 0., 0., 0., 0., 1., 0.]], dtype=float32)
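Since the window is symmetric, every co-occurrence is counted from both sides, so the matrix should equal its own transpose. A quick sanity check (not in the original):
C = create_co_matrix(corpus, vocab_size, window_size=1)
assert (C == C.T).all()  # a symmetric window yields a symmetric matrix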
1.5 Similarity Between Word Vectors
Cosine similarity
Image('./deep_learning_2_images/e 2-1.png', width=400)
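The equation in the figure is the cosine similarity:
$$\mathrm{similarity}(\mathbf{x}, \mathbf{y}) = \frac{\mathbf{x} \cdot \mathbf{y}}{\lVert \mathbf{x} \rVert \, \lVert \mathbf{y} \rVert} = \frac{x_1 y_1 + \cdots + x_n y_n}{\sqrt{x_1^2 + \cdots + x_n^2} \, \sqrt{y_1^2 + \cdots + y_n^2}}$$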
Implementing cosine similarity
def cos_similarity(x, y, eps=1e-8):
    '''Compute the cosine similarity of two vectors.
    :param x: vector
    :param y: vector
    :param eps: small value that prevents division by zero
    :return: cosine similarity of x and y
    '''
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)
Computing the similarity between "you" and "i"
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)
# word vector for 'you'
c0 = C[word_to_id['you']]
# word vector for 'i'
c1 = C[word_to_id['i']]
cos_similarity(c0, c1)  # similarity between 'you' and 'i'
0.70710677
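The value is exactly 1/√2 ≈ 0.7071: in the co-occurrence matrix above, c0 = [0, 1, 0, 0, 0, 0, 0] and c1 = [0, 1, 0, 1, 0, 0, 0], so their dot product is 1 while their norms are 1 and √2.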
1.6 Ranking Similar Words
A function that prints the words most similar to a query, in descending order of similarity
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    # 1. Look up the query word
    if query not in word_to_id:
        print('%s is not found.' % query)
        return
    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]
    # 2. Compute the cosine similarity with every word
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)
    # 3. Print the words in descending order of similarity
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        count += 1
        if count >= top:
            return
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)
most_similar('you', word_to_id, id_to_word, C, top=5)
[query] you
goodbye: 0.7071067690849304
i: 0.7071067690849304
hello: 0.7071067690849304
say: 0.0
and: 0.0
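The loop over the vocabulary is fine for seven words, but for a large vocabulary all similarities can be computed at once with matrix operations. A vectorized sketch (most_similar_fast is a hypothetical helper, not from the book):
def most_similar_fast(query, word_to_id, id_to_word, word_matrix, top=5):
    # Normalize every row once; a single matrix-vector product then
    # yields the cosine similarity of the query with every word.
    norms = np.linalg.norm(word_matrix, axis=1, keepdims=True) + 1e-8
    normalized = word_matrix / norms
    similarity = normalized @ normalized[word_to_id[query]]
    count = 0
    for i in (-similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))
        count += 1
        if count >= top:
            return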
2. Improving Statistics-Based Methods
2.1 Mutual Information
Pointwise mutual information (PMI)
Image('./deep_learning_2_images/e 2-2.png', width=200)
Image('./deep_learning_2_images/e 2-3.png', width=400)
Positive pointwise mutual information (PPMI)
Image('./deep_learning_2_images/e 2-6.png', width=200)
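In symbols, with N the total number of co-occurrences and C(x) the count of word x, the equations above are:
$$\mathrm{PMI}(x, y) = \log_2 \frac{P(x, y)}{P(x)\,P(y)} = \log_2 \frac{C(x, y) \cdot N}{C(x)\,C(y)}, \qquad \mathrm{PPMI}(x, y) = \max\bigl(0, \mathrm{PMI}(x, y)\bigr)$$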
A function that converts a co-occurrence matrix into a PPMI matrix
def ppmi(C, verbose=False, eps=1e-8):
    '''Create a PPMI (positive pointwise mutual information) matrix.
    :param C: co-occurrence matrix
    :param verbose: whether to print progress
    :param eps: small value that keeps log2 away from zero
    :return: PPMI matrix
    '''
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)           # total number of co-occurrences
    S = np.sum(C, axis=0)   # co-occurrence count of each word
    total = C.shape[0] * C.shape[1]
    cnt = 0
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)
            if verbose:
                cnt += 1
                if cnt % (total // 100 + 1) == 0:
                    print('%.1f%% done' % (100 * cnt / total))
    return M
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)
np.set_printoptions(precision=3)  # show three significant digits
print('Co-occurrence matrix')
print(C)
print('-'*50)
print('PPMI')
print(W)
Co-occurrence matrix
[[0. 1. 0. 0. 0. 0. 0.]
[1. 0. 1. 0. 1. 1. 0.]
[0. 1. 0. 1. 0. 0. 0.]
[0. 0. 1. 0. 1. 0. 0.]
[0. 1. 0. 1. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 1. 0.]]
--------------------------------------------------
PPMI
[[0. 1.807 0. 0. 0. 0. 0. ]
[1.807 0. 0.807 0. 0.807 0.807 0. ]
[0. 0.807 0. 1.807 0. 0. 0. ]
[0. 0. 1.807 0. 1.807 0. 0. ]
[0. 0.807 0. 1.807 0. 0. 0. ]
[0. 0.807 0. 0. 0. 0. 2.807]
[0. 0. 0. 0. 0. 2.807 0. ]]
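For a realistic corpus most entries of the PPMI matrix are zero, and its size grows with the square of the vocabulary; this sparsity is what motivates the dimensionality reduction below. A quick check (not in the original):
np.count_nonzero(W) / W.size  # fraction of nonzero entries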
2.2 Dimensionality Reduction
Image('./deep_learning_2_images/Fig 2-8.png', width=500)
Singular Value Decomposition (SVD)
Image('./deep_learning_2_images/e 2-7.png', width=100)
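The equation above factors an arbitrary matrix X into three matrices:
$$X = U S V^{\mathsf{T}}$$
where U and V are orthogonal and S is a diagonal matrix of singular values; keeping only the leading columns of U gives the reduced word vectors.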
Image('./deep_learning_2_images/Fig 2-9.png', width=400)
Image('./deep_learning_2_images/Fig 2-10.png', width=500)
2.3 Dimensionality Reduction with SVD
Code that applies SVD to the PPMI matrix
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(id_to_word)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)
# SVD
U, S, V = np.linalg.svd(W)
np.set_printoptions(precision=3)  # show three significant digits
print(C[0])  # co-occurrence matrix (row for 'you')
print(W[0])  # PPMI matrix (row for 'you')
print(U[0])  # dense word vector after SVD (row for 'you')
[0. 1. 0. 0. 0. 0. 0.]
[0. 1.807 0. 0. 0. 0. 0. ]
[-3.409e-01 -1.110e-16 -3.886e-16 -1.205e-01 0.000e+00 9.323e-01
2.664e-16]
print(U[0, :2])
[-3.409e-01 -1.110e-16]
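As a sanity check (not in the book's code), the three factors should reconstruct W up to floating-point error. Note that np.linalg.svd returns the third factor already transposed:
np.allclose(U @ np.diag(S) @ V, W, atol=1e-5)  # True: W == U S V^T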
import matplotlib.pyplot as plt
plt.figure(figsize=(8,4))
for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
plt.scatter(U[:,0], U[:,1], alpha=0.5)
plt.show()
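In the resulting scatter plot, 'goodbye' and 'hello' land close together, as do 'you' and 'i' (exact positions, and even signs, can differ between SVD implementations).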
2.4 The PTB Dataset
Image('./deep_learning_2_images/Fig 2-12.png', width=600)
Code that loads the PTB dataset
from dataset import ptb  # ptb.py ships with the book's source code (dataset directory)
corpus, word_to_id, id_to_word = ptb.load_data('train')
print('corpus size:', len(corpus))
print('corpus[:30]:', corpus[:30])
print()
print('id_to_word[0]:', id_to_word[0])
print('id_to_word[1]:', id_to_word[1])
print('id_to_word[2]:', id_to_word[2])
print()
print("word_to_id['car']:", word_to_id['car'])
print("word_to_id['happy']:", word_to_id['happy'])
print("word_to_id['lexus']:", word_to_id['lexus'])
corpus size: 929589
corpus[:30]: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29]
id_to_word[0]: aer
id_to_word[1]: banknote
id_to_word[2]: berlitz
word_to_id['car']: 3856
word_to_id['happy']: 4428
word_to_id['lexus']: 7426
len(word_to_id)
10000
2.5 Evaluating on the PTB Dataset
import numpy as np
from dataset import ptb
window_size = 2
wordvec_size = 100
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('counting co-occurrences ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('computing PPMI ...')
W = ppmi(C, verbose=True)
print('calculating SVD ...')
try:
    # Truncated SVD (fast!)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)
except ImportError:
    # full SVD (slow)
    U, S, V = np.linalg.svd(W)
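randomized_svd approximates only the top n_components singular vectors by random sampling, which is why it is dramatically faster than a full SVD of a 10,000 x 10,000 matrix; in exchange, its results can vary from run to run.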
counting co-occurrences ...
computing PPMI ...
1.0% done
2.0% done
...
99.0% done
calculating SVD ...
word_vecs = U[:, :wordvec_size]
queries = ['you', 'year', 'car', 'toyota']
for query in queries:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
[query] you
i: 0.6588214039802551
we: 0.6168168783187866
do: 0.5753165483474731
've: 0.503013014793396
someone: 0.5021402835845947
[query] year
month: 0.6689368486404419
earlier: 0.6434653997421265
june: 0.6115486025810242
quarter: 0.592315137386322
last: 0.5841507315635681
[query] car
auto: 0.6716699600219727
luxury: 0.623845100402832
vehicle: 0.5700278878211975
cars: 0.499309778213501
truck: 0.49552878737449646
[query] toyota
motor: 0.7331503033638
motors: 0.6686433553695679
lexus: 0.6316478252410889
honda: 0.6249175667762756
mazda: 0.6197052001953125
queries = ['child', 'school']
for query in queries:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
[query] child
youngest: 0.45422083139419556
freshman: 0.44593486189842224
disabled: 0.398049533367157
confronted: 0.39624708890914917
care: 0.3911650776863098
[query] school
university: 0.6808527708053589
hopkins: 0.6793955564498901
graduate: 0.6584905385971069
johns: 0.613092303276062
harvard: 0.5787250995635986
word_vecs.shape
(10000, 100)
word_to_id['school']
2051
word_vecs[2051]  # the distributed representation of the word 'school'
array([ 0.017, 0.002, 0.012, 0.005, -0.022, 0.005, 0.012, -0.011,
-0.032, 0.002, -0.019, 0.008, 0.015, 0.001, 0.006, 0.053,
-0.002, -0.015, -0.019, -0.014, 0.002, 0.005, -0.018, -0.03 ,
0.019, -0.029, 0.005, -0.018, 0.005, 0.002, -0.038, -0. ,
0.024, 0.004, 0.033, 0.01 , 0.001, -0.015, -0.012, 0.001,
-0.01 , 0.026, 0.011, -0.003, 0.029, 0.005, -0.027, -0.002,
-0.015, -0.053, -0.022, 0. , 0.036, 0.035, 0.006, 0.009,
-0.022, 0.013, -0.004, -0.012, 0.014, -0.018, -0.048, -0.023,
-0.008, -0.031, 0.011, -0.001, -0.016, -0.046, -0.027, -0.027,
-0.016, -0.016, -0.018, 0.019, -0.018, 0.011, -0.004, -0.001,
-0.007, -0.002, 0.019, 0.013, 0.053, -0.008, 0.014, -0.02 ,
-0.041, 0.003, 0.021, -0.053, -0.027, -0.013, 0.038, -0.008,
-0.003, -0.011, 0.018, 0.003], dtype=float32)
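Any two of these rows can be compared directly with the cos_similarity function defined earlier, e.g. (a quick sketch reusing the variables from above):
cos_similarity(word_vecs[word_to_id['car']], word_vecs[word_to_id['auto']])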