
Natural Language and Distributed Representations of Words

1. Statistics-Based Methods

1.1 Preprocessing a Corpus in Python

import numpy as np
text = 'You say goodbye and I say hello.'

# (1) Split the text into words
text = text.lower()
text = text.replace('.', ' .')
words = text.split(' ')
words
# Another way to split text into words: use the nltk package
# https://www.nltk.org/howto/tokenize.html

# import nltk
# nltk.download('punkt')

# text = 'You say goodbye and I say hello.'
# nltk.tokenize.word_tokenize(text)
# (2) Map each word to an id, and each id back to a word
word_to_id = {}
word_to_id = {}
id_to_word = {}

for word in words:
    if word not in word_to_id:
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word

word_to_id, id_to_word
({'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6},
 {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'})
# (3) Prepare the corpus (array of word IDs)
corpus = np.array([word_to_id[word] for word in words])
corpus
array([0, 1, 2, 3, 4, 1, 5, 6])
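Wrapping the three steps above into a single preprocess function: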
def preprocess(text):
    # (1) Split the text into words
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')

    # (2) Map each word to an id, and each id back to a word
    word_to_id = {}
    id_to_word = {}

    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # (3) Prepare the corpus (array of word IDs)
    corpus = np.array([word_to_id[word] for word in words])

    return corpus, word_to_id, id_to_word
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
corpus, word_to_id, id_to_word
(array([0, 1, 2, 3, 4, 1, 5, 6]),
 {'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6},
 {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'})
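
For reference, a minimal sketch of the same preprocessing built on the nltk tokenizer mentioned in the comments above (assuming nltk with its 'punkt' model is installed; preprocess_nltk is a hypothetical name, not from the book, and np is the NumPy import from earlier):

import nltk
# nltk.download('punkt')  # one-time download of the tokenizer model

def preprocess_nltk(text):
    # tokenize with nltk instead of lower()/replace()/split()
    words = [w.lower() for w in nltk.tokenize.word_tokenize(text)]
    word_to_id, id_to_word = {}, {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    corpus = np.array([word_to_id[w] for w in words])
    return corpus, word_to_id, id_to_word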

1.2 Distributed Representation of Words
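
A distributed representation encodes a word's meaning as a dense vector of fixed dimensionality, so that related words end up with similar vectors.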

1.3 The Distributional Hypothesis
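
The distributional hypothesis: the meaning of a word is formed by the words around it, that is, by its context.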

from IPython.display import Image
Image('./deep_learning_2_images/fig 2-3.png', width=500)


1.4 Co-occurrence Matrix

Image('./deep_learning_2_images/fig 2-4.png', width=320)


Image('./deep_learning_2_images/fig 2-5.png', width=400)


Image('./deep_learning_2_images/fig 2-6.png', width=400)


Image('./deep_learning_2_images/fig 2-7.png', width=400)


Implementing the co-occurrence matrix function

def create_co_matrix(corpus, vocab_size, window_size=1):
    '''Create a co-occurrence matrix.

    :param corpus: corpus (list of word IDs)
    :param vocab_size: vocabulary size
    :param window_size: window size (a window size of 1 means one word on each side of the target is part of its context)
    :return: co-occurrence matrix
    '''
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)

    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i

            if left_idx >= 0:
                left_word_id = corpus[left_idx]
                co_matrix[word_id, left_word_id] += 1

            if right_idx < corpus_size:
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id] += 1

    return co_matrix
corpus
array([0, 1, 2, 3, 4, 1, 5, 6])
word_to_id # vocabulary (the collection of unique words)
{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}
vocab_size = len(word_to_id) # 7
create_co_matrix(corpus, vocab_size, window_size=1)
array([[0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 1., 1., 0.],
       [0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0.]], dtype=float32)
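
Each row of this matrix is the co-occurrence vector of one word. For example, the row for 'you' has a single count, in the 'say' column:

create_co_matrix(corpus, vocab_size)[word_to_id['you']] # the vector for 'you'
array([0., 1., 0., 0., 0., 0., 0.], dtype=float32)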

1.5 Similarity Between Vectors

Cosine similarity

Image('./deep_learning_2_images/e 2-1.png', width=400)

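For reference, the formula in the figure (e 2-1) is cosine similarity:

$$\mathrm{similarity}(\mathbf{x}, \mathbf{y}) = \frac{\mathbf{x} \cdot \mathbf{y}}{\lVert \mathbf{x} \rVert \, \lVert \mathbf{y} \rVert} = \frac{x_1 y_1 + \cdots + x_n y_n}{\sqrt{x_1^2 + \cdots + x_n^2} \, \sqrt{y_1^2 + \cdots + y_n^2}}$$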

Implementing cosine similarity

def cos_similarity(x, y, eps=1e-8):
    '''Compute cosine similarity.

    :param x: vector
    :param y: vector
    :param eps: small value to avoid division by zero
    :return: cosine similarity of x and y
    '''
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)

Computing the similarity between “you” and “i”

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)

# word vector for 'you'
c0 = C[word_to_id['you']]
# word vector for 'i'
c1 = C[word_to_id['i']]
cos_similarity(c0, c1) # similarity between 'you' and 'i'
0.70710677

1.6 Ranking Similar Words

A function that prints the words most similar to a query, in descending order of similarity

def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    # 1. Retrieve the query word
    if query not in word_to_id:
        print('%s is not found.' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # 2. Compute the cosine similarity with every word
    vocab_size = len(id_to_word)

    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # 3. Print words in descending order of cosine similarity
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)

most_similar('you', word_to_id, id_to_word, C, top=5)
[query] you
 goodbye: 0.7071067690849304
 i: 0.7071067690849304
 hello: 0.7071067690849304
 say: 0.0
 and: 0.0

2. Improving Statistics-Based Methods

2.1 Mutual Information

Pointwise mutual information (PMI)

Image('./deep_learning_2_images/e 2-2.png', width=200)


Image('./deep_learning_2_images/e 2-3.png', width=400)


Positive pointwise mutual information (PPMI)

Image('./deep_learning_2_images/e 2-6.png', width=200)

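For reference, the formulas in the figures: writing $N$ for the total number of co-occurrences and $C(\cdot)$ for counts from the co-occurrence matrix (e 2-2, e 2-3),

$$\mathrm{PMI}(x, y) = \log_2 \frac{P(x, y)}{P(x)\,P(y)} = \log_2 \frac{C(x, y) \cdot N}{C(x)\,C(y)}$$

and PPMI clips negative values to zero (e 2-6):

$$\mathrm{PPMI}(x, y) = \max(0,\ \mathrm{PMI}(x, y))$$

A worked example with hypothetical counts: if $N = 10000$, 'the' occurs 1000 times, 'car' occurs 20 times, and the pair co-occurs 10 times, then $\mathrm{PMI}(\text{the}, \text{car}) = \log_2 \frac{10 \cdot 10000}{1000 \cdot 20} = \log_2 5 \approx 2.32$.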

A function that converts a co-occurrence matrix into a PPMI matrix

def ppmi(C, verbose=False, eps=1e-8):
    '''Create a PPMI (positive pointwise mutual information) matrix.

    :param C: co-occurrence matrix
    :param verbose: whether to print progress
    :param eps: small value to avoid log2(0)
    :return: PPMI matrix
    '''
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[j]*S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % (total//100 + 1) == 0:
                    print('%.1f%% done' % (100*cnt/total))
    return M
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)

np.set_printoptions(precision=3)  # show three significant digits
print('Co-occurrence matrix')
print(C)
print('-'*50)
print('PPMI')
print(W)
Co-occurrence matrix
[[0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 1. 1. 0.]
 [0. 1. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0.]]
--------------------------------------------------
PPMI
[[0.    1.807 0.    0.    0.    0.    0.   ]
 [1.807 0.    0.807 0.    0.807 0.807 0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.   ]
 [0.    0.    1.807 0.    1.807 0.    0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.   ]
 [0.    0.807 0.    0.    0.    0.    2.807]
 [0.    0.    0.    0.    0.    2.807 0.   ]]
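
As a sanity check on one entry: here $N = 14$, $C(\text{you}, \text{say}) = 1$, and the marginal counts are $C(\text{you}) = 1$ and $C(\text{say}) = 4$, so

np.log2(1 * 14 / (1 * 4))  # log2(3.5) ≈ 1.807, matching W[0, 1] above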

2.2 Dimensionality Reduction

Image('./deep_learning_2_images/Fig 2-8.png', width=500)


Singular Value Decomposition (SVD)

Image('./deep_learning_2_images/e 2-7.png', width=100)


Image('./deep_learning_2_images/Fig 2-9.png', width=400)


Image('./deep_learning_2_images/Fig 2-10.png', width=500)

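A minimal NumPy sketch of the factorization itself: np.linalg.svd returns orthogonal matrices U and V (the latter as its transpose Vt) and the singular values S in descending order, and multiplying them back reconstructs the original matrix.

A = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
U, S, Vt = np.linalg.svd(A)
print(U.shape, S.shape, Vt.shape)      # (3, 3) (2,) (2, 2)

# rebuild A = U @ Sigma @ Vt by placing S on the diagonal of a (3, 2) matrix
Sigma = np.zeros_like(A)
np.fill_diagonal(Sigma, S)
print(np.allclose(A, U @ Sigma @ Vt))  # True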

2.3 Dimensionality Reduction with SVD

Applying SVD to the PPMI matrix

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(id_to_word)
C = create_co_matrix(corpus, vocab_size, window_size=1)
W = ppmi(C)

# SVD
U, S, V = np.linalg.svd(W)
np.set_printoptions(precision=3)  # show three significant digits
print(C[0]) # co-occurrence matrix
print(W[0]) # PPMI matrix
print(U[0]) # SVD
[0. 1. 0. 0. 0. 0. 0.]
[0.    1.807 0.    0.    0.    0.    0.   ]
[-3.409e-01 -1.110e-16 -3.886e-16 -1.205e-01  0.000e+00  9.323e-01
  2.664e-16]
print(U[0, :2])
[-3.409e-01 -1.110e-16]
import matplotlib.pyplot as plt
plt.figure(figsize=(8,4))
for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
plt.scatter(U[:,0], U[:,1], alpha=0.5)
plt.show()


2.4 The PTB Dataset

Image('./deep_learning_2_images/Fig 2-12.png', width=600)


Using the PTB dataset

from dataset import ptb

corpus, word_to_id, id_to_word = ptb.load_data('train')

print('corpus size:', len(corpus))
print('corpus[:30]:', corpus[:30])
print()
print('id_to_word[0]:', id_to_word[0])
print('id_to_word[1]:', id_to_word[1])
print('id_to_word[2]:', id_to_word[2])
print()
print("word_to_id['car']:", word_to_id['car'])
print("word_to_id['happy']:", word_to_id['happy'])
print("word_to_id['lexus']:", word_to_id['lexus'])
corpus size: 929589
corpus[:30]: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]

id_to_word[0]: aer
id_to_word[1]: banknote
id_to_word[2]: berlitz

word_to_id['car']: 3856
word_to_id['happy']: 4428
word_to_id['lexus']: 7426
len(word_to_id)
10000

2.5 Evaluation on the PTB Dataset

import numpy as np
from dataset import ptb

window_size = 2
wordvec_size = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('computing the co-occurrence matrix ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('computing PPMI ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')
try:
    # Truncated SVD (fast!)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)
except ImportError:
    # full SVD (slow)
    U, S, V = np.linalg.svd(W)

computing the co-occurrence matrix ...
computing PPMI ...
1.0% done
2.0% done
...
99.0% done
calculating SVD ...
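
Note: sklearn's randomized_svd computes only the top n_components singular vectors by random projection, which is far faster than the full np.linalg.svd on a 10000×10000 matrix; because random_state=None, the resulting similarities can vary slightly between runs.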
word_vecs = U[:, :wordvec_size]

queries = ['you', 'year', 'car', 'toyota']
for query in queries:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
[query] you
 i: 0.6588214039802551
 we: 0.6168168783187866
 do: 0.5753165483474731
 've: 0.503013014793396
 someone: 0.5021402835845947

[query] year
 month: 0.6689368486404419
 earlier: 0.6434653997421265
 june: 0.6115486025810242
 quarter: 0.592315137386322
 last: 0.5841507315635681

[query] car
 auto: 0.6716699600219727
 luxury: 0.623845100402832
 vehicle: 0.5700278878211975
 cars: 0.499309778213501
 truck: 0.49552878737449646

[query] toyota
 motor: 0.7331503033638
 motors: 0.6686433553695679
 lexus: 0.6316478252410889
 honda: 0.6249175667762756
 mazda: 0.6197052001953125
queries = ['child', 'school']
for query in queries:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
[query] child
 youngest: 0.45422083139419556
 freshman: 0.44593486189842224
 disabled: 0.398049533367157
 confronted: 0.39624708890914917
 care: 0.3911650776863098

[query] school
 university: 0.6808527708053589
 hopkins: 0.6793955564498901
 graduate: 0.6584905385971069
 johns: 0.613092303276062
 harvard: 0.5787250995635986
word_vecs.shape
(10000, 100)
word_to_id['school']
2051
word_vecs[2051] # the distributed representation of the word 'school'
array([ 0.017,  0.002,  0.012,  0.005, -0.022,  0.005,  0.012, -0.011,
       -0.032,  0.002, -0.019,  0.008,  0.015,  0.001,  0.006,  0.053,
       -0.002, -0.015, -0.019, -0.014,  0.002,  0.005, -0.018, -0.03 ,
        0.019, -0.029,  0.005, -0.018,  0.005,  0.002, -0.038, -0.   ,
        0.024,  0.004,  0.033,  0.01 ,  0.001, -0.015, -0.012,  0.001,
       -0.01 ,  0.026,  0.011, -0.003,  0.029,  0.005, -0.027, -0.002,
       -0.015, -0.053, -0.022,  0.   ,  0.036,  0.035,  0.006,  0.009,
       -0.022,  0.013, -0.004, -0.012,  0.014, -0.018, -0.048, -0.023,
       -0.008, -0.031,  0.011, -0.001, -0.016, -0.046, -0.027, -0.027,
       -0.016, -0.016, -0.018,  0.019, -0.018,  0.011, -0.004, -0.001,
       -0.007, -0.002,  0.019,  0.013,  0.053, -0.008,  0.014, -0.02 ,
       -0.041,  0.003,  0.021, -0.053, -0.027, -0.013,  0.038, -0.008,
       -0.003, -0.011,  0.018,  0.003], dtype=float32)
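
The rankings from most_similar can be reproduced directly with cos_similarity; for example, 'school' vs. 'university' should return the ≈0.68 value listed above:

cos_similarity(word_vecs[word_to_id['school']],
               word_vecs[word_to_id['university']])  # ≈ 0.68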
