4 minute read

CBOW with PTB Dataset

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
device(type='cuda')

1. Download the data

!mkdir ptb_dataset
!wget https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt -P ./ptb_dataset
--2023-04-24 03:38:56--  https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5101618 (4.9M) [text/plain]
Saving to: ‘./ptb_dataset/ptb.train.txt’

ptb.train.txt       100%[===================>]   4.87M  --.-KB/s    in 0.08s   

2023-04-24 03:38:56 (62.6 MB/s) - ‘./ptb_dataset/ptb.train.txt’ saved [5101618/5101618]

2. Load the data

dataset_dir = './ptb_dataset/'
train_file_name = 'ptb.train.txt'
def create_contexts_target(corpus, window_size=1):
    target = corpus[window_size:-window_size]
    contexts = []

    for idx in range(window_size, len(corpus)-window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue
            cs.append(corpus[idx + t])
        contexts.append(cs)

    return np.array(contexts), np.array(target)
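
As a quick sanity check (not part of the original notebook), a toy corpus of five word ids with window_size=1 shows how the context windows and targets line up:

toy_corpus = np.array([0, 1, 2, 3, 4])  # hypothetical 5-word corpus
toy_contexts, toy_target = create_contexts_target(toy_corpus, window_size=1)
print(toy_contexts)  # [[0 2] [1 3] [2 4]] -- each center word's left and right neighbor
print(toy_target)    # [1 2 3] -- the center words themselves
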
class PTBDataset(Dataset):
    def __init__(self, file_path, window_size):
        self.file_path = file_path
        self.word_to_id, self.id_to_word, self.words = self.load_vocab()

        corpus = np.array([self.word_to_id[w] for w in self.words])
        print('corpus size :', len(corpus))

        self.contexts, self.target = create_contexts_target(corpus, window_size)
        print('contexts.shape:', self.contexts.shape, 'target.shape:', self.target.shape)


    def load_vocab(self):
        words = open(self.file_path).read().replace('\n', '<eos>').strip().split()
        word_to_id = {}
        id_to_word = {}

        for i, word in enumerate(words):
            if word not in word_to_id:
                new_id = len(word_to_id)
                word_to_id[word] = new_id
                id_to_word[new_id] = word
        print('vocab size:', len(id_to_word))        
        return word_to_id, id_to_word, words

    def __len__(self):
        return len(self.target)

    def __getitem__(self, index):
        return self.contexts[index], self.target[index]

Preparation for measuring word similarity after training

dataset_dir = './ptb_dataset/'
train_file_name = 'ptb.train.txt'
file_path = dataset_dir + train_file_name
words = open(file_path).read().replace('\n', '<eos>').strip().split()
words # the corpus split into a list of individual words
word_to_id = {}
id_to_word = {}

for i, word in enumerate(words):
    if word not in word_to_id:
        new_id = len(word_to_id)
        word_to_id[word] = new_id
        id_to_word[new_id] = word
print('corpus size :', len(words))        
print('vocab size : ', len(id_to_word))        
corpus size : 929589
vocab size :  10000
def load_vocab():
    words = open(file_path).read().replace('\n', '<eos>').strip().split()
    word_to_id = {}
    id_to_word = {}

    for i, word in enumerate(words):
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    # print('corpus size :', len(words))        
    print('vocab size : ', len(id_to_word))       

    return word_to_id, id_to_word
word_to_id, id_to_word = load_vocab()
vocab size :  10000
window_size = 5
dataset = PTBDataset(file_path, window_size)
vocab size: 10000
corpus size : 929589
contexts.shape: (929579, 10) target.shape: (929579,)
len(dataset)
929579
dataset[100] # contexts, target
(array([76, 77, 64, 78, 79, 27, 28, 81, 82, 83]), 80)
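
Mapping the ids back through id_to_word makes the sample easier to read (variable names here are mine; the actual words depend on the corpus order):

contexts_100, target_100 = dataset[100]
print([dataset.id_to_word[i] for i in contexts_100])  # the 10 surrounding words
print(dataset.id_to_word[target_100])                 # the center word to predict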

3. Load the data with a DataLoader

batch_size=100
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
contexts, target = next(iter(dataloader))
contexts.size(), target.size()
(torch.Size([100, 10]), torch.Size([100]))
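
The default collate function turns the NumPy arrays from __getitem__ into integer tensors, which is exactly what nn.Embedding expects as input (a quick check, not in the original post):

contexts.dtype, target.dtype  # typically (torch.int64, torch.int64) on a 64-bit platform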

4. Build the model

vocab_size = 10000
hidden_size = 100
embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = hidden_size)
emb_out = embedding(contexts)
emb_out.shape
torch.Size([100, 10, 100])
h_mean = emb_out.mean(dim=1)
h_mean.shape 
torch.Size([100, 100])
class CBOW_Model(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(CBOW_Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = hidden_size)
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)


    def forward(self, inputs): # input (batch_size, context_size)
        h = self.embedding(inputs) # h (batch_size, context_size, hidden_size)
        h_mean = h.mean(dim=1) # h_mean (batch_size, hidden_size)
        out = self.linear(h_mean) # out (batch_size, vocab_size)
        return out
vocab_size = 10000
hidden_size = 100

model = CBOW_Model(vocab_size=vocab_size, hidden_size=hidden_size)
model.to(device)
CBOW_Model(
  (embedding): Embedding(10000, 100)
  (linear): Linear(in_features=100, out_features=10000, bias=True)
)
contexts, target = contexts.to(device), target.to(device)
out = model(contexts)
out.shape, out.dtype
(torch.Size([100, 10000]), torch.float32)
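
The model only has the embedding matrix and the output linear layer, so the parameter count is easy to verify by hand (an illustrative check, not in the original post):

sum(p.numel() for p in model.parameters())  # 10000*100 + 100*10000 + 10000 = 2,010,000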

5. Configure the model (loss function and optimizer)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
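
nn.CrossEntropyLoss consumes the raw logits and the integer targets directly; as an illustrative check (not in the original post), the untrained model should score close to ln(vocab_size) ≈ 9.21 on the batch prepared above:

with torch.no_grad():
    init_loss = loss_fn(model(contexts), target)
init_loss.item()  # roughly ln(10000) ≈ 9.21 for randomly initialized weights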

6. Train the model

def train_loop(model, trainloader, loss_fn, epochs, optimizer):  
    # placeholders for an early-stopping scheme that this loop never actually uses
    min_loss = 1000000
    trigger = 0
    patience = 4

    for epoch in range(epochs):
        model.train()
        train_loss = []

        for batch_data in trainloader:
            contexts = batch_data[0].to(device)
            target = batch_data[1].to(device)

            optimizer.zero_grad()
            outputs = model(contexts)
            loss = loss_fn(outputs, target)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        train_epoch_loss = np.mean(train_loss)
        total_loss["train"].append(train_epoch_loss)  # total_loss is the global dict created right before train_loop is called

        print( "Epoch: {}/{}, Train Loss={:.5f}".format(                    
                    epoch + 1, epochs,
                    total_loss["train"][-1]))  
epochs = 15
total_loss = {"train": []}
%time train_loop(model, dataloader, loss_fn, epochs, optimizer)
Epoch: 1/15, Train Loss=5.86334
Epoch: 2/15, Train Loss=5.12299
Epoch: 3/15, Train Loss=4.84547
Epoch: 4/15, Train Loss=4.69440
Epoch: 5/15, Train Loss=4.60174
Epoch: 6/15, Train Loss=4.53866
Epoch: 7/15, Train Loss=4.49440
Epoch: 8/15, Train Loss=4.46302
Epoch: 9/15, Train Loss=4.43796
Epoch: 10/15, Train Loss=4.41853
Epoch: 11/15, Train Loss=4.40023
Epoch: 12/15, Train Loss=4.38645
Epoch: 13/15, Train Loss=4.37518
Epoch: 14/15, Train Loss=4.36635
Epoch: 15/15, Train Loss=4.35788
CPU times: user 5min 2s, sys: 8.68 s, total: 5min 11s
Wall time: 5min 12s
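
It is convenient to persist the trained weights before moving on; a minimal sketch, with file names of my own choosing:

torch.save(model.state_dict(), 'cbow_ptb.pt')                                       # full model state (hypothetical file name)
np.save('cbow_ptb_embeddings.npy', model.embedding.weight.detach().cpu().numpy())   # embedding matrix only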

7. Measure similarity

list(model.parameters())[0].shape
torch.Size([10000, 100])
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings.shape
torch.Size([10000, 100])
# detach : https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
embeddings = embeddings.cpu().detach().numpy()
embeddings
array([[-3.4201128 ,  1.0614903 , -3.2150905 , ...,  2.100835  ,
         1.3760728 ,  2.2108448 ],
       [ 0.46403676, -2.0405898 , -2.4745762 , ...,  0.32860556,
        -1.334212  , -0.22706386],
       [ 3.47059   , -0.6248425 ,  0.9439361 , ...,  3.689676  ,
        -0.8413572 , -0.39743933],
       ...,
       [ 1.8678907 ,  6.9253416 , -0.58181703, ...,  1.1381289 ,
        -3.7073238 , -0.4031666 ],
       [ 2.3469465 ,  3.015586  , -3.6032927 , ...,  1.1200864 ,
         2.8106098 , -4.200683  ],
       [ 0.34840223,  0.5831198 , -0.88251144, ..., -2.5415118 ,
        -3.3060513 ,  0.03533731]], dtype=float32)
def cos_similarity(x, y, eps=1e-8):
    '''Compute the cosine similarity of two vectors.

    :param x: vector
    :param y: vector
    :param eps: small value that guards against division by zero
    :return: cosine similarity of x and y
    '''
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)
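
A quick check on toy vectors (not in the original post) confirms the expected behavior: parallel vectors score about 1 and orthogonal vectors about 0.

cos_similarity(np.array([1.0, 1.0]), np.array([2.0, 2.0]))  # ≈ 1.0
cos_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # ≈ 0.0
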
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    # 1. Look up the query word
    if query not in word_to_id:
        print('%s is not in the vocabulary.' % query)
        return

    print('\n[query] ' + query)
    query_id = word_to_id[query]
    query_vec = word_matrix[query_id]

    # 2. Compute cosine similarity against every word vector
    vocab_size = len(id_to_word)

    similarity = np.zeros(vocab_size)
    for i in range(vocab_size):
        similarity[i] = cos_similarity(word_matrix[i], query_vec)

    # 3. Print results in descending order of similarity
    count = 0
    for i in (-1 * similarity).argsort():
        if id_to_word[i] == query:
            continue
        print(' %s: %s' % (id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return
word_vecs = embeddings

# Find the most similar words for each query
queries = ['you', 'year', 'car', 'toyota']
for query in queries:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)
[query] you
 we: 0.7654329538345337
 they: 0.7229254841804504
 i: 0.6785027384757996
 she: 0.4186348617076874
 he: 0.36235079169273376

[query] year
 week: 0.7872598171234131
 month: 0.7634924650192261
 summer: 0.5889633297920227
 decade: 0.5301210880279541
 spring: 0.5232934951782227

[query] car
 cars: 0.499886691570282
 clothing: 0.4101100265979767
 glass: 0.3856881260871887
 family: 0.3844160735607147
 cigarette: 0.38434258103370667

[query] toyota
 factory: 0.44822514057159424
 mitsubishi: 0.42368119955062866
 potatoes: 0.39004334807395935
 vehicles: 0.3864486813545227
 loan-loss: 0.38212400674819946
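
The per-word Python loop in most_similar can be replaced by a single matrix product over L2-normalized embeddings; a minimal sketch of that alternative (function name and details are mine, not from the original post):

def most_similar_vectorized(query, word_to_id, id_to_word, word_matrix, top=5):
    # normalize every row once, then rank all words by dot product with the query vector
    normed = word_matrix / (np.linalg.norm(word_matrix, axis=1, keepdims=True) + 1e-8)
    sims = normed @ normed[word_to_id[query]]
    for i in np.argsort(-sims)[1:top + 1]:  # the top-ranked entry is the query itself, so skip it
        print(' %s: %s' % (id_to_word[i], sims[i]))

most_similar_vectorized('you', word_to_id, id_to_word, word_vecs, top=5)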
