
Word-level language model using RNN

  • Reference: https://github.com/pytorch/examples/tree/main/word_language_model
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
device(type='cuda')

1. Download the data

!mkdir ptb_dataset
!wget https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt -P ./ptb_dataset
!wget https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.valid.txt -P ./ptb_dataset
!wget https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.test.txt -P ./ptb_dataset

2. Load the data

dataset_dir = './ptb_dataset/'
train_file_name = 'ptb.train.txt'
valid_file_name = 'ptb.valid.txt'
test_file_name = 'ptb.test.txt'
word_to_id = {}
id_to_word = {}
def load_vocab(data_type="train"):
    file_names = {'train': train_file_name, 'valid': valid_file_name, 'test': test_file_name}
    file_path = dataset_dir + file_names[data_type]
    words = open(file_path).read().replace('\n', '<eos>').strip().split()
    if data_type == 'train':
        # build the vocabulary from the training split only
        for word in words:
            if word not in word_to_id:
                tmp_id = len(word_to_id)
                word_to_id[word] = tmp_id
                id_to_word[tmp_id] = word
        print("vocab size : ", len(id_to_word))
    corpus = np.array([word_to_id[w] for w in words])
    print("corpus size : ", len(corpus))
    return corpus, word_to_id, id_to_word
corpus, word_to_id, id_to_word = load_vocab("train")
corpus_val, _, _ = load_vocab("valid")
corpus_test, _, _ = load_vocab("test")
vocab size :  10000
corpus size :  929589
corpus size :  73760
corpus size :  82430
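
As a quick sanity check (a minimal sketch, assuming the cells above have been run), we can inspect a few vocabulary entries and round-trip one word:

# first few ids with their words, plus a round trip for the <eos> token
print([(i, id_to_word[i]) for i in range(5)])
print(word_to_id['<eos>'], id_to_word[word_to_id['<eos>']])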

3. Data loading (batching)
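
Instead of a DataLoader, mini-batches are built manually with the get_batch function defined in section 6. As a minimal sketch of the underlying idea (xs_demo and ts_demo are throwaway names used only here), every token in the corpus is an input whose target is simply the next token:

xs_demo = corpus[:-1]   # inputs
ts_demo = corpus[1:]    # targets: the corpus shifted by one position
print([id_to_word[i] for i in xs_demo[:5]])
print([id_to_word[i] for i in ts_demo[:5]])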

4. Build the model

class SimpleRnnlm(nn.Module):
    def __init__(self, vocab_size, wordvec_size, hidden_size, num_layers):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.encoder = nn.Embedding(num_embeddings=vocab_size, embedding_dim = wordvec_size)
        self.rnn = nn.RNN(input_size=wordvec_size, hidden_size= hidden_size,
                          num_layers = num_layers, batch_first = True)
        self.decoder = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, inputs, hidden): # inputs :(batch_size, time_size)
        embedded = self.encoder(inputs) # embeddings : (batch_size, time_size, wordvec_size)
        outputs, hidden = self.rnn(embedded, hidden) # outputs : (batch_size, time_size, hidden_size)
                                                        # hidden : (num_layers , batch_size, hidden_size)
        decoded = self.decoder(outputs) # decoded : (batch_size, time_size, vocab_size)                                                  
        decoded = decoded.view(-1, self.vocab_size) # decoded : (batch_size*time_size, vocab_size)
        decoded = F.log_softmax(decoded, dim=1)

        return decoded, hidden
     

    def init_hidden(self, batch_size):
        # https://pytorch.org/docs/stable/generated/torch.Tensor.new_zeros.html
        # weight.new_zeros() creates the hidden state on the same device and with
        # the same dtype as the model weights
        weight = next(self.parameters())
        return weight.new_zeros(self.num_layers, batch_size, self.hidden_size)
vocab_size = 10000
wordvec_size = 100
hidden_size = 100 
num_layers = 1

model = SimpleRnnlm(vocab_size=vocab_size, wordvec_size=wordvec_size, 
                    hidden_size=hidden_size, num_layers=num_layers)
model.to(device)
SimpleRnnlm(
  (encoder): Embedding(10000, 100)
  (rnn): RNN(100, 100, batch_first=True)
  (decoder): Linear(in_features=100, out_features=10000, bias=True)
)
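
As a quick shape check (a minimal sketch with a made-up batch of size 2 and sequence length 3), a single forward pass should return (batch_size*time_size, vocab_size) log-probabilities and a (num_layers, batch_size, hidden_size) hidden state:

dummy_x = torch.randint(0, vocab_size, (2, 3)).to(device)  # (batch_size, time_size)
dummy_h = model.init_hidden(2)
out, h = model(dummy_x, dummy_h)
print(out.shape, h.shape)  # torch.Size([6, 10000]) torch.Size([1, 2, 100])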

5. Model configuration (loss function and optimizer)

lr = 0.001
loss_fn = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.25, 
                                                 patience=1, verbose=True)
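
NLLLoss is the matching criterion here because the model already applies log_softmax to its outputs; with raw logits one would use CrossEntropyLoss instead. A minimal sketch with made-up tensors showing that the two are equivalent (up to floating-point error):

logits = torch.randn(4, 10)                                # (N, C) raw scores
targets = torch.randint(0, 10, (4,))                       # (N,) class indices
print(F.cross_entropy(logits, targets))                    # CrossEntropyLoss on logits
print(F.nll_loss(F.log_softmax(logits, dim=1), targets))   # same value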

6. Train the model

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
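
A small sketch (with a throwaway tensor) of what repackage_hidden does: the values are kept, but the returned tensor no longer tracks gradient history, so backpropagation through time is truncated at each batch boundary:

h = torch.zeros(1, 2, requires_grad=True) + 1.0  # pretend hidden state with history
h2 = repackage_hidden(h)
print(h2.requires_grad, h2.grad_fn)              # False None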
def get_batch(xs, ts, batch_size, time_size):
    global time_idx   

    data_size = len(xs)
    jump = data_size // batch_size
    offsets = [i * jump for i in range(batch_size)] 

    batch_x = np.empty((batch_size, time_size), dtype=np.int64)
    batch_t = np.empty((batch_size, time_size), dtype=np.int64)
    for t in range(time_size):
        for i, offset in enumerate(offsets):
            batch_x[i, t] = xs[(offset + time_idx) % data_size]
            batch_t[i, t] = ts[(offset + time_idx) % data_size]
        time_idx += 1
    batch_x = torch.from_numpy(batch_x).to(device)
    batch_t = torch.from_numpy(batch_t).to(device)
    return batch_x, batch_t
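
A quick usage sketch of get_batch (bx and bt are throwaway names; time_idx is reset afterwards so the real training run below is unaffected):

time_idx = 0
bx, bt = get_batch(corpus[:-1], corpus[1:], batch_size=4, time_size=5)
print(bx.shape, bt.shape)  # torch.Size([4, 5]) torch.Size([4, 5])
time_idx = 0               # reset the global read position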
def eval_perplexity(model, corpus, batch_size=10, time_size=35):
    corpus_size = len(corpus)
    total_loss = 0
    max_iters = (corpus_size - 1) // (batch_size * time_size)
    jump = (corpus_size - 1) // batch_size

    hidden = model.init_hidden(batch_size)
    model.eval()
    with torch.no_grad():  # gradients are not needed for evaluation
        for iters in range(max_iters):
            xs = np.zeros((batch_size, time_size), dtype=np.int64)
            ts = np.zeros((batch_size, time_size), dtype=np.int64)
            time_offset = iters * time_size
            offsets = [time_offset + (i * jump) for i in range(batch_size)]
            for t in range(time_size):
                for i, offset in enumerate(offsets):
                    xs[i, t] = corpus[(offset + t) % corpus_size]
                    ts[i, t] = corpus[(offset + t + 1) % corpus_size]
            xs = torch.from_numpy(xs).to(device)
            ts = torch.from_numpy(ts).to(device)

            hidden = repackage_hidden(hidden)
            outputs, hidden = model(xs, hidden)
            loss = loss_fn(outputs, ts.view(-1))

            total_loss += loss.item()

    valid_epoch_loss = total_loss / max_iters
    ppl = np.exp(valid_epoch_loss)  # perplexity = exp(average negative log-likelihood)
    return ppl
batch_size = 10
time_size = 5 
xs = corpus[:-1]  # inputs
ts = corpus[1:]   # targets (the next word for each input)
data_size = len(xs)
max_iters = data_size // (batch_size * time_size)
time_idx = 0

def train_loop(model, loss_fn, epochs, optimizer):      
    ppl_list = []
    hidden = model.init_hidden(batch_size)

    for epoch in range(epochs):
        model.train()
        train_loss = []
        for it in range(max_iters):  # avoid shadowing the builtin iter
            batch_x, batch_t = get_batch(xs, ts, batch_size, time_size)
            optimizer.zero_grad()
            hidden = repackage_hidden(hidden)
            outputs, hidden = model(batch_x, hidden)
            loss = loss_fn(outputs, batch_t.view(-1))
            loss.backward()            
            optimizer.step()
            train_loss.append(loss.item())

        train_epoch_loss = np.mean(train_loss)
        ppl = np.exp(train_epoch_loss)
        ppl_list.append(float(ppl))

        # validation perplexity
        eval_ppl = eval_perplexity(model, corpus_val, batch_size=batch_size, time_size=time_size)

        print('| epoch %d | perplexity %.2f | valid perplexity %.2f'
                % (epoch+1, ppl, eval_ppl))
        
        scheduler.step(eval_ppl)
epochs = 15
%time train_loop(model, loss_fn, epochs, optimizer)
| epoch 1 | perplexity 280.96 | valid perplexity 233.11
| epoch 2 | perplexity 170.23 | valid perplexity 209.86
| epoch 3 | perplexity 141.56 | valid perplexity 201.83
| epoch 4 | perplexity 126.95 | valid perplexity 200.18
| epoch 5 | perplexity 116.64 | valid perplexity 200.70
| epoch 6 | perplexity 108.73 | valid perplexity 200.99
Epoch 00006: reducing learning rate of group 0 to 2.5000e-04.
| epoch 7 | perplexity 92.79 | valid perplexity 181.59
| epoch 8 | perplexity 88.51 | valid perplexity 180.36
| epoch 9 | perplexity 86.45 | valid perplexity 179.95
| epoch 10 | perplexity 84.89 | valid perplexity 180.22
| epoch 11 | perplexity 83.62 | valid perplexity 180.52
Epoch 00011: reducing learning rate of group 0 to 6.2500e-05.
| epoch 12 | perplexity 81.16 | valid perplexity 175.54
| epoch 13 | perplexity 80.32 | valid perplexity 175.28
| epoch 14 | perplexity 80.00 | valid perplexity 175.17
| epoch 15 | perplexity 79.71 | valid perplexity 175.10
CPU times: user 10min 27s, sys: 11.2 s, total: 10min 38s
Wall time: 10min 39s
# test_ppl = eval_perplexity(model, corpus_test, batch_size=batch_size, time_size=time_size )

7. Sentence generation experiment

class RnnlmGen(SimpleRnnlm):
    def generate(self, start_id, skip_ids=None, sample_size=100, hidden=None):
        word_ids = [start_id]
        x = start_id
        while len(word_ids) < sample_size:
            sample_x = torch.tensor(x).reshape(1, 1)
            hidden = repackage_hidden(hidden)
            log_p, hidden = self(sample_x, hidden)  # use this model's own weights, not the global model
            log_p = log_p.detach().numpy().flatten()
            p = np.exp(log_p)
            p /= p.sum()  # renormalize to guard against floating-point drift
            sampled = int(np.random.choice(len(p), p=p))
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(x)

        return word_ids
gen_model = RnnlmGen(vocab_size=vocab_size, wordvec_size=wordvec_size, hidden_size=hidden_size, num_layers=num_layers)
gen_model.load_state_dict(model.state_dict())  # copy the trained weights into the generation model
gen_model.to('cpu')
model.to('cpu')
SimpleRnnlm(
  (encoder): Embedding(10000, 100)
  (rnn): RNN(100, 100, batch_first=True)
  (decoder): Linear(in_features=100, out_features=10000, bias=True)
)
sample_batch = 1
hidden = gen_model.init_hidden(sample_batch)
# set the start word and the words to skip
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]
# generate a sentence
word_ids = gen_model.generate(start_id, skip_ids, hidden=hidden)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print(txt)
you know you know the supreme court turn quickly been to cooperate with the quota disappearance without change a partner will be available to price may reflect the transaction with six-month lehman 's october it is half the issue of.
 among ibm position was closed at c$ late yesterday.
 both shearson 's parent of proposed late last selling them.
 but others raise funds for both companies rising mistakenly a healthy investment.
 southern california companies yet seen as soon as a strong mitchell to pegged finances about slipped to # a transaction plan to focus about a
sample_batch = 1
hidden = gen_model.init_hidden(sample_batch)
start_words = 'the meaning of life is'
start_ids = [word_to_id[w] for w in start_words.split(' ')]

# feed the prefix words through the model so the hidden state reflects them
for x in start_ids[:-1]:
    sample_x = torch.tensor(x).reshape(1, 1)
    hidden = repackage_hidden(hidden)
    _, hidden = gen_model(sample_x, hidden)

word_ids = gen_model.generate(start_ids[-1], skip_ids, hidden=hidden)
word_ids = start_ids[:-1] + word_ids
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print('-' * 50)
print(txt)
--------------------------------------------------
the meaning of life is with taking late with automotive demand.
 for its service disposal is many more than it discovered that if you can make sense to a former affordable personally party.
 if we did in an annual appropriations committees of services has the equipment and a u.s. car to be sold or later.
 it 's a uncertainty averaged bonds series b double its common shares closed to yield after increased interest has been withheld nationwide of total existing national scuttle a older example of american express or control and state u.s. pharmaceutical tend to hold who had said the
