15 minute read

Predicting AG News Categories with an LSTM Model

import os
import random
import string
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.utils.data as data
import pickle
import nltk
nltk.download('punkt')

from collections import Counter
from copy import deepcopy
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
# Fix random seeds for reproducibility
seed = 50
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)                # Python RNG seed
np.random.seed(seed)             # NumPy RNG seed
torch.manual_seed(seed)          # PyTorch RNG seed (CPU)
torch.cuda.manual_seed(seed)     # PyTorch RNG seed (single GPU)
torch.cuda.manual_seed_all(seed) # PyTorch RNG seed (multi-GPU)
torch.backends.cudnn.deterministic = True # use deterministic cuDNN operations
torch.backends.cudnn.benchmark = False    # disable the cuDNN benchmark mode
torch.backends.cudnn.enabled = False      # disable cuDNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
device(type='cuda')

1. Data download

# install the Kaggle API package
!pip install kaggle

# kaggle.json upload
from google.colab import files
files.upload()

# avoid permission warnings
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# download
!kaggle datasets download -d amananandrai/ag-news-classification-dataset

!mkdir dataset
# unzip
!unzip -q ag-news-classification-dataset.zip -d dataset/
Saving kaggle.json to kaggle.json
Downloading ag-news-classification-dataset.zip to /content
 88% 10.0M/11.4M [00:01<00:00, 12.4MB/s]
100% 11.4M/11.4M [00:01<00:00, 7.71MB/s]
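After unzipping, the dataset/ folder should contain train.csv and test.csv, which are read in the next section. A quick optional check (a sketch; the exact listing depends on the dataset version):

!ls dataset/  # expect train.csv and test.csv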

2. Reading the data

Vocabulary

class Vocabulary():
    def __init__(self, vocab_threshold, vocab_file,
                 mask_word="<mask>",
                 start_word="<start>",
                 end_word="<end>",
                 unk_word="<unk>",
                 news_df=None, vocab_from_file=False):

        self.vocab_threshold = vocab_threshold
        # the full dataset (train_news_df) before it is split into train and valid
        self.news_df = news_df

        # special tokens
        self.mask_word = mask_word
        self.start_word = start_word
        self.end_word = end_word
        self.unk_word = unk_word

        # initialize the lookup dictionaries
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

        if vocab_from_file:
            # load a previously built vocabulary from file
            with open(vocab_file, 'rb') as f:
                vocab = pickle.load(f)
                self.word2idx = vocab.word2idx
                self.idx2word = vocab.idx2word
                self.idx = vocab.idx
                self.mask_index = vocab.mask_index
                self.begin_seq_index = vocab.begin_seq_index
                self.end_seq_index = vocab.end_seq_index
                self.unk_index = vocab.unk_index
            print('Vocabulary successfully loaded from vocab.pkl file!')
        else:
            self.build_vocab()
            with open(vocab_file, 'wb') as f:
                pickle.dump(self, f)

    def build_vocab(self):
        # mask_word (0), start_word (1), end_word (2), unk_word (3)
        self.mask_index = self.add_word(self.mask_word)        # 0
        self.begin_seq_index = self.add_word(self.start_word)  # 1
        self.end_seq_index = self.add_word(self.end_word)      # 2
        self.unk_index = self.add_word(self.unk_word)          # 3
        self.add_description()

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
        return self.word2idx[word]

    def add_description(self):
        counter = Counter()
        for description in self.news_df.Description:
            tokens = nltk.tokenize.word_tokenize(description.lower())
            counter.update(tokens)

        for word, cnt in counter.items():
            if cnt >= self.vocab_threshold:
                self.add_word(word)

        print("description_vocab 길이 :", len(self.word2idx))

    def __call__(self, word): # lookup word
        return self.word2idx.get(word, self.unk_index)

    def __len__(self):
        return len(self.word2idx)
#data_dir = '/kaggle/input/ag-news-classification-dataset/'
data_dir = './dataset/'
train_news_csv= data_dir + "train.csv"
test_news_csv= data_dir + "test.csv"
train_news_df = pd.read_csv(train_news_csv)
test_news_df = pd.read_csv(test_news_csv)
len(train_news_df)
120000
from sklearn.model_selection import train_test_split

train_indices, valid_indices = train_test_split(range(len(train_news_df)),
                                                stratify= train_news_df['Class Index'],
                                                test_size=0.2)
len(train_indices), len(valid_indices)
(96000, 24000)
train_df = train_news_df.iloc[train_indices]
valid_df = train_news_df.iloc[valid_indices]

Class distribution

train_news_df['Class Index'].value_counts()/len(train_news_df)
3    0.25
4    0.25
2    0.25
1    0.25
Name: Class Index, dtype: float64
valid_df['Class Index'].value_counts()/len(valid_df)
3    0.25
2    0.25
4    0.25
1    0.25
Name: Class Index, dtype: float64
# Consists of class ids 1-4 where 1-World, 2-Sports, 3-Business, 4-Sci/Tech
category_map = {1:"World", 2:"Sports", 3:"Business", 4:"Sci/Tech"}

Dataset

vocab_threshold = 25
mask_word = "<mask>"
start_word = "<start>"
end_word = "<end>"
unk_word = "<unk>"
vocab_file = './vocab.pkl'
vocab_from_file = False

vocab = Vocabulary(vocab_threshold, vocab_file,
                    mask_word, start_word, end_word, unk_word,
                    train_news_df, vocab_from_file)
description_vocab size : 10320
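After building, the four special tokens occupy indices 0-3 (in the order they are added in build_vocab), and any word not in the vocabulary falls back to the unk index. A quick check (the made-up token below is assumed to be out of vocabulary):

vocab("<mask>"), vocab("<start>"), vocab("<end>"), vocab("<unk>")  # -> (0, 1, 2, 3)
vocab("zqxjkv")                                                    # out-of-vocabulary, so unk_index (3)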
def vectorize(text, vector_length = -1):
    # input  : 'Clijsters Unsure About Latest Injury, Says Hewitt'
    # output : [1, 2, 4, 9, 10, 9, 2, 0, 0, 0, 0]
    # look up the vocabulary id of each word in the text

    indices = [vocab.begin_seq_index]
    word_list = nltk.tokenize.word_tokenize(text.lower())
    for word in word_list:
        indices.append(vocab(word))
    indices.append(vocab.end_seq_index)

    if vector_length < 0:
        vector_length = len(indices)

    out_vector = np.zeros(vector_length, dtype=np.int64)
    out_vector[:len(indices)] = indices
    out_vector[len(indices):] = vocab.mask_index

    return out_vector
vectorize("I am a boy", -1)
array([   1,  464, 7647,   21, 4123,    2])
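With a fixed vector_length, the tail beyond the sentence is filled with the <mask> index (0). Using the same example sentence and the token ids shown above, the padded result would look like this (a sketch):

vectorize("I am a boy", vector_length=10)
# -> array([   1,  464, 7647,   21, 4123,    2,    0,    0,    0,    0])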
class NewsDataset(Dataset):
  def __init__(self, mode, batch_size, vocab_threshold, vocab_file,
               mask_word, start_word, end_word, unk_word,
               news_df, vocab_from_file):
      self.news_df = news_df
      self.batch_size = batch_size
      self.description_vocab = Vocabulary(vocab_threshold, vocab_file,
                                    mask_word, start_word, end_word, unk_word,
                                    train_news_df, vocab_from_file)
      # (1) vectorize strings to a fixed max_length
      # measure_len = lambda context : len(nltk.tokenize.word_tokenize(context.lower()))
      # self.max_seq_length = max(map(measure_len, train_news_df.Title)) + 2

      # (2) vectorize each string to its own (variable) length
      self.description_lengths = [len(nltk.tokenize.word_tokenize(description.lower()))
                                         for description in self.news_df.Description]

      self.description_vectors = [vectorize(self.news_df.iloc[index].Description, -1)
                                   for index in range(len(self.news_df)) ]

  def __getitem__(self, index):
      row = self.news_df.iloc[index]
      # description_vector = vectorize(row.Description, -1)
      description_vector = self.description_vectors[index]
      category_index = row['Class Index'] - 1
      return {'x_data' : description_vector,
              'y_target' : category_index }

  def __len__(self):
      return len(self.news_df)

  def get_train_indices(self):
      # pick one description length at random and return the indices of
      # descriptions that have exactly that length
      sel_length = np.random.choice(self.description_lengths)
      condition = [self.description_lengths[i] == sel_length for i in np.arange(len(self.description_lengths))]
      all_indices = np.where(condition)[0]
      indices = list(np.random.choice(all_indices, size=self.batch_size))
      return indices

train_batch_size = 32
valid_batch_size = 32
test_batch_size = 32
vocab_threshold = 25
mask_word = "<mask>"
start_word = "<start>"
end_word = "<end>"
unk_word = "<unk>"
vocab_file = './vocab.pkl'
vocab_from_file = False

trainset = NewsDataset("train", train_batch_size, vocab_threshold, vocab_file,
                       mask_word, start_word, end_word, unk_word,
                       train_df, vocab_from_file)
description_vocab size : 10320
vocab_from_file = True
validset = NewsDataset("valid", valid_batch_size, vocab_threshold, vocab_file,
                       mask_word, start_word, end_word, unk_word,
                       valid_df, vocab_from_file)
testset = NewsDataset("test", test_batch_size, vocab_threshold, vocab_file,
                       mask_word, start_word, end_word, unk_word,
                       test_news_df, vocab_from_file)
Vocabulary successfully loaded from vocab.pkl file!
Vocabulary successfully loaded from vocab.pkl file!
len(trainset), len(validset), len(testset)
(96000, 24000, 7600)

3. Data loading: DataLoader

indices = trainset.get_train_indices() # batch_size indices of descriptions with the same length
initial_sampler = data.sampler.SubsetRandomSampler(indices = indices) # shuffles them randomly
batch_sampler = data.sampler.BatchSampler(sampler=initial_sampler, batch_size=train_batch_size, drop_last=False)
trainloader = DataLoader(dataset=trainset, batch_sampler=batch_sampler, num_workers=2)
# !lscpu
indices = validset.get_train_indices() # batch_size indices of descriptions with the same length
initial_sampler = data.sampler.SubsetRandomSampler(indices = indices) # shuffles them randomly
batch_sampler = data.sampler.BatchSampler(sampler=initial_sampler, batch_size=valid_batch_size, drop_last=False)
validloader = DataLoader(dataset=validset, batch_sampler=batch_sampler, num_workers=2)
indices = testset.get_train_indices() # batch_size indices of descriptions with the same length
initial_sampler = data.sampler.SubsetRandomSampler(indices = indices) # shuffles them randomly
batch_sampler = data.sampler.BatchSampler(sampler=initial_sampler, batch_size=test_batch_size, drop_last=False)
testloader = DataLoader(dataset=testset, batch_sampler=batch_sampler, num_workers=2)
batch = next(iter(trainloader))
batch['x_data'].size(), batch['y_target'].size()
(torch.Size([32, 33]), torch.Size([32]))
batch = next(iter(validloader))
batch['x_data'].size(), batch['y_target'].size()
(torch.Size([32, 23]), torch.Size([32]))
batch = next(iter(testloader))
batch['x_data'].size(), batch['y_target'].size()
(torch.Size([32, 40]), torch.Size([32]))
len(trainloader), len(validloader), len(testloader)
(1, 1, 1)
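Each of these loaders holds exactly one batch, because the BatchSampler above was built from only batch_size indices; that is why all three lengths are 1. During training, a fresh sampler and loader are rebuilt at every step, following the same pattern used inside train_loop below (sketch):

# one training step: draw a new bucket of equal-length descriptions and wrap it in a loader
indices = trainset.get_train_indices()
initial_sampler = data.sampler.SubsetRandomSampler(indices=indices)
batch_sampler = data.sampler.BatchSampler(sampler=initial_sampler, batch_size=train_batch_size, drop_last=False)
step_loader = DataLoader(dataset=trainset, batch_sampler=batch_sampler, num_workers=2)
batch = next(iter(step_loader))  # one (batch_size, seq_len) batch, all descriptions the same length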

4. Creating the model: NewsClassifier


class NewsClassifier(nn.Module):
    def __init__(self, embedding_size, vocab_size,
                 rnn_hidden_dim, hidden_dim, num_layers, num_classes, dropout):

        super().__init__()

        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(input_size = embedding_size, hidden_size = rnn_hidden_dim,
                           num_layers = num_layers,
                           dropout = dropout,
                           batch_first = True)
        self.classifier = nn.Sequential(
                            nn.Linear(in_features=rnn_hidden_dim, out_features = hidden_dim),
                            nn.ReLU(),
                            nn.Dropout(dropout),
                            nn.Linear(in_features= hidden_dim, out_features = num_classes)
                           )


    def forward(self, inputs, apply_softmax=False): # input : (batch_size, description_length)
        embeddings = self.emb(inputs) # embeddings : (batch_size, description_length, embedding_size)
        _, (hidden, cell) = self.lstm(embeddings) # outputs : (batch_size, description_length, rnn_hidden_dim)
                                               # hidden : (num_layers, batch_size, rnn_hidden_dim)
        hidden = hidden[-1] # final hidden state of the last LSTM layer : (batch_size, rnn_hidden_dim)
        outputs = self.classifier(hidden) # outputs : (batch_size, num_classes)

        if apply_softmax:
            outputs = F.softmax(outputs, dim=1)
        return outputs
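As a quick sanity check on the shapes coming out of nn.LSTM with batch_first=True (a standalone sketch with arbitrary sizes, independent of the model above):

rnn = nn.LSTM(input_size=100, hidden_size=100, num_layers=2, batch_first=True)
x = torch.randn(32, 33, 100)   # (batch_size, description_length, embedding_size)
out, (h, c) = rnn(x)
out.shape, h.shape             # (torch.Size([32, 33, 100]), torch.Size([2, 32, 100])); h[-1] is the top layer's final state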

Hyperparameter settings

embedding_size=100
rnn_hidden_dim = 100
hidden_dim = 50
num_layers = 2
dropout = 0.2
learning_rate=0.001
num_epochs=12
classifier = NewsClassifier(embedding_size=embedding_size,
                            vocab_size=len(vocab),
                            rnn_hidden_dim = rnn_hidden_dim,
                            hidden_dim = hidden_dim,
                            num_layers = num_layers,
                            num_classes=4, dropout=dropout)
classifier = classifier.to(device)
classifier
NewsClassifier(
  (emb): Embedding(10320, 100)
  (lstm): LSTM(100, 100, num_layers=2, batch_first=True, dropout=0.2)
  (classifier): Sequential(
    (0): Linear(in_features=100, out_features=50, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=50, out_features=4, bias=True)
  )
)
out = classifier(batch['x_data'].to(device))
out.shape
torch.Size([32, 4])

5. Model configuration (loss function and optimizer)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.01,
                                           patience=1, verbose=True)
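With factor=0.01 and patience=1, two consecutive epochs without improvement multiply the learning rate by 0.01, i.e. 1e-3 drops to 1e-5, which matches the scheduler message in the training log below. A minimal standalone sketch of that behavior (dummy parameter and made-up losses):

p = torch.zeros(1, requires_grad=True)                 # dummy parameter
opt = optim.Adam([p], lr=1e-3)
sch = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.01, patience=1)
for loss in [1.0, 0.9, 0.95, 0.96]:                    # no improvement after the 2nd value
    sch.step(loss)
opt.param_groups[0]['lr']                              # 1e-05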

6. Model training

train_step = len(trainset) // train_batch_size
valid_step = len(validset) // valid_batch_size
test_step = len(testset) // test_batch_size
train_step, valid_step, test_step
(3000, 750, 237)
def validate(model, validloader, loss_fn):
    model.eval()
    total = 0
    correct = 0
    valid_loss = []
    valid_epoch_loss=0
    valid_accuracy = 0

    with torch.no_grad():
        for step in range(1, valid_step+1):
            indices = validset.get_train_indices()
            initial_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            batch_sampler=data.sampler.BatchSampler(sampler=initial_sampler,
                                                    batch_size=valid_batch_size,
                                                    drop_last=False)
            validloader= data.DataLoader(dataset=validset, num_workers=0,
                                         batch_sampler=batch_sampler)
            # Obtain the batch.
            batch_dict = next(iter(validloader))
            inputs = batch_dict['x_data'].to(device)
            labels = batch_dict['y_target'].to(device)

            # forward pass and loss
            logits = model(inputs)
            loss = loss_fn(logits, labels)
            valid_loss.append(loss.item())

            # accuracy
            _, preds = torch.max(logits, 1) # final predictions for this batch
            # preds = logits.max(dim=1)[1]
            correct += int((preds == labels).sum()) # accumulate the number of correct predictions
            total += labels.shape[0] # accumulate the number of samples seen

    valid_epoch_loss = np.mean(valid_loss)
    total_loss["val"].append(valid_epoch_loss)
    valid_accuracy = correct / total

    return valid_epoch_loss, valid_accuracy
def train_loop(model, trainloader, loss_fn, epochs, optimizer):
    min_loss = 1000000
    trigger = 0
    patience = 3

    for epoch in range(epochs):
        model.train()
        train_loss = []

        for step in range(1, train_step+1):
            indices = trainset.get_train_indices()
            initial_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            batch_sampler=data.sampler.BatchSampler(sampler=initial_sampler,
                                                    batch_size=train_batch_size,
                                                    drop_last=False)
            trainloader= data.DataLoader(dataset=trainset, num_workers=2,
                                         batch_sampler = batch_sampler)

            # Obtain the batch.
            batch_dict = next(iter(trainloader))
            inputs = batch_dict['x_data'].to(device)
            labels = batch_dict['y_target'].to(device)


            optimizer.zero_grad()
            logits = model(inputs)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        train_epoch_loss = np.mean(train_loss)
        total_loss["train"].append(train_epoch_loss)

        valid_epoch_loss, valid_accuracy = validate(model, validloader, loss_fn)

        print("Epoch: {}/{}, Train Loss={:.4f}, Val Loss={:.4f}, Val Accyracy={:.4f}".format(
                epoch + 1, epochs,
                total_loss["train"][-1],
                total_loss["val"][-1],
                valid_accuracy))

        # Early stopping
        if valid_epoch_loss > min_loss: # if valid_loss fails to improve on the best loss so far
          trigger += 1
          print('trigger : ', trigger)
          if trigger > patience:
            print('Early Stopping !!!')
            print('Training loop is finished !!')
            return
        else:
          trigger = 0
          min_loss = valid_epoch_loss
          best_model_state = deepcopy(model.state_dict())
          torch.save(best_model_state, 'best_checkpoint.pth')
        # -------------------------------------------

        # Learning Rate Scheduler
        scheduler.step(valid_epoch_loss)
        # -------------------------------------------
total_loss = {"train": [], "val": []}
%time train_loop(classifier, trainloader, loss_fn, num_epochs, optimizer)
Epoch: 1/12, Train Loss=0.5833, Val Loss=0.4604, Val Accuracy=0.8458
Epoch: 2/12, Train Loss=0.3168, Val Loss=0.3307, Val Accuracy=0.8917
Epoch: 3/12, Train Loss=0.2435, Val Loss=0.3096, Val Accuracy=0.8998
Epoch: 4/12, Train Loss=0.1943, Val Loss=0.3056, Val Accuracy=0.9028
Epoch: 5/12, Train Loss=0.1647, Val Loss=0.3115, Val Accuracy=0.8996
trigger :  1
Epoch: 6/12, Train Loss=0.1352, Val Loss=0.3383, Val Accuracy=0.9022
trigger :  2
Epoch 00006: reducing learning rate of group 0 to 1.0000e-05.
Epoch: 7/12, Train Loss=0.1224, Val Loss=0.3317, Val Accuracy=0.9032
trigger :  3
Epoch: 8/12, Train Loss=0.1160, Val Loss=0.3162, Val Accuracy=0.9085
trigger :  4
Early Stopping !!!
Training loop is finished !!
CPU times: user 26min 30s, sys: 26min 55s, total: 53min 25s
Wall time: 1h 37min 8s
import matplotlib.pyplot as plt

plt.plot(total_loss['train'], label="train_loss")
plt.plot(total_loss['val'], label="valid_loss")
plt.legend()
plt.show()

(Figure: training and validation loss per epoch)

7. Model evaluation

def evaluate(model, testloader, loss_fn):
    model.eval()
    total = 0
    correct = 0
    test_loss = []
    test_epoch_loss=0
    test_accuracy = 0

    with torch.no_grad():
        for step in range(1, test_step+1):
            indices = testset.get_train_indices()
            initial_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            batch_sampler=data.sampler.BatchSampler(sampler=initial_sampler,
                                                    batch_size=test_batch_size,
                                                    drop_last=False)
            testloader= data.DataLoader(dataset=testset, num_workers=2,
                                        batch_sampler=batch_sampler)

            # Obtain the batch.
            batch_dict = next(iter(testloader))
            inputs = batch_dict['x_data'].to(device)
            labels = batch_dict['y_target'].to(device)

            # forward pass and loss
            logits = model(inputs)
            loss = loss_fn(logits, labels)
            test_loss.append(loss.item())

            # accuracy
            _, preds = torch.max(logits, 1) # final predictions for this batch
            # preds = logits.max(dim=1)[1]
            correct += int((preds == labels).sum()) # accumulate the number of correct predictions
            total += labels.shape[0] # accumulate the number of samples seen

    test_epoch_loss = np.mean(test_loss)
    # total_loss["val"].append(test_epoch_loss)
    test_accuracy = correct / total

    print('Test Loss : {:.5f}'.format(test_epoch_loss),
        'Test Accuracy : {:.5f}'.format(test_accuracy))


evaluate(classifier, testloader, loss_fn)
Test Loss : 0.32579 Test Accuracy : 0.90730
# best model by validation loss (or accuracy)
best_state_dict = torch.load('best_checkpoint.pth')
best_classifier = classifier
best_classifier.to(device)
best_classifier.load_state_dict(best_state_dict)

evaluate(best_classifier, testloader, loss_fn)
Test Loss : 0.33211 Test Accuracy : 0.88898
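If the saved checkpoint is later loaded on a machine without a GPU, pass map_location so the tensors are mapped to the CPU (a sketch; best_checkpoint.pth is the file saved in the training loop above):

state_dict = torch.load('best_checkpoint.pth', map_location=torch.device('cpu'))
classifier.load_state_dict(state_dict)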

8. Model prediction

def predict_category(text, classifier, max_length):
    # predict the news category from a piece of text

    # 1. vectorize
    vectorized_text = vectorize(text, vector_length=max_length)
    vectorized_text = torch.tensor(vectorized_text).unsqueeze(0) # convert to a tensor and add a batch dimension

    # 2. model prediction
    result = classifier(vectorized_text, apply_softmax=True) # result : predicted probabilities
    probability, index = result.max(dim=1)
    predict = index.item() + 1 # class index 0 in the model corresponds to class 1 in the dataset
    probability = probability.item()
    predicted_category = category_map[predict]

    return {'category': predicted_category, 'probability': probability}

def get_samples():
    # gather samples grouped by their true category
    # keep 5 samples per class
    samples = {}

    for category in testset.news_df['Class Index'].unique(): # 1=>2=>3=>4
        samples[category]= testset.news_df.Description[testset.news_df['Class Index'] == category].tolist()[-5:]

    return samples

test_samples = get_samples()
# Consists of class ids 1-4 where 1-World, 2-Sports, 3-Business, 4-Sci/Tech
category_map = {1:"World", 2:"Sports", 3:"Business", 4:"Sci/Tech"}
classifier = classifier.to('cpu')
for truth, sample_group in test_samples.items():
    print(f"True Category: {category_map[truth]}")
    print('='*50)
    for sample in sample_group:
        prediction = predict_category(sample, classifier, max_length = -1)
        print("예측: {} (p={:0.2f})".format(prediction['category'], prediction['probability']))
        print("샘플: {}".format(sample))
        print('-'*30)
    print()
True Category: Business
==================================================
Prediction: Business (p=0.77)
Sample: MOSCOW (AFP) - Russia forged ahead with the weekend auction of the core asset of crippled oil giant Yukos despite a disputed US court order barring the sale, with state-controlled gas giant Gazprom entering the bidding.
------------------------------
Prediction: Business (p=1.00)
Sample: The head of plane maker Airbus yesterday won a bitter battle to oust his boss from the helm of parent aerospace group Eads after winning the support of a key shareholder.
------------------------------
Prediction: Business (p=0.88)
Sample: Standard  amp; Poor #39;s Equity Research said the purchase of Rent.com by eBay (nasdaq: EBAY - news - people ) could be a bit of a miscalculation.
------------------------------
Prediction: Business (p=0.92)
Sample: SINGAPORE : Doctors in the United States have warned that painkillers Bextra and Celebrex may be linked to major cardiovascular problems and should not be prescribed.
------------------------------
Prediction: Sci/Tech (p=0.60)
Sample: EBay plans to buy the apartment and home rental service Rent.com for \$415 million, adding to its already exhaustive breadth of offerings.
------------------------------

True Category: Sci/Tech
==================================================
Prediction: Sci/Tech (p=0.71)
Sample: A software company that Microsoft acquired this week to help beef up computer security may come with a bug of its own--a company claiming ownership of the programs.
------------------------------
Prediction: Sci/Tech (p=0.70)
Sample: The U.S. Army has struck a deal with IBM and other companies to create an automated record-keeping system that ends the need for electronic forms to be printed out, signed and delivered up the military service's chain of command.
------------------------------
Prediction: Sci/Tech (p=0.97)
Sample: InfoWorld - The great debate over the impact of Oracle's hostile takeover of PeopleSoft has all the big industry analyst organizations weighing in. However, in most of the analysis one group's opinion seems to have been overlooked: that of PeopleSoft users.
------------------------------
Prediction: Sci/Tech (p=0.94)
Sample: AP - Australian scientists who helped discover a species of tiny humans nicknamed Hobbits have been hailed for making the second most important scientific achievement of 2004.
------------------------------
Prediction: Sci/Tech (p=0.98)
Sample: Internet search providers are reacting to users #39; rising interest in finding video content on the Web, while acknowledging that there are steep challenges that need to be overcome.
------------------------------

True Category: Sports
==================================================
Prediction: Sports (p=0.63)
Sample: NEW YORK - The TV lights were on, the cameras rolled and the symphony of cameras flashing in his face blinded Pedro Martinez - but not for long.
------------------------------
Prediction: Sports (p=0.99)
Sample: DAVIE - The Dolphins want Nick Saban, and the LSU coach could be on his way. Although LSU Athletic Director Skip Bertman said Friday that  quot;an offer is very imminent, quot; the Dolphins are committed to adhering
------------------------------
Prediction: Sports (p=1.00)
Sample: Paceman Mashrafe Mortaza claimed two prize scalps, including Sachin Tendulkar with the day #39;s first ball, to lead a Bangladesh fightback in the second and final test against India on Saturday.
------------------------------
Prediction: Sports (p=0.98)
Sample: With the supply of attractive pitching options dwindling daily -- they lost Pedro Martinez to the Mets, missed on Tim Hudson, and are resigned to Randy Johnson becoming a Yankee -- the Red Sox struck again last night, coming to terms with free agent Matt Clement on a three-year deal that will pay the righthander in the neighborhood of \$25 ...
------------------------------
Prediction: Sports (p=0.99)
Sample: Like Roger Clemens did almost exactly eight years earlier, Pedro Martinez has left the Red Sox apparently bitter about the way he was treated by management.
------------------------------

True Category: World
==================================================
Prediction: Business (p=0.97)
Sample: The \$500 billion drug industry is stumbling badly in its core business of finding new medicines, while aggressively marketing existing drugs.
------------------------------
Prediction: World (p=1.00)
Sample: Canadian Press - BANJA LUKA, Bosnia-Herzegovina (AP) - The prime minister of the Serbian half of Bosnia resigned Friday, a day after the U.S. government and Bosnia's top international administrator sanctioned Bosnian Serbs for failing to arrest and hand over war crimes suspects to the UN tribunal.
------------------------------
Prediction: World (p=0.98)
Sample: The European Union's decision to hold entry talks with Turkey receives a widespread welcome.
------------------------------
Prediction: World (p=1.00)
Sample: WASHINGTON -- Outgoing Secretary of State Colin L. Powell said yesterday he doesn't regret being the public face for the Bush administration's international call to war in Iraq. He also believes diplomacy is making headway in containing nuclear threats in Iran and North Korea, he said in an interview.
------------------------------
Prediction: World (p=0.83)
Sample: Ukrainian presidential candidate Viktor Yushchenko was poisoned with the most harmful known dioxin, which is contained in Agent Orange, a scientist who analyzed his blood said Friday.
------------------------------

Appendix: syntax notes

Counter

from collections import Counter
# usage example (1)
s = 'life is short, so python is easy.'

counter = Counter(s)
counter
# usage example (2)
s = 'life is short, so python is easy.'

counter = Counter()
tokens = s.split()
for token in tokens:
    counter[token] += 1
counter
Counter({'life': 1, 'is': 2, 'short,': 1, 'so': 1, 'python': 1, 'easy.': 1})
# usage example (3)
s = 'life is short, so python is easy.'

counter = Counter()
tokens = s.split()
counter.update(tokens)
counter
Counter({'life': 1, 'is': 2, 'short,': 1, 'so': 1, 'python': 1, 'easy.': 1})
# usage example (4)
s = 'life is short, so python is easy.'

counter = Counter()
tokens = nltk.tokenize.word_tokenize(s)
counter.update(tokens)
counter
Counter({'life': 1,
         'is': 2,
         'short': 1,
         ',': 1,
         'so': 1,
         'python': 1,
         'easy': 1,
         '.': 1})
# for the string below, the tokenization results before and after lowercasing differ
s = "AP - Environmentalists asked the U.S. Fish and Wildlife Service on Wednesday to grant protected status to the California spotted owl, claiming the bird's old-growth forest habitat is threatened by logging."
counter = Counter()
tokens = nltk.tokenize.word_tokenize(s)
counter.update(tokens)
counter
s = "AP - Environmentalists asked the U.S. Fish and Wildlife Service on Wednesday to grant protected status to the California spotted owl, claiming the bird's old-growth forest habitat is threatened by logging."
counter = Counter()
tokens = nltk.tokenize.word_tokenize(s.lower())
counter.update(tokens)
counter

np.where

a = np.arange(10)
cond = a < 5
cond
array([ True,  True,  True,  True,  True, False, False, False, False,
       False])
np.where(cond, a, a*10)
array([ 0,  1,  2,  3,  4, 50, 60, 70, 80, 90])
np.where(cond) # with only a condition, this gives the same result as np.asarray(cond).nonzero() below
(array([0, 1, 2, 3, 4]),)
np.asarray(cond).nonzero()
(array([0, 1, 2, 3, 4]),)
description_lengths = [37, 38, 45, 2, 3, 37, 37, 45, 45, 50]
sel_length = 37
cond = [description_lengths[i] == sel_length for i in np.arange(len(description_lengths))]
indices = np.where(cond)
indices
(array([0, 5, 6]),)

BatchSampler

indices = range(10)
initial_sampler = data.sampler.SubsetRandomSampler(indices=indices)
batch_sampler = data.sampler.BatchSampler(sampler=initial_sampler, batch_size=3, drop_last=False)
list(batch_sampler)
[[9, 2, 0], [6, 7, 5], [3, 8, 4], [1]]
indices = range(32) # indices of descriptions that share the same length
initial_sampler = data.sampler.SubsetRandomSampler(indices=indices) # shuffles them randomly
batch_sampler = data.sampler.BatchSampler(sampler=initial_sampler, batch_size=32, drop_last=True) # groups the sampled indices into batches
list(batch_sampler)
