HW2
COSI-134A: StatNLP
Deadline: Nov 19, 2020
Implement the Viterbi algorithm, the forward algorithm, and the scoring function for the LSTM-CRF model.
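To recap what each of these computes (this is just the standard linear-chain CRF formulation, restated here as a reference): given the BiLSTM emission scores $h_1, \dots, h_T$ for a sentence $x$ and a learned tag-transition matrix, the score of a tag sequence $y = (y_1, \dots, y_T)$ is

$$s(x, y) = \sum_{t=1}^{T} \big( \mathrm{trans}(y_{t-1} \to y_t) + h_t[y_t] \big) + \mathrm{trans}(y_T \to \mathrm{EOS}), \qquad y_0 = \mathrm{BOS}.$$

The forward algorithm computes the log-partition function $\log Z(x) = \log \sum_{y'} \exp s(x, y')$ with a dynamic program over tags, so the training loss is the negative log likelihood $\log Z(x) - s(x, y)$ (exactly `forward_score - gold_score` in `neg_log_likelihood` below). Viterbi replaces the log-sum-exp with a max, plus backpointers, to recover the highest-scoring tag sequence at decoding time.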
1. Setup
In [ ]:
import os
import random
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
In [ ]:
# Hyperparameters
NUM_EPOCHS = 5
LEARNING_RATE = 0.002
EMBED_DIM = 50
HIDDEN_DIM = 50
NUM_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.2
SEED = 1334
DEVICE_ID = 0
In [ ]:
os.environ["CUDA_VISIBLE_DEVICES"] = f"{DEVICE_ID}"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
In [ ]:
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
2. Data
In [ ]:
# modified samples from ptb/dev, with reduced POS tag space
training_data = [
    "The_DT complicated_VB language_NN in_PP the_DT huge_JJ new_JJ law_NN has_VB muddied_VB the_DT fight_NN ._.",
    "Shearson_NN Lehman_NN Hutton_NN Inc._NN by_PP yesterday_NN afternoon_NN had_VB written_VB new_JJ TV_NN ads_NN ._.",
    "This_DT time_NN ,_, the_DT firms_NN were_VB ready_JJ ._.",
    "It_NN goes_VB on_PP to_PP plug_VB a_DT few_JJ diversified_JJ Fidelity_NN funds_NN by_PP name_NN ._.",
    "Everybody_NN was_VB making_VB money_NN ._."
]
In [ ]:
# artificial sentence
test_data = "Alphabet_NN Inc._NN showed_VB new_JJ TV_NN ads_NN yesterday_NN ._."
3. Vocab
In [ ]:
src_itos, tgt_itos = set(), set()
In [ ]:
for sent in training_data:
    sent = [x.split('_') for x in sent.split()]
    sent_src, sent_tgt = zip(*sent)
    src_itos.update(sent_src)
    tgt_itos.update(sent_tgt)
In [ ]:
src_itos, tgt_itos = sorted(src_itos), sorted(tgt_itos)
tgt_itos
In [ ]:
# special tokens for unknown words and for the sequence-boundary tags
UNK = '<unk>'
BOS = '<bos>'
EOS = '<eos>'
In [ ]:
src_itos = [UNK] + src_itos
src_stoi = {word: i for i, word in enumerate(src_itos)}
src_vocab = (src_itos, src_stoi)
In [ ]:
tgt_itos = [BOS, EOS] + tgt_itos
tgt_stoi = {word: i for i, word in enumerate(tgt_itos)}
tgt_vocab = (tgt_itos, tgt_stoi)
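As a quick sanity check, the special boundary tags are now part of the target vocab; the CRF in section 5 looks these indices up to mask out transitions that should never be taken:
In [ ]:
# BOS/EOS indices used by the CRF transition constraints below
tgt_stoi[BOS], tgt_stoi[EOS]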
4. Vectorized Data
In [ ]:
def convert_seq(seq, vocab, is_target=False):
    if type(seq) is str:
        seq = seq.split()
    out_seq = []
    for tok in seq:
        if tok in vocab:
            out_seq.append(vocab[tok])
        else:
            if is_target:
                raise RuntimeError(f"Unknown target token: `{repr(tok)}` from vocab: {', '.join(vocab)}")
            else:
                out_seq.append(vocab[UNK])
    return out_seq
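For example (a quick illustration against the vocab built in section 3), in-vocabulary words map to their indices, while an out-of-vocabulary word such as "Alphabet" falls back to the UNK index; an unknown target tag would instead raise the error above:
In [ ]:
# known source words map to their ids; OOV words map to UNK instead of raising
convert_seq("The new law".split(), src_stoi), convert_seq(["Alphabet"], src_stoi)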
In [ ]:
training_vectors = []
for sent in training_data:
    sent = [x.split('_') for x in sent.split()]
    src, tgt = zip(*sent)
    src = torch.tensor([convert_seq(src, src_stoi)], dtype=torch.long)
    tgt = torch.tensor(convert_seq(tgt, tgt_stoi, is_target=True), dtype=torch.long)
    training_vectors.append((src, tgt))
In [ ]:
test_vector = [x.split('_') for x in test_data.split()]
test_src, test_tgt = zip(*test_vector)
test_vector = [
    torch.tensor([convert_seq(test_src, src_stoi)], dtype=torch.long),
    torch.tensor(convert_seq(test_tgt, tgt_stoi, is_target=True), dtype=torch.long)
]
5. BiLSTM-CRF POS Tagger Implementation
In [ ]:
input_dim = len(src_itos)
output_dim = len(tgt_itos)
input_dim, output_dim
In [ ]:
class BiLSTM(nn.Module):
    def __init__(self, input_dim, output_dim, device):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, EMBED_DIM)
        self.lstm = nn.LSTM(EMBED_DIM, HIDDEN_DIM, num_layers=NUM_LAYERS, batch_first=True, bidirectional=BIDIRECTIONAL)
        self.linear = nn.Linear(HIDDEN_DIM*2 if BIDIRECTIONAL else HIDDEN_DIM, output_dim)  # project to tag space
        self.dropout = nn.Dropout(DROPOUT)
        self.device = device
        self.hidden = self.init_hidden()

    def init_hidden(self):
        direction_multiplier = 2 if BIDIRECTIONAL else 1
        return (torch.randn(direction_multiplier * NUM_LAYERS, 1, HIDDEN_DIM, device=self.device),  # h0
                torch.randn(direction_multiplier * NUM_LAYERS, 1, HIDDEN_DIM, device=self.device))  # c0

    def forward(self, x):
        embed = self.embedding(x)
        embed = self.dropout(embed)
        outputs, _ = self.lstm(embed, self.hidden)
        outputs = self.linear(outputs)
        return outputs.squeeze()  # assumes batch size of 1, whose dimension is squeezed out here
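The CRF functions below consume the `feats` this module produces. As an optional shape check (a small sketch using the vectorized data from section 4), the BiLSTM emits one score per tag for every token, i.e. a matrix of shape (sequence length, number of tags) for a single sentence:
In [ ]:
# optional: feats for one sentence should have shape (seq_len, output_dim)
bilstm = BiLSTM(input_dim, output_dim, device).to(device)
bilstm(training_vectors[0][0].to(device)).shape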
In [ ]:
class CRF(nn.Module):
    """TODO: Implement the CRF forward, score, and viterbi functions"""
    def __init__(self, tgt_vocab, device):
        super().__init__()
        self.tgt_itos, self.tgt_stoi = tgt_vocab
        self.tag_size = len(self.tgt_itos)
        self.device = device
        # transition matrix (one score per tag pair); transitions that should
        # never occur (involving BOS/EOS) are masked with a large negative value
        self.transitions = nn.Parameter(torch.randn(self.tag_size, self.tag_size, device=self.device))
        self.transitions.data[self.tgt_stoi[BOS], :] = -1000.
        self.transitions.data[:, self.tgt_stoi[EOS]] = -1000.

    def forward(self, feats):
        raise NotImplementedError("Implement this")

    def score(self, feats, tags):
        raise NotImplementedError("Implement this")

    def viterbi(self, feats):
        raise NotImplementedError("Implement this")
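These three stubs are the assignment itself. Purely as a reference for the kind of bookkeeping involved, and not a drop-in solution for this class, here is a minimal sketch of a linear-chain CRF gold-path score over plain tensors; it assumes `emissions` of shape (seq_len, tag_size), a 1-D `tags` tensor, and a transition matrix indexed as `trans[to_tag, from_tag]`, which appears to be the convention implied by the BOS/EOS masking in `__init__` above:
In [ ]:
# reference sketch only (assumes trans[to_tag, from_tag] indexing and
# implicit BOS/EOS boundary tags around the sequence)
def crf_gold_score(emissions, tags, trans, bos_idx, eos_idx):
    score = emissions.new_zeros(())
    prev = bos_idx
    for t, tag in enumerate(tags.tolist()):
        score = score + trans[tag, prev] + emissions[t, tag]
        prev = tag
    return score + trans[eos_idx, prev]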
In [ ]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, input_dim, output_dim, tgt_vocab, device):
        super().__init__()
        self.lstm = BiLSTM(input_dim, output_dim, device=device)
        self.crf = CRF(tgt_vocab, device=device)

    def neg_log_likelihood(self, src, tgt):
        """Compute negative log likelihood given sentence and gold POS labels"""
        feats = self.lstm(src)
        forward_score = self.crf(feats)
        gold_score = self.crf.score(feats, tgt)
        return forward_score - gold_score

    def forward(self, src):
        """Tag a single sentence"""
        feats = self.lstm(src)
        out = self.crf.viterbi(feats)
        return out
In [ ]:
model = BiLSTM_CRF(input_dim, output_dim, tgt_vocab, device)
model = model.to(device)
6. Train
A. Before Training
In [ ]:
test_src, test_tgt = test_vector
" ".join(src_itos[x] for x in test_src.squeeze().tolist())
In [ ]:
" ".join(tgt_itos[x] for x in test_tgt.tolist())
In [ ]:
score, pred_seq = model(test_src.to(device))
score.item()
In [ ]:
# gold
" ".join(tgt_itos[x] for x in test_tgt.tolist())
In [ ]:
# pred
" ".join(tgt_itos[x] for x in torch.cat(pred_seq, 0).tolist())
B. Training
In [ ]:
optimizer = optim.Adam(model.parameters(), LEARNING_RATE)
In [ ]:
for i in range(NUM_EPOCHS):
    epoch_loss = 0.
    num_correct = 0
    num_tokens = 0
    model.train()
    for src, tgt in tqdm.tqdm(training_vectors, desc=f"[Training {i+1}/{NUM_EPOCHS}]"):
        src = src.to(device)
        tgt = tgt.to(device)
        model.zero_grad()
        loss = model.neg_log_likelihood(src, tgt)
        epoch_loss += loss
        loss.backward()
        optimizer.step()
        _, pred = model(src)
        num_correct += (torch.cat(pred, 0) == tgt).sum()
        num_tokens += len(tgt)
    epoch_acc = num_correct.item() / num_tokens
    print("Training Epoch # {} Loss: {:.2f} Acc: {:.2f}".format(i+1, epoch_loss.item(), epoch_acc))
C. After Training
In [ ]:
score, pred_seq = model(test_src.to(device))
score.item()
In [ ]:
" ".join(src_itos[x] for x in test_src.squeeze().tolist())
In [ ]:
# gold
" ".join(tgt_itos[x] for x in test_tgt.tolist())
In [ ]:
# pred
" ".join(tgt_itos[x] for x in torch.cat(pred_seq, 0).tolist())