# RNN for POS Tagging

What included in the notebook:

- Implementation of RNN model for POS Tagging

In [59]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fd143cd10b0>

## Dataset

We will use the Treebak data obtained from nltk.


In [60]:
%%capture

import nltk
from nltk.corpus import treebank

nltk.download('universal_tagset')
nltk.download('treebank')

Load data set

In [61]:
tagged_sentences = treebank.tagged_sents(tagset='universal')
tagged_sentences[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

## Create train/test/split

In [62]:
from sklearn.model_selection import train_test_split

train_tagged_sentences, test_tagged_sentences = train_test_split(tagged_sentences, test_size=0.2, random_state=42)

## Seperate sentences and tag sequences

In [63]:
def make_x_y(tagged_sentences):
    """Seperate sentences and tag sequences from tagged sentences

    Arguments
    ----------
        tagged_sentences

    Returns
    ----------
        sentences (list): list of sentences. Each sentence is a list of words
        tag_sequences
    """
    sentences = []
    tag_sequences = []
    for s in tagged_sentences:
        words, tags = zip(*s)
        sentences.append(list(words))
        tag_sequences.append(list(tags))
    return sentences, tag_sequences

In [64]:
train_sentences, train_tag_sequences = make_x_y(train_tagged_sentences)
test_sentences, test_tag_sequences = make_x_y(test_tagged_sentences)

## Steps in building RNN model for POS Tagging

- Create Vocabulary, Vectorizer, Dataset
- Implement model class
- Training loop
- Evaluation on the test data

## Create Vocabulary

We modified the Vocabulary class in the previous lecture.

We need to convert tags into integer indeces, so we will create two vocabularies, one for words and one for tags.

In [65]:
from collections import defaultdict

class Vocabulary:
    def __init__(self, token_to_idx=None, use_unk=True):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self.pad_index = 0
        
        if use_unk:
            self.unk_index = 1
        else:
            self.unk_index = -1

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    
    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    @classmethod
    def build_vocab(cls, sequences, use_unk=True):
        """Build vocabulary from a list of sequences
        A sequence may be a sequence of words or a sequence of tags.

        Arguments:
        ----------
            sequences (list): list of sequences, each sentence list of words
            or list of tags
        
        Return:
        ----------
            vocab (Vocabulary): a Vocabulary object
        """
        if use_unk:
            token_to_idx = {"<PAD>": 0, "<UNK>": 1}
        else:
            token_to_idx = {"<PAD>": 0}

        vocab = cls(token_to_idx, use_unk=use_unk)
        for s in sequences:
            for word in s:
                vocab.add_token(word)
        return vocab

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [66]:
# Word vocabulary
word_vocab = Vocabulary.build_vocab(train_sentences)
print(word_vocab)


<Vocabulary(size=11051)>


In [67]:
# Tag vocabulary
tag_vocab = Vocabulary.build_vocab(train_tag_sequences, use_unk=False)
print(tag_vocab._token_to_idx)

{'<PAD>': 0, 'NOUN': 1, '.': 2, 'NUM': 3, 'ADJ': 4, 'VERB': 5, 'DET': 6, 'ADP': 7, 'CONJ': 8, 'PRON': 9, 'X': 10, 'ADV': 11, 'PRT': 12}


## Data Vectorizer

In [68]:
import torch
import numpy as np

def vectorize(vocab, sequence):
    """
    Args:
        vocab (Vocabulary)
        sequence (list): list of words or tags
    """
    indices = [vocab.lookup_token(token) for token in sequence]
    
    return torch.tensor(indices, dtype=torch.long)

In [69]:
print(train_sentences[0])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']


In [70]:
vectorize(word_vocab, train_sentences[0])

tensor([ 2,  3,  4,  5,  6,  7,  4,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18])

In [71]:
print(train_tag_sequences[0])

['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


In [72]:
vectorize(tag_vocab, train_tag_sequences[0])

tensor([1, 1, 2, 3, 1, 4, 2, 5, 5, 6, 1, 7, 6, 4, 1, 1, 3, 2])

Vectorize train/test data

In [73]:
train_data = [vectorize(word_vocab, t) for t in train_sentences]
test_data = [vectorize(word_vocab, t) for t in test_sentences]

train_y = [vectorize(tag_vocab, t) for t in train_tag_sequences]
test_y = [vectorize(tag_vocab, t) for t in test_tag_sequences]

## Dataset class

In [74]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, sequences, tag_sequences):
        """
        Args:
            sequences (list): list of sentences. Each sentence is a list of words
            tag_sequences (list): list of tag sequences, each for one sentence
        """
        self.sequences = sequences
        self.tag_sequences = tag_sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, index):
        x = self.sequences[index]
        y = self.tag_sequences[index]

        return x, y

Create train_dataset and test_dataset

In [75]:
train_dataset = TextDataset(train_data, train_y)
test_dataset = TextDataset(test_data, test_y)

In [76]:
print( train_dataset[1] )

(tensor([19, 20, 21, 22, 18]), tensor([8, 9, 5, 4, 2]))


## Create DataLoader

We need to define function for processing batches generated by DataLoader

In [77]:
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    """Processing a batch generated by DataLoader

    Arguments:
    -----
        batch (torch.tensor): a tensor generated by DataLoader
    """
    (x, y) = zip(*batch)
    x_lens = torch.tensor([len(x) for x in x])
    y_lens = torch.tensor([len(y) for y in y])
    
    x_pad = pad_sequence(x, batch_first=True, padding_value=0)
    y_pad = pad_sequence(y, batch_first=True, padding_value=0)

    return x_pad, y_pad, x_lens, y_lens

## RNN Tagging Model

In [78]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTMTagger(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size, 
                 num_layers=1, batch_first=True, padding_idx=0):
        
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, 
                                padding_idx=padding_idx)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers, bidirectional=True, batch_first=batch_first)
        self.fc = nn.Linear(in_features=2*hidden_dim, out_features=tagset_size)

        ## Comment out to disable weight initialization
        torch.nn.init.xavier_uniform_(self.emb.weight)
        torch.nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, x_in, x_lens):
        x_embed = self.emb(x_in)
        x_packed = pack_padded_sequence(x_embed, x_lens, batch_first=True, enforce_sorted=False)
        output_packed, _ = self.lstm(x_packed)
        output_padded, output_lengths = pad_packed_sequence(output_packed, batch_first=True)
        tag_space = self.fc(output_padded)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

## Create an LSTM Tagger Model

In [79]:
vocab_size = len(word_vocab)
embedding_dim = 300
hidden_dim = 128
num_layers = 2
tagset_size = len(tag_vocab)
batch_first = True

model = LSTMTagger(vocab_size=vocab_size, 
                   embedding_dim=embedding_dim, 
                   hidden_dim=hidden_dim,
                   num_layers=num_layers,
                   tagset_size=tagset_size, 
                   batch_first=batch_first)

In [80]:
print(model)

LSTMTagger(
  (emb): Embedding(11051, 300, padding_idx=0)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=13, bias=True)
)


## Training Loop

In [81]:
from tqdm.notebook import trange, tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

learning_rate = 1e-3
batch_size = 32
epochs = 100

criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)

def train():
    train_dataloader = DataLoader(
        train_dataset,
        collate_fn=collate_batch,
        batch_size=batch_size,
    )
    model.train()
    train_iterator = trange(int(epochs), desc="Epoch")

    for _ in train_iterator:
        for x_pad, y_pad, x_lens, y_lens in train_dataloader:
            x_pad = x_pad.to(device)
            y_pad = y_pad.to(device)

            optimizer.zero_grad()
            pred = model(x_pad, x_lens)
            
            pred = pred.view(-1, pred.shape[-1])
            y_pad = y_pad.view(-1)

            loss = criterion(pred, y_pad)
            loss.backward()
            optimizer.step()

train()


Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

## Evaluation

In [82]:
from sklearn import metrics

def evaluate():
    model.eval()
    test_dataloader = DataLoader(
        test_dataset,
        collate_fn=collate_batch,
        shuffle=False,
        batch_size=batch_size,
    )

    y_true = []
    y_pred = []
    with torch.no_grad():
        for x_pad, y_pad, x_lens, y_lens in tqdm(test_dataloader, desc="Evaluating"):
            x_pad = x_pad.to(device)
            y_pad = y_pad.to(device)

            logits = model(x_pad, x_lens)
            predictions = logits.argmax(-1)

            predictions = predictions.detach().cpu().numpy()
            y_pad = y_pad.detach().cpu().numpy()

            y_lens = y_lens.numpy()

            for i in range(y_pad.shape[0]):
                len_y = y_lens[i]
                for true_tag, predicted_tag in zip(y_pad[i][:len_y], predictions[i][:len_y]):
                    if predicted_tag != tag_vocab.pad_index:
                        true_tag = tag_vocab.lookup_index(true_tag)
                        predicted_tag = tag_vocab.lookup_index(predicted_tag)
                        y_true.append(true_tag)
                        y_pred.append(predicted_tag)

    print("Accuracy: %.4f" % metrics.accuracy_score(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))

evaluate()

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

Accuracy: 0.9572
              precision    recall  f1-score   support

           .       1.00      1.00      1.00      2354
         ADJ       0.86      0.81      0.84      1316
         ADP       0.97      0.98      0.98      2027
         ADV       0.91      0.85      0.88       634
        CONJ       0.99      0.98      0.98       470
         DET       0.99      0.99      0.99      1795
        NOUN       0.94      0.97      0.96      5943
         NUM       0.96      0.92      0.94       725
        PRON       0.98      0.99      0.98       521
         PRT       0.96      0.97      0.97       654
        VERB       0.94      0.95      0.95      2740
           X       1.00      0.96      0.98      1357

    accuracy                           0.96     20536
   macro avg       0.96      0.95      0.95     20536
weighted avg       0.96      0.96      0.96     20536



## Further Improvements

- Initialize weights of the neural networks
- Use pre-trained word embeddings

## References

- [Sequence Models and Long Short-Term Memory Networks](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html), on official Pytorch tutorial.
- [LSTM (character + word) POS-tag model PyTorch](https://www.kaggle.com/code/krishanudb/lstm-character-word-pos-tag-model-pytorch)
- [1 - BiLSTM for PoS Tagging](https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1_bilstm.ipynb), Notebook
- [bentrevett/pytorch-pos-tagging](https://github.com/bentrevett/pytorch-pos-tagging)
- [Pad pack sequences for Pytorch batch processing with DataLoader](https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html)
