In [1]:

!pip install -Uqq stanza

     |████████████████████████████████| 574 kB 5.1 MB/s 
     |████████████████████████████████| 4.7 MB 33.9 MB/s 
     |████████████████████████████████| 197 kB 68.0 MB/s 
     |████████████████████████████████| 6.6 MB 45.7 MB/s 
     |████████████████████████████████| 120 kB 56.5 MB/s 
  Building wheel for emoji (setup.py) ... done

In [2]:

import stanza
import nltk
from nltk.corpus import wordnet
from nltk.corpus import verbnet

stanza.download('en') # download English model
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse,ner')

nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('verbnet')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English)...

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
=========================
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |
=========================

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package verbnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/verbnet.zip.

Out[2]:

True

In [11]:

def events(text):
  '''
  as per Page: 32
  inputs plot text
  returns (s, v, o, m)
  where v is a verb, 
  s is the subject of the verb,
  o is the object of the verb, 
  and m is the modifier—or “wildcard”, 
  which can be a propositional object, 
  indirect object, causal complement 
  (e.g., in “I was glad that he drove,” “drove” is the causal complement to “glad.”), 
  or any other dependency unclassifiable to Stanford’s dependency parser.
  '''
  doc = nlp(text)

  v,s,o,m = 'UNK','UNK','UNK','UNK'
  for sent in doc.sentences:
    for word in sent.words:
      if word.pos == "VERB":
        # Generalized
        # 3. Verbs were replaced by VerbNet [79] version 3.2.43 frames (e.g. “arrived”/“arriving” become “escape-51.1”)
        # TODO: verbnet.classids('scurry')
        gVerb = verbnet.classids(word.text) #.split('.')[0]
        v =  gVerb[0].split('.')[0] if gVerb else word.text #verb
      if word.deprel == "nsubj" and word.pos == "PRON":
        s = word.text #subject of the verb
      if o == 'UNK' and word.deprel == "obl" and word.pos == "NOUN":
        # Generalized
        if  word.pos == "NOUN":
          # 1. Named entities were identified (cf. [77]), and “PERSON” names were replaced with the tag <PERSON>n, 
          #    where n indicates the nth character name in the sentence. Other named entities were labelled as their 
          #    named entity recognition (NER) category (e.g. LOCATION, ORGANIZATION, etc.)
          # 2. nouns were replaced by the WordNet [78] Synset two levels up in the inherited hypernym hierarchy
          # TODO: woi.hypernyms()[0]
          # e.g. self-propelled vehicle.n.01 vs the original word “car” (car.n.01)), while avoiding labelling it too generally (e.g. entity.n.01
          synset = wordnet.synsets(word.text)
          if synset:
            woi = synset[0] if synset[0] else word.text
            o = woi.hypernyms()[0].name().split('.')[0]
          else:
            o = word.text
        else:
          o = word.text #object of the verb
        #Character Name Numbering

        #Adding Genre Information
      if  word.deprel == "det":
        m = sent.words[word.head-1].text #modifier
  return s + " " + v + " " + o + " " +  m

In [ ]:

!wget https://www.dropbox.com/s/24pa44w7u7wvtma/plots.zip

In [ ]:

!unzip plots.zip

In [4]:

import pandas as pd

def parseFile():
    with open('plots','r') as f:
        lines = [line.split("<EOS>")[-1] for line in f if line.strip()]
    return lines

df = pd.DataFrame(parseFile(), columns=["plots"])

df.head()

Out[4]:

	plots
0	Old Major, the old boar on the Manor Farm, sum...
1	When Major dies, two young pigs, Snowball and ...
2	The animals revolt and drive the drunken and i...
3	They adopt the Seven Commandments of Animalism...
4	Snowball teaches the animals to read and write...

In [6]:

df.count()

Out[6]:

plots    121649
dtype: int64

In [5]:

train = df.loc[9:25, ['plots']]

In [9]:

train.head()

Out[9]:

	plots
9	Snowball, who has been studying the battles of...
10	Snowball's popularity soars, and this event is...
11	It is celebrated annually with the firing of a...
12	Napoleon and Snowball struggle for pre-eminenc...
13	When Snowball announces his plans to modernize...

In [12]:

train['event'] = train.apply(lambda row: events(row.plots), axis=1)

In [13]:

train.head()

Out[13]:

	plots	event
9	Snowball, who has been studying the battles of...	who escape-51 expectation men
10	Snowball's popularity soars, and this event is...	UNK proclaimed UNK Cowshed
11	It is celebrated annually with the firing of a...	UNK celebrated attack Revolution
12	Napoleon and Snowball struggle for pre-eminenc...	UNK battle-36 pre-eminence UNK
13	When Snowball announces his plans to modernize...	UNK declares UNK windmill

Events to Events RNN¶

RNN example

In [14]:

import torch
from torch import nn

import numpy as np

In [15]:

# text = ['hey how are you','good i am fine','have a nice day']
text = train.event.tolist()
# Join all the sentences together and extract the unique characters from the combined sentences
chars = set(''.join(text))

# Creating a dictionary that maps integers to the characters
int2char = dict(enumerate(chars))

# Creating another dictionary that maps characters to integers
char2int = {char: ind for ind, char in int2char.items()}

In [16]:

# Finding the length of the longest string in our data
maxlen = len(max(text, key=len))

# Padding

# A simple loop that loops through the list of sentences and adds a ' ' whitespace until the length of
# the sentence matches the length of the longest sentence
for i in range(len(text)):
  while len(text[i])<maxlen:
      text[i] += ' '

In [17]:

# Creating lists that will hold our input and target sequences
input_seq = []
target_seq = []

for i in range(len(text)):
    # Remove last character for input sequence
  input_seq.append(text[i][:-1])
    
    # Remove first character for target sequence
  target_seq.append(text[i][1:])
  print("Input Sequence: {}\nTarget Sequence: {}".format(input_seq[i], target_seq[i]))

Input Sequence: who escape-51 expectation men             
Target Sequence: ho escape-51 expectation men              
Input Sequence: UNK proclaimed UNK Cowshed                
Target Sequence: NK proclaimed UNK Cowshed                 
Input Sequence: UNK celebrated attack Revolution          
Target Sequence: NK celebrated attack Revolution           
Input Sequence: UNK battle-36 pre-eminence UNK            
Target Sequence: NK battle-36 pre-eminence UNK             
Input Sequence: UNK declares UNK windmill                 
Target Sequence: NK declares UNK windmill                  
Input Sequence: who meander-47 UNK farm                   
Target Sequence: ho meander-47 UNK farm                    
Input Sequence: UNK claims swine idea                     
Target Sequence: NK claims swine idea                      
Input Sequence: UNK cooperate-73-3 commitment windmill    
Target Sequence: NK cooperate-73-3 commitment windmill     
Input Sequence: UNK sabotage atmospheric_phenomenon animal
Target Sequence: NK sabotage atmospheric_phenomenon animals
Input Sequence: he consorting canine farm                 
Target Sequence: e consorting canine farm                  
Input Sequence: who representing military_action battle   
Target Sequence: ho representing military_action battle    
Input Sequence: who adopting song man                     
Target Sequence: ho adopting song man                      
Input Sequence: they convinced UNK animals                
Target Sequence: hey convinced UNK animals                 
Input Sequence: UNK restored UNK windmill                 
Target Sequence: NK restored UNK windmill                  
Input Sequence: they wounded outgo workhorse              
Target Sequence: hey wounded outgo workhorse               
Input Sequence: he working ill_health windmill            
Target Sequence: e working ill_health windmill             
Input Sequence: UNK given artistic_movement surgeon       
Target Sequence: NK given artistic_movement surgeon

In [18]:

for i in range(len(text)):
    input_seq[i] = [char2int[character] for character in input_seq[i]]
    target_seq[i] = [char2int[character] for character in target_seq[i]]

In [19]:

dict_size = len(char2int)
seq_len = maxlen - 1
batch_size = len(text)

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    # Creating a multi-dimensional array of zeros with the desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
    
    # Replacing the 0 at the relevant character index with a 1 to represent that character
    for i in range(batch_size):
        for u in range(seq_len):
            features[i, u, sequence[i][u]] = 1
    return features

In [20]:

# Input shape --> (Batch Size, Sequence Length, One-Hot Encoding Size)
input_seq = one_hot_encode(input_seq, dict_size, seq_len, batch_size)

In [21]:

input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)

In [22]:

# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used

In [23]:

class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)   
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, x):
        
        batch_size = x.size(0)

        # Initializing hidden state for first input using method defined below
        hidden = self.init_hidden(batch_size)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [24]:

# Instantiate the model with hyperparameters
model = Model(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
# We'll also set the model to the device that we defined earlier (default is CPU)
model.to(device)

# Define hyperparameters
n_epochs = 100
lr=0.01

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [25]:

# Training Run
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad() # Clears existing gradients from previous epoch
    input_seq.to(device)
    output, hidden = model(input_seq)
    loss = criterion(output, target_seq.view(-1).long())
    loss.backward() # Does backpropagation and calculates gradients
    optimizer.step() # Updates the weights accordingly
    
    if epoch%10 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

Epoch: 10/100............. Loss: 2.6087
Epoch: 20/100............. Loss: 2.5179
Epoch: 30/100............. Loss: 2.4569
Epoch: 40/100............. Loss: 2.3277
Epoch: 50/100............. Loss: 2.1268
Epoch: 60/100............. Loss: 1.9311
Epoch: 70/100............. Loss: 1.7815
Epoch: 80/100............. Loss: 1.6578
Epoch: 90/100............. Loss: 1.5524
Epoch: 100/100............. Loss: 1.4615

In [26]:

# This function takes in the model and character as arguments and returns the next character prediction and hidden state
def predict(model, character):
    # One-hot encoding our input to fit into the model
    character = np.array([[char2int[c] for c in character]])
    character = one_hot_encode(character, dict_size, character.shape[1], 1)
    character = torch.from_numpy(character)
    character.to(device)
    
    out, hidden = model(character)

    prob = nn.functional.softmax(out[-1], dim=0).data
    # Taking the class with the highest probability score from the output
    char_ind = torch.max(prob, dim=0)[1].item()

    return int2char[char_ind], hidden

In [27]:

# This function takes the desired output length and input characters as arguments, returning the produced sentence
def sample(model, out_len, start='hey'):
    model.eval() # eval mode
    start = start.lower()
    # First off, run through the starting characters
    chars = [ch for ch in start]
    size = out_len - len(chars)
    # Now pass in the previous characters and get a new one
    for ii in range(size):
        char, h = predict(model, chars)
        chars.append(char)

    return ''.join(chars)

In [34]:

sample(model, 15, 'claims')

Out[34]:

'claimse        '