!pip install -Uqq stanza
|████████████████████████████████| 574 kB 5.1 MB/s |████████████████████████████████| 4.7 MB 33.9 MB/s |████████████████████████████████| 197 kB 68.0 MB/s |████████████████████████████████| 6.6 MB 45.7 MB/s |████████████████████████████████| 120 kB 56.5 MB/s Building wheel for emoji (setup.py) ... done
import stanza
import nltk
from nltk.corpus import wordnet
from nltk.corpus import verbnet
stanza.download('en') # download English model
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse,ner')
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('verbnet')
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 0%| …
INFO:stanza:Downloading default packages for language: en (English)...
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.0/models/default.zip: 0%| | 0…
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 0%| …
INFO:stanza:Loading these models for language: en (English): ========================= | Processor | Package | ------------------------- | tokenize | combined | | pos | combined | | lemma | combined | | depparse | combined | | ner | ontonotes | ========================= INFO:stanza:Use device: cpu INFO:stanza:Loading: tokenize INFO:stanza:Loading: pos INFO:stanza:Loading: lemma INFO:stanza:Loading: depparse INFO:stanza:Loading: ner INFO:stanza:Done loading processors! [nltk_data] Downloading package omw-1.4 to /root/nltk_data... [nltk_data] Downloading package wordnet to /root/nltk_data... [nltk_data] Downloading package verbnet to /root/nltk_data... [nltk_data] Unzipping corpora/verbnet.zip.
True
def events(text):
'''
as per Page: 32
inputs plot text
returns (s, v, o, m)
where v is a verb,
s is the subject of the verb,
o is the object of the verb,
and m is the modifier—or “wildcard”,
which can be a propositional object,
indirect object, causal complement
(e.g., in “I was glad that he drove,” “drove” is the causal complement to “glad.”),
or any other dependency unclassifiable to Stanford’s dependency parser.
'''
doc = nlp(text)
v,s,o,m = 'UNK','UNK','UNK','UNK'
for sent in doc.sentences:
for word in sent.words:
if word.pos == "VERB":
# Generalized
# 3. Verbs were replaced by VerbNet [79] version 3.2.43 frames (e.g. “arrived”/“arriving” become “escape-51.1”)
# TODO: verbnet.classids('scurry')
gVerb = verbnet.classids(word.text) #.split('.')[0]
v = gVerb[0].split('.')[0] if gVerb else word.text #verb
if word.deprel == "nsubj" and word.pos == "PRON":
s = word.text #subject of the verb
if o == 'UNK' and word.deprel == "obl" and word.pos == "NOUN":
# Generalized
if word.pos == "NOUN":
# 1. Named entities were identified (cf. [77]), and “PERSON” names were replaced with the tag <PERSON>n,
# where n indicates the nth character name in the sentence. Other named entities were labelled as their
# named entity recognition (NER) category (e.g. LOCATION, ORGANIZATION, etc.)
# 2. nouns were replaced by the WordNet [78] Synset two levels up in the inherited hypernym hierarchy
# TODO: woi.hypernyms()[0]
# e.g. self-propelled vehicle.n.01 vs the original word “car” (car.n.01)), while avoiding labelling it too generally (e.g. entity.n.01
synset = wordnet.synsets(word.text)
if synset:
woi = synset[0] if synset[0] else word.text
o = woi.hypernyms()[0].name().split('.')[0]
else:
o = word.text
else:
o = word.text #object of the verb
#Character Name Numbering
#Adding Genre Information
if word.deprel == "det":
m = sent.words[word.head-1].text #modifier
return s + " " + v + " " + o + " " + m
!wget https://www.dropbox.com/s/24pa44w7u7wvtma/plots.zip
!unzip plots.zip
import pandas as pd
def parseFile():
with open('plots','r') as f:
lines = [line.split("<EOS>")[-1] for line in f if line.strip()]
return lines
df = pd.DataFrame(parseFile(), columns=["plots"])
df.head()
plots | |
---|---|
0 | Old Major, the old boar on the Manor Farm, sum... |
1 | When Major dies, two young pigs, Snowball and ... |
2 | The animals revolt and drive the drunken and i... |
3 | They adopt the Seven Commandments of Animalism... |
4 | Snowball teaches the animals to read and write... |
df.count()
plots 121649 dtype: int64
train = df.loc[9:25, ['plots']]
train.head()
plots | |
---|---|
9 | Snowball, who has been studying the battles of... |
10 | Snowball's popularity soars, and this event is... |
11 | It is celebrated annually with the firing of a... |
12 | Napoleon and Snowball struggle for pre-eminenc... |
13 | When Snowball announces his plans to modernize... |
train['event'] = train.apply(lambda row: events(row.plots), axis=1)
train.head()
plots | event | |
---|---|---|
9 | Snowball, who has been studying the battles of... | who escape-51 expectation men |
10 | Snowball's popularity soars, and this event is... | UNK proclaimed UNK Cowshed |
11 | It is celebrated annually with the firing of a... | UNK celebrated attack Revolution |
12 | Napoleon and Snowball struggle for pre-eminenc... | UNK battle-36 pre-eminence UNK |
13 | When Snowball announces his plans to modernize... | UNK declares UNK windmill |
import torch
from torch import nn
import numpy as np
# text = ['hey how are you','good i am fine','have a nice day']
text = train.event.tolist()
# Join all the sentences together and extract the unique characters from the combined sentences
chars = set(''.join(text))
# Creating a dictionary that maps integers to the characters
int2char = dict(enumerate(chars))
# Creating another dictionary that maps characters to integers
char2int = {char: ind for ind, char in int2char.items()}
# Finding the length of the longest string in our data
maxlen = len(max(text, key=len))
# Padding
# A simple loop that loops through the list of sentences and adds a ' ' whitespace until the length of
# the sentence matches the length of the longest sentence
for i in range(len(text)):
while len(text[i])<maxlen:
text[i] += ' '
# Creating lists that will hold our input and target sequences
input_seq = []
target_seq = []
for i in range(len(text)):
# Remove last character for input sequence
input_seq.append(text[i][:-1])
# Remove first character for target sequence
target_seq.append(text[i][1:])
print("Input Sequence: {}\nTarget Sequence: {}".format(input_seq[i], target_seq[i]))
Input Sequence: who escape-51 expectation men Target Sequence: ho escape-51 expectation men Input Sequence: UNK proclaimed UNK Cowshed Target Sequence: NK proclaimed UNK Cowshed Input Sequence: UNK celebrated attack Revolution Target Sequence: NK celebrated attack Revolution Input Sequence: UNK battle-36 pre-eminence UNK Target Sequence: NK battle-36 pre-eminence UNK Input Sequence: UNK declares UNK windmill Target Sequence: NK declares UNK windmill Input Sequence: who meander-47 UNK farm Target Sequence: ho meander-47 UNK farm Input Sequence: UNK claims swine idea Target Sequence: NK claims swine idea Input Sequence: UNK cooperate-73-3 commitment windmill Target Sequence: NK cooperate-73-3 commitment windmill Input Sequence: UNK sabotage atmospheric_phenomenon animal Target Sequence: NK sabotage atmospheric_phenomenon animals Input Sequence: he consorting canine farm Target Sequence: e consorting canine farm Input Sequence: who representing military_action battle Target Sequence: ho representing military_action battle Input Sequence: who adopting song man Target Sequence: ho adopting song man Input Sequence: they convinced UNK animals Target Sequence: hey convinced UNK animals Input Sequence: UNK restored UNK windmill Target Sequence: NK restored UNK windmill Input Sequence: they wounded outgo workhorse Target Sequence: hey wounded outgo workhorse Input Sequence: he working ill_health windmill Target Sequence: e working ill_health windmill Input Sequence: UNK given artistic_movement surgeon Target Sequence: NK given artistic_movement surgeon
for i in range(len(text)):
input_seq[i] = [char2int[character] for character in input_seq[i]]
target_seq[i] = [char2int[character] for character in target_seq[i]]
dict_size = len(char2int)
seq_len = maxlen - 1
batch_size = len(text)
def one_hot_encode(sequence, dict_size, seq_len, batch_size):
# Creating a multi-dimensional array of zeros with the desired output shape
features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
# Replacing the 0 at the relevant character index with a 1 to represent that character
for i in range(batch_size):
for u in range(seq_len):
features[i, u, sequence[i][u]] = 1
return features
# Input shape --> (Batch Size, Sequence Length, One-Hot Encoding Size)
input_seq = one_hot_encode(input_seq, dict_size, seq_len, batch_size)
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()
# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
device = torch.device("cuda")
print("GPU is available")
else:
device = torch.device("cpu")
print("GPU not available, CPU used")
GPU not available, CPU used
class Model(nn.Module):
def __init__(self, input_size, output_size, hidden_dim, n_layers):
super(Model, self).__init__()
# Defining some parameters
self.hidden_dim = hidden_dim
self.n_layers = n_layers
#Defining the layers
# RNN Layer
self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)
# Fully connected layer
self.fc = nn.Linear(hidden_dim, output_size)
def forward(self, x):
batch_size = x.size(0)
# Initializing hidden state for first input using method defined below
hidden = self.init_hidden(batch_size)
# Passing in the input and hidden state into the model and obtaining outputs
out, hidden = self.rnn(x, hidden)
# Reshaping the outputs such that it can be fit into the fully connected layer
out = out.contiguous().view(-1, self.hidden_dim)
out = self.fc(out)
return out, hidden
def init_hidden(self, batch_size):
# This method generates the first hidden state of zeros which we'll use in the forward pass
# We'll send the tensor holding the hidden state to the device we specified earlier as well
hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
return hidden
# Instantiate the model with hyperparameters
model = Model(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
# We'll also set the model to the device that we defined earlier (default is CPU)
model.to(device)
# Define hyperparameters
n_epochs = 100
lr=0.01
# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Training Run
for epoch in range(1, n_epochs + 1):
optimizer.zero_grad() # Clears existing gradients from previous epoch
input_seq.to(device)
output, hidden = model(input_seq)
loss = criterion(output, target_seq.view(-1).long())
loss.backward() # Does backpropagation and calculates gradients
optimizer.step() # Updates the weights accordingly
if epoch%10 == 0:
print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
print("Loss: {:.4f}".format(loss.item()))
Epoch: 10/100............. Loss: 2.6087 Epoch: 20/100............. Loss: 2.5179 Epoch: 30/100............. Loss: 2.4569 Epoch: 40/100............. Loss: 2.3277 Epoch: 50/100............. Loss: 2.1268 Epoch: 60/100............. Loss: 1.9311 Epoch: 70/100............. Loss: 1.7815 Epoch: 80/100............. Loss: 1.6578 Epoch: 90/100............. Loss: 1.5524 Epoch: 100/100............. Loss: 1.4615
# This function takes in the model and character as arguments and returns the next character prediction and hidden state
def predict(model, character):
# One-hot encoding our input to fit into the model
character = np.array([[char2int[c] for c in character]])
character = one_hot_encode(character, dict_size, character.shape[1], 1)
character = torch.from_numpy(character)
character.to(device)
out, hidden = model(character)
prob = nn.functional.softmax(out[-1], dim=0).data
# Taking the class with the highest probability score from the output
char_ind = torch.max(prob, dim=0)[1].item()
return int2char[char_ind], hidden
# This function takes the desired output length and input characters as arguments, returning the produced sentence
def sample(model, out_len, start='hey'):
model.eval() # eval mode
start = start.lower()
# First off, run through the starting characters
chars = [ch for ch in start]
size = out_len - len(chars)
# Now pass in the previous characters and get a new one
for ii in range(size):
char, h = predict(model, chars)
chars.append(char)
return ''.join(chars)
sample(model, 15, 'claims')
'claimse '