# data - cleaned wiki articles
!wget -q https://s3.amazonaws.com/video.udacity-data.com/topher/2018/October/5bbe6499_text8/text8.zip
!unzip -q text8.zip && rm text8.zip
!ls -al .
total 97676
drwxr-xr-x 1 root root      4096 Sep  9 14:25 .
drwxr-xr-x 1 root root      4096 Sep  9 14:21 ..
drwxr-xr-x 4 root root      4096 Sep  1 19:26 .config
drwxr-xr-x 1 root root      4096 Sep  1 19:26 sample_data
-rw-r--r-- 1 root root 100000000 Jun  9  2006 text8
# read in the extracted text file
with open('text8') as f:
text = f.read()
# print out the first 1000 characters
text[:1000]
' anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization of society it has also been taken up as a positive label by self defined anarchists the word anarchism is derived from the greek without archons ruler chief king anarchism as a political philosophy is the belief that rulers are unnecessary and should be abolished although there are differing interpretations of what this means anarchism also refers to related social movements that advocate the elimination of authoritarian institutions particularly the state the word anarchy as most anarchists use it does not imply chaos nihilism or anomie but rather a harmonious anti authoritarian society in place of what are regarded as authoritarian political structures and coercive economic instituti'
import re
from collections import Counter
def preprocess(text):
# Replace punctuation with tokens so we can use them in our model
text = text.lower()
text = text.replace('.', ' <PERIOD> ')
text = text.replace(',', ' <COMMA> ')
text = text.replace('"', ' <QUOTATION_MARK> ')
text = text.replace(';', ' <SEMICOLON> ')
text = text.replace('!', ' <EXCLAMATION_MARK> ')
text = text.replace('?', ' <QUESTION_MARK> ')
text = text.replace('(', ' <LEFT_PAREN> ')
text = text.replace(')', ' <RIGHT_PAREN> ')
text = text.replace('--', ' <HYPHENS> ')
# text = text.replace('\n', ' <NEW_LINE> ')
text = text.replace(':', ' <COLON> ')
words = text.split()
# Remove all words with 5 or fewer occurrences
word_counts = Counter(words)
trimmed_words = [word for word in words if word_counts[word] > 5]
return trimmed_words
# get list of words
words = preprocess(text)
print(words[:30])
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst']
# print some stats about this word data
print("Total words in text: {}".format(len(words)))
print("Unique words: {}".format(len(set(words)))) # `set` removes any duplicate words
Total words in text: 16680599
Unique words: 63641
def create_lookup_tables(words):
"""
Create lookup tables for vocabulary
:param words: Input list of words
:return: Two dictionaries, vocab_to_int, int_to_vocab
"""
word_counts = Counter(words)
# sorting the words from most to least frequent in text occurrence
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
# create int_to_vocab dictionaries
int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
return vocab_to_int, int_to_vocab
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]
print(int_words[:30])
[5233, 3080, 11, 5, 194, 1, 3133, 45, 58, 155, 127, 741, 476, 10571, 133, 0, 27349, 1, 0, 102, 854, 2, 0, 15067, 58112, 1, 0, 150, 854, 3580]
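As a quick sanity check that the two lookup tables are consistent, we can map the first few ids back to words; a small sketch:
# map the first ten ids back through int_to_vocab; this should reproduce the original tokens
print(' '.join(int_to_vocab[ii] for ii in int_words[:10]))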
from collections import Counter
import random
import numpy as np
threshold = 1e-5
word_counts = Counter(int_words)
#print(list(word_counts.items())[0]) # dictionary of int_words, how many times they appear
total_count = len(int_words)
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
# discard some frequent words, according to the subsampling equation
# create a new list of words for training
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]
print(train_words[:30])
[5233, 45, 476, 10571, 27349, 15067, 58112, 3580, 10712, 1324, 3672, 371, 539, 97, 2757, 7088, 5233, 44611, 2877, 792, 5233, 2621, 8983, 4147, 6437, 153, 5233, 19, 4860, 6753]
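To get a feel for what the subsampling equation p_drop = 1 - sqrt(threshold / freq) is doing, here is a small sanity-check sketch: it compares the drop probability of a very frequent word with that of a rarer one and reports how many tokens survive (exact numbers vary with the random draws).
# frequent words should have a high drop probability, rare words a much lower one
print('p_drop for "the":', p_drop[vocab_to_int['the']])
print('p_drop for "anarchism":', p_drop[vocab_to_int['anarchism']])
print('kept {} of {} tokens after subsampling'.format(len(train_words), len(int_words)))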
def get_target(words, idx, window_size=5):
''' Get a list of words in a window around an index. '''
# pick a random window size R in [1, window_size]; on average this weights closer words more heavily
R = np.random.randint(1, window_size+1)
start = idx - R if (idx - R) > 0 else 0
stop = idx + R
target_words = words[start:idx] + words[idx+1:stop+1]
return list(target_words)
# run this cell multiple times to check for random window selection
int_text = [i for i in range(10)]
print('Input: ', int_text)
idx=5 # word index of interest
target = get_target(int_text, idx=idx, window_size=5)
print('Target: ', target) # you should get some indices around the idx
Input:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Target:  [2, 3, 4, 6, 7, 8]
def get_batches(words, batch_size, window_size=5):
''' Create a generator of word batches as a tuple (inputs, targets) '''
n_batches = len(words)//batch_size
# only full batches
words = words[:n_batches*batch_size]
for idx in range(0, len(words), batch_size):
x, y = [], []
batch = words[idx:idx+batch_size]
for ii in range(len(batch)):
batch_x = batch[ii]
batch_y = get_target(batch, ii, window_size)
y.extend(batch_y)
x.extend([batch_x]*len(batch_y))
yield x, y
int_text = [i for i in range(20)]
x,y = next(get_batches(int_text, batch_size=4, window_size=5))
print('x\n', x)
print('y\n', y)
x
 [0, 0, 0, 1, 1, 1, 2, 2, 2, 3]
y
 [1, 2, 3, 0, 2, 3, 0, 1, 3, 2]
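Note that every input id in x is repeated once for each of its context words in y, so the two lists always line up one-to-one; a quick sketch to confirm:
# each (center, context) pair occupies one position in x and one in y
assert len(x) == len(y)
print(list(zip(x, y))[:6])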
import torch
from torch import nn
import torch.optim as optim
def cosine_similarity(embedding, valid_size=16, valid_window=100, device='cpu'):
""" Returns the cosine similarity of validation words with words in the embedding matrix.
Here, embedding should be a PyTorch embedding module.
"""
# Here we're calculating the cosine similarity between some random words and
# our embedding vectors. With the similarities, we can look at what words are
# close to our random words.
# sim = (a . b) / |a||b|; we only divide by |b| below, which leaves each row's ranking unchanged
embed_vectors = embedding.weight
# magnitude of embedding vectors, |b|
magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)
# pick N words from our ranges (0,window) and (1000,1000+window). lower id implies more frequent
valid_examples = np.array(random.sample(range(valid_window), valid_size//2))
valid_examples = np.append(valid_examples,
random.sample(range(1000,1000+valid_window), valid_size//2))
valid_examples = torch.LongTensor(valid_examples).to(device)
valid_vectors = embedding(valid_examples)
similarities = torch.mm(valid_vectors, embed_vectors.t())/magnitudes
return valid_examples, similarities
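As a quick shape check, cosine_similarity can be called on a small untrained embedding; the sizes below are arbitrary (the vocabulary just has to be larger than 1000 + valid_window), so this is only a sketch.
# expect 16 validation ids and a (16, vocab_size) similarity matrix
_emb = nn.Embedding(2000, 10)
_examples, _sims = cosine_similarity(_emb)
print(_examples.shape, _sims.shape)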
class SkipGram(nn.Module):
def __init__(self, n_vocab, n_embed):
super().__init__()
self.embed = nn.Embedding(n_vocab, n_embed)
self.output = nn.Linear(n_embed, n_vocab)
self.log_softmax = nn.LogSoftmax(dim=1)
def forward(self, x):
x = self.embed(x)
scores = self.output(x)
log_ps = self.log_softmax(scores)
return log_ps
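The SkipGram class above is the plain softmax variant; we don't train it below (the negative-sampling model is used instead), but a minimal training sketch for it would pair its log-softmax output with NLLLoss. The names and the batch size here are purely illustrative, and it assumes get_batches and train_words from earlier.
# sketch: a single optimization step for the plain-softmax SkipGram model
sg_model = SkipGram(len(vocab_to_int), n_embed=300)
sg_criterion = nn.NLLLoss()
sg_optimizer = optim.Adam(sg_model.parameters(), lr=0.003)
x_batch, y_batch = next(get_batches(train_words, 16))
log_ps = sg_model(torch.LongTensor(x_batch))  # (batch, n_vocab) log-probabilities
loss = sg_criterion(log_ps, torch.LongTensor(y_batch))
sg_optimizer.zero_grad()
loss.backward()
sg_optimizer.step()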
import torch
from torch import nn
import torch.optim as optim
class SkipGramNeg(nn.Module):
def __init__(self, n_vocab, n_embed, noise_dist=None):
super().__init__()
self.n_vocab = n_vocab
self.n_embed = n_embed
self.noise_dist = noise_dist
# define embedding layers for input and output words
self.in_embed = nn.Embedding(n_vocab, n_embed)
self.out_embed = nn.Embedding(n_vocab, n_embed)
# Initialize embedding tables with uniform distribution
# I believe this helps with convergence
self.in_embed.weight.data.uniform_(-1, 1)
self.out_embed.weight.data.uniform_(-1, 1)
def forward_input(self, input_words):
input_vectors = self.in_embed(input_words)
return input_vectors
def forward_output(self, output_words):
output_vectors = self.out_embed(output_words)
return output_vectors
def forward_noise(self, batch_size, n_samples):
""" Generate noise vectors with shape (batch_size, n_samples, n_embed)"""
if self.noise_dist is None:
# Sample words uniformly
noise_dist = torch.ones(self.n_vocab)
else:
noise_dist = self.noise_dist
# Sample words from our noise distribution
noise_words = torch.multinomial(noise_dist,
batch_size * n_samples,
replacement=True)
device = "cuda" if model.out_embed.weight.is_cuda else "cpu"
noise_words = noise_words.to(device)
noise_vectors = self.out_embed(noise_words).view(batch_size, n_samples, self.n_embed)
return noise_vectors
class NegativeSamplingLoss(nn.Module):
def __init__(self):
super().__init__()
def forward(self, input_vectors, output_vectors, noise_vectors):
batch_size, embed_size = input_vectors.shape
# Input vectors should be a batch of column vectors
input_vectors = input_vectors.view(batch_size, embed_size, 1)
# Output vectors should be a batch of row vectors
output_vectors = output_vectors.view(batch_size, 1, embed_size)
# bmm = batch matrix multiplication
# log-sigmoid loss for the true context (output) words
out_loss = torch.bmm(output_vectors, input_vectors).sigmoid().log()
out_loss = out_loss.squeeze()
# log-sigmoid loss for the sampled noise words (note the negated noise vectors)
noise_loss = torch.bmm(noise_vectors.neg(), input_vectors).sigmoid().log()
noise_loss = noise_loss.squeeze().sum(1) # sum the losses over the sample of noise vectors
# negate and sum correct and noisy log-sigmoid losses
# return average batch loss
return -(out_loss + noise_loss).mean()
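A quick sanity check of the loss on random tensors with made-up sizes (batch of 4, 300-dimensional embeddings, 5 noise samples); it should come back as a single scalar. Just a sketch:
# NegativeSamplingLoss should reduce a batch to one scalar
_in = torch.randn(4, 300)
_out = torch.randn(4, 300)
_noise = torch.randn(4, 5, 300)
print(NegativeSamplingLoss()(_in, _out, _noise))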
Below is our training loop. I recommend training on a GPU, if available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Get our noise distribution
# Using word frequencies calculated earlier in the notebook
# freqs.values() sorted in descending order lines up with the word ids, since ids were assigned by descending frequency
word_freqs = np.array(sorted(freqs.values(), reverse=True))
unigram_dist = word_freqs/word_freqs.sum()
noise_dist = torch.from_numpy(unigram_dist**(0.75)/np.sum(unigram_dist**(0.75)))
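# sanity-check sketch: the noise distribution should have one entry per vocabulary word and sum to (roughly) one
print(noise_dist.shape, noise_dist.sum().item())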
# instantiating the model
embedding_dim = 300
model = SkipGramNeg(len(vocab_to_int), embedding_dim, noise_dist=noise_dist).to(device)
# using the loss that we defined
criterion = NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
print_every = 1500
steps = 0
epochs = 5
# train for some number of epochs
for e in range(epochs):
# get our input, target batches
for input_words, target_words in get_batches(train_words, 512):
steps += 1
inputs, targets = torch.LongTensor(input_words), torch.LongTensor(target_words)
inputs, targets = inputs.to(device), targets.to(device)
# input, output, and noise vectors
input_vectors = model.forward_input(inputs)
output_vectors = model.forward_output(targets)
noise_vectors = model.forward_noise(inputs.shape[0], 5)
# negative sampling loss
loss = criterion(input_vectors, output_vectors, noise_vectors)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# loss stats
if steps % print_every == 0:
print("Epoch: {}/{}".format(e+1, epochs))
print("Loss: ", loss.item()) # avg batch loss at this point in training
valid_examples, valid_similarities = cosine_similarity(model.in_embed, device=device)
_, closest_idxs = valid_similarities.topk(6)
valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
for ii, valid_idx in enumerate(valid_examples):
closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
print("...\n")
Epoch: 1/5 Loss: 6.795905590057373 its | blue, minstrels, braille, exempting, goncourt all | being, epigraphic, warmest, quell, saudi time | construction, pearlman, thirds, buenos, robbers was | landscapes, the, geometrical, forms, orgies american | scarborough, banality, restored, technically, a have | cellulose, one, protectors, discrimination, seeds when | teletext, leaves, wholesale, wort, removed that | designers, summers, justification, gynt, death scale | assyria, butt, visitors, residence, booted versions | liberation, heraldic, underscores, exclusively, woodwinds additional | differently, solomon, moran, recreating, filipino know | paradox, phones, erects, collateral, powerbook experience | appreciable, company, celebrities, aether, dermal bbc | compilers, nichols, repayment, words, seeing animals | bladders, john, inscriptions, israelites, golf except | struggled, taggart, hazlitt, sasquatch, u ... Epoch: 1/5 Loss: 5.46177339553833 seven | one, zero, two, nine, the but | of, in, the, and, as more | of, and, nine, one, s such | to, classical, one, as, and about | and, welcoming, on, marianas, to state | the, shlomo, during, manhattan, ships known | prophecy, canis, colleges, zero, untie it | of, the, in, as, to existence | zionist, forbids, rcmp, formed, stub mainly | rws, nica, constitutions, brigham, catalans frac | public, capture, selectively, him, implication bible | binders, ohrid, arrangements, reflexes, to quite | frisch, to, ria, most, phonemes universe | boxers, introducing, barrie, donkeys, caloric primarily | aerospace, salad, location, gettysburg, encyclopaedia notes | permo, tomography, peroxide, gross, egyptian ... Epoch: 1/5 Loss: 4.1365885734558105 it | of, the, a, that, as about | of, such, also, the, and three | one, six, two, zero, nine or | which, the, this, of, that he | in, his, of, was, the only | this, at, not, of, the known | in, most, a, as, that people | of, in, to, as, which powers | salem, dribble, to, explosions, craggy rise | ornament, aruba, animalia, disproportionate, aligned ice | president, mitra, uplift, noises, shinogi pre | elephant, concert, praeparatio, leaves, birth versions | underscores, heraldic, golfers, woodwinds, cgi except | brag, volta, newsome, skis, oliphant stage | foraminifera, deckard, trope, announces, osiris joseph | incomprehensible, methoxy, statewide, respond, back ... Epoch: 1/5 Loss: 3.478447675704956 two | six, nine, three, seven, one used | are, using, use, can, is other | that, some, or, which, they they | not, to, the, but, which has | also, to, by, in, of the | of, and, to, in, by seven | nine, eight, six, one, three also | in, the, and, by, of operations | ayahuasca, progress, suits, same, dialect report | least, bryce, unlike, ugarte, dump numerous | ratify, better, in, beginning, intuition file | system, need, software, data, use troops | government, the, war, state, policy discovered | terminal, altar, powdered, however, law heavy | iguanas, faster, certain, transnistria, excludes orthodox | church, christian, christians, state, celebrating ... 
Epoch: 1/5 Loss: 3.3221445083618164 is | the, which, other, a, or between | of, still, see, and, which most | and, in, of, the, to only | in, by, as, and, which with | the, were, by, was, in all | it, or, the, are, of may | a, through, most, is, an if | we, be, they, does, can something | think, if, things, provided, get san | newspaper, seven, one, player, zero http | www, com, external, org, online frac | x, z, equation, vector, n stage | show, rumoured, film, in, best bible | biblical, scholars, text, christian, god active | planned, mandatory, behe, ige, until egypt | period, kingdom, from, tribes, eastern ... Epoch: 1/5 Loss: 3.260934591293335 only | so, to, a, and, the b | d, seven, writer, composer, six however | can, a, with, the, to can | because, or, this, all, be system | operating, systems, data, using, allows is | the, a, this, and, in their | the, in, with, or, this from | is, the, of, and, or derived | words, languages, pronunciation, word, language troops | forces, army, military, war, battle file | data, software, web, interface, user mathematics | mathematical, theory, analysis, published, j http | www, com, links, online, external joseph | politician, b, composer, robert, author mainly | important, north, some, modern, territory police | forces, independence, government, military, attack ... Epoch: 2/5 Loss: 2.8388876914978027 zero | one, five, two, four, seven by | to, the, of, on, had no | so, not, be, do, their at | the, were, on, s, and years | birth, year, age, population, until over | to, than, from, the, between in | the, from, to, of, also on | the, by, a, to, and pope | catholic, john, iv, bishop, church bbc | news, weekly, official, april, directory taking | he, this, his, out, what pre | the, what, well, as, adopted dr | actor, writer, david, actress, founder report | org, http, forum, links, agency institute | university, links, school, research, education accepted | view, doctrine, speaking, as, taught ... Epoch: 2/5 Loss: 2.8355414867401123 all | an, this, is, by, the new | york, university, college, school, community american | companies, actor, list, african, americans also | and, as, the, include, to than | is, to, while, which, more war | soviet, battle, army, troops, forces b | n, k, x, f, h known | the, a, of, with, old applications | computer, application, computers, allow, systems existence | questions, that, theories, universe, our notes | note, given, written, an, book file | files, unix, windows, interface, system primarily | small, mostly, factors, various, gas creation | concept, universal, philosophy, interpretation, societies shown | using, closed, p, being, e additional | the, is, hand, usually, device ... Epoch: 2/5 Loss: 2.8384206295013428 by | the, in, of, and, was be | any, is, can, if, a for | to, a, is, example, such about | of, for, however, to, also is | the, a, are, this, or often | example, is, some, sometimes, typically where | a, is, an, the, along war | troops, soviet, invasion, soldiers, battle ice | wet, dry, coast, snow, volcanic police | military, national, armed, government, court older | household, birth, living, age, married universe | theories, matter, cosmic, our, observations placed | mouth, tiny, display, surfaces, normal smith | robert, williams, richard, jack, actress defense | military, legal, security, commission, enforcement file | files, windows, web, unix, user ... 
Epoch: 2/5 Loss: 2.6320250034332275 during | was, in, the, forced, war history | culture, geography, egypt, century, south war | army, against, allied, troops, warfare as | form, have, most, and, such it | to, this, is, would, in states | united, national, union, nations, territories time | however, the, a, of, it four | eight, one, six, nine, seven creation | creator, rejected, view, though, inspired engineering | research, technology, electrical, computer, institute governor | appointed, minister, president, leader, elected alternative | primary, modern, example, universal, more mainly | from, principally, formerly, region, northern active | cell, due, technical, move, involved question | questions, something, saying, answer, that notes | online, page, press, literature, reading ... Epoch: 2/5 Loss: 2.4916765689849854 no | not, does, minor, i, g will | can, make, if, does, not people | americans, population, the, living, of five | two, seven, four, six, eight it | is, that, the, but, in there | is, the, are, all, of more | by, these, but, such, rather system | operating, systems, data, limited, applications marriage | marry, married, daughter, zeus, child quite | very, like, be, using, other ocean | atlantic, pacific, islands, coast, satellite arts | martial, students, academy, schools, sports engine | engines, turbine, piston, combustion, jet scale | scales, production, greatly, massive, development something | we, saying, things, my, thing account | writings, evidence, according, accounts, book ... Epoch: 2/5 Loss: 2.714233636856079 the | in, a, to, and, of it | a, then, this, if, to up | could, at, a, through, after known | of, ancient, whose, modern, named for | both, and, s, as, special its | use, form, mainly, these, interaction time | much, was, and, upon, previous s | on, for, the, first, was troops | army, forces, regiment, armed, infantry report | news, investigation, reports, review, pdf brother | his, son, wife, duke, father active | formed, its, mechanism, radical, with san | francisco, diego, california, los, angeles instance | method, physical, should, cannot, expressions shown | are, is, same, pattern, e articles | online, wikipedia, reading, publications, topics ... Epoch: 3/5 Loss: 2.4762697219848633 five | one, zero, four, six, seven this | be, they, because, that, even seven | eight, one, four, two, five were | was, of, in, the, now american | actor, actress, journalist, politician, songwriter about | according, have, at, than, there d | b, politician, composer, laureate, physicist his | he, him, himself, spent, brother file | files, windows, format, interface, microsoft orthodox | catholic, churches, church, christians, christianity engine | engines, powered, motors, turbine, piston brother | wife, died, son, father, cousin taking | out, to, stay, took, fled gold | silver, iron, copper, purple, glass defense | force, military, personnel, forces, defensive running | run, shooting, ran, record, hit ... 
Epoch: 3/5 Loss: 2.578321695327759 while | than, still, or, them, a a | as, is, in, or, the state | university, college, located, county, colorado he | his, him, himself, met, son been | that, than, to, were, have which | is, the, from, a, some states | united, america, national, american, mexico three | seven, two, zero, nine, four accepted | not, these, theologians, catholic, understood construction | building, formal, structures, natural, petroleum nobel | prize, laureate, chemist, physicist, novelist placed | filled, plastic, removed, balls, front writers | fiction, births, novelists, winners, literature magazine | cartoons, newspapers, published, interview, reviews powers | elected, power, judicial, legislative, government troops | army, battle, forces, infantry, war ... Epoch: 3/5 Loss: 2.280730724334717 not | is, be, a, or, the with | the, as, also, a, and nine | seven, six, eight, one, five during | period, july, had, late, returned from | of, by, the, and, or use | used, these, are, uses, different between | which, regions, is, the, central years | five, year, zero, one, months versions | microsoft, windows, version, hardware, linux ice | dry, wet, rocks, glaciers, water channel | channels, stations, radio, network, station consists | each, is, basis, process, smaller woman | she, mother, her, child, young notes | written, repertoire, note, harmonic, tuning accepted | not, disputed, criteria, evidence, some writers | fiction, novelists, authors, literature, births ... Epoch: 3/5 Loss: 2.522282600402832 their | were, the, which, from, other a | to, the, an, it, of as | form, is, and, such, to into | of, the, in, which, and american | actress, actor, nobel, singer, laureate people | who, have, living, many, certain i | son, r, my, you, t by | of, in, the, and, to stage | directing, actors, film, productions, films shows | show, singles, hollywood, tour, album mean | is, we, similarly, correct, sometimes nobel | prize, laureate, chemist, physicist, physiology creation | created, not, the, supernatural, eventual event | events, of, tour, resulted, reached governor | appointed, chief, office, exercised, bill square | smallest, approximately, numbers, rectangular, sqrt ... Epoch: 3/5 Loss: 2.66041898727417 they | themselves, their, the, in, to first | the, by, s, was, one was | had, after, as, he, with new | york, in, the, university, states three | four, two, five, seven, six th | century, earliest, nd, rd, eight five | two, four, seven, three, eight that | to, as, a, for, it gold | silver, metals, glass, precious, copper additional | on, these, simple, called, for placed | called, a, this, that, pieces issue | decisions, opinion, governments, federal, question cost | reduce, costs, prices, price, increase bible | hebrew, tanakh, testament, text, torah woman | male, fertility, female, birth, man assembly | legislative, parliamentary, deputies, parliament, elected ... 
Epoch: 3/5 Loss: 2.6305782794952393 were | was, had, and, later, by history | site, external, links, archive, overview they | their, them, to, not, often time | was, that, the, his, s only | a, to, with, is, it as | and, also, with, such, the zero | three, four, two, five, one united | states, national, state, presidents, world bbc | day, news, listing, days, links mean | sum, y, measure, arithmetic, coefficients articles | encyclopedia, archive, page, text, online engineering | research, technology, electrical, disciplines, processing existence | metaphysical, false, issues, truth, believe shows | show, movies, tv, episode, acclaimed gold | silver, bronze, coin, iron, deposits alternative | article, complete, itself, including, information ... Epoch: 4/5 Loss: 2.210568428039551 state | legislature, counties, residents, constitutional, southern known | also, as, and, found, now their | were, they, the, own, not of | the, and, in, is, by between | and, the, of, east, south as | and, are, is, with, a had | was, afterwards, became, after, to only | is, itself, be, it, not mainly | from, are, in, and, many shows | actors, productions, theatre, tv, movie liberal | liberals, conservative, opposition, party, conservatives pope | leo, papacy, pius, papal, archbishop orthodox | christianity, christians, churches, catholic, church award | awards, nominations, academy, best, nomination hold | calling, enter, regard, instead, accept governor | appointed, minister, elected, election, presidency ...
Below we'll use t-SNE to visualize how our high-dimensional word vectors cluster together. t-SNE projects these vectors into two dimensions while preserving local structure. Check out this post from Christopher Olah to learn more about t-SNE and other ways to visualize high-dimensional data.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# getting embeddings from the embedding layer of our model, by name
embeddings = model.in_embed.weight.to('cpu').data.numpy()
viz_words = 380
tsne = TSNE()
embed_tsne = tsne.fit_transform(embeddings[:viz_words, :])
fig, ax = plt.subplots(figsize=(16, 16))
for idx in range(viz_words):
plt.scatter(*embed_tsne[idx, :], color='steelblue')
plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)