In [ ]:
text = "The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013."
In [ ]:
text
In [ ]:
import spacy
In [ ]:
# The bare 'en' shortcut is deprecated in recent spaCy releases; load the small
# English pipeline explicitly (install it first with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')
In [ ]:
doc = nlp(text)
In [ ]:
doc
In [ ]:
# Print each token on its own line, quoted so token boundaries and whitespace are visible.
for token in doc:
    print('"' + token.text + '"')
In [ ]:
# For the first ten tokens, print the text, character offset, lemma,
# punctuation/whitespace flags, orthographic shape, coarse POS and fine-grained tag.
for token in doc[:10]:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))
In [ ]:
# Named entities found in the document, with their entity labels.
for ent in doc.ents:
    print(ent.text, ent.label_)
In [ ]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)
In [ ]:
# Iterate over the sentences spaCy has segmented the document into.
for sent in doc.sents:
    print(sent)
In [ ]:
doc.sents
In [ ]:
list(doc.sents)
In [ ]:
# Re-parse the first sentence as a standalone Doc so it can be inspected and visualised on its own.
newdoc = nlp(list(doc.sents)[0].text)
In [ ]:
# For each token, show its dependency label and the head token it attaches to.
for token in newdoc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))
In [ ]:
displacy.render(newdoc, style='dep', jupyter=True, options={'distance': 90})
In [ ]:
# Switch to the large English model, which ships with real word vectors.
nlp = spacy.load('en_core_web_lg')
print(nlp.vocab['minister'].vector)
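In [ ]:
# Quick sanity check (a sketch): the large model's vectors are 300-dimensional,
# so this shape should be (300,).
nlp.vocab['minister'].vector.shape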
In [ ]:
dog = nlp.vocab["dog"]
cat = nlp.vocab["cat"]
apple = nlp.vocab["apple"]
orange = nlp.vocab["orange"]
In [ ]:
dog.similarity(cat)
In [ ]:
dog.similarity(apple)
In [ ]:
dog.similarity(orange)
In [ ]:
apple.similarity(orange)
In [ ]:
from scipy.spatial.distance import cosine
In [ ]:
# Cosine similarity computed by hand: 1 minus the cosine distance.
1 - cosine(dog.vector, cat.vector)
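In [ ]:
# A small check (a sketch): spaCy's similarity() is cosine similarity under the
# hood, so the two numbers should agree to within floating-point tolerance.
import numpy as np
np.isclose(dog.similarity(cat), 1 - cosine(dog.vector, cat.vector))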
In [ ]:
def vector_similarity(x, y):
    # Cosine similarity between two vectors (1.0 means identical direction).
    return 1 - cosine(x, y)
In [ ]:
vector_similarity(dog.vector, apple.vector)
In [ ]:
def make_guess_word(words):
    # Word-analogy arithmetic: first - second + third (e.g. king - queen + woman).
    [first, second, third] = words
    return nlp.vocab[first].vector - nlp.vocab[second].vector + nlp.vocab[third].vector
In [ ]:
def get_similar_word(words, scope=nlp.vocab):
    # Score every word in scope against the analogy vector and print the ten closest matches.
    guess_word = make_guess_word(words)

    similarities = []
    for word in scope:
        # Skip vocabulary entries that have no vector.
        if not word.has_vector:
            continue

        similarity = vector_similarity(guess_word, word.vector)
        similarities.append((word, similarity))

    # Sort by similarity, highest first.
    similarities = sorted(similarities, key=lambda item: -item[1])
    print([word[0].text for word in similarities[:10]])
In [ ]:
words = ["king", "queen", "woman"]
In [ ]:
get_similar_word(words)
In [ ]:
words = ["Paris", "London", "England"]
In [ ]:
get_similar_word(words)
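In [ ]:
# A hedged alternative (a sketch, assuming spaCy v2.1+): the vector table has a
# built-in most_similar() that searches the whole table at once. Its results may
# differ slightly from the manual loop above, since it returns raw vocabulary
# keys (including different casings of the same word).
query = make_guess_word(["Paris", "London", "England"]).reshape(1, -1)
keys, _, _ = nlp.vocab.vectors.most_similar(query, n=10)
print([nlp.vocab.strings[int(key)] for key in keys[0]])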
In [ ]:
import numpy as np
In [ ]:
# Start with an empty array; word vectors are appended and reshaped below.
embedding = np.array([])
In [ ]:
word_list = []
In [ ]:
# Collect the unique, non-punctuation token texts from the document.
for token in doc:
    if not token.is_punct and token.text not in word_list:
        word_list.append(token.text)
In [ ]:
word_list
In [ ]:
# Append each word's vector to the flat array.
for word in word_list:
    embedding = np.append(embedding, nlp.vocab[word].vector)
In [ ]:
embedding.shape
In [ ]:
# Reshape into a matrix with one row (one 300-dim vector) per word.
embedding = embedding.reshape(len(word_list), -1)
In [ ]:
embedding.shape
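In [ ]:
# Equivalent construction (a sketch): stacking the vectors row-wise with np.vstack
# avoids the append-then-reshape dance and should produce the same matrix.
embedding_alt = np.vstack([nlp.vocab[word].vector for word in word_list])
embedding_alt.shape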
In [ ]:
from sklearn.manifold import TSNE
In [ ]:
# t-SNE is stochastic; pass random_state=... to TSNE() if you need a reproducible layout.
tsne = TSNE()
In [ ]:
low_dim_embedding = tsne.fit_transform(embedding)
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.pdf'):
    # Scatter-plot the 2-D t-SNE embedding and annotate each point with its word.
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)
In [ ]:
plot_with_labels(low_dim_embedding, word_list)
In [ ]: