In [ ]:
text = "The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013."
In [ ]:
text
In [ ]:
import spacy
In [ ]:
# The bare 'en' shortcut is deprecated in recent spaCy releases; load the small
# English pipeline explicitly (install it first with `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')
In [ ]:
doc = nlp(text)
In [ ]:
doc
In [ ]:
# Print each token on its own line, quoted so token boundaries and whitespace are visible.
for token in doc:
    print('"' + token.text + '"')
In [ ]:
# For the first ten tokens, print the text, character offset, lemma,
# punctuation/whitespace flags, orthographic shape, coarse POS and fine-grained tag.
for token in doc[:10]:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))
In [ ]:
# Named entities found in the document, with their entity labels.
for ent in doc.ents:
    print(ent.text, ent.label_)
In [ ]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)
In [ ]:
# Iterate over the sentences spaCy has segmented the document into.
for sent in doc.sents:
    print(sent)
In [ ]:
doc.sents
In [ ]:
list(doc.sents)
In [ ]:
# Re-parse the first sentence as a standalone Doc so it can be inspected and visualised on its own.
newdoc = nlp(list(doc.sents)[0].text)
In [ ]:
# For each token, show its dependency label and the head token it attaches to.
for token in newdoc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(
        token.text, token.tag_, token.dep_, token.head.text, token.head.tag_))
In [ ]:
displacy.render(newdoc, style='dep', jupyter=True, options={'distance': 90})
In [ ]:
# Switch to the large English model, which ships with real word vectors.
nlp = spacy.load('en_core_web_lg')
print(nlp.vocab['minister'].vector)
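In [ ]:
# Quick sanity check (a sketch): the large model's vectors are 300-dimensional,
# so this shape should be (300,).
nlp.vocab['minister'].vector.shape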
In [ ]:
dog = nlp.vocab["dog"]
cat = nlp.vocab["cat"]
apple = nlp.vocab["apple"]
orange = nlp.vocab["orange"]
In [ ]:
dog.similarity(cat)
In [ ]:
dog.similarity(apple)
In [ ]:
dog.similarity(orange)
In [ ]:
apple.similarity(orange)
In [ ]:
from scipy.spatial.distance import cosine
In [ ]:
# Cosine similarity computed by hand: 1 minus the cosine distance.
1 - cosine(dog.vector, cat.vector)
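In [ ]:
# A small check (a sketch): spaCy's similarity() is cosine similarity under the
# hood, so the two numbers should agree to within floating-point tolerance.
import numpy as np
np.isclose(dog.similarity(cat), 1 - cosine(dog.vector, cat.vector))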
In [ ]:
def vector_similarity(x, y):
    # Cosine similarity between two vectors (1.0 means identical direction).
    return 1 - cosine(x, y)
In [ ]:
vector_similarity(dog.vector, apple.vector)
In [ ]:
def make_guess_word(words):
    # Word-analogy arithmetic: first - second + third (e.g. king - queen + woman).
    [first, second, third] = words
    return nlp.vocab[first].vector - nlp.vocab[second].vector + nlp.vocab[third].vector
In [ ]:
def get_similar_word(words, scope=nlp.vocab):
    # Score every word in scope against the analogy vector and print the ten closest matches.
    guess_word = make_guess_word(words)

    similarities = []
    for word in scope:
        # Skip vocabulary entries that have no vector.
        if not word.has_vector:
            continue

        similarity = vector_similarity(guess_word, word.vector)
        similarities.append((word, similarity))

    # Sort by similarity, highest first.
    similarities = sorted(similarities, key=lambda item: -item[1])
    print([word[0].text for word in similarities[:10]])
In [ ]:
words = ["king", "queen", "woman"]
In [ ]:
get_similar_word(words)
In [ ]:
words = ["Paris", "London", "England"]
In [ ]:
get_similar_word(words)
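In [ ]:
# A hedged alternative (a sketch, assuming spaCy v2.1+): the vector table has a
# built-in most_similar() that searches the whole table at once. Its results may
# differ slightly from the manual loop above, since it returns raw vocabulary
# keys (including different casings of the same word).
query = make_guess_word(["Paris", "London", "England"]).reshape(1, -1)
keys, _, _ = nlp.vocab.vectors.most_similar(query, n=10)
print([nlp.vocab.strings[int(key)] for key in keys[0]])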
In [ ]:
import numpy as np
In [ ]:
# Start with an empty array; word vectors are appended and reshaped below.
embedding = np.array([])
In [ ]:
word_list = []
In [ ]:
# Collect the unique, non-punctuation token texts from the document.
for token in doc:
    if not token.is_punct and token.text not in word_list:
        word_list.append(token.text)
In [ ]:
word_list
In [ ]:
# Append each word's vector to the flat array.
for word in word_list:
    embedding = np.append(embedding, nlp.vocab[word].vector)
In [ ]:
embedding.shape
In [ ]:
# Reshape into a matrix with one row (one 300-dim vector) per word.
embedding = embedding.reshape(len(word_list), -1)
In [ ]:
embedding.shape
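In [ ]:
# Equivalent construction (a sketch): stacking the vectors row-wise with np.vstack
# avoids the append-then-reshape dance and should produce the same matrix.
embedding_alt = np.vstack([nlp.vocab[word].vector for word in word_list])
embedding_alt.shape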
In [ ]:
from sklearn.manifold import TSNE
In [ ]:
# t-SNE is stochastic; pass random_state=... to TSNE() if you need a reproducible layout.
tsne = TSNE()
In [ ]:
low_dim_embedding = tsne.fit_transform(embedding)
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
def plot_with_labels(low_dim_embs, labels, filename='tsne.pdf'):
    # Scatter-plot the 2-D t-SNE embedding and annotate each point with its word.
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)
In [ ]:
plot_with_labels(low_dim_embedding, word_list)
In [ ]: