# Sample passage that the rest of the walkthrough tokenises, tags and embeds.
text = "The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013."
text

import spacy

# The 'en' shortcut link no longer exists in spaCy v3 (spacy.load('en') raises
# OSError there), so the pipeline is loaded by its full package name instead.
nlp = spacy.load('en_core_web_sm')
# Process the passage into a Doc; the code below reads tokens, tags, entities,
# sentences and dependency arcs from it.
doc = nlp(text)
doc
# Show every token wrapped in double quotes so the token boundaries are visible.
for token in doc:
    print(f'"{token.text}"')

# Tab-separated attribute row for each of the first ten tokens:
# text, character offset, lemma, punctuation flag, whitespace flag,
# shape, coarse POS and fine-grained tag.
for token in doc[:10]:
    columns = (
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_,
    )
    print("\t".join(str(value) for value in columns))
# List each entity span found in the document together with its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")
from spacy import displacy

# Render the same entities with displaCy; jupyter=True is passed so the
# markup is shown in the notebook output.
displacy.render(doc, jupyter=True, style='ent')
# Sentence segmentation: print each sentence span, then echo the raw
# `doc.sents` object and its materialised list.
for sent in doc.sents:
    print(sent)
doc.sents
list(doc.sents)

# Re-process the first sentence on its own and print one line per token:
# token/tag <--dependency label-- head/head tag.
newdoc = nlp(list(doc.sents)[0].text)
for token in newdoc:
    print(
        f"{token.text}/{token.tag_} <--{token.dep_}-- "
        f"{token.head.text}/{token.head.tag_}"
    )

# Draw the dependency arcs for that sentence with displaCy.
displacy.render(
    newdoc,
    style='dep',
    jupyter=True,
    options=dict(distance=90),
)
# Rebind `nlp` to the en_core_web_lg package; everything below reads word
# vectors out of its vocabulary.
nlp = spacy.load('en_core_web_lg')
print(nlp.vocab['minister'].vector)

# Look up the four lexemes (same lookup order as listing them one by one).
dog, cat, apple, orange = (
    nlp.vocab[name] for name in ("dog", "cat", "apple", "orange")
)

# Pairwise similarities as reported by spaCy itself.
dog.similarity(cat)
dog.similarity(apple)
dog.similarity(orange)
apple.similarity(orange)
from scipy.spatial.distance import cosine
# scipy's `cosine` is a distance, so subtracting it from 1 turns it into a
# similarity that can be compared with dog.similarity(cat).
1 - cosine(dog.vector, cat.vector)
def vector_similarity(x, y):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        x: First vector (any 1-D sequence of numbers accepted by scipy's
            ``cosine``).
        y: Second vector, same length as ``x``.

    Returns:
        ``1 - cosine(x, y)``, or ``0.0`` when either vector is all zeros.
    """
    # A zero vector has no direction, so the cosine is undefined: scipy would
    # divide by zero and hand back NaN, which then poisons any sort over the
    # results. Return 0.0 for that case instead.
    if not any(x) or not any(y):
        return 0.0
    return 1 - cosine(x, y)
# Try the helper on the raw vectors of two of the lexemes looked up earlier.
vector_similarity(dog.vector, apple.vector)
def make_guess_word(words):
    """Build the analogy query vector ``first - second + third``.

    Args:
        words: Sequence of exactly three strings; each is looked up in the
            vocabulary of the module-level ``nlp``.

    Returns:
        The element-wise combination of the three word vectors.
    """
    first, second, third = words
    vocab = nlp.vocab
    return vocab[first].vector - vocab[second].vector + vocab[third].vector
def get_similar_word(words, scope=None):
    """Print the ten entries of ``scope`` closest to ``first - second + third``.

    Args:
        words: Three strings, combined into a query vector by
            ``make_guess_word``.
        scope: Iterable of objects exposing ``has_vector``, ``vector`` and
            ``text``. Defaults to the vocabulary of the module-level ``nlp``
            at the time of the call.

    Returns:
        None. The ten best-matching texts are printed as a list, most similar
        first.
    """
    # Resolve the default at call time. A `scope=nlp.vocab` default is
    # evaluated once, when the function is defined, so it would keep pointing
    # at the old vocabulary after `nlp` is reloaded while make_guess_word
    # already uses the new one.
    if scope is None:
        scope = nlp.vocab
    guess_word = make_guess_word(words)
    # Entries without a vector cannot be compared and are skipped.
    similarities = [
        (word, vector_similarity(guess_word, word.vector))
        for word in scope
        if word.has_vector
    ]
    # Stable descending sort: ties keep their iteration order.
    similarities.sort(key=lambda item: item[1], reverse=True)
    print([word.text for word, _ in similarities[:10]])
# Run the two analogy queries in turn; `words` ends up bound to the last one.
for words in (
    ["king", "queen", "woman"],
    ["Paris", "London", "England"],
):
    get_similar_word(words)
import numpy as np
embedding = np.array([])

# Unique non-punctuation token texts, in order of first appearance. The set
# gives constant-time duplicate checks; the list keeps the order.
word_list = []
seen_words = set()
for token in doc:
    if token.is_punct or token.text in seen_words:
        continue
    seen_words.add(token.text)
    word_list.append(token.text)
word_list

# Join all word vectors onto the (empty) starting array in a single call.
# Calling np.append once per word recopies the whole array every time; one
# concatenate yields the same flat array with the same dtype.
embedding = np.concatenate(
    [embedding] + [nlp.vocab[word].vector for word in word_list]
)
embedding.shape

# Fold the flat array into one row per word.
embedding = embedding.reshape(len(word_list), -1)
embedding.shape
from sklearn.manifold import TSNE
# t-SNE with scikit-learn's default settings (no arguments are overridden).
tsne = TSNE()
# Fit on the word-vector matrix and keep the projected coordinates; the
# plotting helper below unpacks each row as an (x, y) pair.
low_dim_embedding = tsne.fit_transform(embedding)
import matplotlib.pyplot as plt
# IPython magic, not plain Python: this line only runs inside IPython/Jupyter.
%pylab inline
def plot_with_labels(low_dim_embs, labels, filename='tsne.pdf'):
    """Scatter-plot labelled 2-D points and save the figure to a file.

    Args:
        low_dim_embs: Array with a ``shape`` attribute and ``[i, :]``
            indexing whose rows unpack into an ``(x, y)`` pair; it must have
            at least as many rows as there are labels.
        labels: Sequence of label texts; label ``i`` annotates row ``i``.
        filename: Path handed to ``plt.savefig``.

    Raises:
        AssertionError: If there are more labels than rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches

    # Placement shared by every annotation: nudged by (5, 2) points from the
    # marker, anchored at the text's bottom-right corner.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
    for index, label in enumerate(labels):
        x, y = low_dim_embs[index, :]
        # One scatter call per point, as in the original.
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), **annotation_style)

    plt.savefig(filename)
# Plot the projected word vectors, labelling each point with its word; the
# figure is saved under the helper's default filename.
plot_with_labels(low_dim_embedding, word_list)