#!/usr/bin/env python
# coding: utf-8
# # Training Embeddings Using Gensim and FastText
# > Word embeddings are an approach to representing text in NLP. In this notebook we demonstrate how to train embeddings with both the CBOW and SkipGram methods, using Gensim and FastText.
#
# - toc: true
# - badges: true
# - comments: true
# - categories: [Concept, Embedding, Gensim, FastText]
# - author: "Quantum Stat"
# - image:
# In[ ]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')
# In[ ]:
# define training data
#Gensim's word2vec expects the training data as a 'list of lists':
#the corpus is a list of documents, and each document is a list of its tokens.
corpus = [['dog','bites','man'], ["man", "bites" ,"dog"],["dog","eats","meat"],["man", "eats","food"]]
#Training the model
model_cbow = Word2Vec(corpus, min_count=1, sg=0)     #using the CBOW architecture for training
model_skipgram = Word2Vec(corpus, min_count=1, sg=1) #using the SkipGram architecture for training
# ## Continuous Bag of Words (CBOW)
# In CBOW, the primary task is to build a language model that correctly predicts the center word given the context words in which the center word appears.
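#
# To make the objective concrete, here is a minimal sketch of the (context, center) training pairs CBOW would see for one toy sentence, assuming a symmetric window of size 2. This is an illustration, not gensim's internals:
# In[ ]:
#Illustrative sketch of CBOW training pairs (not gensim's actual implementation)
def cbow_pairs(tokens, window=2):
    pairs = []
    for i, center in enumerate(tokens):
        #context = up to `window` words on each side of the center word
        context = tokens[max(0, i - window):i] + tokens[i + 1:i + 1 + window]
        pairs.append((context, center))
    return pairs

print(cbow_pairs(['dog', 'bites', 'man']))
#[(['bites', 'man'], 'dog'), (['dog', 'man'], 'bites'), (['dog', 'bites'], 'man')]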
# In[ ]:
#Summarize the trained model
print(model_cbow)
#Summarize vocabulary
words = list(model_cbow.wv.key_to_index)  #wv.vocab on gensim 3.x
print(words)
#Access the vector for one word
print(model_cbow.wv['dog'])  #indexing the model directly was removed in gensim 4.x
# In[ ]:
#Compute similarity
print("Similarity between eats and bites:", model_cbow.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:", model_cbow.wv.similarity('eats', 'man'))
# From the above scores we can conclude that 'eats' is more similar to 'bites' than to 'man'. On a four-sentence corpus such scores are noisy, but they illustrate the API.
# In[ ]:
#Most similar words
model_cbow.wv.most_similar('meat')
# In[ ]:
# save model
model_cbow.save('model_cbow.bin')
# load model
new_model_cbow = Word2Vec.load('model_cbow.bin')
print(new_model_cbow)
# ## SkipGram
# In skipgram, the task is to predict the context words from the center word.
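#
# Analogously, a minimal sketch of the (center, context) pairs SkipGram would be trained on (again an illustration, not gensim's internals), with a window of size 2. Note that each window yields several pairs:
# In[ ]:
#Illustrative sketch of SkipGram training pairs (not gensim's actual implementation)
def skipgram_pairs(tokens, window=2):
    pairs = []
    for i, center in enumerate(tokens):
        #one (center, context) pair per context word in the window
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j != i:
                pairs.append((center, tokens[j]))
    return pairs

print(skipgram_pairs(['dog', 'bites', 'man']))
#[('dog', 'bites'), ('dog', 'man'), ('bites', 'dog'), ('bites', 'man'), ('man', 'dog'), ('man', 'bites')]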
# In[ ]:
#Summarize the trained model
print(model_skipgram)
#Summarize vocabulary
words = list(model_skipgram.wv.key_to_index)  #wv.vocab on gensim 3.x
print(words)
#Access the vector for one word
print(model_skipgram.wv['dog'])
# In[ ]:
#Compute similarity
print("Similarity between eats and bites:", model_skipgram.wv.similarity('eats', 'bites'))
print("Similarity between eats and man:", model_skipgram.wv.similarity('eats', 'man'))
# Again, the scores say 'eats' is more similar to 'bites' than to 'man'.
# In[ ]:
#Most similar words
model_skipgram.wv.most_similar('meat')
# In[ ]:
# save model
model_skipgram.save('model_skipgram.bin')
# load model
new_model_skipgram = Word2Vec.load('model_skipgram.bin')
print(new_model_skipgram)
# ## Training Your Embedding on Wiki Corpus
#
# ##### Corpus download page: https://dumps.wikimedia.org/enwiki/20200120/
# The entire wiki corpus as of 28/04/2020 is just over 16GB in size.
# Due to computation constraints, we will train our word2vec and FastText embeddings on just a part of it.
#
# The file is 294MB, so it can take a while to download.
#
# Source for code which downloads files from Google Drive: https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive/39225039#39225039
# In[ ]:
import os
import requests
os.makedirs('data/en', exist_ok= True)
file_name = "data/en/enwiki-latest-pages-articles-multistream14.xml-p13159683p14324602.bz2"
file_id = "11804g0GcWnBIVDahjo5fQyc05nQLXGwF"
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        # Google Drive asks for confirmation on large files; retry with the token
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

if not os.path.exists(file_name):
    download_file_from_google_drive(file_id, file_name)
else:
    print("file already exists, skipping download")
print(f"File at: {file_name}")
# In[ ]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
import time
# In[ ]:
#Preparing the training data
#dictionary={} skips building a gensim Dictionary, which we don't need here
wiki = WikiCorpus(file_name, dictionary={})  #on gensim 3.x, also pass lemmatize=False (removed in 4.x)
sentences = list(wiki.get_texts())
#if you get a memory error executing the lines above, comment them out and
#uncomment the lines below: limiting `processes` makes loading slower but more stable.
# wiki = WikiCorpus(file_name, processes=4, dictionary={})
# sentences = list(wiki.get_texts())
#if you still get a memory error, try setting processes to 1 or 2 and run it again.
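# If the sentence list still does not fit in RAM, an alternative (a sketch, untested on this exact dump) is to stream sentences straight from the corpus: gensim's Word2Vec accepts any iterable that can be re-iterated for each training epoch.
# In[ ]:
#Sketch: stream the corpus instead of materializing it as a list
# class WikiSentences:
#     def __init__(self, wiki_corpus):
#         self.wiki_corpus = wiki_corpus
#     def __iter__(self):
#         #get_texts() re-reads the dump on each pass, so little stays in memory
#         for tokens in self.wiki_corpus.get_texts():
#             yield tokens
# sentences = WikiSentences(wiki)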
# ### Hyperparameters
#
#
# 1. sg - selects the training algorithm: 1 for SkipGram, 0 for CBOW. The default is CBOW.
# 2. min_count - ignores all words with total frequency lower than this.
# There are many more hyperparameters, listed in the official documentation [here](https://radimrehurek.com/gensim/models/word2vec.html); a sketch of a more fully specified call follows below.
#
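# As an example, a more fully specified call might look like this (a sketch; the values are illustrative, not tuned; `vector_size` and `epochs` were named `size` and `iter` in gensim 3.x):
# In[ ]:
#A few commonly tuned hyperparameters (illustrative values, not tuned)
# model = Word2Vec(
#     sentences,
#     vector_size=100,  #dimensionality of the word vectors
#     window=5,         #max distance between center and context words
#     min_count=10,     #ignore words with total frequency below this
#     workers=4,        #worker threads for training
#     sg=1,             #1 = SkipGram, 0 = CBOW
#     negative=5,       #number of negative samples
#     epochs=5,         #passes over the corpus
# )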
# In[ ]:
#CBOW
start = time.time()
word2vec_cbow = Word2Vec(sentences,min_count=10, sg=0)
end = time.time()
print("CBOW Model Training Complete.\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))
# In[ ]:
#Summarize the trained model
print(word2vec_cbow)
print("-"*30)
#Summarize vocabulary
words = list(word2vec_cbow.wv.key_to_index)  #wv.vocab on gensim 3.x
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)
#Access the vector for one word
print(f"Length of vector: {len(word2vec_cbow.wv['film'])}")
print(word2vec_cbow.wv['film'])
print("-"*30)
#Compute similarity
print("Similarity between film and drama:", word2vec_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:", word2vec_cbow.wv.similarity('film', 'tiger'))
print("-"*30)
# In[ ]:
# save the vectors in word2vec format (this stores the vectors only, not the
# full model, so training cannot be resumed from this file)
from gensim.models import Word2Vec, KeyedVectors
word2vec_cbow.wv.save_word2vec_format('word2vec_cbow.bin', binary=True)
# load the saved vectors
# cbow_vectors = KeyedVectors.load_word2vec_format('word2vec_cbow.bin', binary=True)
# print(cbow_vectors)
# In[ ]:
#SkipGram
start = time.time()
word2vec_skipgram = Word2Vec(sentences,min_count=10, sg=1)
end = time.time()
print("SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))
# In[ ]:
#Summarize the trained model
print(word2vec_skipgram)
print("-"*30)
#Summarize vocabulary
words = list(word2vec_skipgram.wv.key_to_index)  #wv.vocab on gensim 3.x
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)
#Access the vector for one word
print(f"Length of vector: {len(word2vec_skipgram.wv['film'])}")
print(word2vec_skipgram.wv['film'])
print("-"*30)
#Compute similarity
print("Similarity between film and drama:", word2vec_skipgram.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:", word2vec_skipgram.wv.similarity('film', 'tiger'))
print("-"*30)
# In[ ]:
# save the vectors in word2vec format
word2vec_skipgram.wv.save_word2vec_format('word2vec_sg.bin', binary=True)
# load the saved vectors
# sg_vectors = KeyedVectors.load_word2vec_format('word2vec_sg.bin', binary=True)
# print(sg_vectors)
# ## FastText
# FastText represents each word as a bag of character n-grams in addition to the word itself, so unlike word2vec it can also build vectors for words never seen during training (see the sketch at the end of this section).
# In[ ]:
#CBOW
start = time.time()
fasttext_cbow = FastText(sentences, sg=0, min_count=10)
end = time.time()
print("FastText CBOW Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))
# In[ ]:
#Summarize the trained model
print(fasttext_cbow)
print("-"*30)
#Summarize vocabulary
words = list(fasttext_cbow.wv.key_to_index)  #wv.vocab on gensim 3.x
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)
#Access the vector for one word
print(f"Length of vector: {len(fasttext_cbow.wv['film'])}")
print(fasttext_cbow.wv['film'])
print("-"*30)
#Compute similarity
print("Similarity between film and drama:", fasttext_cbow.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:", fasttext_cbow.wv.similarity('film', 'tiger'))
print("-"*30)
# In[ ]:
#SkipGram
start = time.time()
fasttext_skipgram = FastText(sentences, sg=1, min_count=10)
end = time.time()
print("FastText SkipGram Model Training Complete\nTime taken for training is:{:.2f} hrs ".format((end-start)/3600.0))
# In[ ]:
#Summarize the trained model
print(fasttext_skipgram)
print("-"*30)
#Summarize vocabulary
words = list(fasttext_skipgram.wv.key_to_index)  #wv.vocab on gensim 3.x
print(f"Length of vocabulary: {len(words)}")
print("Printing the first 30 words.")
print(words[:30])
print("-"*30)
#Access the vector for one word
print(f"Length of vector: {len(fasttext_skipgram.wv['film'])}")
print(fasttext_skipgram.wv['film'])
print("-"*30)
#Compute similarity
print("Similarity between film and drama:", fasttext_skipgram.wv.similarity('film', 'drama'))
print("Similarity between film and tiger:", fasttext_skipgram.wv.similarity('film', 'tiger'))
print("-"*30)
# An interesting observation, if you noticed, is that CBOW trains faster than SkipGram in both cases.
# We will leave it to the reader to figure out why; as a hint, revisit how CBOW and SkipGram generate training examples (see the pair-generation sketches earlier in this notebook).