# A good practice in programming is to place your import statements at the top of your code, and to keep them together
import re # for regular expressions
import os # to look up operating system-based info
import string # to do fancy things with strings
import glob # to locate a specific file type
from pathlib import Path # to access files in other directories
import gensim # to access Word2Vec
from gensim.models import Word2Vec # to access Gensim's flavor of Word2Vec
import pandas as pd # to sort and organize data
dirpath = r'FILL IN YOUR FILEPATH HERE' # get file path (you can change this)
file_type = ".txt" # if your data is not in a plain text format, you can change this
filenames = []
data = []
# this for loop will run through folders and subfolders looking for a specific file type
# (os.walk visits every subfolder for us, so one loop over the files in each folder is enough)
for root, dirs, files in os.walk(dirpath, topdown=False):
    # look through all the files in the given directory
    for name in files:
        if name.endswith(file_type):
            filenames.append(os.path.join(root, name))
# this for loop goes through the list of files, reads each one, and adds its text to a list
for filename in filenames:
    with open(filename) as afile:
        print(filename)
        data.append(afile.read()) # read the file and then add it to the list
    # the "with" statement closes the file automatically once the block ends
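# a note in case of errors (an assumption about your files, not part of the original code):
# if open() raises a UnicodeDecodeError here, your texts may not be in your system's default encoding;
# passing an explicit encoding, for example open(filename, encoding="utf-8"), usually resolves it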
def clean_text(text):
    # Cleans the given text: splits it into tokens, lower-cases them, and strips punctuation.
    # The function accepts a single text (a string) and returns a list of tokens.
    # split the text into tokens and lower-case them
    tokens = text.split()
    tokens = [t.lower() for t in tokens]
    # remove punctuation using regular expressions
    # this line of code compiles all punctuation characters into a single regular expression pattern
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # this line of code substitutes the punctuation we just compiled with nothing ''
    tokens = [re_punc.sub('', token) for token in tokens]
    # only keep tokens made up entirely of letters (this drops numbers and any tokens left empty)
    tokens = [token for token in tokens if token.isalpha()]
    return tokens
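# a quick optional check (not in the original walkthrough): passing a sample sentence to clean_text
# should return lower-cased tokens with punctuation and number-only tokens stripped out,
# so the hypothetical example below should print ['stir', 'in', 'cups', 'of', 'milk']
print(clean_text("Stir in 2 cups of milk."))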
# clean the text from the folder of text files, stored in the data variable
data_clean = []
for x in data:
    data_clean.append(clean_text(x))
# Check that the length of data and the length of data_clean are the same. Both numbers printed should be the same
print(len(data))
print(len(data_clean))
# check that the first word of the first text in data and in data_clean match.
# both print statements should print the same word, with the data cleaning function applied in the second one
print(data[0].split()[0])
print(data_clean[0][0])
# check that the last word of the first text in data and in data_clean match
# (if cleaning dropped the original last token, for example a number, the two words may differ)
print(data[0].split()[-1])
print(data_clean[0][-1])
# train the model
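# a few notes on these parameter choices (these are the walkthrough's settings, not requirements):
# window=5 considers up to five words on either side of the target word,
# min_count=3 ignores words that appear fewer than three times in the corpus,
# workers=4 sets the number of training threads,
# epochs=5 sets how many passes the model makes over the corpus,
# sg=1 selects the skip-gram architecture (sg=0 would select CBOW),
# and vector_size is left at Gensim's default of 100 dimensions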
model = Word2Vec(sentences=data_clean, window=5, min_count=3, workers=4, epochs=5, sg=1)
# save the model
model.save("word2vec.model")
# load the model
model = Word2Vec.load("word2vec.model")
# set the word that we are checking for
word = "milk"
# if that word is in our vocabulary
if word in model.wv.key_to_index:
    # print a statement to let us know
    print("The word %s is in your model vocabulary" % word)
# otherwise, let us know that it isn't
else:
    print("%s is not in your model vocabulary" % word)
# returns a list with the top ten words used in similar contexts to the word "milk"
model.wv.most_similar('milk', topn=10)
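# most_similar() returns a list of (word, cosine similarity) tuples; in a script (rather than a notebook)
# you will need to print the result to see it, for example by looping over the pairs:
for similar_word, score in model.wv.most_similar('milk', topn=10):
    print(similar_word, score)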
# returns the top ten most similar words to "recipe" that are dissimilar from "milk"
model.wv.most_similar(positive = ["recipe"], negative=["milk"], topn=10)
# returns the top ten most similar words to both "recipe" and "milk"
model.wv.most_similar(positive = ["recipe", "milk"], topn=10)
# returns a cosine similarity score for the two words you provide
model.wv.similarity("milk", "cream")
# returns the words the model predicts are most likely to appear alongside "flour", "eggs", and "cream",
# along with a probability for each prediction
model.predict_output_word(["flour", "eggs", "cream"])
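# as with most_similar(), wrap the call in print() to see the result when running this as a script;
# the topn parameter (default 10) controls how many predictions come back
print(model.predict_output_word(["flour", "eggs", "cream"], topn=5))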
# displays the number of words in your model's vocabulary
print(len(model.wv))
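# an optional extra check (not in the original walkthrough): index_to_key lists the vocabulary from
# most to least frequent word, so the slice below shows the ten most frequent words the model learned
print(model.wv.index_to_key[:10])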
dirpath = Path(r".") # the current directory
files = dirpath.glob('*.model') # only files that end in ".model"
model_list = [] # a list to hold the actual models
model_filenames = [] # the filepath for the models so we know where they came from
# this for loop finds files that end with ".model", loads them, and adds them to a list
for filename in files:
    # turn the filename into a string and save it to "file_path"
    file_path = str(filename)
    print(file_path)
    # load the model with the file_path
    model = Word2Vec.load(file_path)
    # add the model to our model_list
    model_list.append(model)
    # add the filepath to the model_filenames list
    model_filenames.append(file_path)
# set the word that we are checking for
word = "milk"
# if that word is in our vocabulary
if word in model.wv.key_to_index:
    # print a statement to let us know
    print("The word %s is in your model vocabulary" % word)
# otherwise, let us know that it isn't
else:
    print("%s is not in your model vocabulary" % word)
#test word pairs that we are going to use to evaluate the models
test_words = [("stir", "whisk"),
              ("cream", "milk"),
              ("cake", "muffin"),
              ("jam", "jelly"),
              ("reserve", "save"),
              ("bake", "cook")]
# these for loops will go through each list, the test word list and the models list,
# and will run all the word pairs through each model
# then the results will be added to a dataframe
# since NumPy 1.19.0, working with arrays of conflicting dimensions will sometimes throw a deprecation warning
# this warning does not impact the code or the results, so we're going to filter it out
# you can also specify "dtype=object" on the resulting array
# np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)  # requires "import numpy as np"
# create an empty dataframe with the column headings we need
evaluation_results = pd.DataFrame(columns=['Model', 'Test Words', 'Cosine Similarity'], dtype=object)
# iterate through the model_list
for i in range(len(model_list)):
    # for each model in model_list, test the tuple pairs
    for x in range(len(test_words)):
        # calculate the similarity score for each tuple
        similarity_score = model_list[i].wv.similarity(*test_words[x])
        # gather the test results into a single row: model name, word pair, similarity score
        row = [model_filenames[i], test_words[x], similarity_score]
        # append that row to the end of our final dataframe
        evaluation_results.loc[len(evaluation_results)] = row
# save the evaluation_results dataframe as a .csv called "word2vec_model_evaluation.csv" in our current directory
# if you want the .csv saved somewhere specific, include the filepath in the .to_csv() call
evaluation_results.to_csv('word2vec_model_evaluation.csv')
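# an optional extra step (not part of the original walkthrough): print the dataframe, sorted by score,
# to review the results without opening the .csv
print(evaluation_results.sort_values(by='Cosine Similarity', ascending=False))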
Here are some resources if you would like to learn more about word vectors:
The Women Writers Vector Toolkit is a web interface for exploring word vectors, accompanied by glossaries, sources, case studies, and sample assignments. This toolkit includes links to a GitHub repository with RMD walkthroughs containing code for training word2vec models in R, as well as downloads and resources for preparing text corpora.
The Women Writers Project Resources page has guides on searching your corpus, corpus analysis and preparation, and model validation and assessment, as well as other materials for working with word vectors.
[Link to other PH tutorial in draft]
This walkthrough was written on November 16, 2022, using Python 3.8.3 and Gensim 4.2.0.