#!/usr/bin/env python
# coding: utf-8

# Open In Colab

# ## Preparing Your Corpus

# In[ ]:

# A good practice in programming is to place your import statements at the top of your code, and to keep them together

import re                              # for regular expressions
import os                              # to look up operating system-based info
import string                          # to do fancy things with strings
import glob                            # to locate a specific file type
from pathlib import Path               # to access files in other directories
import gensim                          # to access Word2Vec
from gensim.models import Word2Vec     # to access Gensim's flavor of Word2Vec
import pandas as pd                    # to sort and organize data


# In[ ]:

dirpath = r'FILL IN YOUR FILEPATH HERE'   # get file path (you can change this)
file_type = ".txt"                        # if your data is not in a plain text format, you can change this
filenames = []
data = []

# this for loop will run through folders and subfolders looking for a specific file type
for root, dirs, files in os.walk(dirpath, topdown=False):
    # look through all the files in the given directory
    for name in files:
        if (root + os.sep + name).endswith(file_type):
            filenames.append(os.path.join(root, name))

# this for loop then goes through the list of files, reads them, and then adds the text to a list
for filename in filenames:
    # assuming the files are UTF-8 encoded
    with open(filename, encoding="utf-8") as afile:
        print(filename)
        data.append(afile.read())   # read the file and then add it to the list
        # the with statement closes the file automatically when you're done


# In[ ]:

def clean_text(text):
    # Cleans the given text using regular expressions to split and lower-cased versions to create
    # a list of tokens for the text.
    # The function accepts a single text (a string) and returns a list of tokens.

    # split the text into tokens and lower-case them
    tokens = text.split()
    tokens = [t.lower() for t in tokens]

    # remove punctuation using regular expressions
    # this line of code compiles all the punctuation characters into a single regular expression
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # this line of code substitutes the punctuation we just compiled with nothing ''
    tokens = [re_punc.sub('', token) for token in tokens]

    # only keep tokens that are entirely alphabetic
    # (this drops numbers and mixed tokens such as "2cups")
    tokens = [token for token in tokens if token.isalpha()]
    return tokens


# In[ ]:

# clean text from folder of text files, stored in the data variable
data_clean = []
for x in data:
    data_clean.append(clean_text(x))


# In[ ]:

# Check that the length of data and the length of data_clean are the same.
# Both numbers printed should be the same
print(len(data))
print(len(data_clean))
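
# As an extra sanity check (not part of the original walkthrough), you can run `clean_text` on a single
# made-up sentence; the sample string below is purely illustrative, but the output shows how the
# lower-casing, punctuation removal, and `isalpha()` filtering behave

# In[ ]:

# run the cleaning function on a hypothetical sample sentence
sample = "Stir in 2 cups of FLOUR, then bake for 20 minutes."
print(clean_text(sample))
# expected output: ['stir', 'in', 'cups', 'of', 'flour', 'then', 'bake', 'for', 'minutes']
# note that "2" and "20" are dropped by the isalpha() filter,
# and "FLOUR," is lower-cased with its comma removed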
# In[ ]:

# check that the first item in data and the first item in data_clean are the same
# both print statements should print the same word, with the data cleaning function applied in the second one
print(data[0].split()[0])
print(data_clean[0][0])


# In[ ]:

# check that the last item in data_clean and the last item in data are the same
# both print statements should print the same word, with the data cleaning function applied in the second one
print(data[0].split()[-1])
print(data_clean[0][-1])


# ## Model Creation

# In[ ]:

# train the model
model = Word2Vec(sentences=data_clean, window=5, min_count=3, workers=4, epochs=5, sg=1)

# save the model
model.save("word2vec.model")


# In[ ]:

# load the model
model = Word2Vec.load("word2vec.model")


# ## Analysis
#
# ### Exploratory Queries

# In[ ]:

# set the word that we are checking for
word = "milk"

# if that word is in our vocabulary
if word in model.wv.key_to_index:
    # print a statement to let us know
    print("The word %s is in your model vocabulary" % word)
# otherwise, let us know that it isn't
else:
    print("%s is not in your model vocabulary" % word)


# In[ ]:

# returns a list with the top ten words used in similar contexts to the word "milk"
model.wv.most_similar('milk', topn=10)


# In[ ]:

# returns the top ten most similar words to "recipe" that are dissimilar from "milk"
model.wv.most_similar(positive=["recipe"], negative=["milk"], topn=10)


# In[ ]:

# returns the top ten most similar words to both "recipe" and "milk"
model.wv.most_similar(positive=["recipe", "milk"], topn=10)


# In[ ]:

# returns a cosine similarity score for the two words you provide
model.wv.similarity("milk", "cream")


# In[ ]:

# returns a prediction for the other words in a set containing the words "flour," "eggs," and "cream"
model.predict_output_word(["flour", "eggs", "cream"])


# In[ ]:

# displays the number of words in your model's vocabulary
print(len(model.wv))


# ## Validation

# In[ ]:

dirpath = Path(r".").glob('*.model')   # current directory plus only files that end in '.model'
files = dirpath
model_list = []        # a list to hold the actual models
model_filenames = []   # the filepaths for the models so we know where they came from


# In[ ]:

# this for loop looks for files that end with ".model", loads them, and then adds those to a list
for filename in files:
    # turn the filename into a string and save it to "file_path"
    file_path = str(filename)
    print(file_path)
    # load the model with the file_path
    model = Word2Vec.load(file_path)
    # add the model to our model_list
    model_list.append(model)
    # add the filepath to the model_filenames list
    model_filenames.append(file_path)


# In[ ]:

# set the word that we are checking for
word = "milk"

# if that word is in our vocabulary
if word in model.wv.key_to_index:
    # print a statement to let us know
    print("The word %s is in your model vocabulary" % word)
# otherwise, let us know that it isn't
else:
    print("%s is not in your model vocabulary" % word)


# In[ ]:

# test word pairs that we are going to use to evaluate the models
test_words = [("stir", "whisk"),
              ("cream", "milk"),
              ("cake", "muffin"),
              ("jam", "jelly"),
              ("reserve", "save"),
              ("bake", "cook")]


# In[ ]:

# these for loops will go through each list, the test word list and the models list,
# and will run all the words through each model
# then the results will be added to a dataframe

# since NumPy 1.19.0, working with arrays of conflicting dimensions will sometimes throw a deprecation warning
# this warning does not impact the code or the results, so we're going to filter it out
# you can also specify "dtype=object" on the resulting array
# if you want to silence the warning, uncomment the next two lines:
# import numpy as np
# np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

# create an empty dataframe with the column headings we need
evaluation_results = pd.DataFrame(columns=['Model', 'Test Words', 'Cosine Similarity'], dtype=object)

# iterate through the model_list
for i in range(len(model_list)):
    # for each model in model_list, test the tuple pairs
    for x in range(len(test_words)):
        # calculate the similarity score for each tuple
        similarity_score = model_list[i].wv.similarity(*test_words[x])
        # create a temporary row with the test results
        df = [model_filenames[i], test_words[x], similarity_score]
        # add the row to our final dataframe, using the current length of the dataframe
        # as the index so results from one model don't overwrite those from another
        evaluation_results.loc[len(evaluation_results)] = df

# save the evaluation_results dataframe as a .csv called "word2vec_model_evaluation.csv" in our current directory
# if you want the .csv saved somewhere specific, include the filepath in the .to_csv() call
evaluation_results.to_csv('word2vec_model_evaluation.csv')
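
# With the scores saved, a natural follow-up is to rank the models. The cell below is a minimal sketch that
# is not part of the original walkthrough: it averages each model's cosine similarity across the test pairs
# with pandas, assuming the `evaluation_results` dataframe built above

# In[ ]:

# the dataframe was created with dtype=object, so convert the scores to floats before averaging
evaluation_results['Cosine Similarity'] = evaluation_results['Cosine Similarity'].astype(float)

# average each model's similarity scores across all test pairs, highest first;
# a model that scores consistently higher on these known-similar pairs is, roughly,
# better at capturing the relationships we expect from the corpus
summary = evaluation_results.groupby('Model')['Cosine Similarity'].mean().sort_values(ascending=False)
print(summary)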

# ## Next Steps
#
# Here are some resources if you would like to learn more about word vectors:
#
# - The [Women Writers Vector Toolkit](https://wwp.northeastern.edu/lab/wwvt/index.html) is a web interface for exploring word vectors, accompanied by glossaries, sources, case studies, and sample assignments. This toolkit includes links to a [GitHub repository with RMD walkthroughs](https://github.com/NEU-DSG/wwp-public-code-share/tree/main/WordVectors) containing code for training word2vec models in R, as well as [downloads and resources on preparing text corpora](https://wwp.northeastern.edu/lab/wwvt/resources/downloads/index.html).
#
# - The [Women Writers Project Resources](https://wwp.northeastern.edu/outreach/resources/index.html) page has guides on searching your corpus, corpus analysis and preparation, model validation and assessment, and other materials for working with word vectors.
#
# - [Link to other PH tutorial in draft]
#
#
# _This walkthrough was written on November 16, 2022 using Python 3.8.3 and Gensim 4.2.0._