#!/usr/bin/env python
# coding: utf-8

# # Word2Vec Analysis on the Gnadenhutten Massacre
#
# **Author**: [Ung, Lik Teng](https://github.com/unglikteng)
# **Class**: [DH150, Winter 2019](http://asandersgarcia.humspace.ucla.edu/courses/dh150w19/)
# **Instructor**: [Professor Ashley Sanders Garcia](http://asandersgarcia.humspace.ucla.edu/)
#
# Word2Vec is a popular word embedding technique that represents words as vectors in a high-dimensional space, going beyond simple frequency counts. Its advantage is that it can capture the "context" of a word within a specific corpus. I trained a Word2Vec model on 9 newspaper articles about the Gnadenhutten Massacre, which took place on March 8, 1782. I am interested in how the different sides involved in the massacre were discussed in public discourse, and specifically in the words most associated with the Moravian Indians and the American militia.
#
# **Table of Contents**
# * [1. Documents Import](#import)
# * [2. Text Preprocessing](#preprocessing)
# * [3. Word2Vec Training](#training)
# * [4. Word2Vec to Tensor](#tensor)

# In[35]:


import cython, os  # ensure the cython package is installed on your computer/Canopy
import string, re, collections
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from string import ascii_letters, digits
from smart_open import smart_open
import gensim
from gensim.models import phrases
from gensim import corpora, models, similarities  # calculate all similarities at once, from http://radimrehurek.com/gensim/tut3.html
from gensim.models import Word2Vec, KeyedVectors
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.tools as tls
import plotly.plotly as py
import plotly.tools as plotly_tools
import plotly.graph_objs as go
plotly_tools.set_credentials_file(username='unglikteng', api_key='ho4TAl3mWMMNK6DnRSCL')
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

plt.style.use('ggplot')
get_ipython().run_line_magic('matplotlib', 'inline')


# ***
#
# ## 1. Documents Import
#
# The directory path and filenames are hardcoded here. Swap in your own text documents if you would like to analyze a different corpus with Word2Vec; an alternative loader is sketched after the preview cell at the end of this section.
# ***

# In[3]:


primaries = []
primaryPath = os.path.join(os.path.realpath(""), "primary")
for root, directories, files in os.walk(primaryPath):
    for txt in files:
        path = os.path.join(primaryPath, txt)
        with open(path, "r") as f:
            # join lines with a space so words at line breaks are not glued together
            primaries.append(" ".join(f.read().splitlines()))


# In[4]:


# 9 newspaper articles
len(primaries)


# In[5]:


# Get a sense of what the articles look like
primaries[0]
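# The loader above assumes a flat `primary/` folder sitting next to this notebook. Below is a minimal alternative sketch using `glob`; the `primary/*.txt` pattern is an assumption about how the article files are named, so adjust it to match your own corpus.

# In[ ]:


import glob

primaries_alt = []
for path in sorted(glob.glob(os.path.join("primary", "*.txt"))):
    with open(path, "r") as f:
        # one article per string, joining lines with a space
        primaries_alt.append(" ".join(f.read().splitlines()))

# should match len(primaries) if the folder holds only the 9 articles
len(primaries_alt)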
# ***
#
# ## 2. Text Preprocessing
# ***

# In[6]:


# Define our text preprocessor class
# Pipeline: tokenize -> remove stopwords -> stem
class Preprocessor:

    def tokenize_word(self, sentence, to_token=None):
        # lowercase everything
        lower = sentence.strip().lower()
        # replace punctuation with spaces
        punctuation_table = str.maketrans(string.punctuation, len(string.punctuation) * ' ')
        noPunc = lower.translate(punctuation_table)
        # remove digits and collapse whitespace
        nodigit = re.sub(r'\d+', '', noPunc)
        nodigit = re.sub(r'\s+', ' ', nodigit).strip()
        if to_token:
            return word_tokenize(nodigit)
        return nodigit

    def stem_word(self, tokens):
        stemmer = SnowballStemmer("english")
        return [stemmer.stem(token) for token in tokens]

    def remove_stopwords(self, tokens):
        stopword_list = stopwords.words("english")
        return [w for w in tokens if w not in stopword_list]


# In[7]:


# Instantiate the preprocessor and run the full pipeline on every article
preprocessor = Preprocessor()
primariesToken = [preprocessor.stem_word(
                      preprocessor.remove_stopwords(
                          preprocessor.tokenize_word(line, to_token=True)))
                  for line in primaries]
primariesUnstemmed = [preprocessor.remove_stopwords(
                          preprocessor.tokenize_word(line, to_token=True))
                      for line in primaries]


# In[8]:


# Build a stemming dictionary that maps each stemmed token back to an unstemmed form,
# so we can trace model vocabulary entries back to readable words
stem_dict = {}
for i, row in enumerate(primariesUnstemmed):
    for j, token in enumerate(row):
        stem_dict[primariesToken[i][j]] = token


# In[9]:


# Visualize the corpus - frequency analysis of the 20 most common tokens
def word_counter(list_of_doc):
    countVec = CountVectorizer()
    df_cv = countVec.fit_transform(list_of_doc)
    word_freq = dict(zip(countVec.get_feature_names(),
                         np.asarray(df_cv.sum(axis=0)).ravel()))
    word_counter = collections.Counter(word_freq)
    word_counter_df = pd.DataFrame(word_counter.most_common(20), columns=['word', 'freq'])
    a4_dims = (15, 10)
    fig, ax = plt.subplots(figsize=a4_dims)
    sns.barplot(x="word", y="freq", data=word_counter_df, palette="PuBuGn_d", ax=ax)
    return word_counter


# In[10]:


wc_primaries = word_counter([" ".join(tokens) for tokens in primariesToken])


# ***
#
# ## 3. Word2Vec Training
# ***

# In[11]:


# Text preprocessing -> phrase detection with gensim -> Word2Vec training
# Phrase detection: merge frequently co-occurring tokens into bigrams
bigram_transformer = phrases.Phrases(primariesToken)
bigram = phrases.Phraser(bigram_transformer)


# **Word2Vec hyperparameters**:
# * skip-gram (`sg=1`) is used instead of CBOW (Continuous Bag of Words), since skip-gram generally performs better on small datasets
# * dimension of the word vectors: 500
# * min_count: since the corpus is quite small, setting min_count to 2 is reasonable
#

# In[12]:


model_primaries = Word2Vec(bigram[primariesToken], workers=4, sg=1, size=500,
                           window=5, min_count=2, sample=1e-3)
# Precompute L2-normalized vectors. With replace=True the original vectors are discarded
# and only the normalized ones are kept; this saves memory, but the model can no longer
# be trained further.
model_primaries.init_sims(replace=True)
# Save your model for later use! Change the filename to something that records the
# hyperparameters you trained it with.
model_primaries.save("model_primaries")
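# Before reloading the model from disk, a quick sanity check can be run on the trained object directly. This is a minimal sketch, not part of the original analysis: the probe token `moravian` is an assumption (a stemmed form that may or may not clear the min_count=2 cutoff in your copy of the corpus).

# In[ ]:


probe = "moravian"
if probe in model_primaries.wv.vocab:
    # the ten nearest neighbours of the probe word in the embedding space
    for word, score in model_primaries.wv.most_similar(probe, topn=10):
        print(word, round(score, 3))
else:
    print(probe, "is not in the vocabulary")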
# In[13]:


# Load the saved model
model_p = Word2Vec.load("model_primaries")


# In[14]:


# There are 1318 words in the vocabulary
len(model_p.wv.vocab)


# In[15]:


model_p.wv.vocab.keys()


# In[16]:


# model_p.wv.most_similar(positive = ["white", "american"],
#                         negative = ["british"])


# $\overrightarrow{Dimension_g} = \overrightarrow{white} + \overrightarrow{american} - \overrightarrow{british}$

# In[17]:


whiteAmerican_british = [('moravian', 0.9998588562011719),
                         ('indian', 0.9998587369918823),
                         ('fairfield', 0.9998579621315002),
                         ('mani', 0.9998571872711182),
                         ('live', 0.9998558163642883),
                         ('murder', 0.9998539686203003),
                         ('day', 0.9998528957366943),
                         ('missionari', 0.9998528957366943),
                         ('massacr', 0.9998518824577332),
                         ('ohio', 0.999851405620575)]


# In[19]:


# Visualize the words most similar to "white American"
my_word_list = []
my_word_vectors = []
for i in whiteAmerican_british:
    if i[0] not in my_word_list:
        my_word_list.append(i[0])
        my_word_vectors.append(model_p.wv[i[0]])

# t-SNE (with PCA initialization) reduces the 500-dimensional vectors to 2-D, an "X" and a "Y".
# You may need to tune these parameters, especially the perplexity.
tsne_model = TSNE(perplexity=5, n_components=2, init='pca', n_iter=3000, random_state=23)
new_values = tsne_model.fit_transform(my_word_vectors)

x = []
y = []
for value in new_values:
    x.append(value[0])
    y.append(value[1])

trace1 = go.Scatter(
    x=x,
    y=y,
    mode='markers+text',
    name="Similar to White American",
    text=[stem_dict[word] if stem_dict.get(word) else word for word in my_word_list],
    textposition='bottom center'
)
data = [trace1]
layout = go.Layout(dict(title="Most Similar Words to White American",
                        yaxis=dict(title="Dimension2"),
                        xaxis=dict(title="Dimension1"),
                        plot_bgcolor="rgb(243,243,243)",
                        paper_bgcolor="rgb(243,243,243)",
                        ))
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)


# **Some of the significant words that come up along this dimension include**:
# * Massacre
# * Moravian Indians
# * War
# * Missionary
# * Ohio
#
# These words reflect what we know about the Gnadenhutten Massacre: the Moravian Indians, who were not allies of Britain, were massacred by American militia in Gnadenhutten, Ohio.
#
# ***
#
# ## 4. Word2Vec to Tensor
#
# I am using the [Google Embedding Projector](https://projector.tensorflow.org/) to visualize my Word2Vec model. Since the projector is built on TensorFlow, we need to convert the Word2Vec output to the tensor (TSV) format it expects. The function below does just that.
#
# The final visualization of this Word2Vec model can be found [here](https://projector.tensorflow.org/?config=https://gist.githubusercontent.com/unglikteng/3d31526c9e090ff8123c4e7a1b07d2bb/raw/74fa1165c92c98a890d7837b97f33599146ebb0f/projector_config.json).
# ***

# In[23]:


def word2vec2tensor(model, tensor_filename):
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'
    with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
        for word in model.wv.index2word:
            # write the unstemmed form (when we know it) to the metadata file,
            # but look the vector up under the stemmed form the model actually stores
            label = stem_dict[word] if stem_dict.get(word) else word
            file_metadata.write(gensim.utils.to_utf8(label) + gensim.utils.to_utf8('\n'))
            vector_row = '\t'.join(str(x) for x in model.wv[word])
            file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))


# In[28]:


# word2vec2tensor(model_p, "model_primaries")
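# To actually produce the projector files, uncomment and run the call above. The cell below is a minimal sketch (not part of the original analysis) that checks the two TSV files line up, one vector row per metadata row, before they are uploaded to the Embedding Projector; the filenames assume the function was called with "model_primaries".

# In[ ]:


if os.path.exists("model_primaries_tensor.tsv"):
    tensors = pd.read_csv("model_primaries_tensor.tsv", sep="\t", header=None)
    labels = pd.read_csv("model_primaries_metadata.tsv", sep="\t", header=None)
    # row counts should match; tensors should have 500 columns (one per vector dimension)
    print(tensors.shape, labels.shape)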