#!/usr/bin/env python
# coding: utf-8
# # Word2Vec Analysis on the Gnadenhutten Massacre
#
# **Author**: [Ung, Lik Teng](https://github.com/unglikteng)
# **Class**: [DH150, Winter 2019](http://asandersgarcia.humspace.ucla.edu/courses/dh150w19/)
# **Instructor**: [Professor Ashley Sanders Garcia](http://asandersgarcia.humspace.ucla.edu/)
#
# Word2Vec is a popular word-embedding technique that models words in a high-dimensional vector space, going beyond simple frequency counts. Its advantage is that it captures the "context" of a word within a specific corpus. I trained a Word2Vec model on 9 newspaper articles about the Gnadenhutten Massacre, which took place on March 8, 1782. I am interested in how the different sides involved in the massacre were discussed in public discourse; specifically, in the words most associated with the Moravian Indians and the American militia.
#
# **Table of Contents**
# * [1. Documents Import](#import)
# * [2. Text Preprocessing](#preprocessing)
# * [3. Word2Vec Training](#training)
# * [4. Word2Vec to Tensor](#tensor)
# In[35]:
import cython, os  # ensure the cython package is installed so gensim's fast training paths are available
import string, re, collections
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from string import ascii_letters, digits
from smart_open import smart_open
import gensim
from gensim.models import phrases
from gensim import corpora, models, similarities #calc all similarities at once, from http://radimrehurek.com/gensim/tut3.html
from gensim.models import Word2Vec, KeyedVectors
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
import plotly.plotly as py
import plotly.tools as plotly_tools
import plotly.graph_objs as go
plotly_tools.set_credentials_file(username='unglikteng', api_key='YOUR_PLOTLY_API_KEY')  # replace with your own Plotly API key
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
plt.style.use('ggplot')
get_ipython().run_line_magic('matplotlib', 'inline')
# ***
#
# ## 1. Documents Import
#
# The directory path ("primary") is hardcoded here. Place your own text documents in that folder if you would like to analyze them with Word2Vec.
# ***
# In[3]:
primaries = []
primaryPath = os.path.join(os.path.realpath(""), "primary")
for root, directories, files in os.walk(primaryPath):
    for txt in files:
        path = os.path.join(root, txt)
        with open(path, "r") as f:
            # join lines with a space so words at line breaks are not fused
            primaries.append(" ".join(f.read().splitlines()))
# In[4]:
# 9 Newspaper articles
len(primaries)
# In[5]:
# Get a sense of what an article looks like
primaries[0]
# ***
#
# ## 2. Text Preprocessing
# ***
# In[6]:
# Define our text preprocessor class
## Tokenize -> Remove stopwords -> Stemming
class Preprocessor:
    def tokenize_word(self, sentence, to_token=None):
        # lowercase everything
        lower = sentence.strip().lower()
        # replace punctuation with spaces
        punctuation_table = str.maketrans(string.punctuation, len(string.punctuation) * ' ')
        noPunc = lower.translate(punctuation_table)
        # remove digits and collapse whitespace
        nodigit = re.sub(r'\d+', '', noPunc)
        nodigit = re.sub(r'\s+', ' ', nodigit).strip()
        if to_token:
            return word_tokenize(nodigit)
        return nodigit

    def stem_word(self, tokens):
        stemmer = SnowballStemmer("english")
        return [stemmer.stem(token) for token in tokens]

    def remove_stopwords(self, tokens):
        stopword_list = stopwords.words("english")
        return [w for w in tokens if w not in stopword_list]
# In[7]:
# Define preprocessor object
preprocessor = Preprocessor()
primariesToken = [preprocessor.stem_word(
                      preprocessor.remove_stopwords(
                          preprocessor.tokenize_word(line, to_token=True)))
                  for line in primaries]
primariesUnstemmed = [preprocessor.remove_stopwords(
                          preprocessor.tokenize_word(line, to_token=True))
                      for line in primaries]
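# As a quick sanity check, run the pipeline on an illustrative sentence (not
# from the corpus); the expected outputs in the comments assume NLTK's
# word_tokenize and Snowball stemmer behave as usual.
# In[ ]:
demo_tokens = preprocessor.tokenize_word("The 96 Moravian Indians were killed at Gnadenhutten!", to_token=True)
print(demo_tokens)
# ['the', 'moravian', 'indians', 'were', 'killed', 'at', 'gnadenhutten']
print(preprocessor.stem_word(preprocessor.remove_stopwords(demo_tokens)))
# expected (Snowball): ['moravian', 'indian', 'kill', 'gnadenhutten']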
# In[8]:
# Build stemming dictionary
# This dictionary will help us trace back to the unstemmed words
stem_dict = {}
for i, row in enumerate(primariesUnstemmed):
    for j, token in enumerate(row):
        # stopword removal happens before stemming in both lists, so token
        # positions line up; if two words share a stem, the last one seen wins
        stem_dict[primariesToken[i][j]] = token
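# Since the dictionary keeps one unstemmed form per stem, we can spot-check how
# stems map back to readable words. The stems queried here ('moravian',
# 'massacr', 'missionari') all appear in the similarity results later in this
# notebook, so they should be present in the corpus.
# In[ ]:
for stem in ['moravian', 'massacr', 'missionari']:
    print(stem, '->', stem_dict.get(stem, '(stem not found)'))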
# In[9]:
# Visualize the corpus - Frequency Analysis
def word_counter(list_of_doc):
    countVec = CountVectorizer()
    df_cv = countVec.fit_transform(list_of_doc)
    word_freq = dict(zip(countVec.get_feature_names(),
                         np.asarray(df_cv.sum(axis=0)).ravel()))
    word_counter = collections.Counter(word_freq)
    word_counter_df = pd.DataFrame(word_counter.most_common(20), columns=['word', 'freq'])
    a4_dims = (15, 10)
    fig, ax = plt.subplots(figsize=a4_dims)
    sns.barplot(x="word", y="freq", data=word_counter_df, palette="PuBuGn_d", ax=ax)
    return word_counter
# In[10]:
wc_primaries = word_counter([" ".join(tokens) for tokens in primariesToken])
# ***
#
# ## 3. Word2Vec Training
# ***
# In[11]:
# Text Preprocessing -> Phrase Detection with Gensim -> Word2Vec Training
## Phrase Detection
bigram_transformer = phrases.Phrases(primariesToken)
bigram = phrases.Phraser(bigram_transformer)
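# To see what, if anything, the phrase detector picked up, scan a transformed
# document for joined tokens; a minimal check, using the first article as an
# example (detected phrases contain an underscore):
# In[ ]:
print([t for t in bigram[primariesToken[0]] if '_' in t][:10])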
# **Word2Vec Hyperparameters**:
# * The skip-gram method is used instead of CBOW (Continuous Bag of Words), since skip-gram generally performs better on small datasets
# * Dimension of word vectors: 500
# * min_count: since the corpus is quite small, setting min_count to 2 is reasonable
#
# In[12]:
model_primaries = Word2Vec(bigram[primariesToken], workers=4, sg=1, size=500, window=5, min_count=2, sample=1e-3)
model_primaries.init_sims(replace=True)  # Precompute L2-normalized vectors. With replace=True the original vectors are discarded and only the normalized ones kept: this saves a lot of memory, but the model can no longer be trained further.
model_primaries.save("model_primaries")  # Save the model for later use! Rename the file to record the hyperparameters it was trained with.
# In[13]:
# Load the model
model_p = Word2Vec.load("model_primaries")
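# Because init_sims(replace=True) was called before saving, every stored vector
# should have unit length. A quick check using 'moravian', a word the
# similarity results below confirm is in the vocabulary:
# In[ ]:
print(np.linalg.norm(model_p.wv['moravian']))  # expect ~1.0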
# In[14]:
# There are 1318 words in the vocabulary
len(model_p.wv.vocab)
# In[15]:
model_p.wv.vocab.keys()
# In[16]:
# model_p.wv.most_similar(positive = ["white", "american"],
# negative = ["british"])
# $\overrightarrow{Dimension_g} = \overrightarrow{white} + \overrightarrow{american} - \overrightarrow{british}$
# In[17]:
# Hardcoded output of the most_similar query above, saved from a previous run
# (exact similarity scores vary between trainings)
whiteAmerican_british = [('moravian', 0.9998588562011719),
('indian', 0.9998587369918823),
('fairfield', 0.9998579621315002),
('mani', 0.9998571872711182),
('live', 0.9998558163642883),
('murder', 0.9998539686203003),
('day', 0.9998528957366943),
('missionari', 0.9998528957366943),
('massacr', 0.9998518824577332),
('ohio', 0.999851405620575)]
# In[19]:
## Visualize the words most similar to "White American"
my_word_list = []
my_word_vectors = []
for i in whiteAmerican_british:
    if i[0] not in my_word_list:  # skip duplicate words
        my_word_list.append(i[0])
        my_word_vectors.append(model_p.wv[i[0]])
tsne_model = TSNE(perplexity=5, n_components=2, init='pca', n_iter=3000, random_state=23)  # You may need to tune these, especially the perplexity. t-SNE reduces the 500-dimensional vectors to a 2-D "X" and "Y"; init='pca' seeds the layout with PCA.
new_values = tsne_model.fit_transform(my_word_vectors)
x = []
y = []
for value in new_values:
    x.append(value[0])
    y.append(value[1])
trace1 = go.Scatter(
x = x,
y = y,
mode = 'markers+text',
name = "Similar to White American",
text = [stem_dict.get(word, word) for word in my_word_list],
textposition='bottom center'
)
data = [trace1]
layout = go.Layout(dict(title = "Most similar Words to White American",
yaxis = dict(title = "Dimension2"),
xaxis = dict(title = "Dimension1"),
plot_bgcolor = "rgb(243,243,243)",
paper_bgcolor = "rgb(243,243,243)",
)
)
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)
# **Some of the significant words that come up from this specific dimension include**:
# * Massacre
# * Moravian Indians
# * War
# * Missionary
# * Ohio
#
# These words reconstruct what we know about the Gnadenhutten Massacre: the Moravian Indians, who were not allies of Britain, were massacred by American militia in Gnadenhutten, Ohio.
#
# ***
#
# ## 4. Word2Vec to Tensor
#
# I am using the [Google Embedding Projector](https://projector.tensorflow.org/) to visualize my Word2Vec model. Since the projector is built on TensorFlow, we need to convert the Word2Vec output to its TSV tensor format. The function below does just that.
#
# The final visualization of this Word2Vec model can be found [here](https://projector.tensorflow.org/?config=https://gist.githubusercontent.com/unglikteng/3d31526c9e090ff8123c4e7a1b07d2bb/raw/74fa1165c92c98a890d7837b97f33599146ebb0f/projector_config.json).
# ***
# In[23]:
def word2vec2tensor(model, tensor_filename):
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'
    with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
        for word in model.wv.index2word:
            # Write the unstemmed form (when known) to the metadata file, but
            # look the vector up by the stemmed vocabulary word: the unstemmed
            # form is not itself in the model's vocabulary
            label = stem_dict.get(word, word)
            file_metadata.write(gensim.utils.to_utf8(label) + gensim.utils.to_utf8('\n'))
            vector_row = '\t'.join(str(x) for x in model.wv[word])
            file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))
# In[28]:
# Uncomment to (re)generate the TSV files for the Embedding Projector
# word2vec2tensor(model_p, "model_primaries")
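# A minimal sanity check of the exported files, assuming the call above has
# been uncommented and run with the "model_primaries" prefix:
# In[ ]:
with open("model_primaries_metadata.tsv") as f:
    print(f.readline().strip())  # first (unstemmed) vocabulary word
with open("model_primaries_tensor.tsv") as f:
    print(len(f.readline().split('\t')))  # should equal 500, the vector dimension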