# a bunch of import statements for the functions we'll be using
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_selection import SelectKBest
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import re, string, codecs
This notebook teaches how to group texts together based on their similarities/differences and how to visualize the results. By the end of the notebook, we will be able to put in a new text and see where it falls in relation to all of the other texts.
Module Developers: Sujude, Erik, Jonathan, Stephanie
1 - [Section 1: Clustering Documents](#section1)
2 - [Section 2: Multidimensional Scaling](#section2)
Dependencies: scikit-learn, SciPy, NumPy, pandas, and matplotlib (imported above); Section 2 additionally uses plotly and seaborn.
Clustering is the grouping of a set of objects by their characteristics, so that "objects in the same group (called a cluster) are more similar (in some sense or another) to each other than to those in other groups (clusters)." (Learn more: https://en.wikipedia.org/wiki/Cluster_analysis)
First we'll read in the DTM (Document Term Matrix) from a CSV (comma-separated values) file. We're using a DTM that has been cleaned of all stopwords and normalized.
The first line of the csv consists of a list of all the words, and the first column of the csv has all the text names, so we'll load those in first. Below I use the file path 'Data/cleanonegram.csv', but if you want to try out different DTMs (all of them can be found in the Data folder) you can replace the filepath with your chosen file name. All file paths should be of the form 'Data/[name of doc here].csv'.
filepath = 'Data/cleanonegram.csv' #this is where the dtm is stored
We'll read in the csv file using the relevant pandas function. Below I do so and display the first 5 rows of the resulting DataFrame (a DataFrame is what the pandas library uses to represent a table).
tfidf = pd.read_csv(filepath, index_col=0)
# tfidf = pd.concat([pd.read_csv('Data/cleanonegram.csv', index_col = 0).loc[["NEW"], tfidf.columns], tfidf])
tfidf.head()
First let's generate a list of words and texts present in the DTM. The list of words is just all the column names of the DataFrame. The list of document IDs is the index (or the name of a row) of each row of the DataFrame.
words = [re.sub(r'[^\w\[\]\-/]', '', word) for word in list(tfidf.columns.values)] #get all the column names (i.e. words)
print("First 5 words: ", words[:5])
print("Number of words: ", len(words))
texts = list(tfidf.index.values) #get all the index names (i.e. text IDs)
print("First 5 texts: ", texts[:5])
print("Number of texts: ", len(texts))
I'll also define a function to access document names based on indices. This will come in handy later.
def id_text(index):
    return texts[index]
I'll also define a lookup table and a function that map document IDs (like c.0.1.1) to document names (like 'Ur III catalogue from Nibru (N1)'). Don't worry too much about how this works; just understand what it does.
idToName = pd.read_csv('Data/idToTextName.csv', index_col=0, names=['id', 'name'])
idToName.head()
id | name
---|---
c.4.12.2 | A hymn to Martu (Martu B)
c.4.12.1 | A šir-gida to Martu (Martu A)
c.4.29.1 | A šir-gida to Nuska (Nuska A)
c.4.29.2 | A šir-gida to Nuska (Nuska B)
c.4.08.31 | A balbale to Inana (Dumuzid-Inana E1)
def text_name(doc):
    if isinstance(doc, int): #if an index was passed in, turn it into a doc ID
        doc = id_text(doc)
    try:
        return idToName.loc[doc]['name'] #not all document labels will be in the DataFrame
    except KeyError:
        return doc #if not in the DataFrame, just return the argument passed in
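A couple of example calls (the first result is grounded in the table above; the second assumes texts has been loaded):

print(text_name('c.4.12.2')) #-> 'A hymn to Martu (Martu B)'
print(text_name(0)) #converts index 0 to its document ID first, then looks up the name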
The ultimate goal of this exercise is to compare a new text to the existing texts in the corpus. We will first do this by categorizing the existing documents into "clusters", and then seeing which of those clusters the new text best fits in. Below we define a new text.
new_text = tfidf.iloc[0] #the new text, which is really just a row from the tfidf
We now try to cluster documents that are similar to each other.
K-Means clustering is an iterative method to group vectors into $k$ clusters. It uses the following process:

1. Initialize $k$ cluster centers at random positions.
2. Assign each vector to its nearest cluster center.
3. Move each cluster center to the mean of the vectors assigned to it.
4. Repeat steps 2 and 3 until the assignments stop changing.
To see this process visually, consider the following two-dimensional dataset (our actual dataset is more like 4000+ dimensions, one dimension for each term):
xs = [1, 1.5, 1.5, 2, 3.5, 4]
ys = [1, 0.0, 5, 4, 2, 2]
plt.plot(xs, ys, 'bs')
plt.show()
Let's say we want three clusters. First we initialize random cluster centers (in red):
cxs = [2, 3, 4]
cys = [0, 2, 3]
plt.plot(xs, ys, 'bs')
plt.plot(cxs, cys, 'r^')
plt.show()
We now assign each data point (or vector) to the nearest cluster center.
plt.plot(xs[:2], ys[:2], 'bs')
plt.plot(xs[2:5], ys[2:5], 'gs')
plt.plot(xs[5:], ys[5:], 'ys')
plt.plot(cxs, cys, 'r^')
plt.show()
Here I've colored the ones closest to the (2, 0) cluster in blue, the ones closest to the (3, 2) cluster in green, and the ones closest to the (4, 3) cluster in yellow.
Now we recenter each cluster center at the actual mean of the points currently assigned to it:
cxs = [1.25, 7/3, 4]
cys = [.5, 11/3, 2]
plt.plot(xs[:2], ys[:2], 'bs')
plt.plot(xs[2:5], ys[2:5], 'gs')
plt.plot(xs[5:], ys[5:], 'ys')
plt.plot(cxs, cys, 'r^')
plt.show()
If we repeat the above process (assign vectors to closest centers, then re-define centers) one more time, we will end up with the below clustering.
cxs = [1.25, 1.75, 3.75]
cys = [.5, 4.5, 2]
plt.plot(xs[:2], ys[:2], 'bs')
plt.plot(xs[2:4], ys[2:4], 'gs')
plt.plot(xs[4:], ys[4:], 'ys')
plt.plot(cxs, cys, 'r^')
plt.show()
Looks about right! Even though it only took two iterations this time, it usually takes many iterations to converge. Furthermore, since the initial cluster centers are randomized, and our dataset doesn't have extremely clear clusters like this example does, we will end up with a different clustering every time we run k-means on it.
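As a quick sanity check, we can have scikit-learn's KMeans do the same thing on the toy dataset. This is just a sketch; since initialization is random, the cluster numbering (and occasionally the grouping) may differ from run to run.

toy_points = np.column_stack([xs, ys]) #shape (6, 2): one row per toy point
toy_km = KMeans(n_clusters=3).fit(toy_points)
print(toy_km.labels_) #which cluster each toy point was assigned to
print(toy_km.cluster_centers_) #the final cluster centers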
Let's try executing K-Means on our documents using 7 clusters. I picked 7 arbitrarily - in the future we can try using the "elbow method" to determine a good number of clusters (a sketch follows below), or we can try different options and manually inspect the clusters to see if they make any sense.
We use scikit-learn's convenient KMeans class; it does all the work for us, we just have to specify the number of clusters.
dtm_normalized_kmeans = KMeans(n_clusters=7, max_iter=1000).fit(tfidf)
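As an aside, here is a minimal sketch of the elbow method mentioned above (assuming tfidf is loaded): it plots the model's inertia_ (the within-cluster sum of squared distances) for a range of k values, and the "elbow" where the curve stops dropping steeply suggests a reasonable k.

ks = range(2, 15)
inertias = [KMeans(n_clusters=k, max_iter=1000).fit(tfidf).inertia_ for k in ks]
plt.plot(list(ks), inertias, 'o-')
plt.xlabel('number of clusters (k)')
plt.ylabel('inertia')
plt.show()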
We can examine the "label" of each of the documents as defined by this clustering - this tells us which cluster each of the documents is classified under.
labels = dtm_normalized_kmeans.labels_
labels
array([3, 2, 5, 5, 5, 2, 5, 5, 2, 2, 5, 2, 2, 3, 5, 5, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 3, 2, 2, 4, 4, 4, 3, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 2, 2, 6, 6, 6, 2, 3, 2, 5, 3, 3, 3, 3, 3, 5, 5, 6, 6, 6, 2, 6, 6, 6, 6, 5, 3, 5, 5, 3, 5, 2, 5, 5, 5, 5, 4, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 4, 5, 5, 5, 5, 5, 5, 5, 1, 2, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 3, 5, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 6, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 3, 2, 2, 2, 2, 2, 2, 1, 2, 5, 3, 5, 2, 5, 5, 2, 2, 5, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 5, 5, 5, 2, 5, 2, 5, 2, 6, 5, 5, 5, 2, 5, 5, 5, 6, 5, 5, 3, 2, 2, 5, 5, 2, 2, 5, 2, 2, 2, 5, 5, 2, 5, 5, 5, 2, 5, 2, 2, 2, 4, 4, 4, 4, 2, 4, 3, 5, 5, 5, 2, 2, 2, 5, 2, 5, 2, 5, 3, 3, 2, 2, 2, 2, 3, 3, 2, 2, 3, 2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3])
It seems like there is a nice spread of documents across the different clusters. (What we don't want is something like 50% or more of the documents falling into the same cluster.)
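One quick way to check this is to count how many documents landed in each cluster (a one-line sketch using numpy):

print(np.bincount(labels)) #documents per cluster; no single count should dominate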
Let's see what the centers of each of the clusters look like.
cluster_centers = dtm_normalized_kmeans.cluster_centers_
cluster_centers
array([[ 0.00000000e+00, 2.77052924e-02, -1.08420217e-19, ..., 0.00000000e+00, 5.42101086e-20, 2.71050543e-20], [ 6.45082761e-03, 8.67361738e-19, 2.16840434e-19, ..., -2.16840434e-19, 1.08420217e-19, -2.71050543e-20], [ 4.40117818e-03, 3.15881856e-03, 1.07511997e-03, ..., 1.41651819e-03, 2.83007182e-04, 5.01722791e-04], ..., [ 0.00000000e+00, 8.67361738e-19, 2.16840434e-19, ..., 0.00000000e+00, 2.40189125e-03, 2.71050543e-20], [ 4.97035070e-04, 7.37797049e-05, 1.08420217e-18, ..., -1.73472348e-18, 2.90439073e-04, -1.08420217e-19], [ 1.94294611e-03, 1.30913345e-03, 2.16840434e-19, ..., 0.00000000e+00, 1.08420217e-19, 2.71050543e-20]])
Interesting... We get a nice variation across cluster centers (i.e. not all 0s or anything like that), so we can move on.
Below I define a function that takes in a number of clusters, runs k-means, and outputs a dictionary mapping each cluster number to the list of documents that belong to that cluster. The idea is that someone familiar with this corpus could look at the output and determine if the clustering makes any sense whatsoever.
def inspect_clusters(num_clusters, use_id=True):
    km = KMeans(n_clusters=num_clusters, max_iter=1000).fit(tfidf)
    labels = km.labels_
    clusters = {}
    for i in range(num_clusters):
        docs = [j for j in range(len(labels)) if labels[j] == i]
        clusters[i] = [id_text(k) for k in docs] if use_id else [text_name(id_text(k)) for k in docs]
    return clusters
Here's an example of a use of this function. This examines the documents that appear in a certain cluster given that we choose $k = 7$.
seven_clusters = inspect_clusters(7)
print(seven_clusters[0])
['c.1.7.1', 'c.3.1.01', 'c.3.1.02', 'c.3.1.03', 'c.3.1.04', 'c.3.1.05', 'c.3.1.06', 'c.3.1.06.1', 'c.3.1.07', 'c.3.1.08', 'c.3.1.11.1', 'c.3.1.13.2', 'c.3.1.15', 'c.3.1.16', 'c.3.1.17', 'c.3.1.18', 'c.3.1.19', 'c.3.1.20', 'c.3.2.02', 'c.3.2.03', 'c.3.2.04', 'c.3.3.04', 'c.3.3.05', 'c.3.3.08', 'c.3.3.11', 'c.4.22.2', 'c.5.5.5', 'c.5.6.3', 'c.6.1.13', 'c.6.1.27']
print(seven_clusters[1])
['NEW', 'c.0.1.1', 'c.0.2.08', 'c.1.1.1', 'c.1.1.2', 'c.1.1.3', 'c.1.1.4', 'c.1.2.2', 'c.1.3.1', 'c.1.3.2', 'c.1.3.3', 'c.1.3.5', 'c.1.4.1', 'c.1.4.1.1', 'c.1.4.3', 'c.1.4.4', 'c.1.5.1', 'c.1.7.3', 'c.1.7.4', 'c.1.7.7', 'c.1.8.1.2', 'c.1.8.1.3', 'c.1.8.1.4', 'c.1.8.1.5', 'c.1.8.1.5.1', 'c.1.8.2.1', 'c.1.8.2.2', 'c.1.8.2.3', 'c.1.8.2.4', 'c.2.1.4', 'c.2.1.5', 'c.2.1.6', 'c.2.1.7', 'c.2.2.2', 'c.2.2.3', 'c.2.2.4', 'c.2.2.5', 'c.2.2.6', 'c.2.3.1', 'c.2.3.2', 'c.2.4.1.1', 'c.2.4.2.01', 'c.2.4.2.02', 'c.2.4.2.03', 'c.2.4.2.04', 'c.2.4.2.05', 'c.2.4.2.21', 'c.2.4.2.24', 'c.2.4.2.a', 'c.2.5.1.2', 'c.2.5.1.3', 'c.2.5.1.4', 'c.2.5.2.1', 'c.2.5.3.1', 'c.2.5.3.4', 'c.2.5.4.10', 'c.2.5.4.11', 'c.2.5.4.a', 'c.2.5.6.6', 'c.2.5.8.1', 'c.2.6.9.1', 'c.2.7.1.1', 'c.2.8.2.1', 'c.2.8.3.1', 'c.2.8.5.a', 'c.3.1.11', 'c.3.2.05', 'c.3.3.01', 'c.3.3.03', 'c.3.3.09', 'c.3.3.10', 'c.3.3.21', 'c.3.3.22', 'c.3.3.39', 'c.4.02.1', 'c.4.06.1', 'c.4.07.1', 'c.4.07.2', 'c.4.07.3', 'c.4.07.4', 'c.4.07.5', 'c.4.07.a', 'c.4.08.09', 'c.4.08.10', 'c.4.08.20', 'c.4.08.29', 'c.4.08.33', 'c.4.08.a', 'c.4.14.1', 'c.4.14.3', 'c.4.15.2', 'c.4.15.3', 'c.4.16.1', 'c.4.16.2', 'c.4.19.1', 'c.4.19.2', 'c.4.19.3', 'c.4.22.1', 'c.4.22.4', 'c.4.22.5', 'c.4.25.1', 'c.4.28.1', 'c.4.31.1', 'c.4.32.2', 'c.4.80.2', 'c.5.1.3', 'c.5.2.4', 'c.5.2.5', 'c.5.3.2', 'c.5.3.3', 'c.5.3.5', 'c.5.3.7', 'c.5.4.11', 'c.5.4.12', 'c.5.5.1', 'c.5.5.2', 'c.5.5.a', 'c.5.7.1', 'c.5.7.2', 'c.5.9.2']
Save the clusters into variables (used later in the MDS section).
cluster_0 = seven_clusters[0]
cluster_1 = seven_clusters[1]
cluster_2 = seven_clusters[2]
cluster_3 = seven_clusters[3]
cluster_4 = seven_clusters[4]
cluster_5 = seven_clusters[5]
cluster_6 = seven_clusters[6]
We can also use the names of documents instead of their IDs by setting the use_id flag to False in the function call.
seven_clusters_names = inspect_clusters(7, use_id=False)
seven_clusters_names[1][:10]
['OB catalogue possibly from Zimbir (B1)', 'Dumuzid and his sisters', 'Inana and Bilulu: an ulila to Inana', 'A lullaby for a son of Šulgi (Šulgi N)', 'A love song of Šulgi (Šulgi Z)', 'A song of Šulgi', 'A balbale to Bau for Šu-Suen (Šu-Suen A)', 'A balbale to Inana for Šu-Suen (Šu-Suen B)', 'A balbale to Inana for Šu-Suen (Šu-Suen C)', 'A love song of Išme-Dagan (Išme-Dagan J)']
We see a high frequency of documents belonging to a specific "genre" (like c.2) in each cluster, so perhaps we're onto something by choosing 7 clusters!
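To check that impression, here is a small sketch that tallies the genre prefix (the 'c.N' part of each document ID, assuming the ID scheme shown above) within a single cluster:

from collections import Counter
genre = lambda doc_id: '.'.join(doc_id.split('.')[:2]) #e.g. 'c.2.5.1.2' -> 'c.2'
print(Counter(genre(d) for d in seven_clusters[1] if d != 'NEW'))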
And finally, below I define a method to classify a new text into one of the seven clusters. It does this by finding which of the cluster centers is closest (in terms of Euclidean distance) to the new text vector.
def classify(cluster_centers, new_text):
    euclid_dist = lambda x, y: np.linalg.norm(x - y)
    return min(range(len(cluster_centers)),
               key=lambda i: euclid_dist(cluster_centers[i], new_text))
If we test this function using the cluster centers we just generated with seven clusters, and pass in the document at index 2 of our existing corpus as a "new text", we see that our classifier correctly chooses the category k-means had assigned it previously.
classify(cluster_centers, tfidf.iloc[2])
5
It's difficult to visualize the clustering of 4000+ dimensional vectors (each term adds to the dimensionality of the vectors).
Here we attempt to use feature selection - trying to pick out two or three of the most "significant" features (where features are terms in this case) and plot those features onto 2D or 3D graphs. Below we try two different selection criteria to determine which terms to plot.
The first selector picks the terms with the highest total tf-idf values, summed across the cluster centers we choose to plot.
# gives the features with the largest magnitude (summed across given cluster centers)
def largest_selector(clusters, num_features):
size = []
for feat in range(len(clusters[0])):
size.append((feat, sum([c[feat] for c in clusters])))
size = sorted(size, key=lambda t: -t[1])
return size[:num_features]
The second picks the terms whose tf-idf values differ the most between the cluster centers we choose to plot.
# gives the features with the largest difference between clusters
def largest_diff_selector(clusters, num_features):
tot_diff = []
for feat in range(len(clusters[0])):
tot_diff.append((feat, sum([abs(c1[feat] - c2[feat]) for c1 in clusters for c2 in clusters])))
tot_diff = sorted(tot_diff, key=lambda t: -t[1])
return tot_diff[:num_features]
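As an aside, the double loop above compares every pair of clusters for every feature. A vectorized numpy sketch that computes the same totals (equivalent up to tie-breaking in the sort) can be noticeably faster for large vocabularies:

def largest_diff_selector_vec(clusters, num_features):
    c = np.asarray(clusters) #shape (num_clusters, num_features)
    diffs = np.abs(c[:, None, :] - c[None, :, :]).sum(axis=(0, 1)) #total pairwise difference per feature
    top = np.argsort(-diffs)[:num_features]
    return [(int(f), float(diffs[f])) for f in top]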
Below I've defined two functions that take in a list of cluster numbers, a selector function (one of the two just defined above), and optionally a new text (a text that is not yet in the corpus and that we want to compare).
Each of these functions will use matplotlib to plot a scatterplot of the clusters selected (in different colors) according to the frequency of the terms that were selected by the selector function. They will also print out the terms that were selected.
Don't worry too much about how these functions work. We'll go through a couple examples below.
Also, just for the sake of consistency, I'll be loading in a set of clusters that was previously generated so I can adequately explain what is going on with the plots, since K-Means will tend to produce a different set of clusters each time.
labels = list(pd.read_pickle('text_to_cluster.pickle')['cluster'])
cluster_centers = pd.read_pickle('cluster_to_center.pickle').values
def plot2d(clusters, selector_func, new_text=None):
    selected_features = selector_func([cluster_centers[i] for i in clusters], 2)
    selected_features = [f[0] for f in selected_features]
    selected_dtm = tfidf.iloc[:, selected_features]
    mask = lambda i: [lbl == i for lbl in labels]
    cluster_vectors = [selected_dtm.loc[mask(i)] for i in clusters]
    colors = iter(['b', 'g', 'y', 'c'])
    cluster_numbers = iter(clusters)
    for cluster in cluster_vectors:
        X, Y = list(cluster.iloc[:, 0]), list(cluster.iloc[:, 1])
        plt.scatter(X, Y, c=next(colors), label="cluster " + str(next(cluster_numbers)), s=20)
    if new_text is not None:
        plt.scatter(new_text[selected_features[0]], new_text[selected_features[1]], c='r', s=40, label="new text")
    plt.legend()
    plt.xlabel(words[selected_features[0]])
    plt.ylabel(words[selected_features[1]])
    plt.show()
    return selected_features
def plot3d(clusters, selector_func, new_text=None):
    selected_features = selector_func([cluster_centers[i] for i in clusters], 3)
    selected_features = [f[0] for f in selected_features]
    selected_dtm = tfidf.iloc[:, selected_features]
    mask = lambda i: [lbl == i for lbl in labels]
    cluster_vectors = [selected_dtm.loc[mask(i)] for i in clusters]
    cluster_numbers = iter(clusters)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for cluster in cluster_vectors:
        X, Y, Z = list(cluster.iloc[:, 0]), list(cluster.iloc[:, 1]), list(cluster.iloc[:, 2])
        ax.scatter(X, Y, Z, label="cluster " + str(next(cluster_numbers)), s=10)
    if new_text is not None:
        X, Y, Z = new_text[selected_features[0]], new_text[selected_features[1]], new_text[selected_features[2]]
        ax.scatter(X, Y, Z, label="new text", c='red', s=40)
    plt.legend()
    plt.xlabel(words[selected_features[0]])
    plt.ylabel(words[selected_features[1]])
    ax.set_zlabel(words[selected_features[2]])
    plt.draw()
    plt.show()
    return selected_features
Let's try using these functions now. First, 2D.
Below I pass in [2, 5] as clusters (meaning we want to visualize the documents in clusters 2 and 5). We use the largest_selector function to pick out two features.
selected = plot2d([2, 5], largest_selector)
We can see that cluster 2 (in blue) has greater variation along both the x-axis and the y-axis (more along the y-axis), i.e. along the terms 'Ninurtak[1]DN' and 'ursa[hero]N', and generally has a higher frequency of 'Ninurtak[1]DN' compared to cluster 5 (in green), which has less variation along each axis.
Let's try passing in the exact same thing, but also add in a "new text". Remember new_text? We defined it near the top of the notebook.
selected = plot2d([2, 5], largest_selector, new_text=new_text)
Our "new text", marked in red, seems to stay around cluster 5, so we might predict that our new text is most similar to cluster 5 documents.
Let's try passing the exact same parameters into the 3D function.
selected = plot3d([2, 5], largest_selector, new_text=new_text)
We see the same results as before, except we can now also see that cluster 2 (in orange) varies along the third term our selector chose, 'mah[great]V/i', whereas cluster 5 and our new text (in red) do not vary much along this third term.
We can also pass more than two clusters into the function. Here I make the same call, but also include cluster 3.
selected = plot3d([2, 5, 3], largest_selector, new_text=new_text)
Here we see that cluster 3 (green), like cluster 5 (orange), doesn't vary much along the x-axis, though it has more variation along the z-axis than either of the other clusters. Our new text still doesn't look like it fits any of these too well.
If you look at labels up above, you can see that the new text is actually classified into cluster 5. One might have guessed this from the plots just by where it sits in relation to the clusters.
Below I write out a couple of other uses of these functions. See if you can explain in words what they are visualizing, and what the differences between the clusters seem to be.
selected = plot3d([1, 5, 6], largest_diff_selector)
selected = plot2d([2, 4, 6], largest_diff_selector)
selected = plot3d([2, 4, 6], largest_diff_selector)
We'll use "agglomerative clustering" here, a variant of hierarchical clustering.
In agglomerative clustering, each document starts as its own cluster. The algorithm looks for the two most similar clusters and pairs them up ("links" them) to form a new cluster, leaving us with one fewer cluster. This repeats until all the documents are part of a single cluster.
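To see this on something small, here is a sketch that runs agglomerative clustering on the toy 2D points from the k-means section (assuming xs and ys are still defined):

toy_agg = AgglomerativeClustering(n_clusters=3).fit(np.column_stack([xs, ys]))
print(toy_agg.labels_) #cluster assignment for each toy point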
dtm_normalized_hierarchical = AgglomerativeClustering(n_clusters=7).fit(tfidf)
We can check the number of "leaves" as a sanity check. This should equal the number of documents.
dtm_normalized_hierarchical.n_leaves_
357
We can also check the labels of each document. Like with k-means, this represents which cluster this method would place each document in.
hierarchical_labels = dtm_normalized_hierarchical.labels_
hierarchical_labels
array([6, 1, 2, 0, 0, 0, 0, 1, 1, 1, 0, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6, 6, 1, 1, 1, 1, 1, 3, 3, 2, 1, 1, 6, 1, 1, 1, 6, 6, 6, 1, 6, 6, 6, 6, 6, 6, 0, 0, 1, 1, 4, 1, 2, 6, 6, 6, 6, 6, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 1, 6, 6, 6, 6, 3, 2, 2, 6, 2, 1, 1, 0, 0, 1, 1, 1, 3, 0, 0, 0, 2, 0, 0, 0, 5, 1, 2, 2, 6, 1, 1, 2, 6, 2, 2, 2, 2, 2, 2, 1, 6, 2, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 6, 0, 0, 2, 2, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 4, 1, 1, 5, 6, 1, 4, 4, 1, 1, 1, 2, 1, 1, 1, 2, 6, 2, 2, 2, 2, 1, 2, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 2, 2, 6, 1, 0, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 3, 3, 3, 3, 1, 2, 6, 2, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 6, 6, 0, 1, 1, 4, 1, 4, 6, 4, 4, 2, 6, 6, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 1, 4, 1, 4, 4], dtype=int64)
Like with k-means, we see that there is a fair amount of variation in the labels.
We can also look at the "children" attribute. This tells us how the documents were linked together.
children = dtm_normalized_hierarchical.children_
children[:10]
array([[116, 243], [ 66, 67], [ 42, 43], [ 3, 4], [134, 138], [ 94, 95], [ 6, 360], [129, 130], [ 29, 30], [ 60, 61]], dtype=int64)
The first pair of the array indicates which two documents were linked together first. For example, if the array looked like what's below:
np.array([[115, 242], [65, 66], [41, 42], [2, 3], [133, 137]])
array([[115, 242], [ 65, 66], [ 41, 42], [ 2, 3], [133, 137]])
This would mean that documents 115 and 242 were linked together first, followed by documents 65 and 66, etc.
Note that this means the children array should have length one less than the total number of documents. Think about why this is.
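We can verify this directly: merging two clusters reduces the cluster count by one, so going from n singleton clusters down to 1 takes n - 1 merges.

assert len(children) == dtm_normalized_hierarchical.n_leaves_ - 1 #356 merges for 357 documents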
Let's draw a dendrogram! Here we're going to use scipy's function.
First we go through the agglomerative process (called "linkage"). Note that scipy's linkage uses single linkage with Euclidean distance by default.
Z = linkage(tfidf)
And now we plot all the documents in a dendrogram. A dendrogram has a tree structure: it starts with a root and splits off into more and more "branches", which keep splitting until we reach "leaves", which in this case are documents. Leaves on the same branch are "similar" to each other, as decided by the linkage function.
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('document index')
plt.ylabel('distance')
dendrogram(
    Z,
    leaf_rotation=90., # rotates the x axis labels
    leaf_font_size=8., # font size for the x axis labels
)
plt.show()
This is way too crowded, since it has all 357 documents. Let's try it on a subset of the documents, say the ones from a specific cluster.
So first we select only the documents from cluster 2.
# select only documents from one of the clusters
mask = [i for i in range(len(hierarchical_labels)) if hierarchical_labels[i] == 2]
dendro_labels = [text_name(i) for i in mask]
selected_hierarchical = tfidf.iloc[mask]
len(selected_hierarchical)
72
Then we go through the agglomerative process.
Z2 = linkage(selected_hierarchical)
And now we plot!
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('document index')
plt.ylabel('distance')
dendrogram(
    Z2,
    leaf_rotation=90., # rotates the x axis labels
    leaf_font_size=8., # font size for the x axis labels
    labels=dendro_labels
)
plt.show()
We can now clearly see (if we go from the bottom up) the order in which the documents were linked together. For example, documents 10 and 11 (near the right) were linked together quite early (meaning they were deemed relatively "similar"); this new "cluster" was then linked with document 7, then 8, then 9.
Multidimensional scaling (MDS) is similar to factor analysis: both reduce the number of variables one has to work with and detect structure and patterns in the relationships between variables. The end goal is to find a small set of dimensions that best reproduces the observed similarities and dissimilarities (distances) between the objects. "In factor analysis, the similarities between objects (e.g., variables) are expressed in the correlation matrix. With MDS, you can analyze any kind of similarity or dissimilarity matrix, in addition to correlation matrices." (More information: http://www.statsoft.com/Textbook/Principal-Components-Factor-Analysis and http://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling)
A simple example of MDS is a map of cities. We can use 2 dimensions to describe the location of the cities. MDS arranges the objects (cities) in a particular dimension (2-D) to demonstrate the observed differences. “As a result, we can "explain" the distances in terms of underlying dimensions; in our example, we could explain the distances in terms of the two geographical dimensions: north/south and east/west.” (More information: http://www.statsoft.com/Textbook/Principal-Components-Factor-Analysis and http://scikit-learn.org/stable/modules/manifold.html#multidimensional-scaling)
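To make this concrete, here is a minimal sketch of MDS recovering a 2-D "map" from pairwise distances alone. The distances are approximate great-circle distances in km between four US cities (rough numbers, for illustration only); up to rotation and reflection, the recovered layout should resemble the actual map.

from sklearn.manifold import MDS
import numpy as np

cities = ['New York', 'Chicago', 'Los Angeles', 'Miami']
dists = np.array([[   0, 1145, 3940, 1755],   #approximate pairwise distances in km
                  [1145,    0, 2800, 1920],
                  [3940, 2800,    0, 3760],
                  [1755, 1920, 3760,    0]])
coords = MDS(n_components=2, dissimilarity='precomputed').fit_transform(dists)
for city, (x, y) in zip(cities, coords):
    print(city, round(x), round(y))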
!pip install datascience
!pip install plotly
#a bunch of import statements for the functions we'll be using
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise
from sklearn.manifold import MDS
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()
file = pd.read_csv(filepath, index_col = 0)
file.head()
words = file.columns
texts = file.index
print(words[:10]) #print out the first 10 words as a sanity check
print(len(words))
dtm = file.values
dtm
len(dtm)
357
Create the distance matrix (dist_matrix), which compares every text to every other text using cosine distance. We convert this matrix to a pandas DataFrame so we can work with the data more easily.
dist_matrix = pairwise.pairwise_distances(dtm, metric='cosine')
pd.DataFrame(dist_matrix, index = file.index, columns = file.index).head()
id_text | NEW | c.0.1.1 | c.0.1.2 | c.0.2.01 | c.0.2.02 | ... | c.6.2.5
---|---|---|---|---|---|---|---
NEW | 0.000000 | 0.962945 | 0.851581 | 0.919699 | 0.874433 | ... | 0.774005
c.0.1.1 | 0.962945 | 0.000000 | 0.969680 | 0.966868 | 0.962363 | ... | 0.954521
c.0.1.2 | 0.851581 | 0.969680 | 0.000000 | 0.784443 | 0.712469 | ... | 0.906900
c.0.2.01 | 0.919699 | 0.966868 | 0.784443 | 0.000000 | 0.246642 | ... | 0.890143
c.0.2.02 | 0.874433 | 0.962363 | 0.712469 | 0.246642 | 0.000000 | ... | 0.883610

5 rows × 357 columns
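A few quick sanity checks on the distance matrix (a sketch): it should be square, symmetric, and have zeros on the diagonal, since every text is at distance 0 from itself.

print(dist_matrix.shape) #(357, 357): one row/column per text
print(np.allclose(dist_matrix, dist_matrix.T)) #symmetric
print(np.allclose(np.diag(dist_matrix), 0)) #zero self-distances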
We create a table of the full titles (taken from the given csv), shortened display titles, and the text each came from. We shorten the titles (display titles) so that when we plot the data the labels are concise and easier to interpret. We keep all three together so that given any one of the attributes we can easily find and access the others.
names = pd.read_csv('Data/idToTextName.csv', index_col=0, header = None)
names.columns = ["Full title"]
names.index.name = "text ref"
names["display title"] = names["Full title"].str.slice(0,23)
names.head()
text ref | Full title | display title
---|---|---
c.4.12.2 | A hymn to Martu (Martu B) | A hymn to Martu (Martu
c.4.12.1 | A šir-gida to Martu (Martu A) | A šir-gida to Martu (Ma
c.4.29.1 | A šir-gida to Nuska (Nuska A) | A šir-gida to Nuska (Nu
c.4.29.2 | A šir-gida to Nuska (Nuska B) | A šir-gida to Nuska (Nu
c.4.08.31 | A balbale to Inana (Dumuzid-Inana E1) | A balbale to Inana (Dum
labels = names["display title"]
fulltitles = names["Full title"]
labels.head()
text ref
c.4.12.2     A hymn to Martu (Martu
c.4.12.1     A šir-gida to Martu (Ma
c.4.29.1     A šir-gida to Nuska (Nu
c.4.29.2     A šir-gida to Nuska (Nu
c.4.08.31    A balbale to Inana (Dum
Name: display title, dtype: object
We use the clusters from the k-means results above.
clusters = [cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6]
Make a dictionary of colors. We will use each color to uniquely label each cluster.
dict_color_cluster = { #define unique color for every cluster
0: '#2c4ff9', #dark blue
1: '#db0f12', #red
2: '#000000', #black
3: '#3FB230', #green
4: '#ff54f9', #pink
5: '#630AFF', #purple
6: '#F5770D', #orange
-1: '#00ffff' #cyan
}
Using the MDS function from sklearn.manifold, we run MDS on the data that we processed and grouped. When we call MDS, we use dissimilarity="precomputed" since we are using the cosine distances we already calculated (in dist_matrix). The output is the set of 2-D points we will use to plot each text.
mds_cluster = MDS(n_components = 2, dissimilarity="precomputed") #use MDS
embeddings_cluster = mds_cluster.fit_transform(dist_matrix) #the points/vectors of the texts obtained by MDS
embeddings_cluster
array([[ 0.17061923, -0.08740414],
       [ 0.78848158, -0.10479215],
       [-0.02287633, -0.62204817],
       ...,
       [ 0.41270353,  0.18257481],
       [ 0.15824761,  0.2210137 ],
       [ 0.22749656,  0.18792507]])
Create a function that, given a text ID, returns the number of the k-means cluster it belongs to. (The new text is relabeled separately below.)
def clusterpicker(label):
    for i in range(len(clusters)):
        if label in clusters[i]:
            return i
Make a table (pandas DataFrame) of all the data we want to plot or use as labels.
plotdata = pd.DataFrame(embeddings_cluster, columns = ["x", "y"], index = file.index)
plotdata["cluster"] = [clusterpicker(i) for i in file.index]
plotdata["title"] = ["???" if i not in labels else labels[i] for i in file.index]
plotdata["fulltitle"] = [str(i) if i not in fulltitles else fulltitles[i] for i in file.index]
plotdata.loc["NEW", "cluster"] = -1
plotdata.loc["NEW", "title"] = "NEW"
plotdata.loc["NEW", "fulltitle"] = "iddindaganAB"
plotdata.head()
x | y | cluster | title | fulltitle | |
---|---|---|---|---|---|
id_text | |||||
NEW | 0.170619 | -0.087404 | -1 | NEW | iddindaganAB |
c.0.1.1 | 0.788482 | -0.104792 | 1 | Ur III catalogue from N | Ur III catalogue from Nibru (N1) |
c.0.1.2 | -0.022876 | -0.622048 | 6 | Ur III catalogue at Yal | Ur III catalogue at Yale (Y1) |
c.0.2.01 | 0.201207 | -0.564707 | 6 | OB catalogue from Nibru | OB catalogue from Nibru (N2) |
c.0.2.02 | -0.283173 | -0.574062 | 6 | OB catalogue in the Lou | OB catalogue in the Louvre (L) |
size_dict = { #font size for each cluster's labels
    0: 11,
    1: 11,
    2: 11,
    3: 11,
    4: 11,
    5: 11,
    6: 11,
    -1: 33 #the new text is drawn much larger so it stands out
}
Plot the MDS results with plotly.
data = [dict(x=df['x'], y=df['y'], name='Cluster ' + str(name), mode='text',
             text=df['title'], hovertext=df['fulltitle'], marker=dict(size=10),
             textfont=dict(
                 family='sans serif',
                 size=size_dict[name],
                 color=dict_color_cluster[name]
             ))
        for name, df in plotdata.groupby('cluster')]
layout= go.Layout(
title= 'MDS of corpus',
hovermode= 'closest',
legend = dict(font = dict(size = 12)),
width = 1200,
height = 1050,
xaxis= dict(
title= 'x',
ticklen= 5,
zeroline= False,
gridwidth= 2,
),
yaxis=dict(
title= 'y',
ticklen= 5,
gridwidth= 2,
),
showlegend= True
)
fig= go.Figure(data=data, layout=layout)
py.iplot(fig)