#!/usr/bin/env python
# coding: utf-8

# Notebook export: loads a document-term matrix, computes pairwise cosine
# distances between texts, projects them to 2-D with MDS, and draws an
# interactive plotly chart in which each text's label is coloured by the
# hard-coded cluster it is listed under below.

# In[20]:

# a bunch of import statements for the functions we'll be using
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise
from sklearn.manifold import MDS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go

py.init_notebook_mode()


# In[21]:

# Rows are indexed by the first CSV column; the remaining columns become
# `words` below.
file = pd.read_csv("Data/onegramfilteredtfidf.csv", index_col=0)
file.head()


# In[22]:

words = file.columns
texts = file.index


# In[23]:

print(words[:10])  # print out the first 10 words as a sanity check
print(len(words))


# In[24]:

# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` yields the
# same ndarray and is available on both old and current pandas versions.
dtm = file.values
dtm


# In[25]:

len(dtm)


# In[26]:

# Square matrix of cosine distances between every pair of rows of `dtm`.
dist_matrix = pairwise.pairwise_distances(dtm, metric='cosine')
pd.DataFrame(dist_matrix, index=file.index, columns=file.index).head()


# In[27]:

# Headerless CSV: first column is used as the index ("text ref"), the single
# remaining column is named "Full title".
names = pd.read_csv('Data/idToTextName.csv', index_col=0, header=None)
names.columns = ["Full title"]
names.index.name = "text ref"
# Shortened title (first 23 characters) used as the on-plot label.
names["display title"] = names["Full title"].str.slice(0, 23)
names.head()


# In[28]:

labels = names["display title"]
fulltitles = names["Full title"]
labels.head()


# In[29]:

# Hard-coded cluster membership: each list holds the text ids assigned to one
# cluster. NOTE(review): presumably pasted from an earlier clustering run --
# the code that produced them is not part of this file.
cluster_0 = [
    'c.1.4.1.3', 'c.2.4.1.7', 'c.2.4.2.14', 'c.2.4.2.26', 'c.2.4.4.1',
    'c.2.4.4.2', 'c.2.4.4.3', 'c.4.07.6', 'c.4.07.7', 'c.4.07.8',
    'c.4.07.9', 'c.4.08.01', 'c.4.08.02', 'c.4.08.03', 'c.4.08.04',
    'c.4.08.05', 'c.4.08.07', 'c.4.08.08', 'c.4.08.18', 'c.4.08.25',
    'c.4.08.26', 'c.4.08.28', 'c.4.08.29', 'c.4.08.30', 'c.4.08.32',
    'c.4.08.33', 'c.4.13.02', 'c.4.22.2', 'c.5.5.5',
]
cluster_1 = [
    'c.0.1.2', 'c.0.2.01', 'c.0.2.02', 'c.0.2.03', 'c.0.2.04',
    'c.0.2.06', 'c.0.2.11', 'c.0.2.13', 'c.1.6.1', 'c.1.6.2',
    'c.1.6.3', 'c.2.4.2.20', 'c.2.4.4.4', 'c.2.5.4.15', 'c.2.5.7.1',
    'c.4.27.01', 'c.4.27.02', 'c.4.27.03', 'c.4.27.04', 'c.4.27.07',
    'c.4.80.2', 'c.5.5.4',
]
cluster_2 = [
    'c.1.3.1', 'c.1.3.2', 'c.1.3.3', 'c.1.3.4', 'c.1.3.5',
    'c.1.4.1', 'c.1.4.4', 'c.1.8.1.2', 'c.1.8.2.3', 'c.2.1.6',
    'c.2.4.2.24', 'c.2.4.2.a', 'c.2.5.1.3', 'c.2.5.3.1', 'c.2.5.4.10',
    'c.2.5.4.11', 'c.2.5.6.4', 'c.2.6.9.8', 'c.2.7.1.1', 'c.2.8.2.6',
    'c.2.8.3.1', 'c.3.3.01', 'c.3.3.10', 'c.4.07.1', 'c.4.07.2',
    'c.4.07.3', 'c.4.07.4', 'c.4.07.5', 'c.4.07.a', 'c.4.08.06',
    'c.4.08.09', 'c.4.08.10', 'c.4.08.16', 'c.4.08.20', 'c.4.08.a',
    'c.4.19.3', 'c.4.33.2',
]
cluster_3 = [
    'c.1.1.3', 'c.1.1.4', 'c.2.1.7', 'c.2.3.1', 'c.2.3.2',
    'c.2.4.1.2', 'c.2.4.1.3', 'c.2.4.1.5', 'c.2.4.1.6', 'c.2.4.1.a',
    'c.2.4.2.01', 'c.2.4.2.03', 'c.2.4.2.04', 'c.2.4.2.05', 'c.2.4.2.07',
    'c.2.4.2.15', 'c.2.4.2.16', 'c.2.4.2.17', 'c.2.4.2.18', 'c.2.4.2.21',
    'c.2.4.2.22', 'c.2.4.2.25', 'c.2.4.4.a', 'c.2.4.5.2', 'c.2.5.1.4',
    'c.2.5.2.1', 'c.2.5.3.2', 'c.2.5.3.4', 'c.2.5.4.01', 'c.2.5.4.02',
    'c.2.5.4.03', 'c.2.5.4.04', 'c.2.5.4.05', 'c.2.5.4.08', 'c.2.5.4.09',
    'c.2.5.4.13', 'c.2.5.4.17', 'c.2.5.4.19', 'c.2.5.4.21', 'c.2.5.4.23',
    'c.2.5.4.24', 'c.2.5.4.27', 'c.2.5.4.29', 'c.2.5.4.b', 'c.2.5.5.1',
    'c.2.5.5.2', 'c.2.5.5.3', 'c.2.5.5.4', 'c.2.5.5.5', 'c.2.5.6.1',
    'c.2.5.6.2', 'c.2.5.6.3', 'c.2.5.6.5', 'c.2.5.6.6', 'c.2.5.7.2',
    'c.2.5.8.1', 'c.2.6.6.1', 'c.2.6.6.5', 'c.2.6.7.1', 'c.2.6.9.1',
    'c.2.6.9.2', 'c.2.6.9.3', 'c.2.6.9.4', 'c.2.6.9.5', 'c.2.6.9.6',
    'c.2.6.9.7', 'c.2.8.2.2', 'c.2.8.2.4', 'c.2.8.3.2', 'c.2.8.3.3',
    'c.2.8.3.5', 'c.2.8.3.6', 'c.2.8.5.1', 'c.3.3.21', 'c.4.01.1',
    'c.4.02.1', 'c.4.03.1', 'c.4.05.1', 'c.4.12.1', 'c.4.12.2',
    'c.4.14.1', 'c.4.15.2', 'c.4.15.3', 'c.4.16.1', 'c.4.19.1',
    'c.4.21.1', 'c.4.22.1', 'c.4.22.4', 'c.4.22.5', 'c.4.22.6',
    'c.4.24.1', 'c.4.25.1', 'c.4.25.2', 'c.4.26.1', 'c.4.27.06',
    'c.4.28.1', 'c.4.29.1', 'c.4.29.2', 'c.4.31.1', 'c.4.33.1',
    'c.4.80.1', 'c.4.80.4', 'c.5.3.7',
]
cluster_4 = [
    'c.0.1.1', 'c.1.7.6', 'c.1.7.7', 'c.2.5.1.2', 'c.3.1.01',
    'c.3.1.02', 'c.3.1.03', 'c.3.1.04', 'c.3.1.05', 'c.3.1.06',
    'c.3.1.06.1', 'c.3.1.07', 'c.3.1.08', 'c.3.1.11', 'c.3.1.11.1',
    'c.3.1.13.2', 'c.3.1.15', 'c.3.1.16', 'c.3.1.17', 'c.3.1.18',
    'c.3.1.19', 'c.3.1.20', 'c.3.1.21', 'c.3.2.02', 'c.3.2.03',
    'c.3.2.04', 'c.3.3.04', 'c.3.3.05', 'c.3.3.08', 'c.3.3.09',
    'c.3.3.11', 'c.4.08.23', 'c.4.08.31', 'c.5.3.1', 'c.5.6.3',
    'c.5.6.5', 'c.6.1.18', 'c.6.1.21',
]
cluster_5 = [
    'c.2.4.2.b', 'c.2.4.4.6', 'c.2.4.5.1', 'c.2.4.5.3', 'c.2.4.5.4',
    'c.2.4.5.5', 'c.2.5.3.3', 'c.2.6.2.1', 'c.2.6.2.a', 'c.2.8.2.3',
    'c.3.3.22', 'c.4.13.01', 'c.4.13.03', 'c.4.13.04', 'c.4.13.05',
    'c.4.13.07', 'c.4.13.08', 'c.4.13.09', 'c.4.13.10', 'c.4.13.11',
    'c.4.13.12', 'c.4.13.13', 'c.4.13.14', 'c.4.13.15', 'c.4.13.a',
    'c.4.13.c', 'c.4.17.1',
]
cluster_6 = [
    'c.0.2.07', 'c.0.2.08', 'c.1.1.1', 'c.1.1.2', 'c.1.2.1',
    'c.1.2.2', 'c.1.4.1.1', 'c.1.4.3', 'c.1.5.1', 'c.1.7.1',
    'c.1.7.3', 'c.1.7.4', 'c.1.7.8', 'c.1.8.1.1', 'c.1.8.1.3',
    'c.1.8.1.4', 'c.1.8.1.5', 'c.1.8.1.5.1', 'c.1.8.2.1', 'c.1.8.2.2',
    'c.1.8.2.4', 'c.2.1.1', 'c.2.1.2', 'c.2.1.3', 'c.2.1.4',
    'c.2.1.5', 'c.2.2.2', 'c.2.2.3', 'c.2.2.4', 'c.2.2.5',
    'c.2.2.6', 'c.2.4.1.1', 'c.2.4.1.4', 'c.2.4.2.02', 'c.2.4.3.1',
    'c.2.5.4.a', 'c.2.8.2.1', 'c.2.8.5.a', 'c.2.8.5.b', 'c.3.2.05',
    'c.3.3.02', 'c.3.3.03', 'c.3.3.39', 'c.4.06.1', 'c.4.08.15',
    'c.4.13.06', 'c.4.14.2', 'c.4.14.3', 'c.4.16.2', 'c.4.19.2',
    'c.4.23.1', 'c.4.32.2', 'c.4.32.e', 'c.4.32.f', 'c.5.1.3',
    'c.5.2.4', 'c.5.2.5', 'c.5.3.2', 'c.5.3.3', 'c.5.3.5',
    'c.5.3.6', 'c.5.4.11', 'c.5.4.12', 'c.5.5.1', 'c.5.5.2',
    'c.5.5.3', 'c.5.5.a', 'c.5.6.1', 'c.5.7.1', 'c.5.7.2',
    'c.5.9.1', 'c.5.9.2', 'c.6.1.01', 'c.6.1.02', 'c.6.1.03',
    'c.6.1.04', 'c.6.1.05', 'c.6.1.07', 'c.6.1.08', 'c.6.1.09',
    'c.6.1.10', 'c.6.1.11', 'c.6.1.12', 'c.6.1.13', 'c.6.1.14',
    'c.6.1.15', 'c.6.1.16', 'c.6.1.17', 'c.6.1.19', 'c.6.1.22',
    'c.6.1.23', 'c.6.1.24', 'c.6.1.25', 'c.6.1.26', 'c.6.1.27',
    'c.6.1.28', 'c.6.2.1', 'c.6.2.2', 'c.6.2.3', 'c.6.2.5',
]


# In[30]:

# Position in this list is the cluster number used everywhere below.
clusters = [cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6]


# In[31]:

dict_color_cluster = {  # define unique color for every cluster
    0: '#2c4ff9',  # dark blue
    1: '#db0f12',  # red
    2: '#000000',  # black
    3: '#3FB230',  # green
    4: '#ff54f9',  # pink
    5: '#630AFF',  # purple
    6: '#F5770D',  # orange
}


# In[ ]:


# In[32]:

# use MDS on the precomputed cosine-distance matrix to get 2-D coordinates.
# No random_state is passed, so the layout can differ between runs.
mds_cluster = MDS(n_components=2, dissimilarity="precomputed")
# the points/vectors of the texts obtained by MDS
embeddings_cluster = mds_cluster.fit_transform(dist_matrix)
embeddings_cluster


# In[33]:

def clusterpicker(label):
    """Return the index in ``clusters`` of the first list containing *label*.

    Returns ``None`` when *label* is not listed in any cluster.
    """
    for cluster_number, members in enumerate(clusters):
        if label in members:
            return cluster_number
    return None


# In[34]:

plotdata = pd.DataFrame(embeddings_cluster, columns=["x", "y"], index=file.index)
plotdata["cluster"] = [clusterpicker(text_id) for text_id in file.index]
# Texts that have no entry in the title table fall back to a placeholder
# label and to their own id as hover text.
plotdata["title"] = [labels.get(text_id, "???") for text_id in file.index]
plotdata["fulltitle"] = [fulltitles.get(text_id, str(text_id)) for text_id in file.index]
plotdata.head()


# In[35]:

# One text-mode trace per cluster so every cluster gets its own colour and
# legend entry. groupby() skips rows whose cluster is missing, and a column
# that mixes ints with None is stored as float by pandas, so the group key is
# normalised with int() to keep legend names as "Cluster 0" rather than
# "Cluster 0.0".
data = [
    dict(
        x=df['x'],
        y=df['y'],
        name='Cluster ' + str(int(name)),
        mode='text',
        text=df['title'],
        hovertext=df['fulltitle'],
        marker=dict(size=10),
        textfont=dict(
            family='sans serif',
            size=11,
            color=dict_color_cluster[int(name)],
        ),
    )
    for name, df in plotdata.groupby('cluster')
]

layout = go.Layout(
    title='MDS of corpus',
    hovermode='closest',
    legend=dict(font=dict(size=12)),
    width=1200,
    height=1050,
    xaxis=dict(
        title='x',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
    yaxis=dict(
        title='y',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=True,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)


# In[ ]: