#!/usr/bin/env python
# coding: utf-8

# In[20]:


#a bunch of import statements for the functions we'll be using
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise
from sklearn.manifold import MDS

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()


# In[21]:


# Load the table from disk; the first CSV column becomes the row index.
# (Presumably a one-gram tf-idf table, judging by the file name.)
file = pd.read_csv(
    "Data/onegramfilteredtfidf.csv",
    index_col=0,
)
file.head()


# In[22]:


# Column labels are kept as `words`, row labels as `texts`.
words, texts = file.columns, file.index


# In[23]:


# Sanity check: show the first 10 words, then the total number of words.
print(words[:10], len(words), sep="\n")


# In[24]:


# Matrix of the table's values as a plain NumPy array: one row per entry of
# file.index, one column per entry of file.columns.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same array and is available in every pandas version.
dtm = file.values
dtm


# In[25]:


# Number of rows in the matrix.
len(dtm)


# In[26]:


# Square matrix of pairwise cosine distances between the rows of dtm.
dist_matrix = pairwise.pairwise_distances(
    dtm,
    metric='cosine',
)

# Preview the distances with the row labels of `file` on both axes.
pd.DataFrame(
    data=dist_matrix,
    index=file.index,
    columns=file.index,
).head()


# In[27]:


# Lookup table from text id to title. The CSV has no header row; its first
# column is used as the index and the remaining column holds the full title.
names = pd.read_csv(
    'Data/idToTextName.csv',
    header=None,
    index_col=0,
)
names.index.name = "text ref"
names.columns = ["Full title"]
# Shortened title (first 23 characters) used as the on-plot label.
names["display title"] = names["Full title"].str[:23]
names.head()


# In[28]:


# Convenience Series, both indexed by text id.
fulltitles = names["Full title"]
labels = names["display title"]
labels.head()


# In[29]:


# Hard-coded cluster membership: each list holds the text ids assigned to one
# cluster. The ids are compared against the row labels of `file` further down.
# NOTE(review): presumably the saved output of an earlier clustering run; the
# assignment is not computed anywhere in the visible code — confirm the source.
cluster_0 = ['c.1.4.1.3', 'c.2.4.1.7', 'c.2.4.2.14', 'c.2.4.2.26', 'c.2.4.4.1', 'c.2.4.4.2', 'c.2.4.4.3', 'c.4.07.6', 'c.4.07.7', 'c.4.07.8', 'c.4.07.9', 'c.4.08.01', 'c.4.08.02', 'c.4.08.03', 'c.4.08.04', 'c.4.08.05', 'c.4.08.07', 'c.4.08.08', 'c.4.08.18', 'c.4.08.25', 'c.4.08.26', 'c.4.08.28', 'c.4.08.29', 'c.4.08.30', 'c.4.08.32', 'c.4.08.33', 'c.4.13.02', 'c.4.22.2', 'c.5.5.5']
cluster_1 = ['c.0.1.2', 'c.0.2.01', 'c.0.2.02', 'c.0.2.03', 'c.0.2.04', 'c.0.2.06', 'c.0.2.11', 'c.0.2.13', 'c.1.6.1', 'c.1.6.2', 'c.1.6.3', 'c.2.4.2.20', 'c.2.4.4.4', 'c.2.5.4.15', 'c.2.5.7.1', 'c.4.27.01', 'c.4.27.02', 'c.4.27.03', 'c.4.27.04', 'c.4.27.07', 'c.4.80.2', 'c.5.5.4']
cluster_2 = ['c.1.3.1', 'c.1.3.2', 'c.1.3.3', 'c.1.3.4', 'c.1.3.5', 'c.1.4.1', 'c.1.4.4', 'c.1.8.1.2', 'c.1.8.2.3', 'c.2.1.6', 'c.2.4.2.24', 'c.2.4.2.a', 'c.2.5.1.3', 'c.2.5.3.1', 'c.2.5.4.10', 'c.2.5.4.11', 'c.2.5.6.4', 'c.2.6.9.8', 'c.2.7.1.1', 'c.2.8.2.6', 'c.2.8.3.1', 'c.3.3.01', 'c.3.3.10', 'c.4.07.1', 'c.4.07.2', 'c.4.07.3', 'c.4.07.4', 'c.4.07.5', 'c.4.07.a', 'c.4.08.06', 'c.4.08.09', 'c.4.08.10', 'c.4.08.16', 'c.4.08.20', 'c.4.08.a', 'c.4.19.3', 'c.4.33.2']
cluster_3 = ['c.1.1.3', 'c.1.1.4', 'c.2.1.7', 'c.2.3.1', 'c.2.3.2', 'c.2.4.1.2', 'c.2.4.1.3', 'c.2.4.1.5', 'c.2.4.1.6', 'c.2.4.1.a', 'c.2.4.2.01', 'c.2.4.2.03', 'c.2.4.2.04', 'c.2.4.2.05', 'c.2.4.2.07', 'c.2.4.2.15', 'c.2.4.2.16', 'c.2.4.2.17', 'c.2.4.2.18', 'c.2.4.2.21', 'c.2.4.2.22', 'c.2.4.2.25', 'c.2.4.4.a', 'c.2.4.5.2', 'c.2.5.1.4', 'c.2.5.2.1', 'c.2.5.3.2', 'c.2.5.3.4', 'c.2.5.4.01', 'c.2.5.4.02', 'c.2.5.4.03', 'c.2.5.4.04', 'c.2.5.4.05', 'c.2.5.4.08', 'c.2.5.4.09', 'c.2.5.4.13', 'c.2.5.4.17', 'c.2.5.4.19', 'c.2.5.4.21', 'c.2.5.4.23', 'c.2.5.4.24', 'c.2.5.4.27', 'c.2.5.4.29', 'c.2.5.4.b', 'c.2.5.5.1', 'c.2.5.5.2', 'c.2.5.5.3', 'c.2.5.5.4', 'c.2.5.5.5', 'c.2.5.6.1', 'c.2.5.6.2', 'c.2.5.6.3', 'c.2.5.6.5', 'c.2.5.6.6', 'c.2.5.7.2', 'c.2.5.8.1', 'c.2.6.6.1', 'c.2.6.6.5', 'c.2.6.7.1', 'c.2.6.9.1', 'c.2.6.9.2', 'c.2.6.9.3', 'c.2.6.9.4', 'c.2.6.9.5', 'c.2.6.9.6', 'c.2.6.9.7', 'c.2.8.2.2', 'c.2.8.2.4', 'c.2.8.3.2', 'c.2.8.3.3', 'c.2.8.3.5', 'c.2.8.3.6', 'c.2.8.5.1', 'c.3.3.21', 'c.4.01.1', 'c.4.02.1', 'c.4.03.1', 'c.4.05.1', 'c.4.12.1', 'c.4.12.2', 'c.4.14.1', 'c.4.15.2', 'c.4.15.3', 'c.4.16.1', 'c.4.19.1', 'c.4.21.1', 'c.4.22.1', 'c.4.22.4', 'c.4.22.5', 'c.4.22.6', 'c.4.24.1', 'c.4.25.1', 'c.4.25.2', 'c.4.26.1', 'c.4.27.06', 'c.4.28.1', 'c.4.29.1', 'c.4.29.2', 'c.4.31.1', 'c.4.33.1', 'c.4.80.1', 'c.4.80.4', 'c.5.3.7']
cluster_4 = ['c.0.1.1', 'c.1.7.6', 'c.1.7.7', 'c.2.5.1.2', 'c.3.1.01', 'c.3.1.02', 'c.3.1.03', 'c.3.1.04', 'c.3.1.05', 'c.3.1.06', 'c.3.1.06.1', 'c.3.1.07', 'c.3.1.08', 'c.3.1.11', 'c.3.1.11.1', 'c.3.1.13.2', 'c.3.1.15', 'c.3.1.16', 'c.3.1.17', 'c.3.1.18', 'c.3.1.19', 'c.3.1.20', 'c.3.1.21', 'c.3.2.02', 'c.3.2.03', 'c.3.2.04', 'c.3.3.04', 'c.3.3.05', 'c.3.3.08', 'c.3.3.09', 'c.3.3.11', 'c.4.08.23', 'c.4.08.31', 'c.5.3.1', 'c.5.6.3', 'c.5.6.5', 'c.6.1.18', 'c.6.1.21']
cluster_5 = ['c.2.4.2.b', 'c.2.4.4.6', 'c.2.4.5.1', 'c.2.4.5.3', 'c.2.4.5.4', 'c.2.4.5.5', 'c.2.5.3.3', 'c.2.6.2.1', 'c.2.6.2.a', 'c.2.8.2.3', 'c.3.3.22', 'c.4.13.01', 'c.4.13.03', 'c.4.13.04', 'c.4.13.05', 'c.4.13.07', 'c.4.13.08', 'c.4.13.09', 'c.4.13.10', 'c.4.13.11', 'c.4.13.12', 'c.4.13.13', 'c.4.13.14', 'c.4.13.15', 'c.4.13.a', 'c.4.13.c', 'c.4.17.1']
cluster_6 = ['c.0.2.07', 'c.0.2.08', 'c.1.1.1', 'c.1.1.2', 'c.1.2.1', 'c.1.2.2', 'c.1.4.1.1', 'c.1.4.3', 'c.1.5.1', 'c.1.7.1', 'c.1.7.3', 'c.1.7.4', 'c.1.7.8', 'c.1.8.1.1', 'c.1.8.1.3', 'c.1.8.1.4', 'c.1.8.1.5', 'c.1.8.1.5.1', 'c.1.8.2.1', 'c.1.8.2.2', 'c.1.8.2.4', 'c.2.1.1', 'c.2.1.2', 'c.2.1.3', 'c.2.1.4', 'c.2.1.5', 'c.2.2.2', 'c.2.2.3', 'c.2.2.4', 'c.2.2.5', 'c.2.2.6', 'c.2.4.1.1', 'c.2.4.1.4', 'c.2.4.2.02', 'c.2.4.3.1', 'c.2.5.4.a', 'c.2.8.2.1', 'c.2.8.5.a', 'c.2.8.5.b', 'c.3.2.05', 'c.3.3.02', 'c.3.3.03', 'c.3.3.39', 'c.4.06.1', 'c.4.08.15', 'c.4.13.06', 'c.4.14.2', 'c.4.14.3', 'c.4.16.2', 'c.4.19.2', 'c.4.23.1', 'c.4.32.2', 'c.4.32.e', 'c.4.32.f', 'c.5.1.3', 'c.5.2.4', 'c.5.2.5', 'c.5.3.2', 'c.5.3.3', 'c.5.3.5', 'c.5.3.6', 'c.5.4.11', 'c.5.4.12', 'c.5.5.1', 'c.5.5.2', 'c.5.5.3', 'c.5.5.a', 'c.5.6.1', 'c.5.7.1', 'c.5.7.2', 'c.5.9.1', 'c.5.9.2', 'c.6.1.01', 'c.6.1.02', 'c.6.1.03', 'c.6.1.04', 'c.6.1.05', 'c.6.1.07', 'c.6.1.08', 'c.6.1.09', 'c.6.1.10', 'c.6.1.11', 'c.6.1.12', 'c.6.1.13', 'c.6.1.14', 'c.6.1.15', 'c.6.1.16', 'c.6.1.17', 'c.6.1.19', 'c.6.1.22', 'c.6.1.23', 'c.6.1.24', 'c.6.1.25', 'c.6.1.26', 'c.6.1.27', 'c.6.1.28', 'c.6.2.1', 'c.6.2.2', 'c.6.2.3', 'c.6.2.5']


# In[30]:


# All clusters in order; a cluster's number is its position in this list.
clusters = [cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6]


# In[31]:


# One distinct hex colour per cluster number; the position in the list below
# is the cluster number used as the dictionary key.
dict_color_cluster = dict(enumerate([
    '#2c4ff9',  # 0: dark blue
    '#db0f12',  # 1: red
    '#000000',  # 2: black
    '#3FB230',  # 3: green
    '#ff54f9',  # 4: pink
    '#630AFF',  # 5: purple
    '#F5770D',  # 6: orange
]))


# In[ ]:


# In[32]:


# Multidimensional scaling down to 2 components, fed the distance matrix
# directly ("precomputed") rather than raw feature vectors.
# NOTE(review): no random_state is passed, so the layout may differ from run
# to run — confirm whether a fixed seed is wanted.
mds_cluster = MDS(
    n_components=2,
    dissimilarity="precomputed",
)

# One 2-component point per row of dist_matrix.
embeddings_cluster = mds_cluster.fit_transform(dist_matrix)
embeddings_cluster


# In[33]:


def clusterpicker(label):
    """Return the number of the first cluster that contains *label*.

    The module-level ``clusters`` list is searched in order; the position of
    the first member list containing ``label`` is the cluster number.

    Parameters
    ----------
    label
        Text id to look up.

    Returns
    -------
    int or None
        Index into ``clusters`` of the first list containing ``label``, or
        ``None`` when ``label`` is not a member of any cluster.
    """
    for cluster_index, members in enumerate(clusters):
        if label in members:
            return cluster_index
    # Made explicit so callers can see that unassigned ids yield None.
    return None


# In[34]:


# Assemble one row per text: 2-D coordinates plus cluster number and titles.
plotdata = pd.DataFrame(
    data=embeddings_cluster,
    index=file.index,
    columns=["x", "y"],
)
# NOTE(review): clusterpicker returns None for ids found in no cluster —
# confirm every id in file.index is covered by the cluster lists.
plotdata["cluster"] = list(map(clusterpicker, file.index))
# Short display title, with "???" as the placeholder for ids missing from `labels`.
plotdata["title"] = [
    labels[ref] if ref in labels else "???"
    for ref in file.index
]
# Full title, falling back to the id itself when it is missing from `fulltitles`.
plotdata["fulltitle"] = [
    fulltitles[ref] if ref in fulltitles else str(ref)
    for ref in file.index
]
plotdata.head()


# In[35]:


# One text-mode trace per cluster: each text is drawn as its display title at
# its coordinates, coloured by cluster, with the full title as hover text.
data = [
    {
        'x': df['x'],
        'y': df['y'],
        'name': 'Cluster ' + str(name),
        'mode': 'text',
        'text': df['title'],
        'hovertext': df['fulltitle'],
        'marker': {'size': 10},
        'textfont': {
            'family': 'sans serif',
            'size': 11,
            'color': dict_color_cluster[name],
        },
    }
    for name, df in plotdata.groupby('cluster')
]

# Figure-level settings: title, size, legend and the two axes.
layout = go.Layout(
    title='MDS of corpus',
    hovermode='closest',
    legend={'font': {'size': 12}},
    width=1200,
    height=1050,
    xaxis={
        'title': 'x',
        'ticklen': 5,
        'zeroline': False,
        'gridwidth': 2,
    },
    yaxis={
        'title': 'y',
        'ticklen': 5,
        'gridwidth': 2,
    },
    showlegend=True,
)

# Render the figure inline in the notebook.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)


# In[ ]: