#a bunch of import statements for the functions we'll be using
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise
from sklearn.manifold import MDS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
# Switch plotly to notebook (inline) rendering before any figure is drawn.
py.init_notebook_mode()

# Load the word-weight table; the first CSV column becomes the row index
# (the rendered preview shows it is named "id_text").
file = pd.read_csv(filepath_or_buffer="Data/onegramfilteredtfidf.csv", index_col=0)

# Preview the first five rows.
file.head(5)
1-kam-ma[1st]NU | 1/2[1/2]NU | 1/3[1/3]NU | 10-kam-ma[10th]NU | 1000[1000]NU | 100[100]NU | 108000[108000]NU | 1080[1080]NU | 108[108]NU | 10[10]NU | ... | šusi[finger]N | šute?[accept]V/t | šutubur[mixture]N | šutug[reed-hut]N | šutum[storehouse]N | šutur[garment]N | šuš[cover]V/t | šu?i[barber]N | šu?u[stone]N | šu?ura[goose]N | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id_text | |||||||||||||||||||||
c.0.1.1 | 0.22871 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
c.0.1.2 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.091941 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
c.0.2.01 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
c.0.2.02 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
c.0.2.03 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 4302 columns
# Row labels of the table are the text ids; column labels are the words.
texts = file.index
words = file.columns

# Sanity check: show the first 10 words, then the vocabulary size.
print(words[0:10])
print(len(words))
Index(['1-kam-ma[1st]NU', '1/2[1/2]NU', '1/3[1/3]NU', '10-kam-ma[10th]NU', '1000[1000]NU', '100[100]NU', '108000[108000]NU', '1080[1080]NU', '108[108]NU', '10[10]NU'], dtype='object') 4302
# Document-term matrix as a plain 2-D NumPy array (one row per text, one
# column per word).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same array and works on both old and new pandas.
dtm = file.values
dtm
array([[ 0.22870994, 0. , 0. , ..., 0. , 0. , 0. ], [ 0. , 0. , 0. , ..., 0. , 0. , 0. ], [ 0. , 0. , 0. , ..., 0. , 0. , 0. ], ..., [ 0. , 0. , 0. , ..., 0. , 0. , 0. ], [ 0. , 0. , 0. , ..., 0. , 0. , 0. ], [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]])
# Number of rows in the document-term matrix, i.e. how many texts were loaded.
dtm.shape[0]
356
# Cosine distance between every pair of rows (texts) of the document-term
# matrix; the result is a square texts-by-texts array.
dist_matrix = pairwise.pairwise_distances(X=dtm, metric="cosine")

# Label both axes with the text ids and preview the first rows.
pd.DataFrame(data=dist_matrix, index=file.index, columns=file.index).head()
id_text | c.0.1.1 | c.0.1.2 | c.0.2.01 | c.0.2.02 | c.0.2.03 | c.0.2.04 | c.0.2.06 | c.0.2.07 | c.0.2.08 | c.0.2.11 | ... | c.6.1.23 | c.6.1.24 | c.6.1.25 | c.6.1.26 | c.6.1.27 | c.6.1.28 | c.6.2.1 | c.6.2.2 | c.6.2.3 | c.6.2.5 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id_text | |||||||||||||||||||||
c.0.1.1 | 0.000000 | 0.969666 | 0.966844 | 0.962367 | 0.997159 | 0.990801 | 0.991764 | 0.982808 | 0.983839 | 1.000000 | ... | 0.980535 | 0.990684 | 0.988002 | 0.941854 | 0.986586 | 0.967027 | 0.962033 | 0.953011 | 0.940927 | 0.954584 |
c.0.1.2 | 0.969666 | 0.000000 | 0.784686 | 0.711962 | 0.885036 | 0.692746 | 0.776205 | 0.983191 | 0.839093 | 0.852215 | ... | 0.924037 | 0.991376 | 0.959502 | 0.920016 | 0.982423 | 0.932179 | 0.914273 | 0.919923 | 0.890635 | 0.906842 |
c.0.2.01 | 0.966844 | 0.784686 | 0.000000 | 0.247072 | 0.624002 | 0.277678 | 0.788966 | 0.981840 | 0.908256 | 0.612501 | ... | 0.924557 | 0.984093 | 0.958457 | 0.942169 | 0.960652 | 0.929098 | 0.888292 | 0.904388 | 0.868083 | 0.890089 |
c.0.2.02 | 0.962367 | 0.711962 | 0.247072 | 0.000000 | 0.701013 | 0.308537 | 0.735413 | 0.990235 | 0.874896 | 0.598098 | ... | 0.909307 | 0.986944 | 0.951509 | 0.950456 | 0.947092 | 0.915664 | 0.891771 | 0.908854 | 0.870437 | 0.883582 |
c.0.2.03 | 0.997159 | 0.885036 | 0.624002 | 0.701013 | 0.000000 | 0.618533 | 0.847563 | 0.994129 | 0.953976 | 0.747425 | ... | 0.977511 | 0.996946 | 0.980530 | 0.946146 | 0.985379 | 0.958533 | 0.951213 | 0.987310 | 0.949105 | 0.954732 |
5 rows × 356 columns
# Lookup table from text id to title. The CSV has no header row; its first
# column is used as the index.
names = pd.read_csv("Data/idToTextName.csv", header=None, index_col=0)
names.index.name = "text ref"
names.columns = ["Full title"]

# Shortened title (first 23 characters) used as the on-plot text label.
names["display title"] = names["Full title"].str[:23]
names.head()
Full title | display title | |
---|---|---|
text ref | ||
c.4.12.2 | A hymn to Martu (Martu B) | A hymn to Martu (Martu |
c.4.12.1 | A šir-gida to Martu (Martu A) | A šir-gida to Martu (Ma |
c.4.29.1 | A šir-gida to Nuska (Nuska A) | A šir-gida to Nuska (Nu |
c.4.29.2 | A šir-gida to Nuska (Nuska B) | A šir-gida to Nuska (Nu |
c.4.08.31 | A balbale to Inana (Dumuzid-Inana E1) | A balbale to Inana (Dum |
# Pull the two title columns out as Series indexed by text id:
# full titles (used for hover text) and truncated titles (drawn on the plot).
fulltitles = names["Full title"]
labels = names["display title"]
labels.head()
text ref c.4.12.2 A hymn to Martu (Martu c.4.12.1 A šir-gida to Martu (Ma c.4.29.1 A šir-gida to Nuska (Nu c.4.29.2 A šir-gida to Nuska (Nu c.4.08.31 A balbale to Inana (Dum Name: display title, dtype: object
# Hard-coded cluster membership: each list holds the text ids assigned to one
# cluster.
# NOTE(review): presumably pasted from an earlier clustering run (KMeans and
# AgglomerativeClustering are imported but never called in the visible code)
# -- confirm provenance before reusing with a different corpus.
cluster_0 = ['c.1.4.1.3', 'c.2.4.1.7', 'c.2.4.2.14', 'c.2.4.2.26', 'c.2.4.4.1', 'c.2.4.4.2', 'c.2.4.4.3', 'c.4.07.6', 'c.4.07.7', 'c.4.07.8', 'c.4.07.9', 'c.4.08.01', 'c.4.08.02', 'c.4.08.03', 'c.4.08.04', 'c.4.08.05', 'c.4.08.07', 'c.4.08.08', 'c.4.08.18', 'c.4.08.25', 'c.4.08.26', 'c.4.08.28', 'c.4.08.29', 'c.4.08.30', 'c.4.08.32', 'c.4.08.33', 'c.4.13.02', 'c.4.22.2', 'c.5.5.5']
cluster_1 = ['c.0.1.2', 'c.0.2.01', 'c.0.2.02', 'c.0.2.03', 'c.0.2.04', 'c.0.2.06', 'c.0.2.11', 'c.0.2.13', 'c.1.6.1', 'c.1.6.2', 'c.1.6.3', 'c.2.4.2.20', 'c.2.4.4.4', 'c.2.5.4.15', 'c.2.5.7.1', 'c.4.27.01', 'c.4.27.02', 'c.4.27.03', 'c.4.27.04', 'c.4.27.07', 'c.4.80.2', 'c.5.5.4']
cluster_2 = ['c.1.3.1', 'c.1.3.2', 'c.1.3.3', 'c.1.3.4', 'c.1.3.5', 'c.1.4.1', 'c.1.4.4', 'c.1.8.1.2', 'c.1.8.2.3', 'c.2.1.6', 'c.2.4.2.24', 'c.2.4.2.a', 'c.2.5.1.3', 'c.2.5.3.1', 'c.2.5.4.10', 'c.2.5.4.11', 'c.2.5.6.4', 'c.2.6.9.8', 'c.2.7.1.1', 'c.2.8.2.6', 'c.2.8.3.1', 'c.3.3.01', 'c.3.3.10', 'c.4.07.1', 'c.4.07.2', 'c.4.07.3', 'c.4.07.4', 'c.4.07.5', 'c.4.07.a', 'c.4.08.06', 'c.4.08.09', 'c.4.08.10', 'c.4.08.16', 'c.4.08.20', 'c.4.08.a', 'c.4.19.3', 'c.4.33.2']
cluster_3 = ['c.1.1.3', 'c.1.1.4', 'c.2.1.7', 'c.2.3.1', 'c.2.3.2', 'c.2.4.1.2', 'c.2.4.1.3', 'c.2.4.1.5', 'c.2.4.1.6', 'c.2.4.1.a', 'c.2.4.2.01', 'c.2.4.2.03', 'c.2.4.2.04', 'c.2.4.2.05', 'c.2.4.2.07', 'c.2.4.2.15', 'c.2.4.2.16', 'c.2.4.2.17', 'c.2.4.2.18', 'c.2.4.2.21', 'c.2.4.2.22', 'c.2.4.2.25', 'c.2.4.4.a', 'c.2.4.5.2', 'c.2.5.1.4', 'c.2.5.2.1', 'c.2.5.3.2', 'c.2.5.3.4', 'c.2.5.4.01', 'c.2.5.4.02', 'c.2.5.4.03', 'c.2.5.4.04', 'c.2.5.4.05', 'c.2.5.4.08', 'c.2.5.4.09', 'c.2.5.4.13', 'c.2.5.4.17', 'c.2.5.4.19', 'c.2.5.4.21', 'c.2.5.4.23', 'c.2.5.4.24', 'c.2.5.4.27', 'c.2.5.4.29', 'c.2.5.4.b', 'c.2.5.5.1', 'c.2.5.5.2', 'c.2.5.5.3', 'c.2.5.5.4', 'c.2.5.5.5', 'c.2.5.6.1', 'c.2.5.6.2', 'c.2.5.6.3', 'c.2.5.6.5', 'c.2.5.6.6', 'c.2.5.7.2', 'c.2.5.8.1', 'c.2.6.6.1', 'c.2.6.6.5', 'c.2.6.7.1', 'c.2.6.9.1', 'c.2.6.9.2', 'c.2.6.9.3', 'c.2.6.9.4', 'c.2.6.9.5', 'c.2.6.9.6', 'c.2.6.9.7', 'c.2.8.2.2', 'c.2.8.2.4', 'c.2.8.3.2', 'c.2.8.3.3', 'c.2.8.3.5', 'c.2.8.3.6', 'c.2.8.5.1', 'c.3.3.21', 'c.4.01.1', 'c.4.02.1', 'c.4.03.1', 'c.4.05.1', 'c.4.12.1', 'c.4.12.2', 'c.4.14.1', 'c.4.15.2', 'c.4.15.3', 'c.4.16.1', 'c.4.19.1', 'c.4.21.1', 'c.4.22.1', 'c.4.22.4', 'c.4.22.5', 'c.4.22.6', 'c.4.24.1', 'c.4.25.1', 'c.4.25.2', 'c.4.26.1', 'c.4.27.06', 'c.4.28.1', 'c.4.29.1', 'c.4.29.2', 'c.4.31.1', 'c.4.33.1', 'c.4.80.1', 'c.4.80.4', 'c.5.3.7']
cluster_4 = ['c.0.1.1', 'c.1.7.6', 'c.1.7.7', 'c.2.5.1.2', 'c.3.1.01', 'c.3.1.02', 'c.3.1.03', 'c.3.1.04', 'c.3.1.05', 'c.3.1.06', 'c.3.1.06.1', 'c.3.1.07', 'c.3.1.08', 'c.3.1.11', 'c.3.1.11.1', 'c.3.1.13.2', 'c.3.1.15', 'c.3.1.16', 'c.3.1.17', 'c.3.1.18', 'c.3.1.19', 'c.3.1.20', 'c.3.1.21', 'c.3.2.02', 'c.3.2.03', 'c.3.2.04', 'c.3.3.04', 'c.3.3.05', 'c.3.3.08', 'c.3.3.09', 'c.3.3.11', 'c.4.08.23', 'c.4.08.31', 'c.5.3.1', 'c.5.6.3', 'c.5.6.5', 'c.6.1.18', 'c.6.1.21']
cluster_5 = ['c.2.4.2.b', 'c.2.4.4.6', 'c.2.4.5.1', 'c.2.4.5.3', 'c.2.4.5.4', 'c.2.4.5.5', 'c.2.5.3.3', 'c.2.6.2.1', 'c.2.6.2.a', 'c.2.8.2.3', 'c.3.3.22', 'c.4.13.01', 'c.4.13.03', 'c.4.13.04', 'c.4.13.05', 'c.4.13.07', 'c.4.13.08', 'c.4.13.09', 'c.4.13.10', 'c.4.13.11', 'c.4.13.12', 'c.4.13.13', 'c.4.13.14', 'c.4.13.15', 'c.4.13.a', 'c.4.13.c', 'c.4.17.1']
cluster_6 = ['c.0.2.07', 'c.0.2.08', 'c.1.1.1', 'c.1.1.2', 'c.1.2.1', 'c.1.2.2', 'c.1.4.1.1', 'c.1.4.3', 'c.1.5.1', 'c.1.7.1', 'c.1.7.3', 'c.1.7.4', 'c.1.7.8', 'c.1.8.1.1', 'c.1.8.1.3', 'c.1.8.1.4', 'c.1.8.1.5', 'c.1.8.1.5.1', 'c.1.8.2.1', 'c.1.8.2.2', 'c.1.8.2.4', 'c.2.1.1', 'c.2.1.2', 'c.2.1.3', 'c.2.1.4', 'c.2.1.5', 'c.2.2.2', 'c.2.2.3', 'c.2.2.4', 'c.2.2.5', 'c.2.2.6', 'c.2.4.1.1', 'c.2.4.1.4', 'c.2.4.2.02', 'c.2.4.3.1', 'c.2.5.4.a', 'c.2.8.2.1', 'c.2.8.5.a', 'c.2.8.5.b', 'c.3.2.05', 'c.3.3.02', 'c.3.3.03', 'c.3.3.39', 'c.4.06.1', 'c.4.08.15', 'c.4.13.06', 'c.4.14.2', 'c.4.14.3', 'c.4.16.2', 'c.4.19.2', 'c.4.23.1', 'c.4.32.2', 'c.4.32.e', 'c.4.32.f', 'c.5.1.3', 'c.5.2.4', 'c.5.2.5', 'c.5.3.2', 'c.5.3.3', 'c.5.3.5', 'c.5.3.6', 'c.5.4.11', 'c.5.4.12', 'c.5.5.1', 'c.5.5.2', 'c.5.5.3', 'c.5.5.a', 'c.5.6.1', 'c.5.7.1', 'c.5.7.2', 'c.5.9.1', 'c.5.9.2', 'c.6.1.01', 'c.6.1.02', 'c.6.1.03', 'c.6.1.04', 'c.6.1.05', 'c.6.1.07', 'c.6.1.08', 'c.6.1.09', 'c.6.1.10', 'c.6.1.11', 'c.6.1.12', 'c.6.1.13', 'c.6.1.14', 'c.6.1.15', 'c.6.1.16', 'c.6.1.17', 'c.6.1.19', 'c.6.1.22', 'c.6.1.23', 'c.6.1.24', 'c.6.1.25', 'c.6.1.26', 'c.6.1.27', 'c.6.1.28', 'c.6.2.1', 'c.6.2.2', 'c.6.2.3', 'c.6.2.5']
# Position in this list is the cluster number: clusterpicker() returns it and
# dict_color_cluster is keyed by it, so the order must not change.
clusters = [cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6]
# Unique colour per cluster: the key is the cluster number (the position in
# the list below), the value is the hex colour used for that cluster's text.
dict_color_cluster = dict(enumerate([
    '#2c4ff9',  # 0: dark blue
    '#db0f12',  # 1: red
    '#000000',  # 2: black
    '#3FB230',  # 3: green
    '#ff54f9',  # 4: pink
    '#630AFF',  # 5: purple
    '#F5770D',  # 6: orange
]))
# Project the texts into 2-D with multidimensional scaling. The input is the
# already-computed cosine distance matrix, hence dissimilarity="precomputed".
# NOTE(review): no random_state is passed, so the layout may differ from run
# to run -- confirm whether reproducible coordinates are wanted.
mds_cluster = MDS(dissimilarity="precomputed", n_components=2)

# One (x, y) row per text, in the same order as the rows of dist_matrix.
embeddings_cluster = mds_cluster.fit_transform(dist_matrix)
embeddings_cluster
array([[ 0.0159024 , -0.79343975], [ 0.30929124, 0.57779201], [ 0.38502259, 0.56296131], [ 0.40198954, 0.52788746], [ 0.5237078 , -0.57985784], [ 0.42168646, 0.59510261], [ 0.61032664, -0.23959547], [ 0.35087112, -0.71002111], [ 0.36747653, -0.58628854], [-0.17233303, 0.73485737], [-0.03194299, 0.64354327], [-0.21973462, -0.34134486], [ 0.16484138, -0.38069597], [ 0.1295834 , -0.02177603], [ 0.29973343, -0.22286513], [-0.26703946, -0.42499587], [ 0.00202544, -0.17056458], [ 0.06407636, -0.34096429], [ 0.03637418, -0.11549216], [-0.14680433, -0.33841375], [-0.11483535, -0.76462862], [ 0.02232799, -0.41741538], [-0.13436922, -0.27608851], [-0.42288589, -0.38810496], [-0.34833897, -0.58175706], [-0.35247011, -0.34482465], [-0.45125173, -0.45660919], [ 0.1562604 , -0.60146158], [ 0.00769502, 0.34031422], [-0.03166923, 0.17379403], [ 0.22606301, -0.32636798], [-0.64048807, 0.14132735], [ 0.19702604, -0.68582824], [-0.26656656, 0.39340804], [-0.37286558, -0.45653958], [-0.42965664, 0.58820506], [-0.16013455, -0.58687078], [ 0.41711268, -0.53365285], [-0.0551854 , -0.39489688], [-0.37877385, 0.11641768], [-0.4012246 , -0.25224865], [-0.28949903, 0.0736792 ], [-0.34023625, 0.16965912], [-0.12213217, -0.11398763], [-0.20024242, -0.21753929], [ 0.01026522, -0.2719257 ], [-0.10900369, -0.30828073], [ 0.18623106, -0.7714655 ], [-0.64626741, 0.32364408], [-0.2858827 , 0.76076364], [ 0.43054259, -0.62415583], [-0.15498218, -0.04325657], [-0.63567362, -0.11677318], [ 0.10562736, -0.14151897], [-0.60813237, -0.07402735], [-0.13121986, 0.04014076], [-0.04455022, -0.07393083], [-0.10433445, 0.02713687], [-0.17023622, 0.09541337], [ 0.61351116, -0.36819711], [ 0.21057894, 0.63561798], [-0.20966559, 0.26287563], [ 0.24005422, 0.58136173], [ 0.01165992, 0.18732558], [ 0.708946 , -0.12619041], [ 0.09696637, 0.69481318], [ 0.37703283, 0.51079642], [-0.02688148, -0.76246659], [-0.01482163, 0.57501709], [-0.0693032 , 0.22303237], [-0.10495666, 0.07067764], [-0.05990728, 0.15334597], [ 
0.02319311, 0.15000112], [-0.04561891, 0.11164679], [ 0.18227212, 0.25977942], [-0.23747124, -0.6858282 ], [ 0.05249442, 0.27591718], [ 0.13548173, 0.21530646], [ 0.14148886, 0.47530833], [ 0.12069679, 0.16634756], [ 0.40932536, 0.48238284], [-0.14221143, 0.69962014], [ 0.15464212, 0.62594345], [ 0.07073827, 0.05047406], [ 0.41883758, -0.20903896], [-0.43981348, -0.63129788], [-0.30833972, 0.67779379], [-0.19405674, 0.61165601], [ 0.56548356, -0.49433896], [-0.63170337, -0.45117536], [-0.57498463, -0.53852763], [-0.68201756, -0.24304268], [-0.08625739, 0.65216174], [-0.58358312, 0.35936939], [-0.10502053, 0.56314443], [ 0.65275014, 0.297226 ], [ 0.68416531, 0.12391 ], [ 0.56031829, 0.37268553], [ 0.47008484, 0.33557047], [ 0.58647079, 0.35123586], [-0.26621941, 0.6310034 ], [ 0.40187776, 0.64199004], [ 0.61441944, -0.31053018], [-0.08630509, 0.70452423], [ 0.17034717, -0.22659731], [ 0.0914024 , 0.58983119], [ 0.23784361, 0.67457865], [ 0.1040828 , 0.41753919], [ 0.06915766, 0.08237281], [ 0.3587259 , 0.0217625 ], [ 0.51165987, 0.24387378], [ 0.50261183, -0.01958242], [ 0.56044181, -0.03430156], [ 0.39516564, 0.11392051], [ 0.36433214, -0.07098651], [ 0.40646056, -0.57086771], [ 0.23300106, -0.1707609 ], [ 0.41903379, 0.04974433], [ 0.50592758, 0.37833117], [ 0.34548508, 0.04905224], [ 0.01486967, 0.43009143], [ 0.74261312, -0.00392054], [ 0.34122598, 0.12846503], [ 0.42823201, 0.10560009], [ 0.64113345, 0.1342978 ], [ 0.66054753, -0.06156853], [-0.02611354, 0.69938593], [ 0.64786359, -0.10335802], [ 0.0678048 , 0.31696398], [ 0.29717976, 0.42372284], [ 0.35824294, 0.30223899], [ 0.09293163, 0.48890979], [ 0.2498393 , 0.60050814], [ 0.26484201, 0.21499869], [ 0.29133424, 0.07905301], [ 0.16458453, 0.40232354], [ 0.40735167, 0.1729623 ], [ 0.33853429, 0.24903076], [ 0.69457781, -0.28117265], [ 0.08171954, 0.55325132], [ 0.27465606, 0.45375528], [ 0.21857997, -0.07399661], [ 0.42155782, 0.37910668], [ 0.22029345, 0.73343 ], [ 0.40011439, -0.33271938], [ 0.28102327, 
0.35579894], [ 0.45841725, 0.30599189], [-0.25695359, 0.55511334], [ 0.44148514, -0.060267 ], [ 0.43286173, 0.23938922], [ 0.69767758, -0.24102138], [ 0.20439458, -0.0473197 ], [ 0.56146522, -0.22158776], [-0.08648801, 0.48885774], [ 0.53184062, -0.27958641], [ 0.28660101, -0.25192144], [ 0.63308327, 0.40162196], [ 0.54110456, 0.50868013], [ 0.55190345, 0.47084031], [ 0.27076683, 0.50828699], [ 0.64328179, -0.00531743], [-0.6028322 , -0.417467 ], [-0.08139197, 0.38248337], [ 0.49953291, 0.17789599], [ 0.1887739 , 0.36720639], [ 0.6891786 , 0.03449731], [ 0.6266213 , 0.34118786], [-0.49773464, 0.45425313], [-0.54593858, -0.3403599 ], [ 0.23617176, -0.56493705], [-0.44162036, 0.33272112], [-0.71301682, 0.24992686], [-0.51699666, 0.37686667], [-0.7944505 , 0.01595367], [-0.33421953, 0.56413842], [-0.78750455, -0.00367117], [-0.50055473, 0.51290171], [-0.46395819, 0.49119277], [-0.554999 , 0.22350675], [-0.63881165, 0.42368213], [-0.42471701, 0.36347783], [-0.57086737, 0.40942273], [-0.49246584, 0.40305327], [-0.37703837, 0.40233876], [-0.45781393, 0.54835839], [-0.5273132 , 0.51261294], [ 0.31726761, -0.51822655], [-0.51436711, 0.11038616], [-0.4346545 , 0.62393105], [-0.77995139, 0.2202165 ], [-0.79751401, -0.06206366], [-0.2286786 , 0.33885876], [ 0.24919604, -0.50701525], [-0.49390118, 0.05022441], [-0.59173284, 0.44686514], [-0.71031042, 0.34007999], [-0.7333389 , 0.30096568], [ 0.2689281 , -0.73135305], [ 0.72501939, -0.05783041], [ 0.38578386, -0.44190171], [ 0.26455214, -0.65076204], [ 0.50866835, -0.37978435], [ 0.5816487 , 0.01320365], [ 0.32030948, -0.40963494], [ 0.57940825, -0.12383067], [ 0.30212888, -0.3411418 ], [ 0.47957115, -0.1326096 ], [ 0.15174622, 0.07525891], [-0.44711379, -0.01510116], [ 0.11701501, 0.67804756], [ 0.15615238, 0.00792065], [ 0.05557892, -0.09300836], [ 0.15430092, -0.28625092], [ 0.61557984, 0.0271489 ], [ 0.03462806, -0.79953489], [-0.31503486, -0.59477099], [-0.39951923, -0.58592799], [-0.02519452, -0.61867532], [-0.21712236, 
0.44393548], [-0.31110655, -0.70218081], [-0.3662696 , -0.65125412], [-0.37719079, -0.53842696], [-0.44388619, -0.56959036], [-0.40794199, 0.68315596], [-0.04150721, 0.74904022], [-0.28924066, -0.66885792], [-0.4842938 , -0.48582114], [-0.19552618, -0.75318672], [ 0.05864438, -0.57037457], [-0.04661991, -0.56628555], [-0.08201007, -0.58405721], [-0.50600133, -0.57939381], [-0.06230321, -0.65491292], [-0.72245933, -0.27898148], [ 0.13346201, -0.676616 ], [-0.62330684, -0.37977166], [-0.31212752, -0.48051545], [-0.17967143, -0.55850008], [-0.54303527, -0.41017272], [-0.6905353 , -0.37138062], [ 0.28315733, -0.74982211], [-0.15921098, -0.62781839], [-0.08687657, -0.6865328 ], [ 0.18393454, 0.14952909], [-0.04852311, 0.50116718], [ 0.39051676, -0.27343251], [-0.13855776, -0.72013232], [ 0.67107934, 0.18561168], [ 0.49722392, -0.49060704], [ 0.5425405 , 0.04503637], [-0.77593234, 0.07946731], [ 0.74205219, 0.11779937], [ 0.59901354, 0.2461528 ], [ 0.71432151, -0.17563763], [ 0.51950529, 0.41978316], [ 0.07656592, -0.66221423], [ 0.35865357, 0.3273884 ], [ 0.75678638, 0.0639605 ], [ 0.19417543, 0.70046158], [ 0.20150902, -0.4887006 ], [ 0.65928308, -0.19237929], [ 0.57199686, 0.46325114], [-0.16426593, 0.16775288], [ 0.78864892, 0.20842443], [-0.63971238, -0.24725948], [-0.11988519, 0.46622929], [ 0.62250244, 0.21438548], [ 0.45372825, -0.32977684], [ 0.19785265, -0.74080743], [ 0.32749004, 0.66383865], [ 0.35771681, -0.18145582], [-0.70955716, -0.14800955], [ 0.0205933 , 0.78031753], [ 0.34464722, 0.62633761], [-0.05946615, 0.30119643], [-0.49439907, -0.63982584], [ 0.2202374 , 0.48625719], [ 0.67593651, -0.33120644], [ 0.52086027, -0.2492247 ], [ 0.57114924, -0.53781039], [ 0.55029728, 0.20651248], [ 0.5068572 , -0.45774276], [-0.08799564, 0.78296519], [ 0.69044801, -0.34115528], [ 0.38365851, 0.44504419], [ 0.23273409, 0.32117236], [ 0.04745089, 0.58652472], [-0.44572122, 0.34502175], [ 0.5573364 , -0.429283 ], [-0.14799385, 0.56624452], [ 0.04152952, 0.01650685], [ 
0.54014157, 0.1482847 ], [ 0.6217876 , 0.28184874], [ 0.32530289, -0.14269532], [-0.35495334, 0.5198239 ], [-0.4441649 , -0.49186805], [-0.28405851, -0.58142653], [ 0.72765182, 0.22198562], [ 0.39441488, -0.60902464], [ 0.32689684, 0.15625942], [-0.01784821, 0.67501883], [ 0.7660609 , -0.17611779], [-0.11903443, -0.15276664], [-0.19569069, -0.04302288], [-0.73743454, 0.1161918 ], [-0.51200826, 0.00882246], [-0.06314756, -0.31508344], [-0.49511664, -0.15479303], [-0.26013573, 0.14665926], [-0.36492647, -0.04718395], [-0.2919236 , 0.58787887], [-0.70460599, 0.14445795], [-0.65606867, 0.10061062], [ 0.09859829, -0.40503976], [-0.29351106, 0.2054532 ], [-0.4229748 , 0.09029988], [-0.58945418, 0.29080863], [-0.24425176, -0.64428932], [-0.58976529, -0.47434024], [-0.33740053, -0.00339974], [-0.42995553, 0.4811834 ], [ 0.08497679, -0.65217401], [-0.62597418, -0.29750471], [-0.64138693, 0.37507483], [ 0.45251394, 0.57950368], [ 0.30563922, -0.55871921], [-0.27737517, -0.19707226], [-0.40765862, -0.15904125], [-0.25170095, -0.12988737], [-0.53823151, -0.09644348], [-0.59192577, -0.15156508], [-0.38047484, -0.27368055], [-0.70285689, -0.07590616], [-0.55845395, -0.21155659], [-0.714004 , 0.01962562], [-0.43851441, -0.19303728], [-0.60977884, -0.04074469], [-0.19889392, -0.45978961], [-0.58568984, 0.06058926], [-0.46393506, 0.14743922], [-0.56214948, 0.01795848], [-0.62366265, 0.17691773], [ 0.08500858, -0.72630186], [-0.52371012, -0.26646178], [-0.5545008 , 0.53120545], [-0.32257183, -0.13779068], [-0.09149162, -0.47083344], [-0.73119695, -0.20068237], [ 0.10017124, -0.52937989], [-0.46859052, -0.09576958], [ 0.01785886, -0.71709412], [-0.24587204, -0.37512591], [-0.35409806, -0.19456518], [-0.42315256, 0.2015134 ], [-0.25760876, -0.07578718], [-0.28928948, -0.05336599]])
def clusterpicker(label, cluster_lists=None):
    """Return the position of the first cluster that contains ``label``.

    Parameters
    ----------
    label
        Identifier to look up (the notebook passes text ids taken from
        ``file.index``).
    cluster_lists
        Optional iterable of containers to search. When omitted, the
        module-level ``clusters`` list is used, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    int or None
        Zero-based position of the first container holding ``label``, or
        ``None`` when no container holds it.
    """
    if cluster_lists is None:
        # Resolved at call time, so the function does not depend on
        # `clusters` existing when it is defined.
        cluster_lists = clusters
    for position, members in enumerate(cluster_lists):
        if label in members:
            return position
    # The original fell off the end here; the None result is now explicit so
    # callers can see that an unassigned label is a possible outcome.
    return None
# Assemble one row per text: 2-D MDS coordinates plus the attributes used
# when drawing (cluster number, short on-plot title, full hover title).
plotdata = pd.DataFrame(embeddings_cluster, index=file.index, columns=["x", "y"])

# Cluster number for each text id, as returned by clusterpicker().
plotdata["cluster"] = list(map(clusterpicker, file.index))

# Short title if the id has one in `labels`, otherwise the "???" placeholder.
plotdata["title"] = [
    labels[text_id] if text_id in labels else "???"
    for text_id in file.index
]

# Full title if the id has one in `fulltitles`, otherwise the id itself.
plotdata["fulltitle"] = [
    fulltitles[text_id] if text_id in fulltitles else str(text_id)
    for text_id in file.index
]
plotdata.head()
x | y | cluster | title | fulltitle | |
---|---|---|---|---|---|
id_text | |||||
c.0.1.1 | 0.015902 | -0.793440 | 4 | Ur III catalogue from N | Ur III catalogue from Nibru (N1) |
c.0.1.2 | 0.309291 | 0.577792 | 1 | Ur III catalogue at Yal | Ur III catalogue at Yale (Y1) |
c.0.2.01 | 0.385023 | 0.562961 | 1 | OB catalogue from Nibru | OB catalogue from Nibru (N2) |
c.0.2.02 | 0.401990 | 0.527887 | 1 | OB catalogue in the Lou | OB catalogue in the Louvre (L) |
c.0.2.03 | 0.523708 | -0.579858 | 1 | OB catalogue from Urim | OB catalogue from Urim (U1) |
# One plotly trace per cluster. Each text is drawn as its short title
# (mode 'text') in the cluster's colour, with the full title shown on hover.
data = [
    {
        "x": members["x"],
        "y": members["y"],
        "name": 'Cluster ' + str(cluster_id),
        "mode": 'text',
        "text": members['title'],
        "hovertext": members['fulltitle'],
        "marker": {"size": 10},
        "textfont": {
            "family": 'sans serif',
            "size": 11,
            "color": dict_color_cluster[cluster_id],
        },
    }
    for cluster_id, members in plotdata.groupby('cluster')
]
# Figure-level settings: fixed canvas size, legend, and axis styling.
layout = go.Layout(
    title='MDS of corpus',
    hovermode='closest',  # hover reports the nearest single point
    legend={"font": {"size": 12}},
    width=1200,
    height=1050,
    xaxis={
        "title": 'x',
        "ticklen": 5,
        "zeroline": False,
        "gridwidth": 2,
    },
    yaxis={
        "title": 'y',
        "ticklen": 5,
        "gridwidth": 2,
    },
    showlegend=True,
)

# Combine the per-cluster traces with the layout and render inline.
fig = go.Figure(layout=layout, data=data)
py.iplot(fig)