#!/usr/bin/env python
# coding: utf-8

# # Cosine Similarity (Version A)
# 
# - Loads embeddings (pre-computed from texts)
# - Computes **mean of embedding**-arrays
# - Computes **cosine-similarity of embedding-means** of two points in time
# - **Results**: Similarities of categories between two points in time
#   - **0.9956** **American films** (2010 .. 2020)
#   - **0.9936** **British films** (2010 .. 2020)
#   - **0.9930** **Indian films** (2010 .. 2020)
# - Results can be used in further experiments

# In[1]:


import numpy
print("numpy:   " + numpy.version.version)

import sklearn
import sklearn.metrics
print("sklearn: " + sklearn.__version__)


# In[2]:


# IDs for loading data below
# Base directory of the embeddings text files
embeddingsDir = "/home/eml4u/EML4U/data/wikipedia-embeddings/"

# One ID pair per category: the "A" IDs carry the 20100408 prefix,
# the "B" IDs the 20201101 prefix (the two points in time compared below)
idAmericanA, idAmericanB = "20100408-american-films", "20201101-american-films"
idBritishA, idBritishB = "20100408-british-films", "20201101-british-films"
idIndianA, idIndianB = "20100408-indian-films", "20201101-indian-films"


# In[3]:


# Load text files containing embeddings
def getTextfile(dir, id, printfile = True):
    """Return the path of the text file belonging to an ID.

    The path is the plain concatenation of dir, id and ".txt". No path
    separator is inserted, so dir has to end with one already.

    dir       -- directory prefix (string)
    id        -- file name without the ".txt" extension (string)
    printfile -- if true, the resulting path is printed
    """
    path = "".join((dir, id, ".txt"))
    if not printfile:
        return path
    print(path)
    return path

# Resolve the file paths pairwise (A, then B) per category.
# Tuple elements are evaluated left to right, so the paths are printed
# in the order American, British, Indian.
fileEmbeddingsAmericanA, fileEmbeddingsAmericanB = (
    getTextfile(embeddingsDir, idAmericanA),
    getTextfile(embeddingsDir, idAmericanB),
)
fileEmbeddingsBritishA, fileEmbeddingsBritishB = (
    getTextfile(embeddingsDir, idBritishA),
    getTextfile(embeddingsDir, idBritishB),
)
fileEmbeddingsIndianA, fileEmbeddingsIndianB = (
    getTextfile(embeddingsDir, idIndianA),
    getTextfile(embeddingsDir, idIndianB),
)


# In[4]:


# Load embeddings
def loadEmbeddings(file, note = "", printinfo = True):
    """Load an embeddings matrix from a text file.

    file      -- file name or file-like object, passed on to numpy.loadtxt
    note      -- label printed after the shape of the loaded array
    printinfo -- if true, shape and note are printed

    Returns a 2-D numpy array with one row per line of the file.
    """
    # ndmin=2 keeps the result two-dimensional. Without it numpy.loadtxt
    # squeezes a file containing a single line (or a single column) to a
    # 1-D array, and the mean over axis 0 computed later in this script
    # would collapse the embedding to a scalar.
    embeddings = numpy.loadtxt(file, ndmin=2)
    if printinfo:
        print(str(embeddings.shape) + " " + note)
    return embeddings

# Read all six embeddings files. Each load stays its own statement, so
# arrays loaded before a failing read remain bound.
embeddingsAmericanA = loadEmbeddings(fileEmbeddingsAmericanA, note="AmericanA")
embeddingsAmericanB = loadEmbeddings(fileEmbeddingsAmericanB, note="AmericanB")
embeddingsBritishA = loadEmbeddings(fileEmbeddingsBritishA, note="BritishA")
embeddingsBritishB = loadEmbeddings(fileEmbeddingsBritishB, note="BritishB")
embeddingsIndianA = loadEmbeddings(fileEmbeddingsIndianA, note="IndianA")
embeddingsIndianB = loadEmbeddings(fileEmbeddingsIndianB, note="IndianB")

# Create test embeddings (A,B: 2 points of time ; 3 entries/texts each ; 5 dimensions)
# Hand-made fixtures: the rows of A differ only in the last two columns,
# the rows of B only in the first two.
embeddingsTestA = numpy.array([
    [-2.2, -1.1, 0, 1.1, 2.2],
    [-2.2, -1.1, 0, 3.3, 4.4],
    [-2.2, -1.1, 0, 5.5, 6.6],
])
embeddingsTestB = numpy.array([
    [-2.2, -1.1, 0, 1.1, 2.2],
    [-5.5, -4.4, 0, 1.1, 2.2],
    [-8.8, -4.4, 0, 1.1, 2.2],
])
print(embeddingsTestA.shape, "TestA")
print(embeddingsTestB.shape, "TestB")


# In[5]:


# Compute means
# https://numpy.org/doc/1.19/reference/generated/numpy.mean.html?highlight=mean#numpy.mean
# https://numpy.org/doc/1.19/reference/generated/numpy.reshape.html?highlight=reshape#numpy.reshape
def getMean(embeddings, note = "", printinfo = True):
    """Average the embeddings along axis 0.

    embeddings -- array(-like) handed to numpy.mean
    note       -- label appended to the printed info line
    printinfo  -- if true, type and shape of the result plus note are printed

    Returns the result of numpy.mean(embeddings, axis=0).
    """
    columnMeans = numpy.mean(embeddings, axis=0)
    if printinfo:
        summary = " ".join((str(type(columnMeans)), str(columnMeans.shape), note))
        print(summary)
    return columnMeans

# Compute means
# One mean vector per embeddings array, computed pairwise (A, then B).
# Tuple elements are evaluated left to right, which keeps the order of
# the info lines printed by getMean.
meanAmericanA, meanAmericanB = (
    getMean(embeddingsAmericanA, "AmericanA"),
    getMean(embeddingsAmericanB, "AmericanB"),
)
meanBritishA, meanBritishB = (
    getMean(embeddingsBritishA, "BritishA"),
    getMean(embeddingsBritishB, "BritishB"),
)
meanIndianA, meanIndianB = (
    getMean(embeddingsIndianA, "IndianA"),
    getMean(embeddingsIndianB, "IndianB"),
)
meanTestA, meanTestB = (
    getMean(embeddingsTestA, "TestA"),
    getMean(embeddingsTestB, "TestB"),
)
print()

# Toggle: show the test inputs next to their means for a manual check
if True:
    print("Testing computation of means")
    print(embeddingsTestA, "embeddingsTestA (input)")
    print(meanTestA, "meanTestA")
    print(embeddingsTestB, "embeddingsTestB (input)")
    print(meanTestB, "meanTestB")
    print()


# In[6]:


# Cosine similarity single entries
def getCosineSimilarity(a, b, indexA, indexB, note = "", printinfo = True):
    """Cosine similarity between entry indexA of a and entry indexB of b.

    a, b           -- arrays holding the entries to compare
    indexA, indexB -- index of the entry taken from a and from b
    note           -- label printed after the similarity
    printinfo      -- if true, similarity and note are printed

    Returns the single similarity value.
    """
    # cosine_similarity expects 2-D input, so each entry becomes a 1 x n matrix
    entryA = a[indexA].reshape(1, -1)
    entryB = b[indexB].reshape(1, -1)
    similarityMatrix = sklearn.metrics.pairwise.cosine_similarity(entryA, entryB, dense_output=True)
    cosSim = similarityMatrix[0][0]
    if not printinfo:
        return cosSim
    print(str(cosSim) + " " + note)
    return cosSim

print("\nTests:\n")

# Toggle: row-wise similarity of the hand-made test embeddings
if True:
    for _row in (0, 1, 2):
        _label = "Test " + str(_row)
        print(_label + " input", "\n", embeddingsTestA[_row], "\n", embeddingsTestB[_row])
        getCosineSimilarity(embeddingsTestA, embeddingsTestB, _row, _row, _label)
    print()

# Toggle: spot checks comparing equally indexed entries of the A and B
# embeddings of every category; a blank line follows each category
if True:
    for _label, _embeddingsA, _embeddingsB in (
        ("American", embeddingsAmericanA, embeddingsAmericanB),
        ("British", embeddingsBritishA, embeddingsBritishB),
        ("Indian", embeddingsIndianA, embeddingsIndianB),
    ):
        for _row in (0, 1, 10, 100, 500, 600):
            getCosineSimilarity(_embeddingsA, _embeddingsB, _row, _row, _label + " " + str(_row))
        print()


# In[7]:


# Cosine similarity
# https://scikit-learn.org/0.23/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html#sklearn.metrics.pairwise.cosine_similarity
# https://numpy.org/doc/1.19/reference/generated/numpy.reshape.html?highlight=reshape#numpy.reshape
def getPairwiseCosineSimilarity(a, b, note = "", printinfo = True):
    """Cosine similarity between the first row of a and the first row of b.

    a, b      -- 2-D arrays handed to sklearn's cosine_similarity
    note      -- label printed after the similarity
    printinfo -- if true, type and shape of both inputs and the result
                 are printed

    Only entry [0][0] of the similarity matrix is returned.
    """
    if printinfo:
        # Both description lines are built first and printed with one call
        print("\n".join(str(type(matrix)) + " " + str(matrix.shape) for matrix in (a, b)))
    similarityMatrix = sklearn.metrics.pairwise.cosine_similarity(a, b, dense_output=True)
    cosSim = similarityMatrix[0][0]
    if printinfo:
        print(str(cosSim) + " " + note)
    return cosSim
    
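# Reference value for the "Test" call below, checked with
# https://scistatcalc.blogspot.com/2015/11/cosine-similarity-calculator.html
# Mean of embeddingsTestA: -2.2, -1.1, 0, 3.3, 4.4
# Mean of embeddingsTestB: -5.5, -3.3, 0, 1.1, 2.2
# Cosine similarity between vectors 1 and 2 is 0.701646 (cosine of 0.793091 radians)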

# Each mean is turned into a 1 x n matrix before it is compared
getPairwiseCosineSimilarity(numpy.reshape(meanTestA, (1, -1)), numpy.reshape(meanTestB, (1, -1)), note="Test")
print(meanTestA.shape)
print(meanAmericanA.shape)

print("\nResults:\n")
getPairwiseCosineSimilarity(numpy.reshape(meanAmericanA, (1, -1)), numpy.reshape(meanAmericanB, (1, -1)), note="American")
getPairwiseCosineSimilarity(numpy.reshape(meanBritishA, (1, -1)), numpy.reshape(meanBritishB, (1, -1)), note="British")
getPairwiseCosineSimilarity(numpy.reshape(meanIndianA, (1, -1)), numpy.reshape(meanIndianB, (1, -1)), note="Indian")


# In[ ]: