import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


# rolling hash Function
def get_int(x, base = 128):
    """
    Returns the integer value of a string
    using a specific base
    Input: 
    x: a string
    base: the base of the integer we want to 
    convert it into. Default is 128
    """

    int_x = 0
    n = len(x)-1

    # the first digit is the most significant digit
    # so will be multiplied by len(x)-1
    # the next letters are lesser significant
    for i in range(len(x)):
        int_x += ord(x[i])*base**n
        n -= 1

    return int_x 

get_int('day')

print(get_int('aaaaa'))
ord('z'), ord('A')

def rh_get_match(x, y, k):
    """
    Finds all common length-k substrings of x and y
    using rolling hashing on both strings.
    Input:
    - x, y: strings
    - k: int, length of substring
    Output:
    - A list of tuples (i, j) where x[i:i+k] = y[j:j+k]
    """
    base = 128
    size = max(len(x)**2 - 1, 10**6 + 3)
    hash_table = [[] for i in range(size)]

    if(k > len(x) or k > len(y)):
        print('Warning: substring length is larger than the string length')
        return []
        
    # hash function to hash item and store accordingly
    key = get_int(x[:k]) % size
    hash_table[key].append((0, x[:k]))

    new_key = key
    for i in range(len(x)-k):
        
        # adding the next letter
        new_key = (new_key * base + ord(x[k+i])) % size

        # removing the first letter
        new_key = (new_key - ord(x[i])*(base**k % size) ) % size

        # store the item at corresponding slot
        hash_table[new_key].append((i+1, x[i+1: k+i+1]))

    common_string = []

    ### checking with the substrings of y

    # compute the hash value for first substring in y
    checking_key = get_int(y[:k]) % size
    
    # if any tuple exist in the slot, go through all the stored tuple
    # and check their 2nd element, if match, take their 'x-index' from the
    # first element and store it with 'y-index'.
    if(hash_table[checking_key]):

        for j in range(0, len(hash_table[checking_key])):
            stored_word = hash_table[checking_key][j]
            if stored_word[1] == y[:k]:

                common_string.append((stored_word[0], 0))

    # using rolling hashing to find the hash value for all other substring
    new_key = checking_key
    for i in range(len(y)-k):

        # adding the next letter
        new_key = (new_key * base + ord(y[k+i])) % size

        # removing the first letter
        new_key = (new_key - ord(y[i])*(base**k % size) ) % size


        # if any tuple exist in the slot, go through all the stored tuple
        # and check their 2nd element, if match, take their 'x-index' from the
        # first element and store it with 'y-index'.
        if(hash_table[new_key]):

            for j in range(0, len(hash_table[new_key])):
                stored_word = hash_table[new_key][j]
                if stored_word[1] == y[i+1: k+i+1]:

                    common_string.append((stored_word[0], i+1))

    return common_string
    ## your code here

# test 1
x, y, k = 'todayisourfinalday', 'thisisfriday', 3
print(rh_get_match(x,y,k))


# test 2 - same word
x, y, k = 'stayhometobesafe', 'stayhometobesafe', 3
print(rh_get_match(x,y,k))


# test 3 - same word same letter
x, y, k = 'bbbbbbbb', 'bbbbbbbb', 4
print(rh_get_match(x,y,k))

# test 4 - one string is empty
x, y, k = '','bbbbbbbb', 4
rh_get_match(x,y,k)

# test 5 - the substring length is higher
x, y, k = 'asdfgh','bbbbbbbb', 8
rh_get_match(x,y,k)

# reading the words from the shakespeare file

txt_file = open("t8.shakespeare.txt", "r")
 
entries = txt_file.read().split(' ')
lines = [string.replace('\n', '') for string in entries]
all_text = [line for line in lines if line != '']
print(len(all_text)) # all the word in shakespeare

# joined all word together for using it as a sting
full_text = "".join(x for x in all_text)
full_text[100:200]

import math

def frac(x):
    '''
    returns the fractional value of a float
    '''
    return x - math.floor(x)


import time
# comparing hash function with the previous one

# store the key
multiplication_hash = []
divide_hash = []

# storing hashing time
hash_time = 0
rolling_hash_time = 0

for words in all_text:
    
    # multiplication method
    size = 2**20
    c = (math.sqrt(5) - 1)/2
    start = time.time()
    key = math.floor( size * frac(get_int(words) * c))
    end = time.time()
    hash_time += end - start # increase the time
    multiplication_hash.append(key) # append the key

    # multiplication method
    size = 10**6 + 3
    start = time.time()
    key = get_int(words) % size
    end = time.time()
    rolling_hash_time += end - start # increase the time
    divide_hash.append(key) # append the key
    

sns.distplot(divide_hash)
plt.xlabel('Key')
plt.ylabel('Frequency')
plt.title('Division Method')
plt.show()

sns.distplot(multiplication_hash)
plt.xlabel('Key')
plt.ylabel('Frequency')
plt.title('Multiplication Method') 
plt.show()

sns.barplot(x = ['Divide', 'Multiplication'], y = [rolling_hash_time/len(all_text), hash_time/len(all_text)])
plt.ylabel('Hashing time')
plt.show()

def regular_get_match(x, y, k):
    """
    Finds all common length-k substrings of x and y
    not using rolling hashing on both strings.
    
    Input:
    - x, y: strings
    - k: int, length of substring
    Output:
    - A list of tuples (i, j) where x[i:i+k] = y[j:j+k]
    """

    # defining base, siz, c and hash table
    base = 128
    size = max(len(x)**2, 2**20)
    c = (math.sqrt(5) - 1)/2
    hash_table = [[] for i in range(size)]
        
    # hash function to hash the all substring and store acc. to the hash value
    for i in range(len(x) - k + 1):

        key = math.floor( size * frac(get_int(x[i: k+i]) * c))
        hash_table[key].append((i, x[i: k+i]))

    common_string = []

    
    ### checking with the substrings of y

    # using regular hashing to find the hash value for all other substring
    for i in range(len(y) - k + 1):
        
        checking_key = math.floor( size * frac(get_int(y[i: k+i]) * c))
        
        # if any tuple exist in the slot, go through all the stored tuple
        # and check their 2nd element, if match, take their 'x-index' from the
        # first element and store it with 'y-index'.
        if(hash_table[checking_key]):

            for j in range(0, len(hash_table[checking_key])):
                stored_word = hash_table[checking_key][j]
                if stored_word[1] == y[i: k+i]:
                    common_string.append((stored_word[0], i))

    return common_string
    ## your code here

# test 1
x, y, k = 'todayisourfinalday', 'thisisfriday', 3
print(regular_get_match(x,y,k))


# test 2 - same word
x, y, k = 'stayhometobesafe', 'stayhometobesafe', 3
print(regular_get_match(x,y,k))


# test 3 - same word same letter
x, y, k = 'bbbbbbbb', 'bbbbbbbb', 4
print(regular_get_match(x,y,k))

# test 4 - one string is empty
x, y, k = '','bbbbbbbb', 4
regular_get_match(x,y,k)

# test 5 - the substring length is higher
x, y, k = 'asdfgh','bbbbbbbb', 8
regular_get_match(x,y,k)

import string
import random

# random word generator
def randomword(length):
    '''
    Return random word of given length
    '''
    return ''.join(random.choice(string.ascii_lowercase) for i in range(length))

# same letter word generator
def sameletter(length):
    '''
    Return same letter word of given length
    '''
    letter = random.choice(string.ascii_lowercase)
    return ''.join(letter for i in range(length))

# Testing Random word
x = randomword(10000)
y = randomword(100)
# print(x,y)
k = 3
match = rh_get_match(x,y,k)
print(len(match), match )

## Self generating graphs for experimental complexity

# to store the average time
rolling = [[] for i in range(5)]
regular = [[] for i in range(5)]


iteration = 10
n_list = np.array([100,200,300,400,500,600])
k_list = np.array([10,20,30,40,50,60])

for i in n_list:

    for k in k_list:
        
        # to store the total time
        rolling_hash_time = [0 for i in range(5)]
        regular_hash_time = [0 for i in range(5)]

        # 1 - strings from shakespear
        for _ in range(iteration):
            x = full_text[:i]
            y = full_text[-1*i:]

            # rolling
            start = time.time()
            rh_hash = rh_get_match(x,y,k)
            end = time.time()
            rolling_hash_time[0] += end - start

            # regular
            start = time.time()
            regular_hash = regular_get_match(x,y,k)
            end = time.time()
            regular_hash_time[0] += end - start
        
        rolling[0].append(rolling_hash_time[0]/iteration)
        regular[0].append(regular_hash_time[0]/iteration)


        # 2 - random different string
        for _ in range(iteration):
            x = randomword(i)
            y = randomword(i)

            # rolling
            start = time.time()
            rh_hash = rh_get_match(x,y,k)
            end = time.time()
            rolling_hash_time[1] += end - start

            # regular
            start = time.time()
            regular_hash = regular_get_match(x,y,k)
            end = time.time()
            regular_hash_time[1] += end - start

        rolling[1].append(rolling_hash_time[1]/iteration)
        regular[1].append(regular_hash_time[1]/iteration)


        # 3 - same letters but different string
        for _ in range(iteration):
            x = sameletter(i)
            y = sameletter(i)

            # rolling
            start = time.time()
            rh_hash = rh_get_match(x,y,k)
            end = time.time()
            rolling_hash_time[2] += end - start

            # regular
            start = time.time()
            regular_hash = regular_get_match(x,y,k)
            end = time.time()
            regular_hash_time[2] += end - start

        rolling[2].append(rolling_hash_time[2]/iteration)
        regular[2].append(regular_hash_time[2]/iteration)


        # 4 - random letters but same string
        for _ in range(iteration):
            x = randomword(i)
            y = x

            # rolling
            start = time.time()
            rh_hash = rh_get_match(x,y,k)
            end = time.time()
            rolling_hash_time[3] += end - start

            # regular
            start = time.time()
            regular_hash = regular_get_match(x,y,k)
            end = time.time()
            regular_hash_time[3] += end - start


        rolling[3].append(rolling_hash_time[3]/iteration)
        regular[3].append(regular_hash_time[3]/iteration)

        
        # 5 - same letters and same string
        for _ in range(iteration):
            x = sameletter(i)
            y = x

            # rolling
            start = time.time()
            rh_hash = rh_get_match(x,y,k)
            end = time.time()
            rolling_hash_time[4] += end - start

            # regular
            start = time.time()
            regular_hash = regular_get_match(x,y,k)
            end = time.time()
            regular_hash_time[4] += end - start

        rolling[4].append(rolling_hash_time[4]/iteration)
        regular[4].append(regular_hash_time[4]/iteration)

        print(i,k,'done')

n_list = np.array([100,200,300,400,500,600])
k_list = np.array([10,20,30,40,50,60])

data = pd.DataFrame({'n_string':n_list.repeat(len(k_list)),
                     'n_substring': np.tile(k_list, len(n_list)),
                     'rolling_hash_shakespeare': rolling[0],
                     'rolling_hash_random': rolling[1],
                     'rolling_hash_same_letter': rolling[2],
                     'rolling_hash_random_letter_same_string': rolling[3],
                     'rolling_hash_all_same': rolling[4],
                     'regular_hash_shakespeare': regular[0],
                     'regular_hash_random': regular[1],
                     'regular_hash_same_letter': regular[2],
                     'regular_hash_random_letter_same_string': regular[3],
                     'regular_hash_all_same': regular[4]})

data.head(10)

# Strings in shakespeare 

# same k, but different n
sns.lineplot(x = 'n_string', y = 'rolling_hash_shakespeare', data = data[data['n_substring'] == 40], 
             label = 'rolling', palette = 'Greens')
sns.lineplot(x = 'n_string', y = 'regular_hash_shakespeare', data = data[data['n_substring'] == 40], 
             label = 'regular', palette = 'Blues')

plt.title('Strings from Shakespeare, k = 40')
plt.ylabel('Time (s)')
plt.show()

# same n, but different k
sns.lineplot(x = 'n_substring', y = 'rolling_hash_shakespeare', data = data[data['n_string'] == 400], 
             label = 'rolling', palette = 'Greens')
sns.lineplot(x = 'n_substring', y = 'regular_hash_shakespeare', data = data[data['n_string'] == 400], 
             label = 'regular', palette = 'Blues')

plt.title('Strings from Shakespeare, n = 400')
plt.ylabel('Time (s)')
plt.show()

# Random Strings 

# same k, but different n
sns.lineplot(x = 'n_string', y = 'rolling_hash_random', data = data[data['n_substring'] == 40], 
             label = 'rolling')
sns.lineplot(x = 'n_string', y = 'regular_hash_random', data = data[data['n_substring'] == 40], 
             label = 'regular')

plt.title('Random Strings, k = 40')
plt.ylabel('Time (s)')
plt.show()

# same n, but different k
sns.lineplot(x = 'n_substring', y = 'rolling_hash_random', data = data[data['n_string'] == 400], 
             label = 'rolling')
sns.lineplot(x = 'n_substring', y = 'regular_hash_random', data = data[data['n_string'] == 400], 
             label = 'regular')

plt.title('Random Strings, n = 400')
plt.ylabel('Time (s)')
plt.show()

# Same letter different Strings 

# same k, but different n
sns.lineplot(x = 'n_string', y = 'rolling_hash_same_letter', data = data[data['n_substring'] == 40], 
             label = 'rolling')
sns.lineplot(x = 'n_string', y = 'regular_hash_same_letter', data = data[data['n_substring'] == 40], 
             label = 'regular')

plt.title('Same Letter but Different Strings, k = 40')
plt.ylabel('Time (s)')
plt.show()

# same n, but different k
sns.lineplot(x = 'n_substring', y = 'rolling_hash_same_letter', data = data[data['n_string'] == 400], 
             label = 'rolling')
sns.lineplot(x = 'n_substring', y = 'regular_hash_same_letter', data = data[data['n_string'] == 400], 
             label = 'regular')

plt.title('Same Letter but Different Strings, n = 400')
plt.ylabel('Time (s)')
plt.show()

# Random Letter but Same Strings

# same k, but different n
sns.lineplot(x = 'n_string', y = 'rolling_hash_random_letter_same_string', data = data[data['n_substring'] == 40], 
             label = 'rolling', palette = 'Greens')
sns.lineplot(x = 'n_string', y = 'regular_hash_random_letter_same_string', data = data[data['n_substring'] == 40], 
             label = 'regular')

plt.title('Random Letter but Same Strings, k = 40')
plt.ylabel('Time (s)')
plt.show()

# same n, but different k
sns.lineplot(x = 'n_substring', y = 'rolling_hash_random_letter_same_string', data = data[data['n_string'] == 400], 
             label = 'rolling', palette = 'Greens')
sns.lineplot(x = 'n_substring', y = 'regular_hash_random_letter_same_string', data = data[data['n_string'] == 400], 
             label = 'regular')

plt.title('Random Letter but Same Strings, n = 400')
plt.ylabel('Time (s)')
plt.show()

# Same Letter Same Stringss 

# same k, but different n
sns.lineplot(x = 'n_string', y = 'rolling_hash_all_same', data = data[data['n_substring'] == 40], 
             label = 'rolling', palette = 'Greens')
sns.lineplot(x = 'n_string', y = 'regular_hash_all_same', data = data[data['n_substring'] == 40], 
             label = 'regular')

plt.title('Same Letter Same String, k = 40')
plt.ylabel('Time (s)')
plt.show()

# same n, but different k
sns.lineplot(x = 'n_substring', y = 'rolling_hash_all_same', data = data[data['n_string'] == 400], 
             label = 'rolling', palette = 'Greens')
sns.lineplot(x = 'n_substring', y = 'regular_hash_all_same', data = data[data['n_string'] == 400], 
             label = 'regular')

plt.title('Same Letter Same String, n = 400')
plt.ylabel('Time (s)')
plt.show()

data.to_csv('hashing_data_2.csv')