import numpy as np import matplotlib.pyplot as plt import seaborn as sns import pandas as pd # rolling hash Function def get_int(x, base = 128): """ Returns the integer value of a string using a specific base Input: x: a string base: the base of the integer we want to convert it into. Default is 128 """ int_x = 0 n = len(x)-1 # the first digit is the most significant digit # so will be multiplied by len(x)-1 # the next letters are lesser significant for i in range(len(x)): int_x += ord(x[i])*base**n n -= 1 return int_x get_int('day') print(get_int('aaaaa')) ord('z'), ord('A') def rh_get_match(x, y, k): """ Finds all common length-k substrings of x and y using rolling hashing on both strings. Input: - x, y: strings - k: int, length of substring Output: - A list of tuples (i, j) where x[i:i+k] = y[j:j+k] """ base = 128 size = max(len(x)**2 - 1, 10**6 + 3) hash_table = [[] for i in range(size)] if(k > len(x) or k > len(y)): print('Warning: substring length is larger than the string length') return [] # hash function to hash item and store accordingly key = get_int(x[:k]) % size hash_table[key].append((0, x[:k])) new_key = key for i in range(len(x)-k): # adding the next letter new_key = (new_key * base + ord(x[k+i])) % size # removing the first letter new_key = (new_key - ord(x[i])*(base**k % size) ) % size # store the item at corresponding slot hash_table[new_key].append((i+1, x[i+1: k+i+1])) common_string = [] ### checking with the substrings of y # compute the hash value for first substring in y checking_key = get_int(y[:k]) % size # if any tuple exist in the slot, go through all the stored tuple # and check their 2nd element, if match, take their 'x-index' from the # first element and store it with 'y-index'. if(hash_table[checking_key]): for j in range(0, len(hash_table[checking_key])): stored_word = hash_table[checking_key][j] if stored_word[1] == y[:k]: common_string.append((stored_word[0], 0)) # using rolling hashing to find the hash value for all other substring new_key = checking_key for i in range(len(y)-k): # adding the next letter new_key = (new_key * base + ord(y[k+i])) % size # removing the first letter new_key = (new_key - ord(y[i])*(base**k % size) ) % size # if any tuple exist in the slot, go through all the stored tuple # and check their 2nd element, if match, take their 'x-index' from the # first element and store it with 'y-index'. if(hash_table[new_key]): for j in range(0, len(hash_table[new_key])): stored_word = hash_table[new_key][j] if stored_word[1] == y[i+1: k+i+1]: common_string.append((stored_word[0], i+1)) return common_string ## your code here # test 1 x, y, k = 'todayisourfinalday', 'thisisfriday', 3 print(rh_get_match(x,y,k)) # test 2 - same word x, y, k = 'stayhometobesafe', 'stayhometobesafe', 3 print(rh_get_match(x,y,k)) # test 3 - same word same letter x, y, k = 'bbbbbbbb', 'bbbbbbbb', 4 print(rh_get_match(x,y,k)) # test 4 - one string is empty x, y, k = '','bbbbbbbb', 4 rh_get_match(x,y,k) # test 5 - the substring length is higher x, y, k = 'asdfgh','bbbbbbbb', 8 rh_get_match(x,y,k) # reading the words from the shakespeare file txt_file = open("t8.shakespeare.txt", "r") entries = txt_file.read().split(' ') lines = [string.replace('\n', '') for string in entries] all_text = [line for line in lines if line != ''] print(len(all_text)) # all the word in shakespeare # joined all word together for using it as a sting full_text = "".join(x for x in all_text) full_text[100:200] import math def frac(x): ''' returns the fractional value of a float ''' return x - math.floor(x) import time # comparing hash function with the previous one # store the key multiplication_hash = [] divide_hash = [] # storing hashing time hash_time = 0 rolling_hash_time = 0 for words in all_text: # multiplication method size = 2**20 c = (math.sqrt(5) - 1)/2 start = time.time() key = math.floor( size * frac(get_int(words) * c)) end = time.time() hash_time += end - start # increase the time multiplication_hash.append(key) # append the key # multiplication method size = 10**6 + 3 start = time.time() key = get_int(words) % size end = time.time() rolling_hash_time += end - start # increase the time divide_hash.append(key) # append the key sns.distplot(divide_hash) plt.xlabel('Key') plt.ylabel('Frequency') plt.title('Division Method') plt.show() sns.distplot(multiplication_hash) plt.xlabel('Key') plt.ylabel('Frequency') plt.title('Multiplication Method') plt.show() sns.barplot(x = ['Divide', 'Multiplication'], y = [rolling_hash_time/len(all_text), hash_time/len(all_text)]) plt.ylabel('Hashing time') plt.show() def regular_get_match(x, y, k): """ Finds all common length-k substrings of x and y not using rolling hashing on both strings. Input: - x, y: strings - k: int, length of substring Output: - A list of tuples (i, j) where x[i:i+k] = y[j:j+k] """ # defining base, siz, c and hash table base = 128 size = max(len(x)**2, 2**20) c = (math.sqrt(5) - 1)/2 hash_table = [[] for i in range(size)] # hash function to hash the all substring and store acc. to the hash value for i in range(len(x) - k + 1): key = math.floor( size * frac(get_int(x[i: k+i]) * c)) hash_table[key].append((i, x[i: k+i])) common_string = [] ### checking with the substrings of y # using regular hashing to find the hash value for all other substring for i in range(len(y) - k + 1): checking_key = math.floor( size * frac(get_int(y[i: k+i]) * c)) # if any tuple exist in the slot, go through all the stored tuple # and check their 2nd element, if match, take their 'x-index' from the # first element and store it with 'y-index'. if(hash_table[checking_key]): for j in range(0, len(hash_table[checking_key])): stored_word = hash_table[checking_key][j] if stored_word[1] == y[i: k+i]: common_string.append((stored_word[0], i)) return common_string ## your code here # test 1 x, y, k = 'todayisourfinalday', 'thisisfriday', 3 print(regular_get_match(x,y,k)) # test 2 - same word x, y, k = 'stayhometobesafe', 'stayhometobesafe', 3 print(regular_get_match(x,y,k)) # test 3 - same word same letter x, y, k = 'bbbbbbbb', 'bbbbbbbb', 4 print(regular_get_match(x,y,k)) # test 4 - one string is empty x, y, k = '','bbbbbbbb', 4 regular_get_match(x,y,k) # test 5 - the substring length is higher x, y, k = 'asdfgh','bbbbbbbb', 8 regular_get_match(x,y,k) import string import random # random word generator def randomword(length): ''' Return random word of given length ''' return ''.join(random.choice(string.ascii_lowercase) for i in range(length)) # same letter word generator def sameletter(length): ''' Return same letter word of given length ''' letter = random.choice(string.ascii_lowercase) return ''.join(letter for i in range(length)) # Testing Random word x = randomword(10000) y = randomword(100) # print(x,y) k = 3 match = rh_get_match(x,y,k) print(len(match), match ) ## Self generating graphs for experimental complexity # to store the average time rolling = [[] for i in range(5)] regular = [[] for i in range(5)] iteration = 10 n_list = np.array([100,200,300,400,500,600]) k_list = np.array([10,20,30,40,50,60]) for i in n_list: for k in k_list: # to store the total time rolling_hash_time = [0 for i in range(5)] regular_hash_time = [0 for i in range(5)] # 1 - strings from shakespear for _ in range(iteration): x = full_text[:i] y = full_text[-1*i:] # rolling start = time.time() rh_hash = rh_get_match(x,y,k) end = time.time() rolling_hash_time[0] += end - start # regular start = time.time() regular_hash = regular_get_match(x,y,k) end = time.time() regular_hash_time[0] += end - start rolling[0].append(rolling_hash_time[0]/iteration) regular[0].append(regular_hash_time[0]/iteration) # 2 - random different string for _ in range(iteration): x = randomword(i) y = randomword(i) # rolling start = time.time() rh_hash = rh_get_match(x,y,k) end = time.time() rolling_hash_time[1] += end - start # regular start = time.time() regular_hash = regular_get_match(x,y,k) end = time.time() regular_hash_time[1] += end - start rolling[1].append(rolling_hash_time[1]/iteration) regular[1].append(regular_hash_time[1]/iteration) # 3 - same letters but different string for _ in range(iteration): x = sameletter(i) y = sameletter(i) # rolling start = time.time() rh_hash = rh_get_match(x,y,k) end = time.time() rolling_hash_time[2] += end - start # regular start = time.time() regular_hash = regular_get_match(x,y,k) end = time.time() regular_hash_time[2] += end - start rolling[2].append(rolling_hash_time[2]/iteration) regular[2].append(regular_hash_time[2]/iteration) # 4 - random letters but same string for _ in range(iteration): x = randomword(i) y = x # rolling start = time.time() rh_hash = rh_get_match(x,y,k) end = time.time() rolling_hash_time[3] += end - start # regular start = time.time() regular_hash = regular_get_match(x,y,k) end = time.time() regular_hash_time[3] += end - start rolling[3].append(rolling_hash_time[3]/iteration) regular[3].append(regular_hash_time[3]/iteration) # 5 - same letters and same string for _ in range(iteration): x = sameletter(i) y = x # rolling start = time.time() rh_hash = rh_get_match(x,y,k) end = time.time() rolling_hash_time[4] += end - start # regular start = time.time() regular_hash = regular_get_match(x,y,k) end = time.time() regular_hash_time[4] += end - start rolling[4].append(rolling_hash_time[4]/iteration) regular[4].append(regular_hash_time[4]/iteration) print(i,k,'done') n_list = np.array([100,200,300,400,500,600]) k_list = np.array([10,20,30,40,50,60]) data = pd.DataFrame({'n_string':n_list.repeat(len(k_list)), 'n_substring': np.tile(k_list, len(n_list)), 'rolling_hash_shakespeare': rolling[0], 'rolling_hash_random': rolling[1], 'rolling_hash_same_letter': rolling[2], 'rolling_hash_random_letter_same_string': rolling[3], 'rolling_hash_all_same': rolling[4], 'regular_hash_shakespeare': regular[0], 'regular_hash_random': regular[1], 'regular_hash_same_letter': regular[2], 'regular_hash_random_letter_same_string': regular[3], 'regular_hash_all_same': regular[4]}) data.head(10) # Strings in shakespeare # same k, but different n sns.lineplot(x = 'n_string', y = 'rolling_hash_shakespeare', data = data[data['n_substring'] == 40], label = 'rolling', palette = 'Greens') sns.lineplot(x = 'n_string', y = 'regular_hash_shakespeare', data = data[data['n_substring'] == 40], label = 'regular', palette = 'Blues') plt.title('Strings from Shakespeare, k = 40') plt.ylabel('Time (s)') plt.show() # same n, but different k sns.lineplot(x = 'n_substring', y = 'rolling_hash_shakespeare', data = data[data['n_string'] == 400], label = 'rolling', palette = 'Greens') sns.lineplot(x = 'n_substring', y = 'regular_hash_shakespeare', data = data[data['n_string'] == 400], label = 'regular', palette = 'Blues') plt.title('Strings from Shakespeare, n = 400') plt.ylabel('Time (s)') plt.show() # Random Strings # same k, but different n sns.lineplot(x = 'n_string', y = 'rolling_hash_random', data = data[data['n_substring'] == 40], label = 'rolling') sns.lineplot(x = 'n_string', y = 'regular_hash_random', data = data[data['n_substring'] == 40], label = 'regular') plt.title('Random Strings, k = 40') plt.ylabel('Time (s)') plt.show() # same n, but different k sns.lineplot(x = 'n_substring', y = 'rolling_hash_random', data = data[data['n_string'] == 400], label = 'rolling') sns.lineplot(x = 'n_substring', y = 'regular_hash_random', data = data[data['n_string'] == 400], label = 'regular') plt.title('Random Strings, n = 400') plt.ylabel('Time (s)') plt.show() # Same letter different Strings # same k, but different n sns.lineplot(x = 'n_string', y = 'rolling_hash_same_letter', data = data[data['n_substring'] == 40], label = 'rolling') sns.lineplot(x = 'n_string', y = 'regular_hash_same_letter', data = data[data['n_substring'] == 40], label = 'regular') plt.title('Same Letter but Different Strings, k = 40') plt.ylabel('Time (s)') plt.show() # same n, but different k sns.lineplot(x = 'n_substring', y = 'rolling_hash_same_letter', data = data[data['n_string'] == 400], label = 'rolling') sns.lineplot(x = 'n_substring', y = 'regular_hash_same_letter', data = data[data['n_string'] == 400], label = 'regular') plt.title('Same Letter but Different Strings, n = 400') plt.ylabel('Time (s)') plt.show() # Random Letter but Same Strings # same k, but different n sns.lineplot(x = 'n_string', y = 'rolling_hash_random_letter_same_string', data = data[data['n_substring'] == 40], label = 'rolling', palette = 'Greens') sns.lineplot(x = 'n_string', y = 'regular_hash_random_letter_same_string', data = data[data['n_substring'] == 40], label = 'regular') plt.title('Random Letter but Same Strings, k = 40') plt.ylabel('Time (s)') plt.show() # same n, but different k sns.lineplot(x = 'n_substring', y = 'rolling_hash_random_letter_same_string', data = data[data['n_string'] == 400], label = 'rolling', palette = 'Greens') sns.lineplot(x = 'n_substring', y = 'regular_hash_random_letter_same_string', data = data[data['n_string'] == 400], label = 'regular') plt.title('Random Letter but Same Strings, n = 400') plt.ylabel('Time (s)') plt.show() # Same Letter Same Stringss # same k, but different n sns.lineplot(x = 'n_string', y = 'rolling_hash_all_same', data = data[data['n_substring'] == 40], label = 'rolling', palette = 'Greens') sns.lineplot(x = 'n_string', y = 'regular_hash_all_same', data = data[data['n_substring'] == 40], label = 'regular') plt.title('Same Letter Same String, k = 40') plt.ylabel('Time (s)') plt.show() # same n, but different k sns.lineplot(x = 'n_substring', y = 'rolling_hash_all_same', data = data[data['n_string'] == 400], label = 'rolling', palette = 'Greens') sns.lineplot(x = 'n_substring', y = 'regular_hash_all_same', data = data[data['n_string'] == 400], label = 'regular') plt.title('Same Letter Same String, n = 400') plt.ylabel('Time (s)') plt.show() data.to_csv('hashing_data_2.csv')