#!/usr/bin/env python
# coding: utf-8

# # Chapter 1: Introduction
#
# Let's build a vector for input text, e.g., from `doc1`:

# In[14]:


import math

doc1 = "meeting ... management ... meeting ... management ... meeting "
doc1 += "... management ... meeting ... meeting"

# Two-dimensional term-count vector:
#   vector[0] = occurrences of "management", vector[1] = occurrences of "meeting".
# Splitting on a single space keeps the "..." filler tokens, which are ignored
# because they match neither term.
words = doc1.split(" ")
vector = [words.count("management"), words.count("meeting")]
print(vector)


# Here is how you can calculate *Euclidean distance* between a document and a query:

# In[15]:


query = [1, 1]
doc1 = [3, 5]  # NOTE: `doc1` is rebound from the raw text to its count vector.

# Sum of squared per-dimension differences; its square root is the distance.
sq_length = sum((doc_value - query_value) ** 2
                for doc_value, query_value in zip(doc1, query))
print(math.sqrt(sq_length))


# Finally, let's estimate *cosine similarity*:

# In[16]:


query = [1, 1]
doc1 = [3, 5]


def length(vector):
    """Return the Euclidean length (L2 norm) of *vector*.

    Args:
        vector: A sequence of numbers.

    Returns:
        The square root of the sum of squared components as a float;
        0.0 for an empty sequence.
    """
    return math.sqrt(sum(component ** 2 for component in vector))


def dot_product(vector1, vector2):
    """Return the dot product of two equally sized vectors.

    Args:
        vector1: A sequence of numbers.
        vector2: A sequence of numbers with the same length as *vector1*.

    Returns:
        The sum of the pairwise products of the components (0 for two
        empty sequences).

    Raises:
        ValueError: If the vectors have different lengths. Raising here,
            instead of returning an error string, keeps a dimensionality
            mismatch from surfacing later as an unrelated TypeError when
            the result is used in arithmetic.
    """
    if len(vector1) != len(vector2):
        raise ValueError("Unmatching dimensionality")
    return sum(a * b for a, b in zip(vector1, vector2))


# Cosine similarity: dot product normalised by the product of vector lengths.
cosine = dot_product(query, doc1) / (length(query) * length(doc1))
print(cosine)


# In[ ]: