%pylab inline
import pandas as pd
import numpy as np
from __future__ import division
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import logging
logger = logging.getLogger()
Populating the interactive namespace from numpy and matplotlib
Problem:
The Jaccard Similarity of sets $S$ and $T$: \begin{equation} SIM(S,T) = \frac{| S \cap T |}{| S \cup T |} \end{equation}
1. Similar Documents
In many applications, the documents are not identical, yet they share large portions of their text.
eg: Plagiarism, Mirror Pages, Articles from the Same Source.
2. Collaborative Filtering (Similar Tastes)
eg:
+ Online Purchases
+ Movie Ratings
When data is not binary:
- Ignore low-rated customer/movie pairs
- "liked" VS "hated"
- 1-to-5 stars: put a movie rated $n$ into the set $n$ times <= Jaccard similarity for bags
#Exercise 3.1.1
logger.setLevel(logging.WARNING)
data = [
set([1, 2, 3, 4]),
set([2, 3, 5, 7]),
set([2, 4, 6]),
]
def Jaccard_similarity_calc(set_a, set_b):
    """calculate the Jaccard similarity of two sets
    res = |set_a ∩ set_b| / |set_a ∪ set_b|
    """
    assert isinstance(set_a, set), '{} is not a set'.format(set_a)
    assert isinstance(set_b, set), '{} is not a set'.format(set_b)
    logging.debug('a:{}, b:{}'.format(set_a, set_b))
    logging.debug('inter:{}, union:{}'.format(set_a.intersection(set_b), set_a.union(set_b)))
    return len(set_a.intersection(set_b)) / len(set_a.union(set_b))
for comb in list(itertools.combinations(range(3), 2)):
    set_a, set_b = data[comb[0]], data[comb[1]]
    print('a:{}, b:{}, SIM:{:.3f}'.format(set_a, set_b, Jaccard_similarity_calc(set_a, set_b)))
a:set([1, 2, 3, 4]), b:set([2, 3, 5, 7]), SIM:0.333
a:set([1, 2, 3, 4]), b:set([2, 4, 6]), SIM:0.400
a:set([2, 3, 5, 7]), b:set([2, 4, 6]), SIM:0.167
#Exercise 3.1.2
logger.setLevel(logging.WARNING)
data = [
[1, 1, 1, 2],
[1, 1, 2, 2, 3],
[1, 2, 3, 4],
]
def Jaccard_bag_similarity_calc(a, b):
    """calculate the Jaccard bag similarity of two bags
    see page 76, movie ratings 3.
    intersection = sum over elements of min(count in a, count in b)
    union = sum of the lengths of the two bags
    res = intersection / union
    """
    from collections import Counter
    count_a = Counter(a)
    count_b = Counter(b)
    logging.debug('count_a:{}\n count_b:{}'.format(count_a, count_b))
    inter = [min(count_a[x], count_b[x]) for x in count_a if x in count_b]
    logging.debug('intersection:{}'.format(inter))
    return sum(inter) / (len(a) + len(b))
for comb in list(itertools.combinations(range(3), 2)):
    bag_a, bag_b = data[comb[0]], data[comb[1]]
    print('a:{}, b:{}, JbSIM:{:.3f}'.format(bag_a, bag_b, Jaccard_bag_similarity_calc(bag_a, bag_b)))
a:[1, 1, 1, 2], b:[1, 1, 2, 2, 3], JbSIM:0.333
a:[1, 1, 1, 2], b:[1, 2, 3, 4], JbSIM:0.250
a:[1, 1, 2, 2, 3], b:[1, 2, 3, 4], JbSIM:0.333
#Exercise 3.1.3
Lemma:
let $X_A = I\{A\}$ be the indicator of event $A$; then $E[X_A] = 1 \cdot P[A] + 0 \cdot P[\bar{A}] = P[A]$.
Let $x_i$ indicate the event that $S$ and $T$ share exactly $i$ elements; then:
\begin{align} E[x_i] & = P[\text{$S$ and $T$ share exactly $i$ elements}] \\ & = \frac{C_n^i \, C_{n-i}^{2(m-i)} \, C_{2(m-i)}^{m-i}}{C_n^{m} C_n^{m}} \end{align}
(choose the $i$ shared elements, then the $2(m-i)$ elements that lie in exactly one of the sets, then which $m-i$ of those go to $S$).
When $|S \cap T| = i$ we have $|S \cup T| = 2m - i$, so $J_i = \frac{i}{2m-i}$ and
\begin{align} E[J] & = \sum_{i=0}^m E[x_i] \, J_i \\ & = \frac{\sum_{i=0}^m \, C_n^i \, C_{n-i}^{2(m-i)} \, C_{2(m-i)}^{m-i} \, \frac{i}{2m-i}}{C_n^{m} C_n^{m}} \end{align}
#todo: derive the final closed form
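Not from the book; a quick Monte Carlo sanity check of the expectation above (my own sketch, assuming Python 3.8+ for math.comb): draw random $m$-element subsets of an $n$-element universe many times and compare the empirical mean of $J$ with the formula.
import random
from math import comb

def expected_jaccard(n, m):
    """E[J] from the combinatorial formula above."""
    total = comb(n, m)**2
    return sum(comb(n, i) * comb(n - i, 2*(m - i)) * comb(2*(m - i), m - i)
               * i / (2*m - i) for i in range(1, m + 1)) / total

def simulated_jaccard(n, m, trials=100000):
    """empirical mean Jaccard similarity of two random m-subsets of n elements."""
    acc = 0.0
    for _ in range(trials):
        s = set(random.sample(range(n), m))
        t = set(random.sample(range(n), m))
        acc += len(s & t) / len(s | t)
    return acc / trials

print(expected_jaccard(10, 4))   # exact expectation
print(simulated_jaccard(10, 4))  # should agree to roughly 2-3 decimals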
k-shingle: any substring of length $k$ found within the document.
hashing Shingles: h(shingle) = bucket
logger.setLevel(logging.WARNING)
def to_alphanumeric_string(s):
    """remove any non-alphanumeric character in string s.
    """
    import re
    s = s.lower()
    s = re.sub(r'\W', ' ', s)
    logging.debug('after: del special char: {}'.format(s))
    s = re.sub(r'\s+', ' ', s).strip()
    logging.debug('after: del multiple whitespace: {}'.format(s))
    return s
def k_shingle_extrac(document, k):
    """extract all k-shingles from document.
    """
    document = to_alphanumeric_string(document)
    assert len(document) >= k, 'k should be less than the length of document'
    k_shingles = []
    while len(document) >= k:
        shingle = document[0:k]
        if shingle not in k_shingles:
            k_shingles.append(shingle)
        document = document[1:]
    return k_shingles
print('Example 3.3: {}'.format(k_shingle_extrac('abcdabd', 2)))
print('other example: {}'.format(k_shingle_extrac("Hello, I'm So-So. ", 2)))
Example 3.3: ['ab', 'bc', 'cd', 'da', 'bd']
other example: ['he', 'el', 'll', 'lo', 'o ', ' i', 'i ', ' m', 'm ', ' s', 'so']
#exercise 3.2.1
data = "The most effective way to represent documents as sets, for the purpose of iden- tifying lexically similar documents is to construct from the document the set of short strings that appear within it. "
print('first 10 3-shingles:{}'.format(k_shingle_extrac(data,3)[0:10]))
first 10 3-shingles:['the', 'he ', 'e m', ' mo', 'mos', 'ost', 'st ', 't e', ' ef', 'eff']
#exercise 3.2.2
logger.setLevel(logging.WARNING)
def stopwords_shingles_extract(document, k):
    """extract the stop-word shingles.
    stop-word shingle = a stop word followed by the next k words
    """
    stop_words_list = ['the', 'you', 'to', 'as', 'for', 'of', 'is', 'that', 'it', 'from']
    document = to_alphanumeric_string(document)
    document = document.split()
    logging.debug('split:{}'.format(document))
    shingles = []
    k = k + 1  #len(shingle) = 1 stop word + k words
    while document:
        try:
            logging.debug('check:{}'.format(document[0]))
            if document[0] in stop_words_list:
                shingle = ' '.join(document[0:k])
                logging.debug('hit: {}'.format(shingle))
                if shingle not in shingles:
                    shingles.append(shingle)
        except IndexError:
            logging.debug('Index Error: no of char:{}, k: {}'.format(len(document), k))
            k = len(document)
            continue
        document = document[1:]
    return shingles
print('stop-shingles:{}'.format(stopwords_shingles_extract(data, 2)))
stop-shingles:['the most effective', 'to represent documents', 'as sets for', 'for the purpose', 'the purpose of', 'of iden tifying', 'is to construct', 'to construct from', 'from the document', 'the document the', 'the set of', 'of short strings', 'that appear within', 'it']
#exercise 3.2.3
The number of k-shingles is largest when all extracted shingles are distinct.
Taking the windows from left to right, one per starting position, we get:
the largest number = n - (k - 1) = n - k + 1
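A one-line check of this bound (my own, reusing the k_shingle_extrac function above): in a document of n distinct characters every window is unique, so the count is exactly n - k + 1.
doc = 'abcdefghij'                     # n = 10 distinct characters
print(len(k_shingle_extrac(doc, 3)))   # expected: 10 - 3 + 1 = 8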
#Example 3.6
logger.setLevel(logging.WARNING)
my_database = ['a', 'b', 'c', 'd', 'e']
my_features_dict = {
'S1':['a', 'd'],
'S2':['c'],
'S3':['b', 'd', 'e'],
'S4':['a', 'c', 'd']
}
def matrix_representation_create(database, features_dict):
    """create the matrix representation of one database.
    """
    matrix_ = np.zeros((len(database), len(features_dict)), dtype=int)
    matrix = pd.DataFrame(matrix_, index=database, columns=sorted(features_dict.keys()))
    for feature_name, values in features_dict.items():
        for value in values:
            matrix.loc[value, feature_name] = 1
    return matrix
my_matrix = matrix_representation_create(my_database, my_features_dict)
my_matrix
|   | S1 | S2 | S3 | S4 |
|---|----|----|----|----|
| a | 1  | 0  | 0  | 1  |
| b | 0  | 0  | 1  | 0  |
| c | 0  | 1  | 0  | 1  |
| d | 1  | 0  | 1  | 1  |
| e | 0  | 0  | 1  | 0  |
Goal: sets of shingles are large =====> replace them by much smaller representations called "signatures"
minhash
#Example 3.7
logger.setLevel(logging.WARNING)
def minhash(matrix, row_orders):
    """calculate the minhash value of matrix according to the element permutations in Fig 3.3.
    """
    hash_fun_names = ['h{}'.format(i) for i in range(1, len(row_orders)+1)]
    hash_table = pd.DataFrame(np.zeros((len(row_orders), matrix.shape[1])), index=hash_fun_names, columns=matrix.columns)
    for row_order, hash_fun_name in zip(row_orders, hash_fun_names):
        matrix_p = matrix.loc[row_order, :]
        logging.debug('after permutation: \n{}'.format(matrix_p))
        for c in matrix_p.columns:
            first_one_index = next((i for i, x in enumerate(matrix_p.loc[:, c]) if x), None)
            hash_table.loc[hash_fun_name, c] = row_order[first_one_index]
    return hash_table
minhash(my_matrix, [['b', 'e', 'a', 'd', 'c']])
|    | S1 | S2 | S3 | S4 |
|----|----|----|----|----|
| h1 | a  | c  | b  | a  |
minhash and Jaccard similarity: $P[h(S1) = h(S2)] = SIM(S1, S2)$
Proof: rows can be divided into three classes: type $X$ rows have a 1 in both columns, type $Y$ rows have a 1 in exactly one column, type $Z$ rows have 0 in both.
Randomly permute the rows and look at the first row that is not of type $Z$ (it has a 1 in at least one of the two columns); it determines both hash values, and it is of type $X$ with probability $$ P[h(S1) = h(S2)] = \frac{x}{x + y} = SIM(S1, S2) $$ where $x, y$ count the type $X$ and type $Y$ rows.
minhash signature:
use the vector $[h_1(S), h_2(S), ... , h_n(S)]$ to represent each column $S$ of matrix M
namely, reduce the dimension of each column: rows(M) -----> n
Computing Minhash Signatures
explicitly picking a random permutation of a large number of rows, and then scanning for the first 1, are both time-consuming ==> simulate the permutations with hash functions
logger.setLevel(logging.WARNING)
my_matrix.index = range(my_matrix.shape[0])
print('matrix:\n{}\n'.format(my_matrix))
h_rows = [
[1, 2, 3, 4, 0],
[1, 4, 2, 0, 3]
]
def minhash_by_rows(matrix, row_orders):
    """calculate the minhash value of matrix according to the row permutations in Fig 3.4.
    """
    hash_fun_names = ['h{}'.format(i) for i in range(1, len(row_orders)+1)]
    hash_table = pd.DataFrame(np.zeros((len(row_orders), matrix.shape[1])), index=hash_fun_names, columns=matrix.columns)
    for row_order, hash_fun_name in zip(row_orders, hash_fun_names):
        logging.debug('row_order:{}, h:{}'.format(row_order, hash_fun_name))
        matrix_p = matrix.copy()
        matrix_p.index = row_order  #new row permutation
        matrix_p.sort_index(inplace=True)  #adjust row order
        logging.debug('after permutation: \n{}'.format(matrix_p))
        for c in matrix_p.columns:
            first_one_index = next((i for i, x in enumerate(matrix_p.loc[:, c]) if x), None)
            hash_table.loc[hash_fun_name, c] = first_one_index
    return hash_table
my_minhash_res = minhash_by_rows(my_matrix, h_rows)
print('minhash: 5dim -> 2dim\ntrue hash res:\n{}\n'.format(my_minhash_res))
print('Minhashing')
for comb in itertools.combinations(range(4), 2):
    s_a, s_b = 'S{}'.format(comb[0]+1), 'S{}'.format(comb[1]+1)
    set_a, set_b = set(my_minhash_res.iloc[:, comb[0]]), set(my_minhash_res.iloc[:, comb[1]])
    print('{}-{} minhash:{:.3f} true SIM:{:.3f}'.format(
        s_a, s_b,
        Jaccard_similarity_calc(set_a, set_b),
        Jaccard_similarity_calc(set(my_features_dict[s_a]), set(my_features_dict[s_b]))))
matrix:
   S1  S2  S3  S4
0   1   0   0   1
1   0   0   1   0
2   0   1   0   1
3   1   0   1   1
4   0   0   1   0

minhash: 5dim -> 2dim
true hash res:
    S1  S2  S3  S4
h1   1   3   0   1
h2   0   2   0   0

Minhashing
S1-S2 minhash:0.000 true SIM:0.000
S1-S3 minhash:0.500 true SIM:0.250
S1-S4 minhash:1.000 true SIM:0.667
S2-S3 minhash:0.000 true SIM:0.000
S2-S4 minhash:0.000 true SIM:0.333
S3-S4 minhash:0.500 true SIM:0.200
Randomly permuting all the rows of $M$ to obtain $\bar{M}$ is very time-consuming.
The workaround is not to manipulate $M$ directly, but to work in two steps:
1. generate the permuted row numbers (h_rows) with cheap hash functions;
2. reorder the rows accordingly.
For example, $h_1(x) = (x + 1) \% 5$ and $h_2(x) = (3x + 1) \% 5$ generate the corresponding row numbers:
def add_hash_func(a, b, c):
    return lambda x: (a*x + b) % c
h_funcs = [
add_hash_func(1, 1, 5),
add_hash_func(3, 1, 5)
]
h_rows = []
for h_func in h_funcs:
    h_rows.append(list(map(h_func, range(5))))
print('h_rows:{}'.format(h_rows))
h_rows:[[1, 2, 3, 4, 0], [1, 4, 2, 0, 3]]
But step 2 (the sort) still manipulates the whole matrix; see the statement matrix_p.sort_index(inplace=True) in the function minhash_by_rows.
#Fig 3.4
df_matrix = my_matrix
df_h_rows = pd.DataFrame(np.array(h_rows).T)
df_h_rows.columns = ['h{}'.format(x+1) for x in df_h_rows.columns]
print('Hash functions computed for the matrix:\n{}\n'.format(pd.concat([df_matrix, df_h_rows], axis=1)))
print('signature matrix\n(SIG):\n{}'.format(my_minhash_res))
Hash functions computed for the matrix:
   S1  S2  S3  S4  h1  h2
0   1   0   0   1   1   1
1   0   0   1   0   2   4
2   0   1   0   1   3   2
3   1   0   1   1   4   0
4   0   0   1   0   0   3

signature matrix
(SIG):
    S1  S2  S3  S4
h1   1   3   0   1
h2   0   2   0   0
Let SIG(i, c) be the element of the signature matrix for the i-th hash function and column c.
The alternative method relies on the observation $SIG(i,c) = \min \{\, h_i(r) : S_c[r] = 1 \,\}$:
the value of $h_i$ on column $S_c$ can only be a row number where the column holds a "1", and the first "1" under the permutation is exactly the smallest such row number.
For instance, to obtain SIG(1,1):
$S1 = [1, 0, 0, 1, 0]^T$, and the corresponding $h_1 = [1, 2, 3, 4, 0]^T$;
SIG(1,1) can only take the row numbers where $S1$ is "1", namely 1 or 4;
and since SIG takes the first "1" encountered, SIG(1,1) must be 1.
Summary of the idea:
for each SIG(i, c), walk the "1" positions of $S_c$, read off the corresponding row numbers from $h_i$, and take the smallest one.
This idea can be parallelized in two ways: by rows or by columns.
Traversing by rows should, at a rough guess, require less computation.
#todo: verify the computation cost
#Example 3.8
logger.setLevel(logging.WARNING)
def minhash_signatures_calc(df_M, hash_funcs, nagging=False):
    """compute minhash signatures by the method of Example 3.8.
    """
    logging.debug('data matrix:\n{}\n'.format(df_M))
    h = []
    for hash_func in hash_funcs:
        h.append(list(map(hash_func, range(df_M.shape[0]))))
    df_h = pd.DataFrame(np.array(h).T)
    df_h.columns = ['h{}'.format(x+1) for x in df_h.columns]
    logging.debug('hash matrix:\n{}\n'.format(df_h))
    if nagging:
        print('hash matrix:\n{}\n'.format(pd.concat([df_M, df_h], axis=1)))
    df_signatures = pd.DataFrame(np.ones((df_h.shape[1], df_M.shape[1]))*np.inf, index=df_h.columns, columns=df_M.columns)
    logging.debug('signatures matrix:\ninit\n{}\n'.format(df_signatures))
    for r in df_M.index:
        for c in df_h.columns:
            r_1_loc = df_M.loc[r, :] == 1
            logging.debug('r:{}, c:{}, 1 loc:\n{}\n'.format(r, c, r_1_loc))
            sig_c = df_signatures.loc[c, :].copy()
            line_bigger_loc = sig_c > df_h.loc[r, c]
            logging.debug('bigger row loc:\n{}\n'.format(line_bigger_loc))
            sig_c[line_bigger_loc & r_1_loc] = df_h.loc[r, c]  #row has a 1 and h is smaller
            logging.debug('modified:\n{}\n'.format(sig_c))
            df_signatures.loc[c, :] = sig_c
        if nagging:
            print('row:{},\n signature matrix:\n{}\n'.format(r, df_signatures))
    return df_signatures
minhash_signatures_calc(df_matrix, h_funcs, nagging=True)
hash matrix:
   S1  S2  S3  S4  h1  h2
0   1   0   0   1   1   1
1   0   0   1   0   2   4
2   0   1   0   1   3   2
3   1   0   1   1   4   0
4   0   0   1   0   0   3

row:0,
 signature matrix:
     S1   S2   S3   S4
h1    1  inf  inf    1
h2    1  inf  inf    1

row:1,
 signature matrix:
     S1   S2   S3   S4
h1    1  inf    2    1
h2    1  inf    4    1

row:2,
 signature matrix:
    S1  S2  S3  S4
h1   1   3   2   1
h2   1   2   4   1

row:3,
 signature matrix:
    S1  S2  S3  S4
h1   1   3   2   1
h2   0   2   0   0

row:4,
 signature matrix:
    S1  S2  S3  S4
h1   1   3   0   1
h2   0   2   0   0
|    | S1 | S2 | S3 | S4 |
|----|----|----|----|----|
| h1 | 1  | 3  | 0  | 1  |
| h2 | 0  | 2  | 0  | 0  |
#exercise 3.3.1
#generate 120 permutations
h_rows = list(itertools.permutations(range(5),5))
my_minhash_res = minhash_by_rows(my_matrix, h_rows)
for comb in itertools.combinations(range(4), 2):
    s_a, s_b = 'S{}'.format(comb[0]+1), 'S{}'.format(comb[1]+1)
    #calc the true Jaccard similarity, and the fraction of the 120
    #permutations in which the two columns get the same minhash value
    print('{}-{} true SIM:{:.3f} fraction:{:.3f}'.format(
        s_a, s_b,
        Jaccard_similarity_calc(set(my_features_dict[s_a]), set(my_features_dict[s_b])),
        sum(my_minhash_res.loc[:, s_a] == my_minhash_res.loc[:, s_b]) / 120))
S1-S2 true SIM:0.000 fraction:0.000
S1-S3 true SIM:0.250 fraction:0.250
S1-S4 true SIM:0.667 fraction:0.667
S2-S3 true SIM:0.000 fraction:0.000
S2-S4 true SIM:0.333 fraction:0.333
S3-S4 true SIM:0.200 fraction:0.200
#exercise 3.3.2
h_funcs[2:4] = [
add_hash_func(2, 4, 5),
add_hash_func(3, -1, 5)
]
minhash_signatures_calc(df_matrix, h_funcs, nagging=True)
hash matrix:
   S1  S2  S3  S4  h1  h2  h3  h4
0   1   0   0   1   1   1   4   4
1   0   0   1   0   2   4   1   2
2   0   1   0   1   3   2   3   0
3   1   0   1   1   4   0   0   3
4   0   0   1   0   0   3   2   1

row:0,
 signature matrix:
     S1   S2   S3   S4
h1    1  inf  inf    1
h2    1  inf  inf    1
h3    4  inf  inf    4
h4    4  inf  inf    4

row:1,
 signature matrix:
     S1   S2   S3   S4
h1    1  inf    2    1
h2    1  inf    4    1
h3    4  inf    1    4
h4    4  inf    2    4

row:2,
 signature matrix:
    S1  S2  S3  S4
h1   1   3   2   1
h2   1   2   4   1
h3   4   3   1   3
h4   4   0   2   0

row:3,
 signature matrix:
    S1  S2  S3  S4
h1   1   3   2   1
h2   0   2   0   0
h3   0   3   0   0
h4   3   0   2   0

row:4,
 signature matrix:
    S1  S2  S3  S4
h1   1   3   0   1
h2   0   2   0   0
h3   0   3   0   0
h4   3   0   1   0
|    | S1 | S2 | S3 | S4 |
|----|----|----|----|----|
| h1 | 1  | 3  | 0  | 1  |
| h2 | 0  | 2  | 0  | 0  |
| h3 | 0  | 3  | 0  | 0  |
| h4 | 3  | 0  | 1  | 0  |
#exercise 3.3.3
my_database = range(6)
my_features_dict = {
'S1':[2, 5],
'S2':[0, 1],
'S3':[3, 4],
'S4':[0, 2, 4]
}
df_fig_3_5 = matrix_representation_create(my_database, my_features_dict)
print('Fig 3.5:\n{}\n'.format(df_fig_3_5))
#(a)
h_funcs = [
add_hash_func(2, 1, 6),
add_hash_func(3, 2, 6),
add_hash_func(5, 2, 6)
]
df_matrix = df_fig_3_5
my_minhash_res = minhash_signatures_calc(df_matrix, h_funcs, nagging=True)
my_minhash_res
Fig 3.5:
   S1  S2  S3  S4
0   0   1   0   1
1   0   1   0   0
2   1   0   0   1
3   0   0   1   0
4   0   0   1   1
5   1   0   0   0

hash matrix:
   S1  S2  S3  S4  h1  h2  h3
0   0   1   0   1   1   2   2
1   0   1   0   0   3   5   1
2   1   0   0   1   5   2   0
3   0   0   1   0   1   5   5
4   0   0   1   1   3   2   4
5   1   0   0   0   5   5   3

row:0,
 signature matrix:
     S1   S2   S3   S4
h1  inf    1  inf    1
h2  inf    2  inf    2
h3  inf    2  inf    2

row:1,
 signature matrix:
     S1   S2   S3   S4
h1  inf    1  inf    1
h2  inf    2  inf    2
h3  inf    1  inf    2

row:2,
 signature matrix:
     S1   S2   S3   S4
h1    5    1  inf    1
h2    2    2  inf    2
h3    0    1  inf    0

row:3,
 signature matrix:
     S1   S2   S3   S4
h1    5    1    1    1
h2    2    2    5    2
h3    0    1    5    0

row:4,
 signature matrix:
    S1  S2  S3  S4
h1   5   1   1   1
h2   2   2   2   2
h3   0   1   4   0

row:5,
 signature matrix:
    S1  S2  S3  S4
h1   5   1   1   1
h2   2   2   2   2
h3   0   1   4   0
|    | S1 | S2 | S3 | S4 |
|----|----|----|----|----|
| h1 | 5  | 1  | 1  | 1  |
| h2 | 2  | 2  | 2  | 2  |
| h3 | 0  | 1  | 4  | 0  |
#(b) h_3 is a true permutation.
#(c)
for comb in itertools.combinations(range(4), 2):
    s_a, s_b = 'S{}'.format(comb[0]+1), 'S{}'.format(comb[1]+1)
    set_a, set_b = set(my_minhash_res.iloc[:, comb[0]]), set(my_minhash_res.iloc[:, comb[1]])
    print('{}-{} minhash:{:.3f} true SIM:{:.3f}'.format(
        s_a, s_b,
        Jaccard_similarity_calc(set_a, set_b),
        Jaccard_similarity_calc(set(my_features_dict[s_a]), set(my_features_dict[s_b]))))
S1-S2 minhash:0.250 true SIM:0.000
S1-S3 minhash:0.200 true SIM:0.000
S1-S4 minhash:0.500 true SIM:0.250
S2-S3 minhash:0.667 true SIM:0.000
S2-S4 minhash:0.667 true SIM:0.250
S3-S4 minhash:0.500 true SIM:0.250
#exercise 3.3.4
The intent is not entirely clear to me; roughly:
draw two $m$-element sets $S$ and $T$ from $n$ elements, then draw one element from each of $S$ and $T$; what is the probability that the two elements are equal?
#exercise 3.3.5
The probability that the elements are equal is 0.
Minhashing reduces the dimensionality, but the cost of the comparisons remains:
with $k$ sets in the collection, the number of pairwise comparisons is $C_k^2 = O(k^2)$;
if $k$ is large, the number of comparisons grows quadratically.
Solution idea:
LSH:
"hash" items several times, in such a way that $$P[\text{similar items in the same bucket}] \gg P[\text{dissimilar items in the same bucket}]$$
candidate pair: any pair hashed to the same bucket.
false positive: a dissimilar pair that becomes a candidate pair.
false negative: a similar pair that does NOT become a candidate pair.
If we have minhash signatures for the items, an effective way to choose the hashings is to divide the signature matrix into $b$ bands of $r$ rows each:
that is, split each signature vector into bands; if even one band hashes two columns together, the pair is taken as a candidate pair.
Note that each band is hashed on its own, so every band can use the same hash function but must keep its intermediate results in a separate hash table, as in the diagram below:
| matrix | -->  | hash table   |
|--------|------|--------------|
| band 1 | h(v) | hash_table_1 |
| band 2 | h(v) | hash_table_2 |
| ...    | h(v) | ...          |
Analysis of the probability that a pair becomes a candidate pair:
given that $s_a$ and $s_b$ have Jaccard similarity $s$, we split their signatures into $b$ bands of $r$ rows each.
threshold: the value of $s$ at which the probability of becoming a candidate is 1/2;
namely $1 - (1 - s^r)^b = \frac{1}{2}$, with approximate solution $s \approx (1/b)^{1/r}$.
#Example 3.11
b = 20
r = 5
s = np.linspace(0, 1, 100)
def p(s, r, b):
    return 1 - (1 - s**r)**b
plt.plot(s,p(s, r, b))
s = np.arange(0.2, 0.9, 0.1)
list(zip(s, p(s, r, b)))
[(0.20000000000000001, 0.0063805813047682625), (0.30000000000000004, 0.047494259124971183), (0.40000000000000008, 0.18604955214914409), (0.50000000000000011, 0.47005071531687648), (0.60000000000000009, 0.80190245383822212), (0.70000000000000018, 0.97478054418804061), (0.80000000000000027, 0.99964394210947927)]
If we consider two documents with 80% similarity, their probability of a false negative is $1 - 0.99964... \approx 0.00035$; that is, only roughly 1 in 3000 pairs that are as much as 80% similar will fail to become a candidate pair.
Putting the preceding techniques together, the procedure for finding similar documents is:
1. Construct the k-shingles of each document; [opt] hash the k-shingles to shorter bucket numbers.
2. Sort the document-shingle pairs; rather than materializing the whole characteristic matrix, generate its rows on demand in shingle order (saves memory).
3. Pick $n$ and compute the minhash signatures by feeding in the sorted list row by row.
4. Choose a threshold $t$; use $t \approx (1/b)^{1/r}$ and $n = b \times r$ to determine appropriate $b, r$.
5. Construct candidate pairs using LSH (see the sketch after this list).
6. Examine each candidate pair's signatures: is $SIM > t$?
7. [opt] Check the documents themselves.
#todo: find a worked example for this pipeline
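As a start on that todo, here is a minimal banding sketch (the function name and structure are mine, not the book's): it takes a signature matrix shaped like SIG above (a pandas DataFrame with one row per hash function and one column per set), and uses a Python dict per band in place of the per-band hash tables.
from collections import defaultdict

def lsh_candidate_pairs(sig, b, r):
    """Split the signature matrix into b bands of r rows each; two columns
    that agree on all r rows of at least one band become a candidate pair."""
    assert sig.shape[0] == b * r, 'need n = b * r signature rows'
    candidates = set()
    for band in range(b):
        buckets = defaultdict(list)              # one hash table per band
        rows = sig.iloc[band*r:(band + 1)*r, :]
        for col in sig.columns:
            buckets[tuple(rows.loc[:, col])].append(col)
        for cols in buckets.values():            # same bucket => candidate pairs
            candidates.update(itertools.combinations(sorted(cols), 2))
    return candidates

#e.g. the 3-row signature matrix from exercise 3.3.3, as b=3 bands of r=1 row:
print(lsh_candidate_pairs(my_minhash_res, b=3, r=1))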
What LSH itself is about:
for a given distance measure $d$, find a suitable hash family $h$ that realizes the "S-curve" effect:
the closer two items are under that distance, the higher the probability that they hash to the same bucket.
This part introduces distance measures, as groundwork for finding such hash families next.
A distance measure on a set $X$ is a function $d: X \times X \to \mathbf{R}$
such that for all $x, y, z \in X$:
$d(x, y) \ge 0$
$d(x, y) = 0$ if and only if $x = y$
$d(x, y) = d(y, x)$
$d(x, y) \le d(x, z) + d(z, y)$
Euclidean Distance: $d(x, y) = \| x - y \|_2$
$L_r\text{-norm}: d(x, y) = \left( \sum_i |x_i - y_i|^r \right)^{1/r}$
$L_\infty\text{-norm}: d(x, y) = \max_i |x_i - y_i|$
Jaccard Distance: $d(x, y) = 1 - SIM(x, y)$
Cosine Distance:
the angle between the two vectors:
$d(x, y) = \arccos \left( \frac{x \cdot y}{\|x\|_2 \, \|y\|_2} \right)$
Edit Distance:
Given $x = x_1 x_2 ...$ and $y = y_1 y_2 ...$,
$d(x, y)$ is the smallest number of insertions and deletions of single characters that turn $x$ into $y$.
It can be computed by dynamic programming; see the Wikipedia article on Levenshtein distance.
Equivalently, $d(x, y) = len(x) + len(y) - 2 \, |LCS(x, y)|$,
where LCS is the longest common subsequence of $x$ and $y$.
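A small sketch of this LCS route (function names are mine): the classic dynamic program for LCS length, then the identity above; for example, x = 'abcde' and y = 'acfdeg' have LCS 'acde', so d = 5 + 6 - 2*4 = 3.
def lcs_length(x, y):
    """length of the longest common subsequence of x and y (classic DP)."""
    dp = [[0] * (len(y) + 1) for _ in range(len(x) + 1)]
    for i, xc in enumerate(x, 1):
        for j, yc in enumerate(y, 1):
            dp[i][j] = dp[i-1][j-1] + 1 if xc == yc else max(dp[i-1][j], dp[i][j-1])
    return dp[len(x)][len(y)]

def edit_distance(x, y):
    """d(x, y) = len(x) + len(y) - 2 * LCS(x, y), insertions/deletions only."""
    return len(x) + len(y) - 2 * lcs_length(x, y)

print(edit_distance('abcde', 'acfdeg'))  # 3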
Hamming Distance: $d(x, y)$ = the number of positions in which $x$ and $y$ differ.
For each distance measure, we need to find a suitable hash family whose probability-vs-similarity curve is the "S" curve (the more similar, the higher the probability).
A family of functions $\{h_i(x)\}$ should satisfy 3 conditions:
s = np.linspace(0, 1, 100)
plt.plot(1-s, p(s, r, b))
plt.xlabel('Jaccard distance')
plt.ylabel('Probability')
the $h_i(x)$ should be statistically independent, because we need to estimate the probability by using the product rule for independent events;
e.g.: the $s^r$ term in $1 - (1 - s^r)^b$ for Jaccard similarity.
they should be efficient in two ways: candidate pairs can be identified in far less time than all-pairs comparison, and the functions can be combined (the AND/OR constructions below) to better avoid false positives and false negatives.
When we use the "band, then hash" trick to combine hash functions, the probability $1 - (1 - s^r)^b$ becomes an "S" curve (actually a mirrored "S" when plotted against distance, as in the figure above);
so the hash family we look for should match closer pairs with higher probability, i.e. the curve of $P$ against $d(x, y)$ is a mirrored "S". This leads to the next notion: locality sensitivity.
$(d_1, d_2, p_1, p_2)$-sensitive: if $d(x, y) \leq d_1$ then $P[f(x) = f(y)] \geq p_1$, and if $d(x, y) \geq d_2$ then $P[f(x) = f(y)] \leq p_2$.
eg: For Jaccard distance, the family of minhash functions is a $(d_1, d_2, 1-d_1, 1-d_2)$-sensitive family for any $d_1$ and $d_2$ where $0 \leq d_1 < d_2 \leq 1$.
#minhash for Jaccard distance
d = 1 - s
d_1, d_2 = 0.4, 0.6
plt.plot(d[d<d_1], p(s, r, b)[d<d_1], d[d>d_2], p(s, r, b)[d>d_2])
plt.xlabel('d(x, y)')
plt.ylabel('Probability of being a candidate')
Amplifying a Locality-Sensitive Family
Given a $(d_1, d_2, p_1, p_2)$-sensitive family $F$, we can construct a new family $F'$ whose members each combine $r$ functions $\{f_1, ..., f_r\}$ drawn from $F$:
AND-construction:
$f(x) = f(y)$ when all $f_i(x) = f_i(y)$
$$(d_1, d_2, p_1^r, p_2^r)$$
OR-construction:
$f(x) = f(y)$ when any $f_i(x) = f_i(y)$
$$(d_1, d_2, 1-(1-p_1)^r, 1-(1-p_2)^r)$$
#AND-construction and OR-construction
dis = d
pos = p(s, r, b)
r = 4
AND_pos = (pos**r)
OR_pos = 1 - (1 - pos)**r
plt.plot(dis, pos, label='origin')
plt.plot(dis, AND_pos, label='AND-cons')
plt.plot(dis, OR_pos, label='OR-cons')
plt.legend(loc='upper right')
plt.xlabel('d(x, y)')
plt.ylabel('P[f(x) = f(y)]')
As shown in the figure above,
AND-construction: pushes low probabilities toward 0; false negatives $\uparrow$, false positives $\downarrow$.
OR-construction: pushes high probabilities toward 1; false negatives $\downarrow$, false positives $\uparrow$.
AND- and OR-constructions can be composed to reach the desired $(d_1, d_2, p_1, p_2)$.
LSH families for Distance measures
For Jaccard distance,
the family of minhash functions is a $(d_1, d_2, 1-d_1, 1-d_2)$-sensitive family for any $d_1$ and $d_2$ where $0 \leq d_1 < d_2 \leq 1$.
For Hamming Distance,
given $d$-dimensional vectors $x$ and $y$ with Hamming distance $h(x, y)$,
define $f_i(x)$ to be the $i$-th bit of $x$;
then for a randomly chosen $i$, $P[f_i(x) = f_i(y)] = 1 - h(x,y)/d$.
$F = \{f_i(x)\}$ is $(d_1, d_2, 1 - d_1/d, 1 - d_2/d)$-sensitive for any $d_1 < d_2$.
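A tiny empirical check of this family (my own sketch): sample a random bit position many times; the agreement rate should approach $1 - h(x,y)/d$.
import random
x = [0, 1, 0, 1, 1, 0, 1, 0]
y = [0, 1, 1, 1, 0, 0, 1, 1]
d = len(x)
ham = sum(a != b for a, b in zip(x, y))   # Hamming distance = 3
trials = 100000
agree = sum(x[i] == y[i] for i in (random.randrange(d) for _ in range(trials)))
print(agree / trials, 1 - ham / d)        # both about 0.625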
For Euclidean Distance,
each $f$ is associated with a randomly chosen line that is divided into segments of length $a$; the segments of the line are the buckets into which $f$ hashes points, as shown in Fig. 3.13.
This family is $(a/2, 2a, 1/2, 1/3)$-sensitive.
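A sketch of one member of this family (assumptions are mine: the "random line" is taken as a random unit direction with a random offset; numpy >= 1.17 for default_rng):
def make_line_hash(dim, a, rng=None):
    """project points onto a random line cut into segments of length a;
    the segment (bucket) index is the hash value."""
    if rng is None:
        rng = np.random.default_rng()
    direction = rng.normal(size=dim)
    direction /= np.linalg.norm(direction)     # random unit vector
    offset = rng.uniform(0, a)                 # random placement of the cuts
    return lambda point: int(np.floor((np.dot(point, direction) + offset) / a))

f = make_line_hash(dim=2, a=1.0)
print(f(np.array([0.0, 0.0])), f(np.array([0.1, 0.1])), f(np.array([5.0, 5.0])))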
For Cosine Distance,
the angle between $x$ and $y$ is $d(x, y)$.
Pick a random hyperplane $S_i$ and take its normal vector $v_i$;
when $S_i$ passes between the two vectors (inside the angle), $v_i \cdot x$ and $v_i \cdot y$ have different signs; outside the angle, the same sign. See Fig. 3.12 in the book.
Define:
$f(x) = f(y)$ if and only if $v_f \cdot x$ and $v_f \cdot y$ have the same sign.
This family is $(d_1, d_2, (180-d_1)/180, (180-d_2)/180)$-sensitive.
I suspect the cost of evaluating these hashes is not necessarily lower than computing the cosine distance directly.
sketches:
take a set of random vectors $V = \{v_1, ..., v_n\}$, where each $v_i \in \{+1, -1\}^d$;
then the sketch of $x$ is $[\, I(v_1 \cdot x), I(v_2 \cdot x), \dots, I(v_n \cdot x) \,]$,
where
\begin{equation} I(x) = \begin{cases} +1, & \quad x > 0 \\ -1, & \quad x < 0 \\ +1 \, \text{or} \, -1, & \quad x = 0 \end{cases} \end{equation}
Comparing the similarity of two vectors $x$ and $y$ then reduces to $$P[f(x) = f(y)] = \frac{\#\{\text{positions where the sketches agree}\}}{len(\text{sketch})}$$
#example 3.21
x = np.arange(3, 7)
y = np.arange(4, 0, -1)
print('x:{}, y:{}'.format(x,y))
from scipy.spatial.distance import cosine
print('angle between x and y: {:.3f}\n'.format(np.rad2deg(np.arccos(1-cosine(x,y)))))
def sketch_calc(x, v):
    sketch = np.dot(x, v)
    sketch[sketch > 0] = 1
    sketch[sketch <= 0] = -1  #ties (dot product 0) resolved to -1
    return sketch
v = [[1, -1, 1, 1], [-1, 1, -1, 1], [1, 1, -1, -1]]
v = np.array(v).T
print('v:{}'.format(v))
x_s = sketch_calc(x, v)
print('sketch of x: {}'.format(x_s))
y_s = sketch_calc(y, v)
print('sketch of y: {}'.format(y_s))
print('p = same elements / total elements: {} / {}'.format(sum((x_s - y_s) == 0), len(x_s)))
def sketch_angle_calc(x_s, y_s):
    """estimate the angle between two vectors from their sketches,
    using p = (180 - d) / 180.
    """
    p = sum((x_s - y_s) == 0) / len(x_s)
    d = 180 - 180 * p
    return d
print('angle(sketch): {}\n'.format(sketch_angle_calc(x_s, y_s)))
v = list(itertools.product([-1,1],repeat=len(x)))
v = np.array(v).T
print('v:{}'.format(v))
x_s = sketch_calc(x, v)
print('sketch of x: {}'.format(x_s))
y_s = sketch_calc(y, v)
print('sketch of y: {}'.format(y_s))
print('p = same elements / total elements: {} / {}'.format(sum((x_s - y_s) == 0), len(x_s)))
print('angle(sketch): {}'.format(sketch_angle_calc(x_s, y_s)))
x:[3 4 5 6], y:[4 3 2 1]
angle between x and y: 38.048

v:[[ 1 -1  1]
 [-1  1  1]
 [ 1 -1 -1]
 [ 1  1 -1]]
sketch of x: [ 1  1 -1]
sketch of y: [ 1 -1  1]
p = same elements / total elements: 1 / 3
angle(sketch): 120.0

v:[[-1 -1 -1 -1 -1 -1 -1 -1  1  1  1  1  1  1  1  1]
 [-1 -1 -1 -1  1  1  1  1 -1 -1 -1 -1  1  1  1  1]
 [-1 -1  1  1 -1 -1  1  1 -1 -1  1  1 -1 -1  1  1]
 [-1  1 -1  1 -1  1 -1  1 -1  1 -1  1 -1  1 -1  1]]
sketch of x: [-1 -1 -1  1 -1  1 -1  1 -1 -1 -1  1 -1  1  1  1]
sketch of y: [-1 -1 -1 -1 -1 -1 -1  1 -1 -1  1  1  1  1  1  1]
p = same elements / total elements: 12 / 16
angle(sketch): 45.0
Entity resolution: matching data records that refer to the same real-world entity.
That is, finding the records of the same object across different databases.
eg: companies A and B reconciling their records (kept in somewhat different formats).
Similarity comparison:
use the "name", "phone", and "address" fields; score each of the three by edit distance (distance 0-1 mapped to score 100-0), and take the cumulative total as the similarity score $s$.
Comparing all pairs is too time-consuming, so first use LSH to find the potential pairs:
hash the three fields separately and combine with an AND-construction.
The method used in practice was simply to sort: order the records by name, by phone, and by address, each in ascending order, so that similar records cluster together.
Evaluating the fraction $f$ of correct matches:
use other fields, not used in the matching, as evidence.
Here, for example, the difference $t$ between the creation times of the paired records in the two databases serves as the measure:
$\bar{t} = E[\text{random\_pick}(t)]$: the expected time difference of a randomly chosen pair;
$t_p = E[t_{xy} : \text{argmax}_{x,y} \, s(x,y)]$: the expected time difference over the highest-scoring pairs (assumed to be all correct matches, $f = 1$).
Now suppose we use $s_i$ as the threshold, and the matched pairs it yields have mean time difference $t_i$ and correct-match rate $f_i$;
then $t_i = t_p \, f_i + \bar{t} \, (1 - f_i)$,
which solves to:
$$f_i = \frac{\bar{t} - t_i}{\bar{t} - t_p}$$
That is, other fields of the matched records can be turned into a function that evaluates the precision of the matching, as in the toy check below.
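A toy numeric check of the $f_i$ formula (the day values below are hypothetical, not from the book):
t_bar, t_p, t_i = 10.0, 1.0, 2.8   # random pairs, perfect matches, pairs above threshold s_i
f_i = (t_bar - t_i) / (t_bar - t_p)
print('estimated fraction of correct matches: {:.2f}'.format(f_i))  # 0.80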
Feature representation: minutiae placed on a grid form a fingerprint's feature vector.
Two kinds of problems: 1. one-to-many: find the match for a given fingerprint; 2. many-to-many: find all potentially identical pairs.
Suppose the probability that a randomly chosen grid position of a fingerprint contains a minutia is $P[X] = 20\%$,
and that for two fingerprints from the same finger, when one has a minutia at a position, the probability that the other does too is $P[Y|X] = 80\%$.
Define a hash function $f$ as follows:
pick three grid positions; two fingerprints form a candidate pair if both have minutiae at all three positions.
Analysis: the probability that two random fingerprints become candidates is $0.2^6 = 0.000064$; for two fingerprints from the same finger it is $0.2^3 \times 0.8^3 = 0.004096$.
We build 1024 such hash functions $f_i$, combine them with an OR-construction, then an AND-construction of two copies; results below.
#Example 3.22
p = np.array([0.2**6, (0.2**3)*(0.8**3)])
print('dissimilar:{:.4f}, similar:{:.4f}'.format(p[0], p[1]))
#1024 OR-cons
p = 1 - (1 - p)**1024
print('dissimilar:{:.4f}, similar:{:.4f}'.format(p[0], p[1]))
#then 2 AND-cons
p = p**2
print('dissimilar:{:.4f}, similar:{:.4f}'.format(p[0], p[1]))
print(1/p[0])
dissimilar:0.0001, similar:0.0041
dissimilar:0.0634, similar:0.9850
dissimilar:0.0040, similar:0.9703
248.496189026
Difference between text in prose and in ads/headlines:
prose has a much greater frequency of stop words ====> define a shingle to be a stop word followed by the next two words.
LSH-based methods appear most effective when the degree of similarity we accept is relatively low.
methods:
we look only for pairs with very high Jaccard similarity, say at least 0.9.
representing sets as strings:
represent a set by sorting the elements of the universal set in some fixed order and writing the set as a string of "characters" in that order.
eg: 26 lower-case letters, normal alphabetical order: {d, a, b, b} => 'abd', as in the one-liner below.
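As a one-line sketch of this representation:
print(''.join(sorted(set(['d', 'a', 'b', 'b']))))  # => 'abd'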
Length-Based Filtering: sort the strings by length;
each string $s$ is compared only with the strings $t$ that follow $s$ in the list (so $L_s \leq L_t$, where $L_x = len(x)$).
Let $J_u$ be the upper bound on the acceptable Jaccard distance between two strings.
$SIM(s, t)$ is at most $L_s/L_t$, so the Jaccard distance is at least $1 - L_s/L_t$, and we need $1 - L_s/L_t \leq J_u$;
hence $$L_t \leq \frac{L_s}{1 - J_u}$$ as sketched below.
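A minimal sketch of this filter (the helper name is mine): with the strings in length order, each s is compared only against the t whose length stays within the bound.
def length_filter_candidates(strings, j_u):
    """yield only the (s, t) pairs whose lengths permit Jaccard distance <= j_u,
    i.e. len(t) <= len(s) / (1 - j_u); strings are scanned in length order."""
    strings = sorted(strings, key=len)
    for i, s in enumerate(strings):
        max_len = len(s) / (1 - j_u)
        for t in strings[i+1:]:
            if len(t) > max_len:
                break              # sorted by length: no later t can qualify
            yield s, t

print(list(length_filter_candidates(['abd', 'abde', 'abcdef'], j_u=0.5)))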
Prefix Indexing: index each string $s$ under its first $p$ symbols.
$p$ must guarantee at least 1 shared symbol between the prefixes whenever $SIM(s, t) \geq J$.
Arguing from the contrary: suppose $SIM(s, t) \geq J$ but the first $p$ characters share nothing; then at best $SIM(s, t) = (L_s - p) / L_s$. So whenever $J > (L_s - p) / L_s$, the first $p$ characters must share at least one symbol. This gives:
$$p = \lfloor (1 - J)L_s \rfloor + 1$$
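A sketch of the prefix index (the structure is my assumption: a dict from symbol to the strings indexed under it):
from collections import defaultdict

def prefix_length(s, j):
    return int((1 - j) * len(s)) + 1

def build_prefix_index(strings, j):
    index = defaultdict(list)
    for s in strings:
        for symbol in set(s[:prefix_length(s, j)]):
            index[symbol].append(s)   # probe s for any string sharing this symbol
    return index

print(dict(build_prefix_index(['abd', 'acd', 'bce'], j=0.9)))
# with J = 0.9 and L_s = 3, p = floor(0.3) + 1 = 1: indexed on the first symbol only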
Using Position Information: use positions to decide whether a comparison is needed at all.
The index entries are pairs $(x, j)$: a symbol and its position.
If $s[1..i-1] \cap t[1..j-1] = \emptyset$ and, at best, $s[i..] = t[j..]$,
then at most $$SIM(s, t) = \frac{L_s-(i-1)}{L_s+(j-1)}$$ and requiring this to be $\geq J$
solves to $$j \leq (L_s(1 - J) - i + 1 + J)/J$$ (see the helper below).
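A sketch of the resulting probe bound (the helper name is mine):
def max_probe_position(l_s, i, j_threshold):
    """largest position in t worth probing, from SIM(s, t) <= (L_s - i + 1)/(L_s + j - 1) >= J."""
    return int((l_s * (1 - j_threshold) - i + 1 + j_threshold) / j_threshold)

print(max_probe_position(10, 1, 0.9))  # L_s = 10, first indexed symbol at i = 1  =>  j <= 2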
Using Position and Length in Indexes:
the index entries are triples (x, j, suffix length).
Suppose we have entries $(s_i, i, p)$ and $(t_j, j, q)$: the prefixes share nothing, and one suffix is a subset of the other.
Then the cases are analyzed separately: