-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlsh.py
156 lines (128 loc) · 4.78 KB
/
lsh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#coding: utf-8
"""
DESCRIPTION
===========
This module is a Python implementation of Locality Sensitive Hashing,
which is an alpha version.
AUTHOR
===========
Jason Ding([email protected])
LICENCE
===========
This module is free software; you may use or modify it freely.
"""
#encoding:utf-8
import math
import numpy as np
from hamminghash import HammingHash
from buckets import LshBucket
from bitarray import bitarray
"""
try:
from bitarray import bitarray
except ImportError:
bitarray = None
"""
class LSHBase(object):
    """Base class for LSH implementations.

    Stores every keyword argument passed to the constructor as an
    instance attribute.  Subclasses are expected to supply at least
    'd' (input dimensions), 'L' (number of hash tables) and 'k'
    (number of reductions), validated via _check_parameters().
    """
    def __init__(self, **kw):
        # Bind every keyword argument directly onto the instance.
        # items() (rather than the Python-2-only iteritems()) iterates
        # identically and keeps the class working on Python 2 and 3.
        for key, value in kw.items():
            setattr(self, key, value)
    def _check_parameters(self):
        """Validate the required LSH parameters.

        Raises:
            TypeError: if any of 'd', 'L' or 'k' is missing.
            ValueError: if k > d (cannot reduce to more dimensions
                than the input has).
        """
        for param in ('d', 'L', 'k'):
            if not hasattr(self, param):
                # BUG FIX: the original raised the literal message
                # "parameter 'param' is necessary" for every missing
                # parameter; interpolate the actual name instead.
                raise TypeError("parameter '%s' is necessary" % param)
        if self.k > self.d:
            raise ValueError("parameter 'k' must be smaller than parameter 'd'")
class LSH(LSHBase):
    """Locality Sensitive Hashing in Hamming space.

    >>> lsh = LSH(
    ...     L=L,  # number of hash functions
    ...     k=k,  # number of reductions
    ...     d=d,  # number of dimensions
    ... )
    """
    def __init__(self, L, k, d, **kw):
        super(LSH, self).__init__(L=L, k=k, d=d, **kw)
        self._check_parameters()
        self.hash = HammingHash(L=L, k=k, d=d)
        self.bucket = LshBucket(L=L, k=k, d=d)
        # Whether queries should use the stored (persisted) indexes;
        # flipped by _neighbours() according to its `index` argument.
        self.index = False
    def loadDataSet(self, filename, delim=','):
        """Load integer vectors (one per line, `delim`-separated) from
        `filename`, insert each into the buckets, then persist them.
        """
        # BUG FIX: the original opened the file and never closed it;
        # `with` guarantees the handle is released even on error.
        with open(filename) as fr:
            for line in fr:
                line = line.strip()
                if not line:
                    continue  # tolerate blank/trailing lines
                # transform the string fields to integers
                vectFromLine = [int(x) for x in line.split(delim)]
                self.insert(vectFromLine)
        self.bucket.store_buckets()  # store all the data and index
    def insert(self, vector):
        """Insert a vector into the buckets (in-memory hashing)."""
        hashed_array = self.hash.do_hashing(vector)
        self.bucket.insert(vector, hashed_array)
    def insert_index(self, vector):
        """Insert a vector using the index-backed hashing variant."""
        hashed_array = self.hash.do_hashing_index(vector)
        self.bucket.insert(vector, hashed_array)
    def nn(self, vector, without_itself=False, index=False):
        """Return the nearest neighbour of `vector` as a dict with
        'vector' and 'distance' keys ({} when no candidates exist).
        """
        neighbours = self._neighbours(vector, without_itself, index)
        return self._nearest(vector, neighbours)
    def knn(self, vector, kk, without_itself=False, index=False):
        """Return up to `kk` nearest neighbours of `vector`, sorted by
        ascending Euclidean distance, each as a {'distance', 'vector'}
        dict.
        """
        neighbours = self._neighbours(vector, without_itself, index)
        return self._k_nearest(vector, kk, neighbours)
    def _neighbours(self, vector, without_itself=False, index=False):
        """Collect candidate neighbour vectors for the query vector."""
        self.index = index
        if not index:
            hashed_array = self.hash.do_hashing(vector)
        else:
            # index mode: reload the persisted buckets, then hash via
            # the index-backed variant.
            self.bucket.load_buckets()
            hashed_array = self.hash.do_hashing_index(vector)
        return self.bucket.select(vector, hashed_array, without_itself)
    def _k_nearest(self, vector, kk, neighbours):
        """Pick the `kk` nearest candidates.

        BUG FIX: the original replaced the element at the hard-coded
        position 2 of its bounded list, so it was only correct for
        kk == 3 (and could return an unsorted list); it also re-sorted
        inside the loop.  Scoring every candidate once, sorting, and
        slicing is correct for any kk.
        """
        scored = [{'distance': self._euclidean_dist(vector, n_vector),
                   'vector': n_vector}
                  for n_vector in neighbours]
        scored.sort(key=lambda entry: entry['distance'])  # small to large
        return scored[:kk]
    def _nearest(self, vector, neighbours):
        """Pick the single nearest vector from the candidates."""
        nearest = {}
        for n_vector in neighbours:
            dist = self._euclidean_dist(vector, n_vector)
            # create the entry on first sight; update when closer
            if "distance" not in nearest or dist < nearest["distance"]:
                nearest.update(vector=n_vector, distance=dist)
        return nearest
    def _hamming_dist(self, hashval1, hashval2):
        """Hamming distance between two bit strings (popcount of XOR)."""
        xor_result = bitarray(hashval1) ^ bitarray(hashval2)
        return xor_result.count()
    def _euclidean_dist(self, vector1, vector2):
        """Euclidean (L2) distance between two equal-length vectors."""
        # `total` instead of the original `sum`, which shadowed the builtin
        total = 0
        for x1, x2 in zip(vector1, vector2):
            total += (x1 - x2) ** 2
        return math.sqrt(total)
if __name__ == '__main__':
lsh = LSH(L=10,k=5,d=11)
#lsh.loadDataSet('training_set1.txt')
vect = [1,10,1,11,1,13,1,12,1,1,9]
knn_vects = lsh.knn(vect,3,index=True)
print knn_vects