-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimhash.py
28 lines (23 loc) · 831 Bytes
/
simhash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from hashes.simhash import simhash
from os.path import isfile, join
from os import listdir
import multiprocessing
def sh(f):
    """Compute the simhash of one file's contents.

    Args:
        f: Path of the file to read. It is opened in text mode with the
            platform default encoding.

    Returns:
        A two-element list ``[hash, f]``: the object returned by
        ``simhash`` for the file's full text, and the path that was read.
    """
    # Close the handle deterministically: this function is mapped over many
    # files inside pool workers, so relying on garbage collection to close
    # them can exhaust file descriptors.
    with open(f) as handle:
        text = handle.read()
    return [simhash(text), f]
def get_compare():
    """Yield pairs of ``.dump`` files whose simhashes are highly similar.

    Every regular file ending in ``.dump`` directly inside
    ``./cc_clean_data`` is hashed with ``sh`` in a pool of 10 worker
    processes. All unordered pairs of hashes are then compared, and each
    pair scoring above 0.95 is printed and yielded.

    Because this is a generator, no work (including the directory listing)
    happens until the first item is requested.

    Yields:
        ``[score, file_a, file_b]`` where ``score`` is the value returned by
        ``hash_a.similarity(hash_b)`` and the two paths are the compared
        files, ``file_a`` being the one listed earlier.
    """
    path = "./cc_clean_data"
    # List the directory before starting workers so that a missing or
    # unreadable directory does not leave ten idle processes behind.
    files = [
        join(path, f)
        for f in listdir(path)
        if isfile(join(path, f)) and f.endswith(".dump")
    ]

    print("calculating hashes")
    pool = multiprocessing.Pool(10)
    try:
        hashes = pool.map(sh, files)
    finally:
        # map() has either returned every result or raised; in both cases the
        # workers are no longer needed, so shut them down instead of leaking
        # them for the lifetime of the generator.
        pool.terminate()
        pool.join()

    print("comparing hashes")
    total = len(hashes)
    for i in range(total):
        if i % 1000 == 0:
            # NOTE(review): i / 1000 is a true percentage only when there are
            # about 100,000 files; kept as-is to preserve the existing output.
            print(str(i / 1000), "%")
        left_hash, left_file = hashes[i]
        # Start at i + 1 so each unordered pair is compared exactly once and
        # a file is never compared with itself.
        for j in range(i + 1, total):
            right_hash, right_file = hashes[j]
            score = left_hash.similarity(right_hash)
            if score > 0.95:
                temp_result = [score, left_file, right_file]
                print(temp_result)
                yield temp_result