-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsim
129 lines (111 loc) · 4.56 KB
/
sim
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
import binascii
import functools
import re
import glob
import sys
import os
from itertools import islice
import fnmatch
import Levenshtein as lev
import textdistance
# The script needs exactly two arguments: a file-extension filter and the
# number of leading bytes to read. On a wrong argument count, show the usage
# line and exit with a non-zero status so shells and callers can tell the
# run failed (a bare sys.exit() would report success).
if len(sys.argv) != 3:
    print("Usage: fuzzy.py [file extension without period] [number of bytes to read]")
    sys.exit(1)
def findfiles(which, where='.'):
    """List the entries of directory `where` whose names match `which`.

    `which` is a shell-style wildcard pattern that is translated to a
    regular expression with fnmatch.translate(); the comparison ignores
    case. Names come back in the order os.listdir() yields them.
    """
    name_matches = re.compile(fnmatch.translate(which), re.IGNORECASE).match
    # A successful match returns a (truthy) match object, a failed one None,
    # so filter() keeps exactly the matching names.
    return list(filter(name_matches, os.listdir(where)))
def lcs(s1, s2):
    """Return the longest run of consecutive items shared by `s1` and `s2`.

    Uses a dynamic-programming table in which cell [row][col] holds the
    length of the common run ending at s1[row - 1] and s2[col - 1]. When
    several runs tie for longest, the one ending earliest in `s1` is kept.
    The result is a slice of `s1`, empty when nothing is shared.
    """
    width = len(s2) + 1
    table = [[0] * width for _ in range(len(s1) + 1)]
    best_len = 0
    best_end = 0
    for row, left in enumerate(s1, start=1):
        for col, right in enumerate(s2, start=1):
            if left != right:
                # A mismatch breaks any run ending at this pair.
                table[row][col] = 0
                continue
            run = table[row - 1][col - 1] + 1
            table[row][col] = run
            # Strict comparison: only a longer run replaces the current best.
            if run > best_len:
                best_len = run
                best_end = row
    return s1[best_end - best_len:best_end]
def _read_hex_prefix(path, count):
    """Return the first `count` bytes of `path` as (raw bytes, hex string)."""
    with open(path, "rb") as handle:
        raw = handle.read(count)
    # Decode the hexlify() result rather than str()-ing it: str() produces
    # "b'...'", and stripping only the "b'" prefix left a stray trailing
    # quote character in every hex string.
    return raw, binascii.hexlify(raw).decode("ascii")


def _wildcard_pattern(first, second):
    """Return `first` as a regex with every position that differs from `second` wildcarded."""
    # Build the pattern in a single left-to-right pass. Replacing characters
    # one index at a time only works if indices are visited in strictly
    # descending order (each replacement lengthens the string), and that
    # order is not guaranteed when the indices come out of a set.
    body = "".join(
        r"\w" if left != right else left
        for left, right in zip(first, second)
    )
    # zip() stops at the shorter string; any extra tail of `first` is kept
    # literally, so the pattern is always defined even when the two strings
    # differ only in length.
    return body + first[len(second):]


def _report_matches(paths, pattern, count):
    """Search the hex prefix of each file for regex `pattern`; print per-file results and totals."""
    regex = re.compile(pattern)
    matched = 0
    unmatched = 0
    for path in paths:
        _, hex_prefix = _read_hex_prefix(path, count)
        result = regex.search(hex_prefix)
        if result:
            matched += 1
            print("File " + path + " contains the string " + pattern + " at location " + str(result.span()))
        else:
            unmatched += 1
            print("File " + path + " DOES NOT CONTAIN THE STRING.")
    print(str(matched) + " files with matches.")
    print(str(unmatched) + " files without matches.")


# --- Script body -----------------------------------------------------------
# Compare the leading bytes of the first two matching files (sorted by name),
# then look for the shared signature in every matching file.
extension = str(sys.argv[1])
# "all" selects every name containing a dot; otherwise match names ending in
# the given extension text.
format_use = "*.*" if extension == "all" else "*" + extension
bytes_to_read = int(sys.argv[2])
bytes_to_print = str(sys.argv[2])

# Keep regular files only: os.listdir() also returns directories, which
# cannot be opened for reading.
file_list = sorted(name for name in findfiles(format_use) if os.path.isfile(name))
if len(file_list) < 2:
    print("Need at least two files matching " + format_use + " in the current directory to compare.")
    sys.exit(1)

first_name, second_name = file_list[0], file_list[1]

first_bytes, S1 = _read_hex_prefix(first_name, bytes_to_read)
print("\n\nFirst " + bytes_to_print + " binary bytes of " + first_name + " are: " + str(first_bytes))
print("First " + bytes_to_print + " hexadecimal bytes of " + first_name + " are: " + S1 + "\n")

second_bytes, S2 = _read_hex_prefix(second_name, bytes_to_read)
print("First " + bytes_to_print + " Binary bytes of " + second_name + " are: " + str(second_bytes))
print("First " + bytes_to_print + " hexadecimal bytes of " + second_name + " are: " + S2 + "\n")

# Compute the edit distance once and reuse it for the report and branching.
distance = textdistance.levenshtein(S1, S2)
print("Distance between " + S1 + " and " + S2 + " is: " + str(distance))
print("Similarity score is " + str(textdistance.levenshtein.normalized_similarity(S1, S2)))

if distance == 0:
    print("Strings are identical, checking other files.")
    # The hex strings contain only [0-9a-f], so the common substring is safe
    # to use directly as a regular expression.
    _report_matches(file_list, lcs(S1, S2), bytes_to_read)
elif distance <= 10:
    print("\nStrings are slightly different, computing wildcard version to match both strings.\n")
    print(S1)
    print(S2)
    print("\nReplacing differing characters with wildcard.")
    rep_string = _wildcard_pattern(S1, S2)
    print("Wildcard version of string is: " + rep_string + "\n")
    _report_matches(file_list, rep_string, bytes_to_read)
else:
    print("TOO DIFFERENT")