# sim

#!/usr/bin/env python3
import binascii
import functools
import re
import glob
import sys
import os
from itertools import islice
import fnmatch
import Levenshtein as lev
import textdistance

# Refuse to run unless exactly two arguments were given: the extension
# filter and the number of leading bytes to read from each file.
if len(sys.argv) != 3:
    # Passing the message to sys.exit() writes it to stderr and ends the
    # process with status 1, so shells and calling scripts can detect the
    # misuse (a bare sys.exit() would report success).
    sys.exit("Usage: fuzzy.py [file extension without period] [number of bytes to read]")
    
def findfiles(which, where='.'):
    """List the entries of directory `where` whose names match `which`.

    `which` is a shell-style wildcard pattern (as understood by `fnmatch`);
    it is translated to a regular expression and applied to each bare entry
    name, ignoring case. The returned names are not joined with `where` and
    come back in whatever order `os.listdir` yields them.
    """
    name_matches = re.compile(fnmatch.translate(which), re.IGNORECASE).match
    return list(filter(name_matches, os.listdir(where)))

def lcs(s1, s2):
    """Return the longest contiguous run of items shared by `s1` and `s2`.

    The result is a slice of `s1`. When several common runs tie for the
    greatest length, the one that ends earliest in `s1` is returned. An
    empty slice is returned when the inputs share nothing.
    """
    best_len = 0
    best_end = 0
    # prev_row[col] holds the length of the common run ending at the previous
    # item of s1 and at s2[col - 1]; only the previous row is ever consulted.
    prev_row = [0] * (len(s2) + 1)
    for row, left in enumerate(s1, start=1):
        cur_row = [0] * (len(s2) + 1)
        for col, right in enumerate(s2, start=1):
            if left != right:
                continue  # run is broken here; the cell stays 0
            run = prev_row[col - 1] + 1
            cur_row[col] = run
            if run > best_len:
                best_len, best_end = run, row
        prev_row = cur_row
    return s1[best_end - best_len:best_end]


# Largest Levenshtein distance between the two hex strings for which a
# wildcard pattern is still attempted.
MAX_WILDCARD_DISTANCE = 10


def read_hex_prefix(path, count):
    """Read the first `count` bytes of the file at `path`.

    Returns a `(raw, hex_text)` tuple: the bytes that were read and their
    lowercase hexadecimal text, two characters per byte, with no prefix or
    quoting around it.
    """
    with open(path, "rb") as handle:
        raw = handle.read(count)
    return raw, raw.hex()


def build_wildcard_pattern(first, second):
    r"""Return `first` with each character that differs from `second` replaced by `\w`.

    Characters are compared position by position over the length of the
    shorter string; anything in `first` beyond that length is kept verbatim.
    The result is meant to be used as a regular expression.
    """
    pieces = list(first)
    # Substituting per position (instead of splicing into the growing string)
    # keeps every index valid even though the wildcard is two characters long.
    for index, (left, right) in enumerate(zip(first, second)):
        if left != right:
            pieces[index] = r"\w"
    return "".join(pieces)


def report_matches(paths, pattern, count):
    """Search the hex text of the first `count` bytes of each file for `pattern`.

    `pattern` is used as a regular expression. One line is printed per file,
    followed by the two totals. Returns `(matched, unmatched)` counts.
    """
    rule = re.compile(pattern)
    matched = 0
    unmatched = 0
    for path in paths:
        _, hex_text = read_hex_prefix(path, count)
        found = rule.search(hex_text)
        if found:
            matched += 1
            print("File " + path + " contains the string " + pattern + " at location " + str(found.span()))
        else:
            unmatched += 1
            print("File " + path + " DOES NOT CONTAIN THE STRING.")
    print(str(matched) + " files with matches.")
    print(str(unmatched) + " files without matches.")
    return matched, unmatched


def main():
    """Compare the leading bytes of the first two matching files, then scan all of them.

    Reads the extension filter from `sys.argv[1]` ("all" selects `*.*`) and
    the byte count from `sys.argv[2]`. The hex text of the first two files
    (in sorted name order) is compared by Levenshtein distance:

    * distance 0: every file is searched for that exact hex string;
    * distance up to `MAX_WILDCARD_DISTANCE`: every file is searched for a
      pattern in which the differing characters are wildcards;
    * otherwise "TOO DIFFERENT" is printed.
    """
    extension = str(sys.argv[1])
    glob_pattern = "*.*" if extension == "all" else "*" + extension
    bytes_to_print = str(sys.argv[2])
    try:
        bytes_to_read = int(bytes_to_print)
    except ValueError:
        sys.exit("Number of bytes to read must be an integer, got: " + bytes_to_print)

    # findfiles() can return directory names, which cannot be opened for reading.
    file_list = sorted(name for name in findfiles(glob_pattern) if os.path.isfile(name))
    if len(file_list) < 2:
        sys.exit("Need at least two files matching " + glob_pattern + " to compare.")

    first_raw, first_hex = read_hex_prefix(file_list[0], bytes_to_read)
    print("\n\nFirst " + bytes_to_print + " binary bytes of " + file_list[0] + " are: " + str(first_raw))
    print("First " + bytes_to_print + " hexadecimal bytes of " + file_list[0] + " are: " + first_hex + "\n")

    second_raw, second_hex = read_hex_prefix(file_list[1], bytes_to_read)
    print("First " + bytes_to_print + " Binary bytes of " + file_list[1] + " are: " + str(second_raw))
    print("First " + bytes_to_print + " hexadecimal bytes of " + file_list[1] + " are: " + second_hex + "\n")

    # Compute the distance once; every branch below depends on it.
    distance = textdistance.levenshtein(first_hex, second_hex)
    print("Distance between " + first_hex + " and " + second_hex + " is: " + str(distance))
    print("Similarity score is " + str(textdistance.levenshtein.normalized_similarity(first_hex, second_hex)))

    if distance == 0:
        print("Strings are identical, checking other files.")
        # With distance 0 the two strings are equal, so their longest common
        # substring is the string itself.
        report_matches(file_list, first_hex, bytes_to_read)
    elif distance <= MAX_WILDCARD_DISTANCE:
        print("\nStrings are slightly different, computing wildcard version to match both strings.\n")
        print(first_hex)
        print(second_hex)
        print("\nReplacing differing characters with wildcard.")
        wildcard = build_wildcard_pattern(first_hex, second_hex)
        print("Wildcard version of string is: " + wildcard + "\n")
        report_matches(file_list, wildcard, bytes_to_read)
    else:
        print("TOO DIFFERENT")


if __name__ == "__main__":
    main()