diff --git a/README.md b/README.md index 8ca2714..8235138 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # comparesv -### CSV Comparison on steriods +### Python CSV Comparison on steroids ## Usage @@ -42,5 +42,50 @@ optional arguments: ## Description +The first file is considered the source file. It will be compared against the second file. Refer to the options below to fine-tune the way it works. +### Row Match (-rm) +This defines how the rows in the two files are identified for comparison. + +`order` - This is the default option. It compares the rows by their position in the files. This can be used if the records in both files are in the same order. + +`fuzzy` - This uses fuzzy logic to identify the matching row in the second file. This can be used if the records are not in order and most of the data is **text**. + +`deep` - This uses fuzzy logic to identify the matching row in the second file. This can be used if the records are not in order and they contain **numeric** data. It checks each row in file1 against all the rows in file2 to find a potential match. + +### Column Match (-cm) + +This defines how the columns in the two files are identified for comparison. + +`exact` - This is the default option. It compares the columns between the files by their headers, looking for an exact match, and selects them for comparison. e.g. the 'Age' and 'Age' columns across the files will be selected for comparison. + +`fuzzy` - This uses fuzzy logic to identify the matching column in the second file. This can be used if the column headers across the files are not exactly the same but are reasonably close. e.g. the 'age' and 'age of student' columns may be selected for comparison. + +### String Match (-sm) + +This defines how textual data is compared. + +`exact` - This is the default option. It compares the exact text. + +`fuzzy` - This uses fuzzy logic to check whether the texts are close to each other and, if so, treats them as a match. 
+ +### Include Additional Rows (-ir) + +If the second file contains more rows than the first file, this option will enable the comparison output to include the remaining rows (uncompared ones). + +### Include Additional Columns (-ic) + +If the second file contains more columns than the first file, this option will enable the comparison output to include the remaining columns. + +### Ignore case (-i) + +This option will ignore the case while comparing the strings. + +### Include Stats (-is) + +This option is enabled by default and it outputs the comparison stats (in percentage) on the console. + +### Save Output (-s) + +This option will save the result & values comparison in the current directory. This is enabled by default. \ No newline at end of file diff --git a/comparesv.py b/comparesv.py index d2199b2..3eab80b 100644 --- a/comparesv.py +++ b/comparesv.py @@ -108,8 +108,6 @@ def compare_data(data1, data2, headers1, headers2, matched_headers, **kwargs): added_rows.append(row2) elif mode == 'deleted': deleted_rows.append(row1) - # else: - # common_rows.append(row1) rows_output.append(row_compare_result) if kwargs.get('include_addnl_rows'): @@ -140,15 +138,15 @@ def exist_in_list(option, option_list): def prepare_headers(data1, headers1, headers2, column_match): mapped_headers_index = OrderedDict() - updated_indices2 = [] + mapped_indices2 = [] for index, header in enumerate(headers1): index = -1 if column_match == 'exact': exists, index = exist_in_list(header, headers2) elif column_match == 'fuzzy': - indices2_left = [x for x in range(len(headers2)) if x not in updated_indices2] - index = fuzzy_column_index(header, headers2) - updated_indices2.append(index) + unmapped_header_indices = [x for x in range(len(headers2)) if x not in mapped_indices2] + index = fuzzy_column_index(header, headers2, unmapped_header_indices) + mapped_indices2.append(index) column_data = {} column_data['index'] = index @@ -160,15 +158,20 @@ def prepare_headers(data1, headers1, headers2, 
column_match): return mapped_headers_index -def fuzzy_column_index(header, headers_list): - exist, index = exist_in_list(header, headers_list) +def fuzzy_column_index(header, headers_list, unmapped_header_indices): + unmapped_headers = [x for i,x in enumerate(headers_list) if i in unmapped_header_indices] + + exist, index = exist_in_list(header, unmapped_headers) if exist: - return index + original_index = unmapped_header_indices[index] + return original_index - highest = process.extractOne(header, headers_list) + highest = process.extractOne(header, unmapped_headers) if highest[1] < ROW_THRESHOLD: return -1 - return headers_list.index(highest[0]) + unmapped_index = unmapped_headers.index(highest[0]) + original_index = unmapped_header_indices[unmapped_index] + return original_index def deep_row_find(row, data2, headers1, headers2, matched_headers, data2_indices_left, opts): @@ -191,17 +194,17 @@ def deep_row_find(row, data2, headers1, headers2, matched_headers, data2_indices return selected_row, selected_index - -def fuzzy_row_find(row, data2, headers1, matched_headers, data2_indices_left): +def fuzzy_row_find(row, data2, headers1, matched_headers, unmapped_indices2): row1 = ' '.join(str(x) for x in row) - rows_list2 = [' '.join(str(x) for x in elem) for index, elem in enumerate(data2) if index in data2_indices_left] - highest = process.extractOne(row1, rows_list2) + unmapped_data2 = [' '.join(str(x) for x in elem) for index, elem in enumerate(data2) if index in unmapped_indices2] + highest = process.extractOne(row1, unmapped_data2) if highest[1] < ROW_THRESHOLD: return None, None - index = rows_list2.index(highest[0]) - return data2[index], index + index = unmapped_data2.index(highest[0]) + original_index = unmapped_indices2[index] + return data2[original_index], original_index def compare_rows(row1, row2, header_index, headers2, opts): @@ -289,4 +292,13 @@ def predict_column_type(data): elif int in data_types: return "int" else: - return "str" \ No newline at end 
of file + return "str" + +h1 = ["id", "age"] +h2 = ["id", "age","gender"] +d1 = [["A1", 23], ["A2", 24], ["A3", 34]] +d2 = [["A1", 23,"M"], ["A2", 24,"F"], ["A3", 34,"O"]] + +output = run(d1, h1, d2, h2, include_addnl_columns='fuzzy') +from pprint import pprint +pprint(output) \ No newline at end of file diff --git a/tests.py b/tests.py index d12fd47..7db04fc 100644 --- a/tests.py +++ b/tests.py @@ -1,100 +1,136 @@ import comparesv + def test_basic(): - h1 = ["id", "age"] - h2 = ["id", "age"] - d1 = [["A1", 23], ["A2", 24], ["A3", 34]] - d2 = [["A1", 23], ["A2", 24], ["A3", 34]] + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A1", 23], ["A2", 24], ["A3", 34]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2) + assert result == output['results'] + assert values == output['values'] - result = [[True, True], [True, True], [True, True]] - values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] - output = comparesv.run(d1, h1, d2, h2) - assert result == output['results'] - assert values == output['values'] def test_column_order(): - h1 = ["id", "age"] - h2 = ["age", "id"] - d1 = [["A1", 23], ["A2", 24], ["A3", 34]] - d2 = [[23, "A1"], [24, "A2"], [34, "A3"]] + h1 = ["id", "age"] + h2 = ["age", "id"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [[23, "A1"], [24, "A2"], [34, "A3"]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2) + assert result == output['results'] + assert values == output['values'] - result = [[True, True], [True, True], [True, True]] - values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] - output = comparesv.run(d1, h1, d2, h2) - assert 
result == output['results'] - assert values == output['values'] def test_fuzzy_column_order(): - h1 = ["id", "age"] - h2 = ["age of student", "identity"] - d1 = [["A1", 23], ["A2", 24], ["A3", 34]] - d2 = [[23, "A1"], [24, "A2"], [34, "A3"]] + h1 = ["id", "age", "building age"] + h2 = ["age of student", "identity", "building age"] + d1 = [["A1", 23, 100], ["A2", 24, 100], ["A3", 34, 100]] + d2 = [[23, "A1", 100], [24, "A2", 100], [34, "A3", 100]] + + result = [[True, True, True], [True, True, True], [True, True, True]] + values = [['[A1]:[A1]', '[23]:[23]','[100]:[100]'], ['[A2]:[A2]', '[24]:[24]','[100]:[100]'], ['[A3]:[A3]', '[34]:[34]','[100]:[100]']] + output = comparesv.run(d1, h1, d2, h2, column_match='fuzzy') + assert result == output['results'] + assert values == output['values'] - result = [[True, True], [True, True], [True, True]] - values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] - output = comparesv.run(d1, h1, d2, h2, column_match='fuzzy') - assert result == output['results'] - assert values == output['values'] def test_row_order_fuzzy(): - h1 = ["id", "age"] - h2 = ["id", "age"] - d1 = [["A1", 23], ["A2", 24], ["A3", 34]] - d2 = [["A2", 24], ["A1", 23], ["A3", 34]] + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A2", 24], ["A1", 23], ["A3", 34]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy') + assert result == output['results'] + assert values == output['values'] - result = [[True, True], [True, True], [True, True]] - values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']] - output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy') - assert result == output['results'] - assert values == output['values'] def test_extra_column(): - h1 = ["id", "age", "name"] - h2 
= ["id", "age"] - d1 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]] - d2 = [["A2", 24], ["A1", 23], ["A3", 34]] + h1 = ["id", "age", "name"] + h2 = ["id", "age"] + d1 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]] + d2 = [["A2", 24], ["A1", 23], ["A3", 34]] + + result = [[True, True, False], [True, True, False], [True, True, False]] + values = [['[A1]:[A1]', '[23]:[23]', '[Alpha]:[]'], ['[A2]:[A2]', '[24]:[24]', '[Beta]:[]'], ['[A3]:[A3]', '[34]:[34]', '[Gamma]:[]']] + output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy') + assert result == output['results'] + assert values == output['values'] - result = [[True, True, False], [True, True, False], [True, True, False]] - values = [['[A1]:[A1]', '[23]:[23]', '[Alpha]:[]'], ['[A2]:[A2]', '[24]:[24]', '[Beta]:[]'], ['[A3]:[A3]', '[34]:[34]', '[Gamma]:[]']] - output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy') - assert result == output['results'] - assert values == output['values'] def test_include_extra_rows(): - h1 = ["id", "age"] - h2 = ["id", "age"] - d1 = [["A1", 23], ["A2", 24], ["A3", 34]] - d2 = [["A1", 23], ["A2", 24], ["A3", 34],["A4", 34]] + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A1", 23], ["A2", 24], ["A3", 34], ["A4", 34]] + + result = [[True, True], [True, True], [True, True], [False, False]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]'], ['[]:[A4]', '[]:[34]']] + output = comparesv.run(d1, h1, d2, h2, include_addnl_rows=True) + assert result == output['results'] + assert values == output['values'] - result = [[True, True], [True, True], [True, True], [False, False]] - values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]'],['[]:[A4]', '[]:[34]']] - output = comparesv.run(d1, h1, d2, h2, include_addnl_rows=True) - assert result == output['results'] - assert values == output['values'] def test_include_extra_column(): - h1 = 
["id", "age"] - h2 = ["id", "age", "name"] - d1 = [["A2", 24], ["A1", 23], ["A3", 34]] - d2 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]] + h1 = ["id", "age"] + h2 = ["id", "age", "name"] + d1 = [["A2", 24], ["A1", 23], ["A3", 34]] + d2 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]] + + output = comparesv.run(d1, h1, d2, h2, include_addnl_columns=True) + result = [[False, False, False], [False, False, False], [True, True, False]] + values = [['[A2]:[A1]', '[24]:[23]', '[]:[Alpha]'], + ['[A1]:[A2]', '[23]:[24]', '[]:[Beta]'], + ['[A3]:[A3]', '[34]:[34]', '[]:[Gamma]']] - output = comparesv.run(d1, h1, d2, h2, include_addnl_columns=True) - result = [[False, False, False], [False, False, False], [True, True, False]] - values = [['[A2]:[A1]', '[24]:[23]', '[]:[Alpha]'], - ['[A1]:[A2]', '[23]:[24]', '[]:[Beta]'], - ['[A3]:[A3]', '[34]:[34]', '[]:[Gamma]']] + assert result == output['results'] + assert values == output['values'] - assert result == output['results'] - assert values == output['values'] def test_basic_case(): - h1 = ["id", "age"] - h2 = ["id", "age"] - d1 = [["A1", 23], ["A2", 24], ["A3", 34]] - d2 = [["a1", 23], ["a2", 24], ["a3", 34]] - - result = [[True, True], [True, True], [True, True]] - values = [['[A1]:[a1]', '[23]:[23]'], ['[A2]:[a2]', '[24]:[24]'], ['[A3]:[a3]', '[34]:[34]']] - output = comparesv.run(d1, h1, d2, h2, ignore_case=True) - assert result == output['results'] - assert values == output['values'] \ No newline at end of file + h1 = ["id", "age"] + h2 = ["id", "age"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["a1", 23], ["a2", 24], ["a3", 34]] + + result = [[True, True], [True, True], [True, True]] + values = [['[A1]:[a1]', '[23]:[23]'], ['[A2]:[a2]', '[24]:[24]'], ['[A3]:[a3]', '[34]:[34]']] + output = comparesv.run(d1, h1, d2, h2, ignore_case=True) + assert result == output['results'] + assert values == output['values'] + +def test_include_rows(): + h1 = ["id", "age"] + h2 = ["id", "age"] + 
d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A1", 23], ["A2", 24], ["A3", 34],["A4", 34]] + + result = [[True, True], [True, True], [True, True], [False, False]] + values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]'], ['[]:[A4]', '[]:[34]']] + output = comparesv.run(d1, h1, d2, h2, include_addnl_rows=True) + assert result == output['results'] + assert values == output['values'] + +def test_include_columns(): + h1 = ["id", "age"] + h2 = ["id", "age","gender"] + d1 = [["A1", 23], ["A2", 24], ["A3", 34]] + d2 = [["A1", 23,"M"], ["A2", 24,"F"], ["A3", 34,"O"]] + + result = [[True, True, False], [True, True, False], [True, True, False]] + values = [['[A1]:[A1]', '[23]:[23]', '[]:[M]'], + ['[A2]:[A2]', '[24]:[24]', '[]:[F]'], + ['[A3]:[A3]', '[34]:[34]', '[]:[O]']] + headers = ['id', 'age', 'gender'] + output = comparesv.run(d1, h1, d2, h2, include_addnl_columns=True) + assert result == output['results'] + assert values == output['values'] + assert headers == output['headers'] \ No newline at end of file diff --git a/version.py b/version.py index 47f3457..668e18e 100644 --- a/version.py +++ b/version.py @@ -1 +1 @@ -__version__ = 0.11 \ No newline at end of file +__version__ = 0.12 \ No newline at end of file