Skip to content

Commit

Permalink
updated tests & documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
kishorek committed Jun 22, 2020
1 parent 05c6f73 commit 16b5f6f
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 95 deletions.
47 changes: 46 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# comparesv
### CSV Comparison on steroids
### Python CSV Comparison on steroids

## Usage

Expand Down Expand Up @@ -42,5 +42,50 @@ optional arguments:

## Description

The first file is considered the source file and is compared against the second file. Use the options below to fine-tune how the comparison works.

### Row Match (-rm)

This defines how rows in the two files are paired up for comparison.

`order` - This is the default option. It compares rows by their position in each file. Use it when the records in both files are in the same order.

`fuzzy` - This uses fuzzy logic to identify the matching row in the second file. Use it when the records are not in the same order and most of the data is **text**.

`deep` - This uses fuzzy logic to identify the matching row in the second file. Use it when the records are not in the same order and the data is **numeric**. Each row in file1 is checked against all the rows in file2 to find a potential match.

### Column Match (-cm)

This defines how columns in the two files are paired up for comparison.

`exact` - This is the default option. It pairs columns whose headers match exactly, e.g. the 'Age' and 'Age' columns across the files will be selected for comparison.

`fuzzy` - This uses fuzzy logic to identify the matching column in the second file. Use it when the column headers across the files are not exactly the same but are reasonably close, e.g. 'age' and 'age of student' columns may be selected for comparison.

### String Match (-sm)

This defines how textual data is compared.

`exact` - This is the default option. It compares the text exactly.

`fuzzy` - This uses fuzzy logic to decide whether two texts are close enough to each other to count as a match.

### Include Additional Rows (-ir)

If the second file contains more rows than the first file, this option will enable the comparison output to include the remaining rows (uncompared ones).

### Include Additional Columns (-ic)

If the second file contains more columns than the first file, this option will enable the comparison output to include the remaining columns.

### Ignore case (-i)

This option will ignore the case while comparing the strings.

### Include Stats (-is)

This option is enabled by default. It prints the comparison stats (as percentages) to the console.

### Save Output (-s)

This option will save the result & values comparison in the current directory. This is enabled by default.
48 changes: 30 additions & 18 deletions comparesv.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,6 @@ def compare_data(data1, data2, headers1, headers2, matched_headers, **kwargs):
added_rows.append(row2)
elif mode == 'deleted':
deleted_rows.append(row1)
# else:
# common_rows.append(row1)
rows_output.append(row_compare_result)

if kwargs.get('include_addnl_rows'):
Expand Down Expand Up @@ -140,15 +138,15 @@ def exist_in_list(option, option_list):

def prepare_headers(data1, headers1, headers2, column_match):
mapped_headers_index = OrderedDict()
updated_indices2 = []
mapped_indices2 = []
for index, header in enumerate(headers1):
index = -1
if column_match == 'exact':
exists, index = exist_in_list(header, headers2)
elif column_match == 'fuzzy':
indices2_left = [x for x in range(len(headers2)) if x not in updated_indices2]
index = fuzzy_column_index(header, headers2)
updated_indices2.append(index)
unmapped_header_indices = [x for x in range(len(headers2)) if x not in mapped_indices2]
index = fuzzy_column_index(header, headers2, unmapped_header_indices)
mapped_indices2.append(index)

column_data = {}
column_data['index'] = index
Expand All @@ -160,15 +158,20 @@ def prepare_headers(data1, headers1, headers2, column_match):
return mapped_headers_index


def fuzzy_column_index(header, headers_list, unmapped_header_indices):
    """Return the position in ``headers_list`` that best matches ``header``.

    Only the positions listed in ``unmapped_header_indices`` are candidates,
    so a column of the second file is never selected twice.

    Args:
        header: Header text from the first file.
        headers_list: All headers of the second file.
        unmapped_header_indices: Positions in ``headers_list`` that are
            still available for matching.

    Returns:
        The index into ``headers_list`` of the chosen header, or -1 when no
        candidate is left or the best fuzzy score is below ``ROW_THRESHOLD``.
    """
    # Nothing left to match against (e.g. the first file has more columns
    # than the second): report "no match" instead of crashing further down.
    if not unmapped_header_indices:
        return -1

    # Build the candidates in the same order as the index list so that
    # candidate position i always maps back to unmapped_header_indices[i].
    unmapped_headers = [headers_list[i] for i in unmapped_header_indices]

    # An exact hit among the remaining headers wins over any fuzzy score.
    exist, index = exist_in_list(header, unmapped_headers)
    if exist:
        return unmapped_header_indices[index]

    highest = process.extractOne(header, unmapped_headers)
    # NOTE(review): `process` looks like fuzzywuzzy's module, whose
    # extractOne can return None when it finds no result - confirm.
    if highest is None or highest[1] < ROW_THRESHOLD:
        return -1

    unmapped_index = unmapped_headers.index(highest[0])
    return unmapped_header_indices[unmapped_index]


def deep_row_find(row, data2, headers1, headers2, matched_headers, data2_indices_left, opts):
Expand All @@ -191,17 +194,17 @@ def deep_row_find(row, data2, headers1, headers2, matched_headers, data2_indices

return selected_row, selected_index


def fuzzy_row_find(row, data2, headers1, matched_headers, unmapped_indices2):
    """Find the not-yet-matched row of ``data2`` that best resembles ``row``.

    Each row is flattened to a space-separated string and the candidates are
    scored against the flattened ``row`` with ``process.extractOne``.

    Args:
        row: Row from the first file.
        data2: All rows of the second file.
        headers1: Unused here; kept for signature compatibility.
        matched_headers: Unused here; kept for signature compatibility.
        unmapped_indices2: Positions in ``data2`` that are still available.

    Returns:
        ``(matched_row, index_in_data2)``, or ``(None, None)`` when no
        candidate is left or the best score is below ``ROW_THRESHOLD``.
    """
    # No candidates left: there is nothing to score, so report "no match".
    if not unmapped_indices2:
        return None, None

    row1 = ' '.join(str(x) for x in row)
    # Build the candidates in the same order as the index list so that
    # candidate position i always maps back to unmapped_indices2[i].
    unmapped_data2 = [' '.join(str(x) for x in data2[i]) for i in unmapped_indices2]

    highest = process.extractOne(row1, unmapped_data2)
    # NOTE(review): `process` looks like fuzzywuzzy's module, whose
    # extractOne can return None when it finds no result - confirm.
    if highest is None or highest[1] < ROW_THRESHOLD:
        return None, None

    # Translate the position among the candidates back to a data2 index.
    index = unmapped_data2.index(highest[0])
    original_index = unmapped_indices2[index]
    return data2[original_index], original_index


def compare_rows(row1, row2, header_index, headers2, opts):
Expand Down Expand Up @@ -289,4 +292,13 @@ def predict_column_type(data):
elif int in data_types:
return "int"
else:
return "str"
return "str"

# Manual smoke run. Guarded so that importing this module (as tests.py does)
# neither runs a comparison nor prints to the console.
if __name__ == "__main__":
    from pprint import pprint

    h1 = ["id", "age"]
    h2 = ["id", "age", "gender"]
    d1 = [["A1", 23], ["A2", 24], ["A3", 34]]
    d2 = [["A1", 23, "M"], ["A2", 24, "F"], ["A3", 34, "O"]]

    # include_addnl_columns is an on/off flag, so pass a boolean rather than
    # the truthy string 'fuzzy'.
    output = run(d1, h1, d2, h2, include_addnl_columns=True)
    pprint(output)
186 changes: 111 additions & 75 deletions tests.py
Original file line number Diff line number Diff line change
@@ -1,100 +1,136 @@
import comparesv


def test_basic():
    """Identical headers and rows in the same order match cell for cell."""
    source_headers = ["id", "age"]
    target_headers = ["id", "age"]
    source_rows = [["A1", 23], ["A2", 24], ["A3", 34]]
    target_rows = [["A1", 23], ["A2", 24], ["A3", 34]]

    expected_results = [[True, True], [True, True], [True, True]]
    expected_values = [
        ['[A1]:[A1]', '[23]:[23]'],
        ['[A2]:[A2]', '[24]:[24]'],
        ['[A3]:[A3]', '[34]:[34]'],
    ]

    output = comparesv.run(source_rows, source_headers, target_rows, target_headers)

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_column_order():
    """Columns that are swapped in the second file are still paired by header."""
    source_headers = ["id", "age"]
    target_headers = ["age", "id"]
    source_rows = [["A1", 23], ["A2", 24], ["A3", 34]]
    target_rows = [[23, "A1"], [24, "A2"], [34, "A3"]]

    expected_results = [[True, True], [True, True], [True, True]]
    expected_values = [
        ['[A1]:[A1]', '[23]:[23]'],
        ['[A2]:[A2]', '[24]:[24]'],
        ['[A3]:[A3]', '[34]:[34]'],
    ]

    output = comparesv.run(source_rows, source_headers, target_rows, target_headers)

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_fuzzy_column_order():
    """Fuzzy column matching pairs similar headers without reusing a column.

    'age' and 'building age' both resemble 'age of student'; the expected
    output requires each header of the second file to be used only once.
    """
    h1 = ["id", "age", "building age"]
    h2 = ["age of student", "identity", "building age"]
    d1 = [["A1", 23, 100], ["A2", 24, 100], ["A3", 34, 100]]
    d2 = [[23, "A1", 100], [24, "A2", 100], [34, "A3", 100]]

    # Expectations are three columns wide to match the fixture above.
    result = [[True, True, True], [True, True, True], [True, True, True]]
    values = [
        ['[A1]:[A1]', '[23]:[23]', '[100]:[100]'],
        ['[A2]:[A2]', '[24]:[24]', '[100]:[100]'],
        ['[A3]:[A3]', '[34]:[34]', '[100]:[100]'],
    ]

    output = comparesv.run(d1, h1, d2, h2, column_match='fuzzy')

    assert result == output['results']
    assert values == output['values']

def test_row_order_fuzzy():
    """Fuzzy row matching pairs rows that appear in a different order."""
    source_headers = ["id", "age"]
    target_headers = ["id", "age"]
    source_rows = [["A1", 23], ["A2", 24], ["A3", 34]]
    target_rows = [["A2", 24], ["A1", 23], ["A3", 34]]

    expected_results = [[True, True], [True, True], [True, True]]
    expected_values = [
        ['[A1]:[A1]', '[23]:[23]'],
        ['[A2]:[A2]', '[24]:[24]'],
        ['[A3]:[A3]', '[34]:[34]'],
    ]

    output = comparesv.run(
        source_rows, source_headers, target_rows, target_headers, row_match='fuzzy'
    )

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_extra_column():
    """A column present only in the first file is reported as a mismatch."""
    source_headers = ["id", "age", "name"]
    target_headers = ["id", "age"]
    source_rows = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]]
    target_rows = [["A2", 24], ["A1", 23], ["A3", 34]]

    expected_results = [[True, True, False], [True, True, False], [True, True, False]]
    expected_values = [
        ['[A1]:[A1]', '[23]:[23]', '[Alpha]:[]'],
        ['[A2]:[A2]', '[24]:[24]', '[Beta]:[]'],
        ['[A3]:[A3]', '[34]:[34]', '[Gamma]:[]'],
    ]

    output = comparesv.run(
        source_rows, source_headers, target_rows, target_headers, row_match='fuzzy'
    )

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_include_extra_rows():
    """With include_addnl_rows, a surplus row of the second file is appended."""
    source_headers = ["id", "age"]
    target_headers = ["id", "age"]
    source_rows = [["A1", 23], ["A2", 24], ["A3", 34]]
    target_rows = [["A1", 23], ["A2", 24], ["A3", 34], ["A4", 34]]

    # The unmatched fourth row is expected as all-False with an empty left side.
    expected_results = [[True, True], [True, True], [True, True], [False, False]]
    expected_values = [
        ['[A1]:[A1]', '[23]:[23]'],
        ['[A2]:[A2]', '[24]:[24]'],
        ['[A3]:[A3]', '[34]:[34]'],
        ['[]:[A4]', '[]:[34]'],
    ]

    output = comparesv.run(
        source_rows, source_headers, target_rows, target_headers, include_addnl_rows=True
    )

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_include_extra_column():
    """With include_addnl_columns, a surplus column of the second file is appended.

    No row_match is passed and the first two rows are in a different order,
    so only the third row is expected to match on 'id' and 'age'.
    """
    source_headers = ["id", "age"]
    target_headers = ["id", "age", "name"]
    source_rows = [["A2", 24], ["A1", 23], ["A3", 34]]
    target_rows = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]]

    expected_results = [
        [False, False, False],
        [False, False, False],
        [True, True, False],
    ]
    expected_values = [
        ['[A2]:[A1]', '[24]:[23]', '[]:[Alpha]'],
        ['[A1]:[A2]', '[23]:[24]', '[]:[Beta]'],
        ['[A3]:[A3]', '[34]:[34]', '[]:[Gamma]'],
    ]

    output = comparesv.run(
        source_rows, source_headers, target_rows, target_headers, include_addnl_columns=True
    )

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_basic_case():
    """With ignore_case, ids that differ only in letter case compare equal."""
    source_headers = ["id", "age"]
    target_headers = ["id", "age"]
    source_rows = [["A1", 23], ["A2", 24], ["A3", 34]]
    target_rows = [["a1", 23], ["a2", 24], ["a3", 34]]

    expected_results = [[True, True], [True, True], [True, True]]
    # The reported values are expected to keep each file's original casing.
    expected_values = [
        ['[A1]:[a1]', '[23]:[23]'],
        ['[A2]:[a2]', '[24]:[24]'],
        ['[A3]:[a3]', '[34]:[34]'],
    ]

    output = comparesv.run(
        source_rows, source_headers, target_rows, target_headers, ignore_case=True
    )

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_include_rows():
    """With include_addnl_rows, the extra fourth row of the second file is reported."""
    source_headers = ["id", "age"]
    target_headers = ["id", "age"]
    source_rows = [["A1", 23], ["A2", 24], ["A3", 34]]
    target_rows = [["A1", 23], ["A2", 24], ["A3", 34], ["A4", 34]]

    expected_results = [[True, True], [True, True], [True, True], [False, False]]
    expected_values = [
        ['[A1]:[A1]', '[23]:[23]'],
        ['[A2]:[A2]', '[24]:[24]'],
        ['[A3]:[A3]', '[34]:[34]'],
        ['[]:[A4]', '[]:[34]'],
    ]

    output = comparesv.run(
        source_rows, source_headers, target_rows, target_headers, include_addnl_rows=True
    )

    assert expected_results == output['results']
    assert expected_values == output['values']

def test_include_columns():
    """With include_addnl_columns, the extra 'gender' column shows up in the output.

    Checks the per-cell results, the paired values and the reported headers.
    """
    source_headers = ["id", "age"]
    target_headers = ["id", "age", "gender"]
    source_rows = [["A1", 23], ["A2", 24], ["A3", 34]]
    target_rows = [["A1", 23, "M"], ["A2", 24, "F"], ["A3", 34, "O"]]

    expected_results = [[True, True, False], [True, True, False], [True, True, False]]
    expected_values = [
        ['[A1]:[A1]', '[23]:[23]', '[]:[M]'],
        ['[A2]:[A2]', '[24]:[24]', '[]:[F]'],
        ['[A3]:[A3]', '[34]:[34]', '[]:[O]'],
    ]
    expected_headers = ['id', 'age', 'gender']

    output = comparesv.run(
        source_rows, source_headers, target_rows, target_headers, include_addnl_columns=True
    )

    assert expected_results == output['results']
    assert expected_values == output['values']
    assert expected_headers == output['headers']
2 changes: 1 addition & 1 deletion version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = 0.11
__version__ = 0.12

0 comments on commit 16b5f6f

Please sign in to comment.