From c5062f8700b05ff40297a2ccc0f0ca9b6cd1d0ef Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 13 Mar 2024 08:01:26 -0400 Subject: [PATCH 01/24] prelim tree string code - move table creation to separate function --- chispa/schema_comparer.py | 41 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index c341988..f932a00 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -3,10 +3,27 @@ import chispa.six as six +class SchemasNotEqualErrorWide(Exception): + """The schemas are not equal""" + pass + class SchemasNotEqualError(Exception): """The schemas are not equal""" pass +def create_schema_comparison_tree(s1, s2): + pass + +def create_schema_comparison_table(s1, s2): + t = PrettyTable(["schema1", "schema2"]) + zipped = list(six.moves.zip_longest(s1, s2)) + for sf1, sf2 in zipped: + if are_structfields_equal(sf1, sf2, True): + t.add_row([blue(sf1), blue(sf2)]) + else: + t.add_row([sf1, sf2]) + return t + def assert_schema_equality(s1, s2, ignore_nullable=False, ignore_metadata=False): if not ignore_nullable and not ignore_metadata: @@ -26,13 +43,7 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): return True if not inner(s1, s2, ignore_nullable, ignore_metadata): - t = PrettyTable(["schema1", "schema2"]) - zipped = list(six.moves.zip_longest(s1, s2)) - for sf1, sf2 in zipped: - if are_structfields_equal(sf1, sf2, True): - t.add_row([blue(sf1), blue(sf2)]) - else: - t.add_row([sf1, sf2]) + t = create_schema_comparison_table(s1, s2) raise SchemasNotEqualError("\n" + t.get_string()) @@ -41,13 +52,7 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): # I think schema equality operations are really fast to begin with def assert_basic_schema_equality(s1, s2): if s1 != s2: - t = PrettyTable(["schema1", "schema2"]) - zipped = list(six.moves.zip_longest(s1, s2)) - for sf1, sf2 in zipped: - if sf1 == sf2: - t.add_row([blue(sf1), blue(sf2)]) - else: - t.add_row([sf1, sf2]) + t = create_schema_comparison_table(s1, s2) raise SchemasNotEqualError("\n" + t.get_string()) @@ -55,13 +60,7 @@ def assert_basic_schema_equality(s1, s2): # deprecate this. ignore_nullable should be a flag. def assert_schema_equality_ignore_nullable(s1, s2): if not are_schemas_equal_ignore_nullable(s1, s2): - t = PrettyTable(["schema1", "schema2"]) - zipped = list(six.moves.zip_longest(s1, s2)) - for sf1, sf2 in zipped: - if are_structfields_equal(sf1, sf2, True): - t.add_row([blue(sf1), blue(sf2)]) - else: - t.add_row([sf1, sf2]) + t = create_schema_comparison_table(s1, s2) raise SchemasNotEqualError("\n" + t.get_string()) From 1babd2c443c3a8307fb53645d8bddd9dcf104642 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 13 Mar 2024 08:01:41 -0400 Subject: [PATCH 02/24] add prelim test for tree string --- tests/test_schema_comparer.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index f679fb0..a294496 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -38,6 +38,37 @@ def it_throws_when_schema_lengths_differ(): assert_schema_equality(s1, s2) + def it_throws_as_tree_string_for_wide_schemas(): + # create a deeply nested schema + s1 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField("fav_colors", StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True) + ])) + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField("fav_colors", StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True) + ])) + ]) + with pytest.raises(SchemasNotEqualError) as e_info: + assert_schema_equality(s1, s2) + print(e_info.value) + + + def describe_assert_schema_equality_ignore_nullable(): def it_has_good_error_messages_for_different_sized_schemas(): s1 = StructType([ From a2c962f77a2fef7fe108eda3afcd7556799df744 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 13 Mar 2024 20:02:52 -0400 Subject: [PATCH 03/24] add prelim tree string code --- chispa/schema_comparer.py | 75 +++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index f932a00..d0d83ac 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -3,16 +3,47 @@ import chispa.six as six -class SchemasNotEqualErrorWide(Exception): +class SchemasNotEqualError(Exception): """The schemas are not equal""" + pass -class SchemasNotEqualError(Exception): - """The schemas are not equal""" - pass -def create_schema_comparison_tree(s1, s2): - pass +def create_schema_comparison_tree(s1, s2) -> str: + def create_schema_tree(s, indent: int, horizontal_char="-", title: str = "") -> str: + if title: + tree_string = title + "\n" + else: + tree_string = "" + + for sf in s: + nullable = "(nullable = true)" if sf.nullable else "(nullable = false)" + tree_string += f"|{horizontal_char * indent}{sf.name}: {sf.dataType.typeName()} {nullable}\n" + if sf.dataType.typeName() == "struct": + tree_string += create_schema_tree( + sf.dataType, indent + 2, horizontal_char, "" + ) + return tree_string + + tree_space = 6 + horizontal_char = "-" + tree_string = "" + + s1_tree = create_schema_tree(s1, 0, horizontal_char, "schema1") + print(s1_tree) + s1_tree_list = s1_tree.split("\n") + widest_line = max(len(line) for line in s1_tree_list) + s2_padding = widest_line + tree_space + + s2_tree = create_schema_tree(s2, 0, horizontal_char, "schema2") + print(s2_tree) + + tree_string += "schema1\n" + tree_string += s1_tree + tree_string += "schema2\n" + tree_string += s2_tree + return tree_string + def create_schema_comparison_table(s1, s2): t = PrettyTable(["schema1", "schema2"]) @@ -25,6 +56,24 @@ def create_schema_comparison_table(s1, s2): return t +def check_if_schemas_are_wide(s1, s2) -> bool: + contains_nested_structs = any( + sf.dataType.typeName() == "struct" for sf in s1 + ) or any(sf.dataType.typeName() == "struct" for sf in s2) + contains_many_columns = len(s1) > 10 or len(s2) > 10 + return contains_nested_structs or contains_many_columns + + +def handle_schemas_not_equal(s1, s2): + schemas_are_wide = check_if_schemas_are_wide(s1, s2) + if schemas_are_wide: + error_message = create_schema_comparison_tree(s1, s2) + else: + t = create_schema_comparison_table(s1, s2) + error_message = "\n" + t.get_string() + raise SchemasNotEqualError(error_message) + + def assert_schema_equality(s1, s2, ignore_nullable=False, ignore_metadata=False): if not ignore_nullable and not ignore_metadata: assert_basic_schema_equality(s1, s2) @@ -43,8 +92,7 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): return True if not inner(s1, s2, ignore_nullable, ignore_metadata): - t = create_schema_comparison_table(s1, s2) - raise SchemasNotEqualError("\n" + t.get_string()) + handle_schemas_not_equal(s1, s2) # deprecate this @@ -52,16 +100,13 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): # I think schema equality operations are really fast to begin with def assert_basic_schema_equality(s1, s2): if s1 != s2: - t = create_schema_comparison_table(s1, s2) - raise SchemasNotEqualError("\n" + t.get_string()) - + handle_schemas_not_equal(s1, s2) # deprecate this. ignore_nullable should be a flag. def assert_schema_equality_ignore_nullable(s1, s2): if not are_schemas_equal_ignore_nullable(s1, s2): - t = create_schema_comparison_table(s1, s2) - raise SchemasNotEqualError("\n" + t.get_string()) + handle_schemas_not_equal(s1, s2) # deprecate this. ignore_nullable should be a flag. @@ -100,9 +145,9 @@ def are_datatypes_equal_ignore_nullable(dt1, dt2): """ if dt1.typeName() == dt2.typeName(): # Account for array types by inspecting elementType. - if dt1.typeName() == 'array': + if dt1.typeName() == "array": return are_datatypes_equal_ignore_nullable(dt1.elementType, dt2.elementType) - elif dt1.typeName() == 'struct': + elif dt1.typeName() == "struct": return are_schemas_equal_ignore_nullable(dt1, dt2) else: return True From 385f140aab4080672f92d208ceaa798b533b16fa Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 13 Mar 2024 21:58:16 -0400 Subject: [PATCH 04/24] add side by side schema tree string --- chispa/schema_comparer.py | 47 +++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index d0d83ac..551095b 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -10,39 +10,38 @@ class SchemasNotEqualError(Exception): def create_schema_comparison_tree(s1, s2) -> str: - def create_schema_tree(s, indent: int, horizontal_char="-", title: str = "") -> str: - if title: - tree_string = title + "\n" - else: - tree_string = "" - + def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: + tree_string = [] for sf in s: nullable = "(nullable = true)" if sf.nullable else "(nullable = false)" - tree_string += f"|{horizontal_char * indent}{sf.name}: {sf.dataType.typeName()} {nullable}\n" + tree_string += [ + f"|{horizontal_char * indent} {sf.name}: {sf.dataType.typeName()} {nullable}" + ] if sf.dataType.typeName() == "struct": tree_string += create_schema_tree( - sf.dataType, indent + 2, horizontal_char, "" + sf.dataType, indent + 2, horizontal_char ) return tree_string tree_space = 6 horizontal_char = "-" - tree_string = "" - - s1_tree = create_schema_tree(s1, 0, horizontal_char, "schema1") - print(s1_tree) - s1_tree_list = s1_tree.split("\n") - widest_line = max(len(line) for line in s1_tree_list) - s2_padding = widest_line + tree_space - - s2_tree = create_schema_tree(s2, 0, horizontal_char, "schema2") - print(s2_tree) - - tree_string += "schema1\n" - tree_string += s1_tree - tree_string += "schema2\n" - tree_string += s2_tree - return tree_string + + s1_tree = create_schema_tree(s1, 2, horizontal_char) + s2_tree = create_schema_tree(s2, 2, horizontal_char) + + widest_line = max(len(line) for line in s1_tree) + tree_string_combined = "schema1".ljust(widest_line + tree_space) + "schema2\n" + for i in range(max(len(s1_tree), len(s2_tree))): + if i < len(s1_tree): + line1 = s1_tree[i] + else: + line1 = " " * widest_line + if i < len(s2_tree): + line2 = s2_tree[i] + else: + line2 = "" + tree_string_combined += line1.ljust(widest_line + tree_space) + line2 + "\n" + return tree_string_combined def create_schema_comparison_table(s1, s2): From 19082fd1fd647e6c952a1332628a31f3c50b5f30 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 13 Mar 2024 23:01:41 -0400 Subject: [PATCH 05/24] improve tree appearance --- chispa/schema_comparer.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 551095b..0a21ed8 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -15,31 +15,29 @@ def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: for sf in s: nullable = "(nullable = true)" if sf.nullable else "(nullable = false)" tree_string += [ - f"|{horizontal_char * indent} {sf.name}: {sf.dataType.typeName()} {nullable}" + f"{indent * ' '}|{horizontal_char * 2} {sf.name}: {sf.dataType.typeName()} {nullable}" ] if sf.dataType.typeName() == "struct": tree_string += create_schema_tree( - sf.dataType, indent + 2, horizontal_char + sf.dataType, indent + 4, horizontal_char ) return tree_string tree_space = 6 horizontal_char = "-" - s1_tree = create_schema_tree(s1, 2, horizontal_char) - s2_tree = create_schema_tree(s2, 2, horizontal_char) + s1_tree = create_schema_tree(s1, 0, horizontal_char) + s2_tree = create_schema_tree(s2, 0, horizontal_char) widest_line = max(len(line) for line in s1_tree) tree_string_combined = "schema1".ljust(widest_line + tree_space) + "schema2\n" for i in range(max(len(s1_tree), len(s2_tree))): + line1 = "" + line2 = "" if i < len(s1_tree): line1 = s1_tree[i] - else: - line1 = " " * widest_line if i < len(s2_tree): line2 = s2_tree[i] - else: - line2 = "" tree_string_combined += line1.ljust(widest_line + tree_space) + line2 + "\n" return tree_string_combined From 4f624674ffae984a28f4f02382a53bb32c6e282b Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sat, 16 Mar 2024 08:59:18 -0400 Subject: [PATCH 06/24] simplify formatting --- chispa/schema_comparer.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 0a21ed8..54894f1 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -30,7 +30,7 @@ def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: s2_tree = create_schema_tree(s2, 0, horizontal_char) widest_line = max(len(line) for line in s1_tree) - tree_string_combined = "schema1".ljust(widest_line + tree_space) + "schema2\n" + tree_string_combined = "\n\nschema1".ljust(widest_line + tree_space) + "schema2\n" for i in range(max(len(s1_tree), len(s2_tree))): line1 = "" line2 = "" @@ -38,7 +38,18 @@ def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: line1 = s1_tree[i] if i < len(s2_tree): line2 = s2_tree[i] - tree_string_combined += line1.ljust(widest_line + tree_space) + line2 + "\n" + + tree_string_line = line1.ljust(widest_line + tree_space) + line2 + + if line1 == line2: + tree_string_line = line_blue(tree_string_line) + + else: + tree_string_line = line_red(tree_string_line) + + tree_string_combined += tree_string_line + "\n" + + tree_string_combined += bcolors.LightBlue return tree_string_combined From 8468412a838598c24a2039e68e9c58719b8f4c16 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sat, 16 Mar 2024 08:59:35 -0400 Subject: [PATCH 07/24] add helper functions + autoformat --- chispa/bcolors.py | 64 +++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/chispa/bcolors.py b/chispa/bcolors.py index bbbb930..e5a4fcb 100644 --- a/chispa/bcolors.py +++ b/chispa/bcolors.py @@ -1,40 +1,50 @@ class bcolors: - NC = '\033[0m' # No Color, reset all - - Bold = '\033[1m' - Underlined = '\033[4m' - Blink = '\033[5m' - Inverted = '\033[7m' - Hidden = '\033[8m' - - Black = '\033[30m' - Red = '\033[31m' - Green = '\033[32m' - Yellow = '\033[33m' - Blue = '\033[34m' - Purple = '\033[35m' - Cyan = '\033[36m' - LightGray = '\033[37m' - DarkGray = '\033[30m' - LightRed = '\033[31m' - LightGreen = '\033[32m' - LightYellow = '\033[93m' - LightBlue = '\033[34m' - LightPurple = '\033[35m' - LightCyan = '\033[36m' - White = '\033[97m' + NC = "\033[0m" # No Color, reset all + + Bold = "\033[1m" + Underlined = "\033[4m" + Blink = "\033[5m" + Inverted = "\033[7m" + Hidden = "\033[8m" + + Black = "\033[30m" + Red = "\033[31m" + Green = "\033[32m" + Yellow = "\033[33m" + Blue = "\033[34m" + Purple = "\033[35m" + Cyan = "\033[36m" + LightGray = "\033[37m" + DarkGray = "\033[30m" + LightRed = "\033[31m" + LightGreen = "\033[32m" + LightYellow = "\033[93m" + LightBlue = "\033[34m" + LightPurple = "\033[35m" + LightCyan = "\033[36m" + White = "\033[97m" # Style - Bold = '\033[1m' - Underline = '\033[4m' + Bold = "\033[1m" + Underline = "\033[4m" def blue(s: str) -> str: return bcolors.LightBlue + str(s) + bcolors.LightRed +def line_blue(s: str) -> str: + return bcolors.LightBlue + s + bcolors.NC + + +def line_red(s: str) -> str: + return bcolors.LightRed + s + bcolors.NC + + def underline_text(input_text: str) -> str: """ Takes an input string and returns a white, underlined string (based on PrettyTable formatting) """ - return bcolors.White + bcolors.Underline + input_text + bcolors.NC + bcolors.LightRed + return ( + bcolors.White + bcolors.Underline + input_text + bcolors.NC + bcolors.LightRed + ) From 8ffa73f0c22f8d48da9a195df74ff3660f89e34b Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sat, 16 Mar 2024 09:52:49 -0400 Subject: [PATCH 08/24] handle ignore nullable in tree string --- chispa/schema_comparer.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 54894f1..71c52e1 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -9,17 +9,19 @@ class SchemasNotEqualError(Exception): pass -def create_schema_comparison_tree(s1, s2) -> str: +def create_schema_comparison_tree(s1, s2, ignore_nullable: bool) -> str: def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: tree_string = [] - for sf in s: - nullable = "(nullable = true)" if sf.nullable else "(nullable = false)" + for struct_field in s: + nullable = ( + "(nullable = true)" if struct_field.nullable else "(nullable = false)" + ) tree_string += [ - f"{indent * ' '}|{horizontal_char * 2} {sf.name}: {sf.dataType.typeName()} {nullable}" + f"{indent * ' '}|{horizontal_char * 2} {struct_field.name}: {struct_field.dataType.typeName()} {nullable}" ] - if sf.dataType.typeName() == "struct": + if struct_field.dataType.typeName() == "struct": tree_string += create_schema_tree( - sf.dataType, indent + 4, horizontal_char + struct_field.dataType, indent + 4, horizontal_char ) return tree_string @@ -41,7 +43,7 @@ def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: tree_string_line = line1.ljust(widest_line + tree_space) + line2 - if line1 == line2: + if are_schema_strings_equal(line1, line2, ignore_nullable): tree_string_line = line_blue(tree_string_line) else: @@ -53,6 +55,19 @@ def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: return tree_string_combined +def are_schema_strings_equal(s1: str, s2: str, ignore_nullable: bool) -> bool: + if not ignore_nullable: + return s1 == s2 + + s1_no_nullable = s1.replace("(nullable = true)", "").replace( + "(nullable = false)", "" + ) + s2_no_nullable = s2.replace("(nullable = true)", "").replace( + "(nullable = false)", "" + ) + return s1_no_nullable == s2_no_nullable + + def create_schema_comparison_table(s1, s2): t = PrettyTable(["schema1", "schema2"]) zipped = list(six.moves.zip_longest(s1, s2)) @@ -72,10 +87,10 @@ def check_if_schemas_are_wide(s1, s2) -> bool: return contains_nested_structs or contains_many_columns -def handle_schemas_not_equal(s1, s2): +def handle_schemas_not_equal(s1, s2, ignore_nullable: bool): schemas_are_wide = check_if_schemas_are_wide(s1, s2) if schemas_are_wide: - error_message = create_schema_comparison_tree(s1, s2) + error_message = create_schema_comparison_tree(s1, s2, ignore_nullable) else: t = create_schema_comparison_table(s1, s2) error_message = "\n" + t.get_string() @@ -100,7 +115,7 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): return True if not inner(s1, s2, ignore_nullable, ignore_metadata): - handle_schemas_not_equal(s1, s2) + handle_schemas_not_equal(s1, s2, ignore_nullable) # deprecate this @@ -108,13 +123,13 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): # I think schema equality operations are really fast to begin with def assert_basic_schema_equality(s1, s2): if s1 != s2: - handle_schemas_not_equal(s1, s2) + handle_schemas_not_equal(s1, s2, ignore_nullable=False) # deprecate this. ignore_nullable should be a flag. def assert_schema_equality_ignore_nullable(s1, s2): if not are_schemas_equal_ignore_nullable(s1, s2): - handle_schemas_not_equal(s1, s2) + handle_schemas_not_equal(s1, s2, ignore_nullable=True) # deprecate this. ignore_nullable should be a flag. From 4c5cbd5ea588909d878ad0d4a08e805aaa7713f4 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sat, 16 Mar 2024 22:39:33 -0400 Subject: [PATCH 09/24] add print output tests --- .../it_prints_correctly_for_wide_schemas.txt | 1 + ...tly_for_wide_schemas_different_lengths.txt | 1 + ...ectly_for_wide_schemas_ignore_nullable.txt | 1 + ...r_wide_schemas_multiple_nested_structs.txt | 1 + tests/test_schema_comparer.py | 522 ++++++++++++++---- 5 files changed, 408 insertions(+), 118 deletions(-) create mode 100644 tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt create mode 100644 tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt create mode 100644 tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt create mode 100644 tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt new file mode 100644 index 0000000..6133f70 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt @@ -0,0 +1 @@ +'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[34m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt new file mode 100644 index 0000000..d6be3c0 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt @@ -0,0 +1 @@ +'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[34m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt new file mode 100644 index 0000000..6133f70 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt @@ -0,0 +1 @@ +'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[34m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt new file mode 100644 index 0000000..9583d5c --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt @@ -0,0 +1 @@ +'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[34m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index a294496..ebe8a73 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -6,152 +6,406 @@ def describe_assert_schema_equality(): def it_does_nothing_when_equal(): - s1 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) assert_schema_equality(s1, s2) - def it_throws_when_column_names_differ(): - s1 = StructType([ - StructField("HAHA", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) + s1 = StructType( + [ + StructField("HAHA", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) with pytest.raises(SchemasNotEqualError) as e_info: assert_schema_equality(s1, s2) - def it_throws_when_schema_lengths_differ(): - s1 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True)]) + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + ] + ) with pytest.raises(SchemasNotEqualError) as e_info: assert_schema_equality(s1, s2) - def it_throws_as_tree_string_for_wide_schemas(): - # create a deeply nested schema - s1 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField("fav_colors", StructType([ - StructField("red", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("blue", IntegerType(), True) - ])) - ]) - - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField("fav_colors", StructType([ - StructField("orange", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("yellow", IntegerType(), True) - ])) - ]) - with pytest.raises(SchemasNotEqualError) as e_info: - assert_schema_equality(s1, s2) - print(e_info.value) - +def describe_tree_string(): + def it_prints_correctly_for_wide_schemas(): + with open( + "tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt" + ) as f: + expected = f.read() + + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ] + ), + ), + ] + ) + + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ] + ), + ), + ] + ) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False) + assert repr(result) == expected + + def it_prints_correctly_for_wide_schemas_multiple_nested_structs(): + with open( + "tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt" + ) as f: + expected = f.read() + + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField( + "fav_genres", + StructType( + [ + StructField( + "rock", + StructType( + [ + StructField("metal", IntegerType(), True), + StructField("punk", IntegerType(), True), + ] + ), + True, + ), + StructField( + "electronic", + StructType( + [ + StructField("house", IntegerType(), True), + StructField("dubstep", IntegerType(), True), + ] + ), + True, + ), + ] + ), + ), + ] + ) + + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField( + "fav_genres", + StructType( + [ + StructField( + "rock", + StructType( + [ + StructField("metal", IntegerType(), True), + StructField("classic", IntegerType(), True), + ] + ), + True, + ), + StructField( + "electronic", + StructType( + [ + StructField("house", IntegerType(), True), + StructField("dubstep", IntegerType(), True), + ] + ), + True, + ), + StructField( + "pop", + StructType( + [ + StructField("pop", IntegerType(), True), + ] + ), + True, + ), + ] + ), + ), + ] + ) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False) + assert repr(result) == expected + + def it_prints_correctly_for_wide_schemas_ignore_nullable(): + with open( + "tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt" + ) as f: + expected = f.read() + + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ] + ), + ), + ] + ) + + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ] + ), + ), + ] + ) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=True) + assert repr(result) == expected + + def it_prints_correctly_for_wide_schemas_different_lengths(): + with open( + "tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt" + ) as f: + expected = f.read() + + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ] + ), + ), + ] + ) + + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + StructField("purple", IntegerType(), True), + ] + ), + ), + StructField("phone_number", StringType(), True), + ] + ) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False) + assert repr(result) == expected def describe_assert_schema_equality_ignore_nullable(): def it_has_good_error_messages_for_different_sized_schemas(): - s1 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("name", StringType(), False), - StructField("age", IntegerType(), True), - StructField("something", IntegerType(), True), - StructField("else", IntegerType(), True) - ]) + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), False), + StructField("age", IntegerType(), True), + StructField("something", IntegerType(), True), + StructField("else", IntegerType(), True), + ] + ) with pytest.raises(SchemasNotEqualError) as e_info: assert_schema_equality_ignore_nullable(s1, s2) - def it_does_nothing_when_equal(): - s1 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) assert_schema_equality_ignore_nullable(s1, s2) - def it_does_nothing_when_only_nullable_flag_is_different(): - s1 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), False)]) + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), False), + ] + ) assert_schema_equality_ignore_nullable(s1, s2) def describe_are_schemas_equal_ignore_nullable(): def it_returns_true_when_only_nullable_flag_is_different(): - s1 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("coords", ArrayType(DoubleType(), True), True), - ]) - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), False), - StructField("coords", ArrayType(DoubleType(), True), False), - ]) + s1 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("coords", ArrayType(DoubleType(), True), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), False), + StructField("coords", ArrayType(DoubleType(), True), False), + ] + ) assert are_schemas_equal_ignore_nullable(s1, s2) == True - def it_returns_true_when_only_nullable_flag_is_different_within_array_element(): s1 = StructType([StructField("coords", ArrayType(DoubleType(), True), True)]) s2 = StructType([StructField("coords", ArrayType(DoubleType(), False), True)]) assert are_schemas_equal_ignore_nullable(s1, s2) == True def it_returns_true_when_only_nullable_flag_is_different_within_nested_array_element(): - s1 = StructType([StructField("coords", ArrayType(ArrayType(DoubleType(), True), True), True)]) - s2 = StructType([StructField("coords", ArrayType(ArrayType(DoubleType(), False), True), True)]) + s1 = StructType( + [ + StructField( + "coords", ArrayType(ArrayType(DoubleType(), True), True), True + ) + ] + ) + s2 = StructType( + [ + StructField( + "coords", ArrayType(ArrayType(DoubleType(), False), True), True + ) + ] + ) assert are_schemas_equal_ignore_nullable(s1, s2) == True - def it_returns_false_when_the_element_type_is_different_within_array(): s1 = StructType([StructField("coords", ArrayType(DoubleType(), True), True)]) s2 = StructType([StructField("coords", ArrayType(IntegerType(), True), True)]) assert are_schemas_equal_ignore_nullable(s1, s2) == False - def it_returns_false_when_column_names_differ(): - s1 = StructType([ - StructField("blah", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("name", StringType(), True), - StructField("age", IntegerType(), False)]) + s1 = StructType( + [ + StructField("blah", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("name", StringType(), True), + StructField("age", IntegerType(), False), + ] + ) assert are_schemas_equal_ignore_nullable(s1, s2) == False def it_returns_false_when_columns_have_different_order(): - s1 = StructType([ - StructField("blah", StringType(), True), - StructField("age", IntegerType(), True)]) - s2 = StructType([ - StructField("age", IntegerType(), False), - StructField("blah", StringType(), True)]) + s1 = StructType( + [ + StructField("blah", StringType(), True), + StructField("age", IntegerType(), True), + ] + ) + s2 = StructType( + [ + StructField("age", IntegerType(), False), + StructField("blah", StringType(), True), + ] + ) assert are_schemas_equal_ignore_nullable(s1, s2) == False @@ -161,51 +415,83 @@ def it_returns_true_when_only_nullable_flag_is_different_within_array_element(): s2 = StructField("coords", ArrayType(DoubleType(), False), True) assert are_structfields_equal(s1, s2, True) == True - def it_returns_false_when_the_element_type_is_different_within_array(): s1 = StructField("coords", ArrayType(DoubleType(), True), True) s2 = StructField("coords", ArrayType(IntegerType(), True), True) assert are_structfields_equal(s1, s2, True) == False - def it_returns_true_when_the_element_type_is_same_within_struct(): - s1 = StructField("coords", StructType([StructField("hello", DoubleType(), True)]), True) - s2 = StructField("coords", StructType([StructField("hello", DoubleType(), True)]), True) + s1 = StructField( + "coords", StructType([StructField("hello", DoubleType(), True)]), True + ) + s2 = StructField( + "coords", StructType([StructField("hello", DoubleType(), True)]), True + ) assert are_structfields_equal(s1, s2, True) == True - def it_returns_false_when_the_element_type_is_different_within_struct(): - s1 = StructField("coords", StructType([StructField("hello", DoubleType(), True)]), True) - s2 = StructField("coords", StructType([StructField("hello", IntegerType(), True)]), True) + s1 = StructField( + "coords", StructType([StructField("hello", DoubleType(), True)]), True + ) + s2 = StructField( + "coords", StructType([StructField("hello", IntegerType(), True)]), True + ) assert are_structfields_equal(s1, s2, True) == False - def it_returns_false_when_the_element_name_is_different_within_struct(): - s1 = StructField("coords", StructType([StructField("hello", DoubleType(), True)]), True) - s2 = StructField("coords", StructType([StructField("world", DoubleType(), True)]), True) + s1 = StructField( + "coords", StructType([StructField("hello", DoubleType(), True)]), True + ) + s2 = StructField( + "coords", StructType([StructField("world", DoubleType(), True)]), True + ) assert are_structfields_equal(s1, s2, True) == False - - + def it_returns_true_when_different_nullability_within_struct(): - s1 = StructField("coords", StructType([StructField("hello", DoubleType(), True)]), True) - s2 = StructField("coords", StructType([StructField("hello", DoubleType(), False)]), True) + s1 = StructField( + "coords", StructType([StructField("hello", DoubleType(), True)]), True + ) + s2 = StructField( + "coords", StructType([StructField("hello", DoubleType(), False)]), True + ) assert are_structfields_equal(s1, s2, True) == True + def it_returns_false_when_metadata_differs(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), True, {"hi": "no"}) - assert are_structfields_equal(s1, s2, ignore_nullability=True, ignore_metadata=False) is False + assert ( + are_structfields_equal( + s1, s2, ignore_nullability=True, ignore_metadata=False + ) + is False + ) def it_allows_metadata_to_be_ignored(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), True, {"hi": "no"}) - assert are_structfields_equal(s1, s2, ignore_nullability=False, ignore_metadata=True) is True + assert ( + are_structfields_equal( + s1, s2, ignore_nullability=False, ignore_metadata=True + ) + is True + ) def it_allows_nullability_and_metadata_to_be_ignored(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), False, {"hi": "no"}) - assert are_structfields_equal(s1, s2, ignore_nullability=True, ignore_metadata=True) is True + assert ( + are_structfields_equal( + s1, s2, ignore_nullability=True, ignore_metadata=True + ) + is True + ) def it_returns_true_when_metadata_is_the_same(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), True, {"hi": "whatever"}) - assert are_structfields_equal(s1, s2, ignore_nullability=True, ignore_metadata=False) is True + assert ( + are_structfields_equal( + s1, s2, ignore_nullability=True, ignore_metadata=False + ) + is True + ) From f6d994f235d9b892c7aa2584520f66b9ec1fbf54 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sun, 17 Mar 2024 19:38:36 -0400 Subject: [PATCH 10/24] handle metadata, use existing `are_structfields_equal` comparison check --- chispa/schema_comparer.py | 84 ++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 71c52e1..d5315cf 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -9,41 +9,61 @@ class SchemasNotEqualError(Exception): pass -def create_schema_comparison_tree(s1, s2, ignore_nullable: bool) -> str: - def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: +def create_schema_comparison_tree( + s1, s2, ignore_nullable: bool, ignore_metadata: bool +) -> str: + def parse_schema_as_tree(s, indent: int, horizontal_char="-") -> tuple[list, list]: tree_string = [] + fields = [] for struct_field in s: nullable = ( "(nullable = true)" if struct_field.nullable else "(nullable = false)" ) + struct_field_type = struct_field.dataType.typeName() tree_string += [ - f"{indent * ' '}|{horizontal_char * 2} {struct_field.name}: {struct_field.dataType.typeName()} {nullable}" + f"{indent * ' '}|{horizontal_char * 2} {struct_field.name}: {struct_field_type} {nullable}" ] - if struct_field.dataType.typeName() == "struct": - tree_string += create_schema_tree( + if struct_field_type == "struct": + tree_string_nested, fields_nested = parse_schema_as_tree( struct_field.dataType, indent + 4, horizontal_char ) - return tree_string + fields += [struct_field] + tree_string += tree_string_nested + fields += fields_nested + continue + + fields += [struct_field] + return tree_string, fields tree_space = 6 horizontal_char = "-" - s1_tree = create_schema_tree(s1, 0, horizontal_char) - s2_tree = create_schema_tree(s2, 0, horizontal_char) + s1_tree, s1_fields = parse_schema_as_tree(s1, 0, horizontal_char) + s2_tree, s2_fields = parse_schema_as_tree(s2, 0, horizontal_char) widest_line = max(len(line) for line in s1_tree) - tree_string_combined = "\n\nschema1".ljust(widest_line + tree_space) + "schema2\n" - for i in range(max(len(s1_tree), len(s2_tree))): - line1 = "" - line2 = "" + longest_tree = max(len(s1_tree), len(s2_tree)) + schema_gap = widest_line + tree_space + + tree_string_combined = "\nschema1".ljust(schema_gap) + "schema2\n" + for i in range(longest_tree): + line1 = line2 = "" + s1_field = s2_field = None + if i < len(s1_tree): line1 = s1_tree[i] + s1_field = s1_fields[i] if i < len(s2_tree): line2 = s2_tree[i] + s2_field = s2_fields[i] + + tree_string_line = line1.ljust(schema_gap) + line2 - tree_string_line = line1.ljust(widest_line + tree_space) + line2 + if i >= len(s1_fields) or i >= len(s2_fields): + tree_string_combined += line_red(tree_string_line) + "\n" + continue - if are_schema_strings_equal(line1, line2, ignore_nullable): + if are_structfields_equal(s1_field, s2_field, ignore_nullable, ignore_metadata): tree_string_line = line_blue(tree_string_line) else: @@ -51,28 +71,17 @@ def create_schema_tree(s, indent: int, horizontal_char="-") -> list[str]: tree_string_combined += tree_string_line + "\n" - tree_string_combined += bcolors.LightBlue + tree_string_combined += bcolors.NC return tree_string_combined -def are_schema_strings_equal(s1: str, s2: str, ignore_nullable: bool) -> bool: - if not ignore_nullable: - return s1 == s2 - - s1_no_nullable = s1.replace("(nullable = true)", "").replace( - "(nullable = false)", "" - ) - s2_no_nullable = s2.replace("(nullable = true)", "").replace( - "(nullable = false)", "" - ) - return s1_no_nullable == s2_no_nullable - - -def create_schema_comparison_table(s1, s2): +def create_schema_comparison_table( + s1, s2, ignore_nullable: bool, ignore_metadata: bool +): t = PrettyTable(["schema1", "schema2"]) zipped = list(six.moves.zip_longest(s1, s2)) for sf1, sf2 in zipped: - if are_structfields_equal(sf1, sf2, True): + if are_structfields_equal(sf1, sf2, ignore_nullable, ignore_metadata): t.add_row([blue(sf1), blue(sf2)]) else: t.add_row([sf1, sf2]) @@ -87,13 +96,16 @@ def check_if_schemas_are_wide(s1, s2) -> bool: return contains_nested_structs or contains_many_columns -def handle_schemas_not_equal(s1, s2, ignore_nullable: bool): +def handle_schemas_not_equal(s1, s2, ignore_nullable: bool, ignore_metadata: bool): schemas_are_wide = check_if_schemas_are_wide(s1, s2) if schemas_are_wide: - error_message = create_schema_comparison_tree(s1, s2, ignore_nullable) + error_message = create_schema_comparison_tree( + s1, s2, ignore_nullable, ignore_metadata + ) else: - t = create_schema_comparison_table(s1, s2) + t = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) error_message = "\n" + t.get_string() + print(repr(error_message)) raise SchemasNotEqualError(error_message) @@ -115,7 +127,7 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): return True if not inner(s1, s2, ignore_nullable, ignore_metadata): - handle_schemas_not_equal(s1, s2, ignore_nullable) + handle_schemas_not_equal(s1, s2, ignore_nullable, ignore_metadata) # deprecate this @@ -123,13 +135,13 @@ def inner(s1, s2, ignore_nullable, ignore_metadata): # I think schema equality operations are really fast to begin with def assert_basic_schema_equality(s1, s2): if s1 != s2: - handle_schemas_not_equal(s1, s2, ignore_nullable=False) + handle_schemas_not_equal(s1, s2, ignore_nullable=False, ignore_metadata=False) # deprecate this. ignore_nullable should be a flag. def assert_schema_equality_ignore_nullable(s1, s2): if not are_schemas_equal_ignore_nullable(s1, s2): - handle_schemas_not_equal(s1, s2, ignore_nullable=True) + handle_schemas_not_equal(s1, s2, ignore_nullable=True, ignore_metadata=False) # deprecate this. ignore_nullable should be a flag. From 1fcd1df7e987de17c4ef4f87a157efe85ef3b609 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sun, 17 Mar 2024 19:38:56 -0400 Subject: [PATCH 11/24] update tests --- .../it_prints_correctly_for_wide_schemas.txt | 2 +- ...tly_for_wide_schemas_different_lengths.txt | 2 +- ...ectly_for_wide_schemas_ignore_nullable.txt | 2 +- ...r_wide_schemas_multiple_nested_structs.txt | 2 +- tests/test_schema_comparer.py | 20 +++++++++++++------ 5 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt index 6133f70..7d475b8 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt @@ -1 +1 @@ -'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[34m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt index d6be3c0..4096c8e 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt @@ -1 +1 @@ -'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[34m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt index 6133f70..c2dadf3 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt @@ -1 +1 @@ -'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[34m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = false)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = false)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = false)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt index 9583d5c..91d34bf 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt @@ -1 +1 @@ -'\n\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[34m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[34m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[31m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[31m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index ebe8a73..4742fdb 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -99,7 +99,10 @@ def it_prints_correctly_for_wide_schemas(): ] ) - result = create_schema_comparison_tree(s1, s2, ignore_nullable=False) + result = create_schema_comparison_tree( + s1, s2, ignore_nullable=False, ignore_metadata=False + ) + assert repr(result) == expected def it_prints_correctly_for_wide_schemas_multiple_nested_structs(): @@ -183,7 +186,9 @@ def it_prints_correctly_for_wide_schemas_multiple_nested_structs(): ] ) - result = create_schema_comparison_tree(s1, s2, ignore_nullable=False) + result = create_schema_comparison_tree( + s1, s2, ignore_nullable=False, ignore_metadata=False + ) assert repr(result) == expected def it_prints_correctly_for_wide_schemas_ignore_nullable(): @@ -214,15 +219,15 @@ def it_prints_correctly_for_wide_schemas_ignore_nullable(): s2 = StructType( [ StructField("name", StringType(), True), - StructField("age", IntegerType(), True), + StructField("age", IntegerType(), False), StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), False), StructField( "fav_colors", StructType( [ StructField("orange", IntegerType(), True), - StructField("green", IntegerType(), True), + StructField("green", IntegerType(), False), StructField("yellow", IntegerType(), True), ] ), @@ -230,7 +235,10 @@ def it_prints_correctly_for_wide_schemas_ignore_nullable(): ] ) - result = create_schema_comparison_tree(s1, s2, ignore_nullable=True) + result = create_schema_comparison_tree( + s1, s2, ignore_nullable=True, ignore_metadata=False + ) + assert repr(result) == expected def it_prints_correctly_for_wide_schemas_different_lengths(): From e522788751a41324e781b83203857bbabf920b73 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sun, 17 Mar 2024 19:39:06 -0400 Subject: [PATCH 12/24] add metadata tests --- ...ectly_for_wide_schemas_ignore_metadata.txt | 1 + ...rrectly_for_wide_schemas_with_metadata.txt | 1 + tests/test_schema_comparer.py | 101 +++++++++++++++++- 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt create mode 100644 tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt new file mode 100644 index 0000000..7d475b8 --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt new file mode 100644 index 0000000..0fc3feb --- /dev/null +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt @@ -0,0 +1 @@ +'\nschema1 schema2\n\x1b[31m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index 4742fdb..8e47280 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -287,7 +287,106 @@ def it_prints_correctly_for_wide_schemas_different_lengths(): ] ) - result = create_schema_comparison_tree(s1, s2, ignore_nullable=False) + result = create_schema_comparison_tree( + s1, s2, ignore_nullable=False, ignore_metadata=False + ) + assert repr(result) == expected + + def it_prints_correctly_for_wide_schemas_ignore_metadata(): + with open( + "tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt" + ) as f: + expected = f.read() + + s1 = StructType( + [ + StructField("name", StringType(), True, {"foo": "bar"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ] + ), + ), + ] + ) + + s2 = StructType( + [ + StructField("name", StringType(), True, {"foo": "baz"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ] + ), + ), + ] + ) + result = create_schema_comparison_tree( + s1, s2, ignore_nullable=False, ignore_metadata=True + ) + assert repr(result) == expected + + def it_prints_correctly_for_wide_schemas_with_metadata(): + with open( + "tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt" + ) as f: + expected = f.read() + + s1 = StructType( + [ + StructField("name", StringType(), True, {"foo": "bar"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ] + ), + ), + ] + ) + + s2 = StructType( + [ + StructField("name", StringType(), True, {"foo": "baz"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType( + [ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ] + ), + ), + ] + ) + + result = create_schema_comparison_tree( + s1, s2, ignore_nullable=False, ignore_metadata=False + ) assert repr(result) == expected From b55f601572d425c01010b64003c09622d5eb8750 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sun, 17 Mar 2024 19:50:08 -0400 Subject: [PATCH 13/24] simplify logic, remove horizontal character param --- chispa/schema_comparer.py | 48 ++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index d5315cf..e53d546 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -12,34 +12,38 @@ class SchemasNotEqualError(Exception): def create_schema_comparison_tree( s1, s2, ignore_nullable: bool, ignore_metadata: bool ) -> str: - def parse_schema_as_tree(s, indent: int, horizontal_char="-") -> tuple[list, list]: - tree_string = [] + def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: + tree_line = [] fields = [] + for struct_field in s: nullable = ( "(nullable = true)" if struct_field.nullable else "(nullable = false)" ) struct_field_type = struct_field.dataType.typeName() - tree_string += [ - f"{indent * ' '}|{horizontal_char * 2} {struct_field.name}: {struct_field_type} {nullable}" - ] - if struct_field_type == "struct": - tree_string_nested, fields_nested = parse_schema_as_tree( - struct_field.dataType, indent + 4, horizontal_char - ) + + struct_prefix = f"{indent * ' '}|{'-' * 2}" + struct_as_string = f"{struct_field.name}: {struct_field_type} {nullable}" + + tree_line += [f"{struct_prefix} {struct_as_string}"] + + if not struct_field_type == "struct": fields += [struct_field] - tree_string += tree_string_nested - fields += fields_nested continue + tree_line_nested, fields_nested = parse_schema_as_tree( + struct_field.dataType, indent + 4 + ) + fields += [struct_field] - return tree_string, fields + tree_line += tree_line_nested + fields += fields_nested - tree_space = 6 - horizontal_char = "-" + return tree_line, fields - s1_tree, s1_fields = parse_schema_as_tree(s1, 0, horizontal_char) - s2_tree, s2_fields = parse_schema_as_tree(s2, 0, horizontal_char) + tree_space = 6 + s1_tree, s1_fields = parse_schema_as_tree(s1, 0) + s2_tree, s2_fields = parse_schema_as_tree(s2, 0) widest_line = max(len(line) for line in s1_tree) longest_tree = max(len(s1_tree), len(s2_tree)) @@ -59,17 +63,10 @@ def parse_schema_as_tree(s, indent: int, horizontal_char="-") -> tuple[list, lis tree_string_line = line1.ljust(schema_gap) + line2 - if i >= len(s1_fields) or i >= len(s2_fields): - tree_string_combined += line_red(tree_string_line) + "\n" - continue - if are_structfields_equal(s1_field, s2_field, ignore_nullable, ignore_metadata): - tree_string_line = line_blue(tree_string_line) - + tree_string_combined += line_blue(tree_string_line) + "\n" else: - tree_string_line = line_red(tree_string_line) - - tree_string_combined += tree_string_line + "\n" + tree_string_combined += line_red(tree_string_line) + "\n" tree_string_combined += bcolors.NC return tree_string_combined @@ -105,7 +102,6 @@ def handle_schemas_not_equal(s1, s2, ignore_nullable: bool, ignore_metadata: boo else: t = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) error_message = "\n" + t.get_string() - print(repr(error_message)) raise SchemasNotEqualError(error_message) From 01d1689fdab9720d6f331b120f8608bcaec1f864 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Sun, 17 Mar 2024 19:52:55 -0400 Subject: [PATCH 14/24] improve variable names --- chispa/schema_comparer.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index e53d546..cce811a 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -13,7 +13,7 @@ def create_schema_comparison_tree( s1, s2, ignore_nullable: bool, ignore_metadata: bool ) -> str: def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: - tree_line = [] + tree_lines = [] fields = [] for struct_field in s: @@ -25,7 +25,7 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: struct_prefix = f"{indent * ' '}|{'-' * 2}" struct_as_string = f"{struct_field.name}: {struct_field_type} {nullable}" - tree_line += [f"{struct_prefix} {struct_as_string}"] + tree_lines += [f"{struct_prefix} {struct_as_string}"] if not struct_field_type == "struct": fields += [struct_field] @@ -36,10 +36,10 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: ) fields += [struct_field] - tree_line += tree_line_nested + tree_lines += tree_line_nested fields += fields_nested - return tree_line, fields + return tree_lines, fields tree_space = 6 s1_tree, s1_fields = parse_schema_as_tree(s1, 0) @@ -49,7 +49,7 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: longest_tree = max(len(s1_tree), len(s2_tree)) schema_gap = widest_line + tree_space - tree_string_combined = "\nschema1".ljust(schema_gap) + "schema2\n" + tree = "\nschema1".ljust(schema_gap) + "schema2\n" for i in range(longest_tree): line1 = line2 = "" s1_field = s2_field = None @@ -61,15 +61,15 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: line2 = s2_tree[i] s2_field = s2_fields[i] - tree_string_line = line1.ljust(schema_gap) + line2 + tree_line = line1.ljust(schema_gap) + line2 if are_structfields_equal(s1_field, s2_field, ignore_nullable, ignore_metadata): - tree_string_combined += line_blue(tree_string_line) + "\n" + tree += line_blue(tree_line) + "\n" else: - tree_string_combined += line_red(tree_string_line) + "\n" + tree += line_red(tree_line) + "\n" - tree_string_combined += bcolors.NC - return tree_string_combined + tree += bcolors.NC + return tree def create_schema_comparison_table( From cf27a127d4d982269c0bb4f5f94dda6c02f358e9 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Fri, 22 Mar 2024 18:42:48 -0400 Subject: [PATCH 15/24] add `print_schema_diff` as wrapper to compare two schemas without error --- chispa/schema_comparer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index cce811a..08ec3a2 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -9,6 +9,24 @@ class SchemasNotEqualError(Exception): pass +def print_schema_diff( + s1, s2, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table" +): + valid_output_formats = ["table", "tree"] + if output_format == "table": + schema_diff = create_schema_comparison_table( + s1, s2, ignore_nullable, ignore_metadata + ) + elif output_format == "tree": + schema_diff = create_schema_comparison_tree( + s1, s2, ignore_nullable, ignore_metadata + ) + else: + raise ValueError(f"output_format must be one of {valid_output_formats}") + + print(schema_diff) + + def create_schema_comparison_tree( s1, s2, ignore_nullable: bool, ignore_metadata: bool ) -> str: From a6b0d7aa4517cd5775812edd019df23746369cdd Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Fri, 22 Mar 2024 19:01:43 -0400 Subject: [PATCH 16/24] add missing return type hints --- chispa/schema_comparer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 08ec3a2..3e6a471 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -11,7 +11,7 @@ class SchemasNotEqualError(Exception): def print_schema_diff( s1, s2, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table" -): +) -> None: valid_output_formats = ["table", "tree"] if output_format == "table": schema_diff = create_schema_comparison_table( @@ -111,7 +111,9 @@ def check_if_schemas_are_wide(s1, s2) -> bool: return contains_nested_structs or contains_many_columns -def handle_schemas_not_equal(s1, s2, ignore_nullable: bool, ignore_metadata: bool): +def handle_schemas_not_equal( + s1, s2, ignore_nullable: bool, ignore_metadata: bool +) -> None: schemas_are_wide = check_if_schemas_are_wide(s1, s2) if schemas_are_wide: error_message = create_schema_comparison_tree( @@ -130,7 +132,9 @@ def assert_schema_equality(s1, s2, ignore_nullable=False, ignore_metadata=False) assert_schema_equality_full(s1, s2, ignore_nullable, ignore_metadata) -def assert_schema_equality_full(s1, s2, ignore_nullable=False, ignore_metadata=False): +def assert_schema_equality_full( + s1, s2, ignore_nullable=False, ignore_metadata=False +) -> None: def inner(s1, s2, ignore_nullable, ignore_metadata): if len(s1) != len(s2): return False From c12fd311ab93a0460a5febec062f2d4219a61bd4 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Tue, 24 Sep 2024 19:52:42 -0400 Subject: [PATCH 17/24] add six package --- poetry.lock | 4 ++-- pyproject.toml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index e26e9b0..595b19a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "astunparse" @@ -1354,4 +1354,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "74fd71f11fb19994adf4b8a836e4eed342eb54e5c2e5a69c391156f16b4a020d" +content-hash = "952eddd69722caf52e4431ef4ac8a402b1b555504089bcddd1ad1f7ed009c0cd" diff --git a/pyproject.toml b/pyproject.toml index d03ffef..75862d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.8,<4.0" prettytable = "^3.10.2" +six = "^1.16.0" [tool.poetry.group.dev.dependencies] pytest = "7.4.2" From 34eeebf80a1db5639409779093e413c98d73cf69 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Tue, 24 Sep 2024 19:53:02 -0400 Subject: [PATCH 18/24] add missing `create_schema_comparison_tree` function --- tests/test_schema_comparer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index e303ec2..5e12903 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -9,6 +9,7 @@ are_structfields_equal, assert_schema_equality, assert_schema_equality_ignore_nullable, + create_schema_comparison_tree, ) From 0b1ad8e12289a2c549ec877a3c2cdbc7dba278ef Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Tue, 24 Sep 2024 19:53:53 -0400 Subject: [PATCH 19/24] fix unit test failures in `test_schema_comparer` and `test_dataframe_comparer` --- chispa/schema_comparer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 1c69039..45fccae 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -3,9 +3,11 @@ import typing from itertools import zip_longest +import six from prettytable import PrettyTable from pyspark.sql.types import StructField, StructType +from chispa.bcolors import bcolors, line_blue, line_red from chispa.formatting import blue @@ -103,7 +105,7 @@ def create_schema_comparison_table( zipped = list(six.moves.zip_longest(s1, s2)) for sf1, sf2 in zipped: if are_structfields_equal(sf1, sf2, ignore_nullable, ignore_metadata): - t.add_row([blue(sf1), blue(sf2)]) + t.add_row([blue(str(sf1)), blue(str(sf2))]) else: t.add_row([sf1, sf2]) return t From 11437c53c685ab5f9f82c098933cde2ddb591b29 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 25 Sep 2024 12:09:46 -0400 Subject: [PATCH 20/24] replace six with itertools --- chispa/schema_comparer.py | 4 ++-- poetry.lock | 2 +- pyproject.toml | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 45fccae..bf48d07 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -1,9 +1,9 @@ from __future__ import annotations +import itertools import typing from itertools import zip_longest -import six from prettytable import PrettyTable from pyspark.sql.types import StructField, StructType @@ -102,7 +102,7 @@ def create_schema_comparison_table( s1, s2, ignore_nullable: bool, ignore_metadata: bool ): t = PrettyTable(["schema1", "schema2"]) - zipped = list(six.moves.zip_longest(s1, s2)) + zipped = list(itertools.zip_longest(s1, s2)) for sf1, sf2 in zipped: if are_structfields_equal(sf1, sf2, ignore_nullable, ignore_metadata): t.add_row([blue(str(sf1)), blue(str(sf2))]) diff --git a/poetry.lock b/poetry.lock index 595b19a..7d95f06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1354,4 +1354,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0" -content-hash = "952eddd69722caf52e4431ef4ac8a402b1b555504089bcddd1ad1f7ed009c0cd" +content-hash = "74fd71f11fb19994adf4b8a836e4eed342eb54e5c2e5a69c391156f16b4a020d" diff --git a/pyproject.toml b/pyproject.toml index 75862d5..d03ffef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.8,<4.0" prettytable = "^3.10.2" -six = "^1.16.0" [tool.poetry.group.dev.dependencies] pytest = "7.4.2" From a787b7441fccb247649f6cd11806c242b7523ed8 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 25 Sep 2024 12:11:16 -0400 Subject: [PATCH 21/24] remove double import --- chispa/schema_comparer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index bf48d07..e4d50e1 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -1,6 +1,5 @@ from __future__ import annotations -import itertools import typing from itertools import zip_longest @@ -102,7 +101,7 @@ def create_schema_comparison_table( s1, s2, ignore_nullable: bool, ignore_metadata: bool ): t = PrettyTable(["schema1", "schema2"]) - zipped = list(itertools.zip_longest(s1, s2)) + zipped = list(zip_longest(s1, s2)) for sf1, sf2 in zipped: if are_structfields_equal(sf1, sf2, ignore_nullable, ignore_metadata): t.add_row([blue(str(sf1)), blue(str(sf2))]) From 8c036a636b4d30d7a92e537908923398d6ec764b Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 25 Sep 2024 18:36:15 -0400 Subject: [PATCH 22/24] formatting fixes --- chispa/schema_comparer.py | 42 +- .../it_prints_correctly_for_wide_schemas.txt | 2 +- ...tly_for_wide_schemas_different_lengths.txt | 2 +- ...ectly_for_wide_schemas_ignore_metadata.txt | 2 +- ...ectly_for_wide_schemas_ignore_nullable.txt | 2 +- ...r_wide_schemas_multiple_nested_structs.txt | 2 +- ...rrectly_for_wide_schemas_with_metadata.txt | 2 +- tests/test_schema_comparer.py | 538 +++++++----------- 8 files changed, 236 insertions(+), 356 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index e4d50e1..8e7b461 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -16,35 +16,25 @@ class SchemasNotEqualError(Exception): pass -def print_schema_diff( - s1, s2, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table" -) -> None: +def print_schema_diff(s1, s2, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table") -> None: valid_output_formats = ["table", "tree"] if output_format == "table": - schema_diff = create_schema_comparison_table( - s1, s2, ignore_nullable, ignore_metadata - ) + schema_diff = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) elif output_format == "tree": - schema_diff = create_schema_comparison_tree( - s1, s2, ignore_nullable, ignore_metadata - ) + schema_diff = create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata) else: raise ValueError(f"output_format must be one of {valid_output_formats}") print(schema_diff) -def create_schema_comparison_tree( - s1, s2, ignore_nullable: bool, ignore_metadata: bool -) -> str: +def create_schema_comparison_tree(s1, s2, ignore_nullable: bool, ignore_metadata: bool) -> str: def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: tree_lines = [] fields = [] for struct_field in s: - nullable = ( - "(nullable = true)" if struct_field.nullable else "(nullable = false)" - ) + nullable = "(nullable = true)" if struct_field.nullable else "(nullable = false)" struct_field_type = struct_field.dataType.typeName() struct_prefix = f"{indent * ' '}|{'-' * 2}" @@ -56,9 +46,7 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: fields += [struct_field] continue - tree_line_nested, fields_nested = parse_schema_as_tree( - struct_field.dataType, indent + 4 - ) + tree_line_nested, fields_nested = parse_schema_as_tree(struct_field.dataType, indent + 4) fields += [struct_field] tree_lines += tree_line_nested @@ -97,9 +85,7 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: return tree -def create_schema_comparison_table( - s1, s2, ignore_nullable: bool, ignore_metadata: bool -): +def create_schema_comparison_table(s1, s2, ignore_nullable: bool, ignore_metadata: bool): t = PrettyTable(["schema1", "schema2"]) zipped = list(zip_longest(s1, s2)) for sf1, sf2 in zipped: @@ -111,21 +97,17 @@ def create_schema_comparison_table( def check_if_schemas_are_wide(s1, s2) -> bool: - contains_nested_structs = any( - sf.dataType.typeName() == "struct" for sf in s1 - ) or any(sf.dataType.typeName() == "struct" for sf in s2) + contains_nested_structs = any(sf.dataType.typeName() == "struct" for sf in s1) or any( + sf.dataType.typeName() == "struct" for sf in s2 + ) contains_many_columns = len(s1) > 10 or len(s2) > 10 return contains_nested_structs or contains_many_columns -def handle_schemas_not_equal( - s1, s2, ignore_nullable: bool, ignore_metadata: bool -) -> None: +def handle_schemas_not_equal(s1, s2, ignore_nullable: bool, ignore_metadata: bool) -> None: schemas_are_wide = check_if_schemas_are_wide(s1, s2) if schemas_are_wide: - error_message = create_schema_comparison_tree( - s1, s2, ignore_nullable, ignore_metadata - ) + error_message = create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata) else: t = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) error_message = "\n" + t.get_string() diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt index 7d475b8..6034fea 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt @@ -1 +1 @@ -'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt index 4096c8e..be22fb0 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt @@ -1 +1 @@ -'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt index 7d475b8..6034fea 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt @@ -1 +1 @@ -'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt index c2dadf3..1fd9390 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt @@ -1 +1 @@ -'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = false)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = false)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = false)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = false)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = false)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = false)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt index 91d34bf..9ca3165 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt @@ -1 +1 @@ -'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[31m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[31m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file +'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[31m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[31m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt index 0fc3feb..79214f3 100644 --- a/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt +++ b/tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt @@ -1 +1 @@ -'\nschema1 schema2\n\x1b[31m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' \ No newline at end of file +'\nschema1 schema2\n\x1b[31m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m' diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index 5e12903..d6c2af8 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -53,337 +53,255 @@ def it_throws_when_schema_lengths_differ(): def describe_tree_string(): def it_prints_correctly_for_wide_schemas(): - with open( - "tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt" - ) as f: + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas.txt") as f: expected = f.read() - s1 = StructType( - [ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("red", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("blue", IntegerType(), True), - ] - ), - ), - ] - ) - - s2 = StructType( - [ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("orange", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("yellow", IntegerType(), True), - ] - ), - ), - ] - ) + s1 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) - result = create_schema_comparison_tree( - s1, s2, ignore_nullable=False, ignore_metadata=False - ) + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) assert repr(result) == expected def it_prints_correctly_for_wide_schemas_multiple_nested_structs(): - with open( - "tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt" - ) as f: + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt") as f: expected = f.read() - s1 = StructType( - [ - StructField("name", StringType(), True), - StructField( - "fav_genres", - StructType( - [ - StructField( - "rock", - StructType( - [ - StructField("metal", IntegerType(), True), - StructField("punk", IntegerType(), True), - ] - ), - True, - ), - StructField( - "electronic", - StructType( - [ - StructField("house", IntegerType(), True), - StructField("dubstep", IntegerType(), True), - ] - ), - True, - ), - ] + s1 = StructType([ + StructField("name", StringType(), True), + StructField( + "fav_genres", + StructType([ + StructField( + "rock", + StructType([ + StructField("metal", IntegerType(), True), + StructField("punk", IntegerType(), True), + ]), + True, + ), + StructField( + "electronic", + StructType([ + StructField("house", IntegerType(), True), + StructField("dubstep", IntegerType(), True), + ]), + True, + ), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField( + "fav_genres", + StructType([ + StructField( + "rock", + StructType([ + StructField("metal", IntegerType(), True), + StructField("classic", IntegerType(), True), + ]), + True, ), - ), - ] - ) - - s2 = StructType( - [ - StructField("name", StringType(), True), - StructField( - "fav_genres", - StructType( - [ - StructField( - "rock", - StructType( - [ - StructField("metal", IntegerType(), True), - StructField("classic", IntegerType(), True), - ] - ), - True, - ), - StructField( - "electronic", - StructType( - [ - StructField("house", IntegerType(), True), - StructField("dubstep", IntegerType(), True), - ] - ), - True, - ), - StructField( - "pop", - StructType( - [ - StructField("pop", IntegerType(), True), - ] - ), - True, - ), - ] + StructField( + "electronic", + StructType([ + StructField("house", IntegerType(), True), + StructField("dubstep", IntegerType(), True), + ]), + True, ), - ), - ] - ) + StructField( + "pop", + StructType([ + StructField("pop", IntegerType(), True), + ]), + True, + ), + ]), + ), + ]) - result = create_schema_comparison_tree( - s1, s2, ignore_nullable=False, ignore_metadata=False - ) + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) assert repr(result) == expected def it_prints_correctly_for_wide_schemas_ignore_nullable(): - with open( - "tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt" - ) as f: + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt") as f: expected = f.read() - s1 = StructType( - [ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("red", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("blue", IntegerType(), True), - ] - ), - ), - ] - ) - - s2 = StructType( - [ - StructField("name", StringType(), True), - StructField("age", IntegerType(), False), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), False), - StructField( - "fav_colors", - StructType( - [ - StructField("orange", IntegerType(), True), - StructField("green", IntegerType(), False), - StructField("yellow", IntegerType(), True), - ] - ), - ), - ] - ) + s1 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), False), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), False), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), False), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) - result = create_schema_comparison_tree( - s1, s2, ignore_nullable=True, ignore_metadata=False - ) + result = create_schema_comparison_tree(s1, s2, ignore_nullable=True, ignore_metadata=False) assert repr(result) == expected def it_prints_correctly_for_wide_schemas_different_lengths(): - with open( - "tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt" - ) as f: + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt") as f: expected = f.read() - s1 = StructType( - [ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("red", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("blue", IntegerType(), True), - ] - ), - ), - ] - ) - - s2 = StructType( - [ - StructField("name", StringType(), True), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("orange", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("yellow", IntegerType(), True), - StructField("purple", IntegerType(), True), - ] - ), - ), - StructField("phone_number", StringType(), True), - ] - ) - - result = create_schema_comparison_tree( - s1, s2, ignore_nullable=False, ignore_metadata=False - ) + s1 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + StructField("purple", IntegerType(), True), + ]), + ), + StructField("phone_number", StringType(), True), + ]) + + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) assert repr(result) == expected def it_prints_correctly_for_wide_schemas_ignore_metadata(): - with open( - "tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt" - ) as f: + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt") as f: expected = f.read() - s1 = StructType( - [ - StructField("name", StringType(), True, {"foo": "bar"}), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("red", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("blue", IntegerType(), True), - ] - ), - ), - ] - ) - - s2 = StructType( - [ - StructField("name", StringType(), True, {"foo": "baz"}), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("orange", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("yellow", IntegerType(), True), - ] - ), - ), - ] - ) - result = create_schema_comparison_tree( - s1, s2, ignore_nullable=False, ignore_metadata=True - ) + s1 = StructType([ + StructField("name", StringType(), True, {"foo": "bar"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True, {"foo": "baz"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=True) assert repr(result) == expected def it_prints_correctly_for_wide_schemas_with_metadata(): - with open( - "tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt" - ) as f: + with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt") as f: expected = f.read() - s1 = StructType( - [ - StructField("name", StringType(), True, {"foo": "bar"}), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("red", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("blue", IntegerType(), True), - ] - ), - ), - ] - ) - - s2 = StructType( - [ - StructField("name", StringType(), True, {"foo": "baz"}), - StructField("age", IntegerType(), True), - StructField("fav_number", IntegerType(), True), - StructField("fav_numbers", ArrayType(IntegerType(), True), True), - StructField( - "fav_colors", - StructType( - [ - StructField("orange", IntegerType(), True), - StructField("green", IntegerType(), True), - StructField("yellow", IntegerType(), True), - ] - ), - ), - ] - ) + s1 = StructType([ + StructField("name", StringType(), True, {"foo": "bar"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("red", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("blue", IntegerType(), True), + ]), + ), + ]) + + s2 = StructType([ + StructField("name", StringType(), True, {"foo": "baz"}), + StructField("age", IntegerType(), True), + StructField("fav_number", IntegerType(), True), + StructField("fav_numbers", ArrayType(IntegerType(), True), True), + StructField( + "fav_colors", + StructType([ + StructField("orange", IntegerType(), True), + StructField("green", IntegerType(), True), + StructField("yellow", IntegerType(), True), + ]), + ), + ]) - result = create_schema_comparison_tree( - s1, s2, ignore_nullable=False, ignore_metadata=False - ) + result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) assert repr(result) == expected @@ -511,39 +429,19 @@ def it_returns_true_when_different_nullability_within_struct(): def it_returns_false_when_metadata_differs(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), True, {"hi": "no"}) - assert ( - are_structfields_equal( - s1, s2, ignore_nullability=True, ignore_metadata=False - ) - is False - ) + assert are_structfields_equal(s1, s2, ignore_nullability=True, ignore_metadata=False) is False def it_allows_metadata_to_be_ignored(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), True, {"hi": "no"}) - assert ( - are_structfields_equal( - s1, s2, ignore_nullability=False, ignore_metadata=True - ) - is True - ) + assert are_structfields_equal(s1, s2, ignore_nullability=False, ignore_metadata=True) is True def it_allows_nullability_and_metadata_to_be_ignored(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), False, {"hi": "no"}) - assert ( - are_structfields_equal( - s1, s2, ignore_nullability=True, ignore_metadata=True - ) - is True - ) + assert are_structfields_equal(s1, s2, ignore_nullability=True, ignore_metadata=True) is True def it_returns_true_when_metadata_is_the_same(): s1 = StructField("coords", StringType(), True, {"hi": "whatever"}) s2 = StructField("coords", StringType(), True, {"hi": "whatever"}) - assert ( - are_structfields_equal( - s1, s2, ignore_nullability=True, ignore_metadata=False - ) - is True - ) + assert are_structfields_equal(s1, s2, ignore_nullability=True, ignore_metadata=False) is True From c54f2f74d6337f4319976f3f55b76eea50731087 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 25 Sep 2024 18:56:58 -0400 Subject: [PATCH 23/24] handle mypy issues --- chispa/schema_comparer.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/chispa/schema_comparer.py b/chispa/schema_comparer.py index 8e7b461..2be6054 100644 --- a/chispa/schema_comparer.py +++ b/chispa/schema_comparer.py @@ -16,20 +16,22 @@ class SchemasNotEqualError(Exception): pass -def print_schema_diff(s1, s2, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table") -> None: +def print_schema_diff( + s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table" +) -> None: valid_output_formats = ["table", "tree"] if output_format == "table": - schema_diff = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) + schema_diff_table: PrettyTable = create_schema_comparison_table(s1, s2, ignore_nullable, ignore_metadata) + print(schema_diff_table) elif output_format == "tree": - schema_diff = create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata) + schema_diff_tree: str = create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata) + print(schema_diff_tree) else: raise ValueError(f"output_format must be one of {valid_output_formats}") - print(schema_diff) - -def create_schema_comparison_tree(s1, s2, ignore_nullable: bool, ignore_metadata: bool) -> str: - def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: +def create_schema_comparison_tree(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool) -> str: + def parse_schema_as_tree(s: StructType, indent: int) -> tuple[list[str], list[StructField]]: tree_lines = [] fields = [] @@ -46,7 +48,7 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: fields += [struct_field] continue - tree_line_nested, fields_nested = parse_schema_as_tree(struct_field.dataType, indent + 4) + tree_line_nested, fields_nested = parse_schema_as_tree(struct_field.dataType, indent + 4) # type: ignore[arg-type] fields += [struct_field] tree_lines += tree_line_nested @@ -85,7 +87,9 @@ def parse_schema_as_tree(s, indent: int) -> tuple[list, list]: return tree -def create_schema_comparison_table(s1, s2, ignore_nullable: bool, ignore_metadata: bool): +def create_schema_comparison_table( + s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool +) -> PrettyTable: t = PrettyTable(["schema1", "schema2"]) zipped = list(zip_longest(s1, s2)) for sf1, sf2 in zipped: @@ -96,7 +100,7 @@ def create_schema_comparison_table(s1, s2, ignore_nullable: bool, ignore_metadat return t -def check_if_schemas_are_wide(s1, s2) -> bool: +def check_if_schemas_are_wide(s1: StructType, s2: StructType) -> bool: contains_nested_structs = any(sf.dataType.typeName() == "struct" for sf in s1) or any( sf.dataType.typeName() == "struct" for sf in s2 ) @@ -104,7 +108,7 @@ def check_if_schemas_are_wide(s1, s2) -> bool: return contains_nested_structs or contains_many_columns -def handle_schemas_not_equal(s1, s2, ignore_nullable: bool, ignore_metadata: bool) -> None: +def handle_schemas_not_equal(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata: bool) -> None: schemas_are_wide = check_if_schemas_are_wide(s1, s2) if schemas_are_wide: error_message = create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata) From 7c5a78b5dda8b7d35462fd65dcb261ff550fd6d4 Mon Sep 17 00:00:00 2001 From: Jeff Brennan Date: Wed, 25 Sep 2024 19:01:22 -0400 Subject: [PATCH 24/24] update tests to include newline added by pre-commit format --- tests/test_schema_comparer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_schema_comparer.py b/tests/test_schema_comparer.py index d6c2af8..40a5916 100644 --- a/tests/test_schema_comparer.py +++ b/tests/test_schema_comparer.py @@ -88,7 +88,7 @@ def it_prints_correctly_for_wide_schemas(): result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) - assert repr(result) == expected + assert repr(result) + "\n" == expected def it_prints_correctly_for_wide_schemas_multiple_nested_structs(): with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_multiple_nested_structs.txt") as f: @@ -152,7 +152,7 @@ def it_prints_correctly_for_wide_schemas_multiple_nested_structs(): ]) result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) - assert repr(result) == expected + assert repr(result) + "\n" == expected def it_prints_correctly_for_wide_schemas_ignore_nullable(): with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_nullable.txt") as f: @@ -190,7 +190,7 @@ def it_prints_correctly_for_wide_schemas_ignore_nullable(): result = create_schema_comparison_tree(s1, s2, ignore_nullable=True, ignore_metadata=False) - assert repr(result) == expected + assert repr(result) + "\n" == expected def it_prints_correctly_for_wide_schemas_different_lengths(): with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_different_lengths.txt") as f: @@ -229,7 +229,7 @@ def it_prints_correctly_for_wide_schemas_different_lengths(): ]) result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) - assert repr(result) == expected + assert repr(result) + "\n" == expected def it_prints_correctly_for_wide_schemas_ignore_metadata(): with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_ignore_metadata.txt") as f: @@ -265,7 +265,7 @@ def it_prints_correctly_for_wide_schemas_ignore_metadata(): ), ]) result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=True) - assert repr(result) == expected + assert repr(result) + "\n" == expected def it_prints_correctly_for_wide_schemas_with_metadata(): with open("tests/data/tree_string/it_prints_correctly_for_wide_schemas_with_metadata.txt") as f: @@ -302,7 +302,7 @@ def it_prints_correctly_for_wide_schemas_with_metadata(): ]) result = create_schema_comparison_tree(s1, s2, ignore_nullable=False, ignore_metadata=False) - assert repr(result) == expected + assert repr(result) + "\n" == expected def describe_assert_schema_equality_ignore_nullable():