Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tree string #95

Merged
merged 25 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c5062f8
prelim tree string code - move table creation to separate function
jeffbrennan Mar 13, 2024
1babd2c
add prelim test for tree string
jeffbrennan Mar 13, 2024
a2c962f
add prelim tree string code
jeffbrennan Mar 14, 2024
385f140
add side by side schema tree string
jeffbrennan Mar 14, 2024
19082fd
improve tree appearance
jeffbrennan Mar 14, 2024
4f62467
simplify formatting
jeffbrennan Mar 16, 2024
8468412
add helper functions + autoformat
jeffbrennan Mar 16, 2024
8ffa73f
handle ignore nullable in tree string
jeffbrennan Mar 16, 2024
4c5cbd5
add print output tests
jeffbrennan Mar 17, 2024
f6d994f
handle metadata, use existing `are_structfields_equal` comparison check
jeffbrennan Mar 17, 2024
1fcd1df
update tests
jeffbrennan Mar 17, 2024
e522788
add metadata tests
jeffbrennan Mar 17, 2024
b55f601
simplify logic, remove horizontal character param
jeffbrennan Mar 17, 2024
01d1689
improve variable names
jeffbrennan Mar 17, 2024
cf27a12
add `print_schema_diff` as wrapper to compare two schemas without error
jeffbrennan Mar 22, 2024
a6b0d7a
add missing return type hints
jeffbrennan Mar 22, 2024
b11b888
Merge branch 'main' into add-tree-string2
jeffbrennan Sep 24, 2024
c12fd31
add six package
jeffbrennan Sep 24, 2024
34eeebf
add missing `create_schema_comparison_tree` function
jeffbrennan Sep 24, 2024
0b1ad8e
fix unit test failures in `test_schema_comparer` and `test_dataframe_…
jeffbrennan Sep 24, 2024
11437c5
replace six with itertools
jeffbrennan Sep 25, 2024
a787b74
remove double import
jeffbrennan Sep 25, 2024
8c036a6
formatting fixes
jeffbrennan Sep 25, 2024
c54f2f7
handle mypy issues
jeffbrennan Sep 25, 2024
7c5a78b
update tests to include newline added by pre-commit format
jeffbrennan Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 37 additions & 27 deletions chispa/bcolors.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,50 @@
class bcolors:
NC = '\033[0m' # No Color, reset all

Bold = '\033[1m'
Underlined = '\033[4m'
Blink = '\033[5m'
Inverted = '\033[7m'
Hidden = '\033[8m'

Black = '\033[30m'
Red = '\033[31m'
Green = '\033[32m'
Yellow = '\033[33m'
Blue = '\033[34m'
Purple = '\033[35m'
Cyan = '\033[36m'
LightGray = '\033[37m'
DarkGray = '\033[30m'
LightRed = '\033[31m'
LightGreen = '\033[32m'
LightYellow = '\033[93m'
LightBlue = '\033[34m'
LightPurple = '\033[35m'
LightCyan = '\033[36m'
White = '\033[97m'
NC = "\033[0m" # No Color, reset all

Bold = "\033[1m"
Underlined = "\033[4m"
Blink = "\033[5m"
Inverted = "\033[7m"
Hidden = "\033[8m"

Black = "\033[30m"
Red = "\033[31m"
Green = "\033[32m"
Yellow = "\033[33m"
Blue = "\033[34m"
Purple = "\033[35m"
Cyan = "\033[36m"
LightGray = "\033[37m"
DarkGray = "\033[30m"
LightRed = "\033[31m"
LightGreen = "\033[32m"
LightYellow = "\033[93m"
LightBlue = "\033[34m"
LightPurple = "\033[35m"
LightCyan = "\033[36m"
White = "\033[97m"

# Style
Bold = '\033[1m'
Underline = '\033[4m'
Bold = "\033[1m"
Underline = "\033[4m"


def blue(s: str) -> str:
    """
    Takes any value, converts it to a string and returns it in light blue.
    The result ends with the light red escape code, not a reset.
    """
    return f"{bcolors.LightBlue}{s!s}{bcolors.LightRed}"


def line_blue(s: str) -> str:
    """
    Takes an input string and returns it in light blue, followed by a color reset
    """
    return "".join((bcolors.LightBlue, s, bcolors.NC))


def line_red(s: str) -> str:
    """
    Takes an input string and returns it in light red, followed by a color reset
    """
    return "".join((bcolors.LightRed, s, bcolors.NC))


def underline_text(input_text: str) -> str:
"""
Takes an input string and returns a white, underlined string (based on PrettyTable formatting)
"""
return bcolors.White + bcolors.Underline + input_text + bcolors.NC + bcolors.LightRed
return (
bcolors.White + bcolors.Underline + input_text + bcolors.NC + bcolors.LightRed
)
157 changes: 127 additions & 30 deletions chispa/schema_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,125 @@


class SchemasNotEqualError(Exception):
"""The schemas are not equal"""
pass
"""The schemas are not equal"""

pass


def print_schema_diff(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table"
) -> None:
    """
    Prints a comparison of two schemas without raising when they differ.

    ``output_format`` selects the rendering: "table" prints the result of
    ``create_schema_comparison_table`` and "tree" prints the result of
    ``create_schema_comparison_tree``. Any other value raises ``ValueError``.
    """
    valid_output_formats = ["table", "tree"]
    # Reject unknown formats up front so nothing is built or printed for them.
    if output_format not in valid_output_formats:
        raise ValueError(f"output_format must be one of {valid_output_formats}")

    build_diff = (
        create_schema_comparison_table
        if output_format == "table"
        else create_schema_comparison_tree
    )
    print(build_diff(s1, s2, ignore_nullable, ignore_metadata))


def create_schema_comparison_tree(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool
) -> str:
    """
    Builds a side-by-side tree rendering of two schemas as a single string.

    Both schemas are flattened depth-first into one text line per field, with
    nested struct fields indented four extra spaces per level. Row ``i`` of the
    output pairs the ``i``-th flattened line of ``s1`` with the ``i``-th
    flattened line of ``s2``. A row is wrapped with ``line_blue`` when
    ``are_structfields_equal`` reports the two fields as equal under the given
    ``ignore_nullable`` / ``ignore_metadata`` flags, and with ``line_red``
    otherwise. When one schema has fewer flattened fields, its side of the row
    is blank and ``None`` is passed as its field.

    Returns the header line, the colored rows and a trailing ``bcolors.NC``.
    """

    def parse_schema_as_tree(s, indent: int) -> tuple[list, list]:
        """
        Flattens ``s`` depth-first into two parallel lists: the rendered tree
        lines and the struct fields they were rendered from.
        """
        tree_lines = []
        fields = []

        for struct_field in s:
            nullable = (
                "(nullable = true)" if struct_field.nullable else "(nullable = false)"
            )
            struct_field_type = struct_field.dataType.typeName()

            struct_prefix = f"{indent * ' '}|{'-' * 2}"
            struct_as_string = f"{struct_field.name}: {struct_field_type} {nullable}"

            # Each field contributes exactly one line and one entry so that the
            # two lists stay index-aligned for the comparison loop below.
            tree_lines.append(f"{struct_prefix} {struct_as_string}")
            fields.append(struct_field)

            if struct_field_type == "struct":
                nested_lines, nested_fields = parse_schema_as_tree(
                    struct_field.dataType, indent + 4
                )
                tree_lines.extend(nested_lines)
                fields.extend(nested_fields)

        return tree_lines, fields

    tree_space = 6
    s1_tree, s1_fields = parse_schema_as_tree(s1, 0)
    s2_tree, s2_fields = parse_schema_as_tree(s2, 0)

    # `default` keeps an empty first schema from raising ValueError in max();
    # the header width is used so the "schema2" column still stays separated.
    widest_line = max((len(line) for line in s1_tree), default=len("schema1"))
    longest_tree = max(len(s1_tree), len(s2_tree))
    schema_gap = widest_line + tree_space

    tree_parts = ["\nschema1".ljust(schema_gap) + "schema2\n"]
    for i in range(longest_tree):
        line1 = line2 = ""
        s1_field = s2_field = None

        if i < len(s1_tree):
            line1 = s1_tree[i]
            s1_field = s1_fields[i]
        if i < len(s2_tree):
            line2 = s2_tree[i]
            s2_field = s2_fields[i]

        tree_line = line1.ljust(schema_gap) + line2

        if are_structfields_equal(s1_field, s2_field, ignore_nullable, ignore_metadata):
            tree_parts.append(line_blue(tree_line) + "\n")
        else:
            tree_parts.append(line_red(tree_line) + "\n")

    tree_parts.append(bcolors.NC)
    return "".join(tree_parts)


def create_schema_comparison_table(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool
):
    """
    Builds a two-column PrettyTable ("schema1", "schema2") pairing the fields of
    both schemas by position. Pairs that ``are_structfields_equal`` reports as
    equal are added in blue; all other pairs are added unmodified. When the
    schemas differ in length, the shorter side is padded by ``zip_longest``.
    """
    table = PrettyTable(["schema1", "schema2"])
    for left, right in six.moves.zip_longest(s1, s2):
        fields_match = are_structfields_equal(
            left, right, ignore_nullable, ignore_metadata
        )
        row = [blue(left), blue(right)] if fields_match else [left, right]
        table.add_row(row)
    return table


def check_if_schemas_are_wide(s1, s2) -> bool:
    """
    Returns True when either schema has a top-level field whose type name is
    "struct", or when either schema has more than 10 fields.
    """

    def has_struct_field(schema) -> bool:
        for field in schema:
            if field.dataType.typeName() == "struct":
                return True
        return False

    has_nesting = has_struct_field(s1) or has_struct_field(s2)
    has_many_columns = any(len(schema) > 10 for schema in (s1, s2))
    return has_nesting or has_many_columns


def handle_schemas_not_equal(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool
) -> None:
    """
    Always raises ``SchemasNotEqualError`` with a rendered comparison of the
    two schemas as its message.

    Schemas that ``check_if_schemas_are_wide`` reports as wide are rendered as
    a tree; all others are rendered as a table.
    """
    if check_if_schemas_are_wide(s1, s2):
        raise SchemasNotEqualError(
            create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata)
        )

    comparison_table = create_schema_comparison_table(
        s1, s2, ignore_nullable, ignore_metadata
    )
    raise SchemasNotEqualError("\n" + comparison_table.get_string())


def assert_schema_equality(s1, s2, ignore_nullable=False, ignore_metadata=False):
Expand All @@ -15,7 +132,9 @@ def assert_schema_equality(s1, s2, ignore_nullable=False, ignore_metadata=False)
assert_schema_equality_full(s1, s2, ignore_nullable, ignore_metadata)


def assert_schema_equality_full(s1, s2, ignore_nullable=False, ignore_metadata=False):
def assert_schema_equality_full(
s1, s2, ignore_nullable=False, ignore_metadata=False
) -> None:
def inner(s1, s2, ignore_nullable, ignore_metadata):
if len(s1) != len(s2):
return False
Expand All @@ -26,43 +145,21 @@ def inner(s1, s2, ignore_nullable, ignore_metadata):
return True

if not inner(s1, s2, ignore_nullable, ignore_metadata):
t = PrettyTable(["schema1", "schema2"])
zipped = list(six.moves.zip_longest(s1, s2))
for sf1, sf2 in zipped:
if are_structfields_equal(sf1, sf2, True):
t.add_row([blue(sf1), blue(sf2)])
else:
t.add_row([sf1, sf2])
raise SchemasNotEqualError("\n" + t.get_string())
handle_schemas_not_equal(s1, s2, ignore_nullable, ignore_metadata)


# deprecate this
# perhaps it is a little faster, but do we really need this?
# I think schema equality operations are really fast to begin with
def assert_basic_schema_equality(s1, s2):
if s1 != s2:
t = PrettyTable(["schema1", "schema2"])
zipped = list(six.moves.zip_longest(s1, s2))
for sf1, sf2 in zipped:
if sf1 == sf2:
t.add_row([blue(sf1), blue(sf2)])
else:
t.add_row([sf1, sf2])
raise SchemasNotEqualError("\n" + t.get_string())

handle_schemas_not_equal(s1, s2, ignore_nullable=False, ignore_metadata=False)


# deprecate this. ignore_nullable should be a flag.
def assert_schema_equality_ignore_nullable(s1, s2):
if not are_schemas_equal_ignore_nullable(s1, s2):
t = PrettyTable(["schema1", "schema2"])
zipped = list(six.moves.zip_longest(s1, s2))
for sf1, sf2 in zipped:
if are_structfields_equal(sf1, sf2, True):
t.add_row([blue(sf1), blue(sf2)])
else:
t.add_row([sf1, sf2])
raise SchemasNotEqualError("\n" + t.get_string())
handle_schemas_not_equal(s1, s2, ignore_nullable=True, ignore_metadata=False)


# deprecate this. ignore_nullable should be a flag.
Expand Down Expand Up @@ -101,9 +198,9 @@ def are_datatypes_equal_ignore_nullable(dt1, dt2):
"""
if dt1.typeName() == dt2.typeName():
# Account for array types by inspecting elementType.
if dt1.typeName() == 'array':
if dt1.typeName() == "array":
return are_datatypes_equal_ignore_nullable(dt1.elementType, dt2.elementType)
elif dt1.typeName() == 'struct':
elif dt1.typeName() == "struct":
return are_schemas_equal_ignore_nullable(dt1, dt2)
else:
return True
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = false)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = false)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = false)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[31m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[31m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[31m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Loading