Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tree string #95

Merged
merged 25 commits into from
Oct 12, 2024
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c5062f8
prelim tree string code - move table creation to separate function
jeffbrennan Mar 13, 2024
1babd2c
add prelim test for tree string
jeffbrennan Mar 13, 2024
a2c962f
add prelim tree string code
jeffbrennan Mar 14, 2024
385f140
add side by side schema tree string
jeffbrennan Mar 14, 2024
19082fd
improve tree appearance
jeffbrennan Mar 14, 2024
4f62467
simplify formatting
jeffbrennan Mar 16, 2024
8468412
add helper functions + autoformat
jeffbrennan Mar 16, 2024
8ffa73f
handle ignore nullable in tree string
jeffbrennan Mar 16, 2024
4c5cbd5
add print output tests
jeffbrennan Mar 17, 2024
f6d994f
handle metadata, use existing `are_structfields_equal` comparison check
jeffbrennan Mar 17, 2024
1fcd1df
update tests
jeffbrennan Mar 17, 2024
e522788
add metadata tests
jeffbrennan Mar 17, 2024
b55f601
simplify logic, remove horizontal character param
jeffbrennan Mar 17, 2024
01d1689
improve variable names
jeffbrennan Mar 17, 2024
cf27a12
add `print_schema_diff` as wrapper to compare two schemas without error
jeffbrennan Mar 22, 2024
a6b0d7a
add missing return type hints
jeffbrennan Mar 22, 2024
b11b888
Merge branch 'main' into add-tree-string2
jeffbrennan Sep 24, 2024
c12fd31
add six package
jeffbrennan Sep 24, 2024
34eeebf
add missing `create_schema_comparison_tree` function
jeffbrennan Sep 24, 2024
0b1ad8e
fix unit test failures in `test_schema_comparer` and `test_dataframe_…
jeffbrennan Sep 24, 2024
11437c5
replace six with itertools
jeffbrennan Sep 25, 2024
a787b74
remove double import
jeffbrennan Sep 25, 2024
8c036a6
formatting fixes
jeffbrennan Sep 25, 2024
c54f2f7
handle mypy issues
jeffbrennan Sep 25, 2024
7c5a78b
update tests to include newline added by pre-commit format
jeffbrennan Sep 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions chispa/bcolors.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ def blue(s: str) -> str:
return bcolors.LightBlue + str(s) + bcolors.LightRed


def line_blue(s: str) -> str:
    """Return ``s`` prefixed with ``bcolors.LightBlue`` and terminated with ``bcolors.NC``.

    Unlike ``blue`` above, the input is not passed through ``str()``, so ``s``
    must already be a string.
    """
    start, reset = bcolors.LightBlue, bcolors.NC
    return start + s + reset


def line_red(s: str) -> str:
    """Return ``s`` prefixed with ``bcolors.LightRed`` and terminated with ``bcolors.NC``.

    The input is concatenated as-is, so ``s`` must already be a string.
    """
    start, reset = bcolors.LightRed, bcolors.NC
    return start + s + reset


def underline_text(input_text: str) -> str:
"""
Takes an input string and returns a white, underlined string (based on PrettyTable formatting)
Expand Down
145 changes: 121 additions & 24 deletions chispa/schema_comparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import typing
from itertools import zip_longest

import six
Copy link
Collaborator

@fpgmaas fpgmaas Sep 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need six; we can use itertools.zip_longest from the standard library.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated!

from prettytable import PrettyTable
from pyspark.sql.types import StructField, StructType

from chispa.bcolors import bcolors, line_blue, line_red
from chispa.formatting import blue


Expand All @@ -15,6 +17,122 @@ class SchemasNotEqualError(Exception):
pass


def print_schema_diff(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool, output_format: str = "table"
) -> None:
    """Print a side-by-side comparison of two schemas without raising on differences.

    Args:
        s1: First schema (rendered in the ``schema1`` column).
        s2: Second schema (rendered in the ``schema2`` column).
        ignore_nullable: Forwarded to the comparison builder.
        ignore_metadata: Forwarded to the comparison builder.
        output_format: ``"table"`` (default) or ``"tree"``.

    Raises:
        ValueError: If ``output_format`` is not one of the supported formats.
    """
    valid_output_formats = ["table", "tree"]
    # Reject unsupported formats up front so nothing is built for bad input.
    if output_format not in valid_output_formats:
        raise ValueError(f"output_format must be one of {valid_output_formats}")

    build_diff = (
        create_schema_comparison_table
        if output_format == "table"
        else create_schema_comparison_tree
    )
    print(build_diff(s1, s2, ignore_nullable, ignore_metadata))


def create_schema_comparison_tree(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool
) -> str:
    """Render two schemas side by side as colored, indented trees.

    Each schema is flattened depth-first into one text line per field, with
    nested ``struct`` fields indented by four extra spaces per level. The
    two flattened lists are paired positionally. A row is wrapped with
    ``line_blue`` when ``are_structfields_equal`` reports the paired fields
    equal and with ``line_red`` otherwise. Rows where one schema has run out
    of fields are compared against ``None``.

    Args:
        s1: First schema; an iterable of struct fields.
        s2: Second schema; an iterable of struct fields.
        ignore_nullable: Forwarded to ``are_structfields_equal``.
        ignore_metadata: Forwarded to ``are_structfields_equal``.

    Returns:
        The rendered comparison: a ``schema1``/``schema2`` header, one colored
        row per field position, and a trailing ``bcolors.NC``.
    """

    def parse_schema_as_tree(s, indent: int) -> tuple[list, list]:
        # Returns parallel lists: the rendered line for every field (depth-first)
        # and the field object that produced that line.
        tree_lines = []
        fields = []

        for struct_field in s:
            nullable = (
                "(nullable = true)" if struct_field.nullable else "(nullable = false)"
            )
            struct_field_type = struct_field.dataType.typeName()

            struct_prefix = f"{indent * ' '}|{'-' * 2}"
            struct_as_string = f"{struct_field.name}: {struct_field_type} {nullable}"

            tree_lines.append(f"{struct_prefix} {struct_as_string}")
            fields.append(struct_field)

            # Only struct columns are expanded; their children follow the parent.
            if struct_field_type == "struct":
                nested_lines, nested_fields = parse_schema_as_tree(
                    struct_field.dataType, indent + 4
                )
                tree_lines.extend(nested_lines)
                fields.extend(nested_fields)

        return tree_lines, fields

    tree_space = 6
    s1_tree, s1_fields = parse_schema_as_tree(s1, 0)
    s2_tree, s2_fields = parse_schema_as_tree(s2, 0)

    # An empty schema1 previously crashed in max(); fall back to the header
    # label width so the schema2 column is still offset from the left edge.
    widest_line = max((len(line) for line in s1_tree), default=len("schema1"))
    schema_gap = widest_line + tree_space

    rows = ["\nschema1".ljust(schema_gap) + "schema2\n"]
    paired = zip_longest(
        zip(s1_tree, s1_fields),
        zip(s2_tree, s2_fields),
        fillvalue=("", None),  # exhausted side: blank text, no field
    )
    for (line1, s1_field), (line2, s2_field) in paired:
        tree_line = line1.ljust(schema_gap) + line2
        fields_match = are_structfields_equal(
            s1_field, s2_field, ignore_nullable, ignore_metadata
        )
        colorize = line_blue if fields_match else line_red
        rows.append(colorize(tree_line) + "\n")

    rows.append(bcolors.NC)
    return "".join(rows)


def create_schema_comparison_table(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool
) -> PrettyTable:
    """Build a two-column ``PrettyTable`` comparing the fields of two schemas.

    Fields are paired positionally; when one schema is shorter, its missing
    entries are ``None``. Pairs that ``are_structfields_equal`` reports equal
    are added as ``blue``-colored strings, and differing pairs are added
    unmodified.

    Args:
        s1: First schema (``schema1`` column).
        s2: Second schema (``schema2`` column).
        ignore_nullable: Forwarded to ``are_structfields_equal``.
        ignore_metadata: Forwarded to ``are_structfields_equal``.

    Returns:
        The populated table; call ``get_string()`` on it to render.
    """
    t = PrettyTable(["schema1", "schema2"])
    # itertools.zip_longest is already imported by this module, so the
    # third-party ``six.moves`` alias is unnecessary here.
    for sf1, sf2 in zip_longest(s1, s2):
        if are_structfields_equal(sf1, sf2, ignore_nullable, ignore_metadata):
            t.add_row([blue(str(sf1)), blue(str(sf2))])
        else:
            t.add_row([sf1, sf2])
    return t


def check_if_schemas_are_wide(s1, s2) -> bool:
    """Report whether either schema is considered "wide".

    A pair of schemas is wide when at least one top-level field in either
    schema has type name ``"struct"``, or when either schema has more than
    10 top-level fields.
    """

    def has_struct_column(schema) -> bool:
        for field in schema:
            if field.dataType.typeName() == "struct":
                return True
        return False

    has_nesting = has_struct_column(s1) or has_struct_column(s2)
    has_many_columns = len(s1) > 10 or len(s2) > 10
    if has_nesting:
        return True
    return has_many_columns


def handle_schemas_not_equal(
    s1, s2, ignore_nullable: bool, ignore_metadata: bool
) -> None:
    """Raise ``SchemasNotEqualError`` with a rendered comparison of both schemas.

    The tree rendering is used when ``check_if_schemas_are_wide`` returns true
    for the pair; otherwise the table rendering is used, preceded by a newline.

    Raises:
        SchemasNotEqualError: Always.
    """
    if not check_if_schemas_are_wide(s1, s2):
        table = create_schema_comparison_table(
            s1, s2, ignore_nullable, ignore_metadata
        )
        raise SchemasNotEqualError("\n" + table.get_string())

    raise SchemasNotEqualError(
        create_schema_comparison_tree(s1, s2, ignore_nullable, ignore_metadata)
    )


def assert_schema_equality(
s1: StructType, s2: StructType, ignore_nullable: bool = False, ignore_metadata: bool = False
) -> None:
Expand All @@ -37,42 +155,21 @@ def inner(s1: StructType, s2: StructType, ignore_nullable: bool, ignore_metadata
return True

if not inner(s1, s2, ignore_nullable, ignore_metadata):
t = PrettyTable(["schema1", "schema2"])
zipped = list(zip_longest(s1, s2))
for sf1, sf2 in zipped:
if are_structfields_equal(sf1, sf2, True):
t.add_row([blue(str(sf1)), blue(str(sf2))])
else:
t.add_row([sf1, sf2])
raise SchemasNotEqualError("\n" + t.get_string())
handle_schemas_not_equal(s1, s2, ignore_nullable, ignore_metadata)


# deprecate this
# perhaps it is a little faster, but do we really need this?
# I think schema equality operations are really fast to begin with
def assert_basic_schema_equality(s1: StructType, s2: StructType) -> None:
    """Assert that two schemas compare equal with ``==``.

    On mismatch, error reporting is delegated to ``handle_schemas_not_equal``
    with nullability and metadata both taken into account, instead of
    duplicating the table-building code inline.

    Raises:
        SchemasNotEqualError: Raised by ``handle_schemas_not_equal`` when
            ``s1 != s2``.
    """
    if s1 != s2:
        handle_schemas_not_equal(s1, s2, ignore_nullable=False, ignore_metadata=False)


# deprecate this. ignore_nullable should be a flag.
def assert_schema_equality_ignore_nullable(s1: StructType, s2: StructType) -> None:
    """Assert that two schemas are equal according to ``are_schemas_equal_ignore_nullable``.

    On mismatch, error reporting is delegated to ``handle_schemas_not_equal``
    with ``ignore_nullable=True``, instead of duplicating the table-building
    code inline.

    Raises:
        SchemasNotEqualError: Raised by ``handle_schemas_not_equal`` when the
            schemas are not considered equal.
    """
    if not are_schemas_equal_ignore_nullable(s1, s2):
        handle_schemas_not_equal(s1, s2, ignore_nullable=True, ignore_metadata=False)


# deprecate this. ignore_nullable should be a flag.
Expand Down
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ classifiers = [
[tool.poetry.dependencies]
python = ">=3.8,<4.0"
prettytable = "^3.10.2"
six = "^1.16.0"

[tool.poetry.group.dev.dependencies]
pytest = "7.4.2"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[31m |-- purple: integer (nullable = true)\x1b[0m\n\x1b[31m |-- phone_number: string (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = false)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = false)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = false)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[34m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[31m|-- fav_genres: struct (nullable = true) |-- fav_genres: struct (nullable = true)\x1b[0m\n\x1b[31m |-- rock: struct (nullable = true) |-- rock: struct (nullable = true)\x1b[0m\n\x1b[34m |-- metal: integer (nullable = true) |-- metal: integer (nullable = true)\x1b[0m\n\x1b[31m |-- punk: integer (nullable = true) |-- classic: integer (nullable = true)\x1b[0m\n\x1b[34m |-- electronic: struct (nullable = true) |-- electronic: struct (nullable = true)\x1b[0m\n\x1b[34m |-- house: integer (nullable = true) |-- house: integer (nullable = true)\x1b[0m\n\x1b[34m |-- dubstep: integer (nullable = true) |-- dubstep: integer (nullable = true)\x1b[0m\n\x1b[31m |-- pop: struct (nullable = true)\x1b[0m\n\x1b[31m |-- pop: integer (nullable = true)\x1b[0m\n\x1b[0m'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'\nschema1 schema2\n\x1b[31m|-- name: string (nullable = true) |-- name: string (nullable = true)\x1b[0m\n\x1b[34m|-- age: integer (nullable = true) |-- age: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_number: integer (nullable = true) |-- fav_number: integer (nullable = true)\x1b[0m\n\x1b[34m|-- fav_numbers: array (nullable = true) |-- fav_numbers: array (nullable = true)\x1b[0m\n\x1b[31m|-- fav_colors: struct (nullable = true) |-- fav_colors: struct (nullable = true)\x1b[0m\n\x1b[31m |-- red: integer (nullable = true) |-- orange: integer (nullable = true)\x1b[0m\n\x1b[34m |-- green: integer (nullable = true) |-- green: integer (nullable = true)\x1b[0m\n\x1b[31m |-- blue: integer (nullable = true) |-- yellow: integer (nullable = true)\x1b[0m\n\x1b[0m'
Loading