From 2733fedfc0ad3eac0a60434a38e31f89456fe3b9 Mon Sep 17 00:00:00 2001
From: Alex Toker
Date: Thu, 23 Nov 2023 10:19:27 +0200
Subject: [PATCH 1/3] Bump pyarrow version to 14.0.1

---
 requirements.txt   |  3 +--
 tests/test_flow.py | 49 ++++++++++------------------------------------
 2 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b263ef69..04423637 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,8 +4,7 @@ v3io~=0.5.14
 # and 1.5.* due to https://github.com/pandas-dev/pandas/issues/49203
 pandas>=1, !=1.5.*, <3
 numpy>=1.16.5,<1.23
-# pyarrow 13 and over cause test failures
-pyarrow>=1,<13
+pyarrow>=1,<15
 v3io-frames~=0.10.3
 v3iofs~=0.1.17
 xxhash>=1
diff --git a/tests/test_flow.py b/tests/test_flow.py
index 290745ad..ab54e109 100644
--- a/tests/test_flow.py
+++ b/tests/test_flow.py
@@ -29,6 +29,7 @@ import pyarrow.parquet as pq
 import pytest
 from aiohttp import ClientConnectorError, InvalidURL
+from packaging import version
 from pandas.testing import assert_frame_equal
 
 import integration.conftest
@@ -2721,7 +2722,9 @@ def test_write_to_parquet_partition_by_hash(tmpdir):
     read_back_df = pd.read_parquet(out_file, columns=columns)
     read_back_df.sort_values("my_int", inplace=True)
     read_back_df.reset_index(drop=True, inplace=True)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_write_to_parquet_partition_by_column(tmpdir):
@@ -2754,7 +2757,9 @@ def test_write_to_parquet_partition_by_column(tmpdir):
     read_back_df = pd.read_parquet(out_file, columns=columns)
     read_back_df.sort_values("my_int", inplace=True)
     read_back_df.reset_index(drop=True, inplace=True)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_write_to_parquet_with_inference(tmpdir):
@@ -3225,43 +3230,9 @@ def test_csv_reader_parquet_write_microsecs(tmpdir):
     controller.await_termination()
     read_back_df = pd.read_parquet(out_file, columns=columns)
 
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
-
-
-def test_csv_reader_parquet_write_nanosecs(tmpdir):
-    out_file = f"{tmpdir}/test_csv_reader_parquet_write_nanosecs_{uuid.uuid4().hex}/"
-    columns = ["k", "t"]
-
-    time_format = "%d/%m/%Y %H:%M:%S.%f"
-    controller = build_flow(
-        [
-            CSVSource(
-                "tests/test-with-timestamp-nanosecs.csv",
-                header=True,
-                key_field="k",
-                parse_dates="t",
-                timestamp_format=time_format,
-            ),
-            ParquetTarget(
-                out_file,
-                columns=columns,
-                partition_cols=["$year", "$month", "$day", "$hour"],
-                max_events=2,
-            ),
-        ]
-    ).run()
-
-    expected = pd.DataFrame(
-        [
-            ["m1", datetime.strptime("15/02/2020 02:03:04.123456", time_format)],
-            ["m2", datetime.strptime("16/02/2020 02:03:04.123456", time_format)],
-        ],
-        columns=columns,
-    )
-    controller.await_termination()
-    read_back_df = pd.read_parquet(out_file, columns=columns)
-
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_error_in_table_persist():

From 76a064e0835ad51883a83915cb5c8bd78668061b Mon Sep 17 00:00:00 2001
From: Alex Toker
Date: Tue, 28 Nov 2023 10:12:16 +0200
Subject: [PATCH 2/3] Addressing Gal's comments

---
 requirements.txt   |  1 +
 tests/test_flow.py | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 04423637..3990ec52 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ v3io~=0.5.14
 # and 1.5.* due to https://github.com/pandas-dev/pandas/issues/49203
 pandas>=1, !=1.5.*, <3
 numpy>=1.16.5,<1.23
+# <15 is just a safeguard - no tests performed with pyarrow higher then 14
 pyarrow>=1,<15
 v3io-frames~=0.10.3
 v3iofs~=0.1.17
diff --git a/tests/test_flow.py b/tests/test_flow.py
index ab54e109..0a402d21 100644
--- a/tests/test_flow.py
+++ b/tests/test_flow.py
@@ -3235,6 +3235,44 @@ def test_csv_reader_parquet_write_microsecs(tmpdir):
     assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
+def test_csv_reader_parquet_write_nanosecs_truncation(tmpdir):
+    out_file = f"{tmpdir}/test_csv_reader_parquet_write_nanosecs_{uuid.uuid4().hex}/"
+    columns = ["k", "t"]
+
+    time_format = "%d/%m/%Y %H:%M:%S.%f"
+    controller = build_flow(
+        [
+            CSVSource(
+                "tests/test-with-timestamp-nanosecs.csv",
+                header=True,
+                key_field="k",
+                parse_dates="t",
+                timestamp_format=time_format,
+            ),
+            ParquetTarget(
+                out_file,
+                columns=columns,
+                partition_cols=["$year", "$month", "$day", "$hour"],
+                max_events=2,
+            ),
+        ]
+    ).run()
+
+    expected = pd.DataFrame(
+        [
+            ["m1", datetime.strptime("15/02/2020 02:03:04.123456", time_format)],
+            ["m2", datetime.strptime("16/02/2020 02:03:04.123456", time_format)],
+        ],
+        columns=columns,
+    )
+    controller.await_termination()
+    read_back_df = pd.read_parquet(out_file, columns=columns)
+
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
+
+
 def test_error_in_table_persist():
     table = Table(
         "table",

From 4f6848589ba976aa95be300499adb5f0d18edb3b Mon Sep 17 00:00:00 2001
From: Assaf Ben-Amitai
Date: Tue, 28 Nov 2023 13:15:58 +0200
Subject: [PATCH 3/3] Update requirements.txt

Co-authored-by: Gal Topper
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 3990ec52..4b9eed8f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ v3io~=0.5.14
 # and 1.5.* due to https://github.com/pandas-dev/pandas/issues/49203
 pandas>=1, !=1.5.*, <3
 numpy>=1.16.5,<1.23
-# <15 is just a safeguard - no tests performed with pyarrow higher then 14
+# <15 is just a safeguard - no tests performed with pyarrow higher than 14
 pyarrow>=1,<15
 v3io-frames~=0.10.3
 v3iofs~=0.1.17