diff --git a/requirements.txt b/requirements.txt
index b263ef69..4b9eed8f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,8 +4,8 @@ v3io~=0.5.14
 # and 1.5.* due to https://github.com/pandas-dev/pandas/issues/49203
 pandas>=1, !=1.5.*, <3
 numpy>=1.16.5,<1.23
-# pyarrow 13 and over cause test failures
-pyarrow>=1,<13
+# <15 is just a safeguard - no tests performed with pyarrow higher than 14
+pyarrow>=1,<15
 v3io-frames~=0.10.3
 v3iofs~=0.1.17
 xxhash>=1
diff --git a/tests/test_flow.py b/tests/test_flow.py
index 290745ad..0a402d21 100644
--- a/tests/test_flow.py
+++ b/tests/test_flow.py
@@ -29,6 +29,7 @@
 import pyarrow.parquet as pq
 import pytest
 from aiohttp import ClientConnectorError, InvalidURL
+from packaging import version
 from pandas.testing import assert_frame_equal
 
 import integration.conftest
@@ -2721,7 +2722,9 @@ def test_write_to_parquet_partition_by_hash(tmpdir):
     read_back_df = pd.read_parquet(out_file, columns=columns)
     read_back_df.sort_values("my_int", inplace=True)
     read_back_df.reset_index(drop=True, inplace=True)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_write_to_parquet_partition_by_column(tmpdir):
@@ -2754,7 +2757,9 @@
     read_back_df = pd.read_parquet(out_file, columns=columns)
     read_back_df.sort_values("my_int", inplace=True)
     read_back_df.reset_index(drop=True, inplace=True)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_write_to_parquet_with_inference(tmpdir):
@@ -3225,10 +3230,12 @@ def test_csv_reader_parquet_write_microsecs(tmpdir):
     controller.await_termination()
 
     read_back_df = pd.read_parquet(out_file, columns=columns)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
-def test_csv_reader_parquet_write_nanosecs(tmpdir):
+def test_csv_reader_parquet_write_nanosecs_truncation(tmpdir):
     out_file = f"{tmpdir}/test_csv_reader_parquet_write_nanosecs_{uuid.uuid4().hex}/"
 
     columns = ["k", "t"]
@@ -3261,7 +3268,9 @@
     controller.await_termination()
 
     read_back_df = pd.read_parquet(out_file, columns=columns)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_error_in_table_persist():