From 2733fedfc0ad3eac0a60434a38e31f89456fe3b9 Mon Sep 17 00:00:00 2001
From: Alex Toker
Date: Thu, 23 Nov 2023 10:19:27 +0200
Subject: [PATCH 1/3] Bump pyarrow version to 14.0.1

---
 requirements.txt   |  3 +--
 tests/test_flow.py | 49 ++++++++++------------------------------------
 2 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b263ef69..04423637 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,8 +4,7 @@ v3io~=0.5.14
 # and 1.5.* due to https://github.com/pandas-dev/pandas/issues/49203
 pandas>=1, !=1.5.*, <3
 numpy>=1.16.5,<1.23
-# pyarrow 13 and over cause test failures
-pyarrow>=1,<13
+pyarrow>=1,<15
 v3io-frames~=0.10.3
 v3iofs~=0.1.17
 xxhash>=1
diff --git a/tests/test_flow.py b/tests/test_flow.py
index 290745ad..ab54e109 100644
--- a/tests/test_flow.py
+++ b/tests/test_flow.py
@@ -29,6 +29,7 @@ import pyarrow.parquet as pq
 import pytest
 from aiohttp import ClientConnectorError, InvalidURL
+from packaging import version
 from pandas.testing import assert_frame_equal
 
 import integration.conftest
@@ -2721,7 +2722,9 @@ def test_write_to_parquet_partition_by_hash(tmpdir):
     read_back_df = pd.read_parquet(out_file, columns=columns)
     read_back_df.sort_values("my_int", inplace=True)
     read_back_df.reset_index(drop=True, inplace=True)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_write_to_parquet_partition_by_column(tmpdir):
@@ -2754,7 +2757,9 @@ def test_write_to_parquet_partition_by_column(tmpdir):
     read_back_df = pd.read_parquet(out_file, columns=columns)
     read_back_df.sort_values("my_int", inplace=True)
     read_back_df.reset_index(drop=True, inplace=True)
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_write_to_parquet_with_inference(tmpdir):
@@ -3225,43 +3230,9 @@ def test_csv_reader_parquet_write_microsecs(tmpdir):
     controller.await_termination()
     read_back_df = pd.read_parquet(out_file, columns=columns)
 
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
-
-
-def test_csv_reader_parquet_write_nanosecs(tmpdir):
-    out_file = f"{tmpdir}/test_csv_reader_parquet_write_nanosecs_{uuid.uuid4().hex}/"
-    columns = ["k", "t"]
-
-    time_format = "%d/%m/%Y %H:%M:%S.%f"
-    controller = build_flow(
-        [
-            CSVSource(
-                "tests/test-with-timestamp-nanosecs.csv",
-                header=True,
-                key_field="k",
-                parse_dates="t",
-                timestamp_format=time_format,
-            ),
-            ParquetTarget(
-                out_file,
-                columns=columns,
-                partition_cols=["$year", "$month", "$day", "$hour"],
-                max_events=2,
-            ),
-        ]
-    ).run()
-
-    expected = pd.DataFrame(
-        [
-            ["m1", datetime.strptime("15/02/2020 02:03:04.123456", time_format)],
-            ["m2", datetime.strptime("16/02/2020 02:03:04.123456", time_format)],
-        ],
-        columns=columns,
-    )
-    controller.await_termination()
-    read_back_df = pd.read_parquet(out_file, columns=columns)
-
-    assert read_back_df.equals(expected), f"{read_back_df}\n!=\n{expected}"
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
 def test_error_in_table_persist():

From 76a064e0835ad51883a83915cb5c8bd78668061b Mon Sep 17 00:00:00 2001
From: Alex Toker
Date: Tue, 28 Nov 2023 10:12:16 +0200
Subject: [PATCH 2/3] Addressing Gal's comments

---
 requirements.txt   |  1 +
 tests/test_flow.py | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 04423637..3990ec52 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ v3io~=0.5.14
 # and 1.5.* due to https://github.com/pandas-dev/pandas/issues/49203
 pandas>=1, !=1.5.*, <3
 numpy>=1.16.5,<1.23
+# <15 is just a safeguard - no tests performed with pyarrow higher then 14
 pyarrow>=1,<15
 v3io-frames~=0.10.3
 v3iofs~=0.1.17
diff --git a/tests/test_flow.py b/tests/test_flow.py
index ab54e109..0a402d21 100644
--- a/tests/test_flow.py
+++ b/tests/test_flow.py
@@ -3235,6 +3235,44 @@ def test_csv_reader_parquet_write_microsecs(tmpdir):
     assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
 
 
+def test_csv_reader_parquet_write_nanosecs_truncation(tmpdir):
+    out_file = f"{tmpdir}/test_csv_reader_parquet_write_nanosecs_{uuid.uuid4().hex}/"
+    columns = ["k", "t"]
+
+    time_format = "%d/%m/%Y %H:%M:%S.%f"
+    controller = build_flow(
+        [
+            CSVSource(
+                "tests/test-with-timestamp-nanosecs.csv",
+                header=True,
+                key_field="k",
+                parse_dates="t",
+                timestamp_format=time_format,
+            ),
+            ParquetTarget(
+                out_file,
+                columns=columns,
+                partition_cols=["$year", "$month", "$day", "$hour"],
+                max_events=2,
+            ),
+        ]
+    ).run()
+
+    expected = pd.DataFrame(
+        [
+            ["m1", datetime.strptime("15/02/2020 02:03:04.123456", time_format)],
+            ["m2", datetime.strptime("16/02/2020 02:03:04.123456", time_format)],
+        ],
+        columns=columns,
+    )
+    controller.await_termination()
+    read_back_df = pd.read_parquet(out_file, columns=columns)
+
+    # with the introduction of s, ms, us time resolutions in pandas-2.0, the dtype of the parquet data
+    # is set to datetime64[us], while default DataFrame dtype is datetime64[ns]
+    assert_frame_equal(expected, read_back_df, check_dtype=version.parse(pd.__version__) < version.parse("2.0.0"))
+
+
 def test_error_in_table_persist():
     table = Table(
         "table",

From 4f6848589ba976aa95be300499adb5f0d18edb3b Mon Sep 17 00:00:00 2001
From: Assaf Ben-Amitai
Date: Tue, 28 Nov 2023 13:15:58 +0200
Subject: [PATCH 3/3] Update requirements.txt

Co-authored-by: Gal Topper
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 3990ec52..4b9eed8f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ v3io~=0.5.14
 # and 1.5.* due to https://github.com/pandas-dev/pandas/issues/49203
 pandas>=1, !=1.5.*, <3
 numpy>=1.16.5,<1.23
-# <15 is just a safeguard - no tests performed with pyarrow higher then 14
+# <15 is just a safeguard - no tests performed with pyarrow higher than 14
 pyarrow>=1,<15
 v3io-frames~=0.10.3
 v3iofs~=0.1.17