
Add quoting around partition keys for Hive table inputs #834

Merged
merged 14 commits on Oct 5, 2022
10 changes: 10 additions & 0 deletions dask_sql/input_utils/hive.py
@@ -166,6 +166,9 @@ def wrapped_read_function(location, column_information, **kwargs):
partition_values = ast.literal_eval(
partition_table_information["Partition Value"]
)
# multiple partition column values are returned as a comma-separated string
if "," in partition_values:
partition_values = [x.strip() for x in partition_values.split(",")]

logger.debug(
f"Applying additional partition information as columns: {partition_information}"
@@ -200,6 +203,9 @@ def _parse_hive_table_description(
"""
cursor.execute(f"USE {schema}")
if partition:
# Hive wants a quoted, comma-separated list of partition keys
partition = partition.replace("=", '="')
partition = partition.replace("/", '",') + '"'
result = self._fetch_all_results(
cursor, f"DESCRIBE FORMATTED {table_name} PARTITION ({partition})"
)
@@ -245,6 +251,10 @@ def _parse_hive_table_description(
storage_information[key] = value
last_field = storage_information[key]
elif mode == "table":
# Hive returns partition values as a bracketed list;
# quoting them makes parsing work regardless of the partition column type
if key == "Partition Value":
value = '"' + value.strip("[]") + '"'
table_information[key] = value
last_field = table_information[key]
elif mode == "partition":
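
To make the effect of these changes concrete, here is a rough standalone sketch (derived from the hunks above, not part of the library's public API) of how a two-column partition spec is quoted for DESCRIBE FORMATTED and how the resulting bracketed partition value is split back into one value per partition column:

import ast

# Illustration only: a partition spec such as "j=2/k=a" (the slash-separated
# form Hive reports for partitions) is rewritten into the quoted,
# comma-separated form that DESCRIBE FORMATTED ... PARTITION (...) expects.
partition = "j=2/k=a"
partition = partition.replace("=", '="')
partition = partition.replace("/", '",') + '"'
assert partition == 'j="2",k="a"'

# The "Partition Value" field of the description is a bracketed list such as
# "[2, a]"; wrapping it in quotes lets ast.literal_eval handle any column
# type, and the comma split yields one value per partition column.
value = "[2, a]"
quoted = '"' + value.strip("[]") + '"'
partition_values = ast.literal_eval(quoted)  # -> "2, a"
if "," in partition_values:
    partition_values = [x.strip() for x in partition_values.split(",")]
assert partition_values == ["2", "a"]
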
30 changes: 29 additions & 1 deletion tests/integration/test_hive.py
@@ -66,6 +66,7 @@ def hive_cursor():

tmpdir = tempfile.mkdtemp()
tmpdir_parted = tempfile.mkdtemp()
tmpdir_multiparted = tempfile.mkdtemp()

try:
network = client.networks.create("dask-sql-hive", driver="bridge")
@@ -75,7 +76,11 @@
hostname="hive-server",
name="hive-server",
network="dask-sql-hive",
volumes=[f"{tmpdir}:{tmpdir}", f"{tmpdir_parted}:{tmpdir_parted}"],
volumes=[
f"{tmpdir}:{tmpdir}",
f"{tmpdir_parted}:{tmpdir_parted}",
f"{tmpdir_multiparted}:{tmpdir_multiparted}",
],
environment={
"HIVE_CORE_CONF_javax_jdo_option_ConnectionURL": "jdbc:postgresql://hive-metastore-postgresql/metastore",
**DEFAULT_CONFIG,
@@ -147,9 +152,19 @@ def hive_cursor():
cursor.execute("INSERT INTO df_part PARTITION (j=2) (i) VALUES (1)")
cursor.execute("INSERT INTO df_part PARTITION (j=4) (i) VALUES (2)")

cursor.execute(
f"""
CREATE TABLE df_parts (i INTEGER) PARTITIONED BY (j INTEGER, k STRING)
ROW FORMAT DELIMITED STORED AS PARQUET LOCATION '{tmpdir_multiparted}'
"""
)
cursor.execute("INSERT INTO df_parts PARTITION (j=1, k='a') (i) VALUES (1)")
cursor.execute("INSERT INTO df_parts PARTITION (j=2, k='b') (i) VALUES (2)")

# The data files are created as root user by default. Change that:
hive_server.exec_run(["chmod", "a+rwx", "-R", tmpdir])
hive_server.exec_run(["chmod", "a+rwx", "-R", tmpdir_parted])
hive_server.exec_run(["chmod", "a+rwx", "-R", tmpdir_multiparted])

yield cursor
except docker.errors.ImageNotFound:
@@ -196,3 +211,16 @@ def test_select_partitions(hive_cursor):
expected_df["j"] = expected_df["j"].astype("int64")

assert_eq(result_df, expected_df, check_index=False)


def test_select_multipartitions(hive_cursor):
c = Context()
c.create_table("df_parts", hive_cursor)

result_df = c.sql("SELECT * FROM df_parts")
expected_df = pd.DataFrame({"i": [1, 2], "j": [1, 2], "k": ["a", "b"]})
expected_df["i"] = expected_df["i"].astype("int32")
expected_df["j"] = expected_df["j"].astype("int64")
expected_df["k"] = expected_df["k"].astype("object")

assert_eq(result_df, expected_df, check_index=False)
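
Outside the dockerized test fixture, the same multi-column partitioned table could be consumed roughly as follows. This is a sketch only; the HiveServer2 host, port, and connection details are placeholders rather than part of this change:

from pyhive import hive

from dask_sql import Context

# Placeholder connection details for a reachable HiveServer2 instance.
cursor = hive.connect(host="localhost", port=10000).cursor()

c = Context()
# Register the Hive table; the partition columns j and k are re-attached
# as ordinary columns on the resulting Dask DataFrame.
c.create_table("df_parts", cursor)

print(c.sql("SELECT i, j, k FROM df_parts WHERE j = 2").compute())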