-
Notifications
You must be signed in to change notification settings - Fork 228
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support partitioning hints for athena iceberg (#1403)
* Support partitioning hints for athena iceberg
* Escape partitioning column names
* Update docs
* Move athena skip marker
* marks athena adapter tests essential

---------

Co-authored-by: Marcin Rudolf <[email protected]>
- Loading branch information
Showing
9 changed files
with
339 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from typing import Any, Optional, Dict, Protocol, Sequence, Union, Final | ||
|
||
from dateutil import parser | ||
|
||
from dlt.common.pendulum import timezone | ||
from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TColumnSchema | ||
from dlt.destinations.utils import ensure_resource | ||
from dlt.extract import DltResource | ||
from dlt.extract.items import TTableHintTemplate | ||
|
||
|
||
# Key under which `athena_adapter` stores the partition spec in the additional table hints
PARTITION_HINT: Final[str] = "x-athena-partition"
|
||
|
||
class PartitionTransformation:
    """A partition expression template paired with the column it applies to.

    The column name is kept separate from the template so that whoever renders
    the final expression can substitute an escaped identifier for the
    `{column_name}` placeholder.
    """

    template: str
    """Template string of the transformation including column name placeholder. E.g. `bucket(16, {column_name})`"""
    column_name: str
    """Column name to apply the transformation to"""

    def __init__(self, template: str, column_name: str) -> None:
        """Store the transformation template and its target column.

        Args:
            template: Expression containing a `{column_name}` placeholder.
            column_name: Unescaped name of the column to partition by.
        """
        self.template = template
        self.column_name = column_name

    def __repr__(self) -> str:
        # Readable representation so transformations are recognizable in logs and errors
        return (
            f"{type(self).__name__}(template={self.template!r},"
            f" column_name={self.column_name!r})"
        )
|
||
|
||
class athena_partition:
    """Factory namespace for iceberg partition transformations.

    Every method returns a `PartitionTransformation` whose template still
    contains the `{column_name}` placeholder, e.g. `athena_partition.bucket(16, "id")`
    yields the template `bucket(16, {column_name})` for column `id`.
    The column name is injected later, so it can be escaped at that point.
    """

    @staticmethod
    def year(column_name: str) -> PartitionTransformation:
        """Build a transformation partitioning by the year of a date or timestamp column."""
        template = "year({column_name})"
        return PartitionTransformation(template=template, column_name=column_name)

    @staticmethod
    def month(column_name: str) -> PartitionTransformation:
        """Build a transformation partitioning by the month of a date or timestamp column."""
        template = "month({column_name})"
        return PartitionTransformation(template=template, column_name=column_name)

    @staticmethod
    def day(column_name: str) -> PartitionTransformation:
        """Build a transformation partitioning by the day of a date or timestamp column."""
        template = "day({column_name})"
        return PartitionTransformation(template=template, column_name=column_name)

    @staticmethod
    def hour(column_name: str) -> PartitionTransformation:
        """Build a transformation partitioning by the hour of a date or timestamp column."""
        template = "hour({column_name})"
        return PartitionTransformation(template=template, column_name=column_name)

    @staticmethod
    def bucket(n: int, column_name: str) -> PartitionTransformation:
        """Build a transformation partitioning by the hashed value spread over `n` buckets."""
        # Only `n` is interpolated here; the column placeholder stays literal
        template = f"bucket({n}, " + "{column_name})"
        return PartitionTransformation(template=template, column_name=column_name)

    @staticmethod
    def truncate(length: int, column_name: str) -> PartitionTransformation:
        """Build a transformation partitioning by the value truncated to `length`."""
        # Only `length` is interpolated here; the column placeholder stays literal
        template = f"truncate({length}, " + "{column_name})"
        return PartitionTransformation(template=template, column_name=column_name)
|
||
|
||
def athena_adapter(
    data: Any,
    partition: Optional[
        Union[str, PartitionTransformation, Sequence[Union[str, PartitionTransformation]]]
    ] = None,
) -> DltResource:
    """
    Prepares data for loading into Athena

    Args:
        data: The data to be transformed.
            This can be raw data or an instance of DltResource.
            If raw data is provided, the function will wrap it into a `DltResource` object.
        partition: Column name(s) or instances of `PartitionTransformation` to partition the table by.
            To use a transformation it's best to use the methods of the helper class `athena_partition`
            to generate correctly escaped SQL in the loader.

    Returns:
        A `DltResource` object that is ready to be loaded into Athena.

    Raises:
        ValueError: If any hint is invalid or none are specified.

    Examples:
        >>> data = [{"name": "Marcel", "department": "Engineering", "date_hired": "2024-01-30"}]
        >>> athena_adapter(data, partition=["department", athena_partition.year("date_hired"), athena_partition.bucket(8, "name")])
        [DltResource with hints applied]
    """
    resource = ensure_resource(data)

    if not partition:
        raise ValueError("A value for `partition` must be specified.")

    # Normalize a single column name / transformation into a one-element list
    if isinstance(partition, str) or not isinstance(partition, Sequence):
        partition = [partition]

    # Partition hint is `{column_name: template}`, e.g. `{"department": "{column_name}", "date_hired": "year({column_name})"}`
    # Use one dict for all hints instead of storing on column so order is preserved
    # NOTE: the hint is keyed by column name, so a later item for the same column
    # replaces the template of an earlier one
    partition_hint: Dict[str, str] = {}

    for item in partition:
        if isinstance(item, PartitionTransformation):
            # Client will generate the final SQL string with escaped column name injected
            partition_hint[item.column_name] = item.template
        elif isinstance(item, str):
            # Item is the column name
            partition_hint[item] = "{column_name}"
        else:
            # Fail early instead of storing a hint that cannot be rendered later
            raise ValueError(
                "Partition items must be column names or `PartitionTransformation` instances,"
                f" got {type(item).__name__}."
            )

    additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {PARTITION_HINT: partition_hint}
    resource.apply_hints(additional_table_hints=additional_table_hints)
    return resource
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from tests.utils import skip_if_not_active | ||
|
||
|
||
# NOTE(review): presumably skips the tests of this package unless the "athena"
# destination is active — confirm against `tests.utils.skip_if_not_active`
skip_if_not_active("athena")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import pytest | ||
|
||
import dlt | ||
from dlt.destinations import filesystem | ||
from dlt.destinations.impl.athena.athena_adapter import athena_adapter, athena_partition | ||
|
||
# Mark every test in this module as essential; do not remove this marker
pytestmark = pytest.mark.essential
|
||
|
||
def test_iceberg_partition_hints():
    """Build tables with and without athena partition hints and inspect the generated SQL."""

    @dlt.resource(table_format="iceberg")
    def partitioned_table():
        yield {
            "product_id": 1,
            "name": "product 1",
            "created_at": "2021-01-01T00:00:00Z",
            "category": "category 1",
            "price": 100.0,
            "quantity": 10,
        }

    @dlt.resource(table_format="iceberg")
    def not_partitioned_table():
        yield {"a": 1, "b": 2}

    # Mix a plain column with several transformations to exercise ordering
    partition_spec = [
        "category",
        athena_partition.month("created_at"),
        athena_partition.bucket(10, "product_id"),
        athena_partition.truncate(2, "name"),
    ]
    athena_adapter(partitioned_table, partition=partition_spec)

    pipeline = dlt.pipeline(
        "athena_test",
        destination="athena",
        staging=filesystem("s3://not-a-real-bucket"),
        full_refresh=True,
    )
    pipeline.extract([partitioned_table, not_partitioned_table])
    pipeline.normalize()

    with pipeline._sql_job_client(pipeline.default_schema) as client:

        def first_update_statement(table_name):
            # Render the table update SQL from the columns recorded in the default schema
            table_columns = list(pipeline.default_schema.tables[table_name]["columns"].values())
            return client._get_table_update_sql(table_name, table_columns, False)[0]

        sql_partitioned = first_update_statement("partitioned_table")
        sql_not_partitioned = first_update_statement("not_partitioned_table")

    # Partition clause is generated with original order
    rendered_partitions = ", ".join(
        [
            "`category`",
            "month(`created_at`)",
            "bucket(10, `product_id`)",
            "truncate(2, `name`)",
        ]
    )
    expected_clause = "PARTITIONED BY (" + rendered_partitions + ")"
    assert expected_clause in sql_partitioned

    # No partition clause otherwise
    assert "PARTITIONED BY" not in sql_not_partitioned
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.