From 21ecaffb3ad782874b337ebecdcb0a4f9d498f88 Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Fri, 15 Jan 2021 17:53:57 +0000 Subject: [PATCH 01/15] Using awswrangler for new internals - Moved code from create_temp_table module to wrangler module - Added wrapper functions to deal with setting boto clients and s3 paths before calling awswrangler - Added some extra functions to create_temp_table - Need to add more tests - Need to update readme --- pydbtools/__init__.py | 20 ++- pydbtools/create_temp_table.py | 103 ----------- pydbtools/get_athena_query_response.py | 22 ++- pydbtools/read_sql.py | 28 ++- pydbtools/utils.py | 84 +++++---- pydbtools/wrangler.py | 230 +++++++++++++++++++++++++ pyproject.toml | 2 - 7 files changed, 343 insertions(+), 146 deletions(-) delete mode 100644 pydbtools/create_temp_table.py create mode 100644 pydbtools/wrangler.py diff --git a/pydbtools/__init__.py b/pydbtools/__init__.py index b3e301e..4eb8717 100644 --- a/pydbtools/__init__.py +++ b/pydbtools/__init__.py @@ -1,7 +1,25 @@ from .read_sql import read_sql -from .create_temp_table import create_temp_table from .get_athena_query_response import get_athena_query_response +from .wrangler import ( + create_temp_table, + read_sql_query, + read_sql_table, + create_athena_bucket, + describe_table, + get_query_columns_types, + get_query_execution, + get_work_group, + repair_table, + show_create_table, + start_query_execution, + stop_query_execution, + wait_query, + start_query_execution_and_wait, + create_temp_table, + create_temp_database, +) + import poetry_version __version__ = poetry_version.extract(source_file=__file__) diff --git a/pydbtools/create_temp_table.py b/pydbtools/create_temp_table.py deleted file mode 100644 index ba6c4f4..0000000 --- a/pydbtools/create_temp_table.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -import sqlparse - -from dataengineeringutils3.s3 import delete_s3_folder_contents - -from pydbtools.utils import ( - temp_database_name_prefix, - 
get_user_id_and_table_dir, - get_database_name_from_userid, - clean_query, -) - -from pydbtools.get_athena_query_response import get_athena_query_response - - -def check_sql(sql_query: str): - """ - Validates sql_query to confirm it is a select statement - """ - parsed = sqlparse.parse(clean_query(sql_query)) - i = 0 - for p in parsed: - if p.get_type() != "SELECT" or i > 0: - raise ValueError("The sql statement must be a single select query") - i += 1 - - -def create_temp_table( - sql_query: str, - table_name: str, - timeout: int = None, - force_ec2: bool = False, - region_name: str = "eu-west-1", -): - """ - Create a table inside the database from create database - - Args: - sql_query (str): - The SQL table you want to create a temp table out of. Should be a table that starts with a WITH or SELECT clause. - - table_name (str): - The name of the temp table you wish to create - - force_ec2 (bool, optional): - Boolean specifying if the user wants to force boto to get the - credentials from the EC2. This is for dbtools which is the R wrapper that - calls this package via reticulate and requires credentials to be refreshed - via the EC2 instance (and therefore sets this to True) - this is not - necessary when using this in Python. Default is False. - - region_name (str, optional): - Name of the AWS region you want to run queries on. Defaults to "eu-west-1". 
- """ - - check_sql(sql_query) - - # Create named stuff - user_id, out_path = get_user_id_and_table_dir(force_ec2, region_name) - db_path = os.path.join(out_path, "__athena_temp_db__/") - table_path = os.path.join(db_path, table_name) - temp_db_name = get_database_name_from_userid(user_id) - - create_db_query = f"CREATE DATABASE IF NOT EXISTS {temp_db_name}" - - _ = get_athena_query_response( - sql_query=create_db_query, - timeout=None, - force_ec2=force_ec2, - region_name=region_name, - ) - - # Clear out table every time - delete_s3_folder_contents(table_path) - - drop_table_query = f"DROP TABLE IF EXISTS {temp_db_name}.{table_name}" - - _ = get_athena_query_response( - sql_query=drop_table_query, - timeout=None, - force_ec2=force_ec2, - region_name=region_name, - ) - - print( - f"Creating table {table_name}. To access table contents query __temp__.{table_name}" - ) - ctas_query = f""" - CREATE TABLE {temp_db_name}.{table_name} - WITH ( - format = 'Parquet', - parquet_compression = 'SNAPPY', - external_location = '{table_path}' - ) - as {sql_query} - """ - - _ = get_athena_query_response( - sql_query=ctas_query, - timeout=timeout, - force_ec2=force_ec2, - region_name=region_name, - ) diff --git a/pydbtools/get_athena_query_response.py b/pydbtools/get_athena_query_response.py index b76ec6d..74f115f 100644 --- a/pydbtools/get_athena_query_response.py +++ b/pydbtools/get_athena_query_response.py @@ -1,6 +1,7 @@ import boto3 import time import os +import warnings from pydbtools.utils import ( _athena_meta_conversions, @@ -10,7 +11,6 @@ replace_temp_database_name_reference, ) - def get_athena_query_response( sql_query: str, return_athena_types: bool = False, @@ -19,6 +19,8 @@ def get_athena_query_response( region_name: str = "eu-west-1", ): """ + [DEPRECATED] See start_query_execution_and_wait. + Runs an SQL query against our Athena database and returns a tuple. 
The first argument is an S3 path to the resulting output and the second is a dictionary of meta data for the table. @@ -30,9 +32,10 @@ def get_athena_query_response( meta data should be named after the athena types (True) or as our agnostic meta data types used in etl_manager (False). Default is False. - timeout: Integer specifying the number of seconds to wait before giving up + timeout: . Will raise warining if not None. + Integer specifying the number of seconds to wait before giving up on the Athena query. If set to None (default) the query will wait - indefinitely. + indefinitely. . force_ec2: Boolean specifying if the user wants to force boto to get the credentials from the EC2. This is for dbtools which is the R wrapper that @@ -40,8 +43,18 @@ def get_athena_query_response( via the EC2 instance (and therefore sets this to True) - this is not necessary when using this in Python. Default is False. """ + usage_warning = ( + "This function is deprecated and will be removed " + "in future releases. Please use the " + "start_query_execution_and_wait function instead." 
+ ) - user_id, out_path = get_user_id_and_table_dir(force_ec2, region_name) + warnings.warn(usage_warning) + user_id, out_path = get_user_id_and_table_dir( + boto3_session=None, + force_ec2=force_ec2, + region_name=region_name + ) temp_db_name = get_database_name_from_userid(user_id) sql_query = replace_temp_database_name_reference(sql_query, temp_db_name) @@ -91,6 +104,7 @@ def get_athena_query_response( MaxResults=1, ) s3_path = athena_status["QueryExecution"]["ResultConfiguration"]["OutputLocation"] + if return_athena_types: meta = [ {"name": c["Name"], "type": c["Type"]} diff --git a/pydbtools/read_sql.py b/pydbtools/read_sql.py index 2b92bf1..6b3c1be 100644 --- a/pydbtools/read_sql.py +++ b/pydbtools/read_sql.py @@ -1,16 +1,23 @@ import pandas as pd +from functools import wraps +import awswrangler as wr # setting s3fs cache to false is to try an fix this Access Denied ListObjectsV2 issue # see below https://github.com/pandas-dev/pandas/issues/27528 from pydbtools.get_athena_query_response import get_athena_query_response -from pydbtools.utils import _pd_dtype_dict_from_metadata, get_file -from gluejobutils.s3 import delete_s3_object - +from pydbtools.utils import ( + _pd_dtype_dict_from_metadata, + get_file, + get_boto_session, + get_default_args +) def read_sql( sql_query, timeout=None, convert_dates=True, cols_as_str=False, *args, **kwargs ): """ + [DEPRECIATED] see read_sql_query. + Takes an athena SQL query and returns a pandas dataframe. The Athena query will write the resulting output into a CSV or txt file depending on the type of query. In both instances these will be read into pandas using pandas.read_csv. You can @@ -29,6 +36,14 @@ def read_sql( table in Athena. Default is False. """ # Run the SQL query + + usage_warning = ( + "This function is deprecated and will be removed " + "in future releases. Please use the " + "read_sql_query function instead." 
+ ) + warnings.warn(usage_warning) + response = get_athena_query_response( sql_query=sql_query, return_athena_types=True, timeout=timeout ) @@ -53,7 +68,10 @@ def read_sql( df = pd.read_csv(f, dtype=dtype, parse_dates=parse_dates, *args, **kwargs) # Delete both the SQL query and the meta data - delete_s3_object(response["s3_path"]) - delete_s3_object(response["s3_path"] + ".metadata") + to_del = [ + response["s3_path"], + response["s3_path"] + ".metadata" + ] + wr.s3.delete_objects(to_del) return df diff --git a/pydbtools/utils.py b/pydbtools/utils.py index c94eb81..ae8b3b1 100644 --- a/pydbtools/utils.py +++ b/pydbtools/utils.py @@ -1,13 +1,10 @@ from typing import Tuple - import numpy as np - -from gluejobutils.s3 import s3_path_to_bucket_key, check_for_s3_file import os import re import sqlparse from s3fs import S3FileSystem - +import inspect import boto3 from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher @@ -24,52 +21,61 @@ temp_database_name_prefix = "mojap_de_temp_" -def check_temp_query(sql_query: str): +def get_default_args(func): + signature = inspect.signature(func) + return { + k: v.default + for k, v in signature.parameters.items() + if v.default is not inspect.Parameter.empty + } + + +def check_temp_query(sql: str): """ Checks if a query to a temporary table has had __temp__ wrapped in quote marks. 
Args: - sql_query (str): an SQL query + sql (str): an SQL query Raises: ValueError """ - if re.findall(r'["|\']__temp__["|\']\.', sql_query.lower()): + if re.findall(r'["|\']__temp__["|\']\.', sql.lower()): raise ValueError( "When querying a temporary database, __temp__ should not be wrapped in quotes" ) -def clean_query(sql_query: str) -> str: +def clean_query(sql: str) -> str: """ removes trailing whitespace, newlines and final - semicolon from sql_query for use with + semicolon from sql for use with sqlparse package Args: - sql_query (str): The raw SQL query + sql (str): The raw SQL query Returns: str: The cleaned SQL query """ - return " ".join(sql_query.splitlines()).strip().rstrip(";") + return " ".join(sql.splitlines()).strip().rstrip(";") -def replace_temp_database_name_reference(sql_query: str, database_name: str) -> str: +def replace_temp_database_name_reference(sql: str, database_name: str) -> str: """ Replaces references to the user's temp database __temp__ with the database_name string provided. 
Args: - sql_query (str): The raw SQL query as a string + sql (str): The raw SQL query as a string database_name (str): The database name to replace __temp__ Returns: str: The new SQL query which is sent to Athena """ # check query is valid and clean - parsed = sqlparse.parse(clean_query(sql_query)) + parsed = sqlparse.parse(clean_query(sql)) new_query = [] for query in parsed: check_temp_query(str(query)) @@ -87,10 +93,18 @@ def replace_temp_database_name_reference(sql_query: str, database_name: str) -> def get_user_id_and_table_dir( - force_ec2: bool = False, region_name: str = "eu-west-1" + boto3_session = None, + force_ec2: bool = False, + region_name: str = "eu-west-1" ) -> Tuple[str, str]: - sts_client = get_boto_client("sts", force_ec2=force_ec2, region_name=region_name) + if boto3_session is None: + boto3_session = get_boto_session( + force_ec2=force_ec2, + region_name=region_name + ) + + sts_client = boto3_session.client("sts") sts_resp = sts_client.get_caller_identity() out_path = os.path.join("s3://", bucket, sts_resp["UserId"]) if out_path[-1] != "/": @@ -105,28 +119,36 @@ def get_database_name_from_userid(user_id: str) -> str: return unique_db_name -def get_boto_client( - client_name: str, +def get_boto_session( force_ec2: bool = False, region_name: str = "eu-west-1", ): - + kwargs = { + "region_name": region_name + } if force_ec2: provider = InstanceMetadataProvider( iam_role_fetcher=InstanceMetadataFetcher(timeout=1000, num_attempts=2) ) creds = provider.load().get_frozen_credentials() - client = boto3.client( - client_name, - region_name=region_name, - aws_access_key_id=creds.access_key, - aws_secret_access_key=creds.secret_key, - aws_session_token=creds.token, - ) - else: - client = boto3.client(client_name, region_name=region_name) + kwargs["aws_access_key_id"] = creds.access_key + kwargs["aws_secret_access_key"] = creds.secret_key + kwargs["aws_session_token"] = creds.token + + return boto3.Session(**kwargs) + + +def get_boto_client( + 
client_name: str, + boto3_session = None, + force_ec2: bool = False, + region_name: str = "eu-west-1", +): + + if boto3_session is None: + boto3_session = get_boto_session(force_ec2=force_ec2, region_name=region_name) - return client + return boto3_session.client(client_name) def get_file(s3_path: str, check_exists: bool = True): @@ -136,9 +158,9 @@ def get_file(s3_path: str, check_exists: bool = True): s3_path: path to file in S3 e.g. s3://bucket/object/path.csv check_exists: If True (default) will check for s3 file existance before returning file. """ - b, k = s3_path_to_bucket_key(s3_path) + b, k = s3_path.replace("s3://", "").split("/", 1) if check_exists: - if not check_for_s3_file(s3_path): + if not wr.s3.does_object_exist(s3_path): raise FileNotFoundError(f"File not found in S3. full path: {s3_path}") fs = S3FileSystem() f = fs.open(os.path.join(b, k), "rb") diff --git a/pydbtools/wrangler.py b/pydbtools/wrangler.py new file mode 100644 index 0000000..6eba32a --- /dev/null +++ b/pydbtools/wrangler.py @@ -0,0 +1,230 @@ +import awswrangler as wr +import awswrangler.athena as ath +import os +import sqlparse +import warnings + +import inspect +import functools + +from pydbtools.utils import ( + temp_database_name_prefix, + get_user_id_and_table_dir, + get_database_name_from_userid, + clean_query, + get_default_args, + get_boto_session, + replace_temp_database_name_reference, +) + +# Wrapper used to set parameters in the athena wrangler functions +# before they are called +def init_athena_params(func): + """ + Takes a wrangler athena function and sets the following: + boto3_session and s3_output_path if exists in function param. + + Args: + func (Callable): An function from wr.athena that requires + boto3_session. If the func has an s3_output this is also + standardised. + + Returns: + Similar function call but with pre-defined params. 
+ """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Get the boto3 session + setup_defaults = get_default_args(get_boto_session) + setup_kwargs = {} + for k, v in setup_defaults.items(): + setup_kwargs[k] = kwargs.pop(k, v) + boto3_session = get_boto_session(**setup_kwargs) + + # Get parameters from function and overwrite specific params + sig = inspect.signature(func) + argmap = sig.bind_partial(*args, **kwargs).arguments + + if "boto3_session" in argmap: + warn_msg = ( + "Warning parameter 'boto3_session' cannot be set. " + "Is defined by setting 'force_ec2' and 'region' params. " + "(Input boto3_session will be ignored)." + ) + warnings.warn(warn_msg) + argmap["boto3_session"] = boto3_session + + # Check SQL for __temp__ keyword references and set s3 table path + if ("s3_output" in argmap) or ("sql" in argmap): + user_id, s3_output = get_user_id_and_table_dir(boto3_session) + + # Set s3_output to predefined path otherwise skip + if "s3_output" in sig.parameters: + if "s3_output" in argmap: + warn_msg = ( + "Warning parameter 's3_output' cannot be set. " + "Is automatically generated (input ignored)." 
+ ) + warnings.warn(warn_msg) + + # Set s3 to default s3 path + argmap["s3_output"] = s3_output + + # Fix sql before it is passed to athena + if "sql" in argmap: + temp_db_name = get_database_name_from_userid(user_id) + argmap["sql"] = replace_temp_database_name_reference( + argmap["sql"], + temp_db_name + ) + + # Set database to None when not needed + if ( + "database" in sig.parameters and + (argmap.get("database", "__temp__").lower() == "__temp__") + ): + if ( + "ctas_approach" in sig.parameters and + argmap.get("ctas_approach", True) + ): + argmap["database"] = temp_db_name + else: + argmap["database"] = None + + return func(**argmap) + return wrapper + + +# Override all existing awswrangler.athena functions for pydbtools +read_sql_query = init_athena_params(ath.read_sql_query) +read_sql_table = init_athena_params(ath.read_sql_table) +create_athena_bucket = init_athena_params(ath.create_athena_bucket) +describe_table = init_athena_params(ath.describe_table) +get_query_columns_types = init_athena_params(ath.get_query_columns_types) +get_query_execution = init_athena_params(ath.get_query_execution) +get_work_group = init_athena_params(ath.get_work_group) +repair_table = init_athena_params(ath.repair_table) +show_create_table = init_athena_params(ath.show_create_table) +start_query_execution = init_athena_params(ath.start_query_execution) +stop_query_execution = init_athena_params(ath.stop_query_execution) +wait_query = init_athena_params(ath.wait_query) + + +def start_query_execution_and_wait(sql, *args, **kwargs): + """Calls start_query_execution followed by wait_query. + *args and **kwargs are passed to start_query_execution + + Args: + sql (str): An SQL string. Which works with __TEMP__ references. 
+ """ + + query_execution_id = start_query_execution(sql, *args, **kwargs) + return wait_query(query_execution_id) + + +def check_sql(sql: str): + """ + Validates sql to confirm it is a select statement + """ + parsed = sqlparse.parse(clean_query(sql)) + i = 0 + for p in parsed: + if p.get_type() != "SELECT" or i > 0: + raise ValueError("The sql statement must be a single select query") + i += 1 + + +def create_temp_database( + temp_db_name:str = None, + force_ec2: bool = False, + region_name: str = "eu-west-1" +): + if temp_db_name is None: + user_id, _ = get_user_id_and_table_dir( + boto3_session=None, + force_ec2=force_ec2, + region_name=region_name + ) + temp_db_name = get_database_name_from_userid(user_id) + + create_db_query = f"CREATE DATABASE IF NOT EXISTS {temp_db_name}" + + return start_query_execution_and_wait( + sql=create_db_query, + force_ec2=force_ec2, + region_name=region_name, + ) + + +def create_temp_table( + sql: str, + table_name: str, + force_ec2: bool = False, + region_name: str = "eu-west-1", +): + """ + Create a table inside the database from create database + + Args: + sql (str): + The SQL table you want to create a temp table out of. Should be a table that starts with a WITH or SELECT clause. + + table_name (str): + The name of the temp table you wish to create + + force_ec2 (bool, optional): + Boolean specifying if the user wants to force boto to get the + credentials from the EC2. This is for dbtools which is the R wrapper that + calls this package via reticulate and requires credentials to be refreshed + via the EC2 instance (and therefore sets this to True) - this is not + necessary when using this in Python. Default is False. + + region_name (str, optional): + Name of the AWS region you want to run queries on. Defaults to "eu-west-1". 
+ """ + + check_sql(sql) + + # Create named stuff + user_id, out_path = get_user_id_and_table_dir( + boto3_session=None, + force_ec2=force_ec2, + region_name=region_name + ) + db_path = os.path.join(out_path, "__athena_temp_db__/") + table_path = os.path.join(db_path, table_name) + temp_db_name = get_database_name_from_userid(user_id) + + _ = create_temp_database( + temp_db_name, + force_ec2=force_ec2, + region_name=region_name + ) + + # Clear out table every time + wr.s3.delete_objects(table_path) + + drop_table_query = f"DROP TABLE IF EXISTS {temp_db_name}.{table_name}" + + _ = start_query_execution_and_wait( + sql=drop_table_query, + force_ec2=force_ec2, + region_name=region_name, + ) + + ctas_query = f""" + CREATE TABLE {temp_db_name}.{table_name} + WITH ( + format = 'Parquet', + parquet_compression = 'SNAPPY', + external_location = '{table_path}' + ) + as {sql} + """ + + _ = start_query_execution_and_wait( + sql=ctas_query, + force_ec2=force_ec2, + region_name=region_name, + ) diff --git a/pyproject.toml b/pyproject.toml index 2707c38..ca5e84a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,12 +10,10 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.6" boto3 = ">=1.7.4" -gluejobutils = ">=1.0.0" numpy = ">=1.16.1" pandas = ">=0.23.4" s3fs = ">=0.2.2,<0.5.0" sqlparse = "^0.3.1" -dataengineeringutils3 = "^1.1" poetry_version = ">=0.1.5" [tool.poetry.dev-dependencies] From a7c3825f9b5c14cb1ef762811d19ec7516ee04f0 Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Mon, 18 Jan 2021 10:02:05 +0000 Subject: [PATCH 02/15] Now only calls wrapper function once Each function in the wrangler module now only creates the boto3_session once per call. Also now instictively creates a temp table if ctas_approach is set to true. 
--- pydbtools/get_athena_query_response.py | 5 +- pydbtools/read_sql.py | 8 +- pydbtools/utils.py | 19 ++-- pydbtools/wrangler.py | 122 ++++++++++++------------- tests/test_tests.py | 2 +- 5 files changed, 73 insertions(+), 83 deletions(-) diff --git a/pydbtools/get_athena_query_response.py b/pydbtools/get_athena_query_response.py index 74f115f..66e0c53 100644 --- a/pydbtools/get_athena_query_response.py +++ b/pydbtools/get_athena_query_response.py @@ -11,6 +11,7 @@ replace_temp_database_name_reference, ) + def get_athena_query_response( sql_query: str, return_athena_types: bool = False, @@ -51,9 +52,7 @@ def get_athena_query_response( warnings.warn(usage_warning) user_id, out_path = get_user_id_and_table_dir( - boto3_session=None, - force_ec2=force_ec2, - region_name=region_name + boto3_session=None, force_ec2=force_ec2, region_name=region_name ) temp_db_name = get_database_name_from_userid(user_id) diff --git a/pydbtools/read_sql.py b/pydbtools/read_sql.py index 6b3c1be..590eb5b 100644 --- a/pydbtools/read_sql.py +++ b/pydbtools/read_sql.py @@ -9,9 +9,10 @@ _pd_dtype_dict_from_metadata, get_file, get_boto_session, - get_default_args + get_default_args, ) + def read_sql( sql_query, timeout=None, convert_dates=True, cols_as_str=False, *args, **kwargs ): @@ -68,10 +69,7 @@ def read_sql( df = pd.read_csv(f, dtype=dtype, parse_dates=parse_dates, *args, **kwargs) # Delete both the SQL query and the meta data - to_del = [ - response["s3_path"], - response["s3_path"] + ".metadata" - ] + to_del = [response["s3_path"], response["s3_path"] + ".metadata"] wr.s3.delete_objects(to_del) return df diff --git a/pydbtools/utils.py b/pydbtools/utils.py index ae8b3b1..c4a386f 100644 --- a/pydbtools/utils.py +++ b/pydbtools/utils.py @@ -93,16 +93,11 @@ def replace_temp_database_name_reference(sql: str, database_name: str) -> str: def get_user_id_and_table_dir( - boto3_session = None, - force_ec2: bool = False, - region_name: str = "eu-west-1" + boto3_session=None, force_ec2: bool = 
False, region_name: str = "eu-west-1" ) -> Tuple[str, str]: if boto3_session is None: - boto3_session = get_boto_session( - force_ec2=force_ec2, - region_name=region_name - ) + boto3_session = get_boto_session(force_ec2=force_ec2, region_name=region_name) sts_client = boto3_session.client("sts") sts_resp = sts_client.get_caller_identity() @@ -120,12 +115,10 @@ def get_database_name_from_userid(user_id: str) -> str: def get_boto_session( - force_ec2: bool = False, - region_name: str = "eu-west-1", + force_ec2: bool = False, region_name: str = "eu-west-1", ): - kwargs = { - "region_name": region_name - } + print("Get boto session called") + kwargs = {"region_name": region_name} if force_ec2: provider = InstanceMetadataProvider( iam_role_fetcher=InstanceMetadataFetcher(timeout=1000, num_attempts=2) @@ -140,7 +133,7 @@ def get_boto_session( def get_boto_client( client_name: str, - boto3_session = None, + boto3_session=None, force_ec2: bool = False, region_name: str = "eu-west-1", ): diff --git a/pydbtools/wrangler.py b/pydbtools/wrangler.py index 6eba32a..4fec45d 100644 --- a/pydbtools/wrangler.py +++ b/pydbtools/wrangler.py @@ -19,7 +19,7 @@ # Wrapper used to set parameters in the athena wrangler functions # before they are called -def init_athena_params(func): +def init_athena_params(func=None, *, allow_boto3_session=False): """ Takes a wrangler athena function and sets the following: boto3_session and s3_output_path if exists in function param. @@ -32,33 +32,44 @@ def init_athena_params(func): Returns: Similar function call but with pre-defined params. 
""" + # Allows parameterisation of this wrapper fun + if func is None: + return functools.partial( + init_athena_params, allow_boto3_session=allow_boto3_session + ) @functools.wraps(func) def wrapper(*args, **kwargs): - # Get the boto3 session - setup_defaults = get_default_args(get_boto_session) - setup_kwargs = {} - for k, v in setup_defaults.items(): - setup_kwargs[k] = kwargs.pop(k, v) - boto3_session = get_boto_session(**setup_kwargs) - # Get parameters from function and overwrite specific params sig = inspect.signature(func) argmap = sig.bind_partial(*args, **kwargs).arguments - if "boto3_session" in argmap: - warn_msg = ( - "Warning parameter 'boto3_session' cannot be set. " - "Is defined by setting 'force_ec2' and 'region' params. " - "(Input boto3_session will be ignored)." - ) - warnings.warn(warn_msg) - argmap["boto3_session"] = boto3_session + # If wrapper allows boto3 session being defined by user + # and it has been then do not create new boto3 session + # otherwise do + if allow_boto3_session and argmap.get("boto3_session"): + pass + else: + # Get the boto3 session + setup_defaults = get_default_args(get_boto_session) + setup_kwargs = {} + for k, v in setup_defaults.items(): + setup_kwargs[k] = kwargs.pop(k, v) + boto3_session = get_boto_session(**setup_kwargs) + + if "boto3_session" in argmap: + warn_msg = ( + "Warning parameter 'boto3_session' cannot be set. " + "Is defined by setting 'force_ec2' and 'region' params. " + "(Input boto3_session will be ignored)." + ) + warnings.warn(warn_msg) + argmap["boto3_session"] = boto3_session # Check SQL for __temp__ keyword references and set s3 table path if ("s3_output" in argmap) or ("sql" in argmap): user_id, s3_output = get_user_id_and_table_dir(boto3_session) - + # Set s3_output to predefined path otherwise skip if "s3_output" in sig.parameters: if "s3_output" in argmap: @@ -67,7 +78,7 @@ def wrapper(*args, **kwargs): "Is automatically generated (input ignored)." 
) warnings.warn(warn_msg) - + # Set s3 to default s3 path argmap["s3_output"] = s3_output @@ -75,24 +86,22 @@ def wrapper(*args, **kwargs): if "sql" in argmap: temp_db_name = get_database_name_from_userid(user_id) argmap["sql"] = replace_temp_database_name_reference( - argmap["sql"], - temp_db_name + argmap["sql"], temp_db_name ) # Set database to None when not needed - if ( - "database" in sig.parameters and - (argmap.get("database", "__temp__").lower() == "__temp__") + if "database" in sig.parameters and ( + argmap.get("database", "__temp__").lower() == "__temp__" ): - if ( - "ctas_approach" in sig.parameters and - argmap.get("ctas_approach", True) - ): + if "ctas_approach" in sig.parameters and argmap.get("ctas_approach", True): argmap["database"] = temp_db_name + create_temp_database(temp_db_name, boto3_session=boto3_session) + else: argmap["database"] = None return func(**argmap) + return wrapper @@ -111,6 +120,7 @@ def wrapper(*args, **kwargs): wait_query = init_athena_params(ath.wait_query) +@init_athena_params def start_query_execution_and_wait(sql, *args, **kwargs): """Calls start_query_execution followed by wait_query. *args and **kwargs are passed to start_query_execution @@ -119,8 +129,11 @@ def start_query_execution_and_wait(sql, *args, **kwargs): sql (str): An SQL string. Which works with __TEMP__ references. 
""" - query_execution_id = start_query_execution(sql, *args, **kwargs) - return wait_query(query_execution_id) + # Function wrapper is applied to top of function so we need + # to call the original unwrapped athena fun to ensure the wrapper fun + # is not called again + query_execution_id = ath.start_query_execution(sql, *args, **kwargs) + return ath.wait_query(query_execution_id, boto3_session=kwargs.get("boto3_session")) def check_sql(sql: str): @@ -135,31 +148,30 @@ def check_sql(sql: str): i += 1 +@init_athena_params(allow_boto3_session=True) def create_temp_database( - temp_db_name:str = None, + temp_db_name: str = None, + boto3_session=None, force_ec2: bool = False, - region_name: str = "eu-west-1" + region_name: str = "eu-west-1", ): - if temp_db_name is None: + if temp_db_name is None or temp_db_name.lower().strip() == "__temp__": user_id, _ = get_user_id_and_table_dir( - boto3_session=None, - force_ec2=force_ec2, - region_name=region_name + boto3_session=boto3_session, force_ec2=force_ec2, region_name=region_name ) temp_db_name = get_database_name_from_userid(user_id) create_db_query = f"CREATE DATABASE IF NOT EXISTS {temp_db_name}" - return start_query_execution_and_wait( - sql=create_db_query, - force_ec2=force_ec2, - region_name=region_name, - ) + q_e_id = ath.start_query_execution(create_db_query, boto3_session=boto3_session) + return ath.wait_query(q_e_id, boto3_session=boto3_session) +@init_athena_params def create_temp_table( sql: str, table_name: str, + boto3_session=None, force_ec2: bool = False, region_name: str = "eu-west-1", ): @@ -187,31 +199,21 @@ def create_temp_table( check_sql(sql) # Create named stuff - user_id, out_path = get_user_id_and_table_dir( - boto3_session=None, - force_ec2=force_ec2, - region_name=region_name - ) + user_id, out_path = get_user_id_and_table_dir(boto3_session=boto3_session) db_path = os.path.join(out_path, "__athena_temp_db__/") table_path = os.path.join(db_path, table_name) temp_db_name = 
get_database_name_from_userid(user_id) - _ = create_temp_database( - temp_db_name, - force_ec2=force_ec2, - region_name=region_name - ) + _ = create_temp_database(temp_db_name, boto3_session) # Clear out table every time - wr.s3.delete_objects(table_path) + wr.s3.delete_objects(table_path, boto3_session=boto3_session) drop_table_query = f"DROP TABLE IF EXISTS {temp_db_name}.{table_name}" - _ = start_query_execution_and_wait( - sql=drop_table_query, - force_ec2=force_ec2, - region_name=region_name, - ) + q_e_id = ath.start_query_execution(drop_table_query, boto3_session=boto3_session) + + _ = ath.wait_query(q_e_id, boto3_session=boto3_session) ctas_query = f""" CREATE TABLE {temp_db_name}.{table_name} @@ -223,8 +225,6 @@ def create_temp_table( as {sql} """ - _ = start_query_execution_and_wait( - sql=ctas_query, - force_ec2=force_ec2, - region_name=region_name, - ) + q_e_id = ath.start_query_execution(ctas_query, boto3_session=boto3_session) + + return ath.wait_query(q_e_id, boto3_session=boto3_session) diff --git a/tests/test_tests.py b/tests/test_tests.py index ce281d9..869e0f5 100644 --- a/tests/test_tests.py +++ b/tests/test_tests.py @@ -32,7 +32,7 @@ def test_read_sql_output(self): "boolean_col", "float_col", "double_col", - "decimal_col" + "decimal_col", ] self.assertTrue(col_test) From a6a0315aeb060c8e91ee0f13618638cb0d9597b8 Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Tue, 19 Jan 2021 16:21:00 +0000 Subject: [PATCH 03/15] fixing typo@ @ --- pydbtools/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydbtools/utils.py b/pydbtools/utils.py index c4a386f..5e4de16 100644 --- a/pydbtools/utils.py +++ b/pydbtools/utils.py @@ -117,7 +117,7 @@ def get_database_name_from_userid(user_id: str) -> str: def get_boto_session( force_ec2: bool = False, region_name: str = "eu-west-1", ): - print("Get boto session called") + kwargs = {"region_name": region_name} if force_ec2: provider = InstanceMetadataProvider( From 
5751cedac1c86beae434a213d9fb43e197ca22fb Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Tue, 19 Jan 2021 17:25:40 +0000 Subject: [PATCH 04/15] imports and installs --- pydbtools/read_sql.py | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/pydbtools/read_sql.py b/pydbtools/read_sql.py index 590eb5b..15c89ee 100644 --- a/pydbtools/read_sql.py +++ b/pydbtools/read_sql.py @@ -1,4 +1,5 @@ import pandas as pd +import warnings from functools import wraps import awswrangler as wr diff --git a/pyproject.toml b/pyproject.toml index ca5e84a..3497202 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ numpy = ">=1.16.1" pandas = ">=0.23.4" s3fs = ">=0.2.2,<0.5.0" sqlparse = "^0.3.1" +awswrangler = "^2.3.0" poetry_version = ">=0.1.5" [tool.poetry.dev-dependencies] From b22487f60d4d944bd73628ec1ff4bf113495f535 Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Tue, 19 Jan 2021 17:31:53 +0000 Subject: [PATCH 05/15] kwargs --- pydbtools/get_athena_query_response.py | 2 +- pydbtools/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydbtools/get_athena_query_response.py b/pydbtools/get_athena_query_response.py index 66e0c53..57b1a8a 100644 --- a/pydbtools/get_athena_query_response.py +++ b/pydbtools/get_athena_query_response.py @@ -59,7 +59,7 @@ def get_athena_query_response( sql_query = replace_temp_database_name_reference(sql_query, temp_db_name) out_path = os.path.join(out_path, "__athena_query_dump__/") - athena_client = get_boto_client("athena", force_ec2, region_name) + athena_client = get_boto_client(client_name="athena", force_ec2=force_ec2, region_name=region_name) # Run the athena query response = athena_client.start_query_execution( diff --git a/pydbtools/utils.py b/pydbtools/utils.py index 5e4de16..d14311b 100644 --- a/pydbtools/utils.py +++ b/pydbtools/utils.py @@ -149,7 +149,7 @@ def get_file(s3_path: str, check_exists: bool = True): Returns an file using s3fs without caching objects (workaround for 
issue #10). s3_path: path to file in S3 e.g. s3://bucket/object/path.csv - check_exists: If True (default) will check for s3 file existance before returning file. + check_exists: If True (default) will check for s3 file existence before returning file. """ b, k = s3_path.replace("s3://", "").split("/", 1) if check_exists: From a7b9c3dc473f47ffa9d75c7c258be7e759bba121 Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Tue, 19 Jan 2021 17:33:32 +0000 Subject: [PATCH 06/15] import wrangler --- pydbtools/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydbtools/utils.py b/pydbtools/utils.py index d14311b..868fd37 100644 --- a/pydbtools/utils.py +++ b/pydbtools/utils.py @@ -7,6 +7,7 @@ import inspect import boto3 from botocore.credentials import InstanceMetadataProvider, InstanceMetadataFetcher +import awswrangler as wr # pydbtools will create a new a new S3 object (then delete it post read). In the first call read # the cache is empty but then filled. If pydbtools is called again the cache is referenced and From 14c4bb3dbf8140f28dc5954d26fc2f2eb77c1e8d Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Tue, 19 Jan 2021 17:39:52 +0000 Subject: [PATCH 07/15] no return --- pydbtools/wrangler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydbtools/wrangler.py b/pydbtools/wrangler.py index 4fec45d..bb6c1ad 100644 --- a/pydbtools/wrangler.py +++ b/pydbtools/wrangler.py @@ -227,4 +227,4 @@ def create_temp_table( q_e_id = ath.start_query_execution(ctas_query, boto3_session=boto3_session) - return ath.wait_query(q_e_id, boto3_session=boto3_session) + ath.wait_query(q_e_id, boto3_session=boto3_session) From b1af245cce2254e4fc1c4ac54543a6678b1d1e6b Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Tue, 19 Jan 2021 17:53:30 +0000 Subject: [PATCH 08/15] python version changes --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3497202..1fe1a57 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -8,13 +8,13 @@ authors = ["Karik Isichei "] readme = "README.md" [tool.poetry.dependencies] -python = "^3.6" +python = ">=3.6.1, <3.9" boto3 = ">=1.7.4" numpy = ">=1.16.1" pandas = ">=0.23.4" s3fs = ">=0.2.2,<0.5.0" sqlparse = "^0.3.1" -awswrangler = "^2.3.0" +awswrangler = ">=2.3.0" poetry_version = ">=0.1.5" [tool.poetry.dev-dependencies] From 7b7aaaeb19c97b989c464e14210f173708ae550c Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Wed, 20 Jan 2021 10:24:58 +0000 Subject: [PATCH 09/15] locking changes --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1fe1a57..551e727 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,13 +8,13 @@ authors = ["Karik Isichei "] readme = "README.md" [tool.poetry.dependencies] -python = ">=3.6.1, <3.9" +python = ">=3.7, <3.9" boto3 = ">=1.7.4" numpy = ">=1.16.1" -pandas = ">=0.23.4" +pandas = "=^1.2" s3fs = ">=0.2.2,<0.5.0" sqlparse = "^0.3.1" -awswrangler = ">=2.3.0" +awswrangler = "^2.3.0" poetry_version = ">=0.1.5" [tool.poetry.dev-dependencies] From ee08034ba8d7bba0fbdca44a896f517f9732c65a Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Wed, 20 Jan 2021 10:25:54 +0000 Subject: [PATCH 10/15] do we really need to lint with more than one python version? 
--- .github/workflows/lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 962a88d..9b92ef9 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -7,7 +7,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.8] steps: - uses: actions/checkout@v2 From d5ac9ab72904983ccb0a831bdfbb778862f7f329 Mon Sep 17 00:00:00 2001 From: GeorgeKelly Date: Wed, 20 Jan 2021 10:46:05 +0000 Subject: [PATCH 11/15] import from different place --- tests/test_sql_parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sql_parse.py b/tests/test_sql_parse.py index 66b620c..f324197 100644 --- a/tests/test_sql_parse.py +++ b/tests/test_sql_parse.py @@ -1,5 +1,5 @@ import pytest -from pydbtools.create_temp_table import check_sql +from pydbtools.wrangler import check_sql from pydbtools.utils import replace_temp_database_name_reference, check_temp_query sql1 = """ From ccd06adc3ed4c5c5413816878ce35e63f190436b Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Wed, 20 Jan 2021 18:02:03 +0000 Subject: [PATCH 12/15] Adding tests (mainly around init_athena wrapper) - currently failing but making progress. --- pydbtools/wrangler.py | 28 +++++++-- tests/test_wrangler.py | 125 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 tests/test_wrangler.py diff --git a/pydbtools/wrangler.py b/pydbtools/wrangler.py index bb6c1ad..d10b6da 100644 --- a/pydbtools/wrangler.py +++ b/pydbtools/wrangler.py @@ -57,7 +57,7 @@ def wrapper(*args, **kwargs): setup_kwargs[k] = kwargs.pop(k, v) boto3_session = get_boto_session(**setup_kwargs) - if "boto3_session" in argmap: + if argmap.get("boto3_session") is not None: warn_msg = ( "Warning parameter 'boto3_session' cannot be set. " "Is defined by setting 'force_ec2' and 'region' params. 
" @@ -72,7 +72,7 @@ def wrapper(*args, **kwargs): # Set s3_output to predefined path otherwise skip if "s3_output" in sig.parameters: - if "s3_output" in argmap: + if argmap.get("s3_output") is not None: warn_msg = ( "Warning parameter 's3_output' cannot be set. " "Is automatically generated (input ignored)." @@ -95,7 +95,7 @@ def wrapper(*args, **kwargs): ): if "ctas_approach" in sig.parameters and argmap.get("ctas_approach", True): argmap["database"] = temp_db_name - create_temp_database(temp_db_name, boto3_session=boto3_session) + _create_temp_database(temp_db_name, boto3_session=boto3_session) else: argmap["database"] = None @@ -147,13 +147,33 @@ def check_sql(sql: str): raise ValueError("The sql statement must be a single select query") i += 1 - +# This is not necessary atm but incase future changes are made +# I think it is better to create "public" and "private" method +# where the public function is wrapped by init_athena_params +# this wrapper also calls the private functnio to avoid the wrapper +# calling itself @init_athena_params(allow_boto3_session=True) def create_temp_database( temp_db_name: str = None, boto3_session=None, force_ec2: bool = False, region_name: str = "eu-west-1", +): + out = _create_temp_database( + temp_db_name=temp_db_name, + boto3_session=boto3_session, + force_ec2=force_ec2, + region_name=region_name, + ) + + return out + + +def _create_temp_database( + temp_db_name: str = None, + boto3_session=None, + force_ec2: bool = False, + region_name: str = "eu-west-1", ): if temp_db_name is None or temp_db_name.lower().strip() == "__temp__": user_id, _ = get_user_id_and_table_dir( diff --git a/tests/test_wrangler.py b/tests/test_wrangler.py new file mode 100644 index 0000000..eccb0be --- /dev/null +++ b/tests/test_wrangler.py @@ -0,0 +1,125 @@ +import os + +import boto3 +from moto import mock_s3 +import pytest + +from pydbtools.wrangler import ( + init_athena_params, + get_boto_session, +) + +def mock_get_user_id_and_table_dir( + 
boto3_session=None, force_ec2: bool = False, region_name: str = "eu-west-1" +): + if ( + isinstance(boto3_session, dict) and + "events" in boto3_session + ): + boto3_session["events"].append("get_user_id_and_table_dir") + return ("user_pytest", "s3://dummy/path/") + + +def mock_create_temp_database( + temp_db_name: str = None, + boto3_session=None, + force_ec2: bool = False, + region_name: str = "eu-west-1", +): + if ( + isinstance(boto3_session, dict) and + "events" in boto3_session + ): + boto3_session["events"].append("_create_temp_database") + return boto3_session + + +def get_empty_boto_log(*args, **kwargs): + return {"name": "boto3_session", "events": []} + +@init_athena_params +def fun_boto_only( + boto3_session=None, + *args, + **kwargs +): + return locals() + + +@init_athena_params +def fun_with_sql( + sql="", + boto3_session=None, + *args, + **kwargs +): + return locals() + +@init_athena_params +def fun_with_sql_and_db_s3( + sql="", + boto3_session=None, + database=None, + s3_output=None, + **kwargs +): + return locals() + +@init_athena_params +def fun_with_db_s3( + boto3_session=None, + database=None, + s3_output=None, + **kwargs +): + return locals() + + +# "sql, boto3_session, kwargs, expected_events, expect_warns", +@pytest.mark.parametrize( +"fun_params, fun, expected_events, expect_warns", +[ + ({}, fun_boto_only, [], False), + ({"boto3_session": "boto3_session"}, fun_boto_only, [], True), + ({}, fun_with_sql, ["get_user_id_and_table_dir"], False), + ({"sql": ""}, fun_with_sql, ["get_user_id_and_table_dir"], False), + ({}, fun_with_sql_and_db_s3, ["get_user_id_and_table_dir", "_create_temp_database"], False), + ({"s3_output": "somewhere"}, fun_with_sql_and_db_s3, ["get_user_id_and_table_dir", "_create_temp_database"], True), +], +ids=[ + "fun_boto_only", + "fun_boto_only::boto3", + "fun_with_sql", + "fun_with_sql::sql", + "fun_with_sql_and_db_s3", + "fun_with_sql_and_db_s3::s3_output" +] +) +def test_init_athena_params(fun_params, fun, 
expected_events, expect_warns, monkeypatch): + monkeypatch.setattr( + "pydbtools.wrangler.get_boto_session", + get_empty_boto_log + ) + monkeypatch.setattr( + "pydbtools.wrangler.get_user_id_and_table_dir", + mock_get_user_id_and_table_dir + ) + monkeypatch.setattr( + "pydbtools.wrangler.get_database_name_from_userid", + lambda user_id: "mojap_de_temp_pytest" + ) + monkeypatch.setattr( + "pydbtools.wrangler._create_temp_database", + lambda user_id: mojap_de_temp_pytest + ) + + if expect_warns: + with pytest.warns(UserWarning): + out = fun(**fun_params) + else: + out = fun(**fun_params) + + assert out.get("boto3_session").get("events") == expected_events + + if fun.__name__ == "": + pass \ No newline at end of file From 9a455f53c2077c111a02f6c61a8a3b011399f029 Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Thu, 21 Jan 2021 09:31:40 +0000 Subject: [PATCH 13/15] Added tests for wrapper function and now passing --- pydbtools/get_athena_query_response.py | 4 +- pydbtools/wrangler.py | 33 +++-- tests/test_tests.py | 8 +- tests/test_wrangler.py | 160 ++++++++++++++++--------- 4 files changed, 135 insertions(+), 70 deletions(-) diff --git a/pydbtools/get_athena_query_response.py b/pydbtools/get_athena_query_response.py index 57b1a8a..b32a088 100644 --- a/pydbtools/get_athena_query_response.py +++ b/pydbtools/get_athena_query_response.py @@ -59,7 +59,9 @@ def get_athena_query_response( sql_query = replace_temp_database_name_reference(sql_query, temp_db_name) out_path = os.path.join(out_path, "__athena_query_dump__/") - athena_client = get_boto_client(client_name="athena", force_ec2=force_ec2, region_name=region_name) + athena_client = get_boto_client( + client_name="athena", force_ec2=force_ec2, region_name=region_name + ) # Run the athena query response = athena_client.start_query_execution( diff --git a/pydbtools/wrangler.py b/pydbtools/wrangler.py index d10b6da..f208545 100644 --- a/pydbtools/wrangler.py +++ b/pydbtools/wrangler.py @@ -44,6 +44,11 @@ def 
wrapper(*args, **kwargs): sig = inspect.signature(func) argmap = sig.bind_partial(*args, **kwargs).arguments + # Create a db flag + database_flag = "database" in sig.parameters and ( + argmap.get("database", "__temp__").lower() == "__temp__" + ) + # If wrapper allows boto3 session being defined by user # and it has been then do not create new boto3 session # otherwise do @@ -66,9 +71,14 @@ def wrapper(*args, **kwargs): warnings.warn(warn_msg) argmap["boto3_session"] = boto3_session - # Check SQL for __temp__ keyword references and set s3 table path - if ("s3_output" in argmap) or ("sql" in argmap): + # Set s3 table path and get temp_db_name + if ( + ("s3_output" in sig.parameters) + or ("sql" in sig.parameters) + or database_flag + ): user_id, s3_output = get_user_id_and_table_dir(boto3_session) + temp_db_name = get_database_name_from_userid(user_id) # Set s3_output to predefined path otherwise skip if "s3_output" in sig.parameters: @@ -84,19 +94,19 @@ def wrapper(*args, **kwargs): # Fix sql before it is passed to athena if "sql" in argmap: - temp_db_name = get_database_name_from_userid(user_id) argmap["sql"] = replace_temp_database_name_reference( argmap["sql"], temp_db_name ) - # Set database to None when not needed - if "database" in sig.parameters and ( - argmap.get("database", "__temp__").lower() == "__temp__" - ): - if "ctas_approach" in sig.parameters and argmap.get("ctas_approach", True): + # Set database to None or set to keyword temp when not needed + if database_flag: + if "ctas_approach" in sig.parameters and argmap.get( + "ctas_approach", sig.parameters["ctas_approach"].default + ): argmap["database"] = temp_db_name _create_temp_database(temp_db_name, boto3_session=boto3_session) - + elif argmap.get("database", "").lower() == "__temp__": + argmap["database"] = temp_db_name else: argmap["database"] = None @@ -147,8 +157,9 @@ def check_sql(sql: str): raise ValueError("The sql statement must be a single select query") i += 1 + # This is not necessary atm 
but incase future changes are made -# I think it is better to create "public" and "private" method +#  I think it is better to create "public" and "private" method # where the public function is wrapped by init_athena_params # this wrapper also calls the private functnio to avoid the wrapper # calling itself @@ -165,7 +176,7 @@ def create_temp_database( force_ec2=force_ec2, region_name=region_name, ) - + return out diff --git a/tests/test_tests.py b/tests/test_tests.py index 869e0f5..582c42d 100644 --- a/tests/test_tests.py +++ b/tests/test_tests.py @@ -1,14 +1,20 @@ """ Testing DatabaseMeta, TableMeta """ - +import pytest import unittest import numpy as np import pandas as pd import pydbtools as pydb +skip_msg = ( + "Only testable with access to dbtools database. " + "This test is also testing deprecated code so skipping." +) + +@pytest.mark.skip(reason=skip_msg) class ReadSqlTest(unittest.TestCase): """ Test packages read_sql function works diff --git a/tests/test_wrangler.py b/tests/test_wrangler.py index eccb0be..220fb1d 100644 --- a/tests/test_wrangler.py +++ b/tests/test_wrangler.py @@ -9,13 +9,11 @@ get_boto_session, ) + def mock_get_user_id_and_table_dir( boto3_session=None, force_ec2: bool = False, region_name: str = "eu-west-1" ): - if ( - isinstance(boto3_session, dict) and - "events" in boto3_session - ): + if isinstance(boto3_session, dict) and "events" in boto3_session: boto3_session["events"].append("get_user_id_and_table_dir") return ("user_pytest", "s3://dummy/path/") @@ -26,91 +24,134 @@ def mock_create_temp_database( force_ec2: bool = False, region_name: str = "eu-west-1", ): - if ( - isinstance(boto3_session, dict) and - "events" in boto3_session - ): - boto3_session["events"].append("_create_temp_database") + if isinstance(boto3_session, dict) and "events" in boto3_session: + boto3_session["events"].append(f"_create_temp_database({temp_db_name})") return boto3_session def get_empty_boto_log(*args, **kwargs): return {"name": "boto3_session", 
"events": []} + @init_athena_params -def fun_boto_only( - boto3_session=None, - *args, - **kwargs -): +def fun_boto_only(boto3_session=None, *args, **kwargs): return locals() @init_athena_params -def fun_with_sql( - sql="", - boto3_session=None, - *args, - **kwargs -): +def fun_with_sql(sql=None, boto3_session=None, *args, **kwargs): return locals() + @init_athena_params def fun_with_sql_and_db_s3( - sql="", - boto3_session=None, - database=None, - s3_output=None, - **kwargs + sql=None, boto3_session=None, database=None, s3_output=None, **kwargs ): return locals() + @init_athena_params -def fun_with_db_s3( +def fun_with_sql_db_s3_ctas( + sql=None, boto3_session=None, database=None, s3_output=None, - **kwargs + ctas_approach=False, + **kwargs, ): return locals() # "sql, boto3_session, kwargs, expected_events, expect_warns", @pytest.mark.parametrize( -"fun_params, fun, expected_events, expect_warns", -[ - ({}, fun_boto_only, [], False), - ({"boto3_session": "boto3_session"}, fun_boto_only, [], True), - ({}, fun_with_sql, ["get_user_id_and_table_dir"], False), - ({"sql": ""}, fun_with_sql, ["get_user_id_and_table_dir"], False), - ({}, fun_with_sql_and_db_s3, ["get_user_id_and_table_dir", "_create_temp_database"], False), - ({"s3_output": "somewhere"}, fun_with_sql_and_db_s3, ["get_user_id_and_table_dir", "_create_temp_database"], True), -], -ids=[ - "fun_boto_only", - "fun_boto_only::boto3", - "fun_with_sql", - "fun_with_sql::sql", - "fun_with_sql_and_db_s3", - "fun_with_sql_and_db_s3::s3_output" -] + "fun_params, fun, expected_events, returned_db, expect_warns", + [ + ({}, fun_boto_only, [], None, False), + ({"boto3_session": "boto3_session"}, fun_boto_only, [], None, True), + ({}, fun_with_sql, ["get_user_id_and_table_dir"], None, False), + ({"sql": ""}, fun_with_sql, ["get_user_id_and_table_dir"], None, False), + ({}, fun_with_sql_and_db_s3, ["get_user_id_and_table_dir"], None, False), + ( + {"s3_output": "somewhere"}, + fun_with_sql_and_db_s3, + 
["get_user_id_and_table_dir"], + None, + True, + ), + ( + {"database": "__TEMP__"}, + fun_with_sql_and_db_s3, + ["get_user_id_and_table_dir"], + "mojap_de_temp_pytest", + False, + ), + ( + {"database": "user_defined_db"}, + fun_with_sql_and_db_s3, + ["get_user_id_and_table_dir"], + "user_defined_db", + False, + ), + ( + {"ctas_approach": True}, + fun_with_sql_db_s3_ctas, + [ + "get_user_id_and_table_dir", + "_create_temp_database(mojap_de_temp_pytest)", + ], + "mojap_de_temp_pytest", + False, + ), + ( + {"ctas_approach": True, "database": "user_defined_db"}, + fun_with_sql_db_s3_ctas, + ["get_user_id_and_table_dir"], + "user_defined_db", + False, + ), + ( + {"ctas_approach": False}, + fun_with_sql_db_s3_ctas, + ["get_user_id_and_table_dir"], + None, + False, + ), + ( + {"s3_output": "somewhere"}, + fun_with_sql_db_s3_ctas, + ["get_user_id_and_table_dir"], + None, + True, + ), + ], + ids=[ + "fun_boto_only", + "fun_boto_only::boto3", + "fun_with_sql", + "fun_with_sql::sql", + "fun_with_sql_and_db_s3", + "fun_with_sql_and_db_s3::s3_output", + "fun_with_sql_and_db_s3::database-temp", + "fun_with_sql_and_db_s3::database-userdefined", + "fun_with_sql_db_s3_ctas::ctas_approach-True", + "fun_with_sql_db_s3_ctas::ctas_approach-True,database-userdefined", + "fun_with_sql_db_s3_ctas::ctas_approach-False", + "fun_with_sql_db_s3_ctas:s3_output", + ], ) -def test_init_athena_params(fun_params, fun, expected_events, expect_warns, monkeypatch): - monkeypatch.setattr( - "pydbtools.wrangler.get_boto_session", - get_empty_boto_log - ) +def test_init_athena_params( + fun_params, fun, expected_events, expect_warns, returned_db, monkeypatch +): + monkeypatch.setattr("pydbtools.wrangler.get_boto_session", get_empty_boto_log) monkeypatch.setattr( - "pydbtools.wrangler.get_user_id_and_table_dir", - mock_get_user_id_and_table_dir - ) + "pydbtools.wrangler.get_user_id_and_table_dir", mock_get_user_id_and_table_dir + ) monkeypatch.setattr( "pydbtools.wrangler.get_database_name_from_userid", 
- lambda user_id: "mojap_de_temp_pytest" + lambda user_id: "mojap_de_temp_pytest", ) monkeypatch.setattr( - "pydbtools.wrangler._create_temp_database", - lambda user_id: mojap_de_temp_pytest + "pydbtools.wrangler._create_temp_database", mock_create_temp_database ) if expect_warns: @@ -121,5 +162,10 @@ def test_init_athena_params(fun_params, fun, expected_events, expect_warns, monk assert out.get("boto3_session").get("events") == expected_events - if fun.__name__ == "": - pass \ No newline at end of file + if fun.__name__ in ["fun_boto_only", "fun_with_sql"]: + assert isinstance(out.get("boto3_session"), dict) + + elif fun.__name__ in ["fun_with_sql_and_db_s3", "fun_with_sql_db_s3_ctas"]: + assert isinstance(out.get("boto3_session"), dict) + assert out.get("s3_output") == "s3://dummy/path/" + assert out["database"] == returned_db From 3215ba4f38e5c16d343af323bbb13f1ee0599cbc Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Thu, 21 Jan 2021 14:54:12 +0000 Subject: [PATCH 14/15] Updated changelog, readme and pyproject toml --- CHANGELOG.md | 8 + README.md | 93 +++++++ poetry.lock | 596 ++++++++++++++++++++++++++++++----------- pydbtools/wrangler.py | 8 +- pyproject.toml | 4 +- tests/test_wrangler.py | 7 +- 6 files changed, 549 insertions(+), 167 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b93f5b9..9b0ebad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v3.0.0 - 2021-01-26 + +### Changed +- `pydbtools` now acts as a wrapper for the athena module in awswrangler +- Previous functions `get_athena_query_response` and `read_sql` are now deprecated (but still currently work with this release. later releases may remove them). 
+- Allows users to create temporary tables that are stored in a database aliased by the name `__temp__`. SQL queries will replace any reference to `__temp__` with the real database name before the call. + + ## v2.0.2 - 2020-11-26 ### Fixed diff --git a/README.md b/README.md index 15ac866..4f72237 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,98 @@ # pydbtools +## Installation + +> Requires a pip release above 20. + +```bash +pip install "pydbtools @ git+https://github.com/moj-analytical-services/pydbtools" + +## Or install a specific release + +pip install "pydbtools @ git+https://github.com/moj-analytical-services/pydbtools@v3.0.0" +``` + +## Introduction + +This package is a wrapper for [awswrangler](https://aws-data-wrangler.readthedocs.io/en/2.3.0/what.html) that which presets/defines some of the input parameters to the athena module functions to align with our platform setup. See the [awswrangler API reference documentation for Athena](https://aws-data-wrangler.readthedocs.io/en/2.3.0/api.html#amazon-athena) to see what functions you can call from pydbtools. + +The function parameters that are locked down / altered by `pydbtools` are: +- **boto3_session:** This is auto generated by `pydbtools` (in order to grab the user credentials from the sts client - this is needed for the R version of this package which calls this package under the hood. In short forcing refreshed credentials are needed in R as boto3 credentials timeout and do not refresh when using reticulate (at least currently)) +- **s3_output:** The S3 path where database queries are written to. This is defined by `pydbtools` based on the IAM user/role calling the query (ensures that each role can only read/write to a S3 path only they can access). +- **database:** Will either be set to `None` or `__temp__` depending on other user parameters (if `ctas_approach=True`). `__temp__` is an alias to an autogenerated temp database name which is generated from `pydbtools` again based on the IAM user/role. 
References to this temporary database can be referenced by the keyword `__temp__` in SQL queries see additional functionality to awswrangler section. +- **sql:** We allows reference to the database name `__temp__` which is an alias to a user specific temporary database. When a function call has an SQL parameter the SQL is checked with an SQL parser and then any reference to `__temp__` as a database is replaced with the actual database name which is autogenerated. This replacement only occurs for `SELECT` queries. + +## Additional Functionality + +As well as acting as a wrapper function for awswrangler this package also allows you to do the following: + +### Run query and wait for a response + +This function essentially calls two functions from `awswrangler.athena`. First `start_query_execution` followed by `wait_query`. + +```python +import pydbtools as pydb + +response = pydb.start_query_execution_and_wait("SELECT * from a_database.table LIMIT 10") +``` + +### Create Temporary Tables + +You can use the `create_temp_table` function to write SQL to create a store a temporary table that sits in your `__temp__` database. 
+ +```python +import pydbtools as pydb + +pydb.create_temp_table("SELECT * from a_database.table LIMIT 10", table_name="temp_table_1") +df = pydb.read_sql_query("SELECT * from __temp__.temp_table_1") +df.head() +``` + +## Usage / Examples + +### Simple + +```python +import pydbtools as pydb + +# Run a query using pydbtools +response = pydb.start_query_execution_and_wait("CREATE DATABASE IF NOT EXISTS my_test_database") + +# Read data from an athena query directly into pandas +pydb.read_sql("SELECT * from a_database.table LIMIT 10") + +# Create a temp table to do further seperate SQL queries later on +pydb.create_temp_table("SELECT a_col, count(*) as n FROM a_database.table GROUP BY a_col", table_name="temp_table_1") +df = pydb.read_sql_query("SELECT * FROM __temp__.temp_table_1 WHERE n < 10") +``` + +### More advanced usage + +Get the actual name for your temp database, create your temp db then delete it using awswrangler (not awsrangler will raise an error if the database does not exist) + +```python +import awswrangler as wr +import pydbtools as pydb + +user_id, out_path = pydb.get_user_id_and_table_dir() +temp_db_name = pydb.get_database_name_from_userid(user_id) +print(temp_db_name) +pydb.create_temp_table() +print(wr.catalog.delete_database(name=temp_db_name)) +``` + +# DEPRECATED + +## Functions + +The functions: +- `pydbtools.get_athena_query_response` +- `pydbtools.read_sql` + +Are now deprecated and calls to these functions will raise an warning. They have been replaced by `pydbtools.start_query_execution_and_wait` and `pydbtools.read_sql_query`. + +## Docs for versions below v3.0.0 + This is a simple package that let's you query databases using Amazon Athena and get the s3 path to the athena out (as a csv). This is significantly faster than using the the database drivers so might be a good option when pulling in large data. 
By default, data is converted into a pandas dataframe with equivalent column data types as the Athena table - see "Meta Data" section below. Note to use this package you need to be added to the StandardDatabaseAccess IAM Policy on the Analytical Platform. Please contact the team if you require access. diff --git a/poetry.lock b/poetry.lock index dcc9795..71e43d1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,314 +1,486 @@ [[package]] -category = "dev" -description = "Atomic file writes." -marker = "sys_platform == \"win32\"" name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "1.4.0" [[package]] -category = "dev" -description = "Classes Without Boilerplate" name = "attrs" +version = "20.3.0" +description = "Classes Without Boilerplate" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "20.3.0" [package.extras] -dev = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"] docs = ["furo", "sphinx", "zope.interface"] -tests = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] -tests_no_zope = ["coverage (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"] [[package]] +name = "awswrangler" +version = "2.3.0" +description = "Pandas on AWS." 
category = "main" -description = "The AWS SDK for Python" +optional = false +python-versions = ">=3.6, <3.9" + +[package.dependencies] +boto3 = ">=1.12.49,<2.0.0" +botocore = ">=1.15.49,<2.0.0" +numpy = ">=1.18.0,<1.20.0" +openpyxl = ">=3.0.0,<3.1.0" +pandas = ">=1.1.0,<=1.2.0" +pg8000 = ">=1.16.0,<1.17.0" +pyarrow = ">=2.0.0,<2.1.0" +pymysql = ">=0.9.0,<1.1.0" +redshift-connector = ">=2.0.0,<2.1.0" + +[package.extras] +sqlserver = ["pyodbc (>=4.0.30,<4.1.0)"] + +[[package]] +name = "beautifulsoup4" +version = "4.9.3" +description = "Screen-scraping library" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +soupsieve = {version = ">1.2", markers = "python_version >= \"3.0\""} + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] name = "boto3" +version = "1.16.57" +description = "The AWS SDK for Python" +category = "main" optional = false python-versions = "*" -version = "1.16.53" [package.dependencies] -botocore = ">=1.19.53,<1.20.0" +botocore = ">=1.19.57,<1.20.0" jmespath = ">=0.7.1,<1.0.0" s3transfer = ">=0.3.0,<0.4.0" [[package]] -category = "main" -description = "Low-level, data-driven core of boto 3." name = "botocore" +version = "1.19.57" +description = "Low-level, data-driven core of boto 3." +category = "main" optional = false python-versions = "*" -version = "1.19.53" [package.dependencies] jmespath = ">=0.7.1,<1.0.0" python-dateutil = ">=2.1,<3.0.0" +urllib3 = {version = ">=1.25.4,<1.27", markers = "python_version != \"3.4\""} -[package.dependencies.urllib3] -python = "<3.4.0 || >=3.5.0" -version = ">=1.25.4,<1.27" +[[package]] +name = "certifi" +version = "2020.12.5" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" +optional = false +python-versions = "*" [[package]] -category = "dev" -description = "Cross-platform colored terminal text." 
-marker = "sys_platform == \"win32\"" -name = "colorama" +name = "chardet" +version = "4.0.0" +description = "Universal encoding detector for Python 2 and 3" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[[package]] +name = "colorama" version = "0.4.4" +description = "Cross-platform colored terminal text." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] +name = "et-xmlfile" +version = "1.0.1" +description = "An implementation of lxml.xmlfile for the standard library" category = "main" -description = "Data engineering utils Python 3 version" -name = "dataengineeringutils3" optional = false -python-versions = ">=3.6,<4.0" -version = "1.1.0" - -[package.dependencies] -boto3 = ">=1.10,<2.0" +python-versions = "*" [[package]] -category = "main" -description = "File-system specification" name = "fsspec" +version = "0.8.5" +description = "File-system specification" +category = "main" optional = false -python-versions = ">3.5" -version = "0.8.0" +python-versions = ">3.6" + +[package.extras] +abfs = ["adlfs"] +adl = ["adlfs"] +dask = ["dask", "distributed"] +dropbox = ["dropboxdrivefs", "requests", "dropbox"] +gcs = ["gcsfs"] +git = ["pygit2"] +github = ["requests"] +gs = ["gcsfs"] +hdfs = ["pyarrow"] +http = ["requests", "aiohttp"] +s3 = ["s3fs"] +sftp = ["paramiko"] +smb = ["smbprotocol"] +ssh = ["paramiko"] [[package]] +name = "idna" +version = "2.10" +description = "Internationalized Domain Names in Applications (IDNA)" category = "main" -description = "Python 2.7 utils for glue jobs" -name = "gluejobutils" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "3.2.0" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] -category = "dev" -description = "Read metadata from Python packages" -marker = "python_version < \"3.8\"" name = "importlib-metadata" +version = 
"3.4.0" +description = "Read metadata from Python packages" +category = "dev" optional = false python-versions = ">=3.6" -version = "3.4.0" [package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" -[package.dependencies.typing-extensions] -python = "<3.8" -version = ">=3.6.4" - [package.extras] docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] testing = ["pytest (>=3.5,<3.7.3 || >3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"] [[package]] +name = "jdcal" +version = "1.4.1" +description = "Julian dates from proleptic Gregorian and Julian calendars." category = "main" -description = "JSON Matching Expressions" +optional = false +python-versions = "*" + +[[package]] name = "jmespath" +version = "0.10.0" +description = "JSON Matching Expressions" +category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -version = "0.10.0" [[package]] -category = "dev" -description = "More routines for operating on iterables, beyond itertools" +name = "lxml" +version = "4.6.2" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["beautifulsoup4"] +source = ["Cython (>=0.29.7)"] + +[[package]] name = "more-itertools" +version = "8.6.0" +description = "More routines for operating on iterables, beyond itertools" +category = "dev" optional = false python-versions = ">=3.5" -version = "8.6.0" [[package]] -category = "main" -description = "NumPy is the fundamental package for array computing with Python." 
name = "numpy" +version = "1.19.5" +description = "NumPy is the fundamental package for array computing with Python." +category = "main" optional = false python-versions = ">=3.6" -version = "1.19.5" [[package]] -category = "dev" -description = "Core utilities for Python packages" +name = "openpyxl" +version = "3.0.6" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +category = "main" +optional = false +python-versions = ">=3.6," + +[package.dependencies] +et-xmlfile = "*" +jdcal = "*" + +[[package]] name = "packaging" +version = "20.8" +description = "Core utilities for Python packages" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "20.8" [package.dependencies] pyparsing = ">=2.0.2" [[package]] -category = "main" -description = "Powerful data structures for data analysis, time series, and statistics" name = "pandas" +version = "1.1.5" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" optional = false -python-versions = ">=3.5.3" -version = "0.25.3" +python-versions = ">=3.6.1" [package.dependencies] -numpy = ">=1.13.3" -python-dateutil = ">=2.6.1" +numpy = ">=1.15.4" +python-dateutil = ">=2.7.3" pytz = ">=2017.2" [package.extras] test = ["pytest (>=4.0.2)", "pytest-xdist", "hypothesis (>=3.58)"] [[package]] -category = "dev" -description = "plugin and hook calling mechanisms for python" +name = "pg8000" +version = "1.16.6" +description = "PostgreSQL interface library" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +scramp = "1.2.0" + +[[package]] name = "pluggy" +version = "0.13.1" +description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.13.1" [package.dependencies] -[package.dependencies.importlib-metadata] -python = "<3.8" -version = ">=0.12" +importlib-metadata 
= {version = ">=0.12", markers = "python_version < \"3.8\""} [package.extras] dev = ["pre-commit", "tox"] [[package]] -category = "main" -description = "Python library for extracting version from poetry pyproject.toml file" name = "poetry-version" +version = "0.1.5" +description = "Python library for extracting version from poetry pyproject.toml file" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.1.5" [package.dependencies] tomlkit = ">=0.4.6,<0.6.0" [[package]] -category = "dev" -description = "library with cross-python path, ini-parsing, io, code, log facilities" name = "py" +version = "1.10.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "1.10.0" [[package]] -category = "dev" -description = "Python parsing module" +name = "pyarrow" +version = "2.0.0" +description = "Python library for Apache Arrow" +category = "main" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +numpy = ">=1.14" + +[[package]] +name = "pymysql" +version = "1.0.2" +description = "Pure Python MySQL Driver" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.extras] +ed25519 = ["PyNaCl (>=1.4.0)"] +rsa = ["cryptography"] + +[[package]] name = "pyparsing" +version = "2.4.7" +description = "Python parsing module" +category = "dev" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -version = "2.4.7" [[package]] -category = "dev" -description = "pytest: simple powerful testing with Python" name = "pytest" +version = "5.4.3" +description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = ">=3.5" -version = "5.4.3" [package.dependencies] -atomicwrites = ">=1.0" +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} attrs = ">=17.4.0" -colorama = "*" 
+colorama = {version = "*", markers = "sys_platform == \"win32\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} more-itertools = ">=4.0.0" packaging = "*" pluggy = ">=0.12,<1.0" py = ">=1.5.0" wcwidth = "*" -[package.dependencies.importlib-metadata] -python = "<3.8" -version = ">=0.12" - [package.extras] checkqa-mypy = ["mypy (v0.761)"] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] [[package]] -category = "main" -description = "Extensions to the standard Python datetime module" name = "python-dateutil" +version = "2.8.1" +description = "Extensions to the standard Python datetime module" +category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -version = "2.8.1" [package.dependencies] six = ">=1.5" [[package]] -category = "main" -description = "World timezone definitions, modern and historical" name = "pytz" +version = "2020.5" +description = "World timezone definitions, modern and historical" +category = "main" optional = false python-versions = "*" -version = "2020.5" [[package]] +name = "redshift-connector" +version = "2.0.872" +description = "Redshift interface library" category = "main" -description = "Convenient Filesystem interface over S3" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +beautifulsoup4 = ">=4.7.0" +boto3 = ">=1.16.8" +botocore = ">=1.19.8" +lxml = ">=4.2.5" +pytz = ">=2020.1" +requests = ">=2.23.0" +scramp = ">=1.2.0" + +[package.extras] +full = ["numpy", "pandas"] + +[[package]] +name = "requests" +version = "2.25.1" +description = "Python HTTP for Humans." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.dependencies] +certifi = ">=2017.4.17" +chardet = ">=3.0.2,<5" +idna = ">=2.5,<3" +urllib3 = ">=1.21.1,<1.27" + +[package.extras] +security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] +socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"] + +[[package]] name = "s3fs" +version = "0.4.2" +description = "Convenient Filesystem interface over S3" +category = "main" optional = false python-versions = ">= 3.5" -version = "0.4.2" [package.dependencies] botocore = ">=1.12.91" fsspec = ">=0.6.0" [[package]] -category = "main" -description = "An Amazon S3 Transfer Manager" name = "s3transfer" +version = "0.3.4" +description = "An Amazon S3 Transfer Manager" +category = "main" optional = false python-versions = "*" -version = "0.3.4" [package.dependencies] botocore = ">=1.12.36,<2.0a.0" [[package]] +name = "scramp" +version = "1.2.0" +description = "An implementation of the SCRAM protocol." category = "main" -description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=3.5" + +[[package]] name = "six" +version = "1.15.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -version = "1.15.0" [[package]] +name = "soupsieve" +version = "2.1" +description = "A modern CSS selector implementation for Beautiful Soup." 
category = "main" -description = "Non-validating SQL parser" +optional = false +python-versions = ">=3.5" + +[[package]] name = "sqlparse" +version = "0.3.1" +description = "Non-validating SQL parser" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.3.1" [[package]] -category = "main" -description = "Style preserving TOML library" name = "tomlkit" +version = "0.5.11" +description = "Style preserving TOML library" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "0.5.11" [[package]] -category = "dev" -description = "Backported and Experimental Type Hints for Python 3.5+" -marker = "python_version < \"3.8\"" name = "typing-extensions" +version = "3.7.4.3" +description = "Backported and Experimental Type Hints for Python 3.5+" +category = "dev" optional = false python-versions = "*" -version = "3.7.4.3" [[package]] -category = "main" -description = "HTTP library with thread-safe connection pooling, file post, and more." -marker = "python_version != \"3.4\"" name = "urllib3" +version = "1.26.2" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" -version = "1.26.2" [package.extras] brotli = ["brotlipy (>=0.6.0)"] @@ -316,29 +488,29 @@ secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "cer socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"] [[package]] -category = "dev" -description = "Measures the displayed width of unicode strings in a terminal" name = "wcwidth" +version = "0.2.5" +description = "Measures the displayed width of unicode strings in a terminal" +category = "dev" optional = false python-versions = "*" -version = "0.2.5" [[package]] -category = "dev" -description = "Backport of pathlib-compatible object wrapper for zip files" -marker = "python_version < \"3.8\"" name = "zipp" +version = "3.4.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +category = "dev" optional = false python-versions = ">=3.6" -version = "3.4.0" [package.extras] docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] testing = ["pytest (>=3.5,<3.7.3 || >3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "jaraco.test (>=3.2.0)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"] [metadata] -content-hash = "f2f710db21e4615c0dcb871f89497a8eabe7df5b2209061c7c60a9c558c2e8b7" -python-versions = "^3.6" +lock-version = "1.1" +python-versions = ">=3.7, <3.9" +content-hash = "26616fbb2829359c8b9a005d7d1f982debddaec7aea0c6c822d6a3ea18a95967" [metadata.files] atomicwrites = [ @@ -349,38 +521,98 @@ attrs = [ {file = "attrs-20.3.0-py2.py3-none-any.whl", hash = "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6"}, {file = "attrs-20.3.0.tar.gz", hash = "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"}, ] +awswrangler = [ + {file = "awswrangler-2.3.0-py3-none-any.whl", hash = "sha256:599e72937d87f84bce95bbd3e7490a49a065ff2c9844401562520994b31471af"}, + {file = 
"awswrangler-2.3.0-py3.6.egg", hash = "sha256:147b7d6a1521dc0f3410e0b49c7449454fe27a4d20bb125c2ebc491b75fbc9a9"}, + {file = "awswrangler-2.3.0.tar.gz", hash = "sha256:5f369fccf791e80efa7bb70496182857f35233275f66ae62d476e68be8892165"}, +] +beautifulsoup4 = [ + {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"}, + {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"}, + {file = "beautifulsoup4-4.9.3.tar.gz", hash = "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25"}, +] boto3 = [ - {file = "boto3-1.16.53-py2.py3-none-any.whl", hash = "sha256:b1e91860fe2cae986f8e8238c12724f7fe4631a183e2c6f6b86714cc98645a6a"}, - {file = "boto3-1.16.53.tar.gz", hash = "sha256:71a0c22a040ac3a785f558628abfea8be86bb30b29003ebd124c51aba97dfeb8"}, + {file = "boto3-1.16.57-py2.py3-none-any.whl", hash = "sha256:2fd3c2f42006988dc8ddae43c988aea481d11e2af7ab1deb83b293640357986c"}, + {file = "boto3-1.16.57.tar.gz", hash = "sha256:4a499cc2f53dd557a88c6db6a552748a2abd83ffeda70ceb71dc8db39a027314"}, ] botocore = [ - {file = "botocore-1.19.53-py2.py3-none-any.whl", hash = "sha256:21677cda7b32492a1a74ac40e51d691a6623578d6700feb9976966d26a576414"}, - {file = "botocore-1.19.53.tar.gz", hash = "sha256:e93539781c43bd64291798a01cc6df2c0ff98e01ae7fe48286942ca8fa351680"}, + {file = "botocore-1.19.57-py2.py3-none-any.whl", hash = "sha256:cf7d108a4d67a0fe670379111927b5d9e0ff1160146c81c326bb9e54c2b8cb19"}, + {file = "botocore-1.19.57.tar.gz", hash = "sha256:c756d65ffa989c5c0e92178175e41abf7b18ad19b2fe2e82e192f085e264e03a"}, +] +certifi = [ + {file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"}, + {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"}, +] +chardet = [ + {file = 
"chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, + {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, ] colorama = [ {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, ] -dataengineeringutils3 = [ - {file = "dataengineeringutils3-1.1.0-py3-none-any.whl", hash = "sha256:96ad4c17988aa6c82208d4833a9887fd7a6c14f25ca0d9153e9d527052371b58"}, - {file = "dataengineeringutils3-1.1.0.tar.gz", hash = "sha256:e2cd9e162724c4dc25efa85eccb49d79c0e74837c893606e3962f431bb546de7"}, +et-xmlfile = [ + {file = "et_xmlfile-1.0.1.tar.gz", hash = "sha256:614d9722d572f6246302c4491846d2c393c199cfa4edc9af593437691683335b"}, ] fsspec = [ - {file = "fsspec-0.8.0-py3-none-any.whl", hash = "sha256:ce109f41ffe62853d5de84888f3e455c39f2a0796c05b558474c77156e19b570"}, - {file = "fsspec-0.8.0.tar.gz", hash = "sha256:176f3fc405471af0f1f1e14cffa3d53ab8906577973d068b976114433c010d9d"}, + {file = "fsspec-0.8.5-py3-none-any.whl", hash = "sha256:5629dc945800873cb2092df806c854e74c2799f4854247bce37ca7171000a7ec"}, + {file = "fsspec-0.8.5.tar.gz", hash = "sha256:890c6ce9325030f03bd2eae81389ddcbcee53bdd475334ca064595e1e45f92a6"}, ] -gluejobutils = [ - {file = "gluejobutils-3.2.0-py2.py3-none-any.whl", hash = "sha256:89c01519af9a09b392b391261bf55d2e6e2375064f6058a15e9bbfa195a7388e"}, - {file = "gluejobutils-3.2.0.tar.gz", hash = "sha256:38dc2ab557fd8d14bd1f4985d0316a5ff35754986752639ea351cf22b819d1ee"}, +idna = [ + {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, + {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"}, ] importlib-metadata = [ {file = 
"importlib_metadata-3.4.0-py3-none-any.whl", hash = "sha256:ace61d5fc652dc280e7b6b4ff732a9c2d40db2c0f92bc6cb74e07b73d53a1771"}, {file = "importlib_metadata-3.4.0.tar.gz", hash = "sha256:fa5daa4477a7414ae34e95942e4dd07f62adf589143c875c133c1e53c4eff38d"}, ] +jdcal = [ + {file = "jdcal-1.4.1-py2.py3-none-any.whl", hash = "sha256:1abf1305fce18b4e8aa248cf8fe0c56ce2032392bc64bbd61b5dff2a19ec8bba"}, + {file = "jdcal-1.4.1.tar.gz", hash = "sha256:472872e096eb8df219c23f2689fc336668bdb43d194094b5cc1707e1640acfc8"}, +] jmespath = [ {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, ] +lxml = [ + {file = "lxml-4.6.2-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a9d6bc8642e2c67db33f1247a77c53476f3a166e09067c0474facb045756087f"}, + {file = "lxml-4.6.2-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:791394449e98243839fa822a637177dd42a95f4883ad3dec2a0ce6ac99fb0a9d"}, + {file = "lxml-4.6.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:68a5d77e440df94011214b7db907ec8f19e439507a70c958f750c18d88f995d2"}, + {file = "lxml-4.6.2-cp27-cp27m-win32.whl", hash = "sha256:fc37870d6716b137e80d19241d0e2cff7a7643b925dfa49b4c8ebd1295eb506e"}, + {file = "lxml-4.6.2-cp27-cp27m-win_amd64.whl", hash = "sha256:69a63f83e88138ab7642d8f61418cf3180a4d8cd13995df87725cb8b893e950e"}, + {file = "lxml-4.6.2-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:42ebca24ba2a21065fb546f3e6bd0c58c3fe9ac298f3a320147029a4850f51a2"}, + {file = "lxml-4.6.2-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:f83d281bb2a6217cd806f4cf0ddded436790e66f393e124dfe9731f6b3fb9afe"}, + {file = "lxml-4.6.2-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:535f067002b0fd1a4e5296a8f1bf88193080ff992a195e66964ef2a6cfec5388"}, + {file = "lxml-4.6.2-cp35-cp35m-manylinux1_x86_64.whl", hash = 
"sha256:366cb750140f221523fa062d641393092813b81e15d0e25d9f7c6025f910ee80"}, + {file = "lxml-4.6.2-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:97db258793d193c7b62d4e2586c6ed98d51086e93f9a3af2b2034af01450a74b"}, + {file = "lxml-4.6.2-cp35-cp35m-win32.whl", hash = "sha256:648914abafe67f11be7d93c1a546068f8eff3c5fa938e1f94509e4a5d682b2d8"}, + {file = "lxml-4.6.2-cp35-cp35m-win_amd64.whl", hash = "sha256:4e751e77006da34643ab782e4a5cc21ea7b755551db202bc4d3a423b307db780"}, + {file = "lxml-4.6.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:681d75e1a38a69f1e64ab82fe4b1ed3fd758717bed735fb9aeaa124143f051af"}, + {file = "lxml-4.6.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:127f76864468d6630e1b453d3ffbbd04b024c674f55cf0a30dc2595137892d37"}, + {file = "lxml-4.6.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:4fb85c447e288df535b17ebdebf0ec1cf3a3f1a8eba7e79169f4f37af43c6b98"}, + {file = "lxml-4.6.2-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:5be4a2e212bb6aa045e37f7d48e3e1e4b6fd259882ed5a00786f82e8c37ce77d"}, + {file = "lxml-4.6.2-cp36-cp36m-win32.whl", hash = "sha256:8c88b599e226994ad4db29d93bc149aa1aff3dc3a4355dd5757569ba78632bdf"}, + {file = "lxml-4.6.2-cp36-cp36m-win_amd64.whl", hash = "sha256:6e4183800f16f3679076dfa8abf2db3083919d7e30764a069fb66b2b9eff9939"}, + {file = "lxml-4.6.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d8d3d4713f0c28bdc6c806a278d998546e8efc3498949e3ace6e117462ac0a5e"}, + {file = "lxml-4.6.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:8246f30ca34dc712ab07e51dc34fea883c00b7ccb0e614651e49da2c49a30711"}, + {file = "lxml-4.6.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:923963e989ffbceaa210ac37afc9b906acebe945d2723e9679b643513837b089"}, + {file = "lxml-4.6.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:1471cee35eba321827d7d53d104e7b8c593ea3ad376aa2df89533ce8e1b24a01"}, + {file = "lxml-4.6.2-cp37-cp37m-win32.whl", hash = "sha256:2363c35637d2d9d6f26f60a208819e7eafc4305ce39dc1d5005eccc4593331c2"}, + 
{file = "lxml-4.6.2-cp37-cp37m-win_amd64.whl", hash = "sha256:f4822c0660c3754f1a41a655e37cb4dbbc9be3d35b125a37fab6f82d47674ebc"}, + {file = "lxml-4.6.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0448576c148c129594d890265b1a83b9cd76fd1f0a6a04620753d9a6bcfd0a4d"}, + {file = "lxml-4.6.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:60a20bfc3bd234d54d49c388950195d23a5583d4108e1a1d47c9eef8d8c042b3"}, + {file = "lxml-4.6.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:2e5cc908fe43fe1aa299e58046ad66981131a66aea3129aac7770c37f590a644"}, + {file = "lxml-4.6.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:50c348995b47b5a4e330362cf39fc503b4a43b14a91c34c83b955e1805c8e308"}, + {file = "lxml-4.6.2-cp38-cp38-win32.whl", hash = "sha256:94d55bd03d8671686e3f012577d9caa5421a07286dd351dfef64791cf7c6c505"}, + {file = "lxml-4.6.2-cp38-cp38-win_amd64.whl", hash = "sha256:7a7669ff50f41225ca5d6ee0a1ec8413f3a0d8aa2b109f86d540887b7ec0d72a"}, + {file = "lxml-4.6.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e0bfe9bb028974a481410432dbe1b182e8191d5d40382e5b8ff39cdd2e5c5931"}, + {file = "lxml-4.6.2-cp39-cp39-manylinux1_i686.whl", hash = "sha256:6fd8d5903c2e53f49e99359b063df27fdf7acb89a52b6a12494208bf61345a03"}, + {file = "lxml-4.6.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7e9eac1e526386df7c70ef253b792a0a12dd86d833b1d329e038c7a235dfceb5"}, + {file = "lxml-4.6.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:7ee8af0b9f7de635c61cdd5b8534b76c52cd03536f29f51151b377f76e214a1a"}, + {file = "lxml-4.6.2-cp39-cp39-win32.whl", hash = "sha256:2e6fd1b8acd005bd71e6c94f30c055594bbd0aa02ef51a22bbfa961ab63b2d75"}, + {file = "lxml-4.6.2-cp39-cp39-win_amd64.whl", hash = "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf"}, + {file = "lxml-4.6.2.tar.gz", hash = "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc"}, +] more-itertools = [ {file = "more-itertools-8.6.0.tar.gz", hash = 
"sha256:b3a9005928e5bed54076e6e549c792b306fddfe72b2d1d22dd63d42d5d3899cf"}, {file = "more_itertools-8.6.0-py3-none-any.whl", hash = "sha256:8e1a2a43b2f2727425f2b5839587ae37093f19153dc26c0927d1048ff6557330"}, @@ -421,30 +653,43 @@ numpy = [ {file = "numpy-1.19.5-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:a0d53e51a6cb6f0d9082decb7a4cb6dfb33055308c4c44f53103c073f649af73"}, {file = "numpy-1.19.5.zip", hash = "sha256:a76f502430dd98d7546e1ea2250a7360c065a5fdea52b2dffe8ae7180909b6f4"}, ] +openpyxl = [ + {file = "openpyxl-3.0.6-py2.py3-none-any.whl", hash = "sha256:1a4b3869c2500b5c713e8e28341cdada49ecfcff1b10cd9006945f5bcefc090d"}, + {file = "openpyxl-3.0.6.tar.gz", hash = "sha256:b229112b46e158b910a5d1b270b212c42773d39cab24e8db527f775b82afc041"}, +] packaging = [ {file = "packaging-20.8-py2.py3-none-any.whl", hash = "sha256:24e0da08660a87484d1602c30bb4902d74816b6985b93de36926f5bc95741858"}, {file = "packaging-20.8.tar.gz", hash = "sha256:78598185a7008a470d64526a8059de9aaa449238f280fc9eb6b13ba6c4109093"}, ] pandas = [ - {file = "pandas-0.25.3-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:df8864824b1fe488cf778c3650ee59c3a0d8f42e53707de167ba6b4f7d35f133"}, - {file = "pandas-0.25.3-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:7458c48e3d15b8aaa7d575be60e1e4dd70348efcd9376656b72fecd55c59a4c3"}, - {file = "pandas-0.25.3-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:61741f5aeb252f39c3031d11405305b6d10ce663c53bc3112705d7ad66c013d0"}, - {file = "pandas-0.25.3-cp35-cp35m-win32.whl", hash = "sha256:adc3d3a3f9e59a38d923e90e20c4922fc62d1e5a03d083440468c6d8f3f1ae0a"}, - {file = "pandas-0.25.3-cp35-cp35m-win_amd64.whl", hash = "sha256:975c461accd14e89d71772e89108a050fa824c0b87a67d34cedf245f6681fc17"}, - {file = "pandas-0.25.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ee50c2142cdcf41995655d499a157d0a812fce55c97d9aad13bc1eef837ed36c"}, - {file = "pandas-0.25.3-cp36-cp36m-manylinux1_i686.whl", hash = 
"sha256:4545467a637e0e1393f7d05d61dace89689ad6d6f66f267f86fff737b702cce9"}, - {file = "pandas-0.25.3-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:bbe3eb765a0b1e578833d243e2814b60c825b7fdbf4cdfe8e8aae8a08ed56ecf"}, - {file = "pandas-0.25.3-cp36-cp36m-win32.whl", hash = "sha256:8153705d6545fd9eb6dd2bc79301bff08825d2e2f716d5dced48daafc2d0b81f"}, - {file = "pandas-0.25.3-cp36-cp36m-win_amd64.whl", hash = "sha256:26382aab9c119735908d94d2c5c08020a4a0a82969b7e5eefb92f902b3b30ad7"}, - {file = "pandas-0.25.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:00dff3a8e337f5ed7ad295d98a31821d3d0fe7792da82d78d7fd79b89c03ea9d"}, - {file = "pandas-0.25.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e45055c30a608076e31a9fcd780a956ed3b1fa20db61561b8d88b79259f526f7"}, - {file = "pandas-0.25.3-cp37-cp37m-win32.whl", hash = "sha256:255920e63850dc512ce356233081098554d641ba99c3767dde9e9f35630f994b"}, - {file = "pandas-0.25.3-cp37-cp37m-win_amd64.whl", hash = "sha256:22361b1597c8c2ffd697aa9bf85423afa9e1fcfa6b1ea821054a244d5f24d75e"}, - {file = "pandas-0.25.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9962957a27bfb70ab64103d0a7b42fa59c642fb4ed4cb75d0227b7bb9228535d"}, - {file = "pandas-0.25.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:78bf638993219311377ce9836b3dc05f627a666d0dbc8cec37c0ff3c9ada673b"}, - {file = "pandas-0.25.3-cp38-cp38-win32.whl", hash = "sha256:6a3ac2c87e4e32a969921d1428525f09462770c349147aa8e9ab95f88c71ec71"}, - {file = "pandas-0.25.3-cp38-cp38-win_amd64.whl", hash = "sha256:33970f4cacdd9a0ddb8f21e151bfb9f178afb7c36eb7c25b9094c02876f385c2"}, - {file = "pandas-0.25.3.tar.gz", hash = "sha256:52da74df8a9c9a103af0a72c9d5fdc8e0183a90884278db7f386b5692a2220a4"}, + {file = "pandas-1.1.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:bf23a3b54d128b50f4f9d4675b3c1857a688cc6731a32f931837d72effb2698d"}, + {file = "pandas-1.1.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:5a780260afc88268a9d3ac3511d8f494fdcf637eece62fb9eb656a63d53eb7ca"}, + {file = 
"pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:b61080750d19a0122469ab59b087380721d6b72a4e7d962e4d7e63e0c4504814"}, + {file = "pandas-1.1.5-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:0de3ddb414d30798cbf56e642d82cac30a80223ad6fe484d66c0ce01a84d6f2f"}, + {file = "pandas-1.1.5-cp36-cp36m-win32.whl", hash = "sha256:70865f96bb38fec46f7ebd66d4b5cfd0aa6b842073f298d621385ae3898d28b5"}, + {file = "pandas-1.1.5-cp36-cp36m-win_amd64.whl", hash = "sha256:19a2148a1d02791352e9fa637899a78e371a3516ac6da5c4edc718f60cbae648"}, + {file = "pandas-1.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:26fa92d3ac743a149a31b21d6f4337b0594b6302ea5575b37af9ca9611e8981a"}, + {file = "pandas-1.1.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:c16d59c15d946111d2716856dd5479221c9e4f2f5c7bc2d617f39d870031e086"}, + {file = "pandas-1.1.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:3be7a7a0ca71a2640e81d9276f526bca63505850add10206d0da2e8a0a325dae"}, + {file = "pandas-1.1.5-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:573fba5b05bf2c69271a32e52399c8de599e4a15ab7cec47d3b9c904125ab788"}, + {file = "pandas-1.1.5-cp37-cp37m-win32.whl", hash = "sha256:21b5a2b033380adbdd36b3116faaf9a4663e375325831dac1b519a44f9e439bb"}, + {file = "pandas-1.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:24c7f8d4aee71bfa6401faeba367dd654f696a77151a8a28bc2013f7ced4af98"}, + {file = "pandas-1.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2860a97cbb25444ffc0088b457da0a79dc79f9c601238a3e0644312fcc14bf11"}, + {file = "pandas-1.1.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:5008374ebb990dad9ed48b0f5d0038124c73748f5384cc8c46904dace27082d9"}, + {file = "pandas-1.1.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:2c2f7c670ea4e60318e4b7e474d56447cf0c7d83b3c2a5405a0dbb2600b9c48e"}, + {file = "pandas-1.1.5-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0a643bae4283a37732ddfcecab3f62dd082996021b980f580903f4e8e01b3c5b"}, + {file = "pandas-1.1.5-cp38-cp38-win32.whl", hash = 
"sha256:5447ea7af4005b0daf695a316a423b96374c9c73ffbd4533209c5ddc369e644b"}, + {file = "pandas-1.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:4c62e94d5d49db116bef1bd5c2486723a292d79409fc9abd51adf9e05329101d"}, + {file = "pandas-1.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:731568be71fba1e13cae212c362f3d2ca8932e83cb1b85e3f1b4dd77d019254a"}, + {file = "pandas-1.1.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:c61c043aafb69329d0f961b19faa30b1dab709dd34c9388143fc55680059e55a"}, + {file = "pandas-1.1.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:2b1c6cd28a0dfda75c7b5957363333f01d370936e4c6276b7b8e696dd500582a"}, + {file = "pandas-1.1.5-cp39-cp39-win32.whl", hash = "sha256:c94ff2780a1fd89f190390130d6d36173ca59fcfb3fe0ff596f9a56518191ccb"}, + {file = "pandas-1.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:edda9bacc3843dfbeebaf7a701763e68e741b08fccb889c003b0a52f0ee95782"}, + {file = "pandas-1.1.5.tar.gz", hash = "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b"}, +] +pg8000 = [ + {file = "pg8000-1.16.6-py3-none-any.whl", hash = "sha256:66fa16a402f38f8ba664206b4ba4040f24ea9641c4205b2b96a1ff3a613de3be"}, + {file = "pg8000-1.16.6.tar.gz", hash = "sha256:8fc1e6a62ccb7c9830f1e7e9288e2d20eaf373cc8875b5c55b7d5d9b7717be91"}, ] pluggy = [ {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, @@ -458,6 +703,36 @@ py = [ {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, ] +pyarrow = [ + {file = "pyarrow-2.0.0-cp35-cp35m-macosx_10_13_intel.whl", hash = "sha256:6afc71cc9c234f3cdbe971297468755ec3392966cb19d3a6caf42fd7dbc6aaa9"}, + {file = "pyarrow-2.0.0-cp35-cp35m-macosx_10_9_intel.whl", hash = "sha256:eb05038b750a6e16a9680f9d2c40d050796284ea1f94690da8f4f28805af0495"}, + {file = 
"pyarrow-2.0.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3e33e9003794c9062f4c963a10f2a0d787b83d4d1a517a375294f2293180b778"}, + {file = "pyarrow-2.0.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:ffb306951b5925a0638dc2ef1ab7ce8033f39e5b4e0fef5787b91ef4fa7da19d"}, + {file = "pyarrow-2.0.0-cp35-cp35m-manylinux2014_x86_64.whl", hash = "sha256:dc0d04c42632e65c4fcbe2f82c70109c5f347652844ead285bc1285dc3a67660"}, + {file = "pyarrow-2.0.0-cp35-cp35m-win_amd64.whl", hash = "sha256:916b593a24f2812b9a75adef1143b1dd89d799e1803282fea2829c5dc0b828ea"}, + {file = "pyarrow-2.0.0-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:c801e59ec4e8d9d871e299726a528c3ba3139f2ce2d9cdab101f8483c52eec7c"}, + {file = "pyarrow-2.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0bf43e520c33ceb1dd47263a5326830fca65f18d827f7f7b8fe7e64fc4364d88"}, + {file = "pyarrow-2.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0b358773eb9fb1b31c8217c6c8c0b4681c3dff80562dc23ad5b379f0279dad69"}, + {file = "pyarrow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:1000e491e9a539588ec33a2c2603cf05f1d4629aef375345bfd64f2ab7bc8529"}, + {file = "pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:ce0462cec7f81c4ff87ce1a95c82a8d467606dce6c72e92906ac251c6115f32b"}, + {file = "pyarrow-2.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:16ec87163a2fb4abd48bf79cbdf70a7455faa83740e067c2280cfa45a63ed1f3"}, + {file = "pyarrow-2.0.0-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:acdd18fd83c0be0b53a8e734c0a650fb27bbf4e7d96a8f7eb0a7506ea58bd594"}, + {file = "pyarrow-2.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9a8d3c6baa6e159017d97e8a028ae9eaa2811d8f1ab3d22710c04dcddc0dd7a1"}, + {file = "pyarrow-2.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:652c5dff97624375ed0f97cc8ad6f88ee01953f15c17083917735de171f03fe0"}, + {file = "pyarrow-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:00d8fb8a9b2d9bb2f0ced2765b62c5d72689eed06c47315bca004584b0ccda60"}, + 
{file = "pyarrow-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:fb69672e69e1b752744ee1e236fdf03aad78ffec905fc5c19adbaf88bac4d0fd"}, + {file = "pyarrow-2.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ccff3a72f70ebfcc002bf75f5ad1248065e5c9c14e0dcfa599a438ea221c5658"}, + {file = "pyarrow-2.0.0-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:bc8c3713086e4a137b3fda4b149440458b1b0bd72f67b1afa2c7068df1edc060"}, + {file = "pyarrow-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f4ba9ab479c0172e532f5d73c68e30a31c16b01e09bb21eba9201561231f722"}, + {file = "pyarrow-2.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0db5156a66615591a4a8c66a9a30890a364a259de8d2a6ccb873c7d1740e6c75"}, + {file = "pyarrow-2.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:cf9bf10daadbbf1a360ac1c7dab0b4f8381d81a3f452737bd6ed310d57a88be8"}, + {file = "pyarrow-2.0.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:dd661b6598ce566c6f41d31cc1fc4482308613c2c0c808bd8db33b0643192f84"}, + {file = "pyarrow-2.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:14b02a629986c25e045f81771799e07a8bb3f339898c111314066436769a3dd4"}, +] +pymysql = [ + {file = "PyMySQL-1.0.2-py3-none-any.whl", hash = "sha256:41fc3a0c5013d5f039639442321185532e3e2c8924687abe6537de157d403641"}, + {file = "PyMySQL-1.0.2.tar.gz", hash = "sha256:816927a350f38d56072aeca5dfb10221fe1dc653745853d30a216637f5d7ad36"}, +] pyparsing = [ {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, @@ -474,6 +749,13 @@ pytz = [ {file = "pytz-2020.5-py2.py3-none-any.whl", hash = "sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4"}, {file = "pytz-2020.5.tar.gz", hash = "sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"}, ] +redshift-connector = [ + {file = 
"redshift_connector-2.0.872-py3-none-any.whl", hash = "sha256:7ef655ed33b0e12b4945d8eff91bc14131fbd8318b5ff6fabb8432e4a8920f91"}, +] +requests = [ + {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, + {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, +] s3fs = [ {file = "s3fs-0.4.2-py3-none-any.whl", hash = "sha256:91c1dfb45e5217bd441a7a560946fe865ced6225ff7eb0fb459fe6e601a95ed3"}, {file = "s3fs-0.4.2.tar.gz", hash = "sha256:2ca5de8dc18ad7ad350c0bd01aef0406aa5d0fff78a561f0f710f9d9858abdd0"}, @@ -482,10 +764,18 @@ s3transfer = [ {file = "s3transfer-0.3.4-py2.py3-none-any.whl", hash = "sha256:1e28620e5b444652ed752cf87c7e0cb15b0e578972568c6609f0f18212f259ed"}, {file = "s3transfer-0.3.4.tar.gz", hash = "sha256:7fdddb4f22275cf1d32129e21f056337fd2a80b6ccef1664528145b72c49e6d2"}, ] +scramp = [ + {file = "scramp-1.2.0-py3-none-any.whl", hash = "sha256:74815c25aad1fe0b5fb994e96c3de63e8695164358a80138352aaadfa4760350"}, + {file = "scramp-1.2.0.tar.gz", hash = "sha256:d6865ed1d135ddb124a619d7cd3a5b505f69a7c92e248024dd7e48bc77752af5"}, +] six = [ {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, ] +soupsieve = [ + {file = "soupsieve-2.1-py3-none-any.whl", hash = "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851"}, + {file = "soupsieve-2.1.tar.gz", hash = "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"}, +] sqlparse = [ {file = "sqlparse-0.3.1-py2.py3-none-any.whl", hash = "sha256:022fb9c87b524d1f7862b3037e541f68597a730a8843245c349fc93e1643dc4e"}, {file = "sqlparse-0.3.1.tar.gz", hash = "sha256:e162203737712307dfe78860cc56c8da8a852ab2ee33750e33aeadf38d12c548"}, diff --git a/pydbtools/wrangler.py 
b/pydbtools/wrangler.py index f208545..8f8a568 100644 --- a/pydbtools/wrangler.py +++ b/pydbtools/wrangler.py @@ -104,7 +104,7 @@ def wrapper(*args, **kwargs): "ctas_approach", sig.parameters["ctas_approach"].default ): argmap["database"] = temp_db_name - _create_temp_database(temp_db_name, boto3_session=boto3_session) + _ = _create_temp_database(temp_db_name, boto3_session=boto3_session) elif argmap.get("database", "").lower() == "__temp__": argmap["database"] = temp_db_name else: @@ -161,7 +161,7 @@ def check_sql(sql: str): # This is not necessary atm but incase future changes are made #  I think it is better to create "public" and "private" method # where the public function is wrapped by init_athena_params -# this wrapper also calls the private functnio to avoid the wrapper +# this wrapper also calls the private function to avoid the wrapper # calling itself @init_athena_params(allow_boto3_session=True) def create_temp_database( @@ -170,15 +170,13 @@ def create_temp_database( force_ec2: bool = False, region_name: str = "eu-west-1", ): - out = _create_temp_database( + _ = _create_temp_database( temp_db_name=temp_db_name, boto3_session=boto3_session, force_ec2=force_ec2, region_name=region_name, ) - return out - def _create_temp_database( temp_db_name: str = None, diff --git a/pyproject.toml b/pyproject.toml index 551e727..8508bed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool] [tool.poetry] name = "pydbtools" -version = "2.0.2" +version = "3.0.0" description = "A python package to query data via amazon athena and bring it into a pandas df" license = "MIT" authors = ["Karik Isichei "] @@ -10,8 +10,6 @@ readme = "README.md" [tool.poetry.dependencies] python = ">=3.7, <3.9" boto3 = ">=1.7.4" -numpy = ">=1.16.1" -pandas = "=^1.2" s3fs = ">=0.2.2,<0.5.0" sqlparse = "^0.3.1" awswrangler = "^2.3.0" diff --git a/tests/test_wrangler.py b/tests/test_wrangler.py index 220fb1d..a6a504d 100644 --- a/tests/test_wrangler.py +++ b/tests/test_wrangler.py 
@@ -1,12 +1,7 @@ -import os - -import boto3 -from moto import mock_s3 import pytest from pydbtools.wrangler import ( init_athena_params, - get_boto_session, ) @@ -56,7 +51,7 @@ def fun_with_sql_db_s3_ctas( boto3_session=None, database=None, s3_output=None, - ctas_approach=False, + ctas_approach=None, **kwargs, ): return locals() From dd22c1dba4874a48fc8e426b7c3a0195e895f099 Mon Sep 17 00:00:00 2001 From: Karik Isichei Date: Thu, 21 Jan 2021 16:45:50 +0000 Subject: [PATCH 15/15] Added suggestions --- README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4f72237..68038bd 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,28 @@ pip install "pydbtools @ git+https://github.com/moj-analytical-services/pydbtool pip install "pydbtools @ git+https://github.com/moj-analytical-services/pydbtools@v3.0.0" ``` +## Quickstart guide + +### Read an SQL Athena query into a pandas dataframe + +```python +import pydbtools as pydb +df = pydb.read_sql("SELECT * from a_database.table LIMIT 10") +``` + +### Run a query in Athena + +```python +response = pydb.start_query_execution_and_wait("CREATE DATABASE IF NOT EXISTS my_test_database") +``` + +### Create a temporary table to run further SQL queries against later + +```python +pydb.create_temp_table("SELECT a_col, count(*) as n FROM a_database.table GROUP BY a_col", table_name="temp_table_1") +df = pydb.read_sql_query("SELECT * FROM __temp__.temp_table_1 WHERE n < 10") +``` + ## Introduction This package is a wrapper for [awswrangler](https://aws-data-wrangler.readthedocs.io/en/2.3.0/what.html) that which presets/defines some of the input parameters to the athena module functions to align with our platform setup. See the [awswrangler API reference documentation for Athena](https://aws-data-wrangler.readthedocs.io/en/2.3.0/api.html#amazon-athena) to see what functions you can call from pydbtools. 
@@ -68,7 +90,7 @@ df = pydb.read_sql_query("SELECT * FROM __temp__.temp_table_1 WHERE n < 10") ### More advanced usage -Get the actual name for your temp database, create your temp db then delete it using awswrangler (not awsrangler will raise an error if the database does not exist) +Get the actual name for your temp database, create your temp db then delete it using awswrangler (note: `awswrangler` will raise an error if the database does not exist) ```python import awswrangler as wr