diff --git a/.gitignore b/.gitignore
index 342c0a1..64a30a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,11 @@ dist/
.coverage.*
coverage.log
coverage_html
+.cache/
+htmlcov/
+pytest-out.xml
+.pytest_cache/
+build/
# Singer JSON files
properties.json
@@ -32,3 +37,25 @@ tmp
# Docs
docs/_build/
docs/_templates/
+
+# Textio build
+venv*/
+*.tar.gz
+.venv
+.dev
+.assets
+.build
+.test
+.lint
+.code
+
+# Custom environment vars
+.env
+.env.*
+
+# Testing
+
+# Ancillary files
+.vscode/
+*.json.gz
+.ruff_cache/
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..912c44c
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,96 @@
+.PHONY: help clean clean-all clean-assets dev build lint unit-tests tests upgrade-dev-deps
+
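+# Minimal 'help' target (a sketch, assumed here because .PHONY lists 'help' and targets carry '## ' descriptions)
+help: ## Show this help
+	@grep -E '^[a-zA-Z0-9_-]+:.*## ' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*## "} {printf "%-24s %s\n", $$1, $$2}'
+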
+codedir := . # location of code
+testdir := ./*/tests
+scriptdir := .
+
+syspython := python3
+
+pip := venv/bin/pip
+aws := aws # awscli v2 should be installed via homebrew or pipx
+
+code-files := $(shell find $(codedir) -name '*.py' -not \( -path '*__pycache__*' \))
+test-files := $(shell find $(testdir) -name '*.py' -not \( -path '*__pycache__*' \))
+python-script-files := $(shell find $(scriptdir) -name '*.py' -not \( -path '*__pycache__*' \))
+gitish := $(shell git rev-parse --short HEAD)
+
+clean: ## Clean build artifacts but NOT downloaded assets
+ # Python build
+	find . -name '__pycache__' -exec rm -Rf {} +
+	find . -name '*.py[co]' -delete
+ rm -rf dist
+ rm -rf *.egg-info
+ rm -rf *.egg
+ rm -rf *.eggs
+ rm -rf *.whl
+ rm -rf *.tar.gz
+
+ rm -rf venv
+ rm -f .venv
+ rm -f .dev
+ rm -f .assets
+ rm -f .lint
+
+ # Test
+ rm -rf .cache/
+ rm -f .coverage
+ rm -rf htmlcov/
+ rm -f pytest-out.xml
+
+clean-all: clean clean-assets ## Clean everything
+
+venv:
+ $(syspython) -m venv venv
+
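+# Stamp files (.venv, .dev, .assets, .build, .lint) are touched when a step completes so make can skip it on later runs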
+.venv: venv
+ venv/bin/pip install --progress-bar off --upgrade pip wheel setuptools pip-tools
+ touch .venv
+
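+# Pattern rule: compile any requirements*.in into a pinned requirements*.txt with pip-compile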
+%.txt: %.in
+ venv/bin/pip-compile \
+ --no-emit-index-url \
+ --no-emit-options \
+ --resolver=backtracking \
+ $^ \
+ -o "$@"
+
+upgrade-dev-deps: .venv ## Update all requirements*.txt files from the corresponding requirements*.in files
+ rm -f requirements_dev.txt
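+	# -s runs make silently, -B forces the %.txt pattern rule to re-run pip-compile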
+ $(MAKE) -sB requirements_dev.txt
+
+.dev: .venv requirements_dev.txt
+ $(pip) install --progress-bar off --upgrade -r requirements_dev.txt
+ touch .dev
+
+.build: .dev
+	$(pip) install --upgrade pip
+	$(pip) install .
+	touch .build
+
+.assets: .dev
+ touch .assets
+
+clean-assets: ## Clean only assets so they will be re-downloaded
+ rm -f .assets
+
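+# With ci=true black only checks formatting; otherwise ruff lints with auto-fix. flake8 runs in both cases.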
+.lint: .dev $(code-files) $(test-files)
+ifeq ($(ci), true)
+ venv/bin/black --line-length=101 --safe -v --check $(code-files) $(test-files) $(python-script-files)
+else
+ venv/bin/ruff -v check --fix $(code-files) $(test-files) $(python-script-files)
+endif
+ venv/bin/flake8 --max-line-length=101 $(code-files) $(test-files) $(python-script-files)
+ touch .lint
+
+
+dev: .dev ## Setup the local dev environment
+
+build: .build ## Build package
+
+lint: .lint ## Run flake8 and black linting
+
+unit-tests: dev
+ coverage run -m pytest -vv --disable-pytest-warnings tests/unit && coverage report
+
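+# Integration tests talk to Redshift and S3; textioaws assumerole wraps the pytest run with temporary AWS credentials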
+tests: dev
+ textioaws assumerole --config predev/predev --duration 14400 coverage run -m pytest -vv --disable-pytest-warnings tests/integration && coverage report
diff --git a/README.md b/README.md
index ceba354..3ef3f67 100644
--- a/README.md
+++ b/README.md
@@ -40,11 +40,11 @@ or
### To run
-Like any other target that's following the singer specificiation:
+Like any other target that's following the singer specification:
`some-singer-tap | target-redshift --config [config.json]`
-It's reading incoming messages from STDIN and using the properites in `config.json` to upload data into Amazon Redshift.
+It reads incoming messages from STDIN and uses the properties in `config.json` to upload data into Amazon Redshift.
**Note**: To avoid version conflicts run `tap` and `targets` in separate virtual environments.
@@ -77,8 +77,8 @@ Full list of options in `config.json`:
| password | String | Yes | Redshift Password |
| dbname | String | Yes | Redshift Database name |
| aws_profile | String | No | AWS profile name for profile based authentication. If not provided, `AWS_PROFILE` environment variable will be used. |
-| aws_access_key_id | String | No | S3 Access Key Id. Used for S3 and Redshfit copy operations. If not provided, `AWS_ACCESS_KEY_ID` environment variable will be used. |
-| aws_secret_access_key | String | No | S3 Secret Access Key. Used for S3 and Redshfit copy operations. If not provided, `AWS_SECRET_ACCESS_KEY` environment variable will be used. |
+| aws_access_key_id | String | No | S3 Access Key Id. Used for S3 and Redshift copy operations. If not provided, `AWS_ACCESS_KEY_ID` environment variable will be used. |
+| aws_secret_access_key | String | No | S3 Secret Access Key. Used for S3 and Redshift copy operations. If not provided, `AWS_SECRET_ACCESS_KEY` environment variable will be used. |
| aws_session_token | String | No | S3 AWS STS token for temporary credentials. If not provided, `AWS_SESSION_TOKEN` environment variable will be used. |
| aws_redshift_copy_role_arn | String | No | AWS Role ARN to be used for the Redshift COPY operation. Used instead of the given AWS keys for the COPY operation if provided - the keys are still used for other S3 operations |
| s3_acl | String | No | S3 Object ACL |
@@ -93,7 +93,7 @@ Full list of options in `config.json`:
| default_target_schema_select_permissions | String | | Grant USAGE privilege on newly created schemas and grant SELECT privilege on newly created tables to a specific list of users or groups. Example: `{"users": ["user_1","user_2"], "groups": ["group_1", "group_2"]}` If `schema_mapping` is not defined then every stream sent by the tap is granted accordingly. |
| schema_mapping | Object | | Useful if you want to load multiple streams from one tap to multiple Redshift schemas.
If the tap sends the `stream_id` in `-` format then this option overwrites the `default_target_schema` value. Note, that using `schema_mapping` you can overwrite the `default_target_schema_select_permissions` value to grant SELECT permissions to different groups per schemas or optionally you can create indices automatically for the replicated tables.
**Note**: This is an experimental feature and recommended to use via PipelineWise YAML files that will generate the object mapping in the right JSON format. For further info check a [PipelineWise YAML Example]
| disable_table_cache | Boolean | | (Default: False) By default the connector caches the available table structures in Redshift at startup. In this way it doesn't need to run additional queries when ingesting data to check if altering the target tables is required. With `disable_table_cache` option you can turn off this caching. You will always see the most recent table structures but will cause an extra query runtime. |
-| add_metadata_columns | Boolean | | (Default: False) Metadata columns add extra row level information about data ingestions, (i.e. when was the row read in source, when was inserted or deleted in redshift etc.) Metadata columns are creating automatically by adding extra columns to the tables with a column prefix `_SDC_`. The metadata columns are documented at https://transferwise.github.io/pipelinewise/data_structure/sdc-columns.html. Enabling metadata columns will flag the deleted rows by setting the `_SDC_DELETED_AT` metadata column. Without the `add_metadata_columns` option the deleted rows from singer taps will not be recongisable in Redshift. |
+| add_metadata_columns | Boolean | | (Default: False) Metadata columns add extra row-level information about data ingestion (i.e. when the row was read in the source, when it was inserted or deleted in Redshift, etc.). Metadata columns are created automatically by adding extra columns to the tables with a column prefix `_SDC_`. The metadata columns are documented at [https://transferwise.github.io/pipelinewise/data_structure/sdc-columns.html](https://transferwise.github.io/pipelinewise/data_structure/sdc-columns.html). Enabling metadata columns will flag the deleted rows by setting the `_SDC_DELETED_AT` metadata column. Without the `add_metadata_columns` option the deleted rows from singer taps will not be recognisable in Redshift. |
| hard_delete | Boolean | | (Default: False) When `hard_delete` option is true then DELETE SQL commands will be performed in Redshift to delete rows in tables. It's achieved by continuously checking the `_SDC_DELETED_AT` metadata column sent by the singer tap. Due to deleting rows requires metadata columns, `hard_delete` option automatically enables the `add_metadata_columns` option as well. |
| data_flattening_max_level | Integer | | (Default: 0) Object type RECORD items from taps can be loaded into VARIANT columns as JSON (default) or we can flatten the schema by creating columns automatically.
When value is 0 (default) then flattening functionality is turned off. |
| primary_key_required | Boolean | | (Default: True) Log based and Incremental replications on tables with no Primary Key cause duplicates when merging UPDATE events. When set to true, stop loading data if no Primary Key is defined. |
@@ -103,11 +103,10 @@ Full list of options in `config.json`:
| slices | Integer | No | The number of slices to split files into prior to running COPY on Redshift. This should be set to the number of Redshift slices. The number of slices per node depends on the node size of the cluster - run `SELECT COUNT(DISTINCT slice) slices FROM stv_slices` to calculate this. Defaults to `1`. |
| temp_dir | String | | (Default: platform-dependent) Directory of temporary CSV files with RECORD messages. |
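+
+For reference, a minimal `config.json` might look like the following (all values are illustrative placeholders, not defaults):
+
+```json
+{
+  "host": "xxxxx.redshift.amazonaws.com",
+  "port": 5439,
+  "user": "my_user",
+  "password": "my_password",
+  "dbname": "my_database",
+  "s3_bucket": "my-temp-bucket",
+  "default_target_schema": "my_target_schema"
+}
+```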
-### To run tests:
+### To run tests
1. Install python dependencies in a virtual env:
-
```bash
python3 -m venv venv
. venv/bin/activate
@@ -117,7 +116,6 @@ Full list of options in `config.json`:
1. To run unit tests:
-
```bash
coverage run -m pytest -vv --disable-pytest-warnings tests/unit && coverage report
```
@@ -125,26 +123,31 @@ Full list of options in `config.json`:
1. To run integration tests define environment variables first:
```bash
+
+    # get the host, port, user, password by running the following in `analytics-data-warehouse`:
+ # > make -C infrastructure psql config=predev/predev stack=redshift
export TARGET_REDSHIFT_HOST=
export TARGET_REDSHIFT_PORT=
export TARGET_REDSHIFT_USER=
export TARGET_REDSHIFT_PASSWORD=
- export TARGET_REDSHIFT_DBNAME=
- export TARGET_REDSHIFT_SCHEMA=
- export TARGET_REDSHIFT_AWS_ACCESS_KEY=
- export TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY=
- export TARGET_REDSHIFT_S3_ACL=
- export TARGET_REDSHIFT_S3_BUCKET=
- export TARGET_REDSHIFT_S3_KEY_PREFIX=
-
- coverage run -m pytest -vv --disable-pytest-warnings tests/integration && coverage report
+ export TARGET_REDSHIFT_DBNAME='main'
+ export TARGET_REDSHIFT_SCHEMA='test'
+ export TARGET_REDSHIFT_S3_BUCKET='data-warehouse-meltano-temp-bucket-predev-predev'
+
+ # The following are not required if using password authentication
+ # export TARGET_REDSHIFT_AWS_ACCESS_KEY=
+ # export TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY=
+ # export TARGET_REDSHIFT_S3_ACL=
+ # export TARGET_REDSHIFT_S3_KEY_PREFIX=
+
+ textioaws assumerole --config predev/predev --duration 14400 coverage run -m pytest -vv --disable-pytest-warnings tests/integration && coverage report
+
```
-### To run pylint:
+### To run pylint
1. Install python dependencies and run python linter
-
```bash
python3 -m venv venv
. venv/bin/activate
diff --git a/requirements_dev.in b/requirements_dev.in
new file mode 100644
index 0000000..ead3f40
--- /dev/null
+++ b/requirements_dev.in
@@ -0,0 +1,8 @@
+black
+coverage
+mock
+pre-commit
+pylint
+pytest
+pytest-cov
+ruff
\ No newline at end of file
diff --git a/requirements_dev.txt b/requirements_dev.txt
new file mode 100644
index 0000000..e63aa0f
--- /dev/null
+++ b/requirements_dev.txt
@@ -0,0 +1,90 @@
+#
+# This file is autogenerated by pip-compile with Python 3.9
+# by the following command:
+#
+# pip-compile --no-emit-index-url --no-emit-options --output-file=requirements_dev.txt requirements_dev.in
+#
+astroid==2.15.6
+ # via pylint
+black==23.7.0
+ # via -r requirements_dev.in
+cfgv==3.4.0
+ # via pre-commit
+click==8.1.7
+ # via black
+coverage[toml]==7.3.1
+ # via
+ # -r requirements_dev.in
+ # pytest-cov
+dill==0.3.7
+ # via pylint
+distlib==0.3.7
+ # via virtualenv
+exceptiongroup==1.1.3
+ # via pytest
+filelock==3.12.3
+ # via virtualenv
+identify==2.5.27
+ # via pre-commit
+iniconfig==2.0.0
+ # via pytest
+isort==5.12.0
+ # via pylint
+lazy-object-proxy==1.9.0
+ # via astroid
+mccabe==0.7.0
+ # via pylint
+mock==5.1.0
+ # via -r requirements_dev.in
+mypy-extensions==1.0.0
+ # via black
+nodeenv==1.8.0
+ # via pre-commit
+packaging==23.1
+ # via
+ # black
+ # pytest
+pathspec==0.11.2
+ # via black
+platformdirs==3.10.0
+ # via
+ # black
+ # pylint
+ # virtualenv
+pluggy==1.3.0
+ # via pytest
+pre-commit==3.4.0
+ # via -r requirements_dev.in
+pylint==2.17.5
+ # via -r requirements_dev.in
+pytest==7.4.2
+ # via
+ # -r requirements_dev.in
+ # pytest-cov
+pytest-cov==4.1.0
+ # via -r requirements_dev.in
+pyyaml==6.0.1
+ # via pre-commit
+ruff==0.0.287
+ # via -r requirements_dev.in
+tomli==2.0.1
+ # via
+ # black
+ # coverage
+ # pylint
+ # pytest
+tomlkit==0.12.1
+ # via pylint
+typing-extensions==4.7.1
+ # via
+ # astroid
+ # black
+ # filelock
+ # pylint
+virtualenv==20.24.5
+ # via pre-commit
+wrapt==1.15.0
+ # via astroid
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools
diff --git a/setup.py b/setup.py
index 8213ecd..7e9a817 100644
--- a/setup.py
+++ b/setup.py
@@ -2,41 +2,35 @@
from setuptools import setup
-with open('README.md') as f:
+with open("README.md") as f:
long_description = f.read()
-setup(name="pipelinewise-target-redshift",
- version="1.6.0",
- description="Singer.io target for loading data to Amazon Redshift - PipelineWise compatible",
- long_description=long_description,
- long_description_content_type='text/markdown',
- author="TransferWise",
- url='https://github.com/transferwise/pipelinewise-target-redshift',
- classifiers=[
- 'License :: OSI Approved :: Apache Software License',
- 'Programming Language :: Python :: 3 :: Only'
- ],
- py_modules=["target_redshift"],
- install_requires=[
- 'pipelinewise-singer-python==1.*',
- 'boto3==1.12.39',
- 'psycopg2-binary==2.8.5',
- 'inflection==0.4.0',
- 'joblib==0.16.0'
- ],
- extras_require={
- "test": [
- "pylint==2.4.2",
- "pytest==5.3.0",
- "mock==3.0.5",
- "coverage==4.5.4"
- ]
- },
- entry_points="""
+setup(
+ name="pipelinewise-target-redshift",
+ version="1.7.0",
+ description="Singer.io target for loading data to Amazon Redshift - PipelineWise compatible",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ author="TransferWise",
+ url="https://github.com/transferwise/pipelinewise-target-redshift",
+ classifiers=[
+ "License :: OSI Approved :: Apache Software License",
+ "Programming Language :: Python :: 3 :: Only",
+ ],
+ py_modules=["target_redshift"],
+ install_requires=[
+ "pipelinewise-singer-python==1.3.0",
+ "boto3==1.34.51",
+ "psycopg2-binary==2.9.9",
+ "inflection==0.5.1",
+ "joblib==1.3.2",
+ ],
+ extras_require={"test": ["pylint==2.17.5", "pytest==7.4.2", "mock==5.1.0", "coverage==7.3.1"]},
+ entry_points="""
[console_scripts]
target-redshift=target_redshift:main
""",
- packages=["target_redshift"],
- package_data = {},
- include_package_data=True,
+ packages=["target_redshift"],
+ package_data={},
+ include_package_data=True,
)
diff --git a/target_redshift/__init__.py b/target_redshift/__init__.py
index bf3c6a8..9a21a36 100644
--- a/target_redshift/__init__.py
+++ b/target_redshift/__init__.py
@@ -1,25 +1,25 @@
#!/usr/bin/env python3
import argparse
+import bz2
+import copy
+import gzip
import io
import json
import os
import sys
-import copy
-import gzip
-import bz2
from datetime import datetime
from decimal import Decimal
+from itertools import islice
from tempfile import mkstemp
from joblib import Parallel, delayed, parallel_backend
from jsonschema import Draft7Validator, FormatChecker
from singer import get_logger
-from itertools import islice
from target_redshift.db_sync import DbSync
-LOGGER = get_logger('target_redshift')
+LOGGER = get_logger("target_redshift")
DEFAULT_BATCH_SIZE_ROWS = 100000
DEFAULT_PARALLELISM = 0 # 0 The number of threads used to flush tables
@@ -28,11 +28,13 @@
class RecordValidationException(Exception):
"""Exception to raise when record validation failed"""
+
pass
class InvalidValidationOperationException(Exception):
"""Exception to raise when internal JSON schema validation process failed"""
+
pass
@@ -55,11 +57,17 @@ def add_metadata_columns_to_schema(schema_message):
Metadata columns gives information about data injections
"""
extended_schema_message = schema_message
- extended_schema_message['schema']['properties']['_sdc_extracted_at'] = {'type': ['null', 'string'],
- 'format': 'date-time'}
- extended_schema_message['schema']['properties']['_sdc_batched_at'] = {'type': ['null', 'string'],
- 'format': 'date-time'}
- extended_schema_message['schema']['properties']['_sdc_deleted_at'] = {'type': ['null', 'string']}
+ extended_schema_message["schema"]["properties"]["_sdc_extracted_at"] = {
+ "type": ["null", "string"],
+ "format": "date-time",
+ }
+ extended_schema_message["schema"]["properties"]["_sdc_batched_at"] = {
+ "type": ["null", "string"],
+ "format": "date-time",
+ }
+ extended_schema_message["schema"]["properties"]["_sdc_deleted_at"] = {
+ "type": ["null", "string"]
+ }
return extended_schema_message
@@ -68,10 +76,12 @@ def add_metadata_values_to_record(record_message, stream_to_sync):
"""Populate metadata _sdc columns from incoming record message
The location of the required attributes are fixed in the stream
"""
- extended_record = record_message['record']
- extended_record['_sdc_extracted_at'] = record_message.get('time_extracted')
- extended_record['_sdc_batched_at'] = datetime.now().isoformat()
- extended_record['_sdc_deleted_at'] = record_message.get('record', {}).get('_sdc_deleted_at')
+ extended_record = record_message["record"]
+ extended_record["_sdc_extracted_at"] = record_message.get("time_extracted")
+ extended_record["_sdc_batched_at"] = datetime.now().isoformat()
+ extended_record["_sdc_deleted_at"] = record_message.get("record", {}).get(
+ "_sdc_deleted_at"
+ )
return extended_record
@@ -79,34 +89,36 @@ def add_metadata_values_to_record(record_message, stream_to_sync):
def emit_state(state):
if state is not None:
line = json.dumps(state)
- LOGGER.info('Emitting state {}'.format(line))
- sys.stdout.write("{}\n".format(line))
+ LOGGER.info(f"Emitting state {line}")
+ sys.stdout.write(f"{line}\n")
sys.stdout.flush()
def get_schema_names_from_config(config):
- default_target_schema = config.get('default_target_schema')
- schema_mapping = config.get('schema_mapping', {})
+ default_target_schema = config.get("default_target_schema")
+ schema_mapping = config.get("schema_mapping", {})
schema_names = []
if default_target_schema:
schema_names.append(default_target_schema)
if schema_mapping:
- for source_schema, target in schema_mapping.items():
- schema_names.append(target.get('target_schema'))
-
+ schema_names.extend(
+ target.get("target_schema")
+ for source_schema, target in schema_mapping.items()
+ )
return schema_names
def load_table_cache(config):
table_cache = []
- if not ('disable_table_cache' in config and config['disable_table_cache']):
+ if not ("disable_table_cache" in config and config["disable_table_cache"]):
LOGGER.info("Getting catalog objects from table cache...")
db = DbSync(config)
table_cache = db.get_table_columns(
- filter_schemas=get_schema_names_from_config(config))
+ filter_schemas=get_schema_names_from_config(config)
+ )
return table_cache
@@ -122,46 +134,51 @@ def persist_lines(config, lines, table_cache=None) -> None:
row_count = {}
stream_to_sync = {}
total_row_count = {}
- batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)
+ batch_size_rows = config.get("batch_size_rows", DEFAULT_BATCH_SIZE_ROWS)
# Loop over lines from stdin
for line in lines:
try:
o = json.loads(line)
except json.decoder.JSONDecodeError:
- LOGGER.error("Unable to parse:\n{}".format(line))
+ LOGGER.error(f"Unable to parse:\n{line}")
raise
- if 'type' not in o:
- raise Exception("Line is missing required key 'type': {}".format(line))
+ if "type" not in o:
+ raise Exception(f"Line is missing required key 'type': {line}")
- t = o['type']
+ t = o["type"]
- if t == 'RECORD':
- if 'stream' not in o:
- raise Exception("Line is missing required key 'stream': {}".format(line))
- if o['stream'] not in schemas:
+ if t == "RECORD":
+ if "stream" not in o:
+ raise Exception(f"Line is missing required key 'stream': {line}")
+ if o["stream"] not in schemas:
raise Exception(
- "A record for stream {} was encountered before a corresponding schema".format(o['stream']))
+ f'A record for stream {o["stream"]} was encountered before a corresponding schema'
+ )
# Get schema for this record's stream
- stream = o['stream']
+ stream = o["stream"]
# Validate record
- if config.get('validate_records'):
+ if config.get("validate_records"):
try:
- validators[stream].validate(float_to_decimal(o['record']))
+ validators[stream].validate(float_to_decimal(o["record"]))
except Exception as ex:
if type(ex).__name__ == "InvalidOperation":
raise InvalidValidationOperationException(
f"Data validation failed and cannot load to destination. RECORD: {o['record']}\n"
"multipleOf validations that allows long precisions are not supported (i.e. with 15 digits"
- "or more) Try removing 'multipleOf' methods from JSON schema.")
- raise RecordValidationException(f"Record does not pass schema validation. RECORD: {o['record']}")
+ "or more) Try removing 'multipleOf' methods from JSON schema."
+ ) from ex
+ raise RecordValidationException(
+ f"Record does not pass schema validation. RECORD: {o['record']}"
+ ) from ex
- primary_key_string = stream_to_sync[stream].record_primary_key_string(o['record'])
- if not primary_key_string:
- primary_key_string = 'RID-{}'.format(total_row_count[stream])
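+            # Fall back to a synthetic row id ("RID-<n>") when the stream has no primary key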
+ primary_key_string = (
+ stream_to_sync[stream].record_primary_key_string(o["record"])
+ or f"RID-{total_row_count[stream]}"
+ )
if stream not in records_to_load:
records_to_load[stream] = {}
@@ -172,18 +189,16 @@ def persist_lines(config, lines, table_cache=None) -> None:
total_row_count[stream] += 1
# append record
- if config.get('add_metadata_columns') or config.get('hard_delete'):
- records_to_load[stream][primary_key_string] = add_metadata_values_to_record(o, stream_to_sync[stream])
+ if config.get("add_metadata_columns") or config.get("hard_delete"):
+ records_to_load[stream][
+ primary_key_string
+ ] = add_metadata_values_to_record(o, stream_to_sync[stream])
else:
- records_to_load[stream][primary_key_string] = o['record']
+ records_to_load[stream][primary_key_string] = o["record"]
if row_count[stream] >= batch_size_rows:
# flush all streams, delete records if needed, reset counts and then emit current state
- if config.get('flush_all_streams'):
- filter_streams = None
- else:
- filter_streams = [stream]
-
+ filter_streams = None if config.get("flush_all_streams") else [stream]
# Flush and return a new state dict with new positions only for the flushed streams
flushed_state = flush_streams(
records_to_load,
@@ -192,31 +207,41 @@ def persist_lines(config, lines, table_cache=None) -> None:
config,
state,
flushed_state,
- filter_streams=filter_streams)
+ filter_streams=filter_streams,
+ )
# emit last encountered state
emit_state(copy.deepcopy(flushed_state))
- elif t == 'SCHEMA':
- if 'stream' not in o:
- raise Exception("Line is missing required key 'stream': {}".format(line))
+ elif t == "SCHEMA":
+ if "stream" not in o:
+ raise Exception(f"Line is missing required key 'stream': {line}")
- stream = o['stream']
+ stream = o["stream"]
- schemas[stream] = float_to_decimal(o['schema'])
- validators[stream] = Draft7Validator(schemas[stream], format_checker=FormatChecker())
+ schemas[stream] = float_to_decimal(o["schema"])
+ validators[stream] = Draft7Validator(
+ schemas[stream], format_checker=FormatChecker()
+ )
# flush records from previous stream SCHEMA
# if same stream has been encountered again, it means the schema might have been altered
# so previous records need to be flushed
if row_count.get(stream, 0) > 0:
- flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)
+ flushed_state = flush_streams(
+ records_to_load,
+ row_count,
+ stream_to_sync,
+ config,
+ state,
+ flushed_state,
+ )
# emit latest encountered state
emit_state(flushed_state)
# key_properties key must be available in the SCHEMA message.
- if 'key_properties' not in o:
+ if "key_properties" not in o:
raise Exception("key_properties field is required")
# Log based and Incremental replications on tables with no Primary Key
@@ -227,14 +252,21 @@ def persist_lines(config, lines, table_cache=None) -> None:
# 1) Set ` 'primary_key_required': false ` in the target-redshift config.json
# or
# 2) Use fastsync [postgres-to-redshift, mysql-to-redshift, etc.]
- if config.get('primary_key_required', True) and len(o['key_properties']) == 0:
- LOGGER.critical("Primary key is set to mandatory but not defined in the [{}] stream".format(stream))
+ if (
+ config.get("primary_key_required", True)
+ and len(o["key_properties"]) == 0
+ ):
+ LOGGER.critical(
+ f"Primary key is set to mandatory but not defined in the [{stream}] stream"
+ )
raise Exception("key_properties field is required")
- key_properties[stream] = o['key_properties']
+ key_properties[stream] = o["key_properties"]
- if config.get('add_metadata_columns') or config.get('hard_delete'):
- stream_to_sync[stream] = DbSync(config, add_metadata_columns_to_schema(o), table_cache)
+ if config.get("add_metadata_columns") or config.get("hard_delete"):
+ stream_to_sync[stream] = DbSync(
+ config, add_metadata_columns_to_schema(o), table_cache
+ )
else:
stream_to_sync[stream] = DbSync(config, o, table_cache)
@@ -244,39 +276,41 @@ def persist_lines(config, lines, table_cache=None) -> None:
row_count[stream] = 0
total_row_count[stream] = 0
- elif t == 'ACTIVATE_VERSION':
- LOGGER.debug('ACTIVATE_VERSION message')
+ elif t == "ACTIVATE_VERSION":
+ LOGGER.debug("ACTIVATE_VERSION message")
- elif t == 'STATE':
- LOGGER.debug('Setting state to {}'.format(o['value']))
- state = o['value']
+ elif t == "STATE":
+ LOGGER.debug(f'Setting state to {o["value"]}')
+ state = o["value"]
# Initially set flushed state
if not flushed_state:
flushed_state = copy.deepcopy(state)
else:
- raise Exception("Unknown message type {} in message {}"
- .format(o['type'], o))
-
+ raise Exception(f"Unknown message type {t} in message {o}")
# if some bucket has records that need to be flushed but haven't reached batch size
# then flush all buckets.
if sum(row_count.values()) > 0:
# flush all streams one last time, delete records if needed, reset counts and then emit current state
- flushed_state = flush_streams(records_to_load, row_count, stream_to_sync, config, state, flushed_state)
+ flushed_state = flush_streams(
+ records_to_load, row_count, stream_to_sync, config, state, flushed_state
+ )
# emit latest state
emit_state(copy.deepcopy(flushed_state))
+
# pylint: disable=too-many-arguments
def flush_streams(
- streams,
- row_count,
- stream_to_sync,
- config,
- state,
- flushed_state,
- filter_streams=None):
+ streams,
+ row_count,
+ stream_to_sync,
+ config,
+ state,
+ flushed_state,
+ filter_streams=None,
+):
"""
Flushes all buckets and resets records count to 0 as well as empties records to load list
:param streams: dictionary with records to load per stream
@@ -298,29 +332,24 @@ def flush_streams(
# be loaded but it's not greater than the value of max_parallelism
if parallelism == 0:
n_streams_to_flush = len(streams.keys())
- if n_streams_to_flush > max_parallelism:
- parallelism = max_parallelism
- else:
- parallelism = n_streams_to_flush
-
+ parallelism = min(n_streams_to_flush, max_parallelism)
# Select the required streams to flush
- if filter_streams:
- streams_to_flush = filter_streams
- else:
- streams_to_flush = streams.keys()
-
+ streams_to_flush = filter_streams or streams.keys()
# Single-host, thread-based parallelism
- with parallel_backend('threading', n_jobs=parallelism):
- Parallel()(delayed(load_stream_batch)(
- stream=stream,
- records_to_load=streams[stream],
- row_count=row_count,
- db_sync=stream_to_sync[stream],
- delete_rows=config.get('hard_delete'),
- compression=config.get('compression'),
- slices=config.get('slices'),
- temp_dir=config.get('temp_dir')
- ) for stream in streams_to_flush)
+ with parallel_backend("threading", n_jobs=parallelism):
+ Parallel()(
+ delayed(load_stream_batch)(
+ stream=stream,
+ records_to_load=streams[stream],
+ row_count=row_count,
+ db_sync=stream_to_sync[stream],
+ delete_rows=config.get("hard_delete"),
+ compression=config.get("compression"),
+ slices=config.get("slices"),
+ temp_dir=config.get("temp_dir"),
+ )
+ for stream in streams_to_flush
+ )
# reset flushed stream records to empty to avoid flushing same records
for stream in streams_to_flush:
@@ -329,12 +358,14 @@ def flush_streams(
# Update flushed streams
if filter_streams:
# update flushed_state position if we have state information for the stream
- if stream in state.get('bookmarks', {}):
+ if stream in state.get("bookmarks", {}):
# Create bookmark key if not exists
- if 'bookmarks' not in flushed_state:
- flushed_state['bookmarks'] = {}
+ if "bookmarks" not in flushed_state:
+ flushed_state["bookmarks"] = {}
# Copy the stream bookmark from the latest state
- flushed_state['bookmarks'][stream] = copy.deepcopy(state['bookmarks'][stream])
+ flushed_state["bookmarks"][stream] = copy.deepcopy(
+ state["bookmarks"][stream]
+ )
# If we flush every bucket use the latest state
else:
@@ -344,11 +375,28 @@ def flush_streams(
return flushed_state
-def load_stream_batch(stream, records_to_load, row_count, db_sync, delete_rows=False, compression=None, slices=None, temp_dir=None):
- # Load into redshift
+def load_stream_batch(
+ stream,
+ records_to_load,
+ row_count,
+ db_sync,
+ delete_rows=False,
+ compression=None,
+ slices=None,
+ temp_dir=None,
+):
+ # Load into redshift
try:
if row_count[stream] > 0:
- flush_records(stream, records_to_load, row_count[stream], db_sync, compression, slices, temp_dir)
+ flush_records(
+ stream,
+ records_to_load,
+ row_count[stream],
+ db_sync,
+ compression,
+ slices,
+ temp_dir,
+ )
# Delete soft-deleted, flagged rows - where _sdc_deleted at is not null
if delete_rows:
@@ -356,9 +404,10 @@ def load_stream_batch(stream, records_to_load, row_count, db_sync, delete_rows=F
# reset row count for the current stream
row_count[stream] = 0
- except Exception as e:
+ except Exception:
LOGGER.exception("Failed to load stream %s to Redshift", stream)
- raise e
+ raise
+
def chunk_iterable(iterable, size):
"""Yield successive n-sized chunks from iterable. The last chunk is not padded"""
@@ -371,7 +420,15 @@ def ceiling_division(n, d):
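+    # Integer ceiling of n / d via floor division: -(n // -d) == ceil(n / d)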
return -(n // -d)
-def flush_records(stream, records_to_load, row_count, db_sync, compression=None, slices=None, temp_dir=None):
+def flush_records(
+ stream,
+ records_to_load,
+ row_count,
+ db_sync,
+ compression=None,
+ slices=None,
+ temp_dir=None,
+):
slices = slices or 1
use_gzip = compression == "gzip"
use_bzip2 = compression == "bzip2"
@@ -384,10 +441,10 @@ def flush_records(stream, records_to_load, row_count, db_sync, compression=None,
file_extension = ".csv"
if use_gzip:
open_method = gzip.open
- file_extension = file_extension + ".gz"
+ file_extension += ".gz"
elif use_bzip2:
open_method = bz2.open
- file_extension = file_extension + ".bz2"
+ file_extension += ".bz2"
if not isinstance(slices, int):
raise Exception("The provided configuration value 'slices' was not an integer")
@@ -404,7 +461,11 @@ def flush_records(stream, records_to_load, row_count, db_sync, compression=None,
list(records_to_load.values()), ceiling_division(len(records_to_load), slices)
)
for chunk_number, chunk in enumerate(chunks, start=1):
- _, csv_file = mkstemp(suffix=file_extension + "." + str(chunk_number), prefix=f'{stream}_', dir=temp_dir)
+ _, csv_file = mkstemp(
+            suffix=f"{file_extension}.{chunk_number}",
+ prefix=f"{stream}_",
+ dir=temp_dir,
+ )
csv_files = csv_files + [csv_file]
with open_method(csv_file, "w+b") as csv_f:
for record in chunk:
@@ -414,7 +475,7 @@ def flush_records(stream, records_to_load, row_count, db_sync, compression=None,
csv_file,
stream,
len(chunk),
- suffix="_" + date_suffix + file_extension + "." + str(chunk_number),
+                suffix=f"_{date_suffix}{file_extension}.{chunk_number}",
)
size_bytes += os.path.getsize(csv_file)
s3_keys = s3_keys + [s3_key]
@@ -431,7 +492,7 @@ def flush_records(stream, records_to_load, row_count, db_sync, compression=None,
def main():
arg_parser = argparse.ArgumentParser()
- arg_parser.add_argument('-c', '--config', help='Config file')
+ arg_parser.add_argument("-c", "--config", help="Config file")
args = arg_parser.parse_args()
if args.config:
@@ -443,11 +504,11 @@ def main():
# Init columns cache
table_cache = load_table_cache(config)
- singer_messages = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
+ singer_messages = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8")
persist_lines(config, singer_messages, table_cache)
LOGGER.debug("Exiting normally")
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/target_redshift/db_sync.py b/target_redshift/db_sync.py
index 4a79049..9ad8009 100644
--- a/target_redshift/db_sync.py
+++ b/target_redshift/db_sync.py
@@ -1,4 +1,4 @@
-import collections
+import collections.abc
import itertools
import json
import os
@@ -7,96 +7,93 @@
import time
import boto3
+import inflection
import psycopg2
import psycopg2.extras
-
-import inflection
from singer import get_logger
-
DEFAULT_VARCHAR_LENGTH = 10000
SHORT_VARCHAR_LENGTH = 256
LONG_VARCHAR_LENGTH = 65535
def validate_config(config):
- errors = []
required_config_keys = [
- 'host',
- 'port',
- 'user',
- 'password',
- 'dbname',
- 's3_bucket'
+ "host",
+ "port",
+ "user",
+ "password",
+ "dbname",
+ "s3_bucket",
]
- # Check if mandatory keys exist
- for k in required_config_keys:
- if not config.get(k, None):
- errors.append("Required key is missing from config: [{}]".format(k))
-
+ errors = [
+ f"Required key is missing from config: [{k}]"
+ for k in required_config_keys
+ if not config.get(k, None)
+ ]
# Check target schema config
- config_default_target_schema = config.get('default_target_schema', None)
- config_schema_mapping = config.get('schema_mapping', None)
+ config_default_target_schema = config.get("default_target_schema", None)
+ config_schema_mapping = config.get("schema_mapping", None)
if not config_default_target_schema and not config_schema_mapping:
- errors.append("Neither 'default_target_schema' (string) nor 'schema_mapping' (object) keys set in config.")
+        errors.append(
+            "Neither 'default_target_schema' (string) nor 'schema_mapping' "
+            "(object) keys set in config."
+        )
return errors
def column_type(schema_property, with_length=True):
- property_type = schema_property['type']
- property_format = schema_property['format'] if 'format' in schema_property else None
- column_type = 'character varying'
+ property_type = schema_property["type"]
+ property_format = schema_property["format"] if "format" in schema_property else None
+ column_type = "character varying"
varchar_length = DEFAULT_VARCHAR_LENGTH
- if schema_property.get('maxLength', 0) > varchar_length:
+ if schema_property.get("maxLength", 0) > varchar_length:
varchar_length = LONG_VARCHAR_LENGTH
- if 'object' in property_type or 'array' in property_type:
- column_type = 'character varying'
+ if "object" in property_type or "array" in property_type:
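+        # JSON objects and arrays are now stored in Redshift's SUPER type (previously character varying)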
+ column_type = "super"
varchar_length = LONG_VARCHAR_LENGTH
# Every date-time JSON value is currently mapped to TIMESTAMP WITHOUT TIME ZONE
#
# TODO: Detect if timezone postfix exists in the JSON and find if TIMESTAMP WITHOUT TIME ZONE or
# TIMESTAMP WITH TIME ZONE is the better column type
- elif property_format == 'date-time':
- column_type = 'timestamp without time zone'
- elif property_format == 'time':
- column_type = 'character varying'
+ elif property_format == "date-time":
+ column_type = "timestamp without time zone"
+ elif property_format == "time":
+ column_type = "character varying"
varchar_length = SHORT_VARCHAR_LENGTH
- elif 'number' in property_type:
- column_type = 'double precision'
- elif 'integer' in property_type and 'string' in property_type:
- column_type = 'character varying'
+ elif "number" in property_type:
+ column_type = "double precision"
+ elif "integer" in property_type and "string" in property_type:
+ column_type = "character varying"
varchar_length = LONG_VARCHAR_LENGTH
- elif 'integer' in property_type:
- column_type = 'numeric'
- elif 'boolean' in property_type:
- column_type = 'boolean'
+ elif "integer" in property_type:
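+        # Integer properties now map to BIGINT (previously NUMERIC)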
+ column_type = "bigint"
+ elif "boolean" in property_type:
+ column_type = "boolean"
# Add max length to column type if required
- if with_length:
- if column_type == 'character varying' and varchar_length > 0:
- column_type = '{}({})'.format(column_type, varchar_length)
+ if with_length and (column_type == "character varying" and varchar_length > 0):
+ column_type = f"{column_type}({varchar_length})"
return column_type
def column_trans(schema_property):
- property_type = schema_property['type']
- column_trans = ''
- if 'object' in property_type or 'array' in property_type:
- column_trans = 'parse_json'
-
- return column_trans
+ property_type = schema_property["type"]
+ return "parse_json" if "object" in property_type or "array" in property_type else ""
def safe_column_name(name):
- return '"{}"'.format(name).upper()
+ return f'"{name}"'.upper()
def column_clause(name, schema_property):
- return '{} {}'.format(safe_column_name(name), column_type(schema_property))
+ return f"{safe_column_name(name)} {column_type(schema_property)}"
def flatten_key(k, parent_key, sep):
@@ -104,44 +101,57 @@ def flatten_key(k, parent_key, sep):
inflected_key = full_key.copy()
reducer_index = 0
while len(sep.join(inflected_key)) >= 127 and reducer_index < len(inflected_key):
- reduced_key = re.sub(r'[a-z]', '', inflection.camelize(inflected_key[reducer_index]))
- inflected_key[reducer_index] = \
- (reduced_key if len(reduced_key) > 1 else inflected_key[reducer_index][0:3]).lower()
+ reduced_key = re.sub(
+ r"[a-z]", "", inflection.camelize(inflected_key[reducer_index])
+ )
+ inflected_key[reducer_index] = (
+ reduced_key if len(reduced_key) > 1 else inflected_key[reducer_index][:3]
+ ).lower()
reducer_index += 1
return sep.join(inflected_key)
-def flatten_schema(d, parent_key=[], sep='__', level=0, max_level=0):
+def flatten_schema(d, parent_key=None, sep="__", level=0, max_level=0):
+    if parent_key is None:
+        parent_key = []
items = []
- if 'properties' not in d:
+ if "properties" not in d:
return {}
- for k, v in d['properties'].items():
+ for k, v in d["properties"].items():
new_key = flatten_key(k, parent_key, sep)
- if 'type' in v.keys():
- if 'object' in v['type'] and 'properties' in v and level < max_level:
- items.extend(flatten_schema(v, parent_key + [k], sep=sep, level=level+1, max_level=max_level).items())
+ if "type" in v:
+ if "object" in v["type"] and "properties" in v and level < max_level:
+ items.extend(
+ flatten_schema(
+ v,
+ parent_key + [k],
+ sep=sep,
+ level=level + 1,
+ max_level=max_level,
+ ).items()
+ )
else:
items.append((new_key, v))
else:
if len(v.values()) > 0:
- if list(v.values())[0][0]['type'] == 'string':
- list(v.values())[0][0]['type'] = ['null', 'string']
+ if list(v.values())[0][0]["type"] == "string":
+ list(v.values())[0][0]["type"] = ["null", "string"]
items.append((new_key, list(v.values())[0][0]))
- elif list(v.values())[0][0]['type'] == 'array':
- list(v.values())[0][0]['type'] = ['null', 'array']
+ elif list(v.values())[0][0]["type"] == "array":
+ list(v.values())[0][0]["type"] = ["null", "array"]
items.append((new_key, list(v.values())[0][0]))
- elif list(v.values())[0][0]['type'] == 'object':
- list(v.values())[0][0]['type'] = ['null', 'object']
+ elif list(v.values())[0][0]["type"] == "object":
+ list(v.values())[0][0]["type"] = ["null", "object"]
items.append((new_key, list(v.values())[0][0]))
- key_func = lambda item: item[0]
+ def key_func(item):
+ return item[0]
+
sorted_items = sorted(items, key=key_func)
for k, g in itertools.groupby(sorted_items, key=key_func):
if len(list(g)) > 1:
- raise ValueError('Duplicate column name produced in schema: {}'.format(k))
+ raise ValueError(f"Duplicate column name produced in schema: {k}")
return dict(sorted_items)
@@ -150,34 +160,57 @@ def _should_json_dump_value(key, value, flatten_schema=None):
if isinstance(value, (dict, list)):
return True
- if flatten_schema and key in flatten_schema and 'type' in flatten_schema[key] and set(flatten_schema[key]['type']) == {'null', 'object', 'array'}:
- return True
-
- return False
+ return bool(
+ flatten_schema
+ and key in flatten_schema
+ and "type" in flatten_schema[key]
+ and set(flatten_schema[key]["type"]) == {"null", "object", "array"}
+ )
-#pylint: disable-msg=too-many-arguments
-def flatten_record(d, flatten_schema=None, parent_key=[], sep='__', level=0, max_level=0):
+# pylint: disable-msg=too-many-arguments
+def flatten_record(
+ d, flatten_schema=None, parent_key=None, sep="__", level=0, max_level=0
+):
+ if parent_key is None:
+ parent_key = []
items = []
for k, v in d.items():
new_key = flatten_key(k, parent_key, sep)
- if isinstance(v, collections.MutableMapping) and level < max_level:
- items.extend(flatten_record(v, flatten_schema, parent_key + [k], sep=sep, level=level + 1, max_level=max_level).items())
+ if isinstance(v, collections.abc.MutableMapping) and level < max_level:
+ items.extend(
+ flatten_record(
+ v,
+ flatten_schema,
+ parent_key + [k],
+ sep=sep,
+ level=level + 1,
+ max_level=max_level,
+ ).items()
+ )
else:
- items.append((new_key, json.dumps(v) if _should_json_dump_value(k, v, flatten_schema) else v))
+ items.append(
+ (
+ new_key,
+ json.dumps(v)
+ if _should_json_dump_value(k, v, flatten_schema)
+ else v,
+ )
+ )
return dict(items)
def primary_column_names(stream_schema_message):
- return [safe_column_name(p) for p in stream_schema_message['key_properties']]
+ return [safe_column_name(p) for p in stream_schema_message["key_properties"]]
-def stream_name_to_dict(stream_name, separator='-'):
+def stream_name_to_dict(stream_name, separator="-"):
catalog_name = None
schema_name = None
table_name = stream_name
- # Schema and table name can be derived from stream if it's in - format
+ # Schema and table name can be derived from stream if it's in
+ # - format
s = stream_name.split(separator)
if len(s) == 2:
schema_name = s[0]
@@ -185,12 +218,12 @@ def stream_name_to_dict(stream_name, separator='-'):
if len(s) > 2:
catalog_name = s[0]
schema_name = s[1]
- table_name = '_'.join(s[2:])
+ table_name = "_".join(s[2:])
return {
- 'catalog_name': catalog_name,
- 'schema_name': schema_name,
- 'table_name': table_name
+ "catalog_name": catalog_name,
+ "schema_name": schema_name,
+ "table_name": table_name,
}
@@ -198,42 +231,52 @@ def stream_name_to_dict(stream_name, separator='-'):
class DbSync:
def __init__(self, connection_config, stream_schema_message=None, table_cache=None):
"""
- connection_config: Redshift connection details
-
- stream_schema_message: An instance of the DbSync class is typically used to load
- data only from a certain singer tap stream.
-
- The stream_schema_message holds the destination schema
- name and the JSON schema that will be used to
- validate every RECORDS messages that comes from the stream.
- Schema validation happening before creating CSV and before
- uploading data into Redshift.
-
- If stream_schema_message is not defined then we can use
- the DbSync instance as a generic purpose connection to
- Redshift and can run individual queries. For example
- collecting catalog informations from Redshift for caching
- purposes.
+ connection_config: Redshift connection details
+
+ stream_schema_message: An instance of the DbSync class is typically used to load
+ data only from a certain singer tap stream.
+
+ The stream_schema_message holds the destination schema
+ name and the JSON schema that will be used to
+ validate every RECORDS messages that comes from the stream.
+ Schema validation happening before creating CSV and before
+ uploading data into Redshift.
+
+ If stream_schema_message is not defined then we can use
+ the DbSync instance as a generic purpose connection to
+ Redshift and can run individual queries. For example
+ collecting catalog information from Redshift for caching
+ purposes.
"""
self.connection_config = connection_config
self.stream_schema_message = stream_schema_message
self.table_cache = table_cache
# logger to be used across the class's methods
- self.logger = get_logger('target_redshift')
+ self.logger = get_logger("target_redshift")
# Validate connection configuration
config_errors = validate_config(connection_config)
# Exit if config has errors
if len(config_errors) != 0:
- self.logger.error("Invalid configuration:\n * {}".format('\n * '.join(config_errors)))
+ self.logger.error(
+ "Invalid configuration:\n * {}".format("\n * ".join(config_errors))
+ )
sys.exit(1)
- aws_profile = self.connection_config.get('aws_profile') or os.environ.get('AWS_PROFILE')
- aws_access_key_id = self.connection_config.get('aws_access_key_id') or os.environ.get('AWS_ACCESS_KEY_ID')
- aws_secret_access_key = self.connection_config.get('aws_secret_access_key') or os.environ.get('AWS_SECRET_ACCESS_KEY')
- aws_session_token = self.connection_config.get('aws_session_token') or os.environ.get('AWS_SESSION_TOKEN')
+ aws_profile = self.connection_config.get("aws_profile") or os.environ.get(
+ "AWS_PROFILE"
+ )
+ aws_access_key_id = self.connection_config.get(
+ "aws_access_key_id"
+ ) or os.environ.get("AWS_ACCESS_KEY_ID")
+ aws_secret_access_key = self.connection_config.get(
+ "aws_secret_access_key"
+ ) or os.environ.get("AWS_SECRET_ACCESS_KEY")
+ aws_session_token = self.connection_config.get(
+ "aws_session_token"
+ ) or os.environ.get("AWS_SESSION_TOKEN")
# Init S3 client
# Conditionally pass keys as this seems to affect whether instance credentials are correctly loaded if the keys are None
@@ -241,19 +284,19 @@ def __init__(self, connection_config, stream_schema_message=None, table_cache=No
aws_session = boto3.session.Session(
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
- aws_session_token=aws_session_token
+ aws_session_token=aws_session_token,
)
credentials = aws_session.get_credentials().get_frozen_credentials()
# Explicitly set credentials to those fetched from Boto so we can re-use them in COPY SQL if necessary
- self.connection_config['aws_access_key_id'] = credentials.access_key
- self.connection_config['aws_secret_access_key'] = credentials.secret_key
- self.connection_config['aws_session_token'] = credentials.token
+ self.connection_config["aws_access_key_id"] = credentials.access_key
+ self.connection_config["aws_secret_access_key"] = credentials.secret_key
+ self.connection_config["aws_session_token"] = credentials.token
else:
aws_session = boto3.session.Session(profile_name=aws_profile)
- self.s3 = aws_session.client('s3')
- self.skip_updates = self.connection_config.get('skip_updates', False)
+ self.s3 = aws_session.client("s3")
+ self.skip_updates = self.connection_config.get("skip_updates", False)
self.schema_name = None
self.grantees = None
@@ -278,18 +321,24 @@ def __init__(self, connection_config, stream_schema_message=None, table_cache=No
# }
# }
# }
- config_default_target_schema = self.connection_config.get('default_target_schema', '').strip()
- config_schema_mapping = self.connection_config.get('schema_mapping', {})
+ config_default_target_schema = self.connection_config.get(
+ "default_target_schema", ""
+ ).strip()
+ config_schema_mapping = self.connection_config.get("schema_mapping", {})
- stream_name = stream_schema_message['stream']
- stream_schema_name = stream_name_to_dict(stream_name)['schema_name']
+ stream_name = stream_schema_message["stream"]
+ stream_schema_name = stream_name_to_dict(stream_name)["schema_name"]
if config_schema_mapping and stream_schema_name in config_schema_mapping:
- self.schema_name = config_schema_mapping[stream_schema_name].get('target_schema')
+ self.schema_name = config_schema_mapping[stream_schema_name].get(
+ "target_schema"
+ )
elif config_default_target_schema:
self.schema_name = config_default_target_schema
if not self.schema_name:
- raise Exception("Target schema name not defined in config. Neither 'default_target_schema' (string) nor 'schema_mapping' (object) defines target schema for {} stream.".format(stream_name))
+ raise Exception(
+ f"Target schema name not defined in config. Neither 'default_target_schema' (string) nor 'schema_mapping' (object) defines target schema for {stream_name} stream."
+ )
# Define grantees
# ---------------
@@ -310,45 +359,46 @@ def __init__(self, connection_config, stream_schema_message=None, table_cache=No
# }
# }
# }
- self.grantees = self.connection_config.get('default_target_schema_select_permissions')
+ self.grantees = self.connection_config.get(
+ "default_target_schema_select_permissions"
+ )
if config_schema_mapping and stream_schema_name in config_schema_mapping:
- self.grantees = config_schema_mapping[stream_schema_name].get('target_schema_select_permissions', self.grantees)
+ self.grantees = config_schema_mapping[stream_schema_name].get(
+ "target_schema_select_permissions", self.grantees
+ )
- self.data_flattening_max_level = self.connection_config.get('data_flattening_max_level', 0)
- self.flatten_schema = flatten_schema(stream_schema_message['schema'], max_level=self.data_flattening_max_level)
+ self.data_flattening_max_level = self.connection_config.get(
+ "data_flattening_max_level", 0
+ )
+ self.flatten_schema = flatten_schema(
+ stream_schema_message["schema"],
+ max_level=self.data_flattening_max_level,
+ )
def open_connection(self):
- conn_string = "host='{}' dbname='{}' user='{}' password='{}' port='{}'".format(
- self.connection_config['host'],
- self.connection_config['dbname'],
- self.connection_config['user'],
- self.connection_config['password'],
- self.connection_config['port']
+ return psycopg2.connect(
+ database=self.connection_config["dbname"],
+ user=self.connection_config["user"],
+ password=self.connection_config["password"],
+ host=self.connection_config["host"],
+ port=self.connection_config["port"],
)
- return psycopg2.connect(conn_string)
-
def query(self, query, params=None):
- self.logger.debug("Running query: {}".format(query))
+ self.logger.debug(f"Running query: {query}")
with self.open_connection() as connection:
with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
- cur.execute(
- query,
- params
- )
-
- if cur.rowcount > 0 and cur.description:
- return cur.fetchall()
+ cur.execute(query, params)
- return []
+ return cur.fetchall() if cur.rowcount > 0 and cur.description else []
def table_name(self, stream_name, is_stage=False, without_schema=False):
stream_dict = stream_name_to_dict(stream_name)
- table_name = stream_dict['table_name']
- rs_table_name = table_name.replace('.', '_').replace('-', '_').lower()
+ table_name = stream_dict["table_name"]
+ rs_table_name = table_name.replace(".", "_").replace("-", "_").lower()
if is_stage:
- rs_table_name = 'stg_{}'.format(rs_table_name)
+ rs_table_name = f"stg_{rs_table_name}"
if without_schema:
return f'"{rs_table_name.upper()}"'
@@ -356,61 +406,72 @@ def table_name(self, stream_name, is_stage=False, without_schema=False):
return f'{self.schema_name}."{rs_table_name.upper()}"'
def record_primary_key_string(self, record):
- if len(self.stream_schema_message['key_properties']) == 0:
+ if len(self.stream_schema_message["key_properties"]) == 0:
return None
- flatten = flatten_record(record, self.flatten_schema, max_level=self.data_flattening_max_level)
+ flatten = flatten_record(
+ record, self.flatten_schema, max_level=self.data_flattening_max_level
+ )
try:
- key_props = [str(flatten[p]) for p in self.stream_schema_message['key_properties']]
- except Exception as exc:
- self.logger.info("Cannot find {} primary key(s) in record: {}".format(self.stream_schema_message['key_properties'], flatten))
- raise exc
- return ','.join(key_props)
+ key_props = [
+ str(flatten[p]) for p in self.stream_schema_message["key_properties"]
+ ]
+ except Exception:
+ self.logger.info(
+ f'Cannot find {self.stream_schema_message["key_properties"]} primary key(s) in record: {flatten}'
+ )
+ raise
+ return ",".join(key_props)
def record_to_csv_line(self, record):
- flatten = flatten_record(record, self.flatten_schema, max_level=self.data_flattening_max_level)
- return ','.join(
+ flatten = flatten_record(
+ record, self.flatten_schema, max_level=self.data_flattening_max_level
+ )
+ return ",".join(
[
- json.dumps(flatten[name], ensure_ascii=False) if name in flatten and (flatten[name] == 0 or flatten[name]) else ''
+ json.dumps(flatten[name], ensure_ascii=False)
+ if name in flatten and (flatten[name] == 0 or flatten[name])
+ else ""
for name in self.flatten_schema
]
)
- def put_to_s3(self, file, stream, count, suffix = ""):
- self.logger.info("Uploading {} rows to S3".format(count))
+ def put_to_s3(self, file, stream, count, suffix=""):
+ self.logger.info(f"Uploading {count} rows to S3")
# Generating key in S3 bucket
- bucket = self.connection_config['s3_bucket']
- s3_acl = self.connection_config.get('s3_acl')
- s3_key_prefix = self.connection_config.get('s3_key_prefix', '')
- s3_key = "{}pipelinewise_{}{}".format(s3_key_prefix, stream, suffix)
+ bucket = self.connection_config["s3_bucket"]
+ s3_acl = self.connection_config.get("s3_acl")
+ s3_key_prefix = self.connection_config.get("s3_key_prefix", "")
+ s3_key = f"{s3_key_prefix}pipelinewise_{stream}{suffix}"
- self.logger.info("Target S3 bucket: {}, local file: {}, S3 key: {}".format(bucket, file, s3_key))
+ self.logger.info(
+ f"Target S3 bucket: {bucket}, local file: {file}, S3 key: {s3_key}"
+ )
- extra_args = {'ACL': s3_acl} if s3_acl else None
+ extra_args = {"ACL": s3_acl} if s3_acl else None
self.s3.upload_file(file, bucket, s3_key, ExtraArgs=extra_args)
return s3_key
def delete_from_s3(self, s3_key):
- self.logger.info("Deleting {} from S3".format(s3_key))
- bucket = self.connection_config['s3_bucket']
+ self.logger.info(f"Deleting {s3_key} from S3")
+ bucket = self.connection_config["s3_bucket"]
self.s3.delete_object(Bucket=bucket, Key=s3_key)
# pylint: disable=too-many-locals
def load_csv(self, s3_key, count, size_bytes, compression=False):
stream_schema_message = self.stream_schema_message
- stream = stream_schema_message['stream']
+ stream = stream_schema_message["stream"]
stage_table = self.table_name(stream, is_stage=True)
target_table = self.table_name(stream, is_stage=False)
- self.logger.info("Loading {} rows into '{}'".format(count, self.table_name(stream, is_stage=True)))
+ self.logger.info(
+ f"Loading {count} rows into '{self.table_name(stream, is_stage=True)}'"
+ )
# Get list if columns with types
columns_with_trans = [
- {
- "name": safe_column_name(name),
- "trans": column_trans(schema)
- }
+ {"name": safe_column_name(name), "trans": column_trans(schema)}
for (name, schema) in self.flatten_schema.items()
]
@@ -424,29 +485,48 @@ def load_csv(self, s3_key, count, size_bytes, compression=False):
cur.execute(self.create_table_query(is_stage=True))
# Step 2: Generate copy credentials - prefer role if provided, otherwise use access and secret keys
- copy_credentials = """
+ copy_credentials = (
+ """
iam_role '{aws_role_arn}'
- """.format(aws_role_arn=self.connection_config['aws_redshift_copy_role_arn']) if self.connection_config.get("aws_redshift_copy_role_arn") else """
+ """.format(
+ aws_role_arn=self.connection_config[
+ "aws_redshift_copy_role_arn"
+ ]
+ )
+ if self.connection_config.get("aws_redshift_copy_role_arn")
+ else (
+ """
ACCESS_KEY_ID '{aws_access_key_id}'
SECRET_ACCESS_KEY '{aws_secret_access_key}'
{aws_session_token}
""".format(
- aws_access_key_id=self.connection_config['aws_access_key_id'],
- aws_secret_access_key=self.connection_config['aws_secret_access_key'],
- aws_session_token="SESSION_TOKEN '{}'".format(self.connection_config['aws_session_token']) if self.connection_config.get('aws_session_token') else '',
+ aws_access_key_id=self.connection_config[
+ "aws_access_key_id"
+ ],
+ aws_secret_access_key=self.connection_config[
+ "aws_secret_access_key"
+ ],
+ aws_session_token=f"""SESSION_TOKEN '{self.connection_config["aws_session_token"]}'"""
+ if self.connection_config.get("aws_session_token")
+ else "",
+ )
+ )
)
# Step 3: Generate copy options - Override defaults from config.json if defined
- copy_options = self.connection_config.get('copy_options',"""
+ copy_options = self.connection_config.get(
+ "copy_options",
+ """
EMPTYASNULL BLANKSASNULL TRIMBLANKS TRUNCATECOLUMNS
TIMEFORMAT 'auto'
COMPUPDATE OFF STATUPDATE OFF
- """)
+ """,
+ )
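+ # Map the configured compression ("gzip" or "bzip2") to the matching Redshift COPY option.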
- if compression == "gzip":
- compression_option = " GZIP"
- elif compression == "bzip2":
+ if compression == "bzip2":
compression_option = " BZIP2"
+ elif compression == "gzip":
+ compression_option = " GZIP"
else:
compression_option = ""
@@ -457,20 +537,20 @@ def load_csv(self, s3_key, count, size_bytes, compression=False):
DELIMITER ',' REMOVEQUOTES ESCAPE{compression_option}
""".format(
table=stage_table,
- columns=', '.join([c['name'] for c in columns_with_trans]),
- s3_bucket=self.connection_config['s3_bucket'],
+ columns=", ".join([c["name"] for c in columns_with_trans]),
+ s3_bucket=self.connection_config["s3_bucket"],
s3_key=s3_key,
copy_credentials=copy_credentials,
copy_options=copy_options,
- compression_option=compression_option
+ compression_option=compression_option,
)
- self.logger.debug("Running query: {}".format(copy_sql))
+ self.logger.debug(f"Running query: {copy_sql}")
cur.execute(copy_sql)
# Step 5/a: Insert or Update if primary key defined
# Do UPDATE first and INSERT second to calculate
# the number of affected rows correctly
- if len(stream_schema_message['key_properties']) > 0:
+ if len(stream_schema_message["key_properties"]) > 0:
# Step 5/a/1: Update existing records
if not self.skip_updates:
update_sql = """UPDATE {}
@@ -479,14 +559,16 @@ def load_csv(self, s3_key, count, size_bytes, compression=False):
WHERE {}
""".format(
target_table,
- ', '.join(['{} = s.{}'.format(c['name'], c['name']) for c in columns_with_trans]),
+ ", ".join(
+ [
+ f'{c["name"]} = s.{c["name"]}'
+ for c in columns_with_trans
+ ]
+ ),
stage_table,
- self.primary_key_merge_condition()
+ self.primary_key_merge_condition(),
)
- self.logger.debug("Running query: {}".format(update_sql))
- cur.execute(update_sql)
- updates = cur.rowcount
-
+ updates = self._extracted_from_load_csv_104(update_sql, cur)
# Step 5/a/2: Insert new records
insert_sql = """INSERT INTO {} ({})
SELECT {}
@@ -495,44 +577,60 @@ def load_csv(self, s3_key, count, size_bytes, compression=False):
WHERE {}
""".format(
target_table,
- ', '.join([c['name'] for c in columns_with_trans]),
- ', '.join(['s.{}'.format(c['name']) for c in columns_with_trans]),
+ ", ".join([c["name"] for c in columns_with_trans]),
+ ", ".join([f's.{c["name"]}' for c in columns_with_trans]),
stage_table,
target_table,
self.primary_key_merge_condition(),
- ' AND '.join(['{}.{} IS NULL'.format(target_table, c) for c in primary_column_names(stream_schema_message)])
+ " AND ".join(
+ [
+ f"{target_table}.{c} IS NULL"
+ for c in primary_column_names(stream_schema_message)
+ ]
+ ),
)
- self.logger.debug("Running query: {}".format(insert_sql))
- cur.execute(insert_sql)
- inserts = cur.rowcount
-
- # Step 5/b: Insert only if no primary key
else:
insert_sql = """INSERT INTO {} ({})
SELECT {}
FROM {} s
""".format(
target_table,
- ', '.join([c['name'] for c in columns_with_trans]),
- ', '.join(['s.{}'.format(c['name']) for c in columns_with_trans]),
- stage_table
+ ", ".join([c["name"] for c in columns_with_trans]),
+ ", ".join([f's.{c["name"]}' for c in columns_with_trans]),
+ stage_table,
)
- self.logger.debug("Running query: {}".format(insert_sql))
- cur.execute(insert_sql)
- inserts = cur.rowcount
-
+ inserts = self._extracted_from_load_csv_104(insert_sql, cur)
# Step 6: Drop stage table
cur.execute(self.drop_table_query(is_stage=True))
- self.logger.info('Loading into {}: {}'.format(
- self.table_name(stream, False),
- json.dumps({'inserts': inserts, 'updates': updates, 'size_bytes': size_bytes})))
+ self.logger.info(
+ "Loading into {}: {}".format(
+ self.table_name(stream, False),
+ json.dumps(
+ {
+ "inserts": inserts,
+ "updates": updates,
+ "size_bytes": size_bytes,
+ }
+ ),
+ )
+ )
+
+ # TODO Rename this here and in `load_csv`
+ def _extracted_from_load_csv_104(self, arg0, cur):
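+ # Helper extracted from load_csv: log the SQL, execute it on the given cursor and return the affected row count.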
+ self.logger.debug(f"Running query: {arg0}")
+ cur.execute(arg0)
+ return cur.rowcount
def primary_key_merge_condition(self):
stream_schema_message = self.stream_schema_message
names = primary_column_names(stream_schema_message)
- return ' AND '.join(['{}.{} = s.{}'.format(
- self.table_name(stream_schema_message['stream'], False), c, c) for c in names])
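+ # Produces e.g. "<target_table>.<pk> = s.<pk>" joined with AND for every primary key column.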
+ return " AND ".join(
+ [
+ f'{self.table_name(stream_schema_message["stream"], False)}.{c} = s.{c}'
+ for c in names
+ ]
+ )
def column_names(self):
return [safe_column_name(name) for name in self.flatten_schema]
@@ -540,55 +638,57 @@ def column_names(self):
def create_table_query(self, is_stage=False):
stream_schema_message = self.stream_schema_message
columns = [
- column_clause(
- name,
- schema
- )
+ column_clause(name, schema)
for (name, schema) in self.flatten_schema.items()
]
- primary_key = ["PRIMARY KEY ({})".format(', '.join(primary_column_names(stream_schema_message)))] \
- if len(stream_schema_message['key_properties']) else []
-
- return 'CREATE TABLE IF NOT EXISTS {} ({})'.format(
- self.table_name(stream_schema_message['stream'], is_stage),
- ', '.join(columns + primary_key)
+ primary_key = (
+ [f'PRIMARY KEY ({", ".join(primary_column_names(stream_schema_message))})']
+ if len(stream_schema_message["key_properties"])
+ else []
)
+ return f'CREATE TABLE IF NOT EXISTS {self.table_name(stream_schema_message["stream"], is_stage)} ({", ".join(columns + primary_key)})'
+
def drop_table_query(self, is_stage=False):
stream_schema_message = self.stream_schema_message
- return 'DROP TABLE IF EXISTS {}'.format(self.table_name(stream_schema_message['stream'], is_stage))
+ return f'DROP TABLE IF EXISTS {self.table_name(stream_schema_message["stream"], is_stage)}'
def grant_usage_on_schema(self, schema_name, grantee, to_group=False):
- query = "GRANT USAGE ON SCHEMA {} TO {} {}".format(schema_name, 'GROUP' if to_group else '', grantee)
- self.logger.info("Granting USAGE privilege on '{}' schema to '{}'... {}".format(schema_name, grantee, query))
+ query = f'GRANT USAGE ON SCHEMA {schema_name} TO {"GROUP" if to_group else ""} {grantee}'
+ self.logger.info(
+ f"Granting USAGE privilege on '{schema_name}' schema to '{grantee}'... {query}"
+ )
self.query(query)
- def grant_select_on_all_tables_in_schema(self, schema_name, grantee, to_group=False):
- query = "GRANT SELECT ON ALL TABLES IN SCHEMA {} TO {} {}".format(schema_name, 'GROUP' if to_group else '', grantee)
+ def grant_select_on_all_tables_in_schema(
+ self, schema_name, grantee, to_group=False
+ ):
+ query = f'GRANT SELECT ON ALL TABLES IN SCHEMA {schema_name} TO {"GROUP" if to_group else ""} {grantee}'
self.logger.info(
- "Granting SELECT ON ALL TABLES privilegue on '{}' schema to '{}'... {}".format(schema_name, grantee, query))
+ f"Granting SELECT ON ALL TABLES privilege on '{schema_name}' schema to '{grantee}'... {query}"
+ )
self.query(query)
@classmethod
- def grant_privilege(self, schema, grantees, grant_method, to_group=False):
+ def grant_privilege(cls, schema, grantees, grant_method, to_group=False):
if isinstance(grantees, list):
for grantee in grantees:
- grant_method(schema, grantee,to_group)
+ grant_method(schema, grantee, to_group)
elif isinstance(grantees, str):
grant_method(schema, grantees, to_group)
elif isinstance(grantees, dict):
- users = grantees.get('users')
- groups = grantees.get('groups')
+ users = grantees.get("users")
+ groups = grantees.get("groups")
- self.grant_privilege(schema, users, grant_method)
- self.grant_privilege(schema, groups, grant_method, to_group=True)
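+ # A dict grantee can carry both "users" and "groups"; recurse to grant each list separately.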
+ cls.grant_privilege(schema, users, grant_method)
+ cls.grant_privilege(schema, groups, grant_method, to_group=True)
def delete_rows(self, stream):
table = self.table_name(stream, is_stage=False)
- query = "DELETE FROM {} WHERE _sdc_deleted_at IS NOT NULL".format(table)
- self.logger.info("Deleting rows from '{}' table... {}".format(table, query))
- self.logger.info("DELETE {}".format(len(self.query(query))))
+ query = f"DELETE FROM {table} WHERE _sdc_deleted_at IS NOT NULL"
+ self.logger.info(f"Deleting rows from '{table}' table... {query}")
+ self.logger.info(f"DELETE {len(self.query(query))}")
def create_schema_if_not_exists(self):
schema_name = self.schema_name
@@ -596,60 +696,86 @@ def create_schema_if_not_exists(self):
# table_columns_cache is an optional pre-collected list of available objects in redshift
if self.table_cache:
- schema_rows = list(filter(lambda x: x['table_schema'] == schema_name.lower(), self.table_cache))
+ schema_rows = list(
+ filter(
+ lambda x: x["table_schema"] == schema_name.lower(), self.table_cache
+ )
+ )
# Query realtime if not pre-collected
else:
schema_rows = self.query(
- 'SELECT LOWER(schema_name) schema_name FROM information_schema.schemata WHERE LOWER(schema_name) = %s',
- (schema_name.lower(),)
+ "SELECT LOWER(schema_name) schema_name FROM information_schema.schemata WHERE LOWER(schema_name) = %s",
+ (schema_name.lower(),),
)
if len(schema_rows) == 0:
- query = "CREATE SCHEMA IF NOT EXISTS {}".format(schema_name)
- self.logger.info("Schema '{}' does not exist. Creating... {}".format(schema_name, query))
+ query = f"CREATE SCHEMA IF NOT EXISTS {schema_name}"
+ self.logger.info(
+ f"Schema '{schema_name}' does not exist. Creating... {query}"
+ )
self.query(query)
self.grant_privilege(schema_name, self.grantees, self.grant_usage_on_schema)
# Refresh columns cache if required
if self.table_cache:
- self.table_cache = self.get_table_columns(filter_schemas=[self.schema_name])
+ self.table_cache = self.get_table_columns(
+ filter_schemas=[self.schema_name]
+ )
def get_tables(self, table_schema=None):
- return self.query("""SELECT LOWER(table_schema) table_schema, LOWER(table_name) table_name
+ return self.query(
+ """SELECT LOWER(table_schema) table_schema, LOWER(table_name) table_name
FROM information_schema.tables
WHERE LOWER(table_schema) = {}""".format(
- "LOWER(table_schema)" if table_schema is None else "'{}'".format(table_schema.lower())
- ))
+ "LOWER(table_schema)"
+ if table_schema is None
+ else f"'{table_schema.lower()}'"
+ )
+ )
- def get_table_columns(self, table_schema=None, table_name=None, filter_schemas=None):
+ def get_table_columns(
+ self, table_schema=None, table_name=None, filter_schemas=None
+ ):
sql = """SELECT LOWER(c.table_schema) table_schema, LOWER(c.table_name) table_name, c.column_name, c.data_type
FROM information_schema.columns c
WHERE 1=1"""
- if table_schema is not None: sql = sql + " AND LOWER(c.table_schema) = '" + table_schema.lower() + "'"
- if table_name is not None: sql = sql + " AND LOWER(c.table_name) = '" + table_name.replace("\"", "").lower() + "'"
- if filter_schemas is not None: sql = sql + " AND LOWER(c.table_schema) IN (" + ', '.join("'{}'".format(s).lower() for s in filter_schemas) + ")"
+ if table_schema is not None:
+ sql = f"{sql} AND LOWER(c.table_schema) = '{table_schema.lower()}'"
+ if table_name is not None:
+ sql = (
+ f"{sql} AND LOWER(c.table_name) = '"
+ + table_name.replace('"', "").lower()
+ + "'"
+ )
+ if filter_schemas is not None:
+ sql = (
+ sql
+ + " AND LOWER(c.table_schema) IN ("
+ + ", ".join(f"'{s}'".lower() for s in filter_schemas)
+ + ")"
+ )
return self.query(sql)
def update_columns(self):
stream_schema_message = self.stream_schema_message
- stream = stream_schema_message['stream']
+ stream = stream_schema_message["stream"]
table_name = self.table_name(stream, is_stage=False, without_schema=True)
if self.table_cache:
- columns = list(filter(lambda x: x['table_schema'] == self.schema_name.lower() and
- f'"{x["table_name"].upper()}"' == table_name,
- self.table_cache))
+ columns = list(
+ filter(
+ lambda x: x["table_schema"] == self.schema_name.lower()
+ and f'"{x["table_name"].upper()}"' == table_name,
+ self.table_cache,
+ )
+ )
else:
columns = self.get_table_columns(self.schema_name, table_name)
- columns_dict = {column['column_name'].lower(): column for column in columns}
+ columns_dict = {column["column_name"].lower(): column for column in columns}
columns_to_add = [
- column_clause(
- name,
- properties_schema
- )
+ column_clause(name, properties_schema)
for (name, properties_schema) in self.flatten_schema.items()
if name.lower() not in columns_dict
]
@@ -658,84 +784,100 @@ def update_columns(self):
self.add_column(column, stream)
columns_to_replace = [
- (safe_column_name(name), column_clause(
- name,
- properties_schema
- ))
+ (safe_column_name(name), column_clause(name, properties_schema))
for (name, properties_schema) in self.flatten_schema.items()
- if name.lower() in columns_dict and
- columns_dict[name.lower()]['data_type'].lower() != column_type(properties_schema, with_length=False).lower() and
-
- # Don't alter table if 'timestamp without time zone' detected as the new required column type
- #
- # Target-redshift maps every data-time JSON types to 'timestamp without time zone' but sometimes
- # a 'timestamp with time zone' column is alrady available in the target table
- # (i.e. created by fastsync initial load)
- # We need to exclude this conversion otherwise we loose the data that is already populated
- # in the column
- #
- # TODO: Support both timestamp with/without time zone in target-redshift
- # when extracting data-time values from JSON
- # (Check the column_type function for further details)
- column_type(properties_schema).lower() != 'timestamp without time zone'
+ if name.lower() in columns_dict
+ and columns_dict[name.lower()]["data_type"].lower()
+ != column_type(properties_schema, with_length=False).lower()
+ # Don't alter table if 'timestamp without time zone' detected as the new required column type
+ #
+ # Target-redshift maps every date-time JSON type to 'timestamp without time zone' but sometimes
+ # a 'timestamp with time zone' column is already available in the target table
+ # (i.e. created by fastsync initial load)
+ # We need to exclude this conversion, otherwise we lose the data that is already populated
+ # in the column
+ #
+ # TODO: Support both timestamp with/without time zone in target-redshift
+ # when extracting date-time values from JSON
+ # (Check the column_type function for further details)
+ and column_type(properties_schema).lower() != "timestamp without time zone"
]
- for (column_name, column) in columns_to_replace:
+ for column_name, column in columns_to_replace:
self.version_column(column_name, stream)
self.add_column(column, stream)
# Refresh table cache if required
- if self.table_cache and (len(columns_to_add) > 0 or len(columns_to_replace)):
+ if self.table_cache and (columns_to_add or columns_to_replace):
self.table_cache = self.get_table_columns(filter_schemas=[self.schema_name])
def drop_column(self, column_name, stream):
- drop_column = "ALTER TABLE {} DROP COLUMN {}".format(self.table_name(stream, is_stage=False), column_name)
- self.logger.info('Dropping column: {}'.format(drop_column))
+ drop_column = "ALTER TABLE {} DROP COLUMN {}".format(
+ self.table_name(stream, is_stage=False), column_name
+ )
+ self.logger.info(f"Dropping column: {drop_column}")
self.query(drop_column)
def version_column(self, column_name, stream):
- version_column = "ALTER TABLE {} RENAME COLUMN {} TO \"{}_{}\"".format(self.table_name(stream, is_stage=False),
- column_name,
- column_name.replace("\"", ""),
- time.strftime("%Y%m%d_%H%M"))
- self.logger.info('Versioning column: {}'.format(version_column))
+ version_column = 'ALTER TABLE {} RENAME COLUMN {} TO "{}_{}"'.format(
+ self.table_name(stream, is_stage=False),
+ column_name,
+ column_name.replace('"', ""),
+ time.strftime("%Y%m%d_%H%M"),
+ )
+ self.logger.info(f"Versioning column: {version_column}")
self.query(version_column)
def add_column(self, column, stream):
- add_column = "ALTER TABLE {} ADD COLUMN {}".format(self.table_name(stream, is_stage=False), column)
- self.logger.info('Adding column: {}'.format(add_column))
+ add_column = "ALTER TABLE {} ADD COLUMN {}".format(
+ self.table_name(stream, is_stage=False), column
+ )
+ self.logger.info(f"Adding column: {add_column}")
self.query(add_column)
def create_table(self, is_stage=False):
stream_schema_message = self.stream_schema_message
- stream = stream_schema_message['stream']
- self.logger.info("(Re)creating {} table...".format(self.table_name(stream, is_stage)))
+ stream = stream_schema_message["stream"]
+ self.logger.info(f"(Re)creating {self.table_name(stream, is_stage)} table...")
self.query(self.drop_table_query(is_stage=is_stage))
self.query(self.create_table_query(is_stage=is_stage))
def create_table_and_grant_privilege(self, is_stage=False):
self.create_table(is_stage=is_stage)
- self.grant_privilege(self.schema_name, self.grantees, self.grant_select_on_all_tables_in_schema)
+ self.grant_privilege(
+ self.schema_name, self.grantees, self.grant_select_on_all_tables_in_schema
+ )
def sync_table(self):
stream_schema_message = self.stream_schema_message
- stream = stream_schema_message['stream']
+ stream = stream_schema_message["stream"]
table_name = self.table_name(stream, is_stage=False, without_schema=True)
- table_name_with_schema = self.table_name(stream, is_stage=False, without_schema=False)
+ table_name_with_schema = self.table_name(
+ stream, is_stage=False, without_schema=False
+ )
if self.table_cache:
- found_tables = list(filter(lambda x: x['table_schema'] == self.schema_name.lower() and
- f'"{x["table_name"].upper()}"' == table_name,
- self.table_cache))
+ found_tables = list(
+ filter(
+ lambda x: x["table_schema"] == self.schema_name.lower()
+ and f'"{x["table_name"].upper()}"' == table_name,
+ self.table_cache,
+ )
+ )
else:
- found_tables = [table for table in (self.get_tables(self.schema_name.lower()))
- if f'"{table["table_name"].upper()}"' == table_name]
+ found_tables = [
+ table
+ for table in (self.get_tables(self.schema_name.lower()))
+ if f'"{table["table_name"].upper()}"' == table_name
+ ]
# Create target table if not exists
- if len(found_tables) == 0:
- self.logger.info("Table '{}' does not exist. Creating...".format(table_name_with_schema))
+ if not found_tables:
+ self.logger.info(
+ f"Table '{table_name_with_schema}' does not exist. Creating..."
+ )
self.create_table_and_grant_privilege()
else:
- self.logger.info("Table '{}' exists".format(self.schema_name))
+ self.logger.info(f"Table '{table_name_with_schema}' exists")
self.update_columns()
diff --git a/tests/integration/test_target_redshift.py b/tests/integration/test_target_redshift.py
index 142fa35..44b2fa2 100644
--- a/tests/integration/test_target_redshift.py
+++ b/tests/integration/test_target_redshift.py
@@ -1,15 +1,16 @@
-import pytest
-import os
-import json
-import mock
+import contextlib
import datetime
+import json
+import os
+from unittest import mock
+
+import pytest
+from psycopg2 import InternalError
import target_redshift
from target_redshift import RecordValidationException
from target_redshift.db_sync import DbSync
-from psycopg2 import InternalError
-
try:
import tests.utils as test_utils
except ImportError:
@@ -19,7 +20,7 @@
METADATA_COLUMNS = ["_sdc_extracted_at", "_sdc_batched_at", "_sdc_deleted_at"]
-class TestTargetRedshift(object):
+class TestTargetRedshift:
"""
Integration Tests for PipelineWise Target Redshift
"""
@@ -31,9 +32,7 @@ def setup_method(self):
# Drop target schema
if self.config["default_target_schema"]:
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
def teardown_method(self):
@@ -66,11 +65,13 @@ def assert_metadata_columns_not_exist(self, rows):
for md_c in METADATA_COLUMNS:
assert md_c not in r
- def assert_three_streams_are_loaded_in_redshift(self,
- should_metadata_columns_exist=False,
- should_hard_deleted_rows=False,
- should_primary_key_required=True,
- should_skip_updates=False):
+ def assert_three_streams_are_loaded_in_redshift(
+ self,
+ should_metadata_columns_exist=False,
+ should_hard_deleted_rows=False,
+ should_primary_key_required=True,
+ should_skip_updates=False,
+ ):
"""
This is a helper assertion that checks if every data from the message-with-three-streams.json
file is available in Redshift tables correctly.
@@ -89,13 +90,13 @@ def assert_three_streams_are_loaded_in_redshift(self,
# Get loaded rows from tables
table_one = redshift.query(
- "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema)
+ f"SELECT * FROM {target_schema}.test_table_one ORDER BY c_pk"
)
table_two = redshift.query(
- "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema)
+ f"SELECT * FROM {target_schema}.test_table_two ORDER BY c_pk"
)
table_three = redshift.query(
- "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema)
+ f"SELECT * FROM {target_schema}.test_table_three ORDER BY c_pk"
)
# ----------------------------------------------------------------------
@@ -175,60 +176,70 @@ def assert_three_streams_are_loaded_in_redshift(self,
self.assert_metadata_columns_not_exist(table_two)
self.assert_metadata_columns_not_exist(table_three)
- def assert_logical_streams_are_in_redshift(self, should_metadata_columns_exist=False):
+ def assert_logical_streams_are_in_redshift(
+ self, should_metadata_columns_exist=False
+ ):
# Get loaded rows from tables
redshift = DbSync(self.config)
- target_schema = self.config.get('default_target_schema', '')
- table_one = redshift.query("SELECT * FROM {}.logical1_table1 ORDER BY cid".format(target_schema))
- table_two = redshift.query("SELECT * FROM {}.logical1_table2 ORDER BY cid".format(target_schema))
- table_three = redshift.query("SELECT * FROM {}.logical2_table1 ORDER BY cid".format(target_schema))
- table_four = redshift.query("SELECT cid, ctimentz, ctimetz FROM {}.logical1_edgydata WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid".format(target_schema))
+ target_schema = self.config.get("default_target_schema", "")
+ table_one = redshift.query(
+ f"SELECT * FROM {target_schema}.logical1_table1 ORDER BY cid"
+ )
+ table_two = redshift.query(
+ f"SELECT * FROM {target_schema}.logical1_table2 ORDER BY cid"
+ )
+ table_three = redshift.query(
+ f"SELECT * FROM {target_schema}.logical2_table1 ORDER BY cid"
+ )
+ table_four = redshift.query(
+ f"SELECT cid, ctimentz, ctimetz FROM {target_schema}.logical1_edgydata WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid"
+ )
# ----------------------------------------------------------------------
# Check rows in table_one
# ----------------------------------------------------------------------
expected_table_one = [
- {'cid': 1, 'cvarchar': "inserted row", 'cvarchar2': None},
- {'cid': 2, 'cvarchar': 'inserted row', "cvarchar2": "inserted row"},
- {'cid': 3, 'cvarchar': "inserted row", 'cvarchar2': "inserted row"},
- {'cid': 4, 'cvarchar': "inserted row", 'cvarchar2': "inserted row"}
+ {"cid": 1, "cvarchar": "inserted row", "cvarchar2": None},
+ {"cid": 2, "cvarchar": "inserted row", "cvarchar2": "inserted row"},
+ {"cid": 3, "cvarchar": "inserted row", "cvarchar2": "inserted row"},
+ {"cid": 4, "cvarchar": "inserted row", "cvarchar2": "inserted row"},
]
# ----------------------------------------------------------------------
# Check rows in table_two
# ----------------------------------------------------------------------
expected_table_two = [
- {'cid': 1, 'cvarchar': "updated row"},
- {'cid': 2, 'cvarchar': 'updated row'},
- {'cid': 3, 'cvarchar': "updated row"},
- {'cid': 5, 'cvarchar': "updated row"},
- {'cid': 7, 'cvarchar': "updated row"},
- {'cid': 8, 'cvarchar': 'updated row'},
- {'cid': 9, 'cvarchar': "updated row"},
- {'cid': 10, 'cvarchar': 'updated row'}
+ {"cid": 1, "cvarchar": "updated row"},
+ {"cid": 2, "cvarchar": "updated row"},
+ {"cid": 3, "cvarchar": "updated row"},
+ {"cid": 5, "cvarchar": "updated row"},
+ {"cid": 7, "cvarchar": "updated row"},
+ {"cid": 8, "cvarchar": "updated row"},
+ {"cid": 9, "cvarchar": "updated row"},
+ {"cid": 10, "cvarchar": "updated row"},
]
# ----------------------------------------------------------------------
# Check rows in table_three
# ----------------------------------------------------------------------
expected_table_three = [
- {'cid': 1, 'cvarchar': "updated row"},
- {'cid': 2, 'cvarchar': 'updated row'},
- {'cid': 3, 'cvarchar': "updated row"},
+ {"cid": 1, "cvarchar": "updated row"},
+ {"cid": 2, "cvarchar": "updated row"},
+ {"cid": 3, "cvarchar": "updated row"},
]
# ----------------------------------------------------------------------
# Check rows in table_four
# ----------------------------------------------------------------------
expected_table_four = [
- {'cid': 1, 'ctimentz': None, 'ctimetz': None},
- {'cid': 2, 'ctimentz': '23:00:15', 'ctimetz': '23:00:15'},
- {'cid': 3, 'ctimentz': '12:00:15', 'ctimetz': '12:00:15'},
- {'cid': 4, 'ctimentz': '12:00:15', 'ctimetz': '09:00:15'},
- {'cid': 5, 'ctimentz': '12:00:15', 'ctimetz': '15:00:15'},
- {'cid': 6, 'ctimentz': '00:00:00', 'ctimetz': '00:00:00'},
- {'cid': 8, 'ctimentz': '00:00:00', 'ctimetz': '01:00:00'},
- {'cid': 9, 'ctimentz': '00:00:00', 'ctimetz': '00:00:00'}
+ {"cid": 1, "ctimentz": None, "ctimetz": None},
+ {"cid": 2, "ctimentz": "23:00:15", "ctimetz": "23:00:15"},
+ {"cid": 3, "ctimentz": "12:00:15", "ctimetz": "12:00:15"},
+ {"cid": 4, "ctimentz": "12:00:15", "ctimetz": "09:00:15"},
+ {"cid": 5, "ctimentz": "12:00:15", "ctimetz": "15:00:15"},
+ {"cid": 6, "ctimentz": "00:00:00", "ctimetz": "00:00:00"},
+ {"cid": 8, "ctimentz": "00:00:00", "ctimetz": "01:00:00"},
+ {"cid": 9, "ctimentz": "00:00:00", "ctimetz": "00:00:00"},
]
# Check if metadata columns replicated correctly
@@ -244,40 +255,64 @@ def assert_logical_streams_are_in_redshift(self, should_metadata_columns_exist=F
# Check if data replicated correctly
assert self.remove_metadata_columns_from_rows(table_one) == expected_table_one
assert self.remove_metadata_columns_from_rows(table_two) == expected_table_two
- assert self.remove_metadata_columns_from_rows(table_three) == expected_table_three
+ assert (
+ self.remove_metadata_columns_from_rows(table_three) == expected_table_three
+ )
assert self.remove_metadata_columns_from_rows(table_four) == expected_table_four
def assert_logical_streams_are_in_redshift_and_are_empty(self):
# Get loaded rows from tables
redshift = DbSync(self.config)
target_schema = self.config.get("default_target_schema", "")
- table_one = redshift.query("SELECT * FROM {}.logical1_table1 ORDER BY cid".format(target_schema))
- table_two = redshift.query("SELECT * FROM {}.logical1_table2 ORDER BY cid".format(target_schema))
- table_three = redshift.query("SELECT * FROM {}.logical2_table1 ORDER BY cid".format(target_schema))
- table_four = redshift.query("SELECT cid, ctimentz, ctimetz FROM {}.logical1_edgydata WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid".format(target_schema))
+ table_one = redshift.query(
+ f"SELECT * FROM {target_schema}.logical1_table1 ORDER BY cid"
+ )
+ table_two = redshift.query(
+ f"SELECT * FROM {target_schema}.logical1_table2 ORDER BY cid"
+ )
+ table_three = redshift.query(
+ f"SELECT * FROM {target_schema}.logical2_table1 ORDER BY cid"
+ )
+ table_four = redshift.query(
+ f"SELECT cid, ctimentz, ctimetz FROM {target_schema}.logical1_edgydata WHERE cid IN(1,2,3,4,5,6,8,9) ORDER BY cid"
+ )
assert table_one == []
assert table_two == []
assert table_three == []
assert table_four == []
- def assert_binary_data_are_in_snowflake(self, table_name, should_metadata_columns_exist=False):
+ def assert_binary_data_are_in_snowflake(
+ self, table_name, should_metadata_columns_exist=False
+ ):
# Redshift doesn't have a binary type. Binary formatted singer values are loaded into VARCHAR columns
# Get loaded rows from tables
snowflake = DbSync(self.config)
- target_schema = self.config.get('default_target_schema', '')
- table_one = snowflake.query('SELECT * FROM {}.{} ORDER BY "new"'.format(target_schema, table_name))
+ target_schema = self.config.get("default_target_schema", "")
+ table_one = snowflake.query(
+ f'SELECT * FROM {target_schema}.{table_name} ORDER BY "new"'
+ )
# ----------------------------------------------------------------------
# Check rows in table_one
# ----------------------------------------------------------------------
expected_table_one = [
- {'new': '706b32', 'data': '6461746132', 'created_at': datetime.datetime(2019, 12, 17, 16, 2, 55)},
- {'new': '706b34', 'data': '6461746134', 'created_at': datetime.datetime(2019, 12, 17, 16, 32, 22)},
+ {
+ "new": "706b32",
+ "data": "6461746132",
+ "created_at": datetime.datetime(2019, 12, 17, 16, 2, 55),
+ },
+ {
+ "new": "706b34",
+ "data": "6461746134",
+ "created_at": datetime.datetime(2019, 12, 17, 16, 32, 22),
+ },
]
if should_metadata_columns_exist:
- assert self.remove_metadata_columns_from_rows(table_one) == expected_table_one
+ assert (
+ self.remove_metadata_columns_from_rows(table_one) == expected_table_one
+ )
else:
assert table_one == expected_table_one
@@ -299,22 +334,22 @@ def test_message_order(self):
def test_loading_tables(self):
"""Loading multiple tables from the same input tap with various columns types"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- # Turning off client-side encryption and load
- self.config["client_side_encryption_master_key"] = ""
- target_redshift.persist_lines(self.config, tap_lines)
-
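+ # The extracted helper replaces the removed steps above: it loads the tap file, applies the config override and persists the lines.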
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json",
+ "",
+ "client_side_encryption_master_key",
+ )
+ )
self.assert_three_streams_are_loaded_in_redshift()
def test_loading_tables_with_metadata_columns(self):
"""Loading multiple tables from the same input tap with various columns types"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- # Turning on adding metadata columns
- self.config["add_metadata_columns"] = True
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json", True, "add_metadata_columns"
+ )
+ )
# Check if data loaded correctly and metadata columns exist
self.assert_three_streams_are_loaded_in_redshift(
should_metadata_columns_exist=True
@@ -322,49 +357,47 @@ def test_loading_tables_with_metadata_columns(self):
def test_loading_tables_with_defined_parallelism(self):
"""Loading multiple tables from the same input tap with various columns types"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- # Using fixed 1 thread parallelism
- self.config["parallelism"] = 1
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json", 1, "parallelism"
+ )
+ )
self.assert_three_streams_are_loaded_in_redshift()
def test_loading_tables_with_defined_slice_number(self):
"""Loading multiple tables from the same input tap with various columns types with a defined slice number"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- self.config["slices"] = 4
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json", 4, "slices"
+ )
+ )
self.assert_three_streams_are_loaded_in_redshift()
def test_loading_tables_with_gzip_compression(self):
"""Loading multiple tables from the same input tap with various columns types and gzip compression"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- self.config["compression"] = "gzip"
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json", "gzip", "compression"
+ )
+ )
self.assert_three_streams_are_loaded_in_redshift()
def test_loading_tables_with_bz2_compression(self):
"""Loading multiple tables from the same input tap with various columns types and bz2 compression"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- self.config["compression"] = "bz2"
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json", "bz2", "compression"
+ )
+ )
self.assert_three_streams_are_loaded_in_redshift()
def test_loading_tables_with_hard_delete(self):
"""Loading multiple tables from the same input tap with deleted rows"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- # Turning on hard delete mode
- self.config["hard_delete"] = True
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json", True, "hard_delete"
+ )
+ )
# Check if data loaded correctly and metadata columns exist
self.assert_three_streams_are_loaded_in_redshift(
should_metadata_columns_exist=True, should_hard_deleted_rows=True
@@ -384,90 +417,96 @@ def test_loading_with_multiple_schema(self):
def test_loading_table_with_reserved_word_as_name_and_hard_delete(self):
"""Loading a table where the name is a reserved word with deleted rows"""
- tap_lines = test_utils.get_test_tap_lines('messages-with-reserved-name-as-table-name.json')
-
- # Turning on hard delete mode
- self.config['hard_delete'] = True
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-reserved-name-as-table-name.json",
+ True,
+ "hard_delete",
+ )
+ )
# Check if data loaded correctly and metadata columns exist
self.assert_binary_data_are_in_snowflake(
- table_name='"ORDER"',
- should_metadata_columns_exist=True
+ table_name='"ORDER"', should_metadata_columns_exist=True
)
def test_loading_table_with_space(self):
"""Loading a table where the name has space"""
- tap_lines = test_utils.get_test_tap_lines('messages-with-space-in-table-name.json')
-
- # Turning on hard delete mode
- self.config['hard_delete'] = True
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-space-in-table-name.json", True, "hard_delete"
+ )
+ )
# Check if data loaded correctly and metadata columns exist
self.assert_binary_data_are_in_snowflake(
table_name='"table with space and uppercase"',
- should_metadata_columns_exist=True
+ should_metadata_columns_exist=True,
)
def test_loading_unicode_characters(self):
"""Loading unicode encoded characters"""
- tap_lines = test_utils.get_test_tap_lines(
- "messages-with-unicode-characters.json"
+ table_unicode = self._extracted_from_test_nested_schema_unflattening_3(
+ "messages-with-unicode-characters.json",
+ "SELECT * FROM {}.test_table_unicode ORDER BY c_pk",
)
-
- # Load with default settings
- target_redshift.persist_lines(self.config, tap_lines)
-
- # Get loaded rows from tables
- redshift = DbSync(self.config)
- target_schema = self.config.get("default_target_schema", "")
- table_unicode = redshift.query(
- "SELECT * FROM {}.test_table_unicode ORDER BY c_pk".format(target_schema)
+ assert (
+ self.remove_metadata_columns_from_rows(table_unicode)
+ == [
+ {
+ "c_int": 1,
+ "c_pk": 1,
+ "c_varchar": "Hello world, Καλημέρα κόσμε, コンニチハ",
+ },
+ {
+ "c_int": 2,
+ "c_pk": 2,
+ "c_varchar": "Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.",
+ },
+ {
+ "c_int": 3,
+ "c_pk": 3,
+ "c_varchar": "Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по",
+ },
+ {"c_int": 4, "c_pk": 4, "c_varchar": "Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช"},
+ {
+ "c_int": 5,
+ "c_pk": 5,
+ "c_varchar": "Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط",
+ },
+ {
+ "c_int": 6,
+ "c_pk": 6,
+ "c_varchar": "Special Characters: [\"\\,'!@£$%^&*()]\\\\",
+ },
+ ]
)
- assert self.remove_metadata_columns_from_rows(table_unicode) == [
- {"c_int": 1, "c_pk": 1, "c_varchar": "Hello world, Καλημέρα κόσμε, コンニチハ"},
- {"c_int": 2, "c_pk": 2, "c_varchar": "Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年."},
- {"c_int": 3, "c_pk": 3, "c_varchar":
- "Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по"},
- {"c_int": 4, "c_pk": 4, "c_varchar": "Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช"},
- {"c_int": 5, "c_pk": 5, "c_varchar": "Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط"},
- {"c_int": 6, "c_pk": 6, "c_varchar": "Special Characters: [\"\\,'!@£$%^&*()]\\\\"},
- ]
-
def test_loading_long_text(self):
"""Loading long texts"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-long-texts.json")
-
- # Load with default settings
- target_redshift.persist_lines(self.config, tap_lines)
-
- # Get loaded rows from tables
- redshift = DbSync(self.config)
- target_schema = self.config.get("default_target_schema", "")
- table_long_texts = redshift.query(
- "SELECT * FROM {}.test_table_long_texts ORDER BY c_pk".format(target_schema)
+ table_long_texts = self._extracted_from_test_nested_schema_unflattening_3(
+ "messages-with-long-texts.json",
+ "SELECT * FROM {}.test_table_long_texts ORDER BY c_pk",
)
-
# Test not very long texts by exact match
- assert self.remove_metadata_columns_from_rows(table_long_texts)[:3] == [
- {
- "c_int": 1,
- "c_pk": 1,
- "c_varchar": "Up to 128 characters: Lorem ipsum dolor sit amet, consectetuer adipiscing elit.",
- },
- {
- "c_int": 2,
- "c_pk": 2,
- "c_varchar": "Up to 256 characters: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies.",
- },
- {
- "c_int": 3,
- "c_pk": 3,
- "c_varchar": "Up to 1024 characters: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu, consequat vitae, eleifend ac, enim. Aliquam lorem ante, dapibus in, viverra quis, feugiat a, tellus. Phasellus viverra nulla ut metus varius laoreet. Quisque rutrum. Aenean imperdiet. Etiam ultricies nisi vel augue. Curabitur ullamcorper ultricies nisi. Nam eget dui. Etiam rhoncus. Maecenas tempus, tellus eget condimentum rhoncus, sem quam semper libero, sit amet adipiscing sem neque sed ipsum.",
- },
- ]
+ assert (
+ self.remove_metadata_columns_from_rows(table_long_texts)[:3]
+ == [
+ {
+ "c_int": 1,
+ "c_pk": 1,
+ "c_varchar": "Up to 128 characters: Lorem ipsum dolor sit amet, consectetuer adipiscing elit.",
+ },
+ {
+ "c_int": 2,
+ "c_pk": 2,
+ "c_varchar": "Up to 256 characters: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies.",
+ },
+ {
+ "c_int": 3,
+ "c_pk": 3,
+ "c_varchar": "Up to 1024 characters: Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu, consequat vitae, eleifend ac, enim. Aliquam lorem ante, dapibus in, viverra quis, feugiat a, tellus. Phasellus viverra nulla ut metus varius laoreet. Quisque rutrum. Aenean imperdiet. Etiam ultricies nisi vel augue. Curabitur ullamcorper ultricies nisi. Nam eget dui. Etiam rhoncus. Maecenas tempus, tellus eget condimentum rhoncus, sem quam semper libero, sit amet adipiscing sem neque sed ipsum.",
+ },
+ ]
+ )
# Test very long texts by string length
record_4k = table_long_texts[3]
@@ -490,43 +529,46 @@ def test_loading_long_text(self):
def test_non_db_friendly_columns(self):
"""Loading non-db friendly columns like, camelcase, minus signs, etc."""
- tap_lines = test_utils.get_test_tap_lines(
- "messages-with-non-db-friendly-columns.json"
- )
-
- # Load with default settings
- target_redshift.persist_lines(self.config, tap_lines)
-
- # Get loaded rows from tables
- redshift = DbSync(self.config)
- target_schema = self.config.get("default_target_schema", "")
- table_non_db_friendly_columns = redshift.query(
- "SELECT * FROM {}.test_table_non_db_friendly_columns ORDER BY c_pk".format(
- target_schema
+ table_non_db_friendly_columns = (
+ self._extracted_from_test_nested_schema_unflattening_3(
+ "messages-with-non-db-friendly-columns.json",
+ "SELECT * FROM {}.test_table_non_db_friendly_columns ORDER BY c_pk",
)
)
-
assert self.remove_metadata_columns_from_rows(
table_non_db_friendly_columns
) == [
- {"c_pk": 1, "camelcasecolumn": "Dummy row 1", "minus-column": "Dummy row 1"},
- {"c_pk": 2, "camelcasecolumn": "Dummy row 2", "minus-column": "Dummy row 2"},
- {"c_pk": 3, "camelcasecolumn": "Dummy row 3", "minus-column": "Dummy row 3"},
- {"c_pk": 4, "camelcasecolumn": "Dummy row 4", "minus-column": "Dummy row 4"},
- {"c_pk": 5, "camelcasecolumn": "Dummy row 5", "minus-column": "Dummy row 5"},
+ {
+ "c_pk": 1,
+ "camelcasecolumn": "Dummy row 1",
+ "minus-column": "Dummy row 1",
+ },
+ {
+ "c_pk": 2,
+ "camelcasecolumn": "Dummy row 2",
+ "minus-column": "Dummy row 2",
+ },
+ {
+ "c_pk": 3,
+ "camelcasecolumn": "Dummy row 3",
+ "minus-column": "Dummy row 3",
+ },
+ {
+ "c_pk": 4,
+ "camelcasecolumn": "Dummy row 4",
+ "minus-column": "Dummy row 4",
+ },
+ {
+ "c_pk": 5,
+ "camelcasecolumn": "Dummy row 5",
+ "minus-column": "Dummy row 5",
+ },
]
def test_nested_schema_unflattening(self):
"""Loading nested JSON objects into VARIANT columns without flattening"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-nested-schema.json")
-
- # Load with default settings - Flattening disabled
- target_redshift.persist_lines(self.config, tap_lines)
-
- # Get loaded rows from tables - Transform JSON to string at query time
- redshift = DbSync(self.config)
- target_schema = self.config.get("default_target_schema", "")
- unflattened_table = redshift.query(
+ unflattened_table = self._extracted_from_test_nested_schema_unflattening_3(
+ "messages-with-nested-schema.json",
"""
SELECT c_pk
,c_array
@@ -534,54 +576,60 @@ def test_nested_schema_unflattening(self):
,c_object_with_props
,c_nested_object
FROM {}.test_table_nested_schema
- ORDER BY c_pk""".format(
- target_schema
- )
+ ORDER BY c_pk""",
)
-
# Should be valid nested JSON strings
- assert self.remove_metadata_columns_from_rows(unflattened_table) == [
- {
- "c_pk": 1,
- "c_array": "[1, 2, 3]",
- "c_object": '{"key_1": "value_1"}',
- "c_object_with_props": '{"key_1": "value_1"}',
- "c_nested_object": '{"nested_prop_1": "nested_value_1", "nested_prop_2": "nested_value_2", "nested_prop_3": {"multi_nested_prop_1": "multi_value_1", "multi_nested_prop_2": "multi_value_2"}}',
- }
- ]
-
- def test_nested_schema_flattening(self):
- """Loading nested JSON objects with flattening and not not flattening"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-nested-schema.json")
-
- # Turning on data flattening
- self.config["data_flattening_max_level"] = 10
+ assert (
+ self.remove_metadata_columns_from_rows(unflattened_table)
+ == [
+ {
+ "c_pk": 1,
+ "c_array": "[1, 2, 3]",
+ "c_object": '{"key_1": "value_1"}',
+ "c_object_with_props": '{"key_1": "value_1"}',
+ "c_nested_object": '{"nested_prop_1": "nested_value_1", "nested_prop_2": "nested_value_2", "nested_prop_3": {"multi_nested_prop_1": "multi_value_1", "multi_nested_prop_2": "multi_value_2"}}',
+ }
+ ]
+ )
- # Load with default settings - Flattening disabled
+ # TODO Rename this here and in `test_loading_unicode_characters`, `test_loading_long_text`, `test_non_db_friendly_columns` and `test_nested_schema_unflattening`
+ def _extracted_from_test_nested_schema_unflattening_3(self, arg0, arg1):
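+ # Shared helper: load the named tap file, persist it with the current config and return the rows for the query template formatted with the target schema.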
+ tap_lines = test_utils.get_test_tap_lines(arg0)
target_redshift.persist_lines(self.config, tap_lines)
+ redshift = DbSync(self.config)
+ target_schema = self.config.get("default_target_schema", "")
+ return redshift.query(arg1.format(target_schema))
+ def test_nested_schema_flattening(self):
+ """Loading nested JSON objects with and without flattening"""
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-nested-schema.json", 10, "data_flattening_max_level"
+ )
+ )
# Get loaded rows from tables
redshift = DbSync(self.config)
target_schema = self.config.get("default_target_schema", "")
flattened_table = redshift.query(
- "SELECT * FROM {}.test_table_nested_schema ORDER BY c_pk".format(
- target_schema
- )
+ f"SELECT * FROM {target_schema}.test_table_nested_schema ORDER BY c_pk"
)
# Should be flattened columns
- assert self.remove_metadata_columns_from_rows(flattened_table) == [
- {
- "c_pk": 1,
- "c_array": "[1, 2, 3]",
- "c_object": None, # Cannot map RECORD to SCHEMA. SCHEMA doesn't have properties that requires for flattening
- "c_object_with_props__key_1": "value_1",
- "c_nested_object__nested_prop_1": "nested_value_1",
- "c_nested_object__nested_prop_2": "nested_value_2",
- "c_nested_object__nested_prop_3__multi_nested_prop_1": "multi_value_1",
- "c_nested_object__nested_prop_3__multi_nested_prop_2": "multi_value_2",
- }
- ]
+ assert (
+ self.remove_metadata_columns_from_rows(flattened_table)
+ == [
+ {
+ "c_pk": 1,
+ "c_array": "[1, 2, 3]",
+ "c_object": None, # Cannot map RECORD to SCHEMA. SCHEMA doesn't have the properties required for flattening
+ "c_object_with_props__key_1": "value_1",
+ "c_nested_object__nested_prop_1": "nested_value_1",
+ "c_nested_object__nested_prop_2": "nested_value_2",
+ "c_nested_object__nested_prop_3__multi_nested_prop_1": "multi_value_1",
+ "c_nested_object__nested_prop_3__multi_nested_prop_2": "multi_value_2",
+ }
+ ]
+ )
def test_column_name_change(self):
"""Tests correct renaming of redshift columns after source change"""
@@ -600,13 +648,13 @@ def test_column_name_change(self):
redshift = DbSync(self.config)
target_schema = self.config.get("default_target_schema", "")
table_one = redshift.query(
- "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema)
+ f"SELECT * FROM {target_schema}.test_table_one ORDER BY c_pk"
)
table_two = redshift.query(
- "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema)
+ f"SELECT * FROM {target_schema}.test_table_two ORDER BY c_pk"
)
table_three = redshift.query(
- "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema)
+ f"SELECT * FROM {target_schema}.test_table_three ORDER BY c_pk"
)
# Get the previous column name from information schema in test_table_two
@@ -619,9 +667,7 @@ def test_column_name_change(self):
AND table_name = 'test_table_two'
ORDER BY ordinal_position
LIMIT 1
- """.format(
- self.config.get("dbname", "").lower(), target_schema.lower()
- )
+ """.format(self.config.get("dbname", "").lower(), target_schema.lower())
)[0]["column_name"]
# Table one should have no changes
@@ -694,14 +740,10 @@ def test_grant_privileges(self):
redshift = DbSync(self.config)
redshift.query("DROP USER IF EXISTS user_1")
redshift.query("DROP USER IF EXISTS user_2")
- try:
+ with contextlib.suppress(Exception):
redshift.query("DROP GROUP group_1") # DROP GROUP has no IF EXISTS
- except:
- pass
- try:
+ with contextlib.suppress(Exception):
redshift.query("DROP GROUP group_2")
- except:
- pass
redshift.query("CREATE USER user_1 WITH PASSWORD 'Abcdefgh1234'")
redshift.query("CREATE USER user_2 WITH PASSWORD 'Abcdefgh1234'")
redshift.query("CREATE GROUP group_1 WITH USER user_1, user_2")
@@ -709,27 +751,21 @@ def test_grant_privileges(self):
# When grantees is a string then privileges should be granted to single user
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
self.config["default_target_schema_select_permissions"] = "user_1"
target_redshift.persist_lines(self.config, tap_lines)
# When grantees is a list then privileges should be granted to list of user
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
self.config["default_target_schema_select_permissions"] = ["user_1", "user_2"]
target_redshift.persist_lines(self.config, tap_lines)
# Grant privileges to list of users
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
self.config["default_target_schema_select_permissions"] = {
"users": ["user_1", "user_2"]
@@ -738,9 +774,7 @@ def test_grant_privileges(self):
# Grant privileges to list of groups
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
self.config["default_target_schema_select_permissions"] = {
"groups": ["group_1", "group_2"]
@@ -749,9 +783,7 @@ def test_grant_privileges(self):
# Grant privileges to mix of list of users and groups
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
self.config["default_target_schema_select_permissions"] = {
"users": ["user_1", "user_2"],
@@ -761,9 +793,7 @@ def test_grant_privileges(self):
# Granting not existing user should raise exception
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
with pytest.raises(Exception):
self.config["default_target_schema_select_permissions"] = {
@@ -773,9 +803,7 @@ def test_grant_privileges(self):
# Granting not existing group should raise exception
redshift.query(
- "DROP SCHEMA IF EXISTS {} CASCADE".format(
- self.config["default_target_schema"]
- )
+ f'DROP SCHEMA IF EXISTS {self.config["default_target_schema"]} CASCADE'
)
with pytest.raises(Exception):
self.config["default_target_schema_select_permissions"] = {
@@ -785,13 +813,13 @@ def test_grant_privileges(self):
def test_custom_copy_options(self):
"""Test loading data with custom copy options"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- # Loading with identical copy option should pass
- self.config[
- "copy_options"
- ] = "EMPTYASNULL TRIMBLANKS FILLRECORD TRUNCATECOLUMNS"
- target_redshift.persist_lines(self.config, tap_lines)
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json",
+ "EMPTYASNULL TRIMBLANKS FILLRECORD TRUNCATECOLUMNS",
+ "copy_options",
+ )
+ )
def test_copy_using_aws_environment(self):
"""Test loading data with aws in the environment rather than explicitly provided access keys"""
@@ -830,248 +858,719 @@ def test_invalid_custom_copy_options(self):
with pytest.raises(Exception):
target_redshift.persist_lines(self.config, tap_lines)
- def test_logical_streams_from_pg_with_hard_delete_and_default_batch_size_should_pass(self):
+ def test_logical_streams_from_pg_with_hard_delete_and_default_batch_size_should_pass(
+ self,
+ ):
"""Tests logical streams from pg with inserts, updates and deletes"""
- tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')
-
- # Turning on hard delete mode
- self.config['hard_delete'] = True
- target_redshift.persist_lines(self.config, tap_lines)
-
+ (
+ self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-pg-logical-streams.json", True, "hard_delete"
+ )
+ )
self.assert_logical_streams_are_in_redshift(should_metadata_columns_exist=True)
- def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5_should_pass(self):
+ def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5_should_pass(
+ self,
+ ):
"""Tests logical streams from pg with inserts, updates and deletes"""
- tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')
+ tap_lines = test_utils.get_test_tap_lines("messages-pg-logical-streams.json")
# Turning on hard delete mode
- self.config['hard_delete'] = True
- self.config['batch_size_rows'] = 5
+ self.config["hard_delete"] = True
+ self.config["batch_size_rows"] = 5
target_redshift.persist_lines(self.config, tap_lines)
self.assert_logical_streams_are_in_redshift(should_metadata_columns_exist=True)
- def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5_and_no_records_should_pass(self):
+ def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5_and_no_records_should_pass(
+ self,
+ ):
"""Tests logical streams from pg with inserts, updates and deletes"""
- tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams-no-records.json')
+ tap_lines = test_utils.get_test_tap_lines(
+ "messages-pg-logical-streams-no-records.json"
+ )
# Turning on hard delete mode
- self.config['hard_delete'] = True
- self.config['batch_size_rows'] = 5
+ self.config["hard_delete"] = True
+ self.config["batch_size_rows"] = 5
target_redshift.persist_lines(self.config, tap_lines)
self.assert_logical_streams_are_in_redshift_and_are_empty()
- @mock.patch('target_redshift.emit_state')
+ @mock.patch("target_redshift.emit_state")
def test_flush_streams_with_no_intermediate_flushes(self, mock_emit_state):
"""Test emitting states when no intermediate flush required"""
mock_emit_state.get.return_value = None
- tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')
+ tap_lines = test_utils.get_test_tap_lines("messages-pg-logical-streams.json")
# Set batch size big enough to never have to flush in the middle
- self.config['hard_delete'] = True
- self.config['batch_size_rows'] = 1000
+ self.config["hard_delete"] = True
+ self.config["batch_size_rows"] = 1000
target_redshift.persist_lines(self.config, tap_lines)
# State should be emitted only once with the latest received STATE message
- assert mock_emit_state.mock_calls == \
- [
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}})
- ]
+ assert mock_emit_state.mock_calls == [
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ )
+ ]
# Every table should be loaded correctly
self.assert_logical_streams_are_in_redshift(should_metadata_columns_exist=True)
- @mock.patch('target_redshift.emit_state')
+ @mock.patch("target_redshift.emit_state")
def test_flush_streams_with_intermediate_flushes(self, mock_emit_state):
"""Test emitting states when intermediate flushes required"""
mock_emit_state.get.return_value = None
- tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')
+ tap_lines = test_utils.get_test_tap_lines("messages-pg-logical-streams.json")
# Set batch size small enough to trigger multiple stream flushes
- self.config['hard_delete'] = True
- self.config['batch_size_rows'] = 10
+ self.config["hard_delete"] = True
+ self.config["batch_size_rows"] = 10
target_redshift.persist_lines(self.config, tap_lines)
        # State should be emitted multiple times, updating positions only for the streams that were flushed
- assert mock_emit_state.call_args_list == \
- [
- # Flush #1 - Flushed edgydata until lsn: 108197216
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #2 - Flushed logical1-logical1_table2 until lsn: 108201336
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #3 - Flushed logical1-logical1_table2 until lsn: 108237600
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #4 - Flushed logical1-logical1_table2 until lsn: 108238768
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #5 - Flushed logical1-logical1_table2 until lsn: 108239704,
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108196176, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #6 - Last flush, update every stream lsn: 108240872,
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- ]
+ assert mock_emit_state.call_args_list == [
+ # Flush #1 - Flushed edgydata until lsn: 108197216
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #2 - Flushed logical1-logical1_table2 until lsn: 108201336
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108201336,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #3 - Flushed logical1-logical1_table2 until lsn: 108237600
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108237600,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #4 - Flushed logical1-logical1_table2 until lsn: 108238768
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108238768,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+            # Flush #5 - Flushed logical1-logical1_table2 until lsn: 108239896
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108239896,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108196176,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #6 - Last flush, update every stream lsn: 108240872,
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ ]
# Every table should be loaded correctly
self.assert_logical_streams_are_in_redshift(should_metadata_columns_exist=True)
def test_record_validation(self):
"""Test validating records"""
- tap_lines = test_utils.get_test_tap_lines('messages-with-invalid-records.json')
+ tap_lines = test_utils.get_test_tap_lines("messages-with-invalid-records.json")
# Loading invalid records when record validation enabled should fail at ...
- self.config['validate_records'] = True
+ self.config["validate_records"] = True
with pytest.raises(RecordValidationException):
target_redshift.persist_lines(self.config, tap_lines)
# Loading invalid records when record validation disabled should fail at load time
- self.config['validate_records'] = False
+ self.config["validate_records"] = False
with pytest.raises(InternalError):
target_redshift.persist_lines(self.config, tap_lines)
- @mock.patch('target_redshift.emit_state')
- def test_flush_streams_with_intermediate_flushes_on_all_streams(self, mock_emit_state):
+ @mock.patch("target_redshift.emit_state")
+ def test_flush_streams_with_intermediate_flushes_on_all_streams(
+ self, mock_emit_state
+ ):
"""Test emitting states when intermediate flushes required and flush_all_streams is enabled"""
mock_emit_state.get.return_value = None
- tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')
+ tap_lines = test_utils.get_test_tap_lines("messages-pg-logical-streams.json")
# Set batch size small enough to trigger multiple stream flushes
- self.config['hard_delete'] = True
- self.config['batch_size_rows'] = 10
- self.config['flush_all_streams'] = True
+ self.config["hard_delete"] = True
+ self.config["batch_size_rows"] = 10
+ self.config["flush_all_streams"] = True
target_redshift.persist_lines(self.config, tap_lines)
# State should be emitted 6 times, flushing every stream and updating every stream position
- assert mock_emit_state.call_args_list == \
- [
- # Flush #1 - Flush every stream until lsn: 108197216
- mock.call({"currently_syncing": None, "bookmarks": {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108197216, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #2 - Flush every stream until lsn 108201336
- mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108201336, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #3 - Flush every stream until lsn: 108237600
- mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108237600, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #4 - Flush every stream until lsn: 108238768
- mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108238768, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #5 - Flush every stream until lsn: 108239704,
- mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108239896, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- # Flush #6 - Last flush, update every stream until lsn: 108240872,
- mock.call({'currently_syncing': None, 'bookmarks': {
- "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723596, "xmin": None},
- "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723618, "xmin": None},
- "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723635, "xmin": None},
- "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723651, "xmin": None},
- "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
- "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
- "public2-wearehere": {}}}),
- ]
+ assert mock_emit_state.call_args_list == [
+ # Flush #1 - Flush every stream until lsn: 108197216
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108197216,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #2 - Flush every stream until lsn 108201336
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108201336,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108201336,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108201336,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108201336,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #3 - Flush every stream until lsn: 108237600
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108237600,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108237600,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108237600,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108237600,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #4 - Flush every stream until lsn: 108238768
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108238768,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108238768,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108238768,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108238768,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+            # Flush #5 - Flush every stream until lsn: 108239896
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108239896,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108239896,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108239896,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108239896,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ # Flush #6 - Last flush, update every stream until lsn: 108240872,
+ mock.call(
+ {
+ "currently_syncing": None,
+ "bookmarks": {
+ "logical1-logical1_edgydata": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723596,
+ "xmin": None,
+ },
+ "logical1-logical1_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723618,
+ "xmin": None,
+ },
+ "logical1-logical1_table2": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723635,
+ "xmin": None,
+ },
+ "logical2-logical2_table1": {
+ "last_replication_method": "LOG_BASED",
+ "lsn": 108240872,
+ "version": 1570922723651,
+ "xmin": None,
+ },
+ "public-city": {
+ "last_replication_method": "INCREMENTAL",
+ "replication_key": "id",
+ "version": 1570922723667,
+ "replication_key_value": 4079,
+ },
+ "public-country": {
+ "last_replication_method": "FULL_TABLE",
+ "version": 1570922730456,
+ "xmin": None,
+ },
+ "public2-wearehere": {},
+ },
+ }
+ ),
+ ]
# Every table should be loaded correctly
self.assert_logical_streams_are_in_redshift(should_metadata_columns_exist=True)
def test_loading_tables_with_skip_updates(self):
"""Loading records with existing primary keys but skip updates"""
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams.json")
-
- # Turn on skip_updates mode
- self.config["skip_updates"] = True
- target_redshift.persist_lines(self.config, tap_lines)
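+        # Turn on skip_updates mode and load the initial three streams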
+ tap_lines = self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+ "messages-with-three-streams.json", True, "skip_updates"
+ )
self.assert_three_streams_are_loaded_in_redshift()
# Load some new records with upserts
- tap_lines = test_utils.get_test_tap_lines("messages-with-three-streams-upserts.json")
+ tap_lines = test_utils.get_test_tap_lines(
+ "messages-with-three-streams-upserts.json"
+ )
target_redshift.persist_lines(self.config, tap_lines)
self.assert_three_streams_are_loaded_in_redshift(should_skip_updates=True)
def test_loading_tables_with_custom_temp_dir(self):
"""Loading multiple tables from the same input tap using custom temp directory"""
- tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')
-
- # Setting custom temp_dir
- self.config['temp_dir'] = ('~/.pipelinewise/tmp')
- target_redshift.persist_lines(self.config, tap_lines)
-
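+        # Load the three streams using a custom temp_dir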
+        self._extracted_from_test_loading_tables_with_custom_temp_dir_3(
+            "messages-with-three-streams.json",
+            "~/.pipelinewise/tmp",
+            "temp_dir",
+        )
self.assert_three_streams_are_loaded_in_redshift()
+
+ # TODO Rename this here and in `test_loading_tables`, `test_loading_tables_with_metadata_columns`, `test_loading_tables_with_defined_parallelism`, `test_loading_tables_with_defined_slice_number`, `test_loading_tables_with_gzip_compression`, `test_loading_tables_with_bz2_compression`, `test_loading_tables_with_hard_delete`, `test_loading_table_with_reserved_word_as_name_and_hard_delete`, `test_loading_table_with_space`, `test_nested_schema_flattening`, `test_custom_copy_options`, `test_logical_streams_from_pg_with_hard_delete_and_default_batch_size_should_pass`, `test_loading_tables_with_skip_updates` and `test_loading_tables_with_custom_temp_dir`
+    def _extracted_from_test_loading_tables_with_custom_temp_dir_3(
+        self, filename, config_value, config_key
+    ):
+        """Load tap lines from `filename`, set `self.config[config_key]` to `config_value`, persist and return the lines"""
+        tap_lines = test_utils.get_test_tap_lines(filename)
+        self.config[config_key] = config_value
+        target_redshift.persist_lines(self.config, tap_lines)
+        return tap_lines
+ return result
diff --git a/tests/integration/utils.py b/tests/integration/utils.py
index 395e83f..dfcce6b 100644
--- a/tests/integration/utils.py
+++ b/tests/integration/utils.py
@@ -1,58 +1,45 @@
import os
-import json
def get_db_config():
- config = {}
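+    # Redshift, AWS IAM and S3 settings must be provided via TARGET_REDSHIFT_* environment variables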
+ config = {"host": os.environ.get("TARGET_REDSHIFT_HOST")}
- # --------------------------------------------------------------------------
- # Default configuration settings for integration tests.
- # --------------------------------------------------------------------------
- # The following values needs to be defined in environment variables with
- # valid details to a Redshift cluster, AWS IAM role and an S3 bucket
- # --------------------------------------------------------------------------
- # Redshift cluster
- config['host'] = os.environ.get('TARGET_REDSHIFT_HOST')
- config['port'] = os.environ.get('TARGET_REDSHIFT_PORT')
- config['user'] = os.environ.get('TARGET_REDSHIFT_USER')
- config['password'] = os.environ.get('TARGET_REDSHIFT_PASSWORD')
- config['dbname'] = os.environ.get('TARGET_REDSHIFT_DBNAME')
- config['default_target_schema'] = os.environ.get("TARGET_REDSHIFT_SCHEMA")
+ config["port"] = os.environ.get("TARGET_REDSHIFT_PORT")
+ config["user"] = os.environ.get("TARGET_REDSHIFT_USER")
+ config["password"] = os.environ.get("TARGET_REDSHIFT_PASSWORD")
+ config["dbname"] = os.environ.get("TARGET_REDSHIFT_DBNAME")
+ config["default_target_schema"] = os.environ.get("TARGET_REDSHIFT_SCHEMA")
# AWS IAM and S3 bucket
- config['aws_access_key_id'] = os.environ.get('TARGET_REDSHIFT_AWS_ACCESS_KEY')
- config['aws_secret_access_key'] = os.environ.get('TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY')
- config['s3_acl'] = os.environ.get('TARGET_REDSHIFT_S3_ACL')
- config['s3_bucket'] = os.environ.get('TARGET_REDSHIFT_S3_BUCKET')
- config['s3_key_prefix'] = os.environ.get('TARGET_REDSHIFT_S3_KEY_PREFIX')
-
+ config["aws_access_key_id"] = os.environ.get("TARGET_REDSHIFT_AWS_ACCESS_KEY")
+ config["aws_secret_access_key"] = os.environ.get(
+ "TARGET_REDSHIFT_AWS_SECRET_ACCESS_KEY"
+ )
+ config["s3_acl"] = os.environ.get("TARGET_REDSHIFT_S3_ACL")
+ config["s3_bucket"] = os.environ.get("TARGET_REDSHIFT_S3_BUCKET")
+ config["s3_key_prefix"] = os.environ.get("TARGET_REDSHIFT_S3_KEY_PREFIX")
# --------------------------------------------------------------------------
    # The following variables need to be empty.
    # The test cases will set them automatically whenever needed
# --------------------------------------------------------------------------
- config['disable_table_cache'] = None
- config['schema_mapping'] = None
- config['add_metadata_columns'] = None
- config['hard_delete'] = None
- config['aws_redshift_copy_role_arn'] = None
- config['flush_all_streams'] = None
- config['validate_records'] = None
+ config["disable_table_cache"] = None
+ config["schema_mapping"] = None
+ config["add_metadata_columns"] = None
+ config["hard_delete"] = None
+ config["aws_redshift_copy_role_arn"] = None
+ config["flush_all_streams"] = None
+ config["validate_records"] = None
return config
def get_test_config():
- db_config = get_db_config()
-
- return db_config
+ return get_db_config()
def get_test_tap_lines(filename):
lines = []
- with open('{}/resources/{}'.format(os.path.dirname(__file__), filename)) as tap_stdout:
- for line in tap_stdout.readlines():
- lines.append(line)
-
+ with open(f"{os.path.dirname(__file__)}/resources/{filename}") as tap_stdout:
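+        # Collect every line of the recorded tap output; each line is a complete Singer message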
+ lines.extend(iter(tap_stdout))
return lines
-
diff --git a/tests/unit/test_db_sync.py b/tests/unit/test_db_sync.py
index 8808b7f..a35b423 100644
--- a/tests/unit/test_db_sync.py
+++ b/tests/unit/test_db_sync.py
@@ -1,33 +1,31 @@
-import pytest
import target_redshift
-class TestTargetRedshift(object):
+class TestTargetRedshift:
"""
Unit Tests for PipelineWise Target Redshift
"""
+
def setup_method(self):
self.config = {}
-
def teardown_method(self):
pass
-
def test_config_validation(self):
"""Test configuration validator"""
validator = target_redshift.db_sync.validate_config
empty_config = {}
minimal_config = {
- 'host': "dummy-value",
- 'port': 5439,
- 'user': "dummy-value",
- 'password': "dummy-value",
- 'dbname': "dummy-value",
- 'aws_access_key_id': "dummy-value",
- 'aws_secret_access_key': "dummy-value",
- 's3_bucket': "dummy-value",
- 'default_target_schema': "dummy-value"
+ "host": "dummy-value",
+ "port": 5439,
+ "user": "dummy-value",
+ "password": "dummy-value",
+ "dbname": "dummy-value",
+ "aws_access_key_id": "dummy-value",
+ "aws_secret_access_key": "dummy-value",
+ "s3_bucket": "dummy-value",
+ "default_target_schema": "dummy-value",
}
# Config validator returns a list of errors
@@ -36,90 +34,105 @@ def test_config_validation(self):
# Empty configuration should fail - (nr_of_errors > 0)
assert len(validator(empty_config)) > 0
- # Minimal configuratino should pass - (nr_of_errors == 0)
+ # Minimal configuration should pass - (nr_of_errors == 0)
assert len(validator(minimal_config)) == 0
# Configuration without schema references - (nr_of_errors >= 0)
config_with_no_schema = minimal_config.copy()
- config_with_no_schema.pop('default_target_schema')
+ config_with_no_schema.pop("default_target_schema")
assert len(validator(config_with_no_schema)) > 0
# Configuration with schema mapping - (nr_of_errors == 0)
config_with_schema_mapping = minimal_config.copy()
- config_with_schema_mapping.pop('default_target_schema')
- config_with_schema_mapping['schema_mapping'] = {
- "dummy_stream": {
- "target_schema": "dummy_schema"
- }
+ config_with_schema_mapping.pop("default_target_schema")
+ config_with_schema_mapping["schema_mapping"] = {
+ "dummy_stream": {"target_schema": "dummy_schema"}
}
assert len(validator(config_with_schema_mapping)) == 0
-
def test_column_type_mapping(self):
"""Test JSON type to Redshift column type mappings"""
mapper = target_redshift.db_sync.column_type
# Incoming JSON schema types
- json_str = {"type": ["string"] }
- json_str_or_null = {"type": ["string", "null"] }
- json_dt = {"type": ["string"] , "format": "date-time"}
- json_dt_or_null = {"type": ["string", "null"] , "format": "date-time"}
- json_t = {"type": ["string"] , "format": "time"}
- json_t_or_null = {"type": ["string", "null"] , "format": "time"}
- json_num = {"type": ["number"] }
- json_int = {"type": ["integer"] }
- json_int_or_str = {"type": ["integer", "string"] }
- json_bool = {"type": ["boolean"] }
- json_obj = {"type": ["object"] }
- json_arr = {"type": ["array"] }
-
- # Mapping from JSON schema types ot Redshift column types
- assert mapper(json_str) == 'character varying(10000)'
- assert mapper(json_str_or_null) == 'character varying(10000)'
- assert mapper(json_dt) == 'timestamp without time zone'
- assert mapper(json_dt_or_null) == 'timestamp without time zone'
- assert mapper(json_t) == 'character varying(256)'
- assert mapper(json_t_or_null) == 'character varying(256)'
- assert mapper(json_num) == 'double precision'
- assert mapper(json_int) == 'numeric'
- assert mapper(json_int_or_str) == 'character varying(65535)'
- assert mapper(json_bool) == 'boolean'
- assert mapper(json_obj) == 'character varying(65535)'
- assert mapper(json_arr) == 'character varying(65535)'
+ json_str = {"type": ["string"]}
+ json_str_or_null = {"type": ["string", "null"]}
+ json_dt = {"type": ["string"], "format": "date-time"}
+ json_dt_or_null = {"type": ["string", "null"], "format": "date-time"}
+ json_t = {"type": ["string"], "format": "time"}
+ json_t_or_null = {"type": ["string", "null"], "format": "time"}
+ json_num = {"type": ["number"]}
+ json_int = {"type": ["integer"]}
+ json_int_or_str = {"type": ["integer", "string"]}
+ json_bool = {"type": ["boolean"]}
+ json_obj = {"type": ["object"]}
+ json_arr = {"type": ["array"]}
+        # Mapping from JSON schema types to Redshift column types
+ assert mapper(json_str) == "character varying(10000)"
+ assert mapper(json_str_or_null) == "character varying(10000)"
+ assert mapper(json_dt) == "timestamp without time zone"
+ assert mapper(json_dt_or_null) == "timestamp without time zone"
+ assert mapper(json_t) == "character varying(256)"
+ assert mapper(json_t_or_null) == "character varying(256)"
+ assert mapper(json_num) == "double precision"
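+        # Integer columns are expected to map to BIGINT; object and array columns to the SUPER type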
+ assert mapper(json_int) == "bigint"
+ assert mapper(json_int_or_str) == "character varying(65535)"
+ assert mapper(json_bool) == "boolean"
+ assert mapper(json_obj) == "super"
+ assert mapper(json_arr) == "super"
def test_stream_name_to_dict(self):
"""Test identifying catalog, schema and table names from fully qualified stream and table names"""
# Singer stream name format (Default '-' separator)
- assert \
- target_redshift.db_sync.stream_name_to_dict('my_table') == \
- {"catalog_name": None, "schema_name": None, "table_name": "my_table"}
+ assert target_redshift.db_sync.stream_name_to_dict("my_table") == {
+ "catalog_name": None,
+ "schema_name": None,
+ "table_name": "my_table",
+ }
# Singer stream name format (Default '-' separator)
- assert \
- target_redshift.db_sync.stream_name_to_dict('my_schema-my_table') == \
- {"catalog_name": None, "schema_name": "my_schema", "table_name": "my_table"}
+ assert target_redshift.db_sync.stream_name_to_dict("my_schema-my_table") == {
+ "catalog_name": None,
+ "schema_name": "my_schema",
+ "table_name": "my_table",
+ }
# Singer stream name format (Default '-' separator)
- assert \
- target_redshift.db_sync.stream_name_to_dict('my_catalog-my_schema-my_table') == \
- {"catalog_name": "my_catalog", "schema_name": "my_schema", "table_name": "my_table"}
+ assert target_redshift.db_sync.stream_name_to_dict(
+ "my_catalog-my_schema-my_table"
+ ) == {
+ "catalog_name": "my_catalog",
+ "schema_name": "my_schema",
+ "table_name": "my_table",
+ }
# Redshift table format (Custom '.' separator)
- assert \
- target_redshift.db_sync.stream_name_to_dict('my_table', separator='.') == \
- {"catalog_name": None, "schema_name": None, "table_name": "my_table"}
+ assert target_redshift.db_sync.stream_name_to_dict(
+ "my_table", separator="."
+ ) == {
+ "catalog_name": None,
+ "schema_name": None,
+ "table_name": "my_table",
+ }
# Redshift table format (Custom '.' separator)
- assert \
- target_redshift.db_sync.stream_name_to_dict('my_schema.my_table', separator='.') == \
- {"catalog_name": None, "schema_name": "my_schema", "table_name": "my_table"}
+ assert target_redshift.db_sync.stream_name_to_dict(
+ "my_schema.my_table", separator="."
+ ) == {
+ "catalog_name": None,
+ "schema_name": "my_schema",
+ "table_name": "my_table",
+ }
# Redshift table format (Custom '.' separator)
- assert \
- target_redshift.db_sync.stream_name_to_dict('my_catalog.my_schema.my_table', separator='.') == \
- {"catalog_name": "my_catalog", "schema_name": "my_schema", "table_name": "my_table"}
-
+ assert target_redshift.db_sync.stream_name_to_dict(
+ "my_catalog.my_schema.my_table", separator="."
+ ) == {
+ "catalog_name": "my_catalog",
+ "schema_name": "my_schema",
+ "table_name": "my_table",
+ }
def test_flatten_schema(self):
"""Test flattening of SCHEMA messages"""
@@ -134,9 +147,11 @@ def test_flatten_schema(self):
"properties": {
"c_pk": {"type": ["null", "integer"]},
"c_varchar": {"type": ["null", "string"]},
- "c_int": {"type": ["null", "integer"]}}}
- # NO FLATTENNING - Schema with simple properties should be a plain dictionary
- assert flatten_schema(not_nested_schema) == not_nested_schema['properties']
+ "c_int": {"type": ["null", "integer"]},
+ },
+ }
+ # NO FLATTENING - Schema with simple properties should be a plain dictionary
+ assert flatten_schema(not_nested_schema) == not_nested_schema["properties"]
nested_schema_with_no_properties = {
"type": "object",
@@ -144,9 +159,14 @@ def test_flatten_schema(self):
"c_pk": {"type": ["null", "integer"]},
"c_varchar": {"type": ["null", "string"]},
"c_int": {"type": ["null", "integer"]},
- "c_obj": {"type": ["null", "object"]}}}
- # NO FLATTENNING - Schema with object type property but without further properties should be a plain dictionary
- assert flatten_schema(nested_schema_with_no_properties) == nested_schema_with_no_properties['properties']
+ "c_obj": {"type": ["null", "object"]},
+ },
+ }
+ # NO FLATTENING - Schema with object type property but without further properties should be a plain dictionary
+ assert (
+ flatten_schema(nested_schema_with_no_properties)
+ == nested_schema_with_no_properties["properties"]
+ )
nested_schema_with_properties = {
"type": "object",
@@ -163,52 +183,53 @@ def test_flatten_schema(self):
"type": ["null", "object"],
"properties": {
"multi_nested_prop1": {"type": ["null", "string"]},
- "multi_nested_prop2": {"type": ["null", "string"]}
- }
- }
- }
- }
- }
+ "multi_nested_prop2": {"type": ["null", "string"]},
+ },
+ },
+ },
+ },
+ },
}
- # NO FLATTENNING - Schema with object type property but without further properties should be a plain dictionary
+        # NO FLATTENING - Schema with nested object properties should be returned unchanged
# No flattening (default)
- assert flatten_schema(nested_schema_with_properties) == nested_schema_with_properties['properties']
+ assert (
+ flatten_schema(nested_schema_with_properties)
+ == nested_schema_with_properties["properties"]
+ )
- # NO FLATTENNING - Schema with object type property but without further properties should be a plain dictionary
+        # NO FLATTENING - Schema with nested object properties should be returned unchanged
# max_level: 0 : No flattening (default)
- assert flatten_schema(nested_schema_with_properties, max_level=0) == nested_schema_with_properties['properties']
-
- # FLATTENNING - Schema with object type property but without further properties should be a dict with flattened properties
- assert \
- flatten_schema(nested_schema_with_properties, max_level=1) == \
- {
- 'c_pk': {'type': ['null', 'integer']},
- 'c_varchar': {'type': ['null', 'string']},
- 'c_int': {'type': ['null', 'integer']},
- 'c_obj__nested_prop1': {'type': ['null', 'string']},
- 'c_obj__nested_prop2': {'type': ['null', 'string']},
- 'c_obj__nested_prop3': {
- 'type': ['null', 'object'],
- "properties": {
- "multi_nested_prop1": {"type": ["null", "string"]},
- "multi_nested_prop2": {"type": ["null", "string"]}
- }
- }
+ assert (
+ flatten_schema(nested_schema_with_properties, max_level=0)
+ == nested_schema_with_properties["properties"]
+ )
+
+        # FLATTENING - Nested object properties should be flattened one level deep when max_level is 1
+ assert flatten_schema(nested_schema_with_properties, max_level=1) == {
+ "c_pk": {"type": ["null", "integer"]},
+ "c_varchar": {"type": ["null", "string"]},
+ "c_int": {"type": ["null", "integer"]},
+ "c_obj__nested_prop1": {"type": ["null", "string"]},
+ "c_obj__nested_prop2": {"type": ["null", "string"]},
+ "c_obj__nested_prop3": {
+ "type": ["null", "object"],
+ "properties": {
+ "multi_nested_prop1": {"type": ["null", "string"]},
+ "multi_nested_prop2": {"type": ["null", "string"]},
+ },
+ },
}
- # FLATTENNING - Schema with object type property but without further properties should be a dict with flattened properties
- assert \
- flatten_schema(nested_schema_with_properties, max_level=10) == \
- {
- 'c_pk': {'type': ['null', 'integer']},
- 'c_varchar': {'type': ['null', 'string']},
- 'c_int': {'type': ['null', 'integer']},
- 'c_obj__nested_prop1': {'type': ['null', 'string']},
- 'c_obj__nested_prop2': {'type': ['null', 'string']},
- 'c_obj__nested_prop3__multi_nested_prop1': {'type': ['null', 'string']},
- 'c_obj__nested_prop3__multi_nested_prop2': {'type': ['null', 'string']}
- }
-
+        # FLATTENING - Nested object properties should be flattened recursively when max_level is high enough
+ assert flatten_schema(nested_schema_with_properties, max_level=10) == {
+ "c_pk": {"type": ["null", "integer"]},
+ "c_varchar": {"type": ["null", "string"]},
+ "c_int": {"type": ["null", "integer"]},
+ "c_obj__nested_prop1": {"type": ["null", "string"]},
+ "c_obj__nested_prop2": {"type": ["null", "string"]},
+ "c_obj__nested_prop3__multi_nested_prop1": {"type": ["null", "string"]},
+ "c_obj__nested_prop3__multi_nested_prop2": {"type": ["null", "string"]},
+ }
def test_flatten_record(self):
"""Test flattening of RECORD messages"""
@@ -219,7 +240,7 @@ def test_flatten_record(self):
assert flatten_record(empty_record) == {}
not_nested_record = {"c_pk": 1, "c_varchar": "1", "c_int": 1}
- # NO FLATTENNING - Record with simple properties should be a plain dictionary
+ # NO FLATTENING - Record with simple properties should be a plain dictionary
assert flatten_record(not_nested_record) == not_nested_record
nested_record = {
@@ -232,93 +253,70 @@ def test_flatten_record(self):
"nested_prop3": {
"multi_nested_prop1": "multi_value_1",
"multi_nested_prop2": "multi_value_2",
- }}}
+ },
+ },
+ }
- # NO FLATTENNING - No flattening (default)
- assert \
- flatten_record(nested_record) == \
- {
+ # NO FLATTENING - No flattening (default)
+ assert (
+ flatten_record(nested_record)
+ == {
"c_pk": 1,
"c_varchar": "1",
"c_int": 1,
- "c_obj": '{"nested_prop1": "value_1", "nested_prop2": "value_2", "nested_prop3": {"multi_nested_prop1": "multi_value_1", "multi_nested_prop2": "multi_value_2"}}'
+ "c_obj": '{"nested_prop1": "value_1", "nested_prop2": "value_2", "nested_prop3": {"multi_nested_prop1": "multi_value_1", "multi_nested_prop2": "multi_value_2"}}',
}
+ )
- # NO FLATTENNING
+ # NO FLATTENING
# max_level: 0 : No flattening (default)
- assert \
- flatten_record(nested_record, max_level=0) == \
- {
+ assert (
+ flatten_record(nested_record, max_level=0)
+ == {
"c_pk": 1,
"c_varchar": "1",
"c_int": 1,
- "c_obj": '{"nested_prop1": "value_1", "nested_prop2": "value_2", "nested_prop3": {"multi_nested_prop1": "multi_value_1", "multi_nested_prop2": "multi_value_2"}}'
+ "c_obj": '{"nested_prop1": "value_1", "nested_prop2": "value_2", "nested_prop3": {"multi_nested_prop1": "multi_value_1", "multi_nested_prop2": "multi_value_2"}}',
}
+ )
- # SEMI FLATTENNING
+ # SEMI FLATTENING
# max_level: 1 : Semi-flattening (default)
- assert \
- flatten_record(nested_record, max_level=1) == \
- {
+ assert (
+ flatten_record(nested_record, max_level=1)
+ == {
"c_pk": 1,
"c_varchar": "1",
"c_int": 1,
"c_obj__nested_prop1": "value_1",
"c_obj__nested_prop2": "value_2",
- "c_obj__nested_prop3": '{"multi_nested_prop1": "multi_value_1", "multi_nested_prop2": "multi_value_2"}'
+ "c_obj__nested_prop3": '{"multi_nested_prop1": "multi_value_1", "multi_nested_prop2": "multi_value_2"}',
}
+ )
- # FLATTENNING
- assert \
- flatten_record(nested_record, max_level=10) == \
- {
- "c_pk": 1,
- "c_varchar": "1",
- "c_int": 1,
- "c_obj__nested_prop1": "value_1",
- "c_obj__nested_prop2": "value_2",
- "c_obj__nested_prop3__multi_nested_prop1": "multi_value_1",
- "c_obj__nested_prop3__multi_nested_prop2": "multi_value_2"
- }
+ # FLATTENING
+ assert flatten_record(nested_record, max_level=10) == {
+ "c_pk": 1,
+ "c_varchar": "1",
+ "c_int": 1,
+ "c_obj__nested_prop1": "value_1",
+ "c_obj__nested_prop2": "value_2",
+ "c_obj__nested_prop3__multi_nested_prop1": "multi_value_1",
+ "c_obj__nested_prop3__multi_nested_prop2": "multi_value_2",
+ }
def test_flatten_record_with_flatten_schema(self):
flatten_record = target_redshift.db_sync.flatten_record
- flatten_schema = {
- "id": {
- "type": [
- "object",
- "array",
- "null"
- ]
- }
- }
+ flatten_schema = {"id": {"type": ["object", "array", "null"]}}
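+        # With a flatten schema supplied, values of object/array typed columns are serialised to strings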
test_cases = [
- (
- True,
- {
- "id": 1,
- "data": "xyz"
- },
- {
- "id": "1",
- "data": "xyz"
- }
- ),
- (
- False,
- {
- "id": 1,
- "data": "xyz"
- },
- {
- "id": 1,
- "data": "xyz"
- }
- )
+ (True, {"id": 1, "data": "xyz"}, {"id": "1", "data": "xyz"}),
+ (False, {"id": 1, "data": "xyz"}, {"id": 1, "data": "xyz"}),
]
- for idx, (should_use_flatten_schema, record, expected_output) in enumerate(test_cases):
- output = flatten_record(record, flatten_schema if should_use_flatten_schema else None)
+ for should_use_flatten_schema, record, expected_output in test_cases:
+ output = flatten_record(
+ record, flatten_schema if should_use_flatten_schema else None
+ )
assert output == expected_output
diff --git a/tests/unit/test_target_rs.py b/tests/unit/test_target_rs.py
index 1e39212..3670cfb 100644
--- a/tests/unit/test_target_rs.py
+++ b/tests/unit/test_target_rs.py
@@ -1,23 +1,22 @@
-import pytest
-import mock
import os
+from unittest import mock
import target_redshift
-class TestTargetRedshift(object):
-
+class TestTargetRedshift:
def setup_method(self):
self.config = {}
- @mock.patch('target_redshift.flush_streams')
- @mock.patch('target_redshift.DbSync')
- def test_persist_lines_with_40_records_and_batch_size_of_20_expect_flushing_once(self, dbSync_mock,
- flush_streams_mock):
- self.config['batch_size_rows'] = 20
- self.config['flush_all_streams'] = True
+ @mock.patch("target_redshift.flush_streams")
+ @mock.patch("target_redshift.DbSync")
+ def test_persist_lines_with_40_records_and_batch_size_of_20_expect_flushing_once(
+ self, dbSync_mock, flush_streams_mock
+ ):
+ self.config["batch_size_rows"] = 20
+ self.config["flush_all_streams"] = True
- with open(f'{os.path.dirname(__file__)}/resources/logical-streams.json', 'r') as f:
+ with open(f"{os.path.dirname(__file__)}/resources/logical-streams.json") as f:
lines = f.readlines()
instance = dbSync_mock.return_value