diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 52fc0e706..3fcfe3d37 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -35,6 +35,10 @@ body: required: false - label: dbt-athena-community required: false + - label: dbt-bigquery + required: false + - label: dbt-spark + required: false - type: textarea attributes: label: Current Behavior diff --git a/.github/ISSUE_TEMPLATE/regression-report.yml b/.github/ISSUE_TEMPLATE/regression-report.yml index 95f073ca9..1df804d6e 100644 --- a/.github/ISSUE_TEMPLATE/regression-report.yml +++ b/.github/ISSUE_TEMPLATE/regression-report.yml @@ -30,6 +30,10 @@ body: required: false - label: dbt-athena-community required: false + - label: dbt-bigquery + required: false + - label: dbt-spark + required: false - type: textarea attributes: label: Current Behavior diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 5b5c4e85b..cea36c85c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,6 +6,7 @@ updates: - "/dbt-tests-adapter" - "/dbt-athena" - "/dbt-athena-community" + - "/dbt-bigquery" - "/dbt-spark" schedule: interval: "daily" @@ -15,7 +16,9 @@ updates: update-types: - version-update:semver-patch - package-ecosystem: "docker" - directory: "dbt-spark/docker" + directories: + - "/dbt-bigquery/docker" + - "/dbt-spark/docker" schedule: interval: "weekly" rebase-strategy: "disabled" diff --git a/.github/workflows/_generate-changelog.yml b/.github/workflows/_generate-changelog.yml index be9ffaf6e..139552568 100644 --- a/.github/workflows/_generate-changelog.yml +++ b/.github/workflows/_generate-changelog.yml @@ -35,6 +35,7 @@ on: - "dbt-adapters" - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" merge: description: "Choose whether to merge the changelog branch" diff --git a/.github/workflows/_integration-tests.yml b/.github/workflows/_integration-tests.yml index 307285228..c6a40f13b 100644 --- a/.github/workflows/_integration-tests.yml +++ b/.github/workflows/_integration-tests.yml @@ -31,6 +31,7 @@ on: options: - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" branch: description: "Choose the branch to test" @@ -96,6 +97,36 @@ jobs: - run: hatch run integration-tests working-directory: ./${{ inputs.package }} + integration-tests-bigquery: + if: ${{ inputs.package == 'dbt-bigquery' }} + runs-on: ${{ inputs.os }} + environment: + name: "dbt-bigquery" + env: + BIGQUERY_TEST_SERVICE_ACCOUNT_JSON: ${{ secrets.BIGQUERY_TEST_SERVICE_ACCOUNT_JSON }} + BIGQUERY_TEST_ALT_DATABASE: ${{ vars.BIGQUERY_TEST_ALT_DATABASE }} + BIGQUERY_TEST_NO_ACCESS_DATABASE: ${{ vars.BIGQUERY_TEST_NO_ACCESS_DATABASE }} + DBT_TEST_USER_1: ${{ vars.DBT_TEST_USER_1 }} + DBT_TEST_USER_2: ${{ vars.DBT_TEST_USER_2 }} + DBT_TEST_USER_3: ${{ vars.DBT_TEST_USER_3 }} + DATAPROC_REGION: ${{ vars.DATAPROC_REGION }} + DATAPROC_CLUSTER_NAME: ${{ vars.DATAPROC_CLUSTER_NAME }} + GCS_BUCKET: ${{ vars.GCS_BUCKET }} + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ inputs.branch }} + repository: ${{ inputs.repository }} + - uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + - uses: pypa/hatch@install + - run: hatch run integration-tests tests/functional -k "not TestPython" + working-directory: ./${{ inputs.package }} + - run: hatch run integration-tests tests/functional -n1 -k "TestPython" + if: ${{ inputs.python-version == '3.9' }} # we only run this for one version to run in series + 
working-directory: ./${{ inputs.package }} + integration-tests-spark: if: ${{ inputs.package == 'dbt-spark' }} runs-on: ${{ inputs.os }} diff --git a/.github/workflows/_publish-internal.yml b/.github/workflows/_publish-internal.yml index 7a6f34738..7da5a7bcc 100644 --- a/.github/workflows/_publish-internal.yml +++ b/.github/workflows/_publish-internal.yml @@ -23,6 +23,7 @@ on: options: - "dbt-adapters" - "dbt-athena" + - "dbt-bigquery" - "dbt-spark" deploy-to: description: "Choose whether to publish to test or prod" diff --git a/.github/workflows/_publish-pypi.yml b/.github/workflows/_publish-pypi.yml index efe174294..260090b82 100644 --- a/.github/workflows/_publish-pypi.yml +++ b/.github/workflows/_publish-pypi.yml @@ -24,6 +24,7 @@ on: - "dbt-adapters" - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" deploy-to: description: "Choose whether to publish to test or prod" diff --git a/.github/workflows/_unit-tests.yml b/.github/workflows/_unit-tests.yml index eddda99cd..ebed4002c 100644 --- a/.github/workflows/_unit-tests.yml +++ b/.github/workflows/_unit-tests.yml @@ -32,6 +32,7 @@ on: - "dbt-adapters" - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" branch: description: "Choose the branch to test" diff --git a/.github/workflows/_verify-build.yml b/.github/workflows/_verify-build.yml index 19730883e..3e12455e9 100644 --- a/.github/workflows/_verify-build.yml +++ b/.github/workflows/_verify-build.yml @@ -33,6 +33,7 @@ on: - "dbt-tests-adapter" - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" branch: description: "Choose the branch to build" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index fad2c2939..e3b89c924 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -12,6 +12,7 @@ on: - "dbt-tests-adapter" - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" deploy-to: description: "Choose whether to publish to test or prod" diff --git a/.github/workflows/pull-request-checks.yml b/.github/workflows/pull-request-checks.yml index d6529c6ad..7dd687757 100644 --- a/.github/workflows/pull-request-checks.yml +++ b/.github/workflows/pull-request-checks.yml @@ -34,6 +34,7 @@ jobs: - "dbt-tests-adapter" - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" os: [ubuntu-22.04] python-version: ["3.9", "3.10", "3.11", "3.12"] @@ -53,6 +54,7 @@ jobs: - "dbt-adapters" - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" os: [ ubuntu-22.04 ] python-version: ["3.9", "3.10", "3.11", "3.12"] @@ -71,6 +73,7 @@ jobs: package: - "dbt-athena" - "dbt-athena-community" + - "dbt-bigquery" - "dbt-spark" os: [ubuntu-22.04] python-version: ["3.9", "3.10", "3.11", "3.12"] diff --git a/.gitignore b/.gitignore index 84c738c69..ab6a2ba4d 100644 --- a/.gitignore +++ b/.gitignore @@ -158,6 +158,9 @@ cython_debug/ # PyCharm .idea/ +# AWS credentials +.aws/ + # MacOS .DS_Store diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 51154c51d..112c7c94f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,7 +34,7 @@ repos: rev: 7.1.1 hooks: - id: flake8 - exclude: dbt/adapters/events/adapter_types_pb2.py|tests/functional/|dbt-spark/tests/ + exclude: dbt/adapters/events/adapter_types_pb2.py|tests/functional/|dbt-spark/tests/|dbt-bigquery/tests/ args: - --max-line-length=99 - --select=E,F,W diff --git a/dbt-bigquery/.changes/0.0.0.md b/dbt-bigquery/.changes/0.0.0.md new file mode 100644 index 000000000..dd49fb1b7 
--- /dev/null +++ b/dbt-bigquery/.changes/0.0.0.md @@ -0,0 +1,9 @@ +## Previous Releases +For information on prior major and minor releases, see their changelogs: +- [1.6](https://github.com/dbt-labs/dbt-bigquery/blob/1.6.latest/CHANGELOG.md) +- [1.5](https://github.com/dbt-labs/dbt-bigquery/blob/1.5.latest/CHANGELOG.md) +- [1.4](https://github.com/dbt-labs/dbt-bigquery/blob/1.4.latest/CHANGELOG.md) +- [1.3](https://github.com/dbt-labs/dbt-bigquery/blob/1.3.latest/CHANGELOG.md) +- [1.2](https://github.com/dbt-labs/dbt-bigquery/blob/1.2.latest/CHANGELOG.md) +- [1.1](https://github.com/dbt-labs/dbt-bigquery/blob/1.1.latest/CHANGELOG.md) +- [1.0](https://github.com/dbt-labs/dbt-bigquery/blob/1.0.latest/CHANGELOG.md) diff --git a/dbt-bigquery/.changes/README.md b/dbt-bigquery/.changes/README.md new file mode 100644 index 000000000..c8c37e3f5 --- /dev/null +++ b/dbt-bigquery/.changes/README.md @@ -0,0 +1,3 @@ +# CHANGELOG + +To view information about the changelog operation we suggest reading this [README](https://github.com/dbt-labs/dbt-bigquery/blob/main/.changes/README.md) found in `dbt-bigquery`. diff --git a/dbt-bigquery/.changes/header.tpl.md b/dbt-bigquery/.changes/header.tpl.md new file mode 100644 index 000000000..b0468a97e --- /dev/null +++ b/dbt-bigquery/.changes/header.tpl.md @@ -0,0 +1,6 @@ +# dbt-bigquery Changelog + +- This file provides a full account of all changes to `dbt-bigquery`. +- Changes are listed under the (pre)release in which they first appear. Subsequent releases include changes from previous releases. +- "Breaking changes" listed under a version may require action from end users or external maintainers when upgrading to that version. +- Do not edit this file directly. This file is auto-generated using [changie](https://github.com/miniscruff/changie). For details on how to document a change, see [the contributing guide](https://github.com/dbt-labs/dbt-bigquery/blob/main/CONTRIBUTING.md#adding-changelog-entry) diff --git a/dbt-bigquery/.changes/unreleased/.gitkeep b/dbt-bigquery/.changes/unreleased/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/dbt-bigquery/.changes/unreleased/Features-20241202-223835.yaml b/dbt-bigquery/.changes/unreleased/Features-20241202-223835.yaml new file mode 100644 index 000000000..ab59abd99 --- /dev/null +++ b/dbt-bigquery/.changes/unreleased/Features-20241202-223835.yaml @@ -0,0 +1,6 @@ +kind: Features +body: Allow copy_partitions in microbatch +time: 2024-12-02T22:38:35.479052Z +custom: + Author: borjavb + Issue: "1414" diff --git a/dbt-bigquery/.changes/unreleased/Fixes-20241120-163101.yaml b/dbt-bigquery/.changes/unreleased/Fixes-20241120-163101.yaml new file mode 100644 index 000000000..ba1f4e937 --- /dev/null +++ b/dbt-bigquery/.changes/unreleased/Fixes-20241120-163101.yaml @@ -0,0 +1,7 @@ +kind: Fixes +body: Fix issue where dbt-bigquery was not retrying in certain retryable scenarios, + e.g. 
503's +time: 2024-11-20T16:31:01.60689-05:00 +custom: + Author: mikealfare + Issue: "682" diff --git a/dbt-bigquery/.changes/unreleased/Fixes-20241204-105846.yaml b/dbt-bigquery/.changes/unreleased/Fixes-20241204-105846.yaml new file mode 100644 index 000000000..2693e4513 --- /dev/null +++ b/dbt-bigquery/.changes/unreleased/Fixes-20241204-105846.yaml @@ -0,0 +1,7 @@ +kind: Fixes +body: Cast `event_time` to a timestamp prior to comparing against microbatch start/end + time +time: 2024-12-04T10:58:46.573608-05:00 +custom: + Author: michelleark + Issue: "1422" diff --git a/dbt-bigquery/.changes/unreleased/Fixes-20241205-133606.yaml b/dbt-bigquery/.changes/unreleased/Fixes-20241205-133606.yaml new file mode 100644 index 000000000..b88a0981c --- /dev/null +++ b/dbt-bigquery/.changes/unreleased/Fixes-20241205-133606.yaml @@ -0,0 +1,6 @@ +kind: Fixes +body: Fix issue where rate limit errors on table service calls are not retried +time: 2024-12-05T13:36:06.436005-05:00 +custom: + Author: mikealfare + Issue: "1423" diff --git a/dbt-bigquery/.changes/unreleased/Fixes-20241211-144752.yaml b/dbt-bigquery/.changes/unreleased/Fixes-20241211-144752.yaml new file mode 100644 index 000000000..e666d5c31 --- /dev/null +++ b/dbt-bigquery/.changes/unreleased/Fixes-20241211-144752.yaml @@ -0,0 +1,6 @@ +kind: Fixes +body: Fix retry scenarios so that dbt always retries when BigQuery recommends a retry +time: 2024-12-11T14:47:52.36905-05:00 +custom: + Author: mikealfare + Issue: "263" diff --git a/dbt-bigquery/.changes/unreleased/Under the Hood-20241117-194746.yaml b/dbt-bigquery/.changes/unreleased/Under the Hood-20241117-194746.yaml new file mode 100644 index 000000000..e8658ee20 --- /dev/null +++ b/dbt-bigquery/.changes/unreleased/Under the Hood-20241117-194746.yaml @@ -0,0 +1,6 @@ +kind: Under the Hood +body: Move from setup.py to pyproject.toml and to hatch as a dev tool +time: 2024-11-17T19:47:46.341-05:00 +custom: + Author: mikealfare + Issue: "1407" diff --git a/dbt-bigquery/.changie.yaml b/dbt-bigquery/.changie.yaml new file mode 100644 index 000000000..7ca7c3797 --- /dev/null +++ b/dbt-bigquery/.changie.yaml @@ -0,0 +1,131 @@ +changesDir: .changes +unreleasedDir: unreleased +headerPath: header.tpl.md +versionHeaderPath: "" +changelogPath: CHANGELOG.md +versionExt: md +envPrefix: "CHANGIE_" +versionFormat: '## dbt-bigquery {{.Version}} - {{.Time.Format "January 02, 2006"}}' +kindFormat: '### {{.Kind}}' +changeFormat: |- + {{- $IssueList := list }} + {{- $changes := splitList " " $.Custom.Issue }} + {{- range $issueNbr := $changes }} + {{- $changeLink := "[#nbr](https://github.com/dbt-labs/dbt-bigquery/issues/nbr)" | replace "nbr" $issueNbr }} + {{- $IssueList = append $IssueList $changeLink }} + {{- end -}} + - {{.Body}} ({{ range $index, $element := $IssueList }}{{if $index}}, {{end}}{{$element}}{{end}}) + +kinds: +- label: Breaking Changes +- label: Features +- label: Fixes +- label: Under the Hood +- label: Dependencies + changeFormat: |- + {{- $PRList := list }} + {{- $changes := splitList " " $.Custom.PR }} + {{- range $pullrequest := $changes }} + {{- $changeLink := "[#nbr](https://github.com/dbt-labs/dbt-bigquery/pull/nbr)" | replace "nbr" $pullrequest }} + {{- $PRList = append $PRList $changeLink }} + {{- end -}} + - {{.Body}} ({{ range $index, $element := $PRList }}{{if $index}}, {{end}}{{$element}}{{end}}) + skipGlobalChoices: true + additionalChoices: + - key: Author + label: GitHub Username(s) (separated by a single space if multiple) + type: string + minLength: 3 + - key: PR + label: GitHub 
Pull Request Number (separated by a single space if multiple) + type: string + minLength: 1 +- label: Security + changeFormat: |- + {{- $PRList := list }} + {{- $changes := splitList " " $.Custom.PR }} + {{- range $pullrequest := $changes }} + {{- $changeLink := "[#nbr](https://github.com/dbt-labs/dbt-bigquery/pull/nbr)" | replace "nbr" $pullrequest }} + {{- $PRList = append $PRList $changeLink }} + {{- end -}} + - {{.Body}} ({{ range $index, $element := $PRList }}{{if $index}}, {{end}}{{$element}}{{end}}) + skipGlobalChoices: true + additionalChoices: + - key: Author + label: GitHub Username(s) (separated by a single space if multiple) + type: string + minLength: 3 + - key: PR + label: GitHub Pull Request Number (separated by a single space if multiple) + type: string + minLength: 1 + +newlines: + afterChangelogHeader: 1 + afterKind: 1 + afterChangelogVersion: 1 + beforeKind: 1 + endOfVersion: 1 + +custom: +- key: Author + label: GitHub Username(s) (separated by a single space if multiple) + type: string + minLength: 3 +- key: Issue + label: GitHub Issue Number (separated by a single space if multiple) + type: string + minLength: 1 + +footerFormat: | + {{- $contributorDict := dict }} + {{- /* ensure all names in this list are all lowercase for later matching purposes */}} + {{- $core_team := splitList " " .Env.CORE_TEAM }} + {{- /* ensure we always skip snyk and dependabot in addition to the core team */}} + {{- $maintainers := list "dependabot[bot]" "snyk-bot"}} + {{- range $team_member := $core_team }} + {{- $team_member_lower := lower $team_member }} + {{- $maintainers = append $maintainers $team_member_lower }} + {{- end }} + {{- range $change := .Changes }} + {{- $authorList := splitList " " $change.Custom.Author }} + {{- /* loop through all authors for a single changelog */}} + {{- range $author := $authorList }} + {{- $authorLower := lower $author }} + {{- /* we only want to include non-core team contributors */}} + {{- if not (has $authorLower $maintainers)}} + {{- $changeList := splitList " " $change.Custom.Author }} + {{- $IssueList := list }} + {{- $changeLink := $change.Kind }} + {{- if or (eq $change.Kind "Dependencies") (eq $change.Kind "Security") }} + {{- $changes := splitList " " $change.Custom.PR }} + {{- range $issueNbr := $changes }} + {{- $changeLink := "[#nbr](https://github.com/dbt-labs/dbt-bigquery/pull/nbr)" | replace "nbr" $issueNbr }} + {{- $IssueList = append $IssueList $changeLink }} + {{- end -}} + {{- else }} + {{- $changes := splitList " " $change.Custom.Issue }} + {{- range $issueNbr := $changes }} + {{- $changeLink := "[#nbr](https://github.com/dbt-labs/dbt-bigquery/issues/nbr)" | replace "nbr" $issueNbr }} + {{- $IssueList = append $IssueList $changeLink }} + {{- end -}} + {{- end }} + {{- /* check if this contributor has other changes associated with them already */}} + {{- if hasKey $contributorDict $author }} + {{- $contributionList := get $contributorDict $author }} + {{- $contributionList = concat $contributionList $IssueList }} + {{- $contributorDict := set $contributorDict $author $contributionList }} + {{- else }} + {{- $contributionList := $IssueList }} + {{- $contributorDict := set $contributorDict $author $contributionList }} + {{- end }} + {{- end}} + {{- end}} + {{- end }} + {{- /* no indentation here for formatting so the final markdown doesn't have unneeded indentations */}} + {{- if $contributorDict}} + ### Contributors + {{- range $k,$v := $contributorDict }} + - [@{{$k}}](https://github.com/{{$k}}) ({{ range $index, $element := $v 
}}{{if $index}}, {{end}}{{$element}}{{end}}) + {{- end }} + {{- end }} diff --git a/dbt-bigquery/CHANGELOG.md b/dbt-bigquery/CHANGELOG.md new file mode 100644 index 000000000..ade60b8f6 --- /dev/null +++ b/dbt-bigquery/CHANGELOG.md @@ -0,0 +1,16 @@ +# dbt-bigquery Changelog + +- This file provides a full account of all changes to `dbt-bigquery`. +- Changes are listed under the (pre)release in which they first appear. Subsequent releases include changes from previous releases. +- "Breaking changes" listed under a version may require action from end users or external maintainers when upgrading to that version. +- Do not edit this file directly. This file is auto-generated using [changie](https://github.com/miniscruff/changie). For details on how to document a change, see [the contributing guide](https://github.com/dbt-labs/dbt-bigquery/blob/main/CONTRIBUTING.md#adding-changelog-entry) + +## Previous Releases +For information on prior major and minor releases, see their changelogs: +- [1.6](https://github.com/dbt-labs/dbt-bigquery/blob/1.6.latest/CHANGELOG.md) +- [1.5](https://github.com/dbt-labs/dbt-bigquery/blob/1.5.latest/CHANGELOG.md) +- [1.4](https://github.com/dbt-labs/dbt-bigquery/blob/1.4.latest/CHANGELOG.md) +- [1.3](https://github.com/dbt-labs/dbt-bigquery/blob/1.3.latest/CHANGELOG.md) +- [1.2](https://github.com/dbt-labs/dbt-bigquery/blob/1.2.latest/CHANGELOG.md) +- [1.1](https://github.com/dbt-labs/dbt-bigquery/blob/1.1.latest/CHANGELOG.md) +- [1.0](https://github.com/dbt-labs/dbt-bigquery/blob/1.0.latest/CHANGELOG.md) diff --git a/dbt-bigquery/CONTRIBUTING.md b/dbt-bigquery/CONTRIBUTING.md new file mode 100644 index 000000000..f915af713 --- /dev/null +++ b/dbt-bigquery/CONTRIBUTING.md @@ -0,0 +1,109 @@ +# Contributing to `dbt-bigquery` + +1. [About this document](#about-this-document) +3. [Getting the code](#getting-the-code) +5. [Running `dbt-bigquery` in development](#running-dbt-bigquery-in-development) +6. [Testing](#testing) +7. [Updating Docs](#updating-docs) +7. [Submitting a Pull Request](#submitting-a-pull-request) + +## About this document +This document is a guide intended for folks interested in contributing to `dbt-bigquery`. Below, we document the process by which members of the community should create issues and submit pull requests (PRs) in this repository. It is not intended as a guide for using `dbt-bigquery`, and it assumes a certain level of familiarity with Python concepts such as virtualenvs, `pip`, python modules, filesystems, and so on. This guide assumes you are using macOS or Linux and are comfortable with the command line. + +For those wishing to contribute we highly suggest reading the [dbt-core](https://github.com/dbt-labs/dbt-core/blob/main/CONTRIBUTING.md), if you haven't already. Almost all of the information there is applicable to contributing here, too! + +### Signing the CLA + +Please note that all contributors to `dbt-bigquery` must sign the [Contributor License Agreement](https://docs.getdbt.com/docs/contributor-license-agreements) to have their Pull Request merged into an `dbt-bigquery` codebase. If you are unable to sign the CLA, then the `dbt-bigquery` maintainers will unfortunately be unable to merge your Pull Request. You are, however, welcome to open issues and comment on existing ones. + + +## Getting the code + +You will need `git` in order to download and modify the `dbt-bigquery` source code. You can find direction [here](https://github.com/git-guides/install-git) on how to install `git`. 
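+
+For example, assuming you contribute from a fork as described in the next section (`<your-username>` is a placeholder for your GitHub username):
+
+```sh
+git --version                  # confirm git is installed
+git clone https://github.com/<your-username>/dbt-bigquery.git
+cd dbt-bigquery
+git checkout -b my-feature-branch
+```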
+ +### External contributors + +If you are not a member of the `dbt-labs` GitHub organization, you can contribute to `dbt-bigquery` by forking the `dbt-bigquery` repository. For a detailed overview on forking, check out the [GitHub docs on forking](https://help.github.com/en/articles/fork-a-repo). In short, you will need to: + +1. fork the `dbt-bigquery` repository +2. clone your fork locally +3. check out a new branch for your proposed changes +4. push changes to your fork +5. open a pull request against `dbt-labs/dbt-bigquery` from your forked repository + +### dbt Labs contributors + +If you are a member of the `dbt Labs` GitHub organization, you will have push access to the `dbt-bigquery` repo. Rather than forking `dbt-bigquery` to make your changes, just clone the repository, check out a new branch, and push directly to that branch. + + +## Running `dbt-bigquery` in development + +### Installation + +First make sure that you set up your `virtualenv` as described in [Setting up an environment](https://github.com/dbt-labs/dbt-core/blob/HEAD/CONTRIBUTING.md#setting-up-an-environment). Ensure you have the latest version of pip installed with `pip install --upgrade pip`. Next, install `dbt-bigquery` latest dependencies: + +```sh +pip install -e . -r dev-requirements.txt +``` + +When `dbt-bigquery` is installed this way, any changes you make to the `dbt-bigquery` source code will be reflected immediately in your next `dbt-bigquery` run. + +To confirm you have the correct version of `dbt-core` installed please run `dbt --version` and `which dbt`. + +## Testing + +### Initial Setup + +`dbt-bigquery` contains [unit](https://github.com/dbt-labs/dbt-bigquery/tree/main/tests/unit) and [functional](https://github.com/dbt-labs/dbt-bigquery/tree/main/tests/functional) tests. functional tests require testing against an actual BigQuery warehouse. We have CI set up to test against a BigQuery warehouse. In order to run functional tests locally, you will need a `test.env` file in the root of the repository that contains credentials for BigQuery. + +Note: This `test.env` file is git-ignored, but please be _extra_ careful to never check in credentials or other sensitive information when developing. To create your `test.env` file, copy the provided example file, then supply your relevant credentials. + +``` +cp test.env.example test.env +$EDITOR test.env +``` + +### Test commands +There are a few methods for running tests locally. + +#### `tox` +`tox` takes care of managing Python virtualenvs and installing dependencies in order to run tests. You can also run tests in parallel, for example you can run unit tests for Python 3.9, Python 3.10, and Python 3.11 in parallel with `tox -p`. Also, you can run unit tests for specific python versions with `tox -e py39`. The configuration of these tests are located in `tox.ini`. + +#### `pytest` +Finally, you can also run a specific test or group of tests using `pytest` directly. 
With a Python virtualenv active and dev dependencies installed you can do things like: + +```sh +# run specific bigquery functional tests +python -m pytest -m profile_bigquery tests/functional/adapter/test_aliases.py::TestSameTestSameAliasDifferentDatabasesBigQuery +# run all unit tests in a file +python -m pytest tests/unit/test_bigquery_adapter.py +# run a specific unit test +python -m pytest tests/unit/test_bigquery_adapter.py::TestBigQueryAdapter::test_copy_table_materialization_table +``` +## Updating Docs + +Many changes will require and update to the `dbt-bigquery` docs here are some useful resources. + +- Docs are [here](https://docs.getdbt.com/). +- The docs repo for making changes is located [here]( https://github.com/dbt-labs/docs.getdbt.com). +- The changes made are likely to impact one or both of [BigQuery Profile](https://docs.getdbt.com/reference/warehouse-profiles/bigquery-profile), or [BigQuery Configs](https://docs.getdbt.com/reference/resource-configs/bigquery-configs). +- We ask every community member who makes a user-facing change to open an issue or PR regarding doc changes. + +## Adding CHANGELOG Entry + +We use [changie](https://changie.dev) to generate `CHANGELOG` entries. **Note:** Do not edit the `CHANGELOG.md` directly. Your modifications will be lost. + +Follow the steps to [install `changie`](https://changie.dev/guide/installation/) for your system. + +Once changie is installed and your PR is created, simply run `changie new` and changie will walk you through the process of creating a changelog entry. Commit the file that's created and your changelog entry is complete! + +You don't need to worry about which `dbt-bigquery` version your change will go into. Just create the changelog entry with `changie`, and open your PR against the `main` branch. All merged changes will be included in the next minor version of `dbt-bigquery`. The Core maintainers _may_ choose to "backport" specific changes in order to patch older minor versions. In that case, a maintainer will take care of that backport after merging your PR, before releasing the new version of `dbt-bigquery`. + + +## Submitting a Pull Request + +dbt Labs provides a CI environment to test changes to the `dbt-bigquery` adapter and periodic checks against the development version of `dbt-core` through Github Actions. + +A `dbt-bigquery` maintainer will review your PR. They may suggest code revision for style or clarity, or request that you add unit or functional test(s). These are good things! We believe that, with a little bit of help, anyone can contribute high-quality code. + +Once all tests are passing, you have updated the changelog to reflect and tag your issue/pr for reference with a small description of the change, and your PR has been approved, a `dbt-bigquery` maintainer will merge your changes into the active development branch. And that's it! Happy developing :tada: diff --git a/dbt-bigquery/LICENSE.md b/dbt-bigquery/LICENSE.md new file mode 100644 index 000000000..536bebee0 --- /dev/null +++ b/dbt-bigquery/LICENSE.md @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 dbt Labs, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/dbt-bigquery/README.md b/dbt-bigquery/README.md new file mode 100644 index 000000000..6ec77bd2e --- /dev/null +++ b/dbt-bigquery/README.md @@ -0,0 +1,39 @@ +

+<!-- dbt logo -->
+<!-- Unit Tests Badge | Integration Tests Badge -->

+ +**[dbt](https://www.getdbt.com/)** enables data analysts and engineers to transform their data using the same practices that software engineers use to build applications. + +dbt is the T in ELT. Organize, cleanse, denormalize, filter, rename, and pre-aggregate the raw data in your warehouse so that it's ready for analysis. + +## dbt-bigquery + +The `dbt-bigquery` package contains all of the code enabling dbt to work with Google BigQuery. For +more information on using dbt with BigQuery, consult [the docs](https://docs.getdbt.com/docs/profile-bigquery). + +## Getting started + +- [Install dbt](https://docs.getdbt.com/docs/installation) +- Read the [introduction](https://docs.getdbt.com/docs/introduction/) and [viewpoint](https://docs.getdbt.com/docs/about/viewpoint/) + +## Join the dbt Community + +- Be part of the conversation in the [dbt Community Slack](http://community.getdbt.com/) +- Read more on the [dbt Community Discourse](https://discourse.getdbt.com) + +## Reporting bugs and contributing code + +- Want to report a bug or request a feature? Let us know on [Slack](http://community.getdbt.com/), or open [an issue](https://github.com/dbt-labs/dbt-bigquery/issues/new) +- Want to help us build dbt? Check out the [Contributing Guide](https://github.com/dbt-labs/dbt-bigquery/blob/HEAD/CONTRIBUTING.md) + +## Code of Conduct + +Everyone interacting in the dbt project's codebases, issue trackers, chat rooms, and mailing lists is expected to follow the [dbt Code of Conduct](https://community.getdbt.com/code-of-conduct). diff --git a/dbt-bigquery/docker/Dockerfile b/dbt-bigquery/docker/Dockerfile new file mode 100644 index 000000000..8f371d6b4 --- /dev/null +++ b/dbt-bigquery/docker/Dockerfile @@ -0,0 +1,37 @@ +# this image gets published to GHCR for production use +ARG py_version=3.11.2 + +FROM python:$py_version-slim-bullseye AS base + +RUN apt-get update \ + && apt-get dist-upgrade -y \ + && apt-get install -y --no-install-recommends \ + build-essential=12.9 \ + ca-certificates=20210119 \ + git=1:2.30.2-1+deb11u2 \ + libpq-dev=13.18-0+deb11u1 \ + make=4.3-4.1 \ + openssh-client=1:8.4p1-5+deb11u3 \ + software-properties-common=0.96.20.2-2.1 \ + && apt-get clean \ + && rm -rf \ + /var/lib/apt/lists/* \ + /tmp/* \ + /var/tmp/* + +ENV PYTHONIOENCODING=utf-8 +ENV LANG=C.UTF-8 + +RUN python -m pip install --upgrade "pip==24.0" "setuptools==69.2.0" "wheel==0.43.0" --no-cache-dir + + +FROM base AS dbt-bigquery + +ARG commit_ref=main + +HEALTHCHECK CMD dbt --version || exit 1 + +WORKDIR /usr/app/dbt/ +ENTRYPOINT ["dbt"] + +RUN python -m pip install --no-cache-dir "dbt-bigquery @ git+https://github.com/dbt-labs/dbt-bigquery@${commit_ref}" diff --git a/dbt-bigquery/docker/README.md b/dbt-bigquery/docker/README.md new file mode 100644 index 000000000..8c60deaa3 --- /dev/null +++ b/dbt-bigquery/docker/README.md @@ -0,0 +1,58 @@ +# Docker for dbt +This docker file is suitable for building dbt Docker images locally or using with CI/CD to automate populating a container registry. + + +## Building an image: +This Dockerfile can create images for the following target: `dbt-bigquery` + +In order to build a new image, run the following docker command. +```shell +docker build --tag --target dbt-bigquery +``` +--- +> **Note:** Docker must be configured to use [BuildKit](https://docs.docker.com/develop/develop-images/build_enhancements/) in order for images to build properly! + +--- + +By default the image will be populated with the latest version of `dbt-bigquery` on `main`. 
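+For example, a default build might look like the following (the image name is illustrative; run the command from the directory containing the Dockerfile):
+```shell
+docker build --tag my-dbt-bigquery --target dbt-bigquery .
+```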
+If you need to use a different version you can specify it by git ref using the `--build-arg` flag: +```shell +docker build --tag \ + --target dbt-bigquery \ + --build-arg commit_ref= \ + +``` + +### Examples: +To build an image named "my-dbt" that supports Snowflake using the latest releases: +```shell +cd dbt-core/docker +docker build --tag my-dbt --target dbt-bigquery . +``` + +To build an image named "my-other-dbt" that supports Snowflake using the adapter version 1.0.0b1: +```shell +cd dbt-core/docker +docker build \ + --tag my-other-dbt \ + --target dbt-bigquery \ + --build-arg commit_ref=v1.0.0b1 \ + . +``` + +## Running an image in a container: +The `ENTRYPOINT` for this Dockerfile is the command `dbt` so you can bind-mount your project to `/usr/app` and use dbt as normal: +```shell +docker run \ + --network=host \ + --mount type=bind,source=path/to/project,target=/usr/app \ + --mount type=bind,source=path/to/profiles.yml,target=/root/.dbt/profiles.yml \ + my-dbt \ + ls +``` +--- +**Notes:** +* Bind-mount sources _must_ be an absolute path +* You may need to make adjustments to the docker networking setting depending on the specifics of your data warehouse/database host. + +--- diff --git a/dbt-bigquery/docker/dev.Dockerfile b/dbt-bigquery/docker/dev.Dockerfile new file mode 100644 index 000000000..f122f5343 --- /dev/null +++ b/dbt-bigquery/docker/dev.Dockerfile @@ -0,0 +1,50 @@ +# this image does not get published, it is intended for local development only, see `Makefile` for usage +FROM ubuntu:24.04 AS base + +# prevent python installation from asking for time zone region +ARG DEBIAN_FRONTEND=noninteractive + +# add python repository +RUN apt-get update \ + && apt-get install -y software-properties-common=0.99.48 \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get clean \ + && rm -rf \ + /var/lib/apt/lists/* \ + /tmp/* \ + /var/tmp/* + +# install python +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential=12.10ubuntu1 \ + git-all=1:2.43.0-1ubuntu7.1 \ + python3.9=3.9.20-1+noble1 \ + python3.9-dev=3.9.20-1+noble1 \ + python3.9-distutils=3.9.20-1+noble1 \ + python3.9-venv=3.9.20-1+noble1 \ + python3-pip=24.0+dfsg-1ubuntu1 \ + python3-wheel=0.42.0-2 \ + && apt-get clean \ + && rm -rf \ + /var/lib/apt/lists/* \ + /tmp/* \ + /var/tmp/* + +# update the default system interpreter to the newly installed version +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1 + + +FROM base AS dbt-bigquery-dev + +HEALTHCHECK CMD python --version || exit 1 + +# send stdout/stderr to terminal +ENV PYTHONUNBUFFERED=1 + +# setup mount for local code +WORKDIR /opt/code +VOLUME /opt/code + +# create a virtual environment +RUN python3 -m venv /opt/venv diff --git a/dbt-bigquery/hatch.toml b/dbt-bigquery/hatch.toml new file mode 100644 index 000000000..eb972b66a --- /dev/null +++ b/dbt-bigquery/hatch.toml @@ -0,0 +1,64 @@ +[version] +path = "src/dbt/adapters/bigquery/__version__.py" + +[build.targets.sdist] +packages = ["src/dbt"] +sources = ["src"] + +[build.targets.wheel] +packages = ["src/dbt"] +sources = ["src"] + +[envs.default] +python = "3.9" +dependencies = [ + "dbt-adapters @ git+https://github.com/dbt-labs/dbt-adapters.git", + "dbt-common @ git+https://github.com/dbt-labs/dbt-common.git", + "dbt-tests-adapter @ git+https://github.com/dbt-labs/dbt-adapters.git#subdirectory=dbt-tests-adapter", + "dbt-core @ git+https://github.com/dbt-labs/dbt-core.git#subdirectory=core", + "ddtrace==2.3.0", + "ipdb~=0.13.13", + 
"pre-commit==3.7.0", + "freezegun", + "pytest>=7.0,<8.0", + "pytest-csv~=3.0", + "pytest-dotenv", + "pytest-logbook~=1.2", + "pytest-mock", + "pytest-xdist", +] + +[envs.default.scripts] +setup = "pre-commit install" +code-quality = "pre-commit run --all-files" +unit-tests = "python -m pytest {args:tests/unit}" +integration-tests = "python -m pytest --profile service_account {args:tests/functional}" +docker-dev = [ + "docker build -f docker/dev.Dockerfile -t dbt-bigquery-dev .", + "docker run --rm -it --name dbt-bigquery-dev -v $(shell pwd):/opt/code dbt-bigquery-dev", +] + +[envs.build] +detached = true +dependencies = [ + "wheel", + "twine", + "check-wheel-contents", +] + +[envs.build.scripts] +check-all = [ + "- check-wheel", + "- check-sdist", +] +check-wheel = [ + "twine check dist/*", + "find ./dist/dbt_bigquery-*.whl -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/", + "pip freeze | grep dbt-bigquery", +] +check-sdist = [ + "check-wheel-contents dist/*.whl --ignore W007,W008", + "find ./dist/dbt_bigquery-*.gz -maxdepth 1 -type f | xargs python -m pip install --force-reinstall --find-links=dist/", + "pip freeze | grep dbt-bigquery", +] +docker-prod = "docker build -f docker/Dockerfile -t dbt-bigquery ." diff --git a/dbt-bigquery/pyproject.toml b/dbt-bigquery/pyproject.toml new file mode 100644 index 000000000..b2d55b25f --- /dev/null +++ b/dbt-bigquery/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +dynamic = ["version"] +name = "dbt-bigquery" +description = "The BigQuery adapter plugin for dbt" +readme = "README.md" +keywords = ["dbt", "adapter", "adapters", "database", "elt", "dbt-core", "dbt Core", "dbt Cloud", "dbt Labs", "bigquery", "google"] +requires-python = ">=3.9.0" +authors = [{ name = "dbt Labs", email = "info@dbtlabs.com" }] +maintainers = [{ name = "dbt Labs", email = "info@dbtlabs.com" }] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: Apache Software License", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "dbt-common>=1.10,<2.0", + "dbt-adapters>=1.7,<2.0", + # 3.20 introduced pyarrow>=3.0 under the `pandas` extra + "google-cloud-bigquery[pandas]>=3.0,<4.0", + "google-cloud-storage~=2.4", + "google-cloud-dataproc~=5.0", + # ---- + # Expect compatibility with all new versions of these packages, so lower bounds only. 
+ "google-api-core>=2.11.0", + # add dbt-core to ensure backwards compatibility of installation, this is not a functional dependency + "dbt-core>=1.8.0", +] + +[project.urls] +Homepage = "https://github.com/dbt-labs/dbt-bigquery" +Documentation = "https://docs.getdbt.com" +Repository = "https://github.com/dbt-labs/dbt-bigquery.git" +Issues = "https://github.com/dbt-labs/dbt-bigquery/issues" +Changelog = "https://github.com/dbt-labs/dbt-bigquery/blob/main/CHANGELOG.md" + +[tool.mypy] +mypy_path = "third-party-stubs/" + +[tool.pytest.ini_options] +testpaths = ["tests/functional", "tests/unit"] +env_files = ["test.env"] +addopts = "-v -n auto" +color = true +filterwarnings = [ + "ignore:.*'soft_unicode' has been renamed to 'soft_str'*:DeprecationWarning", + "ignore:unclosed file .*:ResourceWarning", +] diff --git a/dbt-bigquery/src/dbt/__init__.py b/dbt-bigquery/src/dbt/__init__.py new file mode 100644 index 000000000..b36383a61 --- /dev/null +++ b/dbt-bigquery/src/dbt/__init__.py @@ -0,0 +1,3 @@ +from pkgutil import extend_path + +__path__ = extend_path(__path__, __name__) diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/__init__.py b/dbt-bigquery/src/dbt/adapters/bigquery/__init__.py new file mode 100644 index 000000000..74fa17cda --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/__init__.py @@ -0,0 +1,12 @@ +from dbt.adapters.bigquery.column import BigQueryColumn +from dbt.adapters.bigquery.connections import BigQueryConnectionManager +from dbt.adapters.bigquery.credentials import BigQueryCredentials +from dbt.adapters.bigquery.impl import BigQueryAdapter, GrantTarget, PartitionConfig +from dbt.adapters.bigquery.relation import BigQueryRelation + +from dbt.adapters.base import AdapterPlugin +from dbt.include import bigquery + +Plugin = AdapterPlugin( + adapter=BigQueryAdapter, credentials=BigQueryCredentials, include_path=bigquery.PACKAGE_PATH +) diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/__version__.py b/dbt-bigquery/src/dbt/adapters/bigquery/__version__.py new file mode 100644 index 000000000..1af777a62 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/__version__.py @@ -0,0 +1 @@ +version = "1.10.0a1" diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/clients.py b/dbt-bigquery/src/dbt/adapters/bigquery/clients.py new file mode 100644 index 000000000..722266240 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/clients.py @@ -0,0 +1,69 @@ +from google.api_core.client_info import ClientInfo +from google.api_core.client_options import ClientOptions +from google.auth.exceptions import DefaultCredentialsError +from google.cloud.bigquery import Client as BigQueryClient, DEFAULT_RETRY as BQ_DEFAULT_RETRY +from google.cloud.dataproc_v1 import BatchControllerClient, JobControllerClient +from google.cloud.storage import Client as StorageClient +from google.cloud.storage.retry import DEFAULT_RETRY as GCS_DEFAULT_RETRY + +from dbt.adapters.events.logging import AdapterLogger + +import dbt.adapters.bigquery.__version__ as dbt_version +from dbt.adapters.bigquery.credentials import ( + BigQueryCredentials, + create_google_credentials, + set_default_credentials, +) + + +_logger = AdapterLogger("BigQuery") + + +def create_bigquery_client(credentials: BigQueryCredentials) -> BigQueryClient: + try: + return _create_bigquery_client(credentials) + except DefaultCredentialsError: + _logger.info("Please log into GCP to continue") + set_default_credentials() + return _create_bigquery_client(credentials) + + +@GCS_DEFAULT_RETRY +def create_gcs_client(credentials: 
BigQueryCredentials) -> StorageClient: + return StorageClient( + project=credentials.execution_project, + credentials=create_google_credentials(credentials), + ) + + +# dataproc does not appear to have a default retry like BQ and GCS +def create_dataproc_job_controller_client(credentials: BigQueryCredentials) -> JobControllerClient: + return JobControllerClient( + credentials=create_google_credentials(credentials), + client_options=ClientOptions(api_endpoint=_dataproc_endpoint(credentials)), + ) + + +# dataproc does not appear to have a default retry like BQ and GCS +def create_dataproc_batch_controller_client( + credentials: BigQueryCredentials, +) -> BatchControllerClient: + return BatchControllerClient( + credentials=create_google_credentials(credentials), + client_options=ClientOptions(api_endpoint=_dataproc_endpoint(credentials)), + ) + + +@BQ_DEFAULT_RETRY +def _create_bigquery_client(credentials: BigQueryCredentials) -> BigQueryClient: + return BigQueryClient( + credentials.execution_project, + create_google_credentials(credentials), + location=getattr(credentials, "location", None), + client_info=ClientInfo(user_agent=f"dbt-bigquery-{dbt_version.version}"), + client_options=ClientOptions(quota_project_id=credentials.quota_project), + ) + + +def _dataproc_endpoint(credentials: BigQueryCredentials) -> str: + return f"{credentials.dataproc_region}-dataproc.googleapis.com:443" diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/column.py b/dbt-bigquery/src/dbt/adapters/bigquery/column.py new file mode 100644 index 000000000..a676fef4b --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/column.py @@ -0,0 +1,300 @@ +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar, Union + +from google.cloud.bigquery import SchemaField + +from dbt.adapters.base.column import Column + + +_PARENT_DATA_TYPE_KEY = "__parent_data_type" + +Self = TypeVar("Self", bound="BigQueryColumn") + + +@dataclass(init=False) +class BigQueryColumn(Column): + TYPE_LABELS = { + "TEXT": "STRING", + "FLOAT": "FLOAT64", + "INTEGER": "INT64", + } + fields: List[Self] # type: ignore + mode: str + + def __init__( + self, + column: str, + dtype: str, + fields: Optional[Iterable[SchemaField]] = None, + mode: str = "NULLABLE", + ) -> None: + super().__init__(column, dtype) + + if fields is None: + fields = [] + + self.fields = self.wrap_subfields(fields) + self.mode = mode + + @classmethod + def wrap_subfields(cls: Type[Self], fields: Iterable[SchemaField]) -> List[Self]: + return [cls.create_from_field(field) for field in fields] + + @classmethod + def create_from_field(cls: Type[Self], field: SchemaField) -> Self: + return cls( + field.name, + cls.translate_type(field.field_type), + field.fields, + field.mode, + ) + + @classmethod + def _flatten_recursive(cls: Type[Self], col: Self, prefix: Optional[str] = None) -> List[Self]: + if prefix is None: + prefix = [] # type: ignore[assignment] + + if len(col.fields) == 0: + prefixed_name = ".".join(prefix + [col.column]) # type: ignore[operator] + new_col = cls(prefixed_name, col.dtype, col.fields, col.mode) + return [new_col] + + new_fields = [] + for field in col.fields: + new_prefix = prefix + [col.column] # type: ignore[operator] + new_fields.extend(cls._flatten_recursive(field, new_prefix)) + + return new_fields + + def flatten(self): + return self._flatten_recursive(self) + + @property + def quoted(self): + return "`{}`".format(self.column) + + def literal(self, value): + return "cast({} as {})".format(value, 
self.dtype) + + @property + def data_type(self) -> str: + if self.dtype.upper() == "RECORD": + subcols = [ + "{} {}".format(col.quoted, col.data_type) for col in self.fields # type: ignore[attr-defined] + ] + field_type = "STRUCT<{}>".format(", ".join(subcols)) + + else: + field_type = self.dtype + + if self.mode.upper() == "REPEATED": + return "ARRAY<{}>".format(field_type) + + else: + return field_type + + @classmethod + def numeric_type(cls, dtype: str, precision: Any, scale: Any) -> str: + # BigQuery makes life much harder if precision + scale are specified + # even if they're fed in here, just return the data type by itself + return dtype + + def is_string(self) -> bool: + return self.dtype.lower() == "string" + + def is_integer(self) -> bool: + return self.dtype.lower() == "int64" + + def is_numeric(self) -> bool: + return self.dtype.lower() == "numeric" + + def is_float(self): + return self.dtype.lower() == "float64" + + def can_expand_to(self: Self, other_column: Self) -> bool: + """returns True if both columns are strings""" + return self.is_string() and other_column.is_string() + + def __repr__(self) -> str: + return "".format(self.name, self.data_type, self.mode) + + def column_to_bq_schema(self) -> SchemaField: + """Convert a column to a bigquery schema object.""" + kwargs = {} + if len(self.fields) > 0: + fields = [field.column_to_bq_schema() for field in self.fields] # type: ignore[attr-defined] + kwargs = {"fields": fields} + + return SchemaField(self.name, self.dtype, self.mode, **kwargs) + + +def get_nested_column_data_types( + columns: Dict[str, Dict[str, Any]], + constraints: Optional[Dict[str, str]] = None, +) -> Dict[str, Dict[str, Optional[str]]]: + """ + columns: + * Dictionary where keys are of flat columns names and values are dictionary of column attributes + * column names with "." indicate a nested column within a STRUCT type + * e.g. {"a": {"name": "a", "data_type": "string", ...}} + constraints: + * Dictionary where keys are flat column names and values are rendered constraints for the column + * If provided, rendered column is included in returned "data_type" values. + returns: + * Dictionary where keys are root column names and values are corresponding nested data_type values. + * Fields other than "name" and "data_type" are __not__ preserved in the return value for nested columns. + * Fields other than "name" and "data_type" are preserved in the return value for flat columns. 
+ + Example: + columns: { + "a": {"name": "a", "data_type": "string", "description": ...}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b.nested2": {"name": "b.nested2", "data_type": "string"} + } + + returns: { + "a": {"name": "a", "data_type": "string"}, + "b": {"name": "b": "data_type": "struct} + } + """ + constraints = constraints or {} + + nested_column_data_types: Dict[str, Optional[Union[str, Dict]]] = {} + for column in columns.values(): + _update_nested_column_data_types( + column["name"], + column.get("data_type"), + constraints.get(column["name"]), + nested_column_data_types, + ) + + formatted_nested_column_data_types: Dict[str, Dict[str, Optional[str]]] = {} + for column_name, unformatted_column_type in nested_column_data_types.items(): + formatted_nested_column_data_types[column_name] = { + "name": column_name, + "data_type": _format_nested_data_type(unformatted_column_type), + } + + # add column configs back to flat columns + for column_name in formatted_nested_column_data_types: + if column_name in columns: + formatted_nested_column_data_types[column_name].update( + { + k: v + for k, v in columns[column_name].items() + if k not in formatted_nested_column_data_types[column_name] + } + ) + + return formatted_nested_column_data_types + + +def _update_nested_column_data_types( + column_name: str, + column_data_type: Optional[str], + column_rendered_constraint: Optional[str], + nested_column_data_types: Dict[str, Optional[Union[str, Dict]]], +) -> None: + """ + Recursively update nested_column_data_types given a column_name, column_data_type, and optional column_rendered_constraint. + + Examples: + >>> nested_column_data_types = {} + >>> BigQueryAdapter._update_nested_column_data_types("a", "string", "not_null", nested_column_data_types) + >>> nested_column_data_types + {"a": "string not null"} + >>> BigQueryAdapter._update_nested_column_data_types("b.c", "string", "not_null", nested_column_data_types) + >>> nested_column_data_types + {"a": "string not null", "b": {"c": "string not null"}} + >>> BigQueryAdapter._update_nested_column_data_types("b.d", "string", None, nested_column_data_types) + >>> nested_column_data_types + {"a": "string not null", "b": {"c": "string not null", "d": "string"}} + """ + column_name_parts = column_name.split(".") + root_column_name = column_name_parts[0] + + if len(column_name_parts) == 1: + # Base case: column is not nested - store its data_type concatenated with constraint if provided. 
+        column_data_type_and_constraints = (
+            (
+                column_data_type
+                if column_rendered_constraint is None
+                else f"{column_data_type} {column_rendered_constraint}"
+            )
+            if column_data_type
+            else None
+        )
+
+        if existing_nested_column_data_type := nested_column_data_types.get(root_column_name):
+            assert isinstance(existing_nested_column_data_type, dict)  # keeping mypy happy
+            # entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY"
+            existing_nested_column_data_type.update(
+                {_PARENT_DATA_TYPE_KEY: column_data_type_and_constraints}
+            )
+        else:
+            nested_column_data_types.update({root_column_name: column_data_type_and_constraints})
+    else:
+        parent_data_type = nested_column_data_types.get(root_column_name)
+        if isinstance(parent_data_type, dict):
+            # nested dictionary already initialized
+            pass
+        elif parent_data_type is None:
+            # initialize nested dictionary
+            nested_column_data_types.update({root_column_name: {}})
+        else:
+            # a parent specified its base type -- preserve its data_type and potential rendered constraints
+            # this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc
+            nested_column_data_types.update(
+                {root_column_name: {_PARENT_DATA_TYPE_KEY: parent_data_type}}
+            )
+
+        # Recursively process the remaining column name
+        remaining_column_name = ".".join(column_name_parts[1:])
+        remaining_column_data_types = nested_column_data_types[root_column_name]
+        assert isinstance(remaining_column_data_types, dict)  # keeping mypy happy
+        _update_nested_column_data_types(
+            remaining_column_name,
+            column_data_type,
+            column_rendered_constraint,
+            remaining_column_data_types,
+        )
+
+
+def _format_nested_data_type(
+    unformatted_nested_data_type: Optional[Union[str, Dict[str, Any]]]
+) -> Optional[str]:
+    """
+    Recursively format a (STRUCT) data type given an arbitrarily nested data type structure.
+
+    Examples:
+    >>> _format_nested_data_type("string")
+    'string'
+    >>> _format_nested_data_type({'c': 'string not_null', 'd': 'string'})
+    'struct<c string not_null, d string>'
+    >>> _format_nested_data_type({'c': 'string not_null', 'd': {'e': 'string'}})
+    'struct<c string not_null, d struct<e string>>'
+    """
+    if unformatted_nested_data_type is None:
+        return None
+    elif isinstance(unformatted_nested_data_type, str):
+        return unformatted_nested_data_type
+    else:
+        parent_data_type, *parent_constraints = unformatted_nested_data_type.pop(
+            _PARENT_DATA_TYPE_KEY, ""
+        ).split() or [None]
+
+        formatted_nested_types = [
+            f"{column_name} {_format_nested_data_type(column_type) or ''}".strip()
+            for column_name, column_type in unformatted_nested_data_type.items()
+        ]
+
+        formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>"""
+
+        if parent_data_type and parent_data_type.lower() == "array":
+            formatted_nested_type = f"""array<{formatted_nested_type}>"""
+
+        if parent_constraints:
+            parent_constraints = " ".join(parent_constraints)
+            formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}"""
+
+        return formatted_nested_type
diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/connections.py b/dbt-bigquery/src/dbt/adapters/bigquery/connections.py
new file mode 100644
index 000000000..bb062f330
--- /dev/null
+++ b/dbt-bigquery/src/dbt/adapters/bigquery/connections.py
@@ -0,0 +1,613 @@
+from collections import defaultdict
+from concurrent.futures import TimeoutError
+from contextlib import contextmanager
+from dataclasses import dataclass
+import json
+from multiprocessing.context import SpawnContext
+import re
+from typing import Dict, Hashable, List, Optional, Tuple, TYPE_CHECKING
+import uuid
+
+from google.auth.exceptions import RefreshError
+from google.cloud.bigquery import (
+    Client,
+    CopyJobConfig,
+    Dataset,
+    DatasetReference,
+    LoadJobConfig,
+    QueryJobConfig,
+    QueryPriority,
+    SchemaField,
+    Table,
+    TableReference,
+)
+from google.cloud.exceptions import BadRequest, Forbidden, NotFound
+
+from dbt_common.events.contextvars import get_node_info
+from dbt_common.events.functions import fire_event
+from dbt_common.exceptions import DbtDatabaseError, DbtRuntimeError
+from dbt_common.invocation import get_invocation_id
+from dbt.adapters.base import BaseConnectionManager
+from dbt.adapters.contracts.connection import (
+    AdapterRequiredConfig,
+    AdapterResponse,
+    ConnectionState,
+)
+from dbt.adapters.events.logging import AdapterLogger
+from dbt.adapters.events.types import SQLQuery
+from dbt.adapters.exceptions.connection import FailedToConnectError
+
+from dbt.adapters.bigquery.clients import create_bigquery_client
+from dbt.adapters.bigquery.credentials import Priority
+from dbt.adapters.bigquery.retry import RetryFactory
+
+if TYPE_CHECKING:
+    # Indirectly imported via agate_helper, which is lazy loaded further down the file.
+    # Used by mypy for earlier type hints.
+ import agate + + +logger = AdapterLogger("BigQuery") + + +BQ_QUERY_JOB_SPLIT = "-----Query Job SQL Follows-----" + + +@dataclass +class BigQueryAdapterResponse(AdapterResponse): + bytes_processed: Optional[int] = None + bytes_billed: Optional[int] = None + location: Optional[str] = None + project_id: Optional[str] = None + job_id: Optional[str] = None + slot_ms: Optional[int] = None + + +class BigQueryConnectionManager(BaseConnectionManager): + TYPE = "bigquery" + + def __init__(self, profile: AdapterRequiredConfig, mp_context: SpawnContext): + super().__init__(profile, mp_context) + self.jobs_by_thread: Dict[Hashable, List[str]] = defaultdict(list) + self._retry = RetryFactory(profile.credentials) + + @classmethod + def handle_error(cls, error, message): + error_msg = "\n".join([item["message"] for item in error.errors]) + if hasattr(error, "query_job"): + logger.error( + cls._bq_job_link( + error.query_job.location, error.query_job.project, error.query_job.job_id + ) + ) + raise DbtDatabaseError(error_msg) + + def clear_transaction(self): + pass + + @contextmanager + def exception_handler(self, sql): + try: + yield + + except BadRequest as e: + message = "Bad request while running query" + self.handle_error(e, message) + + except Forbidden as e: + message = "Access denied while running query" + self.handle_error(e, message) + + except NotFound as e: + message = "Not found while running query" + self.handle_error(e, message) + + except RefreshError as e: + message = ( + "Unable to generate access token, if you're using " + "impersonate_service_account, make sure your " + 'initial account has the "roles/' + 'iam.serviceAccountTokenCreator" role on the ' + "account you are trying to impersonate.\n\n" + f"{str(e)}" + ) + raise DbtRuntimeError(message) + + except Exception as e: + logger.debug("Unhandled error while running:\n{}".format(sql)) + logger.debug(e) + if isinstance(e, DbtRuntimeError): + # during a sql query, an internal to dbt exception was raised. + # this sounds a lot like a signal handler and probably has + # useful information, so raise it without modification. + raise + exc_message = str(e) + # the google bigquery library likes to add the query log, which we + # don't want to log. Hopefully they never change this! 
+ if BQ_QUERY_JOB_SPLIT in exc_message: + exc_message = exc_message.split(BQ_QUERY_JOB_SPLIT)[0].strip() + raise DbtRuntimeError(exc_message) + + def cancel_open(self): + names = [] + this_connection = self.get_if_exists() + with self.lock: + for thread_id, connection in self.thread_connections.items(): + if connection is this_connection: + continue + + if connection.handle is not None and connection.state == ConnectionState.OPEN: + client: Client = connection.handle + for job_id in self.jobs_by_thread.get(thread_id, []): + with self.exception_handler(f"Cancel job: {job_id}"): + client.cancel_job( + job_id, + retry=self._retry.create_reopen_with_deadline(connection), + ) + self.close(connection) + + if connection.name is not None: + names.append(connection.name) + return names + + @classmethod + def close(cls, connection): + connection.handle.close() + connection.state = ConnectionState.CLOSED + + return connection + + def begin(self): + pass + + def commit(self): + pass + + def format_bytes(self, num_bytes): + if num_bytes: + for unit in ["Bytes", "KiB", "MiB", "GiB", "TiB", "PiB"]: + if abs(num_bytes) < 1024.0: + return f"{num_bytes:3.1f} {unit}" + num_bytes /= 1024.0 + + num_bytes *= 1024.0 + return f"{num_bytes:3.1f} {unit}" + + else: + return num_bytes + + def format_rows_number(self, rows_number): + for unit in ["", "k", "m", "b", "t"]: + if abs(rows_number) < 1000.0: + return f"{rows_number:3.1f}{unit}".strip() + rows_number /= 1000.0 + + rows_number *= 1000.0 + return f"{rows_number:3.1f}{unit}".strip() + + @classmethod + def open(cls, connection): + if connection.state == ConnectionState.OPEN: + logger.debug("Connection is already open, skipping open.") + return connection + + try: + connection.handle = create_bigquery_client(connection.credentials) + connection.state = ConnectionState.OPEN + return connection + + except Exception as e: + logger.debug(f"""Got an error when attempting to create a bigquery " "client: '{e}'""") + connection.handle = None + connection.state = ConnectionState.FAIL + raise FailedToConnectError(str(e)) + + @classmethod + def get_table_from_response(cls, resp) -> "agate.Table": + from dbt_common.clients import agate_helper + + column_names = [field.name for field in resp.schema] + return agate_helper.table_from_data_flat(resp, column_names) + + def get_labels_from_query_comment(cls): + if ( + hasattr(cls.profile, "query_comment") + and cls.profile.query_comment + and cls.profile.query_comment.job_label + and cls.query_header + ): + query_comment = cls.query_header.comment.query_comment + return cls._labels_from_query_comment(query_comment) + + return {} + + def generate_job_id(self) -> str: + # Generating a fresh job_id for every _query_and_results call to avoid job_id reuse. + # Generating a job id instead of persisting a BigQuery-generated one after client.query is called. + # Using BigQuery's job_id can lead to a race condition if a job has been started and a termination + # is sent before the job_id was stored, leading to a failure to cancel the job. + # By predetermining job_ids (uuid4), we can persist the job_id before the job has been kicked off. + # Doing this, the race condition only leads to attempting to cancel a job that doesn't exist. 
+ job_id = str(uuid.uuid4()) + thread_id = self.get_thread_identifier() + self.jobs_by_thread[thread_id].append(job_id) + return job_id + + def raw_execute( + self, + sql, + use_legacy_sql=False, + limit: Optional[int] = None, + dry_run: bool = False, + ): + conn = self.get_thread_connection() + + fire_event(SQLQuery(conn_name=conn.name, sql=sql, node_info=get_node_info())) + + labels = self.get_labels_from_query_comment() + + labels["dbt_invocation_id"] = get_invocation_id() + + job_params = { + "use_legacy_sql": use_legacy_sql, + "labels": labels, + "dry_run": dry_run, + } + + priority = conn.credentials.priority + if priority == Priority.Batch: + job_params["priority"] = QueryPriority.BATCH + else: + job_params["priority"] = QueryPriority.INTERACTIVE + + maximum_bytes_billed = conn.credentials.maximum_bytes_billed + if maximum_bytes_billed is not None and maximum_bytes_billed != 0: + job_params["maximum_bytes_billed"] = maximum_bytes_billed + + with self.exception_handler(sql): + job_id = self.generate_job_id() + + return self._query_and_results( + conn, + sql, + job_params, + job_id, + limit=limit, + ) + + def execute( + self, sql, auto_begin=False, fetch=None, limit: Optional[int] = None + ) -> Tuple[BigQueryAdapterResponse, "agate.Table"]: + sql = self._add_query_comment(sql) + # auto_begin is ignored on bigquery, and only included for consistency + query_job, iterator = self.raw_execute(sql, limit=limit) + + if fetch: + table = self.get_table_from_response(iterator) + else: + from dbt_common.clients import agate_helper + + table = agate_helper.empty_table() + + message = "OK" + code = None + num_rows = None + bytes_processed = None + bytes_billed = None + location = None + job_id = None + project_id = None + num_rows_formatted = None + processed_bytes = None + slot_ms = None + + if query_job.statement_type == "CREATE_VIEW": + code = "CREATE VIEW" + + elif query_job.statement_type == "CREATE_TABLE_AS_SELECT": + code = "CREATE TABLE" + conn = self.get_thread_connection() + client = conn.handle + query_table = client.get_table(query_job.destination) + num_rows = query_table.num_rows + + elif query_job.statement_type == "SCRIPT": + code = "SCRIPT" + + elif query_job.statement_type in ["INSERT", "DELETE", "MERGE", "UPDATE"]: + code = query_job.statement_type + num_rows = query_job.num_dml_affected_rows + + elif query_job.statement_type == "SELECT": + code = "SELECT" + conn = self.get_thread_connection() + client = conn.handle + # use anonymous table for num_rows + query_table = client.get_table(query_job.destination) + num_rows = query_table.num_rows + + # set common attributes + bytes_processed = query_job.total_bytes_processed + bytes_billed = query_job.total_bytes_billed + slot_ms = query_job.slot_millis + processed_bytes = self.format_bytes(bytes_processed) + location = query_job.location + job_id = query_job.job_id + project_id = query_job.project + if num_rows is not None: + num_rows_formatted = self.format_rows_number(num_rows) + message = f"{code} ({num_rows_formatted} rows, {processed_bytes} processed)" + elif bytes_processed is not None: + message = f"{code} ({processed_bytes} processed)" + else: + message = f"{code}" + + response = BigQueryAdapterResponse( + _message=message, + rows_affected=num_rows, + code=code, + bytes_processed=bytes_processed, + bytes_billed=bytes_billed, + location=location, + project_id=project_id, + job_id=job_id, + slot_ms=slot_ms, + ) + + return response, table + + def dry_run(self, sql: str) -> BigQueryAdapterResponse: + """Run the given sql 
statement with the `dry_run` job parameter set. + + This will allow BigQuery to validate the SQL and immediately return job cost + estimates, which we capture in the BigQueryAdapterResponse. Invalid SQL + will result in an exception. + """ + sql = self._add_query_comment(sql) + query_job, _ = self.raw_execute(sql, dry_run=True) + + # TODO: Factor this repetitive block out into a factory method on + # BigQueryAdapterResponse + message = f"Ran dry run query for statement of type {query_job.statement_type}" + bytes_billed = query_job.total_bytes_billed + processed_bytes = self.format_bytes(query_job.total_bytes_processed) + location = query_job.location + project_id = query_job.project + job_id = query_job.job_id + slot_ms = query_job.slot_millis + + return BigQueryAdapterResponse( + _message=message, + code="DRY RUN", + bytes_billed=bytes_billed, + bytes_processed=processed_bytes, + location=location, + project_id=project_id, + job_id=job_id, + slot_ms=slot_ms, + ) + + @staticmethod + def _bq_job_link(location, project_id, job_id) -> str: + return f"https://console.cloud.google.com/bigquery?project={project_id}&j=bq:{location}:{job_id}&page=queryresults" + + def get_partitions_metadata(self, table): + def standard_to_legacy(table): + return table.project + ":" + table.dataset + "." + table.identifier + + legacy_sql = "SELECT * FROM [" + standard_to_legacy(table) + "$__PARTITIONS_SUMMARY__]" + + sql = self._add_query_comment(legacy_sql) + # auto_begin is ignored on bigquery, and only included for consistency + _, iterator = self.raw_execute(sql, use_legacy_sql=True) + return self.get_table_from_response(iterator) + + def copy_bq_table(self, source, destination, write_disposition) -> None: + conn = self.get_thread_connection() + client: Client = conn.handle + + # ------------------------------------------------------------------------------- + # BigQuery allows to use copy API using two different formats: + # 1. client.copy_table(source_table_id, destination_table_id) + # where source_table_id = "your-project.source_dataset.source_table" + # 2. client.copy_table(source_table_ids, destination_table_id) + # where source_table_ids = ["your-project.your_dataset.your_table_name", ...] 
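+        #     (form 2 copies several sources into the destination in a single copy job;
+        #      an illustrative call: client.copy_table(["proj.ds.a", "proj.ds.b"], "proj.ds.dest"))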
+ # Let's use uniform function call and always pass list there + # ------------------------------------------------------------------------------- + if type(source) is not list: + source = [source] + + source_ref_array = [ + self.table_ref(src_table.database, src_table.schema, src_table.table) + for src_table in source + ] + destination_ref = self.table_ref( + destination.database, destination.schema, destination.table + ) + + logger.debug( + 'Copying table(s) "{}" to "{}" with disposition: "{}"', + ", ".join(source_ref.path for source_ref in source_ref_array), + destination_ref.path, + write_disposition, + ) + + msg = 'copy table "{}" to "{}"'.format( + ", ".join(source_ref.path for source_ref in source_ref_array), + destination_ref.path, + ) + with self.exception_handler(msg): + copy_job = client.copy_table( + source_ref_array, + destination_ref, + job_config=CopyJobConfig(write_disposition=write_disposition), + retry=self._retry.create_reopen_with_deadline(conn), + ) + copy_job.result(timeout=self._retry.create_job_execution_timeout(fallback=300)) + + def write_dataframe_to_table( + self, + client: Client, + file_path: str, + database: str, + schema: str, + identifier: str, + table_schema: List[SchemaField], + field_delimiter: str, + fallback_timeout: Optional[float] = None, + ) -> None: + load_config = LoadJobConfig( + skip_leading_rows=1, + schema=table_schema, + field_delimiter=field_delimiter, + ) + table = self.table_ref(database, schema, identifier) + self._write_file_to_table(client, file_path, table, load_config, fallback_timeout) + + def write_file_to_table( + self, + client: Client, + file_path: str, + database: str, + schema: str, + identifier: str, + fallback_timeout: Optional[float] = None, + **kwargs, + ) -> None: + config = kwargs["kwargs"] + if "schema" in config: + config["schema"] = json.load(config["schema"]) + load_config = LoadJobConfig(**config) + table = self.table_ref(database, schema, identifier) + self._write_file_to_table(client, file_path, table, load_config, fallback_timeout) + + def _write_file_to_table( + self, + client: Client, + file_path: str, + table: TableReference, + config: LoadJobConfig, + fallback_timeout: Optional[float] = None, + ) -> None: + + with self.exception_handler("LOAD TABLE"): + with open(file_path, "rb") as f: + job = client.load_table_from_file(f, table, rewind=True, job_config=config) + + response = job.result(retry=self._retry.create_retry(fallback=fallback_timeout)) + + if response.state != "DONE": + raise DbtRuntimeError("BigQuery Timeout Exceeded") + + elif response.error_result: + message = "\n".join(error["message"].strip() for error in response.errors) + raise DbtRuntimeError(message) + + @staticmethod + def dataset_ref(database, schema): + return DatasetReference(project=database, dataset_id=schema) + + @staticmethod + def table_ref(database, schema, table_name): + dataset_ref = DatasetReference(database, schema) + return TableReference(dataset_ref, table_name) + + def get_bq_table(self, database, schema, identifier) -> Table: + """Get a bigquery table for a schema/model.""" + conn = self.get_thread_connection() + client: Client = conn.handle + # backwards compatibility: fill in with defaults if not specified + database = database or conn.credentials.database + schema = schema or conn.credentials.schema + return client.get_table(self.table_ref(database, schema, identifier)) + + def drop_dataset(self, database, schema) -> None: + conn = self.get_thread_connection() + client: Client = conn.handle + with 
self.exception_handler("drop dataset"): + client.delete_dataset( + dataset=self.dataset_ref(database, schema), + delete_contents=True, + not_found_ok=True, + retry=self._retry.create_reopen_with_deadline(conn), + ) + + def create_dataset(self, database, schema) -> Dataset: + conn = self.get_thread_connection() + client: Client = conn.handle + with self.exception_handler("create dataset"): + return client.create_dataset( + dataset=self.dataset_ref(database, schema), + exists_ok=True, + retry=self._retry.create_reopen_with_deadline(conn), + ) + + def list_dataset(self, database: str): + # The database string we get here is potentially quoted. + # Strip that off for the API call. + conn = self.get_thread_connection() + client: Client = conn.handle + with self.exception_handler("list dataset"): + # this is similar to how we have to deal with listing tables + all_datasets = client.list_datasets( + project=database.strip("`"), + max_results=10000, + retry=self._retry.create_reopen_with_deadline(conn), + ) + return [ds.dataset_id for ds in all_datasets] + + def _query_and_results( + self, + conn, + sql, + job_params, + job_id, + limit: Optional[int] = None, + ): + client: Client = conn.handle + """Query the client and wait for results.""" + # Cannot reuse job_config if destination is set and ddl is used + query_job = client.query( + query=sql, + job_config=QueryJobConfig(**job_params), + job_id=job_id, # note, this disables retry since the job_id will have been used + timeout=self._retry.create_job_creation_timeout(), + ) + if ( + query_job.location is not None + and query_job.job_id is not None + and query_job.project is not None + ): + logger.debug( + self._bq_job_link(query_job.location, query_job.project, query_job.job_id) + ) + + timeout = self._retry.create_job_execution_timeout() + try: + iterator = query_job.result(max_results=limit, timeout=timeout) + except TimeoutError: + exc = f"Operation did not complete within the designated timeout of {timeout} seconds." 
+ raise TimeoutError(exc) + return query_job, iterator + + def _labels_from_query_comment(self, comment: str) -> Dict: + try: + comment_labels = json.loads(comment) + except (TypeError, ValueError): + return {"query_comment": _sanitize_label(comment)} + return { + _sanitize_label(key): _sanitize_label(str(value)) + for key, value in comment_labels.items() + } + + +_SANITIZE_LABEL_PATTERN = re.compile(r"[^a-z0-9_-]") + +_VALIDATE_LABEL_LENGTH_LIMIT = 63 + + +def _sanitize_label(value: str) -> str: + """Return a legal value for a BigQuery label.""" + value = value.strip().lower() + value = _SANITIZE_LABEL_PATTERN.sub("_", value) + return value[:_VALIDATE_LABEL_LENGTH_LIMIT] diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/credentials.py b/dbt-bigquery/src/dbt/adapters/bigquery/credentials.py new file mode 100644 index 000000000..94d70a931 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/credentials.py @@ -0,0 +1,269 @@ +import base64 +import binascii +from dataclasses import dataclass, field +from functools import lru_cache +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +from google.auth import default +from google.auth.exceptions import DefaultCredentialsError +from google.auth.impersonated_credentials import Credentials as ImpersonatedCredentials +from google.oauth2.credentials import Credentials as GoogleCredentials +from google.oauth2.service_account import Credentials as ServiceAccountCredentials +from mashumaro import pass_through + +from dbt_common.clients.system import run_cmd +from dbt_common.dataclass_schema import ExtensibleDbtClassMixin, StrEnum +from dbt_common.exceptions import DbtConfigError, DbtRuntimeError +from dbt.adapters.contracts.connection import Credentials +from dbt.adapters.events.logging import AdapterLogger +from dbt.adapters.exceptions.connection import FailedToConnectError + + +_logger = AdapterLogger("BigQuery") + + +class Priority(StrEnum): + Interactive = "interactive" + Batch = "batch" + + +@dataclass +class DataprocBatchConfig(ExtensibleDbtClassMixin): + def __init__(self, batch_config): + self.batch_config = batch_config + + +class _BigQueryConnectionMethod(StrEnum): + OAUTH = "oauth" + OAUTH_SECRETS = "oauth-secrets" + SERVICE_ACCOUNT = "service-account" + SERVICE_ACCOUNT_JSON = "service-account-json" + + +@dataclass +class BigQueryCredentials(Credentials): + method: _BigQueryConnectionMethod = None # type: ignore + + # BigQuery allows an empty database / project, where it defers to the + # environment for the project + database: Optional[str] = None + schema: Optional[str] = None + execution_project: Optional[str] = None + quota_project: Optional[str] = None + location: Optional[str] = None + priority: Optional[Priority] = None + maximum_bytes_billed: Optional[int] = None + impersonate_service_account: Optional[str] = None + + job_retry_deadline_seconds: Optional[int] = None + job_retries: Optional[int] = 1 + job_creation_timeout_seconds: Optional[int] = None + job_execution_timeout_seconds: Optional[int] = None + + # Keyfile json creds (unicode or base 64 encoded) + keyfile: Optional[str] = None + keyfile_json: Optional[Dict[str, Any]] = None + + # oauth-secrets + token: Optional[str] = None + refresh_token: Optional[str] = None + client_id: Optional[str] = None + client_secret: Optional[str] = None + token_uri: Optional[str] = None + + dataproc_region: Optional[str] = None + dataproc_cluster_name: Optional[str] = None + gcs_bucket: Optional[str] = None + + dataproc_batch: Optional[DataprocBatchConfig] = field( + 
metadata={ + "serialization_strategy": pass_through, + }, + default=None, + ) + + scopes: Optional[Tuple[str, ...]] = ( + "https://www.googleapis.com/auth/bigquery", + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/drive", + ) + + _ALIASES = { + # 'legacy_name': 'current_name' + "project": "database", + "dataset": "schema", + "target_project": "target_database", + "target_dataset": "target_schema", + "retries": "job_retries", + "timeout_seconds": "job_execution_timeout_seconds", + } + + def __post_init__(self): + if self.keyfile_json and "private_key" in self.keyfile_json: + self.keyfile_json["private_key"] = self.keyfile_json["private_key"].replace( + "\\n", "\n" + ) + if not self.method: + raise DbtRuntimeError("Must specify authentication method") + + if not self.schema: + raise DbtRuntimeError("Must specify schema") + + @property + def type(self): + return "bigquery" + + @property + def unique_field(self): + return self.database + + def _connection_keys(self): + return ( + "method", + "database", + "execution_project", + "schema", + "location", + "priority", + "maximum_bytes_billed", + "impersonate_service_account", + "job_retry_deadline_seconds", + "job_retries", + "job_creation_timeout_seconds", + "job_execution_timeout_seconds", + "timeout_seconds", + "client_id", + "token_uri", + "dataproc_region", + "dataproc_cluster_name", + "gcs_bucket", + "dataproc_batch", + ) + + @classmethod + def __pre_deserialize__(cls, d: Dict[Any, Any]) -> Dict[Any, Any]: + # We need to inject the correct value of the database (aka project) at + # this stage, ref + # https://github.com/dbt-labs/dbt/pull/2908#discussion_r532927436. + + # `database` is an alias of `project` in BigQuery + if "database" not in d: + _, database = _create_bigquery_defaults() + d["database"] = database + # `execution_project` default to dataset/project + if "execution_project" not in d: + d["execution_project"] = d["database"] + return d + + +def set_default_credentials() -> None: + try: + run_cmd(".", ["gcloud", "--version"]) + except OSError as e: + _logger.debug(e) + msg = """ + dbt requires the gcloud SDK to be installed to authenticate with BigQuery. + Please download and install the SDK, or use a Service Account instead. 
+ + https://cloud.google.com/sdk/ + """ + raise DbtRuntimeError(msg) + + run_cmd(".", ["gcloud", "auth", "application-default", "login"]) + + +def create_google_credentials(credentials: BigQueryCredentials) -> GoogleCredentials: + if credentials.impersonate_service_account: + return _create_impersonated_credentials(credentials) + return _create_google_credentials(credentials) + + +def _create_impersonated_credentials(credentials: BigQueryCredentials) -> ImpersonatedCredentials: + if credentials.scopes and isinstance(credentials.scopes, Iterable): + target_scopes = list(credentials.scopes) + else: + target_scopes = [] + + return ImpersonatedCredentials( + source_credentials=_create_google_credentials(credentials), + target_principal=credentials.impersonate_service_account, + target_scopes=target_scopes, + ) + + +def _create_google_credentials(credentials: BigQueryCredentials) -> GoogleCredentials: + + if credentials.method == _BigQueryConnectionMethod.OAUTH: + creds, _ = _create_bigquery_defaults(scopes=credentials.scopes) + + elif credentials.method == _BigQueryConnectionMethod.SERVICE_ACCOUNT: + creds = ServiceAccountCredentials.from_service_account_file( + credentials.keyfile, scopes=credentials.scopes + ) + + elif credentials.method == _BigQueryConnectionMethod.SERVICE_ACCOUNT_JSON: + details = credentials.keyfile_json + if _is_base64(details): # type:ignore + details = _base64_to_string(details) + creds = ServiceAccountCredentials.from_service_account_info( + details, scopes=credentials.scopes + ) + + elif credentials.method == _BigQueryConnectionMethod.OAUTH_SECRETS: + creds = GoogleCredentials( + token=credentials.token, + refresh_token=credentials.refresh_token, + client_id=credentials.client_id, + client_secret=credentials.client_secret, + token_uri=credentials.token_uri, + scopes=credentials.scopes, + ) + + else: + raise FailedToConnectError(f"Invalid `method` in profile: '{credentials.method}'") + + return creds + + +@lru_cache() +def _create_bigquery_defaults(scopes=None) -> Tuple[Any, Optional[str]]: + """ + Returns (credentials, project_id) + + project_id is returned available from the environment; otherwise None + """ + # Cached, because the underlying implementation shells out, taking ~1s + try: + return default(scopes=scopes) + except DefaultCredentialsError as e: + raise DbtConfigError(f"Failed to authenticate with supplied credentials\nerror:\n{e}") + + +def _is_base64(s: Union[str, bytes]) -> bool: + """ + Checks if the given string or bytes object is valid Base64 encoded. + + Args: + s: The string or bytes object to check. + + Returns: + True if the input is valid Base64, False otherwise. 
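+
+    Examples (illustrative; they rely only on the module-level ``base64`` import):
+        >>> _is_base64(base64.b64encode(b'{"type": "service_account"}'))
+        True
+        >>> _is_base64('{"type": "service_account"}')
+        False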
+ """ + + if isinstance(s, str): + # For strings, ensure they consist only of valid Base64 characters + if not s.isascii(): + return False + # Convert to bytes for decoding + s = s.encode("ascii") + + try: + # Use the 'validate' parameter to enforce strict Base64 decoding rules + base64.b64decode(s, validate=True) + return True + except (TypeError, binascii.Error): + return False + + +def _base64_to_string(b): + return base64.b64decode(b).decode("utf-8") diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/dataset.py b/dbt-bigquery/src/dbt/adapters/bigquery/dataset.py new file mode 100644 index 000000000..a4504294a --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/dataset.py @@ -0,0 +1,47 @@ +from typing import List + +from google.cloud.bigquery import AccessEntry, Dataset + +from dbt.adapters.events.logging import AdapterLogger + + +logger = AdapterLogger("BigQuery") + + +def is_access_entry_in_dataset(dataset: Dataset, access_entry: AccessEntry) -> bool: + """Check if the access entry already exists in the dataset. + + Args: + dataset (Dataset): the dataset to be updated + access_entry (AccessEntry): the access entry to be added to the dataset + + Returns: + bool: True if entry exists in dataset, False otherwise + """ + access_entries: List[AccessEntry] = dataset.access_entries + # we can't simply check if an access entry is in the list as the current equality check + # does not work because the locally created AccessEntry can have extra properties. + for existing_entry in access_entries: + role_match = existing_entry.role == access_entry.role + entity_type_match = existing_entry.entity_type == access_entry.entity_type + property_match = existing_entry._properties.items() <= access_entry._properties.items() + if role_match and entity_type_match and property_match: + return True + return False + + +def add_access_entry_to_dataset(dataset: Dataset, access_entry: AccessEntry) -> Dataset: + """Adds an access entry to a dataset, always use access_entry_present_in_dataset to check + if the access entry already exists before calling this function. 
+ + Args: + dataset (Dataset): the dataset to be updated + access_entry (AccessEntry): the access entry to be added to the dataset + + Returns: + Dataset: the updated dataset + """ + access_entries: List[AccessEntry] = dataset.access_entries + access_entries.append(access_entry) + dataset.access_entries = access_entries + return dataset diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/impl.py b/dbt-bigquery/src/dbt/adapters/bigquery/impl.py new file mode 100644 index 000000000..51c457129 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/impl.py @@ -0,0 +1,971 @@ +from dataclasses import dataclass +from datetime import datetime +from multiprocessing.context import SpawnContext +import threading +from typing import ( + Any, + Dict, + FrozenSet, + Iterable, + List, + Optional, + Tuple, + TYPE_CHECKING, + Type, + Set, + Union, +) + +import google.api_core +import google.auth +import google.oauth2 +import google.cloud.bigquery +from google.cloud.bigquery import AccessEntry, Client, SchemaField, Table as BigQueryTable +import google.cloud.exceptions +import pytz + +from dbt_common.contracts.constraints import ( + ColumnLevelConstraint, + ConstraintType, + ModelLevelConstraint, +) +from dbt_common.dataclass_schema import dbtClassMixin +from dbt_common.events.functions import fire_event +import dbt_common.exceptions +import dbt_common.exceptions.base +from dbt_common.utils import filter_null_values +from dbt.adapters.base import ( + AdapterConfig, + BaseAdapter, + BaseRelation, + ConstraintSupport, + PythonJobHelper, + RelationType, + SchemaSearchMap, + available, +) +from dbt.adapters.base.impl import FreshnessResponse +from dbt.adapters.cache import _make_ref_key_dict +from dbt.adapters.capability import Capability, CapabilityDict, CapabilitySupport, Support +from dbt.adapters.contracts.connection import AdapterResponse +from dbt.adapters.contracts.macros import MacroResolverProtocol +from dbt.adapters.contracts.relation import RelationConfig +from dbt.adapters.events.logging import AdapterLogger +from dbt.adapters.events.types import SchemaCreation, SchemaDrop + +from dbt.adapters.bigquery.column import BigQueryColumn, get_nested_column_data_types +from dbt.adapters.bigquery.connections import BigQueryAdapterResponse, BigQueryConnectionManager +from dbt.adapters.bigquery.dataset import add_access_entry_to_dataset, is_access_entry_in_dataset +from dbt.adapters.bigquery.python_submissions import ( + ClusterDataprocHelper, + ServerlessDataProcHelper, +) +from dbt.adapters.bigquery.relation import BigQueryRelation +from dbt.adapters.bigquery.relation_configs import ( + BigQueryBaseRelationConfig, + BigQueryMaterializedViewConfig, + PartitionConfig, +) +from dbt.adapters.bigquery.utility import sql_escape + +if TYPE_CHECKING: + # Indirectly imported via agate_helper, which is lazy loaded further downfile. + # Used by mypy for earlier type hints. + import agate + + +logger = AdapterLogger("BigQuery") + +# Write dispositions for bigquery. 
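+# WRITE_APPEND adds rows to an existing destination table (used below for incremental copies);
+# WRITE_TRUNCATE overwrites the destination table (used below for full table copies).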
+WRITE_APPEND = google.cloud.bigquery.job.WriteDisposition.WRITE_APPEND +WRITE_TRUNCATE = google.cloud.bigquery.job.WriteDisposition.WRITE_TRUNCATE + +CREATE_SCHEMA_MACRO_NAME = "create_schema" +_dataset_lock = threading.Lock() + + +@dataclass +class GrantTarget(dbtClassMixin): + dataset: str + project: str + + def render(self): + return f"{self.project}.{self.dataset}" + + +@dataclass +class BigqueryConfig(AdapterConfig): + cluster_by: Optional[Union[List[str], str]] = None + partition_by: Optional[Dict[str, Any]] = None + kms_key_name: Optional[str] = None + labels: Optional[Dict[str, str]] = None + partitions: Optional[List[str]] = None + grant_access_to: Optional[List[Dict[str, str]]] = None + hours_to_expiration: Optional[int] = None + require_partition_filter: Optional[bool] = None + partition_expiration_days: Optional[int] = None + merge_update_columns: Optional[str] = None + enable_refresh: Optional[bool] = None + refresh_interval_minutes: Optional[int] = None + max_staleness: Optional[str] = None + enable_list_inference: Optional[bool] = None + intermediate_format: Optional[str] = None + + +class BigQueryAdapter(BaseAdapter): + RELATION_TYPES = { + "TABLE": RelationType.Table, + "VIEW": RelationType.View, + "MATERIALIZED_VIEW": RelationType.MaterializedView, + "EXTERNAL": RelationType.External, + } + + Relation = BigQueryRelation + Column = BigQueryColumn + ConnectionManager = BigQueryConnectionManager + + AdapterSpecificConfigs = BigqueryConfig + + CONSTRAINT_SUPPORT = { + ConstraintType.check: ConstraintSupport.NOT_SUPPORTED, + ConstraintType.not_null: ConstraintSupport.ENFORCED, + ConstraintType.unique: ConstraintSupport.NOT_SUPPORTED, + ConstraintType.primary_key: ConstraintSupport.NOT_ENFORCED, + ConstraintType.foreign_key: ConstraintSupport.NOT_ENFORCED, + } + + _capabilities: CapabilityDict = CapabilityDict( + { + Capability.TableLastModifiedMetadata: CapabilitySupport(support=Support.Full), + Capability.SchemaMetadataByRelations: CapabilitySupport(support=Support.Full), + } + ) + + def __init__(self, config, mp_context: SpawnContext) -> None: + super().__init__(config, mp_context) + self.connections: BigQueryConnectionManager = self.connections + + ### + # Implementations of abstract methods + ### + + @classmethod + def date_function(cls) -> str: + return "CURRENT_TIMESTAMP()" + + @classmethod + def is_cancelable(cls) -> bool: + return True + + def drop_relation(self, relation: BigQueryRelation) -> None: + is_cached = self._schema_is_cached(relation.database, relation.schema) + if is_cached: + self.cache_dropped(relation) + + conn = self.connections.get_thread_connection() + + table_ref = self.get_table_ref_from_relation(relation) + + # mimic "drop if exists" functionality that's ubiquitous in most sql implementations + conn.handle.delete_table(table_ref, not_found_ok=True) + + def truncate_relation(self, relation: BigQueryRelation) -> None: + raise dbt_common.exceptions.base.NotImplementedError( + "`truncate` is not implemented for this adapter!" 
+ ) + + def rename_relation( + self, from_relation: BigQueryRelation, to_relation: BigQueryRelation + ) -> None: + conn = self.connections.get_thread_connection() + client = conn.handle + + from_table_ref = self.get_table_ref_from_relation(from_relation) + from_table = client.get_table(from_table_ref) + if ( + from_table.table_type == "VIEW" + or from_relation.type == RelationType.View + or to_relation.type == RelationType.View + ): + raise dbt_common.exceptions.DbtRuntimeError( + "Renaming of views is not currently supported in BigQuery" + ) + + to_table_ref = self.get_table_ref_from_relation(to_relation) + + self.cache_renamed(from_relation, to_relation) + client.copy_table(from_table_ref, to_table_ref) + client.delete_table(from_table_ref) + + @available + def list_schemas(self, database: str) -> List[str]: + return self.connections.list_dataset(database) + + @available.parse(lambda *a, **k: False) + def check_schema_exists(self, database: str, schema: str) -> bool: + conn = self.connections.get_thread_connection() + client = conn.handle + + dataset_ref = self.connections.dataset_ref(database, schema) + # try to do things with the dataset. If it doesn't exist it will 404. + # we have to do it this way to handle underscore-prefixed datasets, + # which appear in neither the information_schema.schemata view nor the + # list_datasets method. + try: + next(iter(client.list_tables(dataset_ref, max_results=1))) + except StopIteration: + pass + except google.api_core.exceptions.NotFound: + # the schema does not exist + return False + return True + + @available.parse(lambda *a, **k: {}) + @classmethod + def nest_column_data_types( + cls, + columns: Dict[str, Dict[str, Any]], + constraints: Optional[Dict[str, str]] = None, + ) -> Dict[str, Dict[str, Optional[str]]]: + return get_nested_column_data_types(columns, constraints) + + def get_columns_in_relation(self, relation: BigQueryRelation) -> List[BigQueryColumn]: + try: + table = self.connections.get_bq_table( + database=relation.database, schema=relation.schema, identifier=relation.identifier + ) + return self._get_dbt_columns_from_bq_table(table) + + except (ValueError, google.cloud.exceptions.NotFound) as e: + logger.debug("get_columns_in_relation error: {}".format(e)) + return [] + + @available.parse(lambda *a, **k: []) + def add_time_ingestion_partition_column(self, partition_by, columns) -> List[BigQueryColumn]: + """Add time ingestion partition column to columns list""" + columns.append( + self.Column( + partition_by.insertable_time_partitioning_field(), + partition_by.data_type, + None, + "NULLABLE", + ) + ) + return columns + + def expand_column_types(self, goal: BigQueryRelation, current: BigQueryRelation) -> None: + # This is a no-op on BigQuery + pass + + def expand_target_column_types( + self, from_relation: BigQueryRelation, to_relation: BigQueryRelation + ) -> None: + # This is a no-op on BigQuery + pass + + @available.parse_list + def list_relations_without_caching( + self, schema_relation: BigQueryRelation + ) -> List[BigQueryRelation]: + connection = self.connections.get_thread_connection() + client = connection.handle + + dataset_ref = self.connections.dataset_ref( + schema_relation.database, schema_relation.schema + ) + + all_tables = client.list_tables( + dataset_ref, + # BigQuery paginates tables by alphabetizing them, and using + # the name of the last table on a page as the key for the + # next page. If that key table gets dropped before we run + # list_relations, then this will 404. 
So, we avoid this + # situation by making the page size sufficiently large. + # see: https://github.com/dbt-labs/dbt/issues/726 + # TODO: cache the list of relations up front, and then we + # won't need to do this + max_results=100000, + ) + + # This will 404 if the dataset does not exist. This behavior mirrors + # the implementation of list_relations for other adapters + try: + return [self._bq_table_to_relation(table) for table in all_tables] # type: ignore[misc] + except google.api_core.exceptions.NotFound: + return [] + except google.api_core.exceptions.Forbidden as exc: + logger.debug("list_relations_without_caching error: {}".format(str(exc))) + return [] + + def get_relation( + self, database: str, schema: str, identifier: str + ) -> Optional[BigQueryRelation]: + if self._schema_is_cached(database, schema): + # if it's in the cache, use the parent's model of going through + # the relations cache and picking out the relation + return super().get_relation(database=database, schema=schema, identifier=identifier) + + try: + table = self.connections.get_bq_table(database, schema, identifier) + except google.api_core.exceptions.NotFound: + table = None + return self._bq_table_to_relation(table) + + # BigQuery added SQL support for 'create schema' + 'drop schema' in March 2021 + # Unfortunately, 'drop schema' runs into permissions issues during tests + # Most of the value here comes from user overrides of 'create_schema' + + # TODO: the code below is copy-pasted from SQLAdapter.create_schema. Is there a better way? + def create_schema(self, relation: BigQueryRelation) -> None: + # use SQL 'create schema' + relation = relation.without_identifier() + + fire_event(SchemaCreation(relation=_make_ref_key_dict(relation))) + kwargs = { + "relation": relation, + } + self.execute_macro(CREATE_SCHEMA_MACRO_NAME, kwargs=kwargs) + self.commit_if_has_connection() + # we can't update the cache here, as if the schema already existed we + # don't want to (incorrectly) say that it's empty + + def drop_schema(self, relation: BigQueryRelation) -> None: + # still use a client method, rather than SQL 'drop schema ... 
cascade' + database = relation.database + schema = relation.schema + logger.debug('Dropping schema "{}.{}".', database, schema) # in lieu of SQL + fire_event(SchemaDrop(relation=_make_ref_key_dict(relation))) + self.connections.drop_dataset(database, schema) + self.cache.drop_schema(database, schema) + + @classmethod + def quote(cls, identifier: str) -> str: + return "`{}`".format(identifier) + + @classmethod + def convert_text_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + return "string" + + @classmethod + def convert_number_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + import agate + + decimals = agate_table.aggregate(agate.MaxPrecision(col_idx)) # type: ignore[attr-defined] + return "float64" if decimals else "int64" + + @classmethod + def convert_integer_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + return "int64" + + @classmethod + def convert_boolean_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + return "bool" + + @classmethod + def convert_datetime_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + return "datetime" + + @classmethod + def convert_date_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + return "date" + + @classmethod + def convert_time_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + return "time" + + ### + # Implementation details + ### + def _make_match_kwargs(self, database: str, schema: str, identifier: str) -> Dict[str, str]: + return filter_null_values( + { + "database": database, + "identifier": identifier, + "schema": schema, + } + ) + + def _get_dbt_columns_from_bq_table(self, table) -> List[BigQueryColumn]: + "Translates BQ SchemaField dicts into dbt BigQueryColumn objects" + + columns = [] + for col in table.schema: + # BigQuery returns type labels that are not valid type specifiers + dtype = self.Column.translate_type(col.field_type) + column = self.Column(col.name, dtype, col.fields, col.mode) + columns.append(column) + + return columns + + def _agate_to_schema( + self, agate_table: "agate.Table", column_override: Dict[str, str] + ) -> List[SchemaField]: + """Convert agate.Table with column names to a list of bigquery schemas.""" + bq_schema = [] + for idx, col_name in enumerate(agate_table.column_names): + inferred_type = self.convert_agate_type(agate_table, idx) + type_ = column_override.get(col_name, inferred_type) + bq_schema.append(SchemaField(col_name, type_)) + return bq_schema + + @available.parse(lambda *a, **k: "") + def copy_table(self, source, destination, materialization): + if materialization == "incremental": + write_disposition = WRITE_APPEND + elif materialization == "table": + write_disposition = WRITE_TRUNCATE + else: + raise dbt_common.exceptions.CompilationError( + 'Copy table materialization must be "copy" or "table", but ' + f"config.get('copy_materialization', 'table') was " + f"{materialization}" + ) + + self.connections.copy_bq_table(source, destination, write_disposition) + + return "COPY TABLE with materialization: {}".format(materialization) + + @available.parse(lambda *a, **k: []) + def get_column_schema_from_query(self, sql: str) -> List[BigQueryColumn]: + """Get a list of the column names and data types from the given sql. + + :param str sql: The sql to execute. 
+ :return: List[BigQueryColumn] + """ + _, iterator = self.connections.raw_execute(sql) + columns = [self.Column.create_from_field(field) for field in iterator.schema] + flattened_columns = [] + for column in columns: + flattened_columns += column.flatten() + return flattened_columns + + @available.parse(lambda *a, **k: False) + def get_columns_in_select_sql(self, select_sql: str) -> List[BigQueryColumn]: + try: + conn = self.connections.get_thread_connection() + client = conn.handle + query_job, iterator = self.connections.raw_execute(select_sql) + query_table = client.get_table(query_job.destination) + return self._get_dbt_columns_from_bq_table(query_table) + + except (ValueError, google.cloud.exceptions.NotFound) as e: + logger.debug("get_columns_in_select_sql error: {}".format(e)) + return [] + + def _bq_table_to_relation(self, bq_table) -> Union[BigQueryRelation, None]: + if bq_table is None: + return None + + return self.Relation.create( + database=bq_table.project, + schema=bq_table.dataset_id, + identifier=bq_table.table_id, + quote_policy={"schema": True, "identifier": True}, + type=self.RELATION_TYPES.get(bq_table.table_type, RelationType.External), + ) + + @classmethod + def warning_on_hooks(cls, hook_type): + msg = "{} is not supported in bigquery and will be ignored" + logger.info(msg) + + @available + def add_query(self, sql, auto_begin=True, bindings=None, abridge_sql_log=False): + if self.nice_connection_name() in ["on-run-start", "on-run-end"]: + self.warning_on_hooks(self.nice_connection_name()) + else: + raise dbt_common.exceptions.base.NotImplementedError( + "`add_query` is not implemented for this adapter!" + ) + + ### + # Special bigquery adapter methods + ### + + @staticmethod + def _partitions_match(table, conf_partition: Optional[PartitionConfig]) -> bool: + """ + Check if the actual and configured partitions for a table are a match. + BigQuery tables can be replaced if: + - Both tables are not partitioned, OR + - Both tables are partitioned using the exact same configs + + If there is a mismatch, then the table cannot be replaced directly. + """ + is_partitioned = table.range_partitioning or table.time_partitioning + + if not is_partitioned and not conf_partition: + return True + elif conf_partition and table.time_partitioning is not None: + table_field = ( + table.time_partitioning.field.lower() if table.time_partitioning.field else None + ) + + table_granularity = table.partitioning_type + conf_table_field = conf_partition.field + return ( + table_field == conf_table_field.lower() + or (conf_partition.time_ingestion_partitioning and table_field is not None) + ) and table_granularity.lower() == conf_partition.granularity.lower() + elif conf_partition and table.range_partitioning is not None: + dest_part = table.range_partitioning + conf_part = conf_partition.range or {} + + return ( + dest_part.field == conf_partition.field + and dest_part.range_.start == conf_part.get("start") + and dest_part.range_.end == conf_part.get("end") + and dest_part.range_.interval == conf_part.get("interval") + ) + else: + return False + + @staticmethod + def _clusters_match(table, conf_cluster) -> bool: + """ + Check if the actual and configured clustering columns for a table + are a match. BigQuery tables can be replaced if clustering columns + match exactly. 
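+
+        Illustrative example (``SimpleNamespace`` stands in for a BigQuery table object):
+            >>> from types import SimpleNamespace
+            >>> table = SimpleNamespace(clustering_fields=["customer_id"])
+            >>> BigQueryAdapter._clusters_match(table, "customer_id")
+            True
+            >>> BigQueryAdapter._clusters_match(table, ["customer_id", "order_date"])
+            False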
+ """ + if isinstance(conf_cluster, str): + conf_cluster = [conf_cluster] + + return table.clustering_fields == conf_cluster + + @available.parse(lambda *a, **k: True) + def is_replaceable( + self, relation, conf_partition: Optional[PartitionConfig], conf_cluster + ) -> bool: + """ + Check if a given partition and clustering column spec for a table + can replace an existing relation in the database. BigQuery does not + allow tables to be replaced with another table that has a different + partitioning spec. This method returns True if the given config spec is + identical to that of the existing table. + """ + if not relation: + return True + + try: + table = self.connections.get_bq_table( + database=relation.database, schema=relation.schema, identifier=relation.identifier + ) + except google.cloud.exceptions.NotFound: + return True + + return all( + ( + self._partitions_match(table, conf_partition), + self._clusters_match(table, conf_cluster), + ) + ) + + @available + def parse_partition_by(self, raw_partition_by: Any) -> Optional[PartitionConfig]: + """ + dbt v0.16.0 expects `partition_by` to be a dictionary where previously + it was a string. Check the type of `partition_by`, raise error + or warning if string, and attempt to convert to dict. + """ + return PartitionConfig.parse(raw_partition_by) + + def get_table_ref_from_relation(self, relation: BaseRelation): + return self.connections.table_ref(relation.database, relation.schema, relation.identifier) + + def _update_column_dict(self, bq_column_dict, dbt_columns, parent=""): + """ + Helper function to recursively traverse the schema of a table in the + update_column_descriptions function below. + + bq_column_dict should be a dict as obtained by the to_api_repr() + function of a SchemaField object. 
+ """ + if parent: + dotted_column_name = "{}.{}".format(parent, bq_column_dict["name"]) + else: + dotted_column_name = bq_column_dict["name"] + + if dotted_column_name in dbt_columns: + column_config = dbt_columns[dotted_column_name] + bq_column_dict["description"] = column_config.get("description") + if bq_column_dict["type"] != "RECORD": + bq_column_dict["policyTags"] = {"names": column_config.get("policy_tags", list())} + + new_fields = [] + for child_col_dict in bq_column_dict.get("fields", list()): + new_child_column_dict = self._update_column_dict( + child_col_dict, dbt_columns, parent=dotted_column_name + ) + new_fields.append(new_child_column_dict) + + bq_column_dict["fields"] = new_fields + + return bq_column_dict + + @available.parse_none + def update_columns(self, relation, columns): + if len(columns) == 0: + return + + conn = self.connections.get_thread_connection() + table_ref = self.get_table_ref_from_relation(relation) + table = conn.handle.get_table(table_ref) + + new_schema = [] + for bq_column in table.schema: + bq_column_dict = bq_column.to_api_repr() + new_bq_column_dict = self._update_column_dict(bq_column_dict, columns) + new_schema.append(SchemaField.from_api_repr(new_bq_column_dict)) + + new_table = google.cloud.bigquery.Table(table_ref, schema=new_schema) + conn.handle.update_table(new_table, ["schema"]) + + @available.parse_none + def update_table_description( + self, database: str, schema: str, identifier: str, description: str + ): + conn = self.connections.get_thread_connection() + client = conn.handle + + table_ref = self.connections.table_ref(database, schema, identifier) + table = client.get_table(table_ref) + table.description = description + client.update_table(table, ["description"]) + + @available.parse_none + def alter_table_add_columns(self, relation, columns): + logger.debug('Adding columns ({}) to table {}".'.format(columns, relation)) + + conn = self.connections.get_thread_connection() + client = conn.handle + + table_ref = self.get_table_ref_from_relation(relation) + table = client.get_table(table_ref) + + new_columns = [col.column_to_bq_schema() for col in columns] + new_schema = table.schema + new_columns + + new_table = google.cloud.bigquery.Table(table_ref, schema=new_schema) + client.update_table(new_table, ["schema"]) + + @available.parse_none + def load_dataframe( + self, + database: str, + schema: str, + table_name: str, + agate_table: "agate.Table", + column_override: Dict[str, str], + field_delimiter: str, + ) -> None: + connection = self.connections.get_thread_connection() + client: Client = connection.handle + table_schema = self._agate_to_schema(agate_table, column_override) + file_path = agate_table.original_abspath # type: ignore + + self.connections.write_dataframe_to_table( + client, + file_path, + database, + schema, + table_name, + table_schema, + field_delimiter, + fallback_timeout=300, + ) + + @available.parse_none + def upload_file( + self, + local_file_path: str, + database: str, + table_schema: str, + table_name: str, + **kwargs, + ) -> None: + connection = self.connections.get_thread_connection() + client: Client = connection.handle + + self.connections.write_file_to_table( + client, + local_file_path, + database, + table_schema, + table_name, + fallback_timeout=300, + **kwargs, + ) + + @classmethod + def _catalog_filter_table( + cls, table: "agate.Table", used_schemas: FrozenSet[Tuple[str, str]] + ) -> "agate.Table": + table = table.rename( + column_names={col.name: col.name.replace("__", ":") for col in table.columns} + 
) + return super()._catalog_filter_table(table, used_schemas) + + def _get_catalog_schemas(self, relation_config: Iterable[RelationConfig]) -> SchemaSearchMap: + candidates = super()._get_catalog_schemas(relation_config) + db_schemas: Dict[str, Set[str]] = {} + result = SchemaSearchMap() + + for candidate, schemas in candidates.items(): + database = candidate.database + if database not in db_schemas: + db_schemas[database] = set(self.list_schemas(database)) + if candidate.schema in db_schemas[database]: + result[candidate] = schemas + else: + logger.debug( + "Skipping catalog for {}.{} - schema does not exist".format( + database, candidate.schema + ) + ) + return result + + def calculate_freshness_from_metadata( + self, + source: BaseRelation, + macro_resolver: Optional[MacroResolverProtocol] = None, + ) -> Tuple[Optional[AdapterResponse], FreshnessResponse]: + conn = self.connections.get_thread_connection() + client: Client = conn.handle + + table_ref = self.get_table_ref_from_relation(source) + table = client.get_table(table_ref) + snapshot = datetime.now(tz=pytz.UTC) + + freshness = FreshnessResponse( + max_loaded_at=table.modified, + snapshotted_at=snapshot, + age=(snapshot - table.modified).total_seconds(), + ) + + return None, freshness + + @available.parse(lambda *a, **k: {}) + def get_common_options( + self, config: Dict[str, Any], node: Dict[str, Any], temporary: bool = False + ) -> Dict[str, Any]: + opts = {} + + if (config.get("hours_to_expiration") is not None) and (not temporary): + expiration = f'TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {config.get("hours_to_expiration")} hour)' + opts["expiration_timestamp"] = expiration + + if config.persist_relation_docs() and "description" in node: # type: ignore[attr-defined] + description = sql_escape(node["description"]) + opts["description"] = '"""{}"""'.format(description) + + if config.get("labels"): + labels = config.get("labels", {}) + opts["labels"] = list(labels.items()) # type: ignore[assignment] + + return opts + + @available.parse(lambda *a, **k: {}) + def get_table_options( + self, config: Dict[str, Any], node: Dict[str, Any], temporary: bool + ) -> Dict[str, Any]: + opts = self.get_common_options(config, node, temporary) + + if config.get("kms_key_name") is not None: + opts["kms_key_name"] = f"'{config.get('kms_key_name')}'" + + if temporary: + opts["expiration_timestamp"] = "TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 12 hour)" + else: + # It doesn't apply the `require_partition_filter` option for a temporary table + # so that we avoid the error by not specifying a partition with a temporary table + # in the incremental model. 
+ if ( + config.get("require_partition_filter") is not None + and config.get("partition_by") is not None + ): + opts["require_partition_filter"] = config.get("require_partition_filter") + if config.get("partition_expiration_days") is not None: + opts["partition_expiration_days"] = config.get("partition_expiration_days") + + return opts + + @available.parse(lambda *a, **k: {}) + def get_view_options(self, config: Dict[str, Any], node: Dict[str, Any]) -> Dict[str, Any]: + opts = self.get_common_options(config, node) + return opts + + @available.parse(lambda *a, **k: True) + def get_bq_table(self, relation: BigQueryRelation) -> Optional[BigQueryTable]: + try: + table = self.connections.get_bq_table( + relation.database, relation.schema, relation.identifier + ) + except google.cloud.exceptions.NotFound: + table = None + return table + + @available.parse(lambda *a, **k: True) + def describe_relation( + self, relation: BigQueryRelation + ) -> Optional[BigQueryBaseRelationConfig]: + if relation.type == RelationType.MaterializedView: + bq_table = self.get_bq_table(relation) + parser = BigQueryMaterializedViewConfig + else: + raise dbt_common.exceptions.DbtRuntimeError( + f"The method `BigQueryAdapter.describe_relation` is not implemented " + f"for the relation type: {relation.type}" + ) + if bq_table: + return parser.from_bq_table(bq_table) + return None + + @available.parse_none + def grant_access_to(self, entity, entity_type, role, grant_target_dict) -> None: + """ + Given an entity, grants it access to a dataset. + """ + conn: BigQueryConnectionManager = self.connections.get_thread_connection() + client = conn.handle + GrantTarget.validate(grant_target_dict) + grant_target = GrantTarget.from_dict(grant_target_dict) + if entity_type == "view": + entity = self.get_table_ref_from_relation(entity).to_api_repr() + with _dataset_lock: + dataset_ref = self.connections.dataset_ref(grant_target.project, grant_target.dataset) + dataset = client.get_dataset(dataset_ref) + access_entry = AccessEntry(role, entity_type, entity) + # only perform update if access entry not in dataset + if is_access_entry_in_dataset(dataset, access_entry): + logger.warning(f"Access entry {access_entry} " f"already exists in dataset") + else: + dataset = add_access_entry_to_dataset(dataset, access_entry) + client.update_dataset(dataset, ["access_entries"]) + + @available.parse_none + def get_dataset_location(self, relation): + conn = self.connections.get_thread_connection() + client = conn.handle + dataset_ref = self.connections.dataset_ref(relation.project, relation.dataset) + dataset = client.get_dataset(dataset_ref) + return dataset.location + + def get_rows_different_sql( + self, + relation_a: BigQueryRelation, + relation_b: BigQueryRelation, + column_names: Optional[List[str]] = None, + except_operator="EXCEPT DISTINCT", + ) -> str: + return super().get_rows_different_sql( + relation_a=relation_a, + relation_b=relation_b, + column_names=column_names, + except_operator=except_operator, + ) + + def timestamp_add_sql(self, add_to: str, number: int = 1, interval: str = "hour") -> str: + return f"timestamp_add({add_to}, interval {number} {interval})" + + def string_add_sql( + self, + add_to: str, + value: str, + location="append", + ) -> str: + if location == "append": + return f"concat({add_to}, '{value}')" + elif location == "prepend": + return f"concat('{value}', {add_to})" + else: + raise dbt_common.exceptions.DbtRuntimeError( + f'Got an unexpected location value of "{location}"' + ) + + # This is used by the test suite + 
def run_sql_for_tests(self, sql, fetch, conn=None): + """For the testing framework. + Run an SQL query on a bigquery adapter. No cursors, transactions, + etc. to worry about""" + + do_fetch = fetch != "None" + _, res = self.execute(sql, fetch=do_fetch) + + # convert dataframe to matrix-ish repr + if fetch == "one": + return res[0] + else: + return list(res) + + def generate_python_submission_response(self, submission_result) -> BigQueryAdapterResponse: + return BigQueryAdapterResponse(_message="OK") + + @property + def default_python_submission_method(self) -> str: + return "serverless" + + @property + def python_submission_helpers(self) -> Dict[str, Type[PythonJobHelper]]: + return { + "cluster": ClusterDataprocHelper, + "serverless": ServerlessDataProcHelper, + } + + @available + @classmethod + def render_raw_columns_constraints(cls, raw_columns: Dict[str, Dict[str, Any]]) -> List: + rendered_constraints: Dict[str, str] = {} + for raw_column in raw_columns.values(): + for con in raw_column.get("constraints", None): + constraint = cls._parse_column_constraint(con) + rendered_constraint = cls.process_parsed_constraint( + constraint, cls.render_column_constraint + ) + + if rendered_constraint: + column_name = raw_column["name"] + if column_name not in rendered_constraints: + rendered_constraints[column_name] = rendered_constraint + else: + rendered_constraints[column_name] += f" {rendered_constraint}" + + nested_columns = cls.nest_column_data_types(raw_columns, rendered_constraints) + rendered_column_constraints = [ + f"{cls.quote(column['name']) if column.get('quote') else column['name']} {column['data_type']}" + for column in nested_columns.values() + ] + return rendered_column_constraints + + @classmethod + def render_column_constraint(cls, constraint: ColumnLevelConstraint) -> Optional[str]: + c = super().render_column_constraint(constraint) + if ( + constraint.type == ConstraintType.primary_key + or constraint.type == ConstraintType.foreign_key + ): + return f"{c} not enforced" if c else None + return c + + @classmethod + def render_model_constraint(cls, constraint: ModelLevelConstraint) -> Optional[str]: + c = super().render_model_constraint(constraint) + if ( + constraint.type == ConstraintType.primary_key + or constraint.type == ConstraintType.foreign_key + ): + return f"{c} not enforced" if c else None + + return c + + def debug_query(self): + """Override for DebugTask method""" + self.execute("select 1 as id") + + def validate_sql(self, sql: str) -> AdapterResponse: + """Submit the given SQL to the engine for validation, but not execution. + + This submits the query with the `dry_run` flag set True. 
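+        (A dry-run job lets BigQuery validate the query and estimate the bytes it
+        would process without actually executing it.)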
+ + :param str sql: The sql to validate + """ + return self.connections.dry_run(sql) diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/python_submissions.py b/dbt-bigquery/src/dbt/adapters/bigquery/python_submissions.py new file mode 100644 index 000000000..cd7f7d86f --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/python_submissions.py @@ -0,0 +1,165 @@ +from typing import Dict, Union +import uuid + +from google.cloud.dataproc_v1 import Batch, CreateBatchRequest, Job, RuntimeConfig + +from dbt.adapters.base import PythonJobHelper +from dbt.adapters.events.logging import AdapterLogger +from google.protobuf.json_format import ParseDict + +from dbt.adapters.bigquery.credentials import BigQueryCredentials, DataprocBatchConfig +from dbt.adapters.bigquery.clients import ( + create_dataproc_batch_controller_client, + create_dataproc_job_controller_client, + create_gcs_client, +) +from dbt.adapters.bigquery.retry import RetryFactory + + +_logger = AdapterLogger("BigQuery") + + +_DEFAULT_JAR_FILE_URI = "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.13-0.34.0.jar" + + +class _BaseDataProcHelper(PythonJobHelper): + def __init__(self, parsed_model: Dict, credentials: BigQueryCredentials) -> None: + # validate all additional stuff for python is set + for required_config in ["dataproc_region", "gcs_bucket"]: + if not getattr(credentials, required_config): + raise ValueError( + f"Need to supply {required_config} in profile to submit python job" + ) + + self._storage_client = create_gcs_client(credentials) + self._project = credentials.execution_project + self._region = credentials.dataproc_region + + schema = parsed_model["schema"] + identifier = parsed_model["alias"] + self._model_file_name = f"{schema}/{identifier}.py" + self._gcs_bucket = credentials.gcs_bucket + self._gcs_path = f"gs://{credentials.gcs_bucket}/{self._model_file_name}" + + # set retry policy, default to timeout after 24 hours + retry = RetryFactory(credentials) + self._polling_retry = retry.create_polling( + model_timeout=parsed_model["config"].get("timeout") + ) + + def _write_to_gcs(self, compiled_code: str) -> None: + bucket = self._storage_client.get_bucket(self._gcs_bucket) + blob = bucket.blob(self._model_file_name) + blob.upload_from_string(compiled_code) + + +class ClusterDataprocHelper(_BaseDataProcHelper): + def __init__(self, parsed_model: Dict, credentials: BigQueryCredentials) -> None: + super().__init__(parsed_model, credentials) + self._job_controller_client = create_dataproc_job_controller_client(credentials) + self._cluster_name = parsed_model["config"].get( + "dataproc_cluster_name", credentials.dataproc_cluster_name + ) + + if not self._cluster_name: + raise ValueError( + "Need to supply dataproc_cluster_name in profile or config to submit python job with cluster submission method" + ) + + def submit(self, compiled_code: str) -> Job: + _logger.debug(f"Submitting cluster job to: {self._cluster_name}") + + self._write_to_gcs(compiled_code) + + request = { + "project_id": self._project, + "region": self._region, + "job": { + "placement": {"cluster_name": self._cluster_name}, + "pyspark_job": { + "main_python_file_uri": self._gcs_path, + }, + }, + } + + # submit the job + operation = self._job_controller_client.submit_job_as_operation(request) + + # wait for the job to complete + response: Job = operation.result(polling=self._polling_retry) + + if response.status.state == 6: + raise ValueError(response.status.details) + + return response + + +class ServerlessDataProcHelper(_BaseDataProcHelper): 
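+    """Submit python models as Dataproc Serverless batch jobs rather than to a standing cluster."""
+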
+ def __init__(self, parsed_model: Dict, credentials: BigQueryCredentials) -> None: + super().__init__(parsed_model, credentials) + self._batch_controller_client = create_dataproc_batch_controller_client(credentials) + self._batch_id = parsed_model["config"].get("batch_id", str(uuid.uuid4())) + self._jar_file_uri = parsed_model["config"].get("jar_file_uri", _DEFAULT_JAR_FILE_URI) + self._dataproc_batch = credentials.dataproc_batch + + def submit(self, compiled_code: str) -> Batch: + _logger.debug(f"Submitting batch job with id: {self._batch_id}") + + self._write_to_gcs(compiled_code) + + request = CreateBatchRequest( + parent=f"projects/{self._project}/locations/{self._region}", + batch=self._create_batch(), + batch_id=self._batch_id, + ) + + # submit the batch + operation = self._batch_controller_client.create_batch(request) + + # wait for the batch to complete + response: Batch = operation.result(polling=self._polling_retry) + + return response + + def _create_batch(self) -> Batch: + # create the Dataproc Serverless job config + # need to pin dataproc version to 1.1 as it now defaults to 2.0 + # https://cloud.google.com/dataproc-serverless/docs/concepts/properties + # https://cloud.google.com/dataproc-serverless/docs/reference/rest/v1/projects.locations.batches#runtimeconfig + batch = Batch( + { + "runtime_config": RuntimeConfig( + version="1.1", + properties={ + "spark.executor.instances": "2", + }, + ), + "pyspark_batch": { + "main_python_file_uri": self._gcs_path, + "jar_file_uris": [self._jar_file_uri], + }, + } + ) + + # Apply configuration from dataproc_batch key, possibly overriding defaults. + if self._dataproc_batch: + batch = _update_batch_from_config(self._dataproc_batch, batch) + + return batch + + +def _update_batch_from_config( + config_dict: Union[Dict, DataprocBatchConfig], target: Batch +) -> Batch: + try: + # updates in place + ParseDict(config_dict, target._pb) + except Exception as e: + docurl = ( + "https://cloud.google.com/dataproc-serverless/docs/reference/rpc/google.cloud.dataproc.v1" + "#google.cloud.dataproc.v1.Batch" + ) + raise ValueError( + f"Unable to parse dataproc_batch as valid batch specification. See {docurl}. 
{str(e)}" + ) from e + return target diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation.py new file mode 100644 index 000000000..037761918 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation.py @@ -0,0 +1,208 @@ +from dataclasses import dataclass, field +from itertools import chain, islice +from typing import FrozenSet, Optional, TypeVar + +from dbt_common.exceptions import CompilationError +from dbt_common.utils.dict import filter_null_values +from dbt.adapters.base.relation import ( + BaseRelation, + ComponentName, + InformationSchema, + EventTimeFilter, +) +from dbt.adapters.contracts.relation import RelationConfig, RelationType +from dbt.adapters.relation_configs import RelationConfigChangeAction + +from dbt.adapters.bigquery.relation_configs import ( + BigQueryClusterConfigChange, + BigQueryMaterializedViewConfig, + BigQueryMaterializedViewConfigChangeset, + BigQueryOptionsConfigChange, + BigQueryPartitionConfigChange, +) + + +Self = TypeVar("Self", bound="BigQueryRelation") + + +@dataclass(frozen=True, eq=False, repr=False) +class BigQueryRelation(BaseRelation): + quote_character: str = "`" + location: Optional[str] = None + require_alias: bool = False + + renameable_relations: FrozenSet[RelationType] = field( + default_factory=lambda: frozenset( + { + RelationType.Table, + } + ) + ) + + replaceable_relations: FrozenSet[RelationType] = field( + default_factory=lambda: frozenset( + { + RelationType.View, + RelationType.Table, + } + ) + ) + + def matches( + self, + database: Optional[str] = None, + schema: Optional[str] = None, + identifier: Optional[str] = None, + ) -> bool: + search = filter_null_values( + { + ComponentName.Database: database, + ComponentName.Schema: schema, + ComponentName.Identifier: identifier, + } + ) + + if not search: + # nothing was passed in + pass + + for k, v in search.items(): + if not self._is_exactish_match(k, v): + return False + + return True + + @property + def project(self): + return self.database + + @property + def dataset(self): + return self.schema + + @classmethod + def materialized_view_from_relation_config( + cls, relation_config: RelationConfig + ) -> BigQueryMaterializedViewConfig: + return BigQueryMaterializedViewConfig.from_relation_config(relation_config) + + @classmethod + def materialized_view_config_changeset( + cls, + existing_materialized_view: BigQueryMaterializedViewConfig, + relation_config: RelationConfig, + ) -> Optional[BigQueryMaterializedViewConfigChangeset]: + config_change_collection = BigQueryMaterializedViewConfigChangeset() + new_materialized_view = cls.materialized_view_from_relation_config(relation_config) + + if new_materialized_view.options != existing_materialized_view.options: + config_change_collection.options = BigQueryOptionsConfigChange( + action=RelationConfigChangeAction.alter, + context=new_materialized_view.options, + ) + + if new_materialized_view.partition != existing_materialized_view.partition: + # the existing PartitionConfig is not hashable, but since we need to do + # a full refresh either way, we don't need to provide a context + config_change_collection.partition = BigQueryPartitionConfigChange( + action=RelationConfigChangeAction.alter, + ) + + if new_materialized_view.cluster != existing_materialized_view.cluster: + config_change_collection.cluster = BigQueryClusterConfigChange( + action=RelationConfigChangeAction.alter, + context=new_materialized_view.cluster, + ) + + if config_change_collection.has_changes: + return 
config_change_collection + return None + + def information_schema(self, identifier: Optional[str] = None) -> "BigQueryInformationSchema": + return BigQueryInformationSchema.from_relation(self, identifier) + + def _render_event_time_filtered(self, event_time_filter: EventTimeFilter) -> str: + """ + Returns "" if start and end are both None + """ + filter = "" + if event_time_filter.start and event_time_filter.end: + filter = f"cast({event_time_filter.field_name} as timestamp) >= '{event_time_filter.start}' and cast({event_time_filter.field_name} as timestamp) < '{event_time_filter.end}'" + elif event_time_filter.start: + filter = ( + f"cast({event_time_filter.field_name} as timestamp) >= '{event_time_filter.start}'" + ) + elif event_time_filter.end: + filter = ( + f"cast({event_time_filter.field_name} as timestamp) < '{event_time_filter.end}'" + ) + + return filter + + +@dataclass(frozen=True, eq=False, repr=False) +class BigQueryInformationSchema(InformationSchema): + quote_character: str = "`" + location: Optional[str] = None + + @classmethod + def get_include_policy(cls, relation, information_schema_view): + schema = True + if information_schema_view in ("SCHEMATA", "SCHEMATA_OPTIONS", None): + schema = False + + identifier = True + if information_schema_view == "__TABLES__": + identifier = False + + # In the future, let's refactor so that location/region can also be a + # ComponentName, so that we can have logic like: + # + # region = False + # if information_schema_view == "OBJECT_PRIVILEGES": + # region = True + + return relation.include_policy.replace( + schema=schema, + identifier=identifier, + ) + + def get_region_identifier(self) -> str: + region_id = f"region-{self.location}" + return self.quoted(region_id) + + @classmethod + def from_relation(cls, relation, information_schema_view): + info_schema = super().from_relation(relation, information_schema_view) + if information_schema_view == "OBJECT_PRIVILEGES": + # OBJECT_PRIVILEGES require a location. If the location is blank there is nothing + # the user can do about it. 
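+            # (The location is rendered into the path as a region qualifier, e.g. `region-us`;
+            # see `get_region_identifier` above.)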
+ if not relation.location: + msg = ( + f'No location/region found when trying to retrieve "{information_schema_view}"' + ) + raise CompilationError(msg) + info_schema = info_schema.incorporate(location=relation.location) + return info_schema + + # override this method to interpolate the region identifier, + # if a location is required for this information schema view + def _render_iterator(self): + iterator = super()._render_iterator() + if self.location: + return chain( + islice(iterator, 1), # project, + [(None, self.get_region_identifier())], # region id, + islice(iterator, 1, None), # remaining components + ) + else: + return iterator + + def replace(self, **kwargs): + if "information_schema_view" in kwargs: + view = kwargs["information_schema_view"] + # we also need to update the include policy, unless the caller did + # in which case it's their problem + if "include_policy" not in kwargs: + kwargs["include_policy"] = self.get_include_policy(self, view) + return super().replace(**kwargs) diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/__init__.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/__init__.py new file mode 100644 index 000000000..9ccdec1e0 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/__init__.py @@ -0,0 +1,21 @@ +from dbt.adapters.bigquery.relation_configs._base import BigQueryBaseRelationConfig +from dbt.adapters.bigquery.relation_configs._cluster import ( + BigQueryClusterConfig, + BigQueryClusterConfigChange, +) +from dbt.adapters.bigquery.relation_configs._materialized_view import ( + BigQueryMaterializedViewConfig, + BigQueryMaterializedViewConfigChangeset, +) +from dbt.adapters.bigquery.relation_configs._options import ( + BigQueryOptionsConfig, + BigQueryOptionsConfigChange, +) +from dbt.adapters.bigquery.relation_configs._partition import ( + PartitionConfig, + BigQueryPartitionConfigChange, +) +from dbt.adapters.bigquery.relation_configs._policies import ( + BigQueryIncludePolicy, + BigQueryQuotePolicy, +) diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_base.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_base.py new file mode 100644 index 000000000..8bc861587 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_base.py @@ -0,0 +1,68 @@ +from dataclasses import dataclass +from typing import Optional, Dict, TYPE_CHECKING + +from dbt.adapters.base.relation import Policy +from dbt.adapters.relation_configs import RelationConfigBase +from google.cloud.bigquery import Table as BigQueryTable +from typing_extensions import Self + +from dbt.adapters.bigquery.relation_configs._policies import ( + BigQueryIncludePolicy, + BigQueryQuotePolicy, +) +from dbt.adapters.contracts.relation import ComponentName, RelationConfig + +if TYPE_CHECKING: + # Indirectly imported via agate_helper, which is lazy loaded further downfile. + # Used by mypy for earlier type hints. 
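+    # (Only evaluated by static type checkers, so there is no runtime import cost.)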
+ import agate + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class BigQueryBaseRelationConfig(RelationConfigBase): + @classmethod + def include_policy(cls) -> Policy: + return BigQueryIncludePolicy() + + @classmethod + def quote_policy(cls) -> Policy: + return BigQueryQuotePolicy() + + @classmethod + def from_relation_config(cls, relation_config: RelationConfig) -> Self: + relation_config_dict = cls.parse_relation_config(relation_config) + relation = cls.from_dict(relation_config_dict) + return relation + + @classmethod + def parse_relation_config(cls, relation_config: RelationConfig) -> Dict: + raise NotImplementedError( + "`parse_model_node()` needs to be implemented on this RelationConfigBase instance" + ) + + @classmethod + def from_bq_table(cls, table: BigQueryTable) -> Self: + relation_config = cls.parse_bq_table(table) + relation = cls.from_dict(relation_config) + return relation + + @classmethod + def parse_bq_table(cls, table: BigQueryTable) -> Dict: + raise NotImplementedError("`parse_bq_table()` is not implemented for this relation type") + + @classmethod + def _render_part(cls, component: ComponentName, value: Optional[str]) -> Optional[str]: + if cls.include_policy().get_part(component) and value: + if cls.quote_policy().get_part(component): + return f'"{value}"' + return value.lower() + return None + + @classmethod + def _get_first_row(cls, results: "agate.Table") -> "agate.Row": + try: + return results.rows[0] + except IndexError: + import agate + + return agate.Row(values=set()) diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_cluster.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_cluster.py new file mode 100644 index 000000000..b3dbaf2e9 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_cluster.py @@ -0,0 +1,54 @@ +from dataclasses import dataclass +from typing import Any, Dict, FrozenSet, Optional + +from dbt.adapters.relation_configs import RelationConfigChange +from dbt.adapters.contracts.relation import RelationConfig +from google.cloud.bigquery import Table as BigQueryTable +from typing_extensions import Self + +from dbt.adapters.bigquery.relation_configs._base import BigQueryBaseRelationConfig + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class BigQueryClusterConfig(BigQueryBaseRelationConfig): + """ + This config manages table options supporting clustering. 
See the following for more information: + - https://docs.getdbt.com/reference/resource-configs/bigquery-configs#using-table-partitioning-and-clustering + - https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#clustering_column_list + + - fields: set of columns to cluster on + - Note: can contain up to four columns + """ + + fields: FrozenSet[str] + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> Self: + kwargs_dict = {"fields": config_dict.get("fields")} + return super().from_dict(kwargs_dict) + + @classmethod + def parse_relation_config(cls, relation_config: RelationConfig) -> Dict[str, Any]: + config_dict = {} + + if cluster_by := relation_config.config.extra.get("cluster_by"): + # users may input a single field as a string + if isinstance(cluster_by, str): + cluster_by = [cluster_by] + config_dict.update({"fields": frozenset(cluster_by)}) + + return config_dict + + @classmethod + def parse_bq_table(cls, table: BigQueryTable) -> Dict[str, Any]: + config_dict = {"fields": frozenset(table.clustering_fields)} + return config_dict + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class BigQueryClusterConfigChange(RelationConfigChange): + context: Optional[BigQueryClusterConfig] + + @property + def requires_full_refresh(self) -> bool: + return True diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_materialized_view.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_materialized_view.py new file mode 100644 index 000000000..7c63ba3bc --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_materialized_view.py @@ -0,0 +1,132 @@ +from dataclasses import dataclass +from typing import Any, Dict, Optional + +from dbt.adapters.contracts.relation import ( + RelationConfig, + ComponentName, +) +from google.cloud.bigquery import Table as BigQueryTable + +from dbt.adapters.bigquery.relation_configs._base import BigQueryBaseRelationConfig +from dbt.adapters.bigquery.relation_configs._options import ( + BigQueryOptionsConfig, + BigQueryOptionsConfigChange, +) +from dbt.adapters.bigquery.relation_configs._partition import ( + BigQueryPartitionConfigChange, + PartitionConfig, +) +from dbt.adapters.bigquery.relation_configs._cluster import ( + BigQueryClusterConfig, + BigQueryClusterConfigChange, +) + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class BigQueryMaterializedViewConfig(BigQueryBaseRelationConfig): + """ + This config follow the specs found here: + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#create_materialized_view_statement + + The following parameters are configurable by dbt: + - table_id: name of the materialized view + - dataset_id: dataset name of the materialized view + - project_id: project name of the database + - options: options that get set in `SET OPTIONS()` clause + - partition: object containing partition information + - cluster: object containing cluster information + """ + + table_id: str + dataset_id: str + project_id: str + options: BigQueryOptionsConfig + partition: Optional[PartitionConfig] = None + cluster: Optional[BigQueryClusterConfig] = None + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> "BigQueryMaterializedViewConfig": + # required + kwargs_dict: Dict[str, Any] = { + "table_id": cls._render_part(ComponentName.Identifier, config_dict["table_id"]), + "dataset_id": cls._render_part(ComponentName.Schema, config_dict["dataset_id"]), + "project_id": cls._render_part(ComponentName.Database, 
config_dict["project_id"]), + "options": BigQueryOptionsConfig.from_dict(config_dict["options"]), + } + + # optional + if partition := config_dict.get("partition"): + kwargs_dict.update({"partition": PartitionConfig.parse(partition)}) + + if cluster := config_dict.get("cluster"): + kwargs_dict.update({"cluster": BigQueryClusterConfig.from_dict(cluster)}) + + materialized_view: "BigQueryMaterializedViewConfig" = super().from_dict(kwargs_dict) + return materialized_view + + @classmethod + def parse_relation_config(cls, relation_config: RelationConfig) -> Dict[str, Any]: + config_dict = { + "table_id": relation_config.identifier, + "dataset_id": relation_config.schema, + "project_id": relation_config.database, + # despite this being a foreign object, there will always be options because of defaults + "options": BigQueryOptionsConfig.parse_relation_config(relation_config), + } + + # optional + if relation_config.config and "partition_by" in relation_config.config: + config_dict.update({"partition": PartitionConfig.parse_model_node(relation_config)}) + + if relation_config.config and "cluster_by" in relation_config.config: + config_dict.update( + {"cluster": BigQueryClusterConfig.parse_relation_config(relation_config)} + ) + + return config_dict + + @classmethod + def parse_bq_table(cls, table: BigQueryTable) -> Dict[str, Any]: + config_dict = { + "table_id": table.table_id, + "dataset_id": table.dataset_id, + "project_id": table.project, + # despite this being a foreign object, there will always be options because of defaults + "options": BigQueryOptionsConfig.parse_bq_table(table), + } + + # optional + if table.time_partitioning or table.range_partitioning: + config_dict.update({"partition": PartitionConfig.parse_bq_table(table)}) + + if table.clustering_fields: + config_dict.update({"cluster": BigQueryClusterConfig.parse_bq_table(table)}) + + return config_dict + + +@dataclass +class BigQueryMaterializedViewConfigChangeset: + options: Optional[BigQueryOptionsConfigChange] = None + partition: Optional[BigQueryPartitionConfigChange] = None + cluster: Optional[BigQueryClusterConfigChange] = None + + @property + def requires_full_refresh(self) -> bool: + return any( + { + self.options.requires_full_refresh if self.options else False, + self.partition.requires_full_refresh if self.partition else False, + self.cluster.requires_full_refresh if self.cluster else False, + } + ) + + @property + def has_changes(self) -> bool: + return any( + { + self.options if self.options else False, + self.partition if self.partition else False, + self.cluster if self.cluster else False, + } + ) diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_options.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_options.py new file mode 100644 index 000000000..7fd8797df --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_options.py @@ -0,0 +1,159 @@ +from dataclasses import dataclass +from datetime import datetime, timedelta +from typing import Any, Dict, Optional + +from dbt.adapters.relation_configs import RelationConfigChange +from dbt.adapters.contracts.relation import RelationConfig +from google.cloud.bigquery import Table as BigQueryTable +from typing_extensions import Self + +from dbt.adapters.bigquery.relation_configs._base import BigQueryBaseRelationConfig +from dbt.adapters.bigquery.utility import bool_setting, float_setting, sql_escape + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class BigQueryOptionsConfig(BigQueryBaseRelationConfig): + """ + This 
config manages materialized view options. See the following for more information: + - https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#materialized_view_option_list + """ + + enable_refresh: Optional[bool] = True + refresh_interval_minutes: Optional[float] = 30 + expiration_timestamp: Optional[datetime] = None + max_staleness: Optional[str] = None + kms_key_name: Optional[str] = None + description: Optional[str] = None + labels: Optional[Dict[str, str]] = None + + def as_ddl_dict(self) -> Dict[str, Any]: + """ + Reformat `options_dict` so that it can be passed into the `bigquery_options()` macro. + + Options should be flattened and filtered prior to passing into this method. For example: + - the "auto refresh" set of options should be flattened into the root instead of stuck under "auto_refresh" + - any option that comes in set as `None` will be unset; this happens mostly due to config changes + """ + + def boolean(x): + return x + + def numeric(x): + return x + + def string(x): + return f"'{x}'" + + def escaped_string(x): + return f'"""{sql_escape(x)}"""' + + def interval(x): + return x + + def array(x): + return list(x.items()) + + option_formatters = { + "enable_refresh": boolean, + "refresh_interval_minutes": numeric, + "expiration_timestamp": interval, + "max_staleness": interval, + "kms_key_name": string, + "description": escaped_string, + "labels": array, + } + + def formatted_option(name: str) -> Optional[Any]: + value = getattr(self, name) + if value is not None: + formatter = option_formatters[name] + return formatter(value) + return None + + options = { + option: formatted_option(option) + for option, option_formatter in option_formatters.items() + if formatted_option(option) is not None + } + + return options + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any]) -> Self: + setting_formatters = { + "enable_refresh": bool_setting, + "refresh_interval_minutes": float_setting, + "expiration_timestamp": None, + "max_staleness": None, + "kms_key_name": None, + "description": None, + "labels": None, + } + + def formatted_setting(name: str) -> Any: + value = config_dict.get(name) + if formatter := setting_formatters[name]: + return formatter(value) + return value + + kwargs_dict = {attribute: formatted_setting(attribute) for attribute in setting_formatters} + + # avoid picking up defaults on dependent options + # e.g. 
don't set `refresh_interval_minutes` = 30 when the user has `enable_refresh` = False + if kwargs_dict["enable_refresh"] is False: + kwargs_dict.update({"refresh_interval_minutes": None, "max_staleness": None}) + + options: Self = super().from_dict(kwargs_dict) + return options + + @classmethod + def parse_relation_config(cls, relation_config: RelationConfig) -> Dict[str, Any]: + config_dict = { + option: relation_config.config.extra.get(option) + for option in [ + "enable_refresh", + "refresh_interval_minutes", + "expiration_timestamp", + "max_staleness", + "kms_key_name", + "description", + "labels", + ] + } + + # update dbt-specific versions of these settings + if hours_to_expiration := relation_config.config.extra.get("hours_to_expiration"): + config_dict.update( + {"expiration_timestamp": datetime.now() + timedelta(hours=hours_to_expiration)} + ) + if not relation_config.config.persist_docs: + del config_dict["description"] + + return config_dict + + @classmethod + def parse_bq_table(cls, table: BigQueryTable) -> Dict[str, Any]: + config_dict = { + "enable_refresh": table.mview_enable_refresh, + "refresh_interval_minutes": table.mview_refresh_interval.seconds / 60, + "expiration_timestamp": table.expires, + "max_staleness": None, + "description": table.description, + } + + # map the empty dict to None + if labels := table.labels: + config_dict.update({"labels": labels}) + + if encryption_configuration := table.encryption_configuration: + config_dict.update({"kms_key_name": encryption_configuration.kms_key_name}) + return config_dict + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class BigQueryOptionsConfigChange(RelationConfigChange): + context: BigQueryOptionsConfig + + @property + def requires_full_refresh(self) -> bool: + return False diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_partition.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_partition.py new file mode 100644 index 000000000..e1a5ac171 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_partition.py @@ -0,0 +1,161 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import dbt_common.exceptions +from dbt.adapters.relation_configs import RelationConfigChange +from dbt.adapters.contracts.relation import RelationConfig +from dbt_common.dataclass_schema import dbtClassMixin, ValidationError +from google.cloud.bigquery.table import Table as BigQueryTable + + +@dataclass +class PartitionConfig(dbtClassMixin): + field: str + data_type: str = "date" + granularity: str = "day" + range: Optional[Dict[str, Any]] = None + time_ingestion_partitioning: bool = False + copy_partitions: bool = False + + PARTITION_DATE = "_PARTITIONDATE" + PARTITION_TIME = "_PARTITIONTIME" + + def data_type_for_partition(self): + """Return the data type of partitions for replacement. + When time_ingestion_partitioning is enabled, the data type supported are date & timestamp. 
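+        (Any configured data type other than `date` is treated as `timestamp` in that case.)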
+ """ + if not self.time_ingestion_partitioning: + return self.data_type + + return "date" if self.data_type == "date" else "timestamp" + + def reject_partition_field_column(self, columns: List[Any]) -> List[str]: + return [c for c in columns if not c.name.upper() == self.field.upper()] + + def data_type_should_be_truncated(self): + """Return true if the data type should be truncated instead of cast to the data type.""" + return not ( + self.data_type == "int64" or (self.data_type == "date" and self.granularity == "day") + ) + + def time_partitioning_field(self) -> str: + """Return the time partitioning field name based on the data type. + The default is _PARTITIONTIME, but for date it is _PARTITIONDATE + else it will fail statements for type mismatch.""" + if self.data_type == "date": + return self.PARTITION_DATE + else: + return self.PARTITION_TIME + + def insertable_time_partitioning_field(self) -> str: + """Return the insertable time partitioning field name based on the data type. + Practically, only _PARTITIONTIME works so far. + The function is meant to keep the call sites consistent as it might evolve.""" + return self.PARTITION_TIME + + def render(self, alias: Optional[str] = None): + column: str = ( + self.field if not self.time_ingestion_partitioning else self.time_partitioning_field() + ) + if alias: + column = f"{alias}.{column}" + + if self.data_type_should_be_truncated(): + return f"{self.data_type}_trunc({column}, {self.granularity})" + else: + return column + + def render_wrapped(self, alias: Optional[str] = None): + """Wrap the partitioning column when time involved to ensure it is properly cast to matching time.""" + # if data type is going to be truncated, no need to wrap + if ( + self.data_type in ("date", "timestamp", "datetime") + and not self.data_type_should_be_truncated() + and not ( + self.time_ingestion_partitioning and self.data_type == "date" + ) # _PARTITIONDATE is already a date + ): + return f"{self.data_type}({self.render(alias)})" + else: + return self.render(alias) + + @classmethod + def parse(cls, raw_partition_by) -> Optional["PartitionConfig"]: + if raw_partition_by is None: + return None + try: + cls.validate(raw_partition_by) + return cls.from_dict( + { + key: (value.lower() if isinstance(value, str) else value) + for key, value in raw_partition_by.items() + } + ) + except ValidationError as exc: + raise dbt_common.exceptions.base.DbtValidationError( + "Could not parse partition config" + ) from exc + except TypeError: + raise dbt_common.exceptions.CompilationError( + f"Invalid partition_by config:\n" + f" Got: {raw_partition_by}\n" + f' Expected a dictionary with "field" and "data_type" keys' + ) + + @classmethod + def parse_model_node(cls, relation_config: RelationConfig) -> Dict[str, Any]: + """ + Parse model node into a raw config for `PartitionConfig.parse` + + - Note: + This doesn't currently collect `time_ingestion_partitioning` and `copy_partitions` + because this was built for materialized views, which do not support those settings. 
+ """ + config_dict: Dict[str, Any] = relation_config.config.extra.get("partition_by") + if "time_ingestion_partitioning" in config_dict: + del config_dict["time_ingestion_partitioning"] + if "copy_partitions" in config_dict: + del config_dict["copy_partitions"] + return config_dict + + @classmethod + def parse_bq_table(cls, table: BigQueryTable) -> Dict[str, Any]: + """ + Parse the BQ Table object into a raw config for `PartitionConfig.parse` + + - Note: + This doesn't currently collect `time_ingestion_partitioning` and `copy_partitions` + because this was built for materialized views, which do not support those settings. + """ + if time_partitioning := table.time_partitioning: + field_types = {field.name: field.field_type.lower() for field in table.schema} + config_dict = { + "field": time_partitioning.field, + "data_type": field_types[time_partitioning.field], + "granularity": time_partitioning.type_, + } + + elif range_partitioning := table.range_partitioning: + config_dict = { + "field": range_partitioning.field, + "data_type": "int64", + "range": { + "start": range_partitioning.range_.start, + "end": range_partitioning.range_.end, + "interval": range_partitioning.range_.interval, + }, + } + + else: + config_dict = {} + + return config_dict + + +@dataclass(frozen=True, eq=True, unsafe_hash=True) +class BigQueryPartitionConfigChange(RelationConfigChange): + context: Optional[Any] = None + + @property + def requires_full_refresh(self) -> bool: + return True diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_policies.py b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_policies.py new file mode 100644 index 000000000..4467c4340 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/relation_configs/_policies.py @@ -0,0 +1,16 @@ +from dataclasses import dataclass + +from dbt.adapters.base.relation import Policy + + +class BigQueryIncludePolicy(Policy): + database: bool = True + schema: bool = True + identifier: bool = True + + +@dataclass +class BigQueryQuotePolicy(Policy): + database: bool = True + schema: bool = True + identifier: bool = True diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/retry.py b/dbt-bigquery/src/dbt/adapters/bigquery/retry.py new file mode 100644 index 000000000..cc197a7d3 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/retry.py @@ -0,0 +1,114 @@ +from typing import Callable, Optional + +from google.api_core.future.polling import DEFAULT_POLLING +from google.api_core.retry import Retry +from google.cloud.bigquery.retry import DEFAULT_JOB_RETRY, _job_should_retry +from requests.exceptions import ConnectionError + +from dbt.adapters.contracts.connection import Connection, ConnectionState +from dbt.adapters.events.logging import AdapterLogger +from dbt.adapters.exceptions.connection import FailedToConnectError + +from dbt.adapters.bigquery.clients import create_bigquery_client +from dbt.adapters.bigquery.credentials import BigQueryCredentials + + +_logger = AdapterLogger("BigQuery") + +_MINUTE = 60.0 +_DAY = 24 * 60 * 60.0 + + +class RetryFactory: + + def __init__(self, credentials: BigQueryCredentials) -> None: + self._retries = credentials.job_retries or 0 + self._job_creation_timeout = credentials.job_creation_timeout_seconds + self._job_execution_timeout = credentials.job_execution_timeout_seconds + self._job_deadline = credentials.job_retry_deadline_seconds + + def create_job_creation_timeout(self, fallback: float = _MINUTE) -> float: + return ( + self._job_creation_timeout or fallback + ) # keep _MINUTE here so it's 
not overridden by passing fallback=None + + def create_job_execution_timeout(self, fallback: float = _DAY) -> float: + return ( + self._job_execution_timeout or fallback + ) # keep _DAY here so it's not overridden by passing fallback=None + + def create_retry(self, fallback: Optional[float] = None) -> Retry: + return DEFAULT_JOB_RETRY.with_timeout(self._job_execution_timeout or fallback or _DAY) + + def create_polling(self, model_timeout: Optional[float] = None) -> Retry: + return DEFAULT_POLLING.with_timeout(model_timeout or self._job_execution_timeout or _DAY) + + def create_reopen_with_deadline(self, connection: Connection) -> Retry: + """ + This strategy mimics what was accomplished with _retry_and_handle + """ + + retry = DEFAULT_JOB_RETRY.with_delay(maximum=3.0).with_predicate( + _DeferredException(self._retries) + ) + + # there is no `with_on_error` method, but we want to retain the defaults on `DEFAULT_JOB_RETRY + retry._on_error = _create_reopen_on_error(connection) + + # don't override the default deadline to None if the user did not provide one, + # the process will never end + if deadline := self._job_deadline: + return retry.with_deadline(deadline) + + return retry + + +class _DeferredException: + """ + Count ALL errors, not just retryable errors, up to a threshold. + Raise the next error, regardless of whether it is retryable. + """ + + def __init__(self, retries: int) -> None: + self._retries: int = retries + self._error_count = 0 + + def __call__(self, error: Exception) -> bool: + # exit immediately if the user does not want retries + if self._retries == 0: + return False + + # count all errors + self._error_count += 1 + + # if the error is retryable, and we haven't breached the threshold, log and continue + if _job_should_retry(error) and self._error_count <= self._retries: + _logger.debug( + f"Retry attempt {self._error_count} of {self._retries} after error: {repr(error)}" + ) + return True + + # otherwise raise + return False + + +def _create_reopen_on_error(connection: Connection) -> Callable[[Exception], None]: + + def on_error(error: Exception): + if isinstance(error, (ConnectionResetError, ConnectionError)): + _logger.warning("Reopening connection after {!r}".format(error)) + connection.handle.close() + + try: + connection.handle = create_bigquery_client(connection.credentials) + connection.state = ConnectionState.OPEN + + except Exception as e: + _logger.debug( + f"""Got an error when attempting to create a bigquery " "client: '{e}'""" + ) + connection.handle = None + connection.state = ConnectionState.FAIL + raise FailedToConnectError(str(e)) + + return on_error diff --git a/dbt-bigquery/src/dbt/adapters/bigquery/utility.py b/dbt-bigquery/src/dbt/adapters/bigquery/utility.py new file mode 100644 index 000000000..5914280a3 --- /dev/null +++ b/dbt-bigquery/src/dbt/adapters/bigquery/utility.py @@ -0,0 +1,45 @@ +import json +from typing import Any, Optional + +import dbt_common.exceptions + + +def bool_setting(value: Optional[Any] = None) -> Optional[bool]: + if value is None: + return None + elif isinstance(value, bool): + return value + elif isinstance(value, str): + # don't do bool(value) as that is equivalent to: len(value) > 0 + if value.lower() == "true": + return True + elif value.lower() == "false": + return False + else: + raise ValueError( + f"Invalid input, " + f"expecting `bool` or `str` ex. 
(True, False, 'true', 'False'), received: {value}" + ) + else: + raise TypeError( + f"Invalid type for bool evaluation, " + f"expecting `bool` or `str`, received: {type(value)}" + ) + + +def float_setting(value: Optional[Any] = None) -> Optional[float]: + if value is None: + return None + elif any(isinstance(value, i) for i in [int, float, str]): + return float(value) + else: + raise TypeError( + f"Invalid type for float evaluation, " + f"expecting `int`, `float`, or `str`, received: {type(value)}" + ) + + +def sql_escape(string): + if not isinstance(string, str): + raise dbt_common.exceptions.CompilationError(f"cannot escape a non-string: {string}") + return json.dumps(string)[1:-1] diff --git a/dbt-bigquery/src/dbt/include/bigquery/__init__.py b/dbt-bigquery/src/dbt/include/bigquery/__init__.py new file mode 100644 index 000000000..b177e5d49 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/__init__.py @@ -0,0 +1,3 @@ +import os + +PACKAGE_PATH = os.path.dirname(__file__) diff --git a/dbt-bigquery/src/dbt/include/bigquery/dbt_project.yml b/dbt-bigquery/src/dbt/include/bigquery/dbt_project.yml new file mode 100644 index 000000000..b4e88b7b0 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/dbt_project.yml @@ -0,0 +1,5 @@ +config-version: 2 +name: dbt_bigquery +version: 1.0 + +macro-paths: ["macros"] diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/adapters.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/adapters.sql new file mode 100644 index 000000000..f166e5d05 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/adapters.sql @@ -0,0 +1,195 @@ +{% macro bigquery__create_table_as(temporary, relation, compiled_code, language='sql') -%} + {%- if language == 'sql' -%} + {%- set raw_partition_by = config.get('partition_by', none) -%} + {%- set raw_cluster_by = config.get('cluster_by', none) -%} + {%- set sql_header = config.get('sql_header', none) -%} + + {%- set partition_config = adapter.parse_partition_by(raw_partition_by) -%} + {%- if partition_config.time_ingestion_partitioning -%} + {%- set columns = get_columns_with_types_in_query_sql(sql) -%} + {%- set table_dest_columns_csv = columns_without_partition_fields_csv(partition_config, columns) -%} + {%- set columns = '(' ~ table_dest_columns_csv ~ ')' -%} + {%- endif -%} + + {{ sql_header if sql_header is not none }} + + create or replace table {{ relation }} + {%- set contract_config = config.get('contract') -%} + {%- if contract_config.enforced -%} + {{ get_assert_columns_equivalent(compiled_code) }} + {{ get_table_columns_and_constraints() }} + {%- set compiled_code = get_select_subquery(compiled_code) %} + {% else %} + {#-- cannot do contracts at the same time as time ingestion partitioning -#} + {{ columns }} + {% endif %} + {{ partition_by(partition_config) }} + {{ cluster_by(raw_cluster_by) }} + + {{ bigquery_table_options(config, model, temporary) }} + + {#-- PARTITION BY cannot be used with the AS query_statement clause. + https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#partition_expression + -#} + {%- if not partition_config.time_ingestion_partitioning %} + as ( + {{ compiled_code }} + ); + {%- endif %} + {%- elif language == 'python' -%} + {#-- + N.B. Python models _can_ write to temp views HOWEVER they use a different session + and have already expired by the time they need to be used (I.E. in merges for incremental models) + + TODO: Deep dive into spark sessions to see if we can reuse a single session for an entire + dbt invocation. 
+ --#} + + {#-- when a user wants to change the schema of an existing relation, they must intentionally drop the table in the dataset --#} + {%- set old_relation = adapter.get_relation(database=relation.database, schema=relation.schema, identifier=relation.identifier) -%} + {%- if (old_relation.is_table and (should_full_refresh())) -%} + {% do adapter.drop_relation(relation) %} + {%- endif -%} + {{ py_write_table(compiled_code=compiled_code, target_relation=relation.quote(database=False, schema=False, identifier=False)) }} + {%- else -%} + {% do exceptions.raise_compiler_error("bigquery__create_table_as macro didn't get supported language, it got %s" % language) %} + {%- endif -%} + +{%- endmacro -%} + +{% macro bigquery__create_view_as(relation, sql) -%} + {%- set sql_header = config.get('sql_header', none) -%} + + {{ sql_header if sql_header is not none }} + + create or replace view {{ relation }} + {{ bigquery_view_options(config, model) }} + {%- set contract_config = config.get('contract') -%} + {%- if contract_config.enforced -%} + {{ get_assert_columns_equivalent(sql) }} + {%- endif %} + as {{ sql }}; + +{% endmacro %} + +{% macro bigquery__drop_schema(relation) -%} + {{ adapter.drop_schema(relation) }} +{% endmacro %} + +{% macro bigquery__get_columns_in_relation(relation) -%} + {{ return(adapter.get_columns_in_relation(relation)) }} +{% endmacro %} + + +{% macro bigquery__list_relations_without_caching(schema_relation) -%} + {{ return(adapter.list_relations_without_caching(schema_relation)) }} +{%- endmacro %} + + +{% macro bigquery__list_schemas(database) -%} + {{ return(adapter.list_schemas(database)) }} +{% endmacro %} + + +{% macro bigquery__check_schema_exists(information_schema, schema) %} + {{ return(adapter.check_schema_exists(information_schema.database, schema)) }} +{% endmacro %} + +{#-- relation-level macro is not implemented. This is handled in the CTAs statement #} +{% macro bigquery__persist_docs(relation, model, for_relation, for_columns) -%} + {% if for_columns and config.persist_column_docs() and model.columns %} + {% do alter_column_comment(relation, model.columns) %} + {% endif %} +{% endmacro %} + +{% macro bigquery__alter_column_comment(relation, column_dict) -%} + {% do adapter.update_columns(relation, column_dict) %} +{% endmacro %} + +{% macro bigquery__alter_relation_add_columns(relation, add_columns) %} + + {% set sql -%} + + alter {{ relation.type }} {{ relation }} + {% for column in add_columns %} + add column {{ column.name }} {{ column.data_type }}{{ ',' if not loop.last }} + {% endfor %} + + {%- endset -%} + + {{ return(run_query(sql)) }} + +{% endmacro %} + +{% macro bigquery__alter_relation_drop_columns(relation, drop_columns) %} + + {% set sql -%} + + alter {{ relation.type }} {{ relation }} + + {% for column in drop_columns %} + drop column {{ column.name }}{{ ',' if not loop.last }} + {% endfor %} + + {%- endset -%} + + {{ return(run_query(sql)) }} + +{% endmacro %} + + +{% macro bigquery__alter_column_type(relation, column_name, new_column_type) -%} + {#-- Changing a column's data type using a query requires you to scan the entire table. + The query charges can be significant if the table is very large. 
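+    This macro therefore rebuilds the table: it selects every column, casting the target
+    column to the new type, and recreates the relation with create_table_as.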
+ + https://cloud.google.com/bigquery/docs/manually-changing-schemas#changing_a_columns_data_type + #} + {% set relation_columns = get_columns_in_relation(relation) %} + + {% set sql %} + select + {%- for col in relation_columns -%} + {% if col.column == column_name %} + CAST({{ col.quoted }} AS {{ new_column_type }}) AS {{ col.quoted }} + {%- else %} + {{ col.quoted }} + {%- endif %} + {%- if not loop.last %},{% endif -%} + {%- endfor %} + from {{ relation }} + {% endset %} + + {% call statement('alter_column_type') %} + {{ create_table_as(False, relation, sql)}} + {%- endcall %} + +{% endmacro %} + + +{% macro bigquery__test_unique(model, column_name) %} + +with dbt_test__target as ( + + select {{ column_name }} as unique_field + from {{ model }} + where {{ column_name }} is not null + +) + +select + unique_field, + count(*) as n_records + +from dbt_test__target +group by unique_field +having count(*) > 1 + +{% endmacro %} + +{% macro bigquery__upload_file(local_file_path, database, table_schema, table_name) %} + + {{ log("kwargs: " ~ kwargs) }} + + {% do adapter.upload_file(local_file_path, database, table_schema, table_name, kwargs=kwargs) %} + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/adapters/apply_grants.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/adapters/apply_grants.sql new file mode 100644 index 000000000..e344862ae --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/adapters/apply_grants.sql @@ -0,0 +1,20 @@ +{% macro bigquery__get_show_grant_sql(relation) %} + {% set location = adapter.get_dataset_location(relation) %} + {% set relation = relation.incorporate(location=location) %} + + select privilege_type, grantee + from {{ relation.information_schema("OBJECT_PRIVILEGES") }} + where object_schema = "{{ relation.dataset }}" + and object_name = "{{ relation.identifier }}" + -- filter out current user + and split(grantee, ':')[offset(1)] != session_user() +{% endmacro %} + + +{%- macro bigquery__get_grant_sql(relation, privilege, grantee) -%} + grant `{{ privilege }}` on {{ relation.type }} {{ relation }} to {{ '\"' + grantee|join('\", \"') + '\"' }} +{%- endmacro -%} + +{%- macro bigquery__get_revoke_sql(relation, privilege, grantee) -%} + revoke `{{ privilege }}` on {{ relation.type }} {{ relation }} from {{ '\"' + grantee|join('\", \"') + '\"' }} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/adapters/columns.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/adapters/columns.sql new file mode 100644 index 000000000..c540f1e2d --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/adapters/columns.sql @@ -0,0 +1,10 @@ +{% macro bigquery__get_empty_subquery_sql(select_sql, select_sql_header=none) %} + {%- if select_sql_header is not none -%} + {{ select_sql_header }} + {%- endif -%} + select * from ( + {{ select_sql }} + ) as __dbt_sbq + where false and current_timestamp() = current_timestamp() + limit 0 +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/by_relation.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/by_relation.sql new file mode 100644 index 000000000..adaa740f6 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/by_relation.sql @@ -0,0 +1,36 @@ +{% macro bigquery__get_catalog_relations(information_schema, relations) -%} + + {%- if (relations | length) == 0 -%} + {# Hopefully nothing cares about the columns we return when there are no rows #} + {%- set query = "select 1 as id limit 0" -%} + + {%- else -%} + {%- 
set query -%} + with + table_shards_stage as ({{ _bigquery__get_table_shards_sql(information_schema) }}), + table_shards as ( + select * from table_shards_stage + where ( + {%- for relation in relations -%} + ( + upper(table_schema) = upper('{{ relation.schema }}') + and upper(table_name) = upper('{{ relation.identifier }}') + ) + {%- if not loop.last %} or {% endif -%} + {%- endfor -%} + ) + ), + tables as ({{ _bigquery__get_tables_sql() }}), + table_stats as ({{ _bigquery__get_table_stats_sql() }}), + + columns as ({{ _bigquery__get_columns_sql(information_schema) }}), + column_stats as ({{ _bigquery__get_column_stats_sql() }}) + + {{ _bigquery__get_extended_catalog_sql() }} + {%- endset -%} + + {%- endif -%} + + {{ return(run_query(query)) }} + +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/by_schema.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/by_schema.sql new file mode 100644 index 000000000..0d36f2b84 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/by_schema.sql @@ -0,0 +1,32 @@ +{% macro bigquery__get_catalog(information_schema, schemas) -%} + + {%- if (schemas | length) == 0 -%} + {# Hopefully nothing cares about the columns we return when there are no rows #} + {%- set query = "select 1 as id limit 0" -%} + + {%- else -%} + {%- set query -%} + with + table_shards as ( + {{ _bigquery__get_table_shards_sql(information_schema) }} + where ( + {%- for schema in schemas -%} + upper(tables.dataset_id) = upper('{{ schema }}') + {%- if not loop.last %} or {% endif -%} + {%- endfor -%} + ) + ), + tables as ({{ _bigquery__get_tables_sql() }}), + table_stats as ({{ _bigquery__get_table_stats_sql() }}), + + columns as ({{ _bigquery__get_columns_sql(information_schema) }}), + column_stats as ({{ _bigquery__get_column_stats_sql() }}) + + {{ _bigquery__get_extended_catalog_sql() }} + {%- endset -%} + + {%- endif -%} + + {{ return(run_query(query)) }} + +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/catalog.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/catalog.sql new file mode 100644 index 000000000..268debc5f --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/catalog/catalog.sql @@ -0,0 +1,179 @@ +{% macro _bigquery__get_table_shards_sql(information_schema) %} + select + tables.project_id as table_catalog, + tables.dataset_id as table_schema, + coalesce(REGEXP_EXTRACT(tables.table_id, '^(.+)[0-9]{8}$'), tables.table_id) as table_name, + tables.table_id as shard_name, + REGEXP_EXTRACT(tables.table_id, '^.+([0-9]{8})$') as shard_index, + REGEXP_CONTAINS(tables.table_id, '^.+[0-9]{8}$') and tables.type = 1 as is_date_shard, + case + when materialized_views.table_name is not null then 'materialized view' + when tables.type = 1 then 'table' + when tables.type = 2 then 'view' + else 'external' + end as table_type, + tables.type = 1 as is_table, + JSON_VALUE(table_description.option_value) as table_comment, + tables.size_bytes, + tables.row_count + from {{ information_schema.replace(information_schema_view='__TABLES__') }} tables + left join {{ information_schema.replace(information_schema_view='MATERIALIZED_VIEWS') }} materialized_views + on materialized_views.table_catalog = tables.project_id + and materialized_views.table_schema = tables.dataset_id + and materialized_views.table_name = tables.table_id + left join {{ information_schema.replace(information_schema_view='TABLE_OPTIONS') }} table_description + on table_description.table_catalog = tables.project_id + 
and table_description.table_schema = tables.dataset_id + and table_description.table_name = tables.table_id + and table_description.option_name = 'description' +{% endmacro %} + + +{% macro _bigquery__get_tables_sql() %} + select distinct + table_catalog, + table_schema, + table_name, + is_date_shard, + table_type, + is_table, + table_comment + from table_shards +{% endmacro %} + + +{% macro _bigquery__get_table_stats_sql() %} + select + table_catalog, + table_schema, + table_name, + max(shard_name) as latest_shard_name, + min(shard_index) as shard_min, + max(shard_index) as shard_max, + count(shard_index) as shard_count, + sum(size_bytes) as size_bytes, + sum(row_count) as row_count + from table_shards + group by 1, 2, 3 +{% endmacro %} + + +{% macro _bigquery__get_columns_sql(information_schema) %} + select + columns.table_catalog, + columns.table_schema, + columns.table_name as shard_name, + coalesce(paths.field_path, '') as column_name, + -- invent a row number to account for nested fields + -- BQ does not treat these nested properties as independent fields + row_number() over ( + partition by + columns.table_catalog, + columns.table_schema, + columns.table_name + order by + columns.ordinal_position, + paths.field_path + ) as column_index, + coalesce(paths.data_type, '') as column_type, + paths.description as column_comment, + case when columns.is_partitioning_column = 'YES' then 1 else 0 end as is_partitioning_column, + case when columns.is_partitioning_column = 'YES' then paths.field_path end as partition_column, + case when columns.clustering_ordinal_position is not null then 1 else 0 end as is_clustering_column, + case when columns.clustering_ordinal_position is not null then paths.field_path end as cluster_column, + columns.clustering_ordinal_position + from {{ information_schema.replace(information_schema_view='COLUMNS') }} columns + join {{ information_schema.replace(information_schema_view='COLUMN_FIELD_PATHS') }} paths + on paths.table_catalog = columns.table_catalog + and paths.table_schema = columns.table_schema + and paths.table_name = columns.table_name + and paths.column_name = columns.column_name + where columns.ordinal_position is not null +{% endmacro %} + + +{% macro _bigquery__get_column_stats_sql() %} + select + table_catalog, + table_schema, + shard_name, + max(is_partitioning_column) = 1 as is_partitioned, + max(partition_column) as partition_column, + max(is_clustering_column) = 1 as is_clustered, + array_to_string( + array_agg( + cluster_column ignore nulls + order by clustering_ordinal_position + ), ', ' + ) as clustering_columns + from columns + group by 1, 2, 3 +{% endmacro %} + + +{% macro _bigquery__get_extended_catalog_sql() %} + select + tables.table_catalog as table_database, + tables.table_schema, + case + when tables.is_date_shard then concat(tables.table_name, '*') + else tables.table_name + end as table_name, + tables.table_type, + tables.table_comment, + -- coalesce column metadata fields to ensure they are non-null for catalog generation + -- external table columns are not present in COLUMN_FIELD_PATHS + coalesce(columns.column_name, '') as column_name, + coalesce(columns.column_index, 1) as column_index, + coalesce(columns.column_type, '') as column_type, + coalesce(columns.column_comment, '') as column_comment, + + 'Shard count' as `stats__date_shards__label`, + table_stats.shard_count as `stats__date_shards__value`, + 'The number of date shards in this table' as `stats__date_shards__description`, + tables.is_date_shard as 
`stats__date_shards__include`, + + 'Shard (min)' as `stats__date_shard_min__label`, + table_stats.shard_min as `stats__date_shard_min__value`, + 'The first date shard in this table' as `stats__date_shard_min__description`, + tables.is_date_shard as `stats__date_shard_min__include`, + + 'Shard (max)' as `stats__date_shard_max__label`, + table_stats.shard_max as `stats__date_shard_max__value`, + 'The last date shard in this table' as `stats__date_shard_max__description`, + tables.is_date_shard as `stats__date_shard_max__include`, + + '# Rows' as `stats__num_rows__label`, + table_stats.row_count as `stats__num_rows__value`, + 'Approximate count of rows in this table' as `stats__num_rows__description`, + tables.is_table as `stats__num_rows__include`, + + 'Approximate Size' as `stats__num_bytes__label`, + table_stats.size_bytes as `stats__num_bytes__value`, + 'Approximate size of table as reported by BigQuery' as `stats__num_bytes__description`, + tables.is_table as `stats__num_bytes__include`, + + 'Partitioned By' as `stats__partitioning_type__label`, + column_stats.partition_column as `stats__partitioning_type__value`, + 'The partitioning column for this table' as `stats__partitioning_type__description`, + column_stats.is_partitioned as `stats__partitioning_type__include`, + + 'Clustered By' as `stats__clustering_fields__label`, + column_stats.clustering_columns as `stats__clustering_fields__value`, + 'The clustering columns for this table' as `stats__clustering_fields__description`, + column_stats.is_clustered as `stats__clustering_fields__include` + + from tables + join table_stats + on table_stats.table_catalog = tables.table_catalog + and table_stats.table_schema = tables.table_schema + and table_stats.table_name = tables.table_name + left join column_stats + on column_stats.table_catalog = tables.table_catalog + and column_stats.table_schema = tables.table_schema + and column_stats.shard_name = table_stats.latest_shard_name + left join columns + on columns.table_catalog = tables.table_catalog + and columns.table_schema = tables.table_schema + and columns.shard_name = table_stats.latest_shard_name +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/etc.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/etc.sql new file mode 100644 index 000000000..59b61473e --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/etc.sql @@ -0,0 +1,15 @@ +{% macro date_sharded_table(base_name) %} + {{ return(base_name ~ "[DBT__PARTITION_DATE]") }} +{% endmacro %} + +{% macro grant_access_to(entity, entity_type, role, grant_target_dict) -%} + {% do adapter.grant_access_to(entity, entity_type, role, grant_target_dict) %} +{% endmacro %} + +{%- macro get_partitions_metadata(table) -%} + {%- if execute -%} + {%- set res = adapter.get_partitions_metadata(table) -%} + {{- return(res) -}} + {%- endif -%} + {{- return(None) -}} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/clone.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/clone.sql new file mode 100644 index 000000000..3964be2b3 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/clone.sql @@ -0,0 +1,9 @@ +{% macro bigquery__can_clone_table() %} + {{ return(True) }} +{% endmacro %} + +{% macro bigquery__create_or_replace_clone(this_relation, defer_relation) %} + create or replace + table {{ this_relation }} + clone {{ defer_relation }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/copy.sql 
b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/copy.sql new file mode 100644 index 000000000..3ce2de184 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/copy.sql @@ -0,0 +1,33 @@ +{% materialization copy, adapter='bigquery' -%} + + {# Setup #} + {{ run_hooks(pre_hooks) }} + + {% set destination = this.incorporate(type='table') %} + + {# there can be several ref() or source() according to BQ copy API docs #} + {# cycle over ref() and source() to create source tables array #} + {% set source_array = [] %} + {% for ref_table in model.refs %} + {{ source_array.append(ref(ref_table.get('package'), ref_table.name, version=ref_table.get('version'))) }} + {% endfor %} + + {% for src_table in model.sources %} + {{ source_array.append(source(*src_table)) }} + {% endfor %} + + {# Call adapter copy_table function #} + {%- set result_str = adapter.copy_table( + source_array, + destination, + config.get('copy_materialization', default = 'table')) -%} + + {{ store_result('main', response=result_str) }} + + {# Clean up #} + {{ run_hooks(post_hooks) }} + {%- do apply_grants(target_relation, grant_config) -%} + {{ adapter.commit() }} + + {{ return({'relations': [destination]}) }} +{%- endmaterialization %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental.sql new file mode 100644 index 000000000..25a83b0c6 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental.sql @@ -0,0 +1,180 @@ +{% macro dbt_bigquery_validate_get_incremental_strategy(config) %} + {#-- Find and validate the incremental strategy #} + {%- set strategy = config.get("incremental_strategy") or 'merge' -%} + + {% set invalid_strategy_msg -%} + Invalid incremental strategy provided: {{ strategy }} + Expected one of: 'merge', 'insert_overwrite', 'microbatch' + {%- endset %} + {% if strategy not in ['merge', 'insert_overwrite', 'microbatch'] %} + {% do exceptions.raise_compiler_error(invalid_strategy_msg) %} + {% endif %} + + {% if strategy == 'microbatch' %} + {% do bq_validate_microbatch_config(config) %} + {% endif %} + + {% do return(strategy) %} +{% endmacro %} + +{% macro source_sql_with_partition(partition_by, source_sql) %} + + {%- if partition_by.time_ingestion_partitioning %} + {{ return(wrap_with_time_ingestion_partitioning_sql(partition_by, source_sql, False)) }} + {% else %} + {{ return(source_sql) }} + {%- endif -%} + +{% endmacro %} + +{% macro bq_create_table_as(partition_by, temporary, relation, compiled_code, language='sql') %} + {%- set _dbt_max_partition = declare_dbt_max_partition(this, partition_by, compiled_code, language) -%} + {% if partition_by.time_ingestion_partitioning and language == 'python' %} + {% do exceptions.raise_compiler_error( + "Python models do not support ingestion time partitioning" + ) %} + {% elif partition_by.time_ingestion_partitioning and language == 'sql' %} + {#-- Create the table before inserting data as ingestion time partitioned tables can't be created with the transformed data --#} + {% do run_query(create_table_as(temporary, relation, compiled_code)) %} + {{ return(_dbt_max_partition + bq_insert_into_ingestion_time_partitioned_table_sql(relation, compiled_code)) }} + {% else %} + {{ return(_dbt_max_partition + create_table_as(temporary, relation, compiled_code, language)) }} + {% endif %} +{% endmacro %} + +{% macro bq_generate_incremental_build_sql( + strategy, tmp_relation, 
target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions, incremental_predicates +) %} + {#-- if partitioned, use BQ scripting to get the range of partition values to be updated --#} + {% if strategy == 'insert_overwrite' %} + + {% set build_sql = bq_generate_incremental_insert_overwrite_build_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions + ) %} + + {% elif strategy == 'microbatch' %} + + {% set build_sql = bq_generate_microbatch_build_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions + ) %} + + {% else %} {# strategy == 'merge' #} + {% set build_sql = bq_generate_incremental_merge_build_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, dest_columns, tmp_relation_exists, incremental_predicates + ) %} + + {% endif %} + + {{ return(build_sql) }} + +{% endmacro %} + +{% materialization incremental, adapter='bigquery', supported_languages=['sql', 'python'] -%} + + {%- set unique_key = config.get('unique_key') -%} + {%- set full_refresh_mode = (should_full_refresh()) -%} + {%- set language = model['language'] %} + + {%- set target_relation = this %} + {%- set existing_relation = load_relation(this) %} + {%- set tmp_relation = make_temp_relation(this) %} + + {#-- Validate early so we don't run SQL if the strategy is invalid --#} + {% set strategy = dbt_bigquery_validate_get_incremental_strategy(config) -%} + + {%- set raw_partition_by = config.get('partition_by', none) -%} + {%- set partition_by = adapter.parse_partition_by(raw_partition_by) -%} + {%- set partitions = config.get('partitions', none) -%} + {%- set cluster_by = config.get('cluster_by', none) -%} + + {% set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') %} + {% set incremental_predicates = config.get('predicates', default=none) or config.get('incremental_predicates', default=none) %} + + -- grab current tables grants config for comparison later on + {% set grant_config = config.get('grants') %} + + {{ run_hooks(pre_hooks) }} + + {% if partition_by.copy_partitions is true and strategy not in ['insert_overwrite', 'microbatch'] %} {#-- We can't copy partitions with merge strategy --#} + {% set wrong_strategy_msg -%} + The 'copy_partitions' option requires the 'incremental_strategy' option to be set to 'insert_overwrite' or 'microbatch'. 
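+        {#-- copy_partitions swaps whole partitions via the BigQuery copy API, so there is no row-level merge to fall back on --#}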
+ {%- endset %} + {% do exceptions.raise_compiler_error(wrong_strategy_msg) %} + + {% elif existing_relation is none %} + {%- call statement('main', language=language) -%} + {{ bq_create_table_as(partition_by, False, target_relation, compiled_code, language) }} + {%- endcall -%} + + {% elif existing_relation.is_view %} + {#-- There's no way to atomically replace a view with a table on BQ --#} + {{ adapter.drop_relation(existing_relation) }} + {%- call statement('main', language=language) -%} + {{ bq_create_table_as(partition_by, False, target_relation, compiled_code, language) }} + {%- endcall -%} + + {% elif full_refresh_mode %} + {#-- If the partition/cluster config has changed, then we must drop and recreate --#} + {% if not adapter.is_replaceable(existing_relation, partition_by, cluster_by) %} + {% do log("Hard refreshing " ~ existing_relation ~ " because it is not replaceable") %} + {{ adapter.drop_relation(existing_relation) }} + {% endif %} + {%- call statement('main', language=language) -%} + {{ bq_create_table_as(partition_by, False, target_relation, compiled_code, language) }} + {%- endcall -%} + + {% else %} + {%- if language == 'python' and strategy == 'insert_overwrite' -%} + {#-- This lets us move forward assuming no python will be directly templated into a query --#} + {%- set python_unsupported_msg -%} + The 'insert_overwrite' strategy is not yet supported for python models. + {%- endset %} + {% do exceptions.raise_compiler_error(python_unsupported_msg) %} + {%- endif -%} + + {% set tmp_relation_exists = false %} + {% if on_schema_change != 'ignore' or language == 'python' %} + {#-- Check first, since otherwise we may not build a temp table --#} + {#-- Python always needs to create a temp table --#} + {%- call statement('create_tmp_relation', language=language) -%} + {{ bq_create_table_as(partition_by, True, tmp_relation, compiled_code, language) }} + {%- endcall -%} + {% set tmp_relation_exists = true %} + {#-- Process schema changes. Returns dict of changes if successful. 
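When on_schema_change is append_new_columns or sync_all_columns, the returned columns come from the temp (source) relation.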
Use source columns for upserting/merging --#} + {% set dest_columns = process_schema_changes(on_schema_change, tmp_relation, existing_relation) %} + {% endif %} + + {% if not dest_columns %} + {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %} + {% endif %} + {#-- Add time ingestion pseudo column to destination column as not part of the 'schema' but still need it for actual data insertion --#} + {% if partition_by.time_ingestion_partitioning %} + {% set dest_columns = adapter.add_time_ingestion_partition_column(partition_by, dest_columns) %} + {% endif %} + + {% set build_sql = bq_generate_incremental_build_sql( + strategy, tmp_relation, target_relation, compiled_code, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, partition_by.copy_partitions, incremental_predicates + ) %} + + {%- call statement('main') -%} + {{ build_sql }} + {% endcall %} + + {% endif %} + + {{ run_hooks(post_hooks) }} + + {% set target_relation = this.incorporate(type='table') %} + + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %} + {% do apply_grants(target_relation, grant_config, should_revoke) %} + + {% do persist_docs(target_relation, model) %} + + {%- if tmp_relation_exists -%} + {{ adapter.drop_relation(tmp_relation) }} + {%- endif -%} + + {{ return({'relations': [target_relation]}) }} + +{%- endmaterialization %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/common.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/common.sql new file mode 100644 index 000000000..1c02f4912 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/common.sql @@ -0,0 +1,33 @@ +{% macro declare_dbt_max_partition(relation, partition_by, compiled_code, language='sql') %} + + {#-- TODO: revisit partitioning with python models --#} + {%- if '_dbt_max_partition' in compiled_code and language == 'sql' -%} + + declare _dbt_max_partition {{ partition_by.data_type_for_partition() }} default ( + select max({{ partition_by.field }}) from {{ this }} + where {{ partition_by.field }} is not null + ); + + {%- endif -%} + +{% endmacro %} + +{% macro predicate_for_avoid_require_partition_filter(target='DBT_INTERNAL_DEST') %} + + {%- set raw_partition_by = config.get('partition_by', none) -%} + {%- set partition_config = adapter.parse_partition_by(raw_partition_by) -%} + {%- set predicate = none -%} + + {% if partition_config and config.get('require_partition_filter') -%} + {%- set partition_field = partition_config.time_partitioning_field() if partition_config.time_ingestion_partitioning else partition_config.field -%} + {% set predicate %} + ( + `{{ target }}`.`{{ partition_field }}` is null + or `{{ target }}`.`{{ partition_field }}` is not null + ) + {% endset %} + {%- endif -%} + + {{ return(predicate) }} + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/insert_overwrite.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/insert_overwrite.sql new file mode 100644 index 000000000..3ba67931e --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/insert_overwrite.sql @@ -0,0 +1,166 @@ +{% macro bq_generate_incremental_insert_overwrite_build_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions +) %} + {% if 
partition_by is none %} + {% set missing_partition_msg -%} + The 'insert_overwrite' strategy requires the `partition_by` config. + {%- endset %} + {% do exceptions.raise_compiler_error(missing_partition_msg) %} + {% endif %} + + {% set build_sql = bq_insert_overwrite_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions + ) %} + + {{ return(build_sql) }} + +{% endmacro %} + +{% macro bq_copy_partitions(tmp_relation, target_relation, partitions, partition_by) %} + + {% for partition in partitions %} + {% if partition_by.data_type == 'int64' %} + {% set partition = partition | as_text %} + {% elif partition_by.granularity == 'hour' %} + {% set partition = partition.strftime("%Y%m%d%H") %} + {% elif partition_by.granularity == 'day' %} + {% set partition = partition.strftime("%Y%m%d") %} + {% elif partition_by.granularity == 'month' %} + {% set partition = partition.strftime("%Y%m") %} + {% elif partition_by.granularity == 'year' %} + {% set partition = partition.strftime("%Y") %} + {% endif %} + {% set tmp_relation_partitioned = api.Relation.create(database=tmp_relation.database, schema=tmp_relation.schema, identifier=tmp_relation.table ~ '$' ~ partition, type=tmp_relation.type) %} + {% set target_relation_partitioned = api.Relation.create(database=target_relation.database, schema=target_relation.schema, identifier=target_relation.table ~ '$' ~ partition, type=target_relation.type) %} + {% do adapter.copy_table(tmp_relation_partitioned, target_relation_partitioned, "table") %} + {% endfor %} + +{% endmacro %} + +{% macro bq_insert_overwrite_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions +) %} + {% if partitions is not none and partitions != [] %} {# static #} + {{ bq_static_insert_overwrite_sql(tmp_relation, target_relation, sql, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions) }} + {% else %} {# dynamic #} + {{ bq_dynamic_insert_overwrite_sql(tmp_relation, target_relation, sql, unique_key, partition_by, dest_columns, tmp_relation_exists, copy_partitions) }} + {% endif %} +{% endmacro %} + +{% macro bq_static_insert_overwrite_sql( + tmp_relation, target_relation, sql, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions +) %} + + {% set predicate -%} + {{ partition_by.render_wrapped(alias='DBT_INTERNAL_DEST') }} in ( + {{ partitions | join (', ') }} + ) + {%- endset %} + + {%- set source_sql -%} + ( + {% if partition_by.time_ingestion_partitioning and tmp_relation_exists -%} + select + {{ partition_by.insertable_time_partitioning_field() }}, + * from {{ tmp_relation }} + {% elif tmp_relation_exists -%} + select + * from {{ tmp_relation }} + {%- elif partition_by.time_ingestion_partitioning -%} + {{ wrap_with_time_ingestion_partitioning_sql(partition_by, sql, True) }} + {%- else -%} + {{sql}} + {%- endif %} + + ) + {%- endset -%} + + {% if copy_partitions %} + {% do bq_copy_partitions(tmp_relation, target_relation, partitions, partition_by) %} + {% else %} + + {#-- In case we're putting the model SQL _directly_ into the MERGE statement, + we need to prepend the MERGE statement with the user-configured sql_header, + which may be needed to resolve that model SQL (e.g. referencing a variable or UDF in the header) + in the "temporary table exists" case, we save the model SQL result as a temp table first, wherein the + sql_header is included by the create_table_as macro. + #} + -- 1. 
run the merge statement + {{ get_insert_overwrite_merge_sql(target_relation, source_sql, dest_columns, [predicate], include_sql_header = not tmp_relation_exists) }}; + + {%- if tmp_relation_exists -%} + -- 2. clean up the temp table + drop table if exists {{ tmp_relation }}; + {%- endif -%} + + {% endif %} +{% endmacro %} + +{% macro bq_dynamic_copy_partitions_insert_overwrite_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, dest_columns, tmp_relation_exists, copy_partitions + ) %} + {%- if tmp_relation_exists is false -%} + {# We run temp table creation in a separated script to move to partitions copy if it doesn't already exist #} + {%- call statement('create_tmp_relation_for_copy', language='sql') -%} + {{ bq_create_table_as(partition_by, True, tmp_relation, sql, 'sql') + }} + {%- endcall %} + {%- endif -%} + {%- set partitions_sql -%} + select distinct {{ partition_by.render_wrapped() }} + from {{ tmp_relation }} + {%- endset -%} + {%- set partitions = run_query(partitions_sql).columns[0].values() -%} + {# We copy the partitions #} + {%- do bq_copy_partitions(tmp_relation, target_relation, partitions, partition_by) -%} + -- Clean up the temp table + drop table if exists {{ tmp_relation }} +{% endmacro %} + +{% macro bq_dynamic_insert_overwrite_sql(tmp_relation, target_relation, sql, unique_key, partition_by, dest_columns, tmp_relation_exists, copy_partitions) %} + {%- if copy_partitions is true %} + {{ bq_dynamic_copy_partitions_insert_overwrite_sql(tmp_relation, target_relation, sql, unique_key, partition_by, dest_columns, tmp_relation_exists, copy_partitions) }} + {% else -%} + {% set predicate -%} + {{ partition_by.render_wrapped(alias='DBT_INTERNAL_DEST') }} in unnest(dbt_partitions_for_replacement) + {%- endset %} + + {%- set source_sql -%} + ( + select + {% if partition_by.time_ingestion_partitioning -%} + {{ partition_by.insertable_time_partitioning_field() }}, + {%- endif -%} + * from {{ tmp_relation }} + ) + {%- endset -%} + + -- generated script to merge partitions into {{ target_relation }} + declare dbt_partitions_for_replacement array<{{ partition_by.data_type_for_partition() }}>; + + {# have we already created the temp table to check for schema changes? #} + {% if not tmp_relation_exists %} + -- 1. create a temp table with model data + {{ bq_create_table_as(partition_by, True, tmp_relation, sql, 'sql') }} + {% else %} + -- 1. temp table already exists, we used it to check for schema changes + {% endif %} + {%- set partition_field = partition_by.time_partitioning_field() if partition_by.time_ingestion_partitioning else partition_by.render_wrapped() -%} + + -- 2. define partitions to update + set (dbt_partitions_for_replacement) = ( + select as struct + -- IGNORE NULLS: this needs to be aligned to _dbt_max_partition, which ignores null + array_agg(distinct {{ partition_field }} IGNORE NULLS) + from {{ tmp_relation }} + ); + + -- 3. run the merge statement + {{ get_insert_overwrite_merge_sql(target_relation, source_sql, dest_columns, [predicate]) }}; + + -- 4. 
clean up the temp table + drop table if exists {{ tmp_relation }} + + {% endif %} + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/merge.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/merge.sql new file mode 100644 index 000000000..a204caed9 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/merge.sql @@ -0,0 +1,34 @@ +{% macro bq_generate_incremental_merge_build_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, dest_columns, tmp_relation_exists, incremental_predicates +) %} + {%- set source_sql -%} + {%- if tmp_relation_exists -%} + ( + select + {% if partition_by.time_ingestion_partitioning -%} + {{ partition_by.insertable_time_partitioning_field() }}, + {%- endif -%} + * from {{ tmp_relation }} + ) + {%- else -%} {#-- wrap sql in parens to make it a subquery --#} + ( + {%- if partition_by.time_ingestion_partitioning -%} + {{ wrap_with_time_ingestion_partitioning_sql(partition_by, sql, True) }} + {%- else -%} + {{sql}} + {%- endif %} + ) + {%- endif -%} + {%- endset -%} + + {%- set predicates = [] if incremental_predicates is none else [] + incremental_predicates -%} + {%- set avoid_require_partition_filter = predicate_for_avoid_require_partition_filter() -%} + {%- if avoid_require_partition_filter is not none -%} + {% do predicates.append(avoid_require_partition_filter) %} + {%- endif -%} + + {% set build_sql = get_merge_sql(target_relation, source_sql, unique_key, dest_columns, predicates) %} + + {{ return(build_sql) }} + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/microbatch.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/microbatch.sql new file mode 100644 index 000000000..d4c4b7453 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/microbatch.sql @@ -0,0 +1,28 @@ +{% macro bq_validate_microbatch_config(config) %} + {% if config.get("partition_by") is none %} + {% set missing_partition_msg -%} + The 'microbatch' strategy requires a `partition_by` config. + {%- endset %} + {% do exceptions.raise_compiler_error(missing_partition_msg) %} + {% endif %} + + {% if config.get("partition_by").granularity != config.get('batch_size') %} + {% set invalid_partition_by_granularity_msg -%} + The 'microbatch' strategy requires a `partition_by` config with the same granularity as its configured `batch_size`. 
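+      {#-- e.g. a `batch_size` of 'day' must be paired with a `partition_by` granularity of 'day' --#}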
+ Got: + `batch_size`: {{ config.get('batch_size') }} + `partition_by.granularity`: {{ config.get("partition_by").granularity }} + {%- endset %} + {% do exceptions.raise_compiler_error(invalid_partition_by_granularity_msg) %} + {% endif %} +{% endmacro %} + +{% macro bq_generate_microbatch_build_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions +) %} + {% set build_sql = bq_insert_overwrite_sql( + tmp_relation, target_relation, sql, unique_key, partition_by, partitions, dest_columns, tmp_relation_exists, copy_partitions + ) %} + + {{ return(build_sql) }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/time_ingestion_tables.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/time_ingestion_tables.sql new file mode 100644 index 000000000..e44a6a94f --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/incremental_strategy/time_ingestion_tables.sql @@ -0,0 +1,50 @@ +{% macro wrap_with_time_ingestion_partitioning_sql(partition_by, sql, is_nested) %} + + select TIMESTAMP({{ partition_by.field }}) as {{ partition_by.insertable_time_partitioning_field() }}, * EXCEPT({{ partition_by.field }}) from ( + {{ sql }} + ){%- if not is_nested -%};{%- endif -%} + +{% endmacro %} + +{% macro get_quoted_with_types_csv(columns) %} + {% set quoted = [] %} + {% for col in columns -%} + {%- do quoted.append(adapter.quote(col.name) ~ " " ~ col.data_type) -%} + {%- endfor %} + {%- set dest_cols_csv = quoted | join(', ') -%} + {{ return(dest_cols_csv) }} + +{% endmacro %} + +{% macro columns_without_partition_fields_csv(partition_config, columns) -%} + {%- set columns_no_partition = partition_config.reject_partition_field_column(columns) -%} + {% set columns_names = get_quoted_with_types_csv(columns_no_partition) %} + {{ return(columns_names) }} + +{%- endmacro -%} + +{% macro bq_insert_into_ingestion_time_partitioned_table_sql(target_relation, sql) -%} + {%- set sql_header = config.get('sql_header', none) -%} + {{ sql_header if sql_header is not none }} + {%- set raw_partition_by = config.get('partition_by', none) -%} + {%- set partition_by = adapter.parse_partition_by(raw_partition_by) -%} + {% set dest_columns = adapter.get_columns_in_relation(target_relation) %} + {%- set dest_columns_csv = get_quoted_csv(dest_columns | map(attribute="name")) -%} + + insert into {{ target_relation }} ({{ partition_by.insertable_time_partitioning_field() }}, {{ dest_columns_csv }}) + {{ wrap_with_time_ingestion_partitioning_sql(partition_by, sql, False) }} + +{%- endmacro -%} + +{% macro get_columns_with_types_in_query_sql(select_sql) %} + {% set sql %} + {%- set sql_header = config.get('sql_header', none) -%} + {{ sql_header if sql_header is not none }} + select * from ( + {{ select_sql }} + ) as __dbt_sbq + where false + limit 0 + {% endset %} + {{ return(adapter.get_columns_in_select_sql(sql)) }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/seed.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/seed.sql new file mode 100644 index 000000000..c89d00598 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/seed.sql @@ -0,0 +1,24 @@ + +{% macro bigquery__create_csv_table(model, agate_table) %} + -- no-op +{% endmacro %} + +{% macro bigquery__reset_csv_table(model, full_refresh, old_relation, agate_table) %} + {{ 
adapter.drop_relation(old_relation) }} +{% endmacro %} + +{% macro bigquery__load_csv_rows(model, agate_table) %} + + {%- set column_override = model['config'].get('column_types', {}) -%} + {{ adapter.load_dataframe(model['database'], model['schema'], model['alias'], + agate_table, column_override, model['config']['delimiter']) }} + + {% call statement() %} + alter table {{ this.render() }} set {{ bigquery_table_options(config, model) }} + {% endcall %} + + {% if config.persist_relation_docs() and 'description' in model %} + + {{ adapter.update_table_description(model['database'], model['schema'], model['alias'], model['description']) }} + {% endif %} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/snapshot.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/snapshot.sql new file mode 100644 index 000000000..836a44c8d --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/snapshot.sql @@ -0,0 +1,15 @@ +{% macro bigquery__snapshot_hash_arguments(args) -%} + to_hex(md5(concat({%- for arg in args -%} + coalesce(cast({{ arg }} as string), ''){% if not loop.last %}, '|',{% endif -%} + {%- endfor -%} + ))) +{%- endmacro %} + +{% macro bigquery__create_columns(relation, columns) %} + {{ adapter.alter_table_add_columns(relation, columns) }} +{% endmacro %} + +{% macro bigquery__post_snapshot(staging_relation) %} + -- Clean up the snapshot temp table + {% do drop_relation(staging_relation) %} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/table.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/table.sql new file mode 100644 index 000000000..41bb69770 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/table.sql @@ -0,0 +1,141 @@ +{% materialization table, adapter='bigquery', supported_languages=['sql', 'python']-%} + + {%- set language = model['language'] -%} + {%- set identifier = model['alias'] -%} + {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%} + {%- set exists_not_as_table = (old_relation is not none and not old_relation.is_table) -%} + {%- set target_relation = api.Relation.create(database=database, schema=schema, identifier=identifier, type='table') -%} + + -- grab current tables grants config for comparison later on + {%- set grant_config = config.get('grants') -%} + + {{ run_hooks(pre_hooks) }} + + {# + We only need to drop this thing if it is not a table.
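+      (i.e. the existing relation is a view or similar non-table object)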
+ If it _is_ already a table, then we can overwrite it without downtime + Unlike table -> view, no need for `--full-refresh`: dropping a view is no big deal + #} + {%- if exists_not_as_table -%} + {{ adapter.drop_relation(old_relation) }} + {%- endif -%} + + -- build model + {%- set raw_partition_by = config.get('partition_by', none) -%} + {%- set partition_by = adapter.parse_partition_by(raw_partition_by) -%} + {%- set cluster_by = config.get('cluster_by', none) -%} + {% if not adapter.is_replaceable(old_relation, partition_by, cluster_by) %} + {% do log("Hard refreshing " ~ old_relation ~ " because it is not replaceable") %} + {% do adapter.drop_relation(old_relation) %} + {% endif %} + + -- build model + {%- call statement('main', language=language) -%} + {{ create_table_as(False, target_relation, compiled_code, language) }} + {%- endcall -%} + + {{ run_hooks(post_hooks) }} + + {% set should_revoke = should_revoke(old_relation, full_refresh_mode=True) %} + {% do apply_grants(target_relation, grant_config, should_revoke) %} + + {% do persist_docs(target_relation, model) %} + + {{ return({'relations': [target_relation]}) }} + +{% endmaterialization %} + +{% macro py_write_table(compiled_code, target_relation) %} +from pyspark.sql import SparkSession +{%- set raw_partition_by = config.get('partition_by', none) -%} +{%- set raw_cluster_by = config.get('cluster_by', none) -%} +{%- set enable_list_inference = config.get('enable_list_inference', true) -%} +{%- set intermediate_format = config.get('intermediate_format', none) -%} + +{%- set partition_config = adapter.parse_partition_by(raw_partition_by) %} + +spark = SparkSession.builder.appName('smallTest').getOrCreate() + +spark.conf.set("viewsEnabled","true") +spark.conf.set("temporaryGcsBucket","{{target.gcs_bucket}}") +spark.conf.set("enableListInference", "{{ enable_list_inference }}") +{% if intermediate_format %} +spark.conf.set("intermediateFormat", "{{ intermediate_format }}") +{% endif %} + +{{ compiled_code }} +dbt = dbtObj(spark.read.format("bigquery").load) +df = model(dbt, spark) + +# COMMAND ---------- +# this is materialization code dbt generated, please do not modify + +import pyspark +# make sure pandas exists before using it +try: + import pandas + pandas_available = True +except ImportError: + pandas_available = False + +# make sure pyspark.pandas exists before using it +try: + import pyspark.pandas + pyspark_pandas_api_available = True +except ImportError: + pyspark_pandas_api_available = False + +# make sure databricks.koalas exists before using it +try: + import databricks.koalas + koalas_available = True +except ImportError: + koalas_available = False + +# preferentially convert pandas DataFrames to pandas-on-Spark or Koalas DataFrames first +# since they know how to convert pandas DataFrames better than `spark.createDataFrame(df)` +# and converting from pandas-on-Spark to Spark DataFrame has no overhead +if pyspark_pandas_api_available and pandas_available and isinstance(df, pandas.core.frame.DataFrame): + df = pyspark.pandas.frame.DataFrame(df) +elif koalas_available and pandas_available and isinstance(df, pandas.core.frame.DataFrame): + df = databricks.koalas.frame.DataFrame(df) + +# convert to pyspark.sql.dataframe.DataFrame +if isinstance(df, pyspark.sql.dataframe.DataFrame): + pass # since it is already a Spark DataFrame +elif pyspark_pandas_api_available and isinstance(df, pyspark.pandas.frame.DataFrame): + df = df.to_spark() +elif koalas_available and isinstance(df, databricks.koalas.frame.DataFrame): + df = 
df.to_spark() +elif pandas_available and isinstance(df, pandas.core.frame.DataFrame): + df = spark.createDataFrame(df) +else: + msg = f"{type(df)} is not a supported type for dbt Python materialization" + raise Exception(msg) + +# For writeMethod we need to use "indirect" if materializing a partitioned table +# otherwise we can use "direct". Note that indirect will fail if the GCS bucket has a retention policy set on it. +{%- if partition_config %} + {%- set write_method = 'indirect' -%} +{%- else %} + {% set write_method = 'direct' -%} +{%- endif %} + +df.write \ + .mode("overwrite") \ + .format("bigquery") \ + .option("writeMethod", "{{ write_method }}") \ + .option("writeDisposition", 'WRITE_TRUNCATE') \ + {%- if partition_config is not none %} + {%- if partition_config.data_type | lower in ('date','timestamp','datetime') %} + .option("partitionField", "{{- partition_config.field -}}") \ + {%- if partition_config.granularity is not none %} + .option("partitionType", "{{- partition_config.granularity| upper -}}") \ + {%- endif %} + {%- endif %} + {%- endif %} + {%- if raw_cluster_by is not none %} + .option("clusteredFields", "{{- raw_cluster_by | join(',') -}}") \ + {%- endif %} + .save("{{target_relation}}") +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/view.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/view.sql new file mode 100644 index 000000000..fd05129f9 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/materializations/view.sql @@ -0,0 +1,29 @@ + +{% macro bigquery__handle_existing_table(full_refresh, old_relation) %} + {%- if full_refresh -%} + {{ adapter.drop_relation(old_relation) }} + {%- else -%} + {{ exceptions.relation_wrong_type(old_relation, 'view') }} + {%- endif -%} +{% endmacro %} + + +{% materialization view, adapter='bigquery' -%} + -- grab current tables grants config for comparison later on + {% set grant_config = config.get('grants') %} + + {% set to_return = bigquery__create_or_replace_view() %} + + {% set target_relation = this.incorporate(type='view') %} + + {% do persist_docs(target_relation, model) %} + + {% if config.get('grant_access_to') %} + {% for grant_target_dict in config.get('grant_access_to') %} + {% do adapter.grant_access_to(this, 'view', None, grant_target_dict) %} + {% endfor %} + {% endif %} + + {% do return(to_return) %} + +{%- endmaterialization %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/python_model/python.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/python_model/python.sql new file mode 100644 index 000000000..adbab752e --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/python_model/python.sql @@ -0,0 +1,3 @@ +{% macro bigquery__resolve_model_name(input_model_name) -%} + {{ input_model_name | string | replace('`', '') | replace('"', '\"') }} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/cluster.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/cluster.sql new file mode 100644 index 000000000..3d2e640ad --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/cluster.sql @@ -0,0 +1,13 @@ +{% macro cluster_by(raw_cluster_by) %} + {%- if raw_cluster_by is not none -%} + cluster by {% if raw_cluster_by is string -%} + {% set raw_cluster_by = [raw_cluster_by] %} + {%- endif -%} + {%- for cluster in raw_cluster_by -%} + {{ cluster }} + {%- if not loop.last -%}, {% endif -%} + {%- endfor -%} + + {% endif %} + +{%- endmacro -%} diff --git
a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/drop.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/drop.sql new file mode 100644 index 000000000..7a50704a0 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/drop.sql @@ -0,0 +1,3 @@ +{% macro bigquery__drop_relation(relation) -%} + {% do adapter.drop_relation(relation) %} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/alter.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/alter.sql new file mode 100644 index 000000000..e71f869ae --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/alter.sql @@ -0,0 +1,25 @@ +{% macro bigquery__get_alter_materialized_view_as_sql( + relation, + configuration_changes, + sql, + existing_relation, + backup_relation, + intermediate_relation +) %} + + {% if configuration_changes.requires_full_refresh %} + {{ get_replace_sql(existing_relation, relation, sql) }} + {% else %} + + alter materialized view {{ relation }} + set {{ bigquery_options(configuration_changes.options.context.as_ddl_dict()) }} + + {%- endif %} + +{% endmacro %} + +{% macro bigquery__get_materialized_view_configuration_changes(existing_relation, new_config) %} + {% set _existing_materialized_view = adapter.describe_relation(existing_relation) %} + {% set _configuration_changes = existing_relation.materialized_view_config_changeset(_existing_materialized_view, new_config.model) %} + {% do return(_configuration_changes) %} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/create.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/create.sql new file mode 100644 index 000000000..d3e8c7685 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/create.sql @@ -0,0 +1,11 @@ +{% macro bigquery__get_create_materialized_view_as_sql(relation, sql) %} + + {%- set materialized_view = adapter.Relation.materialized_view_from_relation_config(config.model) -%} + + create materialized view if not exists {{ relation }} + {% if materialized_view.partition %}{{ partition_by(materialized_view.partition) }}{% endif %} + {% if materialized_view.cluster %}{{ cluster_by(materialized_view.cluster.fields) }}{% endif %} + {{ bigquery_options(materialized_view.options.as_ddl_dict()) }} + as {{ sql }} + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/drop.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/drop.sql new file mode 100644 index 000000000..76e87f7fe --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/drop.sql @@ -0,0 +1,3 @@ +{% macro bigquery__drop_materialized_view(relation) %} + drop materialized view if exists {{ relation }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/refresh.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/refresh.sql new file mode 100644 index 000000000..40ad59f7b --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/refresh.sql @@ -0,0 +1,3 @@ +{% macro bigquery__refresh_materialized_view(relation) %} + call bq.refresh_materialized_view('{{ relation.database }}.{{ relation.schema }}.{{ relation.identifier }}') +{% endmacro %} diff --git 
a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/replace.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/replace.sql new file mode 100644 index 000000000..2e4a0b69f --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/materialized_view/replace.sql @@ -0,0 +1,11 @@ +{% macro bigquery__get_replace_materialized_view_as_sql(relation, sql) %} + + {%- set materialized_view = adapter.Relation.materialized_view_from_relation_config(config.model) -%} + + create or replace materialized view if not exists {{ relation }} + {% if materialized_view.partition %}{{ partition_by(materialized_view.partition) }}{% endif %} + {% if materialized_view.cluster %}{{ cluster_by(materialized_view.cluster.fields) }}{% endif %} + {{ bigquery_options(materialized_view.options.as_ddl_dict()) }} + as {{ sql }} + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/options.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/options.sql new file mode 100644 index 000000000..11f5d8541 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/options.sql @@ -0,0 +1,8 @@ +{% macro bigquery_options(opts) %} + {% set options -%} + OPTIONS({% for opt_key, opt_val in opts.items() %} + {{ opt_key }}={{ opt_val }}{{ "," if not loop.last }} + {% endfor %}) + {%- endset %} + {%- do return(options) -%} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/partition.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/partition.sql new file mode 100644 index 000000000..238baeca0 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/partition.sql @@ -0,0 +1,15 @@ +{% macro partition_by(partition_config) -%} + {%- if partition_config is none -%} + {% do return('') %} + {%- elif partition_config.time_ingestion_partitioning -%} + partition by {{ partition_config.render_wrapped() }} + {%- elif partition_config.data_type | lower in ('date','timestamp','datetime') -%} + partition by {{ partition_config.render() }} + {%- elif partition_config.data_type | lower in ('int64') -%} + {%- set range = partition_config.range -%} + partition by range_bucket( + {{ partition_config.field }}, + generate_array({{ range.start}}, {{ range.end }}, {{ range.interval }}) + ) + {%- endif -%} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/rename.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/rename.sql new file mode 100644 index 000000000..c8abfea6f --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/rename.sql @@ -0,0 +1,3 @@ +{% macro bigquery__rename_relation(from_relation, to_relation) -%} + {% do adapter.rename_relation(from_relation, to_relation) %} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/drop.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/drop.sql new file mode 100644 index 000000000..cffb41e06 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/drop.sql @@ -0,0 +1,3 @@ +{% macro bigquery__drop_table(relation) %} + drop table if exists {{ relation }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/options.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/options.sql new file mode 100644 index 000000000..9f9b6b6d1 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/options.sql @@ -0,0 +1,4 @@ +{% macro 
bigquery_table_options(config, node, temporary) %} + {% set opts = adapter.get_table_options(config, node, temporary) %} + {%- do return(bigquery_options(opts)) -%} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/rename.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/rename.sql new file mode 100644 index 000000000..eff0d5d7f --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/table/rename.sql @@ -0,0 +1,3 @@ +{%- macro bigquery__get_rename_table_sql(relation, new_name) -%} + alter table {{ relation }} rename to {{ new_name }} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/drop.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/drop.sql new file mode 100644 index 000000000..6f269af1d --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/drop.sql @@ -0,0 +1,3 @@ +{% macro bigquery__drop_view(relation) %} + drop view if exists {{ relation }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/options.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/options.sql new file mode 100644 index 000000000..bed2176b9 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/options.sql @@ -0,0 +1,4 @@ +{% macro bigquery_view_options(config, node) %} + {% set opts = adapter.get_view_options(config, node) %} + {%- do return(bigquery_options(opts)) -%} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/rename.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/rename.sql new file mode 100644 index 000000000..f21a2868c --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/rename.sql @@ -0,0 +1,3 @@ +{%- macro bigquery__get_rename_view_sql(relation, new_name) -%} + alter view {{ relation }} rename to {{ new_name }} +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/replace.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/replace.sql new file mode 100644 index 000000000..9799eaf87 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/relations/view/replace.sql @@ -0,0 +1,54 @@ +/* {# + Core materialization implementation. BigQuery and Snowflake are similar + because both can use `create or replace view` where the resulting view's columns + are not necessarily the same as those of the existing view. On Redshift, this would + result in: ERROR: cannot change number of columns in view + + This implementation is superior to the create_temp, swap_with_existing, drop_old + paradigm because transactions don't run DDL queries atomically on Snowflake. By using + `create or replace view`, the materialization becomes atomic in nature. +#} */ + +{% macro bigquery__create_or_replace_view() %} + {%- set identifier = model['alias'] -%} + + {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%} + {%- set exists_as_view = (old_relation is not none and old_relation.is_view) -%} + + {%- set target_relation = api.Relation.create( + identifier=identifier, schema=schema, database=database, + type='view') -%} + {% set grant_config = config.get('grants') %} + + {{ run_hooks(pre_hooks) }} + + -- If there's a table with the same name and we weren't told to full refresh, + -- that's an error. If we were told to full refresh, drop it. 
This behavior differs + -- for Snowflake and BigQuery, so multiple dispatch is used. + {%- if old_relation is not none and not old_relation.is_view -%} + {{ handle_existing_table(should_full_refresh(), old_relation) }} + {%- endif -%} + + -- build model + {% call statement('main') -%} + {{ get_create_view_as_sql(target_relation, sql) }} + {%- endcall %} + + {% set should_revoke = should_revoke(exists_as_view, full_refresh_mode=True) %} + {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} + + {{ run_hooks(post_hooks) }} + + {{ return({'relations': [target_relation]}) }} + +{% endmacro %} + + +{% macro handle_existing_table(full_refresh, old_relation) %} + {{ adapter.dispatch('handle_existing_table', 'dbt')(full_refresh, old_relation) }} +{% endmacro %} + +{% macro default__handle_existing_table(full_refresh, old_relation) %} + {{ log("Dropping relation " ~ old_relation ~ " because it is of type " ~ old_relation.type) }} + {{ adapter.drop_relation(old_relation) }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_append.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_append.sql new file mode 100644 index 000000000..78bd5cc43 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_append.sql @@ -0,0 +1,3 @@ +{% macro bigquery__array_append(array, new_element) -%} + {{ array_concat(array, array_construct([new_element])) }} +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_concat.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_concat.sql new file mode 100644 index 000000000..eff8f524a --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_concat.sql @@ -0,0 +1,3 @@ +{% macro bigquery__array_concat(array_1, array_2) -%} + array_concat({{ array_1 }}, {{ array_2 }}) +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_construct.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_construct.sql new file mode 100644 index 000000000..270b1f785 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/array_construct.sql @@ -0,0 +1,7 @@ +{% macro bigquery__array_construct(inputs, data_type) -%} + {% if inputs|length > 0 %} + [ {{ inputs|join(' , ') }} ] + {% else %} + ARRAY<{{data_type}}>[] + {% endif %} +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/bool_or.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/bool_or.sql new file mode 100644 index 000000000..5b2e371f9 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/bool_or.sql @@ -0,0 +1,5 @@ +{% macro bigquery__bool_or(expression) -%} + + logical_or({{ expression }}) + +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/date.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/date.sql new file mode 100644 index 000000000..0f3b85aca --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/date.sql @@ -0,0 +1,3 @@ +{% macro bigquery__date(year, month, day) -%} + date({{ year }}, {{ month }}, {{ day }}) +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/date_trunc.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/date_trunc.sql new file mode 100644 index 000000000..f534c2014 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/date_trunc.sql @@ -0,0 +1,7 @@ +{% macro bigquery__date_trunc(datepart, date) -%} + timestamp_trunc( + cast({{date}} as timestamp), + 
{{datepart}} + ) + +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/dateadd.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/dateadd.sql new file mode 100644 index 000000000..8c6131512 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/dateadd.sql @@ -0,0 +1,8 @@ +{% macro bigquery__dateadd(datepart, interval, from_date_or_timestamp) %} + + datetime_add( + cast( {{ from_date_or_timestamp }} as datetime), + interval {{ interval }} {{ datepart }} + ) + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/datediff.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/datediff.sql new file mode 100644 index 000000000..12bdcb3b0 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/datediff.sql @@ -0,0 +1,15 @@ +{% macro bigquery__datediff(first_date, second_date, datepart) -%} + + {% if dbt_version[0] == 1 and dbt_version[2] >= 2 %} + {{ return(dbt.datediff(first_date, second_date, datepart)) }} + {% else %} + + datetime_diff( + cast({{second_date}} as datetime), + cast({{first_date}} as datetime), + {{datepart}} + ) + + {% endif %} + +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/escape_single_quotes.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/escape_single_quotes.sql new file mode 100644 index 000000000..0f39b4e85 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/escape_single_quotes.sql @@ -0,0 +1,4 @@ +{# /*BigQuery uses a single backslash: they're -> they\'re. The second backslash is to escape it from Jinja */ #} +{% macro bigquery__escape_single_quotes(expression) -%} +{{ expression | replace("'", "\\'") }} +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/except.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/except.sql new file mode 100644 index 000000000..561004722 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/except.sql @@ -0,0 +1,5 @@ +{% macro bigquery__except() %} + + except distinct + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql new file mode 100644 index 000000000..1a4193c71 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql @@ -0,0 +1,37 @@ +{% macro bigquery__format_column(column) -%} + {% set data_type = column.data_type %} + {% set formatted = column.column.lower() ~ " " ~ data_type %} + {{ return({'name': column.name, 'data_type': data_type, 'formatted': formatted}) }} +{%- endmacro -%} + +{% macro bigquery__get_empty_schema_sql(columns) %} + {%- set col_err = [] -%} + {% for col in columns.values() %} + {%- if col['data_type'] is not defined -%} + {{ col_err.append(col['name']) }} + {%- endif -%} + {%- endfor -%} + {%- if (col_err | length) > 0 -%} + {{ exceptions.column_type_missing(column_names=col_err) }} + {%- endif -%} + + {%- set columns = adapter.nest_column_data_types(columns) -%} + {{ return(dbt.default__get_empty_schema_sql(columns)) }} +{% endmacro %} + +{% macro bigquery__get_select_subquery(sql) %} + select {{ adapter.dispatch('get_column_names')() }} + from ( + {{ sql }} + ) as model_subq +{%- endmacro %} + +{% macro bigquery__get_column_names() %} + {#- loop through nested user_provided_columns to get column names -#} + {%- set user_provided_columns = adapter.nest_column_data_types(model['columns']) -%} + {%- for i in user_provided_columns %} 
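+    {#- each key is a top-level column name after nesting; the per-column `quote` config controls quoting -#}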
+ {%- set col = user_provided_columns[i] -%} + {%- set col_name = adapter.quote(col['name']) if col.get('quote') else col['name'] -%} + {{ col_name }}{{ ", " if not loop.last }} + {%- endfor -%} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/hash.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/hash.sql new file mode 100644 index 000000000..7150bf332 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/hash.sql @@ -0,0 +1,3 @@ +{% macro bigquery__hash(field) -%} + to_hex({{dbt.default__hash(field)}}) +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/intersect.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/intersect.sql new file mode 100644 index 000000000..a2a348718 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/intersect.sql @@ -0,0 +1,5 @@ +{% macro bigquery__intersect() %} + + intersect distinct + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/listagg.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/listagg.sql new file mode 100644 index 000000000..928d36d21 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/listagg.sql @@ -0,0 +1,14 @@ +{% macro bigquery__listagg(measure, delimiter_text, order_by_clause, limit_num) -%} + + string_agg( + {{ measure }}, + {{ delimiter_text }} + {% if order_by_clause -%} + {{ order_by_clause }} + {%- endif %} + {% if limit_num -%} + limit {{ limit_num }} + {%- endif %} + ) + +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/position.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/position.sql new file mode 100644 index 000000000..0afc5676a --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/position.sql @@ -0,0 +1,9 @@ +{% macro bigquery__position(substring_text, string_text) %} + + strpos( + {{ string_text }}, + {{ substring_text }} + + ) + +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/right.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/right.sql new file mode 100644 index 000000000..f0748a0a0 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/right.sql @@ -0,0 +1,12 @@ +{% macro bigquery__right(string_text, length_expression) %} + + case when {{ length_expression }} = 0 + then '' + else + substr( + {{ string_text }}, + -1 * ({{ length_expression }}) + ) + end + +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/safe_cast.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/safe_cast.sql new file mode 100644 index 000000000..ec312af11 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/safe_cast.sql @@ -0,0 +1,27 @@ +{% macro bigquery__safe_cast(field, type) %} +{%- if type.lower().startswith('array') and field is iterable and (field is not string and field is not mapping) and field | length > 0 -%} + {#-- Extract nested type from 'array' --#} + {% set nested_type = type.lower()[6:-1] %} + {#-- BigQuery does not support direct casts to arrays. 
instead, each element must be cast individually + reaggregated into an array --#} + {%- if cast_from_string_unsupported_for(nested_type) %} + (select array_agg(safe_cast(i as {{ nested_type }})) from unnest([ + {%- for nested_field in field %} + {{ nested_field.strip('"').strip("'") }}{{ ',' if not loop.last }} + {%- endfor %} + ]) i) + {%- else -%} + (select array_agg(safe_cast(i as {{nested_type}})) from unnest({{field}}) i) + {%- endif -%} + +{%- elif type.lower() == 'json' and field is mapping -%} + safe_cast(json {{ dbt.string_literal(tojson(field)) }} as json) +{%- elif cast_from_string_unsupported_for(type) and field is string -%} + safe_cast({{field.strip('"').strip("'")}} as {{type}}) +{%- else -%} + safe_cast({{field}} as {{type}}) +{%- endif -%} +{% endmacro %} + +{% macro cast_from_string_unsupported_for(type) %} + {{ return(type.lower().startswith('struct') or type.lower() == 'geography') }} +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/split_part.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/split_part.sql new file mode 100644 index 000000000..a13f0d8ce --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/split_part.sql @@ -0,0 +1,20 @@ +{% macro bigquery__split_part(string_text, delimiter_text, part_number) %} + + {% if part_number >= 0 %} + split( + {{ string_text }}, + {{ delimiter_text }} + )[safe_offset({{ part_number - 1 }})] + {% else %} + split( + {{ string_text }}, + {{ delimiter_text }} + )[safe_offset( + length({{ string_text }}) + - length( + replace({{ string_text }}, {{ delimiter_text }}, '') + ) + 1 + {{ part_number }} + )] + {% endif %} + +{% endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/string_literal.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/string_literal.sql new file mode 100644 index 000000000..07e67319a --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/string_literal.sql @@ -0,0 +1,3 @@ +{%- macro bigquery__string_literal(value) -%} + '''{{ value }}''' +{%- endmacro -%} diff --git a/dbt-bigquery/src/dbt/include/bigquery/macros/utils/timestamps.sql b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/timestamps.sql new file mode 100644 index 000000000..cdcbfd51e --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/macros/utils/timestamps.sql @@ -0,0 +1,12 @@ +{% macro bigquery__current_timestamp() -%} + current_timestamp() +{%- endmacro %} + +{% macro bigquery__snapshot_string_as_time(timestamp) -%} + {%- set result = 'TIMESTAMP("' ~ timestamp ~ '")' -%} + {{ return(result) }} +{%- endmacro %} + +{% macro bigquery__current_timestamp_backcompat() -%} + current_timestamp +{%- endmacro %} diff --git a/dbt-bigquery/src/dbt/include/bigquery/profile_template.yml b/dbt-bigquery/src/dbt/include/bigquery/profile_template.yml new file mode 100644 index 000000000..d6cefdfb8 --- /dev/null +++ b/dbt-bigquery/src/dbt/include/bigquery/profile_template.yml @@ -0,0 +1,27 @@ +fixed: + type: bigquery + priority: interactive + job_retries: 1 +prompts: + _choose_authentication_method: + oauth: + _fixed_method: oauth + service_account: + _fixed_method: service-account + keyfile: + hint: '/path/to/bigquery/keyfile.json' + project: + hint: 'GCP project id' + dataset: + hint: 'the name of your dbt dataset' + threads: + hint: '1 or more' + type: 'int' + job_execution_timeout_seconds: + default: 300 + type: 'int' + _choose_location: + US: + _fixed_location: US + EU: + _fixed_location: EU diff --git a/dbt-bigquery/test.env.example 
b/dbt-bigquery/test.env.example
new file mode 100644
index 000000000..ffe9ee060
--- /dev/null
+++ b/dbt-bigquery/test.env.example
@@ -0,0 +1,17 @@
+# Note: These values will come from your BigQuery account and GCP projects.
+
+# Test Environment field definitions
+# Name of a GCP project you don't have access to query.
+BIGQUERY_TEST_NO_ACCESS_DATABASE=
+# Authentication credentials used to connect to BigQuery via the client library.
+BIGQUERY_TEST_SERVICE_ACCOUNT_JSON='{}'
+
+# tests for local ci/cd
+DBT_TEST_USER_1="group:buildbot@dbtlabs.com"
+DBT_TEST_USER_2="group:engineering-core-team@dbtlabs.com"
+DBT_TEST_USER_3="serviceAccount:dbt-integration-test-user@dbt-test-env.iam.gserviceaccount.com"
+
+# only needed for python model
+DATAPROC_REGION=us-
+DATAPROC_CLUSTER_NAME=
+GCS_BUCKET=
diff --git a/dbt-bigquery/tests/__init__.py b/dbt-bigquery/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/dbt-bigquery/tests/boundary/test_bigquery_sdk.py b/dbt-bigquery/tests/boundary/test_bigquery_sdk.py
new file mode 100644
index 000000000..b8e6c9995
--- /dev/null
+++ b/dbt-bigquery/tests/boundary/test_bigquery_sdk.py
@@ -0,0 +1,18 @@
+import pytest
+
+from dbt.tests.util import get_connection
+from google.cloud.bigquery import Client, DatasetReference, TableReference
+from google.api_core.exceptions import NotFound
+
+
+@pytest.mark.parametrize("table_name", ["this_table_does_not_exist"])
+def test_get_table_does_not_exist(project, table_name):
+    """
+    TODO: replace dbt project methods with direct connection instantiation
+    """
+    with get_connection(project.adapter) as conn:
+        client: Client = conn.handle
+        dataset_ref = DatasetReference(project.database, project.test_schema)
+        table_ref = TableReference(dataset_ref, table_name)
+        with pytest.raises(NotFound):
+            client.get_table(table_ref)
diff --git a/dbt-bigquery/tests/conftest.py b/dbt-bigquery/tests/conftest.py
new file mode 100644
index 000000000..33f7f9d17
--- /dev/null
+++ b/dbt-bigquery/tests/conftest.py
@@ -0,0 +1,59 @@
+import pytest
+import os
+import json
+from dbt.adapters.bigquery.credentials import _is_base64, _base64_to_string
+
+# Import the functional fixtures as a plugin
+# Note: fixtures with session scope need to be local
+
+pytest_plugins = ["dbt.tests.fixtures.project"]
+
+
+def pytest_addoption(parser):
+    parser.addoption("--profile", action="store", default="oauth", type=str)
+
+
+@pytest.fixture(scope="class")
+def dbt_profile_target(request):
+    profile_type = request.config.getoption("--profile")
+    if profile_type == "oauth":
+        target = oauth_target()
+    elif profile_type == "service_account":
+        target = service_account_target()
+    else:
+        raise ValueError(f"Invalid profile type '{profile_type}'")
+    return target
+
+
+def oauth_target():
+    return {
+        "type": "bigquery",
+        "method": "oauth",
+        "threads": 1,
+        "job_retries": 2,
+        "dataproc_region": os.getenv("DATAPROC_REGION"),
+        "dataproc_cluster_name": os.getenv("DATAPROC_CLUSTER_NAME"),
+        "gcs_bucket": os.getenv("GCS_BUCKET"),
+    }
+
+
+def service_account_target():
+    credentials_json_str = os.getenv("BIGQUERY_TEST_SERVICE_ACCOUNT_JSON").replace("'", "")
+    if _is_base64(credentials_json_str):
+        credentials_json_str = _base64_to_string(credentials_json_str)
+    credentials = json.loads(credentials_json_str)
+    project_id = credentials.get("project_id")
+    return {
+        "type": "bigquery",
+        "method": "service-account-json",
+        "threads": 1,
+        "job_retries": 2,
+        "project": project_id,
+        "keyfile_json": credentials,
+        # following 3 for python model
"dataproc_region": os.getenv("DATAPROC_REGION"), + "dataproc_cluster_name": os.getenv( + "DATAPROC_CLUSTER_NAME" + ), # only needed for cluster submission method + "gcs_bucket": os.getenv("GCS_BUCKET"), + } diff --git a/dbt-bigquery/tests/functional/adapter/catalog_tests/files.py b/dbt-bigquery/tests/functional/adapter/catalog_tests/files.py new file mode 100644 index 000000000..595517bf2 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/catalog_tests/files.py @@ -0,0 +1,33 @@ +MY_SEED = """ +id,value,record_valid_date +1,100,2023-01-01 00:00:00 +2,200,2023-01-02 00:00:00 +3,300,2023-01-02 00:00:00 +""".strip() + + +MY_TABLE = """ +{{ config( + materialized='table', +) }} +select * +from {{ ref('my_seed') }} +""" + + +MY_VIEW = """ +{{ config( + materialized='view', +) }} +select * +from {{ ref('my_seed') }} +""" + + +MY_MATERIALIZED_VIEW = """ +{{ config( + materialized='materialized_view', +) }} +select * +from {{ ref('my_table') }} +""" diff --git a/dbt-bigquery/tests/functional/adapter/catalog_tests/test_relation_types.py b/dbt-bigquery/tests/functional/adapter/catalog_tests/test_relation_types.py new file mode 100644 index 000000000..96beb69c9 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/catalog_tests/test_relation_types.py @@ -0,0 +1,44 @@ +from dbt.contracts.results import CatalogArtifact +from dbt.tests.util import run_dbt +import pytest + +from tests.functional.adapter.catalog_tests import files + + +class TestCatalogRelationTypes: + @pytest.fixture(scope="class", autouse=True) + def seeds(self): + return {"my_seed.csv": files.MY_SEED} + + @pytest.fixture(scope="class", autouse=True) + def models(self): + yield { + "my_table.sql": files.MY_TABLE, + "my_view.sql": files.MY_VIEW, + "my_materialized_view.sql": files.MY_MATERIALIZED_VIEW, + } + + @pytest.fixture(scope="class", autouse=True) + def docs(self, project): + run_dbt(["seed"]) + run_dbt(["run"]) + yield run_dbt(["docs", "generate"]) + + @pytest.mark.parametrize( + "node_name,relation_type", + [ + ("seed.test.my_seed", "table"), + ("model.test.my_table", "table"), + ("model.test.my_view", "view"), + ("model.test.my_materialized_view", "materialized view"), + ], + ) + def test_relation_types_populate_correctly( + self, docs: CatalogArtifact, node_name: str, relation_type: str + ): + """ + This test addresses: https://github.com/dbt-labs/dbt-bigquery/issues/995 + """ + assert node_name in docs.nodes + node = docs.nodes[node_name] + assert node.metadata.type == relation_type diff --git a/dbt-bigquery/tests/functional/adapter/column_types/fixtures.py b/dbt-bigquery/tests/functional/adapter/column_types/fixtures.py new file mode 100644 index 000000000..88175a88b --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/column_types/fixtures.py @@ -0,0 +1,49 @@ +_MACRO_TEST_ALTER_COLUMN_TYPE = """ +{% macro test_alter_column_type(model_name, column_name, new_column_type) %} + {% set relation = ref(model_name) %} + {{ alter_column_type(relation, column_name, new_column_type) }} +{% endmacro %} +""" + +_MODEL_SQL = """ +select + CAST(1 as int64) as int64_col, + CAST(2.0 as float64) as float64_col, + CAST(3.0 as numeric) as numeric_col, + CAST('3' as string) as string_col, +""" + +_MODEL_ALT_SQL = """ +{{ config(materialized='table') }} +select + CAST(1 as int64) as int64_col, + CAST(2.0 as float64) as float64_col, + CAST(3.0 as numeric) as numeric_col, + CAST('3' as string) as string_col, +""" + +_SCHEMA_YML = """ +version: 2 +models: + - name: model + data_tests: + - is_type: + column_map: + int64_col: 
['integer', 'number'] + float64_col: ['float', 'number'] + numeric_col: ['numeric', 'number'] + string_col: ['string', 'not number'] +""" + +_ALT_SCHEMA_YML = """ +version: 2 +models: + - name: model + data_tests: + - is_type: + column_map: + int64_col: ['string', 'not number'] + float64_col: ['float', 'number'] + numeric_col: ['numeric', 'number'] + string_col: ['string', 'not number'] +""" diff --git a/dbt-bigquery/tests/functional/adapter/column_types/test_alter_column_types.py b/dbt-bigquery/tests/functional/adapter/column_types/test_alter_column_types.py new file mode 100644 index 000000000..2d32e6a3c --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/column_types/test_alter_column_types.py @@ -0,0 +1,39 @@ +import pytest +import yaml +from dbt.tests.util import run_dbt +from dbt.tests.adapter.column_types.test_column_types import BaseColumnTypes +from dbt.tests.adapter.column_types.fixtures import macro_test_is_type_sql +from tests.functional.adapter.column_types.fixtures import ( + _MACRO_TEST_ALTER_COLUMN_TYPE, + _MODEL_ALT_SQL, + _ALT_SCHEMA_YML, +) + + +class BaseAlterColumnTypes(BaseColumnTypes): + @pytest.fixture(scope="class") + def macros(self): + return { + "test_is_type.sql": macro_test_is_type_sql, + "test_alter_column_type.sql": _MACRO_TEST_ALTER_COLUMN_TYPE, + } + + def run_and_alter_and_test(self, alter_column_type_args): + results = run_dbt(["run"]) + assert len(results) == 1 + run_dbt(["run-operation", "test_alter_column_type", "--args", alter_column_type_args]) + results = run_dbt(["test"]) + assert len(results) == 1 + + +class TestBigQueryAlterColumnTypes(BaseAlterColumnTypes): + @pytest.fixture(scope="class") + def models(self): + return {"model.sql": _MODEL_ALT_SQL, "schema.yml": _ALT_SCHEMA_YML} + + def test_bigquery_alter_column_types(self, project): + alter_column_type_args = yaml.safe_dump( + {"model_name": "model", "column_name": "int64_col", "new_column_type": "string"} + ) + + self.run_and_alter_and_test(alter_column_type_args) diff --git a/dbt-bigquery/tests/functional/adapter/column_types/test_column_types.py b/dbt-bigquery/tests/functional/adapter/column_types/test_column_types.py new file mode 100644 index 000000000..54675aa01 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/column_types/test_column_types.py @@ -0,0 +1,12 @@ +import pytest +from dbt.tests.adapter.column_types.test_column_types import BaseColumnTypes +from tests.functional.adapter.column_types.fixtures import _MODEL_SQL, _SCHEMA_YML + + +class TestBigQueryColumnTypes(BaseColumnTypes): + @pytest.fixture(scope="class") + def models(self): + return {"model.sql": _MODEL_SQL, "schema.yml": _SCHEMA_YML} + + def test_run_and_test(self, project): + self.run_and_test() diff --git a/dbt-bigquery/tests/functional/adapter/constraints/fixtures.py b/dbt-bigquery/tests/functional/adapter/constraints/fixtures.py new file mode 100644 index 000000000..415043403 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/constraints/fixtures.py @@ -0,0 +1,118 @@ +my_model_struct_wrong_data_type_sql = """ +{{ config(materialized = "table") }} + +select + STRUCT(1 AS struct_column_being_tested, "test" AS another_struct_column) as a +""" + +my_model_struct_correct_data_type_sql = """ +{{ config(materialized = "table")}} + +select + STRUCT("test" AS struct_column_being_tested, "test" AS b) as a +""" + +model_struct_data_type_schema_yml = """ +version: 2 +models: + - name: contract_struct_wrong + config: + contract: + enforced: true + columns: + - name: a.struct_column_being_tested + data_type: 
string + - name: a.b + data_type: string + + - name: contract_struct_correct + config: + contract: + enforced: true + columns: + - name: a.struct_column_being_tested + data_type: string + - name: a.b + data_type: string +""" + +my_model_double_struct_wrong_data_type_sql = """ +{{ config(materialized = "table") }} + +select + STRUCT( + STRUCT(1 AS struct_column_being_tested, "test" AS c) as b, + "test" as d + ) as a +""" + +my_model_double_struct_correct_data_type_sql = """ +{{ config(materialized = "table") }} + +select + STRUCT( + STRUCT("test" AS struct_column_being_tested, "test" AS c) as b, + "test" as d + ) as a +""" + +model_double_struct_data_type_schema_yml = """ +version: 2 +models: + - name: contract_struct_wrong + config: + contract: + enforced: true + columns: + - name: a.b.struct_column_being_tested + data_type: string + - name: a.b.c + data_type: string + - name: a.d + data_type: string + + - name: contract_struct_correct + config: + contract: + enforced: true + columns: + - name: a.b.struct_column_being_tested + data_type: string + - name: a.b.c + data_type: string + - name: a.d + data_type: string +""" + + +my_model_struct_sql = """ +{{ + config( + materialized = "table" + ) +}} + +select STRUCT("test" as nested_column, "test" as nested_column2) as id +""" + + +model_struct_schema_yml = """ +version: 2 +models: + - name: my_model + config: + contract: + enforced: true + columns: + - name: id.nested_column + quote: true + data_type: string + description: hello + constraints: + - type: not_null + - type: unique + - name: id.nested_column2 + data_type: string + constraints: + - type: unique +""" diff --git a/dbt-bigquery/tests/functional/adapter/constraints/test_constraints.py b/dbt-bigquery/tests/functional/adapter/constraints/test_constraints.py new file mode 100644 index 000000000..013f2948b --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/constraints/test_constraints.py @@ -0,0 +1,382 @@ +import pytest +from dbt.tests.adapter.constraints.test_constraints import ( + BaseTableConstraintsColumnsEqual, + BaseViewConstraintsColumnsEqual, + BaseIncrementalConstraintsColumnsEqual, + BaseTableContractSqlHeader, + BaseIncrementalContractSqlHeader, + BaseConstraintsRuntimeDdlEnforcement, + BaseConstraintsRollback, + BaseIncrementalConstraintsRuntimeDdlEnforcement, + BaseIncrementalConstraintsRollback, + BaseModelConstraintsRuntimeEnforcement, + BaseConstraintQuotedColumn, +) +from dbt.tests.adapter.constraints.fixtures import ( + my_model_sql, + my_incremental_model_sql, + my_model_wrong_order_sql, + my_model_view_wrong_order_sql, + my_model_incremental_wrong_order_sql, + my_model_wrong_name_sql, + my_model_view_wrong_name_sql, + my_model_incremental_wrong_name_sql, + my_model_with_quoted_column_name_sql, + model_schema_yml, + constrained_model_schema_yml, + model_contract_header_schema_yml, + model_quoted_column_schema_yml, + model_fk_constraint_schema_yml, + my_model_wrong_order_depends_on_fk_sql, + foreign_key_model_sql, + my_model_incremental_wrong_order_depends_on_fk_sql, +) + +from tests.functional.adapter.constraints.fixtures import ( + my_model_struct_wrong_data_type_sql, + my_model_struct_correct_data_type_sql, + my_model_double_struct_wrong_data_type_sql, + my_model_double_struct_correct_data_type_sql, + model_struct_data_type_schema_yml, + model_double_struct_data_type_schema_yml, + my_model_struct_sql, + model_struct_schema_yml, +) + +from dbt.tests.util import run_dbt_and_capture, run_dbt + +_expected_sql_bigquery = """ +create or replace table ( + id INT64 not 
null primary key not enforced references (id) not enforced, + color string, + date_day string +) +OPTIONS() +as ( + select id, + color, + date_day from + ( + -- depends_on: + select 'blue' as color, + 1 as id, + '2019-01-01' as date_day + ) as model_subq +); +""" + +_expected_struct_sql_bigquery = """ +create or replace table ( + id struct +) +OPTIONS() +as ( + select id from + ( + select STRUCT("test" as nested_column, "test" as nested_column2) as id + ) as model_subq +); +""" + +# Different on BigQuery: +# Switch from text to string handled by aliasing +constraints_yml = model_schema_yml +model_constraints_yml = constrained_model_schema_yml + +my_model_contract_sql_header_sql = """ +{{ + config( + materialized = "table" + ) +}} + +{% call set_sql_header(config) %} +DECLARE DEMO STRING DEFAULT 'hello world'; +{% endcall %} + +SELECT DEMO as column_name +""" + +my_model_incremental_contract_sql_header_sql = """ +{{ + config( + materialized = "incremental", + on_schema_change="append_new_columns" + ) +}} + +{% call set_sql_header(config) %} +DECLARE DEMO STRING DEFAULT 'hello world'; +{% endcall %} + +SELECT DEMO as column_name +""" + + +class BigQueryColumnEqualSetup: + @pytest.fixture + def string_type(self): + return "STRING" + + @pytest.fixture + def int_type(self): + return "INT64" + + @pytest.fixture + def data_types(self, int_type, string_type): + # sql_column_value, schema_data_type, error_data_type + return [ + ["1", int_type, int_type], + ["'1'", string_type, string_type], + ["cast('2019-01-01' as date)", "date", "DATE"], + ["true", "bool", "BOOL"], + ["cast('2013-11-03 00:00:00-07' as TIMESTAMP)", "timestamp", "TIMESTAMP"], + ["['a','b','c']", f"ARRAY<{string_type}>", f"ARRAY<{string_type}>"], + ["[1,2,3]", f"ARRAY<{int_type}>", f"ARRAY<{int_type}>"], + ["cast(1 as NUMERIC)", "numeric", "NUMERIC"], + ["""JSON '{"name": "Cooper", "forname": "Alice"}'""", "json", "JSON"], + ] + + +class TestBigQueryTableConstraintsColumnsEqual( + BigQueryColumnEqualSetup, BaseTableConstraintsColumnsEqual +): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_wrong_order.sql": my_model_wrong_order_sql, + "my_model_wrong_name.sql": my_model_wrong_name_sql, + "constraints_schema.yml": constraints_yml, + } + + +class TestBigQueryViewConstraintsColumnsEqual( + BigQueryColumnEqualSetup, BaseViewConstraintsColumnsEqual +): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_wrong_order.sql": my_model_view_wrong_order_sql, + "my_model_wrong_name.sql": my_model_view_wrong_name_sql, + "constraints_schema.yml": constraints_yml, + } + + +class TestBigQueryIncrementalConstraintsColumnsEqual( + BigQueryColumnEqualSetup, BaseIncrementalConstraintsColumnsEqual +): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_wrong_order.sql": my_model_incremental_wrong_order_sql, + "my_model_wrong_name.sql": my_model_incremental_wrong_name_sql, + "constraints_schema.yml": constraints_yml, + } + + +class TestBigQueryTableContractsSqlHeader(BaseTableContractSqlHeader): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_contract_sql_header.sql": my_model_contract_sql_header_sql, + "constraints_schema.yml": model_contract_header_schema_yml, + } + + +class TestBigQueryIncrementalContractsSqlHeader(BaseIncrementalContractSqlHeader): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_contract_sql_header.sql": my_model_incremental_contract_sql_header_sql, + "constraints_schema.yml": 
model_contract_header_schema_yml, + } + + +class BaseStructContract: + @pytest.fixture + def wrong_schema_data_type(self): + return "INT64" + + @pytest.fixture + def correct_schema_data_type(self): + return "STRING" + + @pytest.fixture(scope="class") + def models(self): + return { + "contract_struct_schema.yml": model_struct_data_type_schema_yml, + "contract_struct_wrong.sql": my_model_struct_wrong_data_type_sql, + "contract_struct_correct.sql": my_model_struct_correct_data_type_sql, + } + + def test__struct_contract_wrong_data_type( + self, project, correct_schema_data_type, wrong_schema_data_type + ): + results, log_output = run_dbt_and_capture( + ["run", "-s", "contract_struct_wrong"], expect_pass=False + ) + assert len(results) == 1 + assert results[0].node.config.contract.enforced + + expected = [ + "struct_column_being_tested", + wrong_schema_data_type, + correct_schema_data_type, + "data type mismatch", + ] + assert all([(exp in log_output or exp.upper() in log_output) for exp in expected]) + + def test__struct_contract_correct_data_type(self, project): + results = run_dbt(["run", "-s", "contract_struct_correct"]) + + assert len(results) == 1 + assert results[0].node.config.contract.enforced + + +class TestBigQueryStructContract(BaseStructContract): + pass + + +class TestBigQueryDoubleStructContract(BaseStructContract): + @pytest.fixture(scope="class") + def models(self): + return { + "contract_struct_schema.yml": model_double_struct_data_type_schema_yml, + "contract_struct_wrong.sql": my_model_double_struct_wrong_data_type_sql, + "contract_struct_correct.sql": my_model_double_struct_correct_data_type_sql, + } + + +class TestBigQueryTableConstraintsRuntimeDdlEnforcement(BaseConstraintsRuntimeDdlEnforcement): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_model_wrong_order_depends_on_fk_sql, + "foreign_key_model.sql": foreign_key_model_sql, + "constraints_schema.yml": model_fk_constraint_schema_yml, + } + + @pytest.fixture(scope="class") + def expected_sql(self, project): + return _expected_sql_bigquery + + +class TestBigQueryStructTableConstraintsRuntimeDdlEnforcement( + BaseConstraintsRuntimeDdlEnforcement +): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_model_struct_sql, + "constraints_schema.yml": model_struct_schema_yml, + } + + @pytest.fixture(scope="class") + def expected_sql(self, project): + return _expected_struct_sql_bigquery + + +class TestBigQueryTableConstraintsRollback(BaseConstraintsRollback): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_model_sql, + "constraints_schema.yml": constraints_yml, + } + + @pytest.fixture(scope="class") + def expected_error_messages(self): + return ["Required field id cannot be null"] + + +class TestBigQueryIncrementalConstraintsRuntimeDdlEnforcement( + BaseIncrementalConstraintsRuntimeDdlEnforcement +): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_model_incremental_wrong_order_depends_on_fk_sql, + "foreign_key_model.sql": foreign_key_model_sql, + "constraints_schema.yml": model_fk_constraint_schema_yml, + } + + @pytest.fixture(scope="class") + def expected_sql(self, project): + return _expected_sql_bigquery + + +class TestBigQueryIncrementalConstraintsRollback(BaseIncrementalConstraintsRollback): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_incremental_model_sql, + "constraints_schema.yml": constraints_yml, + } + + @pytest.fixture(scope="class") 
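+    # BigQuery reports a null written to a NOT NULL (REQUIRED) column as
+    # "Required field id cannot be null"; the inherited rollback test asserts
+    # that this message appears when the constrained model fails.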
+ def expected_error_messages(self): + return ["Required field id cannot be null"] + + +class TestBigQueryModelConstraintsRuntimeEnforcement(BaseModelConstraintsRuntimeEnforcement): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_model_wrong_order_depends_on_fk_sql, + "foreign_key_model.sql": foreign_key_model_sql, + "constraints_schema.yml": constrained_model_schema_yml, + } + + @pytest.fixture(scope="class") + def expected_sql(self): + return """ +create or replace table ( + id INT64 not null, + color string, + date_day string, + primary key (id) not enforced, + foreign key (id) references (id) not enforced +) +OPTIONS() +as ( + select id, + color, + date_day from + ( + -- depends_on: + select + 'blue' as color, + 1 as id, + '2019-01-01' as date_day + ) as model_subq +); +""" + + +class TestBigQueryConstraintQuotedColumn(BaseConstraintQuotedColumn): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_model_with_quoted_column_name_sql, + "constraints_schema.yml": model_quoted_column_schema_yml, + } + + @pytest.fixture(scope="class") + def expected_sql(self): + return """ +create or replace table ( + id INT64 not null, + `from` string not null, + date_day string +) +options() +as ( + select id, `from`, date_day + from ( + select + 'blue' as `from`, + 1 as id, + '2019-01-01' as date_day + ) as model_subq +); +""" diff --git a/dbt-bigquery/tests/functional/adapter/dbt_clone/test_dbt_clone.py b/dbt-bigquery/tests/functional/adapter/dbt_clone/test_dbt_clone.py new file mode 100644 index 000000000..189a3f067 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/dbt_clone/test_dbt_clone.py @@ -0,0 +1,20 @@ +import pytest +from dbt.tests.adapter.dbt_clone.test_dbt_clone import BaseClonePossible + + +class TestBigQueryClonePossible(BaseClonePossible): + @pytest.fixture(autouse=True) + def clean_up(self, project): + yield + with project.adapter.connection_named("__test"): + relation = project.adapter.Relation.create( + database=project.database, schema=f"{project.test_schema}_seeds" + ) + project.adapter.drop_schema(relation) + + relation = project.adapter.Relation.create( + database=project.database, schema=project.test_schema + ) + project.adapter.drop_schema(relation) + + pass diff --git a/dbt-bigquery/tests/functional/adapter/dbt_show/test_dbt_show.py b/dbt-bigquery/tests/functional/adapter/dbt_show/test_dbt_show.py new file mode 100644 index 000000000..6794547a5 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/dbt_show/test_dbt_show.py @@ -0,0 +1,61 @@ +import pytest +from dbt.tests.adapter.dbt_show.test_dbt_show import ( + BaseShowSqlHeader, + BaseShowLimit, + BaseShowDoesNotHandleDoubleLimit, +) + +from dbt.tests.util import run_dbt + +model_with_json_struct = """ + select * + from ( + select + struct< + k array< + struct + > + >( + [ + struct( + 1 as c1, + to_json(struct(1 as a)) as c2 + ) + ] + ) + as v + ) + """ + +model_with_null_json_struct = """ + select (struct(null)) as null_struct +""" + + +class TestBigQueryShowLimit(BaseShowLimit): + pass + + +class TestBigQueryShowSqlHeader(BaseShowSqlHeader): + pass + + +# Added to check if dbt show works with JSON struct +# Addresses: https://github.com/dbt-labs/dbt-bigquery/issues/972 +class TestBigQueryShowSqlWorksWithJSONStruct: + @pytest.fixture(scope="class") + def models(self): + return { + "json_struct_model.sql": model_with_json_struct, + "null_json_struct_model.sql": model_with_null_json_struct, + } + + def test_sql_header(self, project): + 
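+        # Preview a model whose column is a STRUCT nesting a JSON value; this
+        # previously broke `dbt show` (see the issue referenced above). Debug
+        # logging (-d) makes any adapter-side failure easier to diagnose.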
run_dbt(["show", "--select", "json_struct_model", "-d"]) + + def test_show_with_null_json_struct(self, project): + run_dbt(["show", "--select", "null_json_struct_model"]) + + +class TestBigQueryShowDoesNotHandleDoubleLimit(BaseShowDoesNotHandleDoubleLimit): + DATABASE_ERROR_MESSAGE = "Syntax error: Expected end of input but got keyword LIMIT" diff --git a/dbt-bigquery/tests/functional/adapter/describe_relation/_files.py b/dbt-bigquery/tests/functional/adapter/describe_relation/_files.py new file mode 100644 index 000000000..ac0203049 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/describe_relation/_files.py @@ -0,0 +1,88 @@ +MY_SEED = """ +id,value,record_date +1,100,2023-01-01 12:00:00 +2,200,2023-01-02 12:00:00 +3,300,2023-01-02 12:00:00 +""".strip() + + +MY_BASE_TABLE = """ +{{ config( + materialized='table', + partition_by={ + "field": "record_date", + "data_type": "datetime", + "granularity": "day" + }, + cluster_by=["id", "value"] +) }} +select + id, + value, + record_date +from {{ ref('my_seed') }} +""" + + +MY_MATERIALIZED_VIEW = """ +{{ config( + materialized='materialized_view', + partition_by={ + "field": "record_date", + "data_type": "datetime", + "granularity": "day" + }, + cluster_by="id", +) }} +select + id, + value, + record_date +from {{ ref('my_base_table') }} +""" + + +MY_OTHER_BASE_TABLE = """ +{{ config( + materialized='table', + partition_by={ + "field": "value", + "data_type": "int64", + "range": { + "start": 0, + "end": 500, + "interval": 50 + } + }, + cluster_by=["id", "value"] +) }} +select + id, + value, + record_date +from {{ ref('my_seed') }} +""" + + +MY_OTHER_MATERIALIZED_VIEW = """ +{{ config( + materialized='materialized_view', + partition_by={ + "field": "value", + "data_type": "int64", + "range": { + "start": 0, + "end": 500, + "interval": 50 + } + }, + cluster_by="id", + enable_refresh=False, + refresh_interval_minutes=60 +) }} +select + id, + value, + record_date +from {{ ref('my_other_base_table') }} +""" diff --git a/dbt-bigquery/tests/functional/adapter/describe_relation/test_describe_relation.py b/dbt-bigquery/tests/functional/adapter/describe_relation/test_describe_relation.py new file mode 100644 index 000000000..5ec995d11 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/describe_relation/test_describe_relation.py @@ -0,0 +1,100 @@ +import pytest + +from dbt.adapters.base.relation import BaseRelation +from dbt.adapters.contracts.relation import RelationType +from dbt.tests.util import get_connection, run_dbt + +from dbt.adapters.bigquery.relation_configs import BigQueryMaterializedViewConfig +from tests.functional.adapter.describe_relation import _files + + +class TestDescribeRelation: + @pytest.fixture(scope="class", autouse=True) + def seeds(self): + return {"my_seed.csv": _files.MY_SEED} + + @pytest.fixture(scope="class", autouse=True) + def models(self): + yield { + "my_base_table.sql": _files.MY_BASE_TABLE, + "my_materialized_view.sql": _files.MY_MATERIALIZED_VIEW, + "my_other_base_table.sql": _files.MY_OTHER_BASE_TABLE, + "my_other_materialized_view.sql": _files.MY_OTHER_MATERIALIZED_VIEW, + } + + @pytest.fixture(scope="class") + def my_seed(self, project) -> BaseRelation: + return project.adapter.Relation.create( + identifier="my_seed", + schema=project.test_schema, + database=project.database, + type=RelationType.Table, + ) + + @pytest.fixture(scope="class") + def my_base_table(self, project) -> BaseRelation: + """ + The base table for a materialized view needs to be partitioned in + the same way as the materialized 
view. So if we want to create a partitioned + materialized view, we need to partition the base table. This table is a + select * on the seed table, plus a partition. + """ + return project.adapter.Relation.create( + identifier="my_base_table", + schema=project.test_schema, + database=project.database, + type=RelationType.Table, + ) + + @pytest.fixture(scope="class") + def my_materialized_view(self, project) -> BaseRelation: + return project.adapter.Relation.create( + identifier="my_materialized_view", + schema=project.test_schema, + database=project.database, + type=RelationType.MaterializedView, + ) + + @pytest.fixture(scope="class") + def my_other_materialized_view(self, project) -> BaseRelation: + return project.adapter.Relation.create( + identifier="my_other_materialized_view", + schema=project.test_schema, + database=project.database, + type=RelationType.MaterializedView, + ) + + @pytest.fixture(scope="class", autouse=True) + def setup(self, project, my_base_table, my_materialized_view): + run_dbt(["seed"]) + run_dbt(["run"]) + yield + project.run_sql(f"drop schema if exists {project.test_schema} cascade") + + def test_describe_materialized_view(self, project, my_materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(my_materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.table_id == f'"{my_materialized_view.identifier}"' + assert results.dataset_id == f'"{my_materialized_view.schema}"' + assert results.project_id == f'"{my_materialized_view.database}"' + assert results.partition.field == "record_date" + assert results.partition.data_type == "datetime" + assert results.partition.granularity == "day" + assert results.cluster.fields == frozenset({"id"}) + assert results.options.enable_refresh is True + assert results.options.refresh_interval_minutes == 30 + + def test_describe_other_materialized_view(self, project, my_other_materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(my_other_materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.table_id == f'"{my_other_materialized_view.identifier}"' + assert results.dataset_id == f'"{my_other_materialized_view.schema}"' + assert results.project_id == f'"{my_other_materialized_view.database}"' + assert results.partition.field == "value" + assert results.partition.data_type == "int64" + assert results.partition.range == {"start": 0, "end": 500, "interval": 50} + assert results.cluster.fields == frozenset({"id"}) + assert results.options.enable_refresh is False + assert results.options.refresh_interval_minutes == 30 # BQ returns it to the default diff --git a/dbt-bigquery/tests/functional/adapter/empty/test_empty.py b/dbt-bigquery/tests/functional/adapter/empty/test_empty.py new file mode 100644 index 000000000..3bf47f35d --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/empty/test_empty.py @@ -0,0 +1,9 @@ +from dbt.tests.adapter.empty.test_empty import BaseTestEmpty, BaseTestEmptyInlineSourceRef + + +class TestBigQueryEmpty(BaseTestEmpty): + pass + + +class TestBigQueryEmptyInlineSourceRef(BaseTestEmptyInlineSourceRef): + pass diff --git a/dbt-bigquery/tests/functional/adapter/expected_stats.py b/dbt-bigquery/tests/functional/adapter/expected_stats.py new file mode 100644 index 000000000..560d8de73 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/expected_stats.py @@ -0,0 +1,62 @@ +from dbt.tests.util import AnyString, AnyInteger + + +def 
bigquery_stats(is_table, partition=None, cluster=None): + stats = {} + + if is_table: + stats.update( + { + "num_bytes": { + "id": "num_bytes", + "label": AnyString(), + "value": AnyInteger(), + "description": AnyString(), + "include": True, + }, + "num_rows": { + "id": "num_rows", + "label": AnyString(), + "value": AnyInteger(), + "description": AnyString(), + "include": True, + }, + } + ) + + if partition is not None: + stats.update( + { + "partitioning_type": { + "id": "partitioning_type", + "label": AnyString(), + "value": partition, + "description": AnyString(), + "include": True, + } + } + ) + + if cluster is not None: + stats.update( + { + "clustering_fields": { + "id": "clustering_fields", + "label": AnyString(), + "value": cluster, + "description": AnyString(), + "include": True, + } + } + ) + + has_stats = { + "id": "has_stats", + "label": "Has Stats?", + "value": bool(stats), + "description": "Indicates whether there are statistics for this table", + "include": False, + } + stats["has_stats"] = has_stats + + return stats diff --git a/dbt-bigquery/tests/functional/adapter/hooks/data/seed_model.sql b/dbt-bigquery/tests/functional/adapter/hooks/data/seed_model.sql new file mode 100644 index 000000000..ccea4830f --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/hooks/data/seed_model.sql @@ -0,0 +1,16 @@ +drop table if exists `{schema}.on_model_hook`; + +create table `{schema}.on_model_hook` ( + test_state STRING, -- start|end + target_dbname STRING, + target_host STRING, + target_name STRING, + target_schema STRING, + target_type STRING, + target_user STRING, + target_pass STRING, + target_threads INTEGER, + run_started_at STRING, + invocation_id STRING, + thread_id STRING +); diff --git a/dbt-bigquery/tests/functional/adapter/hooks/data/seed_run.sql b/dbt-bigquery/tests/functional/adapter/hooks/data/seed_run.sql new file mode 100644 index 000000000..b39ba4e2d --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/hooks/data/seed_run.sql @@ -0,0 +1,17 @@ + +drop table if exists {schema}.on_run_hook; + +create table {schema}.on_run_hook ( + test_state STRING, -- start|end + target_dbname STRING, + target_host STRING, + target_name STRING, + target_schema STRING, + target_type STRING, + target_user STRING, + target_pass STRING, + target_threads INTEGER, + run_started_at STRING, + invocation_id STRING, + thread_id STRING +); diff --git a/dbt-bigquery/tests/functional/adapter/hooks/test_model_hooks.py b/dbt-bigquery/tests/functional/adapter/hooks/test_model_hooks.py new file mode 100644 index 000000000..0d0d46c6b --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/hooks/test_model_hooks.py @@ -0,0 +1,36 @@ +from dbt.tests.adapter.hooks import test_model_hooks as core_base +import pytest + + +class TestBigQueryPrePostModelHooks(core_base.TestPrePostModelHooks): + def check_hooks(self, state, project, host, count=1): + self.get_ctx_vars(state, count=count, project=project) + + +class TestBigQueryPrePostModelHooksUnderscores(core_base.TestPrePostModelHooksUnderscores): + def check_hooks(self, state, project, host, count=1): + self.get_ctx_vars(state, count=count, project=project) + + +class TestBigQueryHookRefs(core_base.TestHookRefs): + def check_hooks(self, state, project, host, count=1): + self.get_ctx_vars(state, count=count, project=project) + + +class TestBigQueryPrePostModelHooksOnSeeds(core_base.TestPrePostModelHooksOnSeeds): + def check_hooks(self, state, project, host, count=1): + self.get_ctx_vars(state, count=count, project=project) + + 
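+    # The check_hooks overrides in this module ignore the `host` argument used by the
+    # core hook tests, since BigQuery targets are not addressed by host; only the hook
+    # context variables are verified. The project_config_update fixture below declares
+    # the seed post-hooks (add a column, then update it) and enables column quoting.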
@pytest.fixture(scope="class") + def project_config_update(self): + return { + "seed-paths": ["seeds"], + "models": {}, + "seeds": { + "+post-hook": [ + "alter table {{ this }} add column new_col int", + "update {{ this }} set new_col = 1 where 1=1", + ], + "quote_columns": True, + }, + } diff --git a/dbt-bigquery/tests/functional/adapter/hooks/test_run_hooks.py b/dbt-bigquery/tests/functional/adapter/hooks/test_run_hooks.py new file mode 100644 index 000000000..23cb4ebde --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/hooks/test_run_hooks.py @@ -0,0 +1,38 @@ +import pytest +from dbt.tests.adapter.hooks import test_run_hooks as core_base + + +class TestPrePostRunHooksBigQuery(core_base.TestPrePostRunHooks): + def check_hooks(self, state, project, host): + self.get_ctx_vars(state, project) + + @pytest.fixture(scope="class") + def project_config_update(self): + return { + # The create and drop table statements here validate that these hooks run + # in the same order that they are defined. Drop before create is an error. + # Also check that the table does not exist below. + "on-run-start": [ + "{{ custom_run_hook('start', target, run_started_at, invocation_id) }}", + "create table {{ target.schema }}.start_hook_order_test ( id int )", + "drop table {{ target.schema }}.start_hook_order_test", + "{{ log(env_var('TERM_TEST'), info=True) }}", + ], + "on-run-end": [ + "{{ custom_run_hook('end', target, run_started_at, invocation_id) }}", + "create table {{ target.schema }}.end_hook_order_test ( id int )", + "drop table {{ target.schema }}.end_hook_order_test", + "create table {{ target.schema }}.schemas ( schema string )", + "insert into {{ target.schema }}.schemas (schema) values {% for schema in schemas %}( '{{ schema }}' ){% if not loop.last %},{% endif %}{% endfor %}", + "create table {{ target.schema }}.db_schemas ( db string, schema string )", + "insert into {{ target.schema }}.db_schemas (db, schema) values {% for db, schema in database_schemas %}('{{ db }}', '{{ schema }}' ){% if not loop.last %},{% endif %}{% endfor %}", + ], + "seeds": { + "quote_columns": False, + }, + } + + +class TestAfterRunHooksBigQuery(core_base.TestAfterRunHooks): + def check_hooks(self, state, project, host): + self.get_ctx_vars(state, project) diff --git a/dbt-bigquery/tests/functional/adapter/incremental/incremental_strategy_fixtures.py b/dbt-bigquery/tests/functional/adapter/incremental/incremental_strategy_fixtures.py new file mode 100644 index 000000000..21d5f15b6 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/incremental/incremental_strategy_fixtures.py @@ -0,0 +1,649 @@ +merge_range_sql = """ +{{ + config( + materialized="incremental", + unique_key="id", + cluster_by="id", + partition_by={ + "field": "id", + "data_type": "int64", + "range": { + "start": 1, + "end": 10, + "interval": 1 + } + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-01' as datetime) as date_time + + {% else %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-02' as datetime) as date_time union all + select 5 as id, cast('2020-01-02' as datetime) as 
date_time union all + select 6 as id, cast('2020-01-02' as datetime) as date_time + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where id >= (select max(id) from {{ this }}) +{% endif %} +""".lstrip() + +merge_time_sql = """ +{{ + config( + materialized="incremental", + unique_key="id", + cluster_by="id", + partition_by={ + "field": "date_time", + "data_type": "dateTime" + } + ) +}} + + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-01' as datetime) as date_time + + {% else %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-02' as datetime) as date_time union all + select 5 as id, cast('2020-01-02' as datetime) as date_time union all + select 6 as id, cast('2020-01-02' as datetime) as date_time + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_time > (select max(date_time) from {{ this }}) +{% endif %} +""".lstrip() + +merge_time_with_require_partition_sql = """ +{{ + config( + materialized="incremental", + unique_key="id", + cluster_by="id", + partition_by={ + "field": "date_time", + "data_type": "dateTime" + }, + post_hook=" + create or replace view `{{ schema }}.incremental_merge_time_with_require_partition_view` + as select * from {{ this }} where date_time is null or date_time is not null + ", + require_partition_filter=true + ) +}} + + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-01' as datetime) as date_time + + {% else %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-02' as datetime) as date_time union all + select 5 as id, cast('2020-01-02' as datetime) as date_time union all + select 6 as id, cast('2020-01-02' as datetime) as date_time + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_time > ( + select max(date_time) + from {{ this }} + where ( + date_time is null + or date_time is not null + ) +) + +{% endif %} +""".lstrip() + +overwrite_date_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partition_by={ + "field": "date_day", + "data_type": "date" + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as date) as date_day union all + select 2 as id, cast('2020-01-01' as date) as date_day union all + select 3 as id, cast('2020-01-01' as date) as date_day union all + select 4 as id, cast('2020-01-01' as date) as date_day + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 partition + -- with the 2 records below, but add two more in the 2020-01-02 partition + select 10 as id, cast('2020-01-01' as date) as date_day union all + select 20 as id, 
cast('2020-01-01' as date) as date_day union all + select 30 as id, cast('2020-01-02' as date) as date_day union all + select 40 as id, cast('2020-01-02' as date) as date_day + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_day >= _dbt_max_partition +{% endif %} +""".lstrip() + +overwrite_day_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partition_by={ + "field": "date_time", + "data_type": "datetime" + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-01' as datetime) as date_time + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 partition + -- with the 2 records below, but add two more in the 2020-01-02 partition + select 10 as id, cast('2020-01-01' as datetime) as date_time union all + select 20 as id, cast('2020-01-01' as datetime) as date_time union all + select 30 as id, cast('2020-01-02' as datetime) as date_time union all + select 40 as id, cast('2020-01-02' as datetime) as date_time + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_time >= _dbt_max_partition +{% endif %} +""".lstrip() + +overwrite_day_with_copy_partitions_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partition_by={ + "field": "date_time", + "data_type": "datetime", + "copy_partitions": true + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-01' as datetime) as date_time + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 partition + -- with the 2 records below, but add two more in the 2020-01-02 partition + select 10 as id, cast('2020-01-01' as datetime) as date_time union all + select 20 as id, cast('2020-01-01' as datetime) as date_time union all + select 30 as id, cast('2020-01-02' as datetime) as date_time union all + select 40 as id, cast('2020-01-02' as datetime) as date_time + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_time >= _dbt_max_partition +{% endif %} +""".lstrip() + +overwrite_day_with_time_partition_datetime_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partition_by={ + "field": "date_day", + "data_type": "date", + "time_ingestion_partitioning": true + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as date) as date_day union all + select 2 as id, cast('2020-01-01' as date) as date_day union all + select 3 as id, cast('2020-01-01' as date) as date_day union all + select 4 as id, cast('2020-01-01' as date) as date_day + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 partition + -- with the 2 records below, but add two more in the 2020-01-02 partition + select 10 as id, cast('2020-01-01' as date) as date_day union all + select 20 as id, cast('2020-01-01' as date) as date_day union all + select 30 as id, cast('2020-01-02' as date) as 
date_day union all + select 40 as id, cast('2020-01-02' as date) as date_day + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_day >= '2020-01-01' +{% endif %} +""".lstrip() + +overwrite_partitions_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partitions=["'2020-01-01'","'2020-01-02'"], + partition_by={ + "field": "date_day", + "data_type": "date" + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as date) as date_day union all + select 2 as id, cast('2020-01-01' as date) as date_day union all + select 3 as id, cast('2020-01-01' as date) as date_day union all + select 4 as id, cast('2020-01-01' as date) as date_day + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 partition + -- with the 2 records below, but add two more in the 2020-01-02 partition + select 10 as id, cast('2020-01-01' as date) as date_day union all + select 20 as id, cast('2020-01-01' as date) as date_day union all + select 30 as id, cast('2020-01-02' as date) as date_day union all + select 40 as id, cast('2020-01-02' as date) as date_day + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_day in ({{ config.get("partitions") | join(",") }}) +{% endif %} +-- Test comment to prevent recurrence of https://github.com/dbt-labs/dbt-bigquery/issues/896 +""".lstrip() + +overwrite_range_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partition_by={ + "field": "date_int", + "data_type": "int64", + "range": { + "start": 20200101, + "end": 20200110, + "interval": 1 + } + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, 20200101 as date_int union all + select 2 as id, 20200101 as date_int union all + select 3 as id, 20200101 as date_int union all + select 4 as id, 20200101 as date_int + + {% else %} + + -- we want to overwrite the 4 records in the 20200101 partition + -- with the 2 records below, but add two more in the 20200102 partition + select 10 as id, 20200101 as date_int union all + select 20 as id, 20200101 as date_int union all + select 30 as id, 20200102 as date_int union all + select 40 as id, 20200102 as date_int + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_int >= _dbt_max_partition +{% endif %} +""".lstrip() + +overwrite_time_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partition_by={ + "field": "date_hour", + "data_type": "datetime", + "granularity": "hour" + } + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 2 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 3 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 4 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 01:00:00 partition + -- with the 2 records below, but add two more in the 2020-01-00 02:00:00 partition + select 10 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 20 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 30 as id, cast('2020-01-01 02:00:00' as datetime) as date_hour union all + select 40 as id, cast('2020-01-01 02:00:00' as 
datetime) as date_hour + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_hour >= _dbt_max_partition +{% endif %} +""".lstrip() + +overwrite_day_with_time_ingestion_sql = """ +{{ + config( + materialized="incremental", + incremental_strategy='insert_overwrite', + cluster_by="id", + partition_by={ + "field": "date_time", + "data_type": "datetime", + "time_ingestion_partitioning": true + }, + require_partition_filter=true + ) +}} + + +{%- call set_sql_header(config) %} + CREATE TEMP FUNCTION asDateTime(date STRING) AS ( + cast(date as datetime) + ); +{%- endcall %} + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, asDateTime('2020-01-01') as date_time union all + select 2 as id, asDateTime('2020-01-01') as date_time union all + select 3 as id, asDateTime('2020-01-01') as date_time union all + select 4 as id, asDateTime('2020-01-01') as date_time + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 partition + -- with the 2 records below, but add two more in the 2020-01-02 partition + select 10 as id, asDateTime('2020-01-01') as date_time union all + select 20 as id, asDateTime('2020-01-01') as date_time union all + select 30 as id, cast('2020-01-02' as datetime) as date_time union all + select 40 as id, cast('2020-01-02' as datetime) as date_time + + {% endif %} + +) + +select * from data + +{% if is_incremental() %} +where date_time > '2020-01-01' +{% endif %} +""".lstrip() + +overwrite_static_day_sql = """ +{% set partitions_to_replace = [ + "'2020-01-01'", + "'2020-01-02'", +] %} + +{{ + config( + materialized="incremental", + incremental_strategy="insert_overwrite", + cluster_by="id", + partition_by={ + "field": "date_time", + "data_type": "datetime", + "granularity": "day" + }, + partitions=partitions_to_replace, + on_schema_change="sync_all_columns" + ) +}} + + +with data as ( + + {% if not is_incremental() %} + + select 1 as id, cast('2020-01-01' as datetime) as date_time union all + select 2 as id, cast('2020-01-01' as datetime) as date_time union all + select 3 as id, cast('2020-01-01' as datetime) as date_time union all + select 4 as id, cast('2020-01-01' as datetime) as date_time + + {% else %} + + -- we want to overwrite the 4 records in the 2020-01-01 partition + -- with the 2 records below, but add two more in the 2020-01-02 partition + select 10 as id, cast('2020-01-01' as datetime) as date_time union all + select 20 as id, cast('2020-01-01' as datetime) as date_time union all + select 30 as id, cast('2020-01-02' as datetime) as date_time union all + select 40 as id, cast('2020-01-02' as datetime) as date_time + + {% endif %} + +) + +select * from data +""".lstrip() + +microbatch_model_no_unique_id_sql = """ +{{ config( + materialized='incremental', + incremental_strategy='microbatch', + partition_by={ + 'field': 'event_time', + 'data_type': 'timestamp', + 'granularity': 'day' + }, + event_time='event_time', + batch_size='day', + begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0) + ) +}} +select id, cast(event_time as timestamp) as event_time from {{ ref('input_model') }} +""" + +microbatch_input_sql = """ +{{ config(materialized='table', event_time='event_time') }} +select 1 as id, TIMESTAMP '2020-01-01 00:00:00-0' as event_time +union all +select 2 as id, TIMESTAMP '2020-01-02 00:00:00-0' as event_time +union all +select 3 as id, TIMESTAMP '2020-01-03 00:00:00-0' as event_time +""" + +microbatch_input_event_time_date_sql = """ +{{ config(materialized='table', event_time='event_time') }} +select 1 
as id, DATE '2020-01-01' as event_time +union all +select 2 as id, DATE '2020-01-02' as event_time +union all +select 3 as id, DATE '2020-01-03' as event_time +""" + +microbatch_input_event_time_datetime_sql = """ +{{ config(materialized='table', event_time='event_time') }} +select 1 as id, DATETIME '2020-01-01' as event_time +union all +select 2 as id, DATETIME '2020-01-02' as event_time +union all +select 3 as id, DATETIME '2020-01-03' as event_time +""" + +microbatch_model_no_partition_by_sql = """ +{{ config( + materialized='incremental', + incremental_strategy='microbatch', + event_time='event_time', + batch_size='day', + begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0) + ) +}} +select * from {{ ref('input_model') }} +""" + + +microbatch_model_invalid_partition_by_sql = """ +{{ config( + materialized='incremental', + incremental_strategy='microbatch', + event_time='event_time', + batch_size='day', + begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0), + partition_by={ + 'field': 'event_time', + 'data_type': 'timestamp', + 'granularity': 'hour' + } + ) +}} +select * from {{ ref('input_model') }} +""" + +microbatch_model_no_unique_id_copy_partitions_sql = """ +{{ config( + materialized='incremental', + incremental_strategy='microbatch', + partition_by={ + 'field': 'event_time', + 'data_type': 'timestamp', + 'granularity': 'day', + 'copy_partitions': true + }, + event_time='event_time', + batch_size='day', + begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0) + ) +}} +select * from {{ ref('input_model') }} +""" diff --git a/dbt-bigquery/tests/functional/adapter/incremental/seeds.py b/dbt-bigquery/tests/functional/adapter/incremental/seeds.py new file mode 100644 index 000000000..9198bd244 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/incremental/seeds.py @@ -0,0 +1,57 @@ +seed_data_csv = """ +id,dupe +1,a +2,a +3,a +4,a +""".lstrip() + +seed_incremental_overwrite_date_expected_csv = """ +id,date_day +10,2020-01-01 +20,2020-01-01 +30,2020-01-02 +40,2020-01-02 +""".lstrip() + +seed_incremental_overwrite_day_expected_csv = """ +id,date_time +10,2020-01-01 00:00:00 +20,2020-01-01 00:00:00 +30,2020-01-02 00:00:00 +40,2020-01-02 00:00:00 +""".lstrip() + +seed_incremental_overwrite_range_expected_csv = """ +id,date_int +10,20200101 +20,20200101 +30,20200102 +40,20200102 +""".lstrip() + +seed_incremental_overwrite_time_expected_csv = """ +id,date_hour +10,2020-01-01 01:00:00 +20,2020-01-01 01:00:00 +30,2020-01-01 02:00:00 +40,2020-01-01 02:00:00 +""".lstrip() + +seed_merge_expected_csv = """ +id,date_time +1,2020-01-01 00:00:00 +2,2020-01-01 00:00:00 +3,2020-01-01 00:00:00 +4,2020-01-02 00:00:00 +5,2020-01-02 00:00:00 +6,2020-01-02 00:00:00 +""".lstrip() + +seed_incremental_overwrite_day_with_time_partition_expected_csv = """ +id +10 +20 +30 +40 +""".lstrip() diff --git a/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_merge_exclude_columns.py b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_merge_exclude_columns.py new file mode 100644 index 000000000..022ebca07 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_merge_exclude_columns.py @@ -0,0 +1,7 @@ +from dbt.tests.adapter.incremental.test_incremental_merge_exclude_columns import ( + BaseMergeExcludeColumns, +) + + +class TestMergeExcludeColumns(BaseMergeExcludeColumns): + pass diff --git a/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_microbatch.py 
b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_microbatch.py
new file mode 100644
index 000000000..912f96eec
--- /dev/null
+++ b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_microbatch.py
@@ -0,0 +1,88 @@
+import os
+import pytest
+from unittest import mock
+
+from dbt.tests.util import run_dbt_and_capture
+from dbt.tests.adapter.incremental.test_incremental_microbatch import (
+    BaseMicrobatch,
+    patch_microbatch_end_time,
+)
+
+from tests.functional.adapter.incremental.incremental_strategy_fixtures import (
+    microbatch_model_no_unique_id_sql,
+    microbatch_input_sql,
+    microbatch_model_no_partition_by_sql,
+    microbatch_model_invalid_partition_by_sql,
+    microbatch_model_no_unique_id_copy_partitions_sql,
+    microbatch_input_event_time_date_sql,
+    microbatch_input_event_time_datetime_sql,
+)
+
+
+class TestBigQueryMicrobatch(BaseMicrobatch):
+    @pytest.fixture(scope="class")
+    def microbatch_model_sql(self) -> str:
+        return microbatch_model_no_unique_id_sql
+
+
+class TestBigQueryMicrobatchInputWithDate(TestBigQueryMicrobatch):
+    @pytest.fixture(scope="class")
+    def input_model_sql(self) -> str:
+        return microbatch_input_event_time_date_sql
+
+    @pytest.fixture(scope="class")
+    def insert_two_rows_sql(self, project) -> str:
+        test_schema_relation = project.adapter.Relation.create(
+            database=project.database, schema=project.test_schema
+        )
+        return f"insert into {test_schema_relation}.input_model (id, event_time) values (4, DATE '2020-01-04'), (5, DATE '2020-01-05')"
+
+
+class TestBigQueryMicrobatchInputWithDatetime(TestBigQueryMicrobatch):
+    @pytest.fixture(scope="class")
+    def input_model_sql(self) -> str:
+        return microbatch_input_event_time_datetime_sql
+
+    @pytest.fixture(scope="class")
+    def insert_two_rows_sql(self, project) -> str:
+        test_schema_relation = project.adapter.Relation.create(
+            database=project.database, schema=project.test_schema
+        )
+        return f"insert into {test_schema_relation}.input_model (id, event_time) values (4, DATETIME '2020-01-04'), (5, DATETIME '2020-01-05')"
+
+
+class TestBigQueryMicrobatchMissingPartitionBy:
+    @pytest.fixture(scope="class")
+    def models(self) -> str:
+        return {
+            "microbatch.sql": microbatch_model_no_partition_by_sql,
+            "input_model.sql": microbatch_input_sql,
+        }
+
+    def test_execution_failure_no_partition_by(self, project):
+        with patch_microbatch_end_time("2020-01-03 13:57:00"):
+            _, stdout = run_dbt_and_capture(["run"], expect_pass=False)
+        assert "The 'microbatch' strategy requires a `partition_by` config" in stdout
+
+
+class TestBigQueryMicrobatchInvalidPartitionByGranularity:
+    @pytest.fixture(scope="class")
+    def models(self) -> str:
+        return {
+            "microbatch.sql": microbatch_model_invalid_partition_by_sql,
+            "input_model.sql": microbatch_input_sql,
+        }
+
+    def test_execution_failure_invalid_partition_by_granularity(self, project):
+        with patch_microbatch_end_time("2020-01-03 13:57:00"):
+            _, stdout = run_dbt_and_capture(["run"], expect_pass=False)
+        assert (
+            "The 'microbatch' strategy requires a `partition_by` config with the same granularity as its configured `batch_size`"
+            in stdout
+        )
+
+
+class TestBigQueryMicrobatchWithCopyPartitions(BaseMicrobatch):
+    @pytest.fixture(scope="class")
+    def microbatch_model_sql(self) -> str:
+        return microbatch_model_no_unique_id_copy_partitions_sql
diff --git a/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_on_schema_change.py b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_on_schema_change.py
new file mode 100644
index 000000000..65e855241
--- /dev/null
+++ b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_on_schema_change.py
@@ -0,0 +1,253 @@
+import pytest
+
+from dbt.tests.adapter.incremental.test_incremental_on_schema_change import (
+    BaseIncrementalOnSchemaChangeSetup,
+    BaseIncrementalOnSchemaChange,
+)
+
+from dbt.tests.adapter.incremental.fixtures import (
+    _MODELS__A,
+    _MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_TARGET,
+)
+
+
+class TestIncrementalOnSchemaChange(BaseIncrementalOnSchemaChange):
+    pass
+
+
+_MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_DYNAMIC_INSERT_OVERWRITE = """
+{{
+    config(
+        materialized='incremental',
+        unique_key='id',
+        on_schema_change='sync_all_columns',
+        partition_by={
+            "field": "id",
+            "data_type": "int64",
+            "range": {
+                "start": 1,
+                "end": 6,
+                "interval": 1
+            }
+        },
+        incremental_strategy='insert_overwrite'
+    )
+}}
+
+WITH source_data AS (SELECT * FROM {{ ref('model_a') }} )
+
+{% set string_type = 'string' %}
+
+{% if is_incremental() %}
+
+SELECT id,
+    cast(field1 as {{string_type}}) as field1,
+    cast(field3 as {{string_type}}) as field3, -- to validate new fields
+    cast(field4 as {{string_type}}) AS field4 -- to validate new fields
+
+FROM source_data WHERE id > _dbt_max_partition
+
+{% else %}
+
+select id,
+    cast(field1 as {{string_type}}) as field1,
+    cast(field2 as {{string_type}}) as field2
+
+from source_data where id <= 3
+
+{% endif %}
+"""
+
+_MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_TIME_INGESTION_PARTITIONING = """
+
+{{
+    config(
+        materialized="incremental",
+        on_schema_change='sync_all_columns',
+        incremental_strategy='insert_overwrite',
+        partition_by={
+            "field": "date_hour",
+            "data_type": "datetime",
+            "granularity": "hour",
+            "time_ingestion_partitioning": true
+        }
+    )
+}}
+
+
+with data as (
+
+    {% if not is_incremental() %}
+
+    select 1 as id,
+        cast('2020-01-01 01:00:00' as datetime) as date_hour,
+        1 as field_1,
+        2 as field_2,
+        STRUCT(1 as `group`, 2 as `WHERE`, 3 as group_2, 4 as WHERE_TO) as field_struct union all
+    select 2 as id,
+        cast('2020-01-01 01:00:00' as datetime) as date_hour,
+        1 as field_1,
+        2 as field_2,
+        STRUCT(1 as `group`, 2 as `WHERE`, 3 as group_2, 4 as WHERE_TO) union all
+    select 3 as id,
+        cast('2020-01-01 01:00:00' as datetime) as date_hour,
+        1 as field_1,
+        2 as field_2,
+        STRUCT(2 as `group`, 2 as `WHERE`, 3 as group_2, 4 as WHERE_TO) union all
+    select 4 as id,
+        cast('2020-01-01 01:00:00' as datetime) as date_hour,
+        1 as field_1,
+        2 as field_2,
+        STRUCT(2 as `group`, 2 as `WHERE`, 3 as group_2, 4 as WHERE_TO)
+
+    {% else %}
+
+    -- we want to overwrite the 4 records in the 2020-01-01 01:00:00 partition
+    -- with the 2 records below, but add two more in the 2020-01-01 02:00:00 partition
+    select 10 as id,
+        cast('2020-01-01 01:00:00' as datetime) as date_hour,
+        3 as field_3,
+        2 as field_2 union all
+    select 20 as id,
+        cast('2020-01-01 01:00:00' as datetime) as date_hour,
+        3 as field_3,
+        2 as field_2 union all
+    select 30 as id,
+        cast('2020-01-01 02:00:00' as datetime) as date_hour,
+        3 as field_3,
+        2 as field_2 union all
+    select 40 as id,
+        cast('2020-01-01 02:00:00' as datetime) as date_hour,
+        3 as field_3,
+        2 as field_2
+
+    {% endif %}
+
+)
+
+select * from data
+"""
+
+_MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_TIME_INGESTION_PARTITIONING_TARGET = """
+{{
+    config(
+        materialized="incremental",
+        on_schema_change='sync_all_columns',
+        partition_by={
+            "field": "date_hour",
+            "data_type": "datetime",
+            "granularity": "hour",
+            "time_ingestion_partitioning": true
+        }
+    )
+}}
+
+{% if not
is_incremental() %} + + select 10 as id, + cast('2020-01-01 01:00:00' as datetime) as date_hour, + 3 as field_3, + 2 as field_2 + union all + select 30 as id, + cast('2020-01-01 02:00:00' as datetime) as date_hour, + 3 as field_3, + 2 as field_2 + +{% else %} + + select 20 as id, + cast('2020-01-01 01:00:00' as datetime) as date_hour, + 3 as field_3, + 2 as field_2 + union all + select 40 as id, + cast('2020-01-01 02:00:00' as datetime) as date_hour, + 3 as field_3, + 2 as field_2 + +{% endif %} +""" + +_MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_DYNAMIC_INSERT_OVERWRITE_COPY_PARTITIONS = """ +{{ + config( + materialized='incremental', + unique_key='id', + on_schema_change='sync_all_columns', + partition_by={ + "field": "id", + "data_type": "int64", + "range": { + "start": 1, + "end": 7, + "interval": 1 + }, + "copy_partitions": true + }, + incremental_strategy='insert_overwrite' + ) +}} + +WITH source_data AS (SELECT * FROM {{ ref('model_a') }} ) + +{% set string_type = 'string' %} + +{% if is_incremental() %} + +SELECT id, + cast(field1 as {{string_type}}) as field1, + cast(field3 as {{string_type}}) as field3, -- to validate new fields + cast(field4 as {{string_type}}) AS field4 -- to validate new fields + +FROM source_data WHERE id > _dbt_max_partition + +{% else %} + +select id, + cast(field1 as {{string_type}}) as field1, + cast(field2 as {{string_type}}) as field2 + +from source_data where id <= 3 + +{% endif %} +""" + + +class TestIncrementalOnSchemaChangeBigQuerySpecific(BaseIncrementalOnSchemaChangeSetup): + @pytest.fixture(scope="class") + def models(self): + return { + "model_a.sql": _MODELS__A, + "incremental_sync_all_columns_dynamic_insert_overwrite.sql": _MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_DYNAMIC_INSERT_OVERWRITE, + "incremental_sync_all_columns_dynamic_insert_overwrite_copy_partitions.sql": _MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_DYNAMIC_INSERT_OVERWRITE_COPY_PARTITIONS, + "incremental_sync_all_columns_target.sql": _MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_TARGET, + "incremental_sync_all_columns_time_ingestion_partitioning.sql": _MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_TIME_INGESTION_PARTITIONING, + "incremental_sync_all_columns_time_ingestion_partitioning_target.sql": _MODELS__INCREMENTAL_SYNC_ALL_COLUMNS_TIME_INGESTION_PARTITIONING_TARGET, + } + + def test_run_incremental_sync_all_columns_dynamic_insert_overwrite(self, project): + select = "model_a incremental_sync_all_columns_dynamic_insert_overwrite incremental_sync_all_columns_target" + compare_source = "incremental_sync_all_columns_dynamic_insert_overwrite" + compare_target = "incremental_sync_all_columns_target" + self.run_twice_and_assert(select, compare_source, compare_target, project) + + def test_run_incremental_sync_all_columns_dynamic_insert_overwrite_copy_partitions( + self, project + ): + select = ( + "model_a incremental_sync_all_columns_dynamic_insert_overwrite_copy_partitions " + "incremental_sync_all_columns_target" + ) + compare_source = "incremental_sync_all_columns_dynamic_insert_overwrite_copy_partitions" + compare_target = "incremental_sync_all_columns_target" + self.run_twice_and_assert(select, compare_source, compare_target, project) + + def test_run_incremental_sync_all_columns_time_ingestion_partitioning(self, project): + select = ( + "model_a incremental_sync_all_columns_time_ingestion_partitioning " + "incremental_sync_all_columns_time_ingestion_partitioning_target" + ) + compare_source = "incremental_sync_all_columns_time_ingestion_partitioning" + compare_target = 
"incremental_sync_all_columns_time_ingestion_partitioning_target" + self.run_twice_and_assert(select, compare_source, compare_target, project) diff --git a/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_predicates.py b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_predicates.py new file mode 100644 index 000000000..b3cbed8a8 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_predicates.py @@ -0,0 +1,24 @@ +import pytest +from dbt.tests.adapter.incremental.test_incremental_predicates import BaseIncrementalPredicates + + +class TestIncrementalPredicatesMergeBigQuery(BaseIncrementalPredicates): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "models": { + "+incremental_predicates": ["dbt_internal_dest.id != 2"], + "+incremental_strategy": "merge", + } + } + + +class TestPredicatesMergeBigQuery(BaseIncrementalPredicates): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "models": { + "+predicates": ["dbt_internal_dest.id != 2"], + "+incremental_strategy": "merge", + } + } diff --git a/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_strategies.py b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_strategies.py new file mode 100644 index 000000000..1a339d601 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_strategies.py @@ -0,0 +1,109 @@ +import pytest +from dbt.tests.util import ( + check_relations_equal, + get_relation_columns, + run_dbt, +) +from dbt.tests.adapter.simple_seed.test_seed import SeedConfigBase +from tests.functional.adapter.incremental.seeds import ( + seed_data_csv, + seed_incremental_overwrite_date_expected_csv, + seed_incremental_overwrite_day_expected_csv, + seed_incremental_overwrite_range_expected_csv, + seed_incremental_overwrite_time_expected_csv, + seed_merge_expected_csv, + seed_incremental_overwrite_day_with_time_partition_expected_csv, +) +from tests.functional.adapter.incremental.incremental_strategy_fixtures import ( + merge_range_sql, + merge_time_sql, + merge_time_with_require_partition_sql, + overwrite_date_sql, + overwrite_day_sql, + overwrite_day_with_copy_partitions_sql, + overwrite_partitions_sql, + overwrite_range_sql, + overwrite_time_sql, + overwrite_day_with_time_ingestion_sql, + overwrite_day_with_time_partition_datetime_sql, + overwrite_static_day_sql, +) + + +class TestBigQueryScripting(SeedConfigBase): + @pytest.fixture(scope="class") + def schema(self): + return "bigquery_test" + + @pytest.fixture(scope="class") + def models(self): + return { + "incremental_merge_range.sql": merge_range_sql, + "incremental_merge_time.sql": merge_time_sql, + "incremental_merge_time_with_require_partition.sql": merge_time_with_require_partition_sql, + "incremental_overwrite_date.sql": overwrite_date_sql, + "incremental_overwrite_day.sql": overwrite_day_sql, + "incremental_overwrite_day_with_copy_partitions.sql": overwrite_day_with_copy_partitions_sql, + "incremental_overwrite_partitions.sql": overwrite_partitions_sql, + "incremental_overwrite_range.sql": overwrite_range_sql, + "incremental_overwrite_time.sql": overwrite_time_sql, + "incremental_overwrite_day_with_time_partition.sql": overwrite_day_with_time_ingestion_sql, + "incremental_overwrite_day_with_time_partition_datetime.sql": overwrite_day_with_time_partition_datetime_sql, + "incremental_overwrite_static_day.sql": overwrite_static_day_sql, + } + + @pytest.fixture(scope="class") + def seeds(self): 
+ return { + "data_seed.csv": seed_data_csv, + "merge_expected.csv": seed_merge_expected_csv, + "incremental_overwrite_time_expected.csv": seed_incremental_overwrite_time_expected_csv, + "incremental_overwrite_date_expected.csv": seed_incremental_overwrite_date_expected_csv, + "incremental_overwrite_day_expected.csv": seed_incremental_overwrite_day_expected_csv, + "incremental_overwrite_range_expected.csv": seed_incremental_overwrite_range_expected_csv, + "incremental_overwrite_day_with_time_partition_expected.csv": seed_incremental_overwrite_day_with_time_partition_expected_csv, + } + + def test__bigquery_assert_incremental_configurations_apply_the_right_strategy(self, project): + run_dbt(["seed"]) + results = run_dbt() + assert len(results) == 12 + + results = run_dbt() + assert len(results) == 12 + incremental_strategies = [ + ("incremental_merge_range", "merge_expected"), + ("incremental_merge_time", "merge_expected"), + ("incremental_merge_time_with_require_partition_view", "merge_expected"), + ("incremental_overwrite_time", "incremental_overwrite_time_expected"), + ("incremental_overwrite_date", "incremental_overwrite_date_expected"), + ("incremental_overwrite_partitions", "incremental_overwrite_date_expected"), + ("incremental_overwrite_day", "incremental_overwrite_day_expected"), + ("incremental_overwrite_range", "incremental_overwrite_range_expected"), + ( + "incremental_overwrite_day_with_time_partition_datetime", + "incremental_overwrite_day_with_time_partition_expected", + ), + ("incremental_overwrite_static_day", "incremental_overwrite_day_expected"), + ] + db_with_schema = f"{project.database}.{project.test_schema}" + for incremental_strategy in incremental_strategies: + created_table = f"{db_with_schema}.{incremental_strategy[0]}" + expected_table = f"{db_with_schema}.{incremental_strategy[1]}" + check_relations_equal(project.adapter, [created_table, expected_table]) + + # since this table requires a partition filter which check_relations_equal doesn't support extra where clauses + # we just check column types + created = get_relation_columns( + project.adapter, "incremental_overwrite_day_with_copy_partitions" + ) + expected = get_relation_columns(project.adapter, "incremental_overwrite_day_expected") + assert created == expected + + created = get_relation_columns( + project.adapter, "incremental_overwrite_day_with_time_partition" + ) + expected = get_relation_columns( + project.adapter, "incremental_overwrite_day_with_time_partition_expected" + ) + assert created == expected diff --git a/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_unique_id.py b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_unique_id.py new file mode 100644 index 000000000..a061adfb5 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/incremental/test_incremental_unique_id.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.incremental.test_incremental_unique_id import BaseIncrementalUniqueKey + + +class TestUniqueKeyBigQuery(BaseIncrementalUniqueKey): + pass diff --git a/dbt-bigquery/tests/functional/adapter/materialized_view_tests/_files.py b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/_files.py new file mode 100644 index 000000000..86714036a --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/_files.py @@ -0,0 +1,80 @@ +MY_SEED = """ +id,value,record_valid_date +1,100,2023-01-01 00:00:00 +2,200,2023-01-02 00:00:00 +3,300,2023-01-02 00:00:00 +""".strip() + + +MY_BASE_TABLE = """ +{{ config( + materialized='table', + 
partition_by={ + "field": "record_valid_date", + "data_type": "datetime", + "granularity": "day" + }, + cluster_by=["id", "value"] +) }} +select + id, + value, + record_valid_date +from {{ ref('my_seed') }} +""" + + +# the whitespace to the left on partition matters here +MY_MATERIALIZED_VIEW = """ +{{ config( + materialized='materialized_view', + partition_by={ + "field": "record_valid_date", + "data_type": "datetime", + "granularity": "day" + }, + cluster_by=["id", "value"], + enable_refresh=True, + refresh_interval_minutes=60, + max_staleness="INTERVAL 45 MINUTE" +) }} +select + id, + value, + record_valid_date +from {{ ref('my_base_table') }} +""" + + +# the whitespace to the left on partition matters here +MY_OTHER_BASE_TABLE = """ +{{ config( + materialized='table', + partition_by={ + "field": "value", + "data_type": "int64", + "range": { + "start": 0, + "end": 500, + "interval": 50 + } + }, + cluster_by=["id", "value"] +) }} +select + id, + value, + record_valid_date +from {{ ref('my_seed') }} +""" + + +MY_MINIMAL_MATERIALIZED_VIEW = """ +{{ + config( + materialized = 'materialized_view', + ) +}} + +select * from {{ ref('my_seed') }} +""" diff --git a/dbt-bigquery/tests/functional/adapter/materialized_view_tests/_mixin.py b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/_mixin.py new file mode 100644 index 000000000..4eb98e047 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/_mixin.py @@ -0,0 +1,104 @@ +from typing import Optional, Tuple + +import pytest + +from dbt.adapters.base.relation import BaseRelation +from dbt.adapters.contracts.relation import RelationType +from dbt.tests.adapter.materialized_view.files import MY_TABLE, MY_VIEW +from dbt.tests.util import ( + get_connection, + get_model_file, + run_dbt, + set_model_file, +) + +from tests.functional.adapter.materialized_view_tests import _files + + +class BigQueryMaterializedViewMixin: + @pytest.fixture(scope="class") + def my_base_table(self, project) -> BaseRelation: + """ + The base table for a materialized view needs to be partitioned in + the same way as the materialized view. So if we want to create a partitioned + materialized view, we need to partition the base table. This table is a + select * on the seed table, plus a partition. + """ + return project.adapter.Relation.create( + identifier="my_base_table", + schema=project.test_schema, + database=project.database, + type=RelationType.Table, + ) + + @pytest.fixture(scope="class") + def my_other_base_table(self, project) -> BaseRelation: + """ + Following the sentiment of `my_base_table` above, if we want to alter the partition + on the materialized view, we either need to update the partition on the base table, + or we need a second table with a different partition. 
+ """ + return project.adapter.Relation.create( + identifier="my_other_base_table", + schema=project.test_schema, + database=project.database, + type=RelationType.Table, + ) + + @pytest.fixture(scope="function", autouse=True) + def setup(self, project, my_base_table, my_other_base_table, my_materialized_view): # type: ignore + run_dbt(["seed"]) + run_dbt(["run", "--full-refresh"]) + + # the tests touch these files, store their contents in memory + initial_model = get_model_file(project, my_materialized_view) + + yield + + # and then reset them after the test runs + set_model_file(project, my_materialized_view, initial_model) + project.run_sql(f"drop schema if exists {project.test_schema} cascade") + + @pytest.fixture(scope="class", autouse=True) + def seeds(self): + return {"my_seed.csv": _files.MY_SEED} + + @pytest.fixture(scope="class", autouse=True) + def models(self): + yield { + "my_table.sql": MY_TABLE, + "my_view.sql": MY_VIEW, + "my_base_table.sql": _files.MY_BASE_TABLE, + "my_other_base_table.sql": _files.MY_OTHER_BASE_TABLE, + "my_materialized_view.sql": _files.MY_MATERIALIZED_VIEW, + } + + @staticmethod + def insert_record(project, table: BaseRelation, record: Tuple[int, int]) -> None: + my_id, value = record + project.run_sql(f"insert into {table} (id, value) values ({my_id}, {value})") + + @staticmethod + def refresh_materialized_view(project, materialized_view: BaseRelation) -> None: + sql = f""" + call bq.refresh_materialized_view( + '{materialized_view.database}.{materialized_view.schema}.{materialized_view.identifier}' + ) + """ + project.run_sql(sql) + + @staticmethod + def query_row_count(project, relation: BaseRelation) -> int: + sql = f"select count(*) from {relation}" + return project.run_sql(sql, fetch="one")[0] + + # look into syntax + @staticmethod + def query_relation_type(project, relation: BaseRelation) -> Optional[str]: + with get_connection(project.adapter) as conn: + table = conn.handle.get_table( + project.adapter.connections.get_bq_table( + relation.database, relation.schema, relation.identifier + ) + ) + return table.table_type.lower() diff --git a/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view.py b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view.py new file mode 100644 index 000000000..4e980c2e4 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view.py @@ -0,0 +1,53 @@ +import pytest + +from dbt.tests.util import run_dbt +from dbt.tests.adapter.materialized_view.basic import MaterializedViewBasic + +from tests.functional.adapter.materialized_view_tests._mixin import BigQueryMaterializedViewMixin +from tests.functional.adapter.materialized_view_tests import _files + + +class TestBigqueryMaterializedViewsBasic(BigQueryMaterializedViewMixin, MaterializedViewBasic): + def test_view_replaces_materialized_view(self, project, my_materialized_view): + """ + We don't support replacing a view with another object in dbt-bigquery unless we use --full-refresh + """ + run_dbt(["run", "--models", my_materialized_view.identifier]) + assert self.query_relation_type(project, my_materialized_view) == "materialized_view" + + self.swap_materialized_view_to_view(project, my_materialized_view) + + # add --full-refresh + run_dbt(["run", "--models", my_materialized_view.identifier, "--full-refresh"]) + assert self.query_relation_type(project, my_materialized_view) == "view" + + @pytest.mark.skip( + "It looks like BQ updates the materialized view almost 
immediately, which fails this test." + ) + def test_materialized_view_only_updates_after_refresh( + self, project, my_materialized_view, my_seed + ): + pass + + +class TestMaterializedViewRerun: + """ + This addresses: https://github.com/dbt-labs/dbt-bigquery/issues/1007 + + This effectively tests that defaults get properly set so that the run is idempotent. + If the defaults are not properly set, changes could appear when there are no changes + and cause unexpected scenarios. + """ + + @pytest.fixture(scope="class", autouse=True) + def models(self): + return {"my_minimal_materialized_view.sql": _files.MY_MINIMAL_MATERIALIZED_VIEW} + + @pytest.fixture(scope="class", autouse=True) + def seeds(self): + return {"my_seed.csv": _files.MY_SEED} + + def test_minimal_materialized_view_is_idempotent(self, project): + run_dbt(["seed"]) + run_dbt(["run"]) + run_dbt(["run"]) diff --git a/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_changes.py b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_changes.py new file mode 100644 index 000000000..c821c68fc --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_changes.py @@ -0,0 +1,95 @@ +from dbt.tests.adapter.materialized_view.changes import ( + MaterializedViewChanges, + MaterializedViewChangesApplyMixin, + MaterializedViewChangesContinueMixin, + MaterializedViewChangesFailMixin, +) +from dbt.tests.util import get_connection, get_model_file, set_model_file + +from dbt.adapters.bigquery.relation_configs import BigQueryMaterializedViewConfig + +from tests.functional.adapter.materialized_view_tests._mixin import BigQueryMaterializedViewMixin + + +class BigQueryMaterializedViewChanges(BigQueryMaterializedViewMixin, MaterializedViewChanges): + @staticmethod + def check_start_state(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.options.enable_refresh is True + assert results.options.refresh_interval_minutes == 60 + assert results.partition.field == "record_valid_date" + assert results.partition.data_type == "datetime" + assert results.partition.granularity == "day" + assert results.cluster.fields == frozenset({"id", "value"}) + + @staticmethod + def change_config_via_alter(project, materialized_view): + initial_model = get_model_file(project, materialized_view) + new_model = initial_model.replace("enable_refresh=True", "enable_refresh=False") + set_model_file(project, materialized_view, new_model) + + @staticmethod + def check_state_alter_change_is_applied(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + # these change when run manually + assert results.options.enable_refresh is False + assert results.options.refresh_interval_minutes == 30 # BQ returns it to the default + + @staticmethod + def change_config_via_replace(project, materialized_view): + initial_model = get_model_file(project, materialized_view) + # the whitespace to the left on partition matters here + old_partition = """ + partition_by={ + "field": "record_valid_date", + "data_type": "datetime", + "granularity": "day" + },""" + new_partition = """ + partition_by={ + "field": "value", + "data_type": "int64", + "range": { + "start": 0, + "end": 500, + "interval": 50 + 
} + },""" + new_model = ( + initial_model.replace(old_partition, new_partition) + .replace("'my_base_table'", "'my_other_base_table'") + .replace('cluster_by=["id", "value"]', 'cluster_by="id"') + ) + set_model_file(project, materialized_view, new_model) + + @staticmethod + def check_state_replace_change_is_applied(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.partition.field == "value" + assert results.partition.data_type == "int64" + assert results.partition.range == {"start": 0, "end": 500, "interval": 50} + assert results.cluster.fields == frozenset({"id"}) + + +class TestBigQueryMaterializedViewChangesApply( + BigQueryMaterializedViewChanges, MaterializedViewChangesApplyMixin +): + pass + + +class TestBigQueryMaterializedViewChangesContinue( + BigQueryMaterializedViewChanges, MaterializedViewChangesContinueMixin +): + pass + + +class TestBigQueryMaterializedViewChangesFail( + BigQueryMaterializedViewChanges, MaterializedViewChangesFailMixin +): + pass diff --git a/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_cluster_changes.py b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_cluster_changes.py new file mode 100644 index 000000000..74e174d4f --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_cluster_changes.py @@ -0,0 +1,69 @@ +from dbt.tests.adapter.materialized_view.changes import ( + MaterializedViewChanges, + MaterializedViewChangesApplyMixin, + MaterializedViewChangesContinueMixin, + MaterializedViewChangesFailMixin, +) +from dbt.tests.util import get_connection, get_model_file, set_model_file + +from dbt.adapters.bigquery.relation_configs import BigQueryMaterializedViewConfig + +from tests.functional.adapter.materialized_view_tests._mixin import BigQueryMaterializedViewMixin + + +class BigQueryMaterializedViewClusterChanges( + BigQueryMaterializedViewMixin, MaterializedViewChanges +): + @staticmethod + def check_start_state(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.options.enable_refresh is True + assert results.options.refresh_interval_minutes == 60 + assert results.cluster.fields == frozenset({"id", "value"}) + + @staticmethod + def change_config_via_alter(project, materialized_view): + initial_model = get_model_file(project, materialized_view) + new_model = initial_model.replace("enable_refresh=True", "enable_refresh=False") + set_model_file(project, materialized_view, new_model) + + @staticmethod + def check_state_alter_change_is_applied(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.options.enable_refresh is False + assert results.options.refresh_interval_minutes == 30 # BQ returns it to the default + + @staticmethod + def change_config_via_replace(project, materialized_view): + initial_model = get_model_file(project, materialized_view) + new_model = initial_model.replace('cluster_by=["id", "value"]', 'cluster_by="id"') + set_model_file(project, materialized_view, new_model) + + @staticmethod + def check_state_replace_change_is_applied(project, 
materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.cluster.fields == frozenset({"id"}) + + +class TestBigQueryMaterializedViewClusterChangesApply( + BigQueryMaterializedViewClusterChanges, MaterializedViewChangesApplyMixin +): + pass + + +class TestBigQueryMaterializedViewClusterChangesContinue( + BigQueryMaterializedViewClusterChanges, MaterializedViewChangesContinueMixin +): + pass + + +class TestBigQueryMaterializedViewClusterChangesFail( + BigQueryMaterializedViewClusterChanges, MaterializedViewChangesFailMixin +): + pass diff --git a/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_partition_changes.py b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_partition_changes.py new file mode 100644 index 000000000..7f396ae1b --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/materialized_view_tests/test_materialized_view_partition_changes.py @@ -0,0 +1,93 @@ +from dbt.tests.adapter.materialized_view.changes import ( + MaterializedViewChanges, + MaterializedViewChangesApplyMixin, + MaterializedViewChangesContinueMixin, + MaterializedViewChangesFailMixin, +) +from dbt.tests.util import get_connection, get_model_file, set_model_file + +from dbt.adapters.bigquery.relation_configs import BigQueryMaterializedViewConfig + +from tests.functional.adapter.materialized_view_tests._mixin import BigQueryMaterializedViewMixin + + +class BigQueryMaterializedViewPartitionChanges( + BigQueryMaterializedViewMixin, MaterializedViewChanges +): + @staticmethod + def check_start_state(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.options.enable_refresh is True + assert results.options.refresh_interval_minutes == 60 + assert results.partition.field == "record_valid_date" + assert results.partition.data_type == "datetime" + assert results.partition.granularity == "day" + + @staticmethod + def change_config_via_alter(project, materialized_view): + initial_model = get_model_file(project, materialized_view) + new_model = initial_model.replace("enable_refresh=True", "enable_refresh=False") + set_model_file(project, materialized_view, new_model) + + @staticmethod + def check_state_alter_change_is_applied(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + # these change when run manually + assert results.options.enable_refresh is False + assert results.options.refresh_interval_minutes == 30 # BQ returns it to the default + + @staticmethod + def change_config_via_replace(project, materialized_view): + initial_model = get_model_file(project, materialized_view) + # the whitespace to the left on partition matters here + old_partition = """ + partition_by={ + "field": "record_valid_date", + "data_type": "datetime", + "granularity": "day" + },""" + new_partition = """ + partition_by={ + "field": "value", + "data_type": "int64", + "range": { + "start": 0, + "end": 500, + "interval": 50 + } + },""" + new_model = initial_model.replace(old_partition, new_partition).replace( + "'my_base_table'", "'my_other_base_table'" + ) + set_model_file(project, materialized_view, new_model) + + @staticmethod + 
def check_state_replace_change_is_applied(project, materialized_view): + with get_connection(project.adapter): + results = project.adapter.describe_relation(materialized_view) + assert isinstance(results, BigQueryMaterializedViewConfig) + assert results.partition.field == "value" + assert results.partition.data_type == "int64" + assert results.partition.range == {"start": 0, "end": 500, "interval": 50} + + +class TestBigQueryMaterializedViewPartitionChangesApply( + BigQueryMaterializedViewPartitionChanges, MaterializedViewChangesApplyMixin +): + pass + + +class TestBigQueryMaterializedViewPartitionChangesContinue( + BigQueryMaterializedViewPartitionChanges, MaterializedViewChangesContinueMixin +): + pass + + +class TestBigQueryMaterializedViewPartitionChangesFail( + BigQueryMaterializedViewPartitionChanges, MaterializedViewChangesFailMixin +): + pass diff --git a/dbt-bigquery/tests/functional/adapter/query_comment_test/test_job_label.py b/dbt-bigquery/tests/functional/adapter/query_comment_test/test_job_label.py new file mode 100644 index 000000000..af984a8c4 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/query_comment_test/test_job_label.py @@ -0,0 +1,52 @@ +import pytest + +from google.cloud.bigquery.client import Client + +from dbt.tests.util import run_dbt + + +_MACRO__BQ_LABELS = """ +{% macro bq_labels() %}{ + "system": "{{ env_var('LABEL_SYSTEM', 'my_system') }}", + "env_type": "{{ env_var('LABEL_ENV', 'dev') }}" +}{% endmacro %} +""" +_MODEL__MY_TABLE = """ +{{ config(materialized= "table") }} +select 1 as id +""" + + +class TestQueryCommentJobLabel: + @pytest.fixture(scope="class") + def models(self): + return {"my_table.sql": _MODEL__MY_TABLE} + + @pytest.fixture(scope="class") + def macros(self): + return {"bq_labels.sql": _MACRO__BQ_LABELS} + + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "query-comment": { + "comment": "{{ bq_labels() }}", + "job-label": True, + "append": True, + } + } + + def test_query_comments_displays_as_job_labels(self, project): + """ + Addresses this regression in dbt-bigquery 1.6: + https://github.com/dbt-labs/dbt-bigquery/issues/863 + """ + results = run_dbt(["run"]) + job_id = results.results[0].adapter_response.get("job_id") + with project.adapter.connection_named("_test"): + client: Client = project.adapter.connections.get_thread_connection().handle + job = client.get_job(job_id=job_id) + + # this is what should happen + assert job.labels.get("system") == "my_system" + assert job.labels.get("env_type") == "dev" diff --git a/dbt-bigquery/tests/functional/adapter/query_comment_test/test_query_comment.py b/dbt-bigquery/tests/functional/adapter/query_comment_test/test_query_comment.py new file mode 100644 index 000000000..efa138065 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/query_comment_test/test_query_comment.py @@ -0,0 +1,32 @@ +from dbt.tests.adapter.query_comment.test_query_comment import ( + BaseQueryComments, + BaseMacroQueryComments, + BaseMacroArgsQueryComments, + BaseMacroInvalidQueryComments, + BaseNullQueryComments, + BaseEmptyQueryComments, +) + + +class TestQueryCommentsBigQuery(BaseQueryComments): + pass + + +class TestMacroQueryCommentsBigQuery(BaseMacroQueryComments): + pass + + +class TestMacroArgsQueryCommentsBigQuery(BaseMacroArgsQueryComments): + pass + + +class TestMacroInvalidQueryCommentsBigQuery(BaseMacroInvalidQueryComments): + pass + + +class TestNullQueryCommentsBigQuery(BaseNullQueryComments): + pass + + +class 
TestEmptyQueryCommentsBigQuery(BaseEmptyQueryComments): + pass diff --git a/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/fixtures.py b/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/fixtures.py new file mode 100644 index 000000000..9c13750f4 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/fixtures.py @@ -0,0 +1,368 @@ +# +# Models +# + +clustered_model_sql = """ +{{ + config( + materialized = "table", + partition_by = {"field": "updated_at", "data_type": "date"}, + cluster_by = "dupe", + ) +}} + +select * from {{ ref('view_model') }} +""".lstrip() + +funky_case_sql = """ +select 1 as id +""".lstrip() + +labeled_model_sql = """ +{{ + config( + materialized = "table", + labels = {'town': 'fish', 'analytics': 'yes'} + ) +}} + +select * from {{ ref('view_model') }} +""".lstrip() + +multi_clustered_model_sql = """ +{{ + config( + materialized = "table", + partition_by = {"field": "updated_at", "data_type": "date"}, + cluster_by = ["dupe","id"], + ) +}} + +select * from {{ ref('view_model') }} +""".lstrip() + + +partitioned_model_sql = """ +{{ + config( + materialized = "table", + partition_by = {'field': 'updated_at', 'data_type': 'date'}, + ) +}} + +select * from {{ ref('view_model') }} +""".lstrip() + +schema_yml = """ +version: 2 +models: +- name: view_model + description: | + View model description "with double quotes" + and with 'single quotes' as welll as other; + '''abc123''' + reserved -- characters + -- + /* comment */ + columns: + - name: dupe + tests: + - unique + - name: id + tests: + - not_null + - unique + - name: updated_at + tests: + - not_null + tests: + - was_materialized: + name: view_model + type: view +- name: table_model + description: | + View model description "with double quotes" + and with 'single quotes' as welll as other; + '''abc123''' + reserved -- characters + -- + /* comment */ + columns: + - name: id + tests: + - not_null + tests: + - was_materialized: + name: table_model + type: table +- name: fUnKyCaSe + columns: + - name: id + tests: + - not_null + - unique + tests: + - was_materialized: + name: fUnKyCaSe + type: view + + +sources: + - name: raw + project: "{{ target.database }}" + dataset: "{{ target.schema }}" + tables: + - name: seed + identifier: data_seed +""".lstrip() + +sql_header_model_sql = """ +{{ config(materialized="table") }} + +{# This will fail if it is not extracted correctly #} +{% call set_sql_header(config) %} + CREATE TEMPORARY FUNCTION a_to_b(str STRING) + RETURNS STRING AS ( + CASE + WHEN LOWER(str) = 'a' THEN 'b' + ELSE str + END + ); +{% endcall %} + +select a_to_b(dupe) as dupe from {{ ref('view_model') }} +""".lstrip() + +sql_header_model_incr_sql = """ +{{ config(materialized="incremental") }} + +{# This will fail if it is not extracted correctly #} +{% call set_sql_header(config) %} + DECLARE int_var INT64 DEFAULT 42; + + CREATE TEMPORARY FUNCTION a_to_b(str STRING) + RETURNS STRING AS ( + CASE + WHEN LOWER(str) = 'a' THEN 'b' + ELSE str + END + ); +{% endcall %} + +select a_to_b(dupe) as dupe from {{ ref('view_model') }} +""".lstrip() + +sql_header_model_incr_insert_overwrite_sql = """ +{# + Ensure that the insert overwrite incremental strategy + works correctly when a UDF is used in a sql_header. The + failure mode here is that dbt might inject the UDF header + twice: once for the `create table` and then again for the + merge statement. 
+#}
+
+{{ config(
+    materialized="incremental",
+    incremental_strategy='insert_overwrite',
+    partition_by={"field": "dt", "data_type": "date"}
+) }}
+
+{# This will fail if it is not extracted correctly #}
+{% call set_sql_header(config) %}
+    DECLARE int_var INT64 DEFAULT 42;
+
+    CREATE TEMPORARY FUNCTION a_to_b(str STRING)
+    RETURNS STRING AS (
+        CASE
+            WHEN LOWER(str) = 'a' THEN 'b'
+            ELSE str
+        END
+    );
+{% endcall %}
+
+select
+    current_date() as dt,
+    a_to_b(dupe) as dupe
+
+from {{ ref('view_model') }}
+""".lstrip()
+
+sql_header_model_incr_insert_overwrite_static_sql = """
+{#
+    Ensure that the insert overwrite incremental strategy
+    works correctly when a UDF is used in a sql_header. The
+    failure mode here is that dbt might inject the UDF header
+    twice: once for the `create table` and then again for the
+    merge statement.
+#}
+
+{{ config(
+    materialized="incremental",
+    incremental_strategy='insert_overwrite',
+    partition_by={"field": "dt", "data_type": "date"},
+    partitions=["'2020-01-1'"]
+) }}
+
+{# This will fail if it is not extracted correctly #}
+{% call set_sql_header(config) %}
+    CREATE TEMPORARY FUNCTION a_to_b(str STRING)
+    RETURNS STRING AS (
+        CASE
+            WHEN LOWER(str) = 'a' THEN 'b'
+            ELSE str
+        END
+    );
+{% endcall %}
+
+select
+    cast('2020-01-01' as date) as dt,
+    a_to_b(dupe) as dupe
+
+from {{ ref('view_model') }}
+""".lstrip()
+
+tabel_model_sql = """
+{{
+    config(
+        materialized = "table",
+        persist_docs={ "relation": true, "columns": true, "schema": true }
+    )
+}}
+
+select * from {{ ref('view_model') }}
+""".lstrip()
+
+view_model_sql = """
+{{
+    config(
+        materialized = "view",
+        persist_docs={ "relation": true, "columns": true, "schema": true }
+    )
+}}
+
+
+select
+    id,
+    current_date as updated_at,
+    dupe
+
+from {{ source('raw', 'seed') }}
+""".lstrip()
+
+#
+# Macros
+#
+
+test_creation_sql = """
+{% test was_materialized(model, name, type) %}
+
+    {#-- don't run this query in the parsing step #}
+    {%- if model -%}
+        {%- set table = adapter.get_relation(database=model.database, schema=model.schema,
+                                             identifier=model.name) -%}
+    {%- else -%}
+        {%- set table = {} -%}
+    {%- endif -%}
+
+    {% if table %}
+        select '{{ table.type }} does not match expected value {{ type }}'
+        from (select true)
+        where '{{ table.type }}' != '{{ type }}'
+    {% endif %}
+
+{% endtest %}
+""".lstrip()
+
+test_int_inference_sql = """
+{% macro assert_eq(value, expected, msg) %}
+    {% if value != expected %}
+        {% do exceptions.raise_compiler_error(msg ~ value) %}
+    {% endif %}
+{% endmacro %}
+
+
+{% macro test_int_inference() %}
+
+    {% set sql %}
+        select
+            0 as int_0,
+            1 as int_1,
+            2 as int_2
+    {% endset %}
+
+    {% set result = run_query(sql) %}
+    {% do assert_eq((result | length), 1, 'expected 1 result, got ') %}
+    {% set actual_0 = result[0]['int_0'] %}
+    {% set actual_1 = result[0]['int_1'] %}
+    {% set actual_2 = result[0]['int_2'] %}
+
+    {% do assert_eq(actual_0, 0, 'expected actual_0 to be 0, it was ') %}
+    {% do assert_eq((actual_0 | string), '0', 'expected string form of actual_0 to be 0, it was ') %}
+    {% do assert_eq((actual_0 * 2), 0, 'expected actual_0 * 2 to be 0, it was ') %} {# not 00 #}
+
+    {% do assert_eq(actual_1, 1, 'expected actual_1 to be 1, it was ') %}
+    {% do assert_eq((actual_1 | string), '1', 'expected string form of actual_1 to be 1, it was ') %}
+    {% do assert_eq((actual_1 * 2), 2, 'expected actual_1 * 2 to be 2, it was ') %} {# not 11 #}
+
+    {% do assert_eq(actual_2, 2, 'expected actual_2 to be 2, it was ') %}
+    {% do assert_eq((actual_2 | string), '2', 'expected string form of actual_2 to be 2, it was ') %}
+    {% do assert_eq((actual_2 * 2), 4, 'expected actual_2 * 2 to be 4, it was ') %} {# not 22 #}
+
+{% endmacro %}
+""".lstrip()
+
+test_project_for_job_id_sql = """
+{% test project_for_job_id(model, region, unique_schema_id, project_id) %}
+select 1
+from `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT
+where date(creation_time) = current_date
+    and job_project = {{project_id}}
+    and destination_table.dataset_id = {{unique_schema_id}}
+{% endtest %}
+""".lstrip()
+
+wrapped_macros_sql = """
+{% macro my_create_schema(db_name, schema_name) %}
+    {% if not execute %}
+        {% do return(None) %}
+    {% endif %}
+    {% set relation = api.Relation.create(database=db_name, schema=schema_name).without_identifier() %}
+    {% do create_schema(relation) %}
+{% endmacro %}
+
+{% macro my_drop_schema(db_name, schema_name) %}
+    {% if not execute %}
+        {% do return(None) %}
+    {% endif %}
+    {% set relation = api.Relation.create(database=db_name, schema=schema_name).without_identifier() %}
+    {% do drop_schema(relation) %}
+{% endmacro %}
+
+
+{% macro my_create_table_as(db_name, schema_name, table_name) %}
+    {% if not execute %}
+        {% do return(None) %}
+    {% endif %}
+    {% set relation = api.Relation.create(database=db_name, schema=schema_name, identifier=table_name) %}
+    {% do run_query(create_table_as(false, relation, 'select 1 as id')) %}
+{% endmacro %}
+
+
+{% macro ensure_one_relation_in(db_name, schema_name) %}
+    {% if not execute %}
+        {% do return(None) %}
+    {% endif %}
+    {% set relation = api.Relation.create(database=db_name, schema=schema_name).without_identifier() %}
+    {% set results = list_relations_without_caching(relation) %}
+    {% set rlen = (results | length) %}
+    {% if rlen != 1 %}
+        {% do exceptions.raise_compiler_error('Incorrect number of results (expected 1): ' ~ rlen) %}
+    {% endif %}
+    {% set result = results[0] %}
+    {% set columns = get_columns_in_relation(result) %}
+    {% set clen = (columns | length) %}
+    {% if clen != 1 %}
+        {% do exceptions.raise_compiler_error('Incorrect number of columns (expected 1): ' ~ clen) %}
+    {% endif %}
+{% endmacro %}
+""".lstrip()
diff --git a/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/seeds.py b/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/seeds.py
new file mode 100644
index 000000000..9198bd244
--- /dev/null
+++ b/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/seeds.py
@@ -0,0 +1,57 @@
+seed_data_csv = """
+id,dupe
+1,a
+2,a
+3,a
+4,a
+""".lstrip()
+
+seed_incremental_overwrite_date_expected_csv = """
+id,date_day
+10,2020-01-01
+20,2020-01-01
+30,2020-01-02
+40,2020-01-02
+""".lstrip()
+
+seed_incremental_overwrite_day_expected_csv = """
+id,date_time
+10,2020-01-01 00:00:00
+20,2020-01-01 00:00:00
+30,2020-01-02 00:00:00
+40,2020-01-02 00:00:00
+""".lstrip()
+
+seed_incremental_overwrite_range_expected_csv = """
+id,date_int
+10,20200101
+20,20200101
+30,20200102
+40,20200102
+""".lstrip()
+
+seed_incremental_overwrite_time_expected_csv = """
+id,date_hour
+10,2020-01-01 01:00:00
+20,2020-01-01 01:00:00
+30,2020-01-01 02:00:00
+40,2020-01-01 02:00:00
+""".lstrip()
+
+seed_merge_expected_csv = """
+id,date_time
+1,2020-01-01 00:00:00
+2,2020-01-01 00:00:00
+3,2020-01-01 00:00:00
+4,2020-01-02 00:00:00
+5,2020-01-02 00:00:00
+6,2020-01-02 00:00:00
+""".lstrip()
+
+seed_incremental_overwrite_day_with_time_partition_expected_csv = """
+id
+10
+20
+30
+40
+""".lstrip()
diff --git
a/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/test_simple_bigquery_view.py b/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/test_simple_bigquery_view.py new file mode 100644 index 000000000..6c7bc03e0 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/simple_bigquery_view/test_simple_bigquery_view.py @@ -0,0 +1,116 @@ +import pytest +import random +import time +from dbt.tests.util import run_dbt +from dbt.tests.adapter.simple_seed.test_seed import SeedConfigBase +from tests.functional.adapter.simple_bigquery_view.seeds import ( + seed_data_csv, + seed_merge_expected_csv, + seed_incremental_overwrite_time_expected_csv, + seed_incremental_overwrite_date_expected_csv, + seed_incremental_overwrite_day_expected_csv, + seed_incremental_overwrite_range_expected_csv, +) +from tests.functional.adapter.simple_bigquery_view.fixtures import ( + clustered_model_sql, + funky_case_sql, + labeled_model_sql, + multi_clustered_model_sql, + partitioned_model_sql, + sql_header_model_sql, + sql_header_model_incr_sql, + sql_header_model_incr_insert_overwrite_sql, + sql_header_model_incr_insert_overwrite_static_sql, + tabel_model_sql, + view_model_sql, + schema_yml, + test_creation_sql, + test_int_inference_sql, + test_project_for_job_id_sql, + wrapped_macros_sql, +) + + +class BaseBigQueryRun(SeedConfigBase): + @pytest.fixture(scope="class") + def schema(self): + return "bigquery_test" + + @pytest.fixture(scope="class") + def macros(self): + return { + "test_creation.sql": test_creation_sql, + "test_int_inference.sql": test_int_inference_sql, + "test_project_for_job_id.sql": test_project_for_job_id_sql, + "wrapped_macros.sql": wrapped_macros_sql, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "clustered_model.sql": clustered_model_sql, + "fUnKyCaSe.sql": funky_case_sql, + "labeled_model.sql": labeled_model_sql, + "multi_clustered_model.sql": multi_clustered_model_sql, + "partitioned_model.sql": partitioned_model_sql, + "sql_header_model.sql": sql_header_model_sql, + "sql_header_model_incr.sql": sql_header_model_incr_sql, + "sql_header_model_incr_insert_overwrite.sql": sql_header_model_incr_insert_overwrite_sql, + "sql_header_model_incr_insert_overwrite_static.sql": sql_header_model_incr_insert_overwrite_static_sql, + "table_model.sql": tabel_model_sql, + "view_model.sql": view_model_sql, + "schema.yml": schema_yml, + } + + @pytest.fixture(scope="class") + def seeds(self): + return { + "data_seed.csv": seed_data_csv, + "merge_expected.csv": seed_merge_expected_csv, + "incremental_overwrite_time_expected.csv": seed_incremental_overwrite_time_expected_csv, + "incremental_overwrite_date_expected.csv": seed_incremental_overwrite_date_expected_csv, + "incremental_overwrite_day_expected.csv": seed_incremental_overwrite_day_expected_csv, + "incremental_overwrite_range_expected.csv": seed_incremental_overwrite_range_expected_csv, + } + + def assert_nondupes_pass(self, project): + # The 'dupe' model should fail, but all others should pass + test_results = run_dbt(["test"], expect_pass=False) + + for test_result in test_results: + if "dupe" in test_result.node.name: + assert test_result.status == "fail" + assert not test_result.skipped + assert test_result.failures > 0 + + # assert that actual tests pass + else: + assert test_result.status == "pass" + assert not test_result.skipped + assert test_result.failures == 0 + + +class TestSimpleBigQueryRun(BaseBigQueryRun): + def test__bigquery_simple_run(self, project): + # make sure seed works twice. 
Full-refresh is a no-op + run_dbt(["seed"]) + run_dbt(["seed", "--full-refresh"]) + + results = run_dbt() + # Bump expected number of results when adding new model + assert len(results) == 11 + self.assert_nondupes_pass(project) + + +class TestUnderscoreBigQueryRun(BaseBigQueryRun): + prefix = "_test{}{:04}".format(int(time.time()), random.randint(0, 9999)) + + def test_bigquery_run_twice(self, project): + run_dbt(["seed"]) + results = run_dbt() + assert len(results) == 11 + + results = run_dbt() + assert len(results) == 11 + + self.assert_nondupes_pass(project) diff --git a/dbt-bigquery/tests/functional/adapter/simple_copy/fixtures.py b/dbt-bigquery/tests/functional/adapter/simple_copy/fixtures.py new file mode 100644 index 000000000..d53b368b2 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/simple_copy/fixtures.py @@ -0,0 +1,534 @@ +_SEEDS__SEED_MERGE_COLS_INITIAL = """ +load_date,id,first_name,last_name,email,gender,ip_address +2021-03-05,1,Jack,Hunter,jhunter0@pbs.org,Male,59.80.20.168 +2021-03-05,2,Kathryn,Walker,kwalker1@ezinearticles.com,Female,194.121.179.35 +2021-03-05,3,Gerald,Ryan,gryan2@com.com,Male,11.3.212.243 +2021-03-05,4,Bonnie,Spencer,bspencer3@ameblo.jp,Female,216.32.196.175 +2021-03-05,5,Harold,Taylor,htaylor4@people.com.cn,Male,253.10.246.136 +2021-03-05,6,Jacqueline,Griffin,jgriffin5@t.co,Female,16.13.192.220 +2021-03-05,7,Wanda,Arnold,warnold6@google.nl,Female,232.116.150.64 +2021-03-05,8,Craig,Ortiz,cortiz7@sciencedaily.com,Male,199.126.106.13 +2021-03-05,9,Gary,Day,gday8@nih.gov,Male,35.81.68.186 +2021-03-05,10,Rose,Wright,rwright9@yahoo.co.jp,Female,236.82.178.100 +2021-03-05,11,Raymond,Kelley,rkelleya@fc2.com,Male,213.65.166.67 +2021-03-05,12,Gerald,Robinson,grobinsonb@disqus.com,Male,72.232.194.193 +2021-03-05,13,Mildred,Martinez,mmartinezc@samsung.com,Female,198.29.112.5 +2021-03-05,14,Dennis,Arnold,darnoldd@google.com,Male,86.96.3.250 +2021-03-05,15,Judy,Gray,jgraye@opensource.org,Female,79.218.162.245 +2021-03-05,16,Theresa,Garza,tgarzaf@epa.gov,Female,21.59.100.54 +2021-03-05,17,Gerald,Robertson,grobertsong@csmonitor.com,Male,131.134.82.96 +2021-03-05,18,Philip,Hernandez,phernandezh@adobe.com,Male,254.196.137.72 +2021-03-05,19,Julia,Gonzalez,jgonzalezi@cam.ac.uk,Female,84.240.227.174 +2021-03-05,20,Andrew,Davis,adavisj@patch.com,Male,9.255.67.25 +2021-03-05,21,Kimberly,Harper,kharperk@foxnews.com,Female,198.208.120.253 +2021-03-05,22,Mark,Martin,mmartinl@marketwatch.com,Male,233.138.182.153 +2021-03-05,23,Cynthia,Ruiz,cruizm@google.fr,Female,18.178.187.201 +2021-03-05,24,Samuel,Carroll,scarrolln@youtu.be,Male,128.113.96.122 +2021-03-05,25,Jennifer,Larson,jlarsono@vinaora.com,Female,98.234.85.95 +2021-03-05,26,Ashley,Perry,aperryp@rakuten.co.jp,Female,247.173.114.52 +2021-03-05,27,Howard,Rodriguez,hrodriguezq@shutterfly.com,Male,231.188.95.26 +2021-03-05,28,Amy,Brooks,abrooksr@theatlantic.com,Female,141.199.174.118 +2021-03-05,29,Louise,Warren,lwarrens@adobe.com,Female,96.105.158.28 +2021-03-05,30,Tina,Watson,twatsont@myspace.com,Female,251.142.118.177 +2021-03-05,31,Janice,Kelley,jkelleyu@creativecommons.org,Female,239.167.34.233 +2021-03-05,32,Terry,Mccoy,tmccoyv@bravesites.com,Male,117.201.183.203 +2021-03-05,33,Jeffrey,Morgan,jmorganw@surveymonkey.com,Male,78.101.78.149 +2021-03-05,34,Louis,Harvey,lharveyx@sina.com.cn,Male,51.50.0.167 +2021-03-05,35,Philip,Miller,pmillery@samsung.com,Male,103.255.222.110 +2021-03-05,36,Willie,Marshall,wmarshallz@ow.ly,Male,149.219.91.68 +2021-03-05,37,Patrick,Lopez,plopez10@redcross.org,Male,250.136.229.89 
+2021-03-05,38,Adam,Jenkins,ajenkins11@harvard.edu,Male,7.36.112.81 +2021-03-05,39,Benjamin,Cruz,bcruz12@linkedin.com,Male,32.38.98.15 +2021-03-05,40,Ruby,Hawkins,rhawkins13@gmpg.org,Female,135.171.129.255 +2021-03-05,41,Carlos,Barnes,cbarnes14@a8.net,Male,240.197.85.140 +2021-03-05,42,Ruby,Griffin,rgriffin15@bravesites.com,Female,19.29.135.24 +2021-03-05,43,Sean,Mason,smason16@icq.com,Male,159.219.155.249 +2021-03-05,44,Anthony,Payne,apayne17@utexas.edu,Male,235.168.199.218 +2021-03-05,45,Steve,Cruz,scruz18@pcworld.com,Male,238.201.81.198 +2021-03-05,46,Anthony,Garcia,agarcia19@flavors.me,Male,25.85.10.18 +2021-03-05,47,Doris,Lopez,dlopez1a@sphinn.com,Female,245.218.51.238 +2021-03-05,48,Susan,Nichols,snichols1b@freewebs.com,Female,199.99.9.61 +2021-03-05,49,Wanda,Ferguson,wferguson1c@yahoo.co.jp,Female,236.241.135.21 +2021-03-05,50,Andrea,Pierce,apierce1d@google.co.uk,Female,132.40.10.209 +2021-03-05,51,Lawrence,Phillips,lphillips1e@jugem.jp,Male,72.226.82.87 +2021-03-05,52,Judy,Gilbert,jgilbert1f@multiply.com,Female,196.250.15.142 +2021-03-05,53,Eric,Williams,ewilliams1g@joomla.org,Male,222.202.73.126 +2021-03-05,54,Ralph,Romero,rromero1h@sogou.com,Male,123.184.125.212 +2021-03-05,55,Jean,Wilson,jwilson1i@ocn.ne.jp,Female,176.106.32.194 +2021-03-05,56,Lori,Reynolds,lreynolds1j@illinois.edu,Female,114.181.203.22 +2021-03-05,57,Donald,Moreno,dmoreno1k@bbc.co.uk,Male,233.249.97.60 +2021-03-05,58,Steven,Berry,sberry1l@eepurl.com,Male,186.193.50.50 +2021-03-05,59,Theresa,Shaw,tshaw1m@people.com.cn,Female,120.37.71.222 +2021-03-05,60,John,Stephens,jstephens1n@nationalgeographic.com,Male,191.87.127.115 +2021-03-05,61,Richard,Jacobs,rjacobs1o@state.tx.us,Male,66.210.83.155 +2021-03-05,62,Andrew,Lawson,alawson1p@over-blog.com,Male,54.98.36.94 +2021-03-05,63,Peter,Morgan,pmorgan1q@rambler.ru,Male,14.77.29.106 +2021-03-05,64,Nicole,Garrett,ngarrett1r@zimbio.com,Female,21.127.74.68 +2021-03-05,65,Joshua,Kim,jkim1s@edublogs.org,Male,57.255.207.41 +2021-03-05,66,Ralph,Roberts,rroberts1t@people.com.cn,Male,222.143.131.109 +2021-03-05,67,George,Montgomery,gmontgomery1u@smugmug.com,Male,76.75.111.77 +2021-03-05,68,Gerald,Alvarez,galvarez1v@flavors.me,Male,58.157.186.194 +2021-03-05,69,Donald,Olson,dolson1w@whitehouse.gov,Male,69.65.74.135 +2021-03-05,70,Carlos,Morgan,cmorgan1x@pbs.org,Male,96.20.140.87 +2021-03-05,71,Aaron,Stanley,astanley1y@webnode.com,Male,163.119.217.44 +2021-03-05,72,Virginia,Long,vlong1z@spiegel.de,Female,204.150.194.182 +2021-03-05,73,Robert,Berry,rberry20@tripadvisor.com,Male,104.19.48.241 +2021-03-05,74,Antonio,Brooks,abrooks21@unesco.org,Male,210.31.7.24 +2021-03-05,75,Ruby,Garcia,rgarcia22@ovh.net,Female,233.218.162.214 +2021-03-05,76,Jack,Hanson,jhanson23@blogtalkradio.com,Male,31.55.46.199 +2021-03-05,77,Kathryn,Nelson,knelson24@walmart.com,Female,14.189.146.41 +2021-03-05,78,Jason,Reed,jreed25@printfriendly.com,Male,141.189.89.255 +2021-03-05,79,George,Coleman,gcoleman26@people.com.cn,Male,81.189.221.144 +2021-03-05,80,Rose,King,rking27@ucoz.com,Female,212.123.168.231 +2021-03-05,81,Johnny,Holmes,jholmes28@boston.com,Male,177.3.93.188 +2021-03-05,82,Katherine,Gilbert,kgilbert29@altervista.org,Female,199.215.169.61 +2021-03-05,83,Joshua,Thomas,jthomas2a@ustream.tv,Male,0.8.205.30 +2021-03-05,84,Julie,Perry,jperry2b@opensource.org,Female,60.116.114.192 +2021-03-05,85,Richard,Perry,rperry2c@oracle.com,Male,181.125.70.232 +2021-03-05,86,Kenneth,Ruiz,kruiz2d@wikimedia.org,Male,189.105.137.109 +2021-03-05,87,Jose,Morgan,jmorgan2e@webnode.com,Male,101.134.215.156 
+2021-03-05,88,Donald,Campbell,dcampbell2f@goo.ne.jp,Male,102.120.215.84 +2021-03-05,89,Debra,Collins,dcollins2g@uol.com.br,Female,90.13.153.235 +2021-03-05,90,Jesse,Johnson,jjohnson2h@stumbleupon.com,Male,225.178.125.53 +2021-03-05,91,Elizabeth,Stone,estone2i@histats.com,Female,123.184.126.221 +2021-03-05,92,Angela,Rogers,arogers2j@goodreads.com,Female,98.104.132.187 +2021-03-05,93,Emily,Dixon,edixon2k@mlb.com,Female,39.190.75.57 +2021-03-05,94,Albert,Scott,ascott2l@tinypic.com,Male,40.209.13.189 +2021-03-05,95,Barbara,Peterson,bpeterson2m@ow.ly,Female,75.249.136.180 +2021-03-05,96,Adam,Greene,agreene2n@fastcompany.com,Male,184.173.109.144 +2021-03-05,97,Earl,Sanders,esanders2o@hc360.com,Male,247.34.90.117 +2021-03-05,98,Angela,Brooks,abrooks2p@mtv.com,Female,10.63.249.126 +2021-03-05,99,Harold,Foster,hfoster2q@privacy.gov.au,Male,139.214.40.244 +2021-03-05,100,Carl,Meyer,cmeyer2r@disqus.com,Male,204.117.7.88 +""".lstrip() + +_SEEDS__SEED_MERGE_COLS_UPDATE = """ +load_date,id,first_name,last_name,email,gender,ip_address +2021-03-05,1,Jack,Hunter,jhunter0@pbs.org,Male,59.80.20.168 +2021-03-05,2,Kathryn,Walker,kwalker1@ezinearticles.com,Female,194.121.179.35 +2021-03-05,3,Gerald,Ryan,gryan2@com.com,Male,11.3.212.243 +2021-03-05,4,Bonnie,Spencer,bspencer3@ameblo.jp,Female,216.32.196.175 +2021-03-05,5,Harold,Taylor,htaylor4@people.com.cn,Male,253.10.246.136 +2021-03-05,6,Jacqueline,Griffin,jgriffin5@t.co,Female,16.13.192.220 +2021-03-05,7,Wanda,Arnold,warnold6@google.nl,Female,232.116.150.64 +2021-03-05,8,Craig,Ortiz,cortiz7@sciencedaily.com,Male,199.126.106.13 +2021-03-05,9,Gary,Day,gday8@nih.gov,Male,35.81.68.186 +2021-03-05,10,Rose,Wright,rwright9@yahoo.co.jp,Female,236.82.178.100 +2021-03-05,11,Raymond,Kelley,rkelleya@fc2.com,Male,213.65.166.67 +2021-03-05,12,Gerald,Robinson,grobinsonb@disqus.com,Male,72.232.194.193 +2021-03-05,13,Mildred,Martinez,mmartinezc@samsung.com,Female,198.29.112.5 +2021-03-05,14,Dennis,Arnold,darnoldd@google.com,Male,86.96.3.250 +2021-03-05,15,Judy,Gray,jgraye@opensource.org,Female,79.218.162.245 +2021-03-05,16,Theresa,Garza,tgarzaf@epa.gov,Female,21.59.100.54 +2021-03-05,17,Gerald,Robertson,grobertsong@csmonitor.com,Male,131.134.82.96 +2021-03-05,18,Philip,Hernandez,phernandezh@adobe.com,Male,254.196.137.72 +2021-03-05,19,Julia,Gonzalez,jgonzalezi@cam.ac.uk,Female,84.240.227.174 +2021-03-05,20,Andrew,Davis,adavisj@patch.com,Male,9.255.67.25 +2021-03-05,21,Kimberly,Harper,kharperk@foxnews.com,Female,198.208.120.253 +2021-03-05,22,Mark,Martin,mmartinl@marketwatch.com,Male,233.138.182.153 +2021-03-05,23,Cynthia,Ruiz,cruizm@google.fr,Female,18.178.187.201 +2021-03-05,24,Samuel,Carroll,scarrolln@youtu.be,Male,128.113.96.122 +2021-03-05,25,Jennifer,Larson,jlarsono@vinaora.com,Female,98.234.85.95 +2021-03-05,26,Ashley,Perry,aperryp@rakuten.co.jp,Female,247.173.114.52 +2021-03-05,27,Howard,Rodriguez,hrodriguezq@shutterfly.com,Male,231.188.95.26 +2021-03-05,28,Amy,Brooks,abrooksr@theatlantic.com,Female,141.199.174.118 +2021-03-05,29,Louise,Warren,lwarrens@adobe.com,Female,96.105.158.28 +2021-03-05,30,Tina,Watson,twatsont@myspace.com,Female,251.142.118.177 +2021-03-05,31,Janice,Kelley,jkelleyu@creativecommons.org,Female,239.167.34.233 +2021-03-05,32,Terry,Mccoy,tmccoyv@bravesites.com,Male,117.201.183.203 +2021-03-05,33,Jeffrey,Morgan,jmorganw@surveymonkey.com,Male,78.101.78.149 +2021-03-05,34,Louis,Harvey,lharveyx@sina.com.cn,Male,51.50.0.167 +2021-03-05,35,Philip,Miller,pmillery@samsung.com,Male,103.255.222.110 
+2021-03-05,36,Willie,Marshall,wmarshallz@ow.ly,Male,149.219.91.68 +2021-03-05,37,Patrick,Lopez,plopez10@redcross.org,Male,250.136.229.89 +2021-03-05,38,Adam,Jenkins,ajenkins11@harvard.edu,Male,7.36.112.81 +2021-03-05,39,Benjamin,Cruz,bcruz12@linkedin.com,Male,32.38.98.15 +2021-03-05,40,Ruby,Hawkins,rhawkins13@gmpg.org,Female,135.171.129.255 +2021-03-05,41,Carlos,Barnes,cbarnes14@a8.net,Male,240.197.85.140 +2021-03-05,42,Ruby,Griffin,rgriffin15@bravesites.com,Female,19.29.135.24 +2021-03-05,43,Sean,Mason,smason16@icq.com,Male,159.219.155.249 +2021-03-05,44,Anthony,Payne,apayne17@utexas.edu,Male,235.168.199.218 +2021-03-05,45,Steve,Cruz,scruz18@pcworld.com,Male,238.201.81.198 +2021-03-05,46,Anthony,Garcia,agarcia19@flavors.me,Male,25.85.10.18 +2021-03-05,47,Doris,Lopez,dlopez1a@sphinn.com,Female,245.218.51.238 +2021-03-05,48,Susan,Nichols,snichols1b@freewebs.com,Female,199.99.9.61 +2021-03-05,49,Wanda,Ferguson,wferguson1c@yahoo.co.jp,Female,236.241.135.21 +2021-03-05,50,Andrea,Pierce,apierce1d@google.co.uk,Female,132.40.10.209 +2021-03-05,51,Lawrence,Phillips,lphillips1e@jugem.jp,Male,72.226.82.87 +2021-03-05,52,Judy,Gilbert,jgilbert1f@multiply.com,Female,196.250.15.142 +2021-03-05,53,Eric,Williams,ewilliams1g@joomla.org,Male,222.202.73.126 +2021-03-05,54,Ralph,Romero,rromero1h@sogou.com,Male,123.184.125.212 +2021-03-05,55,Jean,Wilson,jwilson1i@ocn.ne.jp,Female,176.106.32.194 +2021-03-05,56,Lori,Reynolds,lreynolds1j@illinois.edu,Female,114.181.203.22 +2021-03-05,57,Donald,Moreno,dmoreno1k@bbc.co.uk,Male,233.249.97.60 +2021-03-05,58,Steven,Berry,sberry1l@eepurl.com,Male,186.193.50.50 +2021-03-05,59,Theresa,Shaw,tshaw1m@people.com.cn,Female,120.37.71.222 +2021-03-05,60,John,Stephens,jstephens1n@nationalgeographic.com,Male,191.87.127.115 +2021-03-05,61,Richard,Jacobs,rjacobs1o@state.tx.us,Male,66.210.83.155 +2021-03-05,62,Andrew,Lawson,alawson1p@over-blog.com,Male,54.98.36.94 +2021-03-05,63,Peter,Morgan,pmorgan1q@rambler.ru,Male,14.77.29.106 +2021-03-05,64,Nicole,Garrett,ngarrett1r@zimbio.com,Female,21.127.74.68 +2021-03-05,65,Joshua,Kim,jkim1s@edublogs.org,Male,57.255.207.41 +2021-03-05,66,Ralph,Roberts,rroberts1t@people.com.cn,Male,222.143.131.109 +2021-03-05,67,George,Montgomery,gmontgomery1u@smugmug.com,Male,76.75.111.77 +2021-03-05,68,Gerald,Alvarez,galvarez1v@flavors.me,Male,58.157.186.194 +2021-03-05,69,Donald,Olson,dolson1w@whitehouse.gov,Male,69.65.74.135 +2021-03-05,70,Carlos,Morgan,cmorgan1x@pbs.org,Male,96.20.140.87 +2021-03-05,71,Aaron,Stanley,astanley1y@webnode.com,Male,163.119.217.44 +2021-03-05,72,Virginia,Long,vlong1z@spiegel.de,Female,204.150.194.182 +2021-03-05,73,Robert,Berry,rberry20@tripadvisor.com,Male,104.19.48.241 +2021-03-05,74,Antonio,Brooks,abrooks21@unesco.org,Male,210.31.7.24 +2021-03-05,75,Ruby,Garcia,rgarcia22@ovh.net,Female,233.218.162.214 +2021-03-05,76,Jack,Hanson,jhanson23@blogtalkradio.com,Male,31.55.46.199 +2021-03-05,77,Kathryn,Nelson,knelson24@walmart.com,Female,14.189.146.41 +2021-03-05,78,Jason,Reed,jreed25@printfriendly.com,Male,141.189.89.255 +2021-03-05,79,George,Coleman,gcoleman26@people.com.cn,Male,81.189.221.144 +2021-03-05,80,Rose,King,rking27@ucoz.com,Female,212.123.168.231 +2021-03-05,81,Johnny,Holmes,jholmes28@boston.com,Male,177.3.93.188 +2021-03-05,82,Katherine,Gilbert,kgilbert29@altervista.org,Female,199.215.169.61 +2021-03-05,83,Joshua,Thomas,jthomas2a@ustream.tv,Male,0.8.205.30 +2021-03-05,84,Julie,Perry,jperry2b@opensource.org,Female,60.116.114.192 +2021-03-05,85,Richard,Perry,rperry2c@oracle.com,Male,181.125.70.232 
+2021-03-05,86,Kenneth,Ruiz,kruiz2d@wikimedia.org,Male,189.105.137.109 +2021-03-05,87,Jose,Morgan,jmorgan2e@webnode.com,Male,101.134.215.156 +2021-03-05,88,Donald,Campbell,dcampbell2f@goo.ne.jp,Male,102.120.215.84 +2021-03-05,89,Debra,Collins,dcollins2g@uol.com.br,Female,90.13.153.235 +2021-03-05,90,Jesse,Johnson,jjohnson2h@stumbleupon.com,Male,225.178.125.53 +2021-03-05,91,Elizabeth,Stone,estone2i@histats.com,Female,123.184.126.221 +2021-03-05,92,Angela,Rogers,arogers2j@goodreads.com,Female,98.104.132.187 +2021-03-05,93,Emily,Dixon,edixon2k@mlb.com,Female,39.190.75.57 +2021-03-05,94,Albert,Scott,ascott2l@tinypic.com,Male,40.209.13.189 +2021-03-05,95,Barbara,Peterson,bpeterson2m@ow.ly,Female,75.249.136.180 +2021-03-05,96,Adam,Greene,agreene2n@fastcompany.com,Male,184.173.109.144 +2021-03-05,97,Earl,Sanders,esanders2o@hc360.com,Male,247.34.90.117 +2021-03-05,98,Angela,Brooks,abrooks2p@mtv.com,Female,10.63.249.126 +2021-03-05,99,Harold,Foster,hfoster2q@privacy.gov.au,Male,139.214.40.244 +2021-03-05,100,Carl,Meyer,cmeyer2r@disqus.com,Male,204.117.7.88 +2021-03-06,20,Andrew,Davis,adavisj@reddit.com,Male,9.255.67.25 +2021-03-06,83,Josh,Thomas,jthomas2a@ustream.tv,Male,0.8.205.30 +2021-03-06,92,Angela,Scott,ascott2j@goodreads.com,Female,98.119.208.155 +2021-03-06,101,Michael,Perez,mperez0@chronoengine.com,Male,106.239.70.175 +2021-03-06,102,Shawn,Mccoy,smccoy1@reddit.com,Male,24.165.76.182 +2021-03-06,103,Kathleen,Payne,kpayne2@cargocollective.com,Female,113.207.168.106 +2021-03-06,104,Jimmy,Cooper,jcooper3@cargocollective.com,Male,198.24.63.114 +2021-03-06,105,Katherine,Rice,krice4@typepad.com,Female,36.97.186.238 +2021-03-06,106,Sarah,Ryan,sryan5@gnu.org,Female,119.117.152.40 +2021-03-06,107,Martin,Mcdonald,mmcdonald6@opera.com,Male,8.76.38.115 +2021-03-06,108,Frank,Robinson,frobinson7@wunderground.com,Male,186.14.64.194 +2021-03-06,109,Jennifer,Franklin,jfranklin8@mail.ru,Female,91.216.3.131 +2021-03-06,110,Henry,Welch,hwelch9@list-manage.com,Male,176.35.182.168 +2021-03-06,111,Fred,Snyder,fsnydera@reddit.com,Male,217.106.196.54 +2021-03-06,112,Amy,Dunn,adunnb@nba.com,Female,95.39.163.195 +2021-03-06,113,Kathleen,Meyer,kmeyerc@cdc.gov,Female,164.142.188.214 +2021-03-06,114,Steve,Ferguson,sfergusond@reverbnation.com,Male,138.22.204.251 +2021-03-06,115,Teresa,Hill,thille@dion.ne.jp,Female,82.84.228.235 +2021-03-06,116,Amanda,Harper,aharperf@mail.ru,Female,16.123.56.176 +2021-03-06,117,Kimberly,Ray,krayg@xing.com,Female,48.66.48.12 +2021-03-06,118,Johnny,Knight,jknighth@jalbum.net,Male,99.30.138.123 +2021-03-06,119,Virginia,Freeman,vfreemani@tiny.cc,Female,225.172.182.63 +2021-03-06,120,Anna,Austin,aaustinj@diigo.com,Female,62.111.227.148 +2021-03-06,121,Willie,Hill,whillk@mail.ru,Male,0.86.232.249 +2021-03-06,122,Sean,Harris,sharrisl@zdnet.com,Male,117.165.133.249 +2021-03-06,123,Mildred,Adams,madamsm@usatoday.com,Female,163.44.97.46 +2021-03-06,124,David,Graham,dgrahamn@zimbio.com,Male,78.13.246.202 +2021-03-06,125,Victor,Hunter,vhuntero@ehow.com,Male,64.156.179.139 +2021-03-06,126,Aaron,Ruiz,aruizp@weebly.com,Male,34.194.68.78 +2021-03-06,127,Benjamin,Brooks,bbrooksq@jalbum.net,Male,20.192.189.107 +2021-03-06,128,Lisa,Wilson,lwilsonr@japanpost.jp,Female,199.152.130.217 +2021-03-06,129,Benjamin,King,bkings@comsenz.com,Male,29.189.189.213 +2021-03-06,130,Christina,Williamson,cwilliamsont@boston.com,Female,194.101.52.60 +2021-03-06,131,Jane,Gonzalez,jgonzalezu@networksolutions.com,Female,109.119.12.87 +2021-03-06,132,Thomas,Owens,towensv@psu.edu,Male,84.168.213.153 
+2021-03-06,133,Katherine,Moore,kmoorew@naver.com,Female,183.150.65.24 +2021-03-06,134,Jennifer,Stewart,jstewartx@yahoo.com,Female,38.41.244.58 +2021-03-06,135,Sara,Tucker,stuckery@topsy.com,Female,181.130.59.184 +2021-03-06,136,Harold,Ortiz,hortizz@vkontakte.ru,Male,198.231.63.137 +2021-03-06,137,Shirley,James,sjames10@yelp.com,Female,83.27.160.104 +2021-03-06,138,Dennis,Johnson,djohnson11@slate.com,Male,183.178.246.101 +2021-03-06,139,Louise,Weaver,lweaver12@china.com.cn,Female,1.14.110.18 +2021-03-06,140,Maria,Armstrong,marmstrong13@prweb.com,Female,181.142.1.249 +2021-03-06,141,Gloria,Cruz,gcruz14@odnoklassniki.ru,Female,178.232.140.243 +2021-03-06,142,Diana,Spencer,dspencer15@ifeng.com,Female,125.153.138.244 +2021-03-06,143,Kelly,Nguyen,knguyen16@altervista.org,Female,170.13.201.119 +2021-03-06,144,Jane,Rodriguez,jrodriguez17@biblegateway.com,Female,12.102.249.81 +2021-03-06,145,Scott,Brown,sbrown18@geocities.jp,Male,108.174.99.192 +2021-03-06,146,Norma,Cruz,ncruz19@si.edu,Female,201.112.156.197 +2021-03-06,147,Marie,Peters,mpeters1a@mlb.com,Female,231.121.197.144 +2021-03-06,148,Lillian,Carr,lcarr1b@typepad.com,Female,206.179.164.163 +2021-03-06,149,Judy,Nichols,jnichols1c@t-online.de,Female,158.190.209.194 +2021-03-06,150,Billy,Long,blong1d@yahoo.com,Male,175.20.23.160 +2021-03-06,151,Howard,Reid,hreid1e@exblog.jp,Male,118.99.196.20 +2021-03-06,152,Laura,Ferguson,lferguson1f@tuttocitta.it,Female,22.77.87.110 +2021-03-06,153,Anne,Bailey,abailey1g@geocities.com,Female,58.144.159.245 +2021-03-06,154,Rose,Morgan,rmorgan1h@ehow.com,Female,118.127.97.4 +2021-03-06,155,Nicholas,Reyes,nreyes1i@google.ru,Male,50.135.10.252 +2021-03-06,156,Joshua,Kennedy,jkennedy1j@house.gov,Male,154.6.163.209 +2021-03-06,157,Paul,Watkins,pwatkins1k@upenn.edu,Male,177.236.120.87 +2021-03-06,158,Kathryn,Kelly,kkelly1l@businessweek.com,Female,70.28.61.86 +2021-03-06,159,Adam,Armstrong,aarmstrong1m@techcrunch.com,Male,133.235.24.202 +2021-03-06,160,Norma,Wallace,nwallace1n@phoca.cz,Female,241.119.227.128 +2021-03-06,161,Timothy,Reyes,treyes1o@google.cn,Male,86.28.23.26 +2021-03-06,162,Elizabeth,Patterson,epatterson1p@sun.com,Female,139.97.159.149 +2021-03-06,163,Edward,Gomez,egomez1q@google.fr,Male,158.103.108.255 +2021-03-06,164,David,Cox,dcox1r@friendfeed.com,Male,206.80.80.58 +2021-03-06,165,Brenda,Wood,bwood1s@over-blog.com,Female,217.207.44.179 +2021-03-06,166,Adam,Walker,awalker1t@blogs.com,Male,253.211.54.93 +2021-03-06,167,Michael,Hart,mhart1u@wix.com,Male,230.206.200.22 +2021-03-06,168,Jesse,Ellis,jellis1v@google.co.uk,Male,213.254.162.52 +2021-03-06,169,Janet,Powell,jpowell1w@un.org,Female,27.192.194.86 +2021-03-06,170,Helen,Ford,hford1x@creativecommons.org,Female,52.160.102.168 +2021-03-06,171,Gerald,Carpenter,gcarpenter1y@about.me,Male,36.30.194.218 +2021-03-06,172,Kathryn,Oliver,koliver1z@army.mil,Female,202.63.103.69 +2021-03-06,173,Alan,Berry,aberry20@gov.uk,Male,246.157.112.211 +2021-03-06,174,Harry,Andrews,handrews21@ameblo.jp,Male,195.108.0.12 +2021-03-06,175,Andrea,Hall,ahall22@hp.com,Female,149.162.163.28 +2021-03-06,176,Barbara,Wells,bwells23@behance.net,Female,224.70.72.1 +2021-03-06,177,Anne,Wells,awells24@apache.org,Female,180.168.81.153 +2021-03-06,178,Harry,Harper,hharper25@rediff.com,Male,151.87.130.21 +2021-03-06,179,Jack,Ray,jray26@wufoo.com,Male,220.109.38.178 +2021-03-06,180,Phillip,Hamilton,phamilton27@joomla.org,Male,166.40.47.30 +2021-03-06,181,Shirley,Hunter,shunter28@newsvine.com,Female,97.209.140.194 +2021-03-06,182,Arthur,Daniels,adaniels29@reuters.com,Male,5.40.240.86 
+2021-03-06,183,Virginia,Rodriguez,vrodriguez2a@walmart.com,Female,96.80.164.184 +2021-03-06,184,Christina,Ryan,cryan2b@hibu.com,Female,56.35.5.52 +2021-03-06,185,Theresa,Mendoza,tmendoza2c@vinaora.com,Female,243.42.0.210 +2021-03-06,186,Jason,Cole,jcole2d@ycombinator.com,Male,198.248.39.129 +2021-03-06,187,Phillip,Bryant,pbryant2e@rediff.com,Male,140.39.116.251 +2021-03-06,188,Adam,Torres,atorres2f@sun.com,Male,101.75.187.135 +2021-03-06,189,Margaret,Johnston,mjohnston2g@ucsd.edu,Female,159.30.69.149 +2021-03-06,190,Paul,Payne,ppayne2h@hhs.gov,Male,199.234.140.220 +2021-03-06,191,Todd,Willis,twillis2i@businessweek.com,Male,191.59.136.214 +2021-03-06,192,Willie,Oliver,woliver2j@noaa.gov,Male,44.212.35.197 +2021-03-06,193,Frances,Robertson,frobertson2k@go.com,Female,31.117.65.136 +2021-03-06,194,Gregory,Hawkins,ghawkins2l@joomla.org,Male,91.3.22.49 +2021-03-06,195,Lisa,Perkins,lperkins2m@si.edu,Female,145.95.31.186 +2021-03-06,196,Jacqueline,Anderson,janderson2n@cargocollective.com,Female,14.176.0.187 +2021-03-06,197,Shirley,Diaz,sdiaz2o@ucla.edu,Female,207.12.95.46 +2021-03-06,198,Nicole,Meyer,nmeyer2p@flickr.com,Female,231.79.115.13 +2021-03-06,199,Mary,Gray,mgray2q@constantcontact.com,Female,210.116.64.253 +2021-03-06,200,Jean,Mcdonald,jmcdonald2r@baidu.com,Female,122.239.235.117 +""".lstrip() + + +_SEEDS__SEED_MERGE_COLS_EXPECTED_RESULT = """ +load_date,id,first_name,last_name,email,gender,ip_address +2021-03-05,1,Jack,Hunter,jhunter0@pbs.org,Male,59.80.20.168 +2021-03-05,2,Kathryn,Walker,kwalker1@ezinearticles.com,Female,194.121.179.35 +2021-03-05,3,Gerald,Ryan,gryan2@com.com,Male,11.3.212.243 +2021-03-05,4,Bonnie,Spencer,bspencer3@ameblo.jp,Female,216.32.196.175 +2021-03-05,5,Harold,Taylor,htaylor4@people.com.cn,Male,253.10.246.136 +2021-03-05,6,Jacqueline,Griffin,jgriffin5@t.co,Female,16.13.192.220 +2021-03-05,7,Wanda,Arnold,warnold6@google.nl,Female,232.116.150.64 +2021-03-05,8,Craig,Ortiz,cortiz7@sciencedaily.com,Male,199.126.106.13 +2021-03-05,9,Gary,Day,gday8@nih.gov,Male,35.81.68.186 +2021-03-05,10,Rose,Wright,rwright9@yahoo.co.jp,Female,236.82.178.100 +2021-03-05,11,Raymond,Kelley,rkelleya@fc2.com,Male,213.65.166.67 +2021-03-05,12,Gerald,Robinson,grobinsonb@disqus.com,Male,72.232.194.193 +2021-03-05,13,Mildred,Martinez,mmartinezc@samsung.com,Female,198.29.112.5 +2021-03-05,14,Dennis,Arnold,darnoldd@google.com,Male,86.96.3.250 +2021-03-05,15,Judy,Gray,jgraye@opensource.org,Female,79.218.162.245 +2021-03-05,16,Theresa,Garza,tgarzaf@epa.gov,Female,21.59.100.54 +2021-03-05,17,Gerald,Robertson,grobertsong@csmonitor.com,Male,131.134.82.96 +2021-03-05,18,Philip,Hernandez,phernandezh@adobe.com,Male,254.196.137.72 +2021-03-05,19,Julia,Gonzalez,jgonzalezi@cam.ac.uk,Female,84.240.227.174 +2021-03-05,20,Andrew,Davis,adavisj@reddit.com,Male,9.255.67.25 +2021-03-05,21,Kimberly,Harper,kharperk@foxnews.com,Female,198.208.120.253 +2021-03-05,22,Mark,Martin,mmartinl@marketwatch.com,Male,233.138.182.153 +2021-03-05,23,Cynthia,Ruiz,cruizm@google.fr,Female,18.178.187.201 +2021-03-05,24,Samuel,Carroll,scarrolln@youtu.be,Male,128.113.96.122 +2021-03-05,25,Jennifer,Larson,jlarsono@vinaora.com,Female,98.234.85.95 +2021-03-05,26,Ashley,Perry,aperryp@rakuten.co.jp,Female,247.173.114.52 +2021-03-05,27,Howard,Rodriguez,hrodriguezq@shutterfly.com,Male,231.188.95.26 +2021-03-05,28,Amy,Brooks,abrooksr@theatlantic.com,Female,141.199.174.118 +2021-03-05,29,Louise,Warren,lwarrens@adobe.com,Female,96.105.158.28 +2021-03-05,30,Tina,Watson,twatsont@myspace.com,Female,251.142.118.177 
+2021-03-05,31,Janice,Kelley,jkelleyu@creativecommons.org,Female,239.167.34.233 +2021-03-05,32,Terry,Mccoy,tmccoyv@bravesites.com,Male,117.201.183.203 +2021-03-05,33,Jeffrey,Morgan,jmorganw@surveymonkey.com,Male,78.101.78.149 +2021-03-05,34,Louis,Harvey,lharveyx@sina.com.cn,Male,51.50.0.167 +2021-03-05,35,Philip,Miller,pmillery@samsung.com,Male,103.255.222.110 +2021-03-05,36,Willie,Marshall,wmarshallz@ow.ly,Male,149.219.91.68 +2021-03-05,37,Patrick,Lopez,plopez10@redcross.org,Male,250.136.229.89 +2021-03-05,38,Adam,Jenkins,ajenkins11@harvard.edu,Male,7.36.112.81 +2021-03-05,39,Benjamin,Cruz,bcruz12@linkedin.com,Male,32.38.98.15 +2021-03-05,40,Ruby,Hawkins,rhawkins13@gmpg.org,Female,135.171.129.255 +2021-03-05,41,Carlos,Barnes,cbarnes14@a8.net,Male,240.197.85.140 +2021-03-05,42,Ruby,Griffin,rgriffin15@bravesites.com,Female,19.29.135.24 +2021-03-05,43,Sean,Mason,smason16@icq.com,Male,159.219.155.249 +2021-03-05,44,Anthony,Payne,apayne17@utexas.edu,Male,235.168.199.218 +2021-03-05,45,Steve,Cruz,scruz18@pcworld.com,Male,238.201.81.198 +2021-03-05,46,Anthony,Garcia,agarcia19@flavors.me,Male,25.85.10.18 +2021-03-05,47,Doris,Lopez,dlopez1a@sphinn.com,Female,245.218.51.238 +2021-03-05,48,Susan,Nichols,snichols1b@freewebs.com,Female,199.99.9.61 +2021-03-05,49,Wanda,Ferguson,wferguson1c@yahoo.co.jp,Female,236.241.135.21 +2021-03-05,50,Andrea,Pierce,apierce1d@google.co.uk,Female,132.40.10.209 +2021-03-05,51,Lawrence,Phillips,lphillips1e@jugem.jp,Male,72.226.82.87 +2021-03-05,52,Judy,Gilbert,jgilbert1f@multiply.com,Female,196.250.15.142 +2021-03-05,53,Eric,Williams,ewilliams1g@joomla.org,Male,222.202.73.126 +2021-03-05,54,Ralph,Romero,rromero1h@sogou.com,Male,123.184.125.212 +2021-03-05,55,Jean,Wilson,jwilson1i@ocn.ne.jp,Female,176.106.32.194 +2021-03-05,56,Lori,Reynolds,lreynolds1j@illinois.edu,Female,114.181.203.22 +2021-03-05,57,Donald,Moreno,dmoreno1k@bbc.co.uk,Male,233.249.97.60 +2021-03-05,58,Steven,Berry,sberry1l@eepurl.com,Male,186.193.50.50 +2021-03-05,59,Theresa,Shaw,tshaw1m@people.com.cn,Female,120.37.71.222 +2021-03-05,60,John,Stephens,jstephens1n@nationalgeographic.com,Male,191.87.127.115 +2021-03-05,61,Richard,Jacobs,rjacobs1o@state.tx.us,Male,66.210.83.155 +2021-03-05,62,Andrew,Lawson,alawson1p@over-blog.com,Male,54.98.36.94 +2021-03-05,63,Peter,Morgan,pmorgan1q@rambler.ru,Male,14.77.29.106 +2021-03-05,64,Nicole,Garrett,ngarrett1r@zimbio.com,Female,21.127.74.68 +2021-03-05,65,Joshua,Kim,jkim1s@edublogs.org,Male,57.255.207.41 +2021-03-05,66,Ralph,Roberts,rroberts1t@people.com.cn,Male,222.143.131.109 +2021-03-05,67,George,Montgomery,gmontgomery1u@smugmug.com,Male,76.75.111.77 +2021-03-05,68,Gerald,Alvarez,galvarez1v@flavors.me,Male,58.157.186.194 +2021-03-05,69,Donald,Olson,dolson1w@whitehouse.gov,Male,69.65.74.135 +2021-03-05,70,Carlos,Morgan,cmorgan1x@pbs.org,Male,96.20.140.87 +2021-03-05,71,Aaron,Stanley,astanley1y@webnode.com,Male,163.119.217.44 +2021-03-05,72,Virginia,Long,vlong1z@spiegel.de,Female,204.150.194.182 +2021-03-05,73,Robert,Berry,rberry20@tripadvisor.com,Male,104.19.48.241 +2021-03-05,74,Antonio,Brooks,abrooks21@unesco.org,Male,210.31.7.24 +2021-03-05,75,Ruby,Garcia,rgarcia22@ovh.net,Female,233.218.162.214 +2021-03-05,76,Jack,Hanson,jhanson23@blogtalkradio.com,Male,31.55.46.199 +2021-03-05,77,Kathryn,Nelson,knelson24@walmart.com,Female,14.189.146.41 +2021-03-05,78,Jason,Reed,jreed25@printfriendly.com,Male,141.189.89.255 +2021-03-05,79,George,Coleman,gcoleman26@people.com.cn,Male,81.189.221.144 +2021-03-05,80,Rose,King,rking27@ucoz.com,Female,212.123.168.231 
+2021-03-05,81,Johnny,Holmes,jholmes28@boston.com,Male,177.3.93.188 +2021-03-05,82,Katherine,Gilbert,kgilbert29@altervista.org,Female,199.215.169.61 +2021-03-05,83,Joshua,Thomas,jthomas2a@ustream.tv,Male,0.8.205.30 +2021-03-05,84,Julie,Perry,jperry2b@opensource.org,Female,60.116.114.192 +2021-03-05,85,Richard,Perry,rperry2c@oracle.com,Male,181.125.70.232 +2021-03-05,86,Kenneth,Ruiz,kruiz2d@wikimedia.org,Male,189.105.137.109 +2021-03-05,87,Jose,Morgan,jmorgan2e@webnode.com,Male,101.134.215.156 +2021-03-05,88,Donald,Campbell,dcampbell2f@goo.ne.jp,Male,102.120.215.84 +2021-03-05,89,Debra,Collins,dcollins2g@uol.com.br,Female,90.13.153.235 +2021-03-05,90,Jesse,Johnson,jjohnson2h@stumbleupon.com,Male,225.178.125.53 +2021-03-05,91,Elizabeth,Stone,estone2i@histats.com,Female,123.184.126.221 +2021-03-05,92,Angela,Rogers,ascott2j@goodreads.com,Female,98.119.208.155 +2021-03-05,93,Emily,Dixon,edixon2k@mlb.com,Female,39.190.75.57 +2021-03-05,94,Albert,Scott,ascott2l@tinypic.com,Male,40.209.13.189 +2021-03-05,95,Barbara,Peterson,bpeterson2m@ow.ly,Female,75.249.136.180 +2021-03-05,96,Adam,Greene,agreene2n@fastcompany.com,Male,184.173.109.144 +2021-03-05,97,Earl,Sanders,esanders2o@hc360.com,Male,247.34.90.117 +2021-03-05,98,Angela,Brooks,abrooks2p@mtv.com,Female,10.63.249.126 +2021-03-05,99,Harold,Foster,hfoster2q@privacy.gov.au,Male,139.214.40.244 +2021-03-05,100,Carl,Meyer,cmeyer2r@disqus.com,Male,204.117.7.88 +2021-03-06,101,Michael,Perez,mperez0@chronoengine.com,Male,106.239.70.175 +2021-03-06,102,Shawn,Mccoy,smccoy1@reddit.com,Male,24.165.76.182 +2021-03-06,103,Kathleen,Payne,kpayne2@cargocollective.com,Female,113.207.168.106 +2021-03-06,104,Jimmy,Cooper,jcooper3@cargocollective.com,Male,198.24.63.114 +2021-03-06,105,Katherine,Rice,krice4@typepad.com,Female,36.97.186.238 +2021-03-06,106,Sarah,Ryan,sryan5@gnu.org,Female,119.117.152.40 +2021-03-06,107,Martin,Mcdonald,mmcdonald6@opera.com,Male,8.76.38.115 +2021-03-06,108,Frank,Robinson,frobinson7@wunderground.com,Male,186.14.64.194 +2021-03-06,109,Jennifer,Franklin,jfranklin8@mail.ru,Female,91.216.3.131 +2021-03-06,110,Henry,Welch,hwelch9@list-manage.com,Male,176.35.182.168 +2021-03-06,111,Fred,Snyder,fsnydera@reddit.com,Male,217.106.196.54 +2021-03-06,112,Amy,Dunn,adunnb@nba.com,Female,95.39.163.195 +2021-03-06,113,Kathleen,Meyer,kmeyerc@cdc.gov,Female,164.142.188.214 +2021-03-06,114,Steve,Ferguson,sfergusond@reverbnation.com,Male,138.22.204.251 +2021-03-06,115,Teresa,Hill,thille@dion.ne.jp,Female,82.84.228.235 +2021-03-06,116,Amanda,Harper,aharperf@mail.ru,Female,16.123.56.176 +2021-03-06,117,Kimberly,Ray,krayg@xing.com,Female,48.66.48.12 +2021-03-06,118,Johnny,Knight,jknighth@jalbum.net,Male,99.30.138.123 +2021-03-06,119,Virginia,Freeman,vfreemani@tiny.cc,Female,225.172.182.63 +2021-03-06,120,Anna,Austin,aaustinj@diigo.com,Female,62.111.227.148 +2021-03-06,121,Willie,Hill,whillk@mail.ru,Male,0.86.232.249 +2021-03-06,122,Sean,Harris,sharrisl@zdnet.com,Male,117.165.133.249 +2021-03-06,123,Mildred,Adams,madamsm@usatoday.com,Female,163.44.97.46 +2021-03-06,124,David,Graham,dgrahamn@zimbio.com,Male,78.13.246.202 +2021-03-06,125,Victor,Hunter,vhuntero@ehow.com,Male,64.156.179.139 +2021-03-06,126,Aaron,Ruiz,aruizp@weebly.com,Male,34.194.68.78 +2021-03-06,127,Benjamin,Brooks,bbrooksq@jalbum.net,Male,20.192.189.107 +2021-03-06,128,Lisa,Wilson,lwilsonr@japanpost.jp,Female,199.152.130.217 +2021-03-06,129,Benjamin,King,bkings@comsenz.com,Male,29.189.189.213 +2021-03-06,130,Christina,Williamson,cwilliamsont@boston.com,Female,194.101.52.60 
+2021-03-06,131,Jane,Gonzalez,jgonzalezu@networksolutions.com,Female,109.119.12.87 +2021-03-06,132,Thomas,Owens,towensv@psu.edu,Male,84.168.213.153 +2021-03-06,133,Katherine,Moore,kmoorew@naver.com,Female,183.150.65.24 +2021-03-06,134,Jennifer,Stewart,jstewartx@yahoo.com,Female,38.41.244.58 +2021-03-06,135,Sara,Tucker,stuckery@topsy.com,Female,181.130.59.184 +2021-03-06,136,Harold,Ortiz,hortizz@vkontakte.ru,Male,198.231.63.137 +2021-03-06,137,Shirley,James,sjames10@yelp.com,Female,83.27.160.104 +2021-03-06,138,Dennis,Johnson,djohnson11@slate.com,Male,183.178.246.101 +2021-03-06,139,Louise,Weaver,lweaver12@china.com.cn,Female,1.14.110.18 +2021-03-06,140,Maria,Armstrong,marmstrong13@prweb.com,Female,181.142.1.249 +2021-03-06,141,Gloria,Cruz,gcruz14@odnoklassniki.ru,Female,178.232.140.243 +2021-03-06,142,Diana,Spencer,dspencer15@ifeng.com,Female,125.153.138.244 +2021-03-06,143,Kelly,Nguyen,knguyen16@altervista.org,Female,170.13.201.119 +2021-03-06,144,Jane,Rodriguez,jrodriguez17@biblegateway.com,Female,12.102.249.81 +2021-03-06,145,Scott,Brown,sbrown18@geocities.jp,Male,108.174.99.192 +2021-03-06,146,Norma,Cruz,ncruz19@si.edu,Female,201.112.156.197 +2021-03-06,147,Marie,Peters,mpeters1a@mlb.com,Female,231.121.197.144 +2021-03-06,148,Lillian,Carr,lcarr1b@typepad.com,Female,206.179.164.163 +2021-03-06,149,Judy,Nichols,jnichols1c@t-online.de,Female,158.190.209.194 +2021-03-06,150,Billy,Long,blong1d@yahoo.com,Male,175.20.23.160 +2021-03-06,151,Howard,Reid,hreid1e@exblog.jp,Male,118.99.196.20 +2021-03-06,152,Laura,Ferguson,lferguson1f@tuttocitta.it,Female,22.77.87.110 +2021-03-06,153,Anne,Bailey,abailey1g@geocities.com,Female,58.144.159.245 +2021-03-06,154,Rose,Morgan,rmorgan1h@ehow.com,Female,118.127.97.4 +2021-03-06,155,Nicholas,Reyes,nreyes1i@google.ru,Male,50.135.10.252 +2021-03-06,156,Joshua,Kennedy,jkennedy1j@house.gov,Male,154.6.163.209 +2021-03-06,157,Paul,Watkins,pwatkins1k@upenn.edu,Male,177.236.120.87 +2021-03-06,158,Kathryn,Kelly,kkelly1l@businessweek.com,Female,70.28.61.86 +2021-03-06,159,Adam,Armstrong,aarmstrong1m@techcrunch.com,Male,133.235.24.202 +2021-03-06,160,Norma,Wallace,nwallace1n@phoca.cz,Female,241.119.227.128 +2021-03-06,161,Timothy,Reyes,treyes1o@google.cn,Male,86.28.23.26 +2021-03-06,162,Elizabeth,Patterson,epatterson1p@sun.com,Female,139.97.159.149 +2021-03-06,163,Edward,Gomez,egomez1q@google.fr,Male,158.103.108.255 +2021-03-06,164,David,Cox,dcox1r@friendfeed.com,Male,206.80.80.58 +2021-03-06,165,Brenda,Wood,bwood1s@over-blog.com,Female,217.207.44.179 +2021-03-06,166,Adam,Walker,awalker1t@blogs.com,Male,253.211.54.93 +2021-03-06,167,Michael,Hart,mhart1u@wix.com,Male,230.206.200.22 +2021-03-06,168,Jesse,Ellis,jellis1v@google.co.uk,Male,213.254.162.52 +2021-03-06,169,Janet,Powell,jpowell1w@un.org,Female,27.192.194.86 +2021-03-06,170,Helen,Ford,hford1x@creativecommons.org,Female,52.160.102.168 +2021-03-06,171,Gerald,Carpenter,gcarpenter1y@about.me,Male,36.30.194.218 +2021-03-06,172,Kathryn,Oliver,koliver1z@army.mil,Female,202.63.103.69 +2021-03-06,173,Alan,Berry,aberry20@gov.uk,Male,246.157.112.211 +2021-03-06,174,Harry,Andrews,handrews21@ameblo.jp,Male,195.108.0.12 +2021-03-06,175,Andrea,Hall,ahall22@hp.com,Female,149.162.163.28 +2021-03-06,176,Barbara,Wells,bwells23@behance.net,Female,224.70.72.1 +2021-03-06,177,Anne,Wells,awells24@apache.org,Female,180.168.81.153 +2021-03-06,178,Harry,Harper,hharper25@rediff.com,Male,151.87.130.21 +2021-03-06,179,Jack,Ray,jray26@wufoo.com,Male,220.109.38.178 +2021-03-06,180,Phillip,Hamilton,phamilton27@joomla.org,Male,166.40.47.30 
+2021-03-06,181,Shirley,Hunter,shunter28@newsvine.com,Female,97.209.140.194 +2021-03-06,182,Arthur,Daniels,adaniels29@reuters.com,Male,5.40.240.86 +2021-03-06,183,Virginia,Rodriguez,vrodriguez2a@walmart.com,Female,96.80.164.184 +2021-03-06,184,Christina,Ryan,cryan2b@hibu.com,Female,56.35.5.52 +2021-03-06,185,Theresa,Mendoza,tmendoza2c@vinaora.com,Female,243.42.0.210 +2021-03-06,186,Jason,Cole,jcole2d@ycombinator.com,Male,198.248.39.129 +2021-03-06,187,Phillip,Bryant,pbryant2e@rediff.com,Male,140.39.116.251 +2021-03-06,188,Adam,Torres,atorres2f@sun.com,Male,101.75.187.135 +2021-03-06,189,Margaret,Johnston,mjohnston2g@ucsd.edu,Female,159.30.69.149 +2021-03-06,190,Paul,Payne,ppayne2h@hhs.gov,Male,199.234.140.220 +2021-03-06,191,Todd,Willis,twillis2i@businessweek.com,Male,191.59.136.214 +2021-03-06,192,Willie,Oliver,woliver2j@noaa.gov,Male,44.212.35.197 +2021-03-06,193,Frances,Robertson,frobertson2k@go.com,Female,31.117.65.136 +2021-03-06,194,Gregory,Hawkins,ghawkins2l@joomla.org,Male,91.3.22.49 +2021-03-06,195,Lisa,Perkins,lperkins2m@si.edu,Female,145.95.31.186 +2021-03-06,196,Jacqueline,Anderson,janderson2n@cargocollective.com,Female,14.176.0.187 +2021-03-06,197,Shirley,Diaz,sdiaz2o@ucla.edu,Female,207.12.95.46 +2021-03-06,198,Nicole,Meyer,nmeyer2p@flickr.com,Female,231.79.115.13 +2021-03-06,199,Mary,Gray,mgray2q@constantcontact.com,Female,210.116.64.253 +2021-03-06,200,Jean,Mcdonald,jmcdonald2r@baidu.com,Female,122.239.235.117 +""".lstrip() + + +_MODELS_INCREMENTAL_UPDATE_COLS = """ +{{ + config( + materialized = "incremental", + unique_key = "id", + merge_update_columns = ["email", "ip_address"] + ) +}} + +select * from {{ ref('seed') }} + +{% if is_incremental() %} + + where load_date > (select max(load_date) from {{this}}) + +{% endif %} +""" diff --git a/dbt-bigquery/tests/functional/adapter/simple_copy/test_simple_copy.py b/dbt-bigquery/tests/functional/adapter/simple_copy/test_simple_copy.py new file mode 100644 index 000000000..a0ad14684 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/simple_copy/test_simple_copy.py @@ -0,0 +1,42 @@ +import pytest + +from pathlib import Path + +from dbt.tests.util import run_dbt, rm_file, write_file, check_relations_equal + +from dbt.tests.adapter.simple_copy.test_simple_copy import SimpleCopyBase + +from tests.functional.adapter.simple_copy.fixtures import ( + _MODELS_INCREMENTAL_UPDATE_COLS, + _SEEDS__SEED_MERGE_COLS_INITIAL, + _SEEDS__SEED_MERGE_COLS_UPDATE, + _SEEDS__SEED_MERGE_COLS_EXPECTED_RESULT, +) + + +class TestSimpleCopyBase(SimpleCopyBase): + pass + + +class TestIncrementalMergeColumns: + @pytest.fixture(scope="class") + def models(self): + return {"incremental_update_cols.sql": _MODELS_INCREMENTAL_UPDATE_COLS} + + @pytest.fixture(scope="class") + def seeds(self): + return {"seed.csv": _SEEDS__SEED_MERGE_COLS_INITIAL} + + def test_incremental_merge_columns(self, project): + run_dbt(["seed"]) + run_dbt(["run"]) + + main_seed_file = project.project_root / Path("seeds") / Path("seed.csv") + expected_seed_file = project.project_root / Path("seeds") / Path("expected_result.csv") + rm_file(main_seed_file) + write_file(_SEEDS__SEED_MERGE_COLS_UPDATE, main_seed_file) + write_file(_SEEDS__SEED_MERGE_COLS_EXPECTED_RESULT, expected_seed_file) + + run_dbt(["seed"]) + run_dbt(["run"]) + check_relations_equal(project.adapter, ["incremental_update_cols", "expected_result"]) diff --git a/dbt-bigquery/tests/functional/adapter/sources_freshness_tests/files.py b/dbt-bigquery/tests/functional/adapter/sources_freshness_tests/files.py new file mode 
100644 index 000000000..eaca96648 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/sources_freshness_tests/files.py @@ -0,0 +1,23 @@ +SCHEMA_YML = """version: 2 +sources: + - name: test_source + freshness: + warn_after: {count: 10, period: hour} + error_after: {count: 1, period: day} + schema: "{{ env_var('DBT_GET_LAST_RELATION_TEST_SCHEMA') }}" + tables: + - name: test_source +""" + +SEED_TEST_SOURCE_CSV = """ +id,name +1,Martin +2,Jeter +3,Ruth +4,Gehrig +5,DiMaggio +6,Torre +7,Mantle +8,Berra +9,Maris +""".strip() diff --git a/dbt-bigquery/tests/functional/adapter/sources_freshness_tests/test_get_relation_last_modified.py b/dbt-bigquery/tests/functional/adapter/sources_freshness_tests/test_get_relation_last_modified.py new file mode 100644 index 000000000..08e263edb --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/sources_freshness_tests/test_get_relation_last_modified.py @@ -0,0 +1,30 @@ +import os +import pytest + +from dbt.tests.util import run_dbt + +from tests.functional.adapter.sources_freshness_tests import files + + +class TestGetLastRelationModified: + @pytest.fixture(scope="class") + def seeds(self): + return {"test_source.csv": files.SEED_TEST_SOURCE_CSV} + + @pytest.fixture(scope="class") + def models(self): + return {"schema.yml": files.SCHEMA_YML} + + @pytest.fixture(scope="class", autouse=True) + def setup(self, project): + # we need the schema name for the sources section + os.environ["DBT_GET_LAST_RELATION_TEST_SCHEMA"] = project.test_schema + run_dbt(["seed"]) + yield + del os.environ["DBT_GET_LAST_RELATION_TEST_SCHEMA"] + + def test_get_last_relation_modified(self, project): + results = run_dbt(["source", "freshness"]) + assert len(results) == 1 + result = results[0] + assert result.status == "pass" diff --git a/dbt-bigquery/tests/functional/adapter/store_test_failures_tests/test_store_test_failures.py b/dbt-bigquery/tests/functional/adapter/store_test_failures_tests/test_store_test_failures.py new file mode 100644 index 000000000..b9de2b80b --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/store_test_failures_tests/test_store_test_failures.py @@ -0,0 +1,48 @@ +import pytest + +from dbt.tests.adapter.store_test_failures_tests import basic +from dbt.tests.adapter.store_test_failures_tests.test_store_test_failures import ( + StoreTestFailuresBase, +) + + +TEST_AUDIT_SCHEMA_SUFFIX = "dbt_test__aud" + + +class TestBigQueryStoreTestFailures(StoreTestFailuresBase): + @pytest.fixture(scope="function", autouse=True) + def teardown_method(self, project): + yield + relation = project.adapter.Relation.create( + database=project.database, schema=f"{project.test_schema}_{TEST_AUDIT_SCHEMA_SUFFIX}" + ) + + project.adapter.drop_schema(relation) + + def test_store_and_assert(self, project): + self.run_tests_store_one_failure(project) + self.run_tests_store_failures_and_assert(project) + + +class TestStoreTestFailuresAsInteractions(basic.StoreTestFailuresAsInteractions): + pass + + +class TestStoreTestFailuresAsProjectLevelOff(basic.StoreTestFailuresAsProjectLevelOff): + pass + + +class TestStoreTestFailuresAsProjectLevelView(basic.StoreTestFailuresAsProjectLevelView): + pass + + +class TestStoreTestFailuresAsGeneric(basic.StoreTestFailuresAsGeneric): + pass + + +class TestStoreTestFailuresAsProjectLevelEphemeral(basic.StoreTestFailuresAsProjectLevelEphemeral): + pass + + +class TestStoreTestFailuresAsExceptions(basic.StoreTestFailuresAsExceptions): + pass diff --git a/dbt-bigquery/tests/functional/adapter/test_aliases.py 
b/dbt-bigquery/tests/functional/adapter/test_aliases.py new file mode 100644 index 000000000..5ce13c8bc --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_aliases.py @@ -0,0 +1,92 @@ +import pytest +import os +from dbt.tests.adapter.aliases.test_aliases import BaseAliases, BaseSameAliasDifferentDatabases + +MACROS__BIGQUERY_CAST_SQL = """ +{% macro bigquery__string_literal(s) %} + cast('{{ s }}' as string) +{% endmacro %} +""" + +MACROS__EXPECT_VALUE_SQL = """ +-- cross-db compatible test, similar to accepted_values + +{% test expect_value(model, field, value) %} + +select * +from {{ model }} +where {{ field }} != '{{ value }}' + +{% endtest %} +""" + +MODELS_DUPE_CUSTOM_DATABASE_A = """ +select {{ string_literal(this.name) }} as tablename +""" + +MODELS_DUPE_CUSTOM_DATABASE_B = """ +select {{ string_literal(this.name) }} as tablename +""" + +MODELS_SCHEMA_YML = """ +version: 2 +models: +- name: model_a + data_tests: + - expect_value: + field: tablename + value: duped_alias +- name: model_b + data_tests: + - expect_value: + field: tablename + value: duped_alias +""" + + +class TestAliasesBigQuery(BaseAliases): + @pytest.fixture(scope="class") + def macros(self): + return { + "bigquery_cast.sql": MACROS__BIGQUERY_CAST_SQL, + "expect_value.sql": MACROS__EXPECT_VALUE_SQL, + } + + +class TestSameTestSameAliasDifferentDatabasesBigQuery(BaseSameAliasDifferentDatabases): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "macro-paths": ["macros"], + "models": { + "test": { + "alias": "duped_alias", + "model_b": {"database": os.getenv("BIGQUERY_TEST_ALT_DATABASE")}, + }, + }, + } + + @pytest.fixture(scope="class") + def macros(self): + return { + "bigquery_cast.sql": MACROS__BIGQUERY_CAST_SQL, + "expect_value.sql": MACROS__EXPECT_VALUE_SQL, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "schema.yml": MODELS_SCHEMA_YML, + "model_a.sql": MODELS_DUPE_CUSTOM_DATABASE_A, + "model_b.sql": MODELS_DUPE_CUSTOM_DATABASE_B, + } + + @pytest.fixture(autouse=True) + def clean_up(self, project): + yield + with project.adapter.connection_named("__test"): + relation = project.adapter.Relation.create( + database=os.getenv("BIGQUERY_TEST_ALT_DATABASE"), schema=project.test_schema + ) + project.adapter.drop_schema(relation) diff --git a/dbt-bigquery/tests/functional/adapter/test_basic.py b/dbt-bigquery/tests/functional/adapter/test_basic.py new file mode 100644 index 000000000..f95f043cf --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_basic.py @@ -0,0 +1,81 @@ +import pytest + +from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations +from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests +from dbt.tests.adapter.basic.test_singular_tests_ephemeral import ( + BaseSingularTestsEphemeral, +) +from dbt.tests.adapter.basic.test_empty import BaseEmpty +from dbt.tests.adapter.basic.test_ephemeral import BaseEphemeral +from dbt.tests.adapter.basic.test_incremental import BaseIncremental +from dbt.tests.adapter.basic.test_generic_tests import BaseGenericTests +from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols +from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp +from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod +from dbt.tests.adapter.basic.test_validate_connection import BaseValidateConnection +from dbt.tests.adapter.basic.test_docs_generate import BaseDocsGenerate +from 
dbt.tests.adapter.basic.expected_catalog import base_expected_catalog +from tests.functional.adapter.expected_stats import bigquery_stats + + +class TestSimpleMaterializationsBigQuery(BaseSimpleMaterializations): + # This test requires a full-refresh to replace a table with a view + @pytest.fixture(scope="class") + def test_config(self): + return {"require_full_refresh": True} + + +class TestSingularTestsBigQuery(BaseSingularTests): + pass + + +class TestSingularTestsEphemeralBigQuery(BaseSingularTestsEphemeral): + pass + + +class TestEmptyBigQuery(BaseEmpty): + pass + + +class TestEphemeralBigQuery(BaseEphemeral): + pass + + +class TestIncrementalBigQuery(BaseIncremental): + pass + + +class TestGenericTestsBigQuery(BaseGenericTests): + pass + + +class TestSnapshotCheckColsBigQuery(BaseSnapshotCheckCols): + pass + + +class TestSnapshotTimestampBigQuery(BaseSnapshotTimestamp): + pass + + +class TestBaseAdapterMethodBigQuery(BaseAdapterMethod): + pass + + +class TestBigQueryValidateConnection(BaseValidateConnection): + pass + + +class TestDocsGenerateBigQuery(BaseDocsGenerate): + @pytest.fixture(scope="class") + def expected_catalog(self, project): + return base_expected_catalog( + project, + role=None, + id_type="INT64", + text_type="STRING", + time_type="DATETIME", + view_type="view", + table_type="table", + model_stats=bigquery_stats(False), + seed_stats=bigquery_stats(True), + ) diff --git a/dbt-bigquery/tests/functional/adapter/test_changing_relation_type.py b/dbt-bigquery/tests/functional/adapter/test_changing_relation_type.py new file mode 100644 index 000000000..b2918cffc --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_changing_relation_type.py @@ -0,0 +1,9 @@ +from dbt.tests.adapter.relations.test_changing_relation_type import BaseChangeRelationTypeValidator + + +class TestBigQueryChangeRelationTypes(BaseChangeRelationTypeValidator): + def test_changing_materialization_changes_relation_type(self, project): + self._run_and_check_materialization("view") + self._run_and_check_materialization("table", extra_args=["--full-refresh"]) + self._run_and_check_materialization("view", extra_args=["--full-refresh"]) + self._run_and_check_materialization("incremental", extra_args=["--full-refresh"]) diff --git a/dbt-bigquery/tests/functional/adapter/test_copy_materialization.py b/dbt-bigquery/tests/functional/adapter/test_copy_materialization.py new file mode 100644 index 000000000..77d5f4af6 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_copy_materialization.py @@ -0,0 +1,62 @@ +import pytest +from pathlib import Path +from dbt.tests.util import run_dbt, write_file, check_relations_equal + +_SEED_A = """ +load_date,id,first_name,last_name,email,gender,ip_address +2021-03-05,1,Jack,Hunter,jhunter0@pbs.org,Male,59.80.20.168 +2021-03-05,2,Kathryn,Walker,kwalker1@ezinearticles.com,Female,194.121.179.35 +2021-03-05,3,Gerald,Ryan,gryan2@com.com,Male,11.3.212.243 +""".lstrip() + +_SEED_B = """ +load_date,id,first_name,last_name,email,gender,ip_address +2021-03-05,4,Bonnie,Spencer,bspencer3@ameblo.jp,Female,216.32.196.175 +2021-03-05,5,Harold,Taylor,htaylor4@people.com.cn,Male,253.10.246.136 +""".lstrip() + +_EXPECTED_RESULT = """ +load_date,id,first_name,last_name,email,gender,ip_address +2021-03-05,1,Jack,Hunter,jhunter0@pbs.org,Male,59.80.20.168 +2021-03-05,2,Kathryn,Walker,kwalker1@ezinearticles.com,Female,194.121.179.35 +2021-03-05,3,Gerald,Ryan,gryan2@com.com,Male,11.3.212.243 +2021-03-05,4,Bonnie,Spencer,bspencer3@ameblo.jp,Female,216.32.196.175 
+2021-03-05,5,Harold,Taylor,htaylor4@people.com.cn,Male,253.10.246.136 +""".lstrip() + +_COPY_MODEL = """ +{{ config( + materialized="copy", + copy_materialization="incremental", +) }} + +SELECT * FROM {{ ref("seed") }} +""" + + +class BaseCopyModelConfig: + @pytest.fixture(scope="class") + def models(self): + return {"copy_model.sql": _COPY_MODEL} + + @pytest.fixture(scope="class") + def seeds(self): + return { + "seed.csv": _SEED_A, + "expected_result.csv": _EXPECTED_RESULT, + } + + +class TestCopyMaterialization(BaseCopyModelConfig): + def test_incremental_copy(self, project): + run_dbt(["seed"]) + run_dbt(["run"]) + + # Replace original seed _SEED_A with _SEED_B + seed_file = project.project_root / Path("seeds") / Path("seed.csv") + write_file(_SEED_B, seed_file) + + run_dbt(["seed"]) + run_dbt(["run"]) + + check_relations_equal(project.adapter, ["copy_model", "expected_result"]) diff --git a/dbt-bigquery/tests/functional/adapter/test_dbt_debug.py b/dbt-bigquery/tests/functional/adapter/test_dbt_debug.py new file mode 100644 index 000000000..0741d7228 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_dbt_debug.py @@ -0,0 +1,14 @@ +from dbt.tests.util import run_dbt +from dbt.tests.adapter.dbt_debug.test_dbt_debug import BaseDebug, BaseDebugProfileVariable + + +class TestDebugBigQuery(BaseDebug): + def test_ok_bigquery(self, project): + run_dbt(["debug"]) + assert "ERROR" not in self.capsys.readouterr().out + + +class TestDebugProfileVariableBigQuery(BaseDebugProfileVariable): + def test_ok_bigquery(self, project): + run_dbt(["debug"]) + assert "ERROR" not in self.capsys.readouterr().out diff --git a/dbt-bigquery/tests/functional/adapter/test_grant_access_to.py b/dbt-bigquery/tests/functional/adapter/test_grant_access_to.py new file mode 100644 index 000000000..633cebe92 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_grant_access_to.py @@ -0,0 +1,102 @@ +import time + +import pytest + +from dbt.tests.util import run_dbt + + +def select_1(dataset: str, materialized: str): + config = f"""config( + materialized='{materialized}', + grant_access_to=[ + {{'project': 'dbt-test-env', 'dataset': '{dataset}'}}, + ] + )""" + return ( + "{{" + + config + + "}}" + + """ + SELECT 1 as one""" + ) + + +BAD_CONFIG_TABLE_NAME = "bad_view" +BAD_CONFIG_TABLE = """ +{{ config( + materialized='view', + grant_access_to=[ + {'project': 'dbt-test-env', 'dataset': 'NonExistentDataset'}, + ] +) }} + +SELECT 1 as one +""" + +BAD_CONFIG_CHILD_TABLE = "SELECT 1 as one FROM {{ref('" + BAD_CONFIG_TABLE_NAME + "')}}" + + +def get_schema_name(base_schema_name: str) -> str: + return f"{base_schema_name}_grant_access" + + +class TestAccessGrantSucceeds: + @pytest.fixture(scope="class") + def setup_grant_schema( + self, + project, + unique_schema, + ): + with project.adapter.connection_named("__test_grants"): + relation = project.adapter.Relation.create( + database=project.database, + schema=get_schema_name(unique_schema), + identifier="grant_access", + ) + project.adapter.create_schema(relation) + yield relation + + @pytest.fixture(scope="class") + def teardown_grant_schema( + self, + project, + unique_schema, + ): + yield + with project.adapter.connection_named("__test_grants"): + relation = project.adapter.Relation.create( + database=project.database, schema=get_schema_name(unique_schema) + ) + project.adapter.drop_schema(relation) + + @pytest.fixture(scope="class") + def models(self, unique_schema): + dataset = get_schema_name(unique_schema) + return { + "select_1.sql": 
select_1(dataset=dataset, materialized="view"), + "select_1_table.sql": select_1(dataset=dataset, materialized="table"), + } + + def test_grant_access_succeeds(self, project, setup_grant_schema, teardown_grant_schema): + # Need to run twice to validate idempotency + results = run_dbt(["run"]) + assert len(results) == 2 + time.sleep(10) + results = run_dbt(["run"]) + assert len(results) == 2 + + +class TestAccessGrantFails: + @pytest.fixture(scope="class") + def models(self): + return { + "bad_config_table_child.sql": BAD_CONFIG_CHILD_TABLE, + f"{BAD_CONFIG_TABLE_NAME}.sql": BAD_CONFIG_TABLE, + } + + def test_grant_access_fails_without_running_child_table(self, project): + # Need to run twice to validate idempotency + results = run_dbt(["run"], expect_pass=False) + assert results[0].status == "error" + assert results[1].status == "skipped" + assert results[0].message.startswith("404 GET https://bigquery.googleapis.com/") diff --git a/dbt-bigquery/tests/functional/adapter/test_grants.py b/dbt-bigquery/tests/functional/adapter/test_grants.py new file mode 100644 index 000000000..38f7e0ccc --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_grants.py @@ -0,0 +1,43 @@ +from dbt.tests.adapter.grants.base_grants import BaseGrants +from dbt.tests.adapter.grants.test_model_grants import BaseModelGrants +from dbt.tests.adapter.grants.test_incremental_grants import BaseIncrementalGrants +from dbt.tests.adapter.grants.test_invalid_grants import BaseInvalidGrants +from dbt.tests.adapter.grants.test_seed_grants import BaseSeedGrants +from dbt.tests.adapter.grants.test_snapshot_grants import BaseSnapshotGrants + + +class BaseGrantsBigQuery(BaseGrants): + def privilege_grantee_name_overrides(self): + return { + "select": "roles/bigquery.dataViewer", + "insert": "roles/bigquery.dataEditor", + "fake_privilege": "roles/invalid", + "invalid_user": "user:fake@dbtlabs.com", + } + + +class TestModelGrantsBigQuery(BaseGrantsBigQuery, BaseModelGrants): + pass + + +class TestIncrementalGrantsBigQuery(BaseGrantsBigQuery, BaseIncrementalGrants): + pass + + +class TestSeedGrantsBigQuery(BaseGrantsBigQuery, BaseSeedGrants): + # seeds in dbt-bigquery are always "full refreshed," in such a way that + # the grants do not carry over + def seeds_support_partial_refresh(self): + return False + + +class TestSnapshotGrantsBigQuery(BaseGrantsBigQuery, BaseSnapshotGrants): + pass + + +class TestInvalidGrantsBigQuery(BaseGrantsBigQuery, BaseInvalidGrants): + def grantee_does_not_exist_error(self): + return "User fake@dbtlabs.com does not exist." + + def privilege_does_not_exist_error(self): + return "Role roles/invalid is not supported for this resource." 
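The next file, test_json_keyfile.py, exercises _is_base64 from dbt.adapters.bigquery.credentials, which is presumably used to decide whether a keyfile payload is base64-encoded JSON rather than raw JSON. As a minimal sketch consistent with the cases asserted in that file — an illustration under that assumption, not the adapter's actual implementation — the check could look like this:

import base64
import binascii


def _is_base64(value) -> bool:
    # Illustrative sketch only; the real adapter code may differ.
    # Anything that is not str/bytes (e.g. an int) is never base64.
    if not isinstance(value, (str, bytes)):
        return False
    try:
        if isinstance(value, str):
            # Base64 text is pure ASCII, so strings containing characters
            # like "\xff" are rejected here.
            value = value.encode("ascii")
        # validate=True rejects characters outside the base64 alphabet
        # (spaces, "#", "{", ...); b64decode also raises on bad padding.
        base64.b64decode(value, validate=True)
        return True
    except (binascii.Error, UnicodeEncodeError):
        return False

Under this sketch, empty strings and empty bytes decode cleanly and are treated as valid base64, which matches the expectations in the tests below.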
diff --git a/dbt-bigquery/tests/functional/adapter/test_json_keyfile.py b/dbt-bigquery/tests/functional/adapter/test_json_keyfile.py new file mode 100644 index 000000000..a5caaebdf --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_json_keyfile.py @@ -0,0 +1,87 @@ +import base64 +import json +import pytest +from dbt.adapters.bigquery.credentials import _is_base64 + + +def string_to_base64(s): + return base64.b64encode(s.encode("utf-8")) + + +@pytest.fixture +def example_json_keyfile(): + keyfile = json.dumps( + { + "type": "service_account", + "project_id": "", + "private_key_id": "", + "private_key": "-----BEGIN PRIVATE KEY----------END PRIVATE KEY-----\n", + "client_email": "", + "client_id": "", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "", + } + ) + + return keyfile + + +@pytest.fixture +def example_json_keyfile_b64(): + keyfile = json.dumps( + { + "type": "service_account", + "project_id": "", + "private_key_id": "", + "private_key": "-----BEGIN PRIVATE KEY----------END PRIVATE KEY-----\n", + "client_email": "", + "client_id": "", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "", + } + ) + + return string_to_base64(keyfile) + + +def test_valid_base64_strings(example_json_keyfile_b64): + valid_strings = [ + "SGVsbG8gV29ybGQh", # "Hello World!" + "Zm9vYmFy", # "foobar" + "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVowMTIzNDU2Nzg5", # A long string + "", # Empty string + example_json_keyfile_b64.decode("utf-8"), + ] + + for s in valid_strings: + assert _is_base64(s) is True + + +def test_valid_base64_bytes(example_json_keyfile_b64): + valid_bytes = [ + b"SGVsbG8gV29ybGQh", # "Hello World!" 
+ b"Zm9vYmFy", # "foobar" + b"QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVowMTIzNDU2Nzg5", # A long string + b"", # Empty bytes + example_json_keyfile_b64, + ] + for s in valid_bytes: + assert _is_base64(s) is True + + +def test_invalid_base64(example_json_keyfile): + invalid_inputs = [ + "This is not Base64", + "SGVsbG8gV29ybGQ", # Incorrect padding + "Invalid#Base64", + 12345, # Not a string or bytes + b"Invalid#Base64", + "H\xffGVsbG8gV29ybGQh", # Contains invalid character \xff + example_json_keyfile, + ] + for s in invalid_inputs: + assert _is_base64(s) is False diff --git a/dbt-bigquery/tests/functional/adapter/test_persist_docs.py b/dbt-bigquery/tests/functional/adapter/test_persist_docs.py new file mode 100644 index 000000000..d2c7509f5 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_persist_docs.py @@ -0,0 +1,213 @@ +import json +import pytest + +from dbt.tests.util import run_dbt + +from dbt.tests.adapter.persist_docs.test_persist_docs import ( + BasePersistDocsBase, + BasePersistDocs, +) + +_MODELS__TABLE_MODEL_NESTED = """ +{{ config(materialized='table') }} +SELECT + STRUCT( + STRUCT( + 1 AS level_3_a, + 2 AS level_3_b + ) AS level_2 + ) AS level_1 +""" + +_MODELS__VIEW_MODEL_NESTED = """ +{{ config(materialized='view') }} +SELECT + STRUCT( + STRUCT( + 1 AS level_3_a, + 2 AS level_3_b + ) AS level_2 + ) AS level_1 +""" + +_PROPERTIES__MODEL_COMMENTS = """ +version: 2 + +models: + - name: table_model_nested + columns: + - name: level_1 + description: level_1 column description + - name: level_1.level_2 + description: level_2 column description + - name: level_1.level_2.level_3_a + description: level_3 column description + - name: view_model_nested + columns: + - name: level_1 + description: level_1 column description + - name: level_1.level_2 + description: level_2 column description + - name: level_1.level_2.level_3_a + description: level_3 column description +""" + + +class TestBasePersistDocs(BasePersistDocs): + def _assert_has_view_comments( + self, view_node, has_node_comments=True, has_column_comments=True + ): + view_comment = view_node["metadata"]["comment"] + if has_node_comments: + assert view_comment.startswith("View model description") + self._assert_common_comments(view_comment) + else: + assert not view_comment + + view_id_comment = view_node["columns"]["id"]["comment"] + if has_column_comments: + assert view_id_comment.startswith("id Column description") + self._assert_common_comments(view_id_comment) + else: + assert not view_id_comment + + view_name_comment = view_node["columns"]["name"]["comment"] + assert not view_name_comment + + +class TestPersistDocsSimple(BasePersistDocsBase): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "models": { + "test": { + "+persist_docs": { + "relation": True, + "columns": True, + }, + } + }, + "seeds": { + "test": { + "+persist_docs": { + "relation": True, + "columns": True, + }, + } + }, + } + + def test_persist_docs(self, project): + run_dbt(["seed"]) + run_dbt() + desc_map = { + "seed": "Seed model description", + "table_model": "Table model description", + "view_model": "View model description", + } + for node_id in ["seed", "table_model", "view_model"]: + with project.adapter.connection_named("_test"): + client = project.adapter.connections.get_thread_connection().handle + + table_id = "{}.{}.{}".format(project.database, project.test_schema, node_id) + bq_table = client.get_table(table_id) + + bq_schema = bq_table.schema + + assert bq_table.description.startswith(desc_map[node_id]) + 
assert bq_schema[0].description.startswith("id Column description ")
+            if not node_id.startswith("view"):
+                assert bq_schema[1].description.startswith(
+                    "Some stuff here and then a call to"
+                )
+
+
+class TestPersistDocsColumnMissing(BasePersistDocsBase):
+    @pytest.fixture(scope="class")
+    def project_config_update(self):
+        return {
+            "models": {
+                "test": {
+                    "+persist_docs": {
+                        "columns": True,
+                    },
+                }
+            }
+        }
+
+    def test_missing_column(self, project):
+        run_dbt()
+
+
+class TestPersistDocsNested(BasePersistDocsBase):
+    @pytest.fixture(scope="class")
+    def properties(self):
+        return {"schema.yml": _PROPERTIES__MODEL_COMMENTS}
+
+    @pytest.fixture(scope="class")
+    def models(self):
+        return {
+            "table_model_nested.sql": _MODELS__TABLE_MODEL_NESTED,
+            "view_model_nested.sql": _MODELS__VIEW_MODEL_NESTED,
+        }
+
+    @pytest.fixture(scope="class")
+    def project_config_update(self):
+        return {
+            "models": {
+                "test": {
+                    "+persist_docs": {
+                        "relation": True,
+                        "columns": True,
+                    },
+                }
+            }
+        }
+
+    def test_persist_docs(self, project):
+        """
+        Run dbt and use the BigQuery client from the adapter to check that the
+        column descriptions are persisted on the test model table and view.
+
+        Next, generate the catalog and check if the comments are also included.
+
+        Note: dbt-bigquery does not allow comments on models with child nodes
+        """
+        run_dbt(["seed"])
+        run_dbt()
+
+        run_dbt(["docs", "generate"])
+        with open("target/catalog.json") as fp:
+            catalog_data = json.load(fp)
+        assert "nodes" in catalog_data
+        assert len(catalog_data["nodes"]) == 3  # seed, table, and view model
+
+        for node_id in ["table_model_nested", "view_model_nested"]:
+            # check the descriptions using the api
+            with project.adapter.connection_named("_test"):
+                client = project.adapter.connections.get_thread_connection().handle
+
+                table_id = "{}.{}.{}".format(project.database, project.test_schema, node_id)
+                bq_schema = client.get_table(table_id).schema
+
+                level_1_field = bq_schema[0]
+                assert level_1_field.description == "level_1 column description"
+
+                level_2_field = level_1_field.fields[0]
+                assert level_2_field.description == "level_2 column description"
+
+                level_2_field = level_1_field.fields[0]
+                level_3_field = level_2_field.fields[0]
+                assert level_3_field.description == "level_3 column description"
+
+            # check the descriptions in the catalog
+            node = catalog_data["nodes"]["model.test.{}".format(node_id)]
+
+            level_1_column = node["columns"]["level_1"]
+            assert level_1_column["comment"] == "level_1 column description"
+
+            level_2_column = node["columns"]["level_1.level_2"]
+            assert level_2_column["comment"] == "level_2 column description"
+
+            level_3_column = node["columns"]["level_1.level_2.level_3_a"]
+            assert level_3_column["comment"] == "level_3 column description"
diff --git a/dbt-bigquery/tests/functional/adapter/test_python_model.py b/dbt-bigquery/tests/functional/adapter/test_python_model.py
new file mode 100644
index 000000000..bc5d8c91c
--- /dev/null
+++ b/dbt-bigquery/tests/functional/adapter/test_python_model.py
@@ -0,0 +1,269 @@
+import os
+import pytest
+import time
+from dbt.tests.util import run_dbt, run_dbt_and_capture, write_file
+import dbt.tests.adapter.python_model.test_python_model as dbt_tests
+
+TEST_SKIP_MESSAGE = (
+    "Skipping the Tests since Dataproc serverless is not stable. 
" "TODO: Fix later" +) + +blocks_for_thirty_sec = """ +def model(dbt, _): + dbt.config( + materialized='table', + timeout=5 + ) + import pandas as pd + data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + df = pd.DataFrame.from_dict(data) + import time + time.sleep(30) + return df +""" + + +class TestPythonModelDataprocTimeoutTest: + @pytest.fixture(scope="class") + def models(self): + return {"30_sec_python_model.py": blocks_for_thirty_sec} + + def test_model_times_out(self, project): + result, output = run_dbt_and_capture(["run"], expect_pass=False) + assert len(result) == 1 + assert "Operation did not complete within the designated timeout of 5 seconds." in output + + +class TestPythonModelDataproc(dbt_tests.BasePythonModelTests): + pass + + +@pytest.mark.skip(reason=TEST_SKIP_MESSAGE) +class TestPythonIncrementalMatsDataproc(dbt_tests.BasePythonIncrementalTests): + pass + + +models__simple_python_model = """ +import pandas + +def model(dbt, spark): + dbt.config( + materialized='table', + ) + data = [[1,2]] * 10 + return spark.createDataFrame(data, schema=['test', 'test2']) +""" + +macro__partition_count_sql = """ +{% test number_partitions(model, expected) %} + + {%- set result = get_partitions_metadata(model) %} + + {% if result %} + {% set partitions = result.columns['partition_id'].values() %} + {% else %} + {% set partitions = () %} + {% endif %} + + {% set actual = partitions | length %} + {% set success = 1 if model and actual == expected else 0 %} + + select 'Expected {{ expected }}, but got {{ actual }}' as validation_error + from (select true) + where {{ success }} = 0 + +{% endtest %} +""" + +models__partitioned_model_python = """ +import pandas as pd + +def model(dbt, spark): + dbt.config( + materialized='table', + partition_by={ + "field": "C", + "data_type": "timestamp", + "granularity": "day", + }, + cluster_by=["A"], + ) + random_array = [ + ["A", -157.9871329592354], + ["B", -528.9769041860632], + ["B", 941.0504221837489], + ["B", 919.5903586746183], + ["A", -121.25678519054622], + ["A", 254.9985130814921], + ["A", 833.2963094260072], + ] + + df = pd.DataFrame(random_array, columns=["A", "B"]) + + df["C"] = pd.to_datetime('now') + + final_df = df[["A", "B", "C"]] + + return final_df +""" + +models__partitioned_model_yaml = """ +models: + - name: python_partitioned_model + description: A random table with a calculated column defined in python. 
+ config: + batch_id: '{{ run_started_at.strftime("%Y-%m-%d-%H-%M-%S") }}-python-partitioned' + tests: + - number_partitions: + expected: "{{ var('expected', 1) }}" + columns: + - name: A + description: Column A + - name: B + description: Column B + - name: C + description: Column C +""" + + +class TestPythonPartitionedModels: + @pytest.fixture(scope="class") + def macros(self): + return {"partition_metadata.sql": macro__partition_count_sql} + + @pytest.fixture(scope="class") + def models(self): + return { + "python_partitioned_model.py": models__partitioned_model_python, + "python_partitioned_model.yml": models__partitioned_model_yaml, + } + + def test_multiple_named_python_models(self, project): + result = run_dbt(["run"]) + assert len(result) == 1 + + test_results = run_dbt(["test"]) + for result in test_results: + assert result.status == "pass" + assert not result.skipped + assert result.failures == 0 + + +models__simple_python_model_v2 = """ +import pandas + +def model(dbt, spark): + dbt.config( + materialized='table', + ) + data = [[1,2]] * 10 + return spark.createDataFrame(data, schema=['test1', 'test3']) +""" + +models__python_array_batch_id_python = """ +import pandas as pd + +def model(dbt, spark): + random_array = [ + [9001.3985362160208, -157.9871329592354], + [-817.8786101352823, -528.9769041860632], + [-886.6488625065194, 941.0504221837489], + [6.69525238666165, 919.5903586746183], + [754.3718741592056, -121.25678519054622], + [-352.3158889341157, 254.9985130814921], + [563.0633042715097, 833.2963094260072], + ] + + df = pd.DataFrame(random_array, columns=["A", "B"]) + + df["C"] = df["A"] * df["B"] + + final_df = df[["A", "B", "C"]] + + return final_df +""" + +models__python_array_batch_id_yaml = """ +models: + - name: python_array_batch_id + description: A random table with a calculated column defined in python. + columns: + - name: A + description: Column A + - name: B + description: Column B + - name: C + description: Column C +""" + +custom_ts_id = str("custom-" + str(time.time()).replace(".", "-")) + +models__bad_python_array_batch_id_yaml = f""" +models: + - name: python_array_batch_id + description: A random table with a calculated column defined in python. 
+ config: + batch_id: {custom_ts_id}-python-array + columns: + - name: A + description: Column A + - name: B + description: Column B + - name: C + description: Column C +""" + + +class TestPythonBatchIdModels: + @pytest.fixture(scope="class") + def models(self): + return { + "python_array_batch_id.py": models__python_array_batch_id_python, + "python_array_batch_id.yml": models__python_array_batch_id_yaml, + } + + def test_multiple_named_python_models(self, project): + result, output = run_dbt_and_capture(["run"], expect_pass=True) + time.sleep(5) # In case both runs are submitted simultaneously + result_two, output_two = run_dbt_and_capture(["run"], expect_pass=True) + assert len(result) == 1 + assert len(result_two) == 1 + + +class TestPythonDuplicateBatchIdModels: + @pytest.fixture(scope="class") + def models(self): + return { + "python_array_batch_id.py": models__python_array_batch_id_python, + "python_array_batch_id.yml": models__bad_python_array_batch_id_yaml, + } + + def test_multiple_python_models_fixed_id(self, project): + result, output = run_dbt_and_capture(["run"], expect_pass=True) + result_two, output_two = run_dbt_and_capture(["run"], expect_pass=False) + assert result_two[0].message.startswith("409 Already exists: Failed to create batch:") + assert len(result) == 1 + assert len(result_two) == 1 + + +@pytest.mark.skip(reason=TEST_SKIP_MESSAGE) +class TestChangingSchemaDataproc: + @pytest.fixture(scope="class") + def models(self): + return {"simple_python_model.py": models__simple_python_model} + + def test_changing_schema(self, project, logs_dir): + run_dbt(["run"]) + write_file( + models__simple_python_model_v2, + project.project_root + "/models", + "simple_python_model.py", + ) + run_dbt(["run"]) + log_file = os.path.join(logs_dir, "dbt.log") + with open(log_file, "r") as f: + log = f.read() + # validate #5510 log_code_execution works + assert "On model.test.simple_python_model:" in log + assert "return spark.createDataFrame(data, schema=['test1', 'test3'])" in log + assert "Execution status: OK in" in log diff --git a/dbt-bigquery/tests/functional/adapter/test_simple_seed.py b/dbt-bigquery/tests/functional/adapter/test_simple_seed.py new file mode 100644 index 000000000..5ec19d420 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_simple_seed.py @@ -0,0 +1,192 @@ +import pytest +from dbt.tests.adapter.simple_seed.fixtures import macros__schema_test +from dbt.tests.adapter.simple_seed.seeds import seeds__enabled_in_config_csv, seeds__tricky_csv +from dbt.tests.adapter.simple_seed.test_seed import SeedConfigBase +from dbt.tests.adapter.simple_seed.test_seed import BaseTestEmptySeed +from dbt.tests.adapter.utils.base_utils import run_dbt + +_SEED_CONFIGS_CSV = """ +seed_id,stuff +1,a +2,b +""".lstrip() + +_SCHEMA_YML = """ +version: 2 +seeds: +- name: seed_enabled + columns: + - name: birthday + tests: + - column_type: + type: STRING + - name: seed_id + tests: + - column_type: + type: FLOAT64 + +- name: seed_tricky + columns: + - name: seed_id + tests: + - column_type: + type: INT64 + - name: seed_id_str + tests: + - column_type: + type: STRING + - name: a_bool + tests: + - column_type: + type: BOOLEAN + - name: looks_like_a_bool + tests: + - column_type: + type: STRING + - name: a_date + tests: + - column_type: + type: DATETIME + - name: looks_like_a_date + tests: + - column_type: + type: STRING + - name: relative + tests: + - column_type: + type: STRING + - name: weekday + tests: + - column_type: + type: STRING + +- name: seed_configs + config: + 
hours_to_expiration: 2 + labels: + contains_pii: 'yes' + contains_pie: 'no' +""".lstrip() + + +class TestSimpleSeedConfigs(SeedConfigBase): + @pytest.fixture(scope="class") + def schema(self): + return "simple_seed" + + @pytest.fixture(scope="class") + def seeds(self): + return { + "seed_enabled.csv": seeds__enabled_in_config_csv, + "seed_tricky.csv": seeds__tricky_csv, + "seed_configs.csv": _SEED_CONFIGS_CSV, + } + + @pytest.fixture(scope="class") + def macros(self): + return { + "schema_test.sql": macros__schema_test, + } + + @pytest.fixture(scope="class") + def models(self): + return {"models-bq.yml": _SCHEMA_YML} + + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "seeds": { + "test": { + "enabled": False, + "quote_columns": True, + "seed_enabled": { + "enabled": True, + "+column_types": self.seed_enabled_types(), + }, + "seed_tricky": { + "enabled": True, + "+column_types": self.seed_tricky_types(), + }, + "seed_configs": { + "enabled": True, + }, + }, + }, + } + + @staticmethod + def seed_enabled_types(): + return { + "seed_id": "FLOAT64", + "birthday": "STRING", + } + + @staticmethod + def seed_tricky_types(): + return { + "seed_id_str": "STRING", + "looks_like_a_bool": "STRING", + "looks_like_a_date": "STRING", + } + + @staticmethod + def table_labels(): + return {"contains_pii": "yes", "contains_pie": "no"} + + def test__bigquery_simple_seed_with_column_override_bigquery(self, project): + seed_results = run_dbt(["seed"]) + assert len(seed_results) == 3 + test_results = run_dbt(["test"]) + assert len(test_results) == 10 + + def test__bigquery_seed_table_with_labels_config_bigquery(self, project): + seed_results = run_dbt(["seed"]) + assert len(seed_results) == 3 + with project.adapter.connection_named("_test"): + client = project.adapter.connections.get_thread_connection().handle + table_id = "{}.{}.{}".format(project.database, project.test_schema, "seed_configs") + bq_table = client.get_table(table_id) + + assert bq_table.labels + assert bq_table.labels == self.table_labels() + assert bq_table.expires + + +class TestBigQueryEmptySeed(BaseTestEmptySeed): + pass + + +class TestBigQuerySeedWithUniqueDelimiter(TestSimpleSeedConfigs): + @pytest.fixture(scope="class") + def seeds(self): + return { + "seed_enabled.csv": seeds__enabled_in_config_csv.replace(",", "|"), + "seed_tricky.csv": seeds__tricky_csv.replace(",", "\t"), + "seed_configs.csv": _SEED_CONFIGS_CSV, + } + + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "seeds": { + "test": { + "enabled": False, + "quote_columns": True, + "seed_enabled": { + "enabled": True, + "+column_types": self.seed_enabled_types(), + "delimiter": "|", + }, + "seed_tricky": { + "enabled": True, + "+column_types": self.seed_tricky_types(), + "delimiter": "\t", + }, + "seed_configs": { + "enabled": True, + }, + }, + }, + } diff --git a/dbt-bigquery/tests/functional/adapter/test_simple_snaphot.py b/dbt-bigquery/tests/functional/adapter/test_simple_snaphot.py new file mode 100644 index 000000000..50da51a76 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_simple_snaphot.py @@ -0,0 +1,103 @@ +import pytest + +from dbt.tests.util import run_dbt + +from dbt.tests.adapter.simple_snapshot.test_snapshot import ( + BaseSimpleSnapshotBase, + BaseSnapshotCheck, +) + +SNAPSHOT_TIMESTAMP_SQL = """ +{% snapshot snapshot %} + {{ config( + target_database=database, + target_schema=schema, + unique_key='id', + strategy='timestamp', + 
updated_at='updated_at_ts', + invalidate_hard_deletes=True, + ) }} + select *, timestamp(updated_at) as updated_at_ts from {{ ref('fact') }} +{% endsnapshot %} +""" + + +class TestSnapshot(BaseSimpleSnapshotBase): + # Not importing the base case because the test_updates* tests need modification for updating intervals + @pytest.fixture(scope="class") + def snapshots(self): + # Using the snapshot defined in the class itself rather than the base case + # Reason: dbt-bigquery:#3710: UNION ALL issue when running snapshots with invalidate_hard_deletes=True + return {"snapshot.sql": SNAPSHOT_TIMESTAMP_SQL} + + def test_updates_are_captured_by_snapshot(self, project): + """ + Update the last 5 records. Show that all ids are current, but the last 5 reflect updates. + """ + date_add_expression = "date_add(updated_at, interval 1 day)" + self.update_fact_records({"updated_at": date_add_expression}, "id between 16 and 20") + run_dbt(["snapshot"]) + self._assert_results( + ids_with_current_snapshot_records=range(1, 21), + ids_with_closed_out_snapshot_records=range(16, 21), + ) + + def test_inserts_are_captured_by_snapshot(self, project): + """ + Insert 10 records. Show that there are 30 records in `snapshot`, all of which are current. + """ + self.insert_fact_records("id between 21 and 30") + run_dbt(["snapshot"]) + self._assert_results( + ids_with_current_snapshot_records=range(1, 31), ids_with_closed_out_snapshot_records=[] + ) + + def test_deletes_are_captured_by_snapshot(self, project): + """ + Hard delete the last five records. Show that there are now only 15 current records and 5 expired records. + """ + self.delete_fact_records("id between 16 and 20") + run_dbt(["snapshot"]) + self._assert_results( + ids_with_current_snapshot_records=range(1, 16), + ids_with_closed_out_snapshot_records=range(16, 21), + ) + + def test_revives_are_captured_by_snapshot(self, project): + """ + Delete the last five records and run snapshot to collect that information, then revive 3 of those records. + Show that there are now 18 current records and 5 expired records. + """ + self.delete_fact_records("id between 16 and 20") + run_dbt(["snapshot"]) + self.insert_fact_records("id between 16 and 18") + run_dbt(["snapshot"]) + self._assert_results( + ids_with_current_snapshot_records=range(1, 19), + ids_with_closed_out_snapshot_records=range(16, 21), + ) + + def test_new_column_captured_by_snapshot(self, project): + """ + Add a column to `fact` and populate the last 10 records with a non-null value. + Show that all ids are current, but the last 10 reflect updates and the first 10 don't + i.e. 
if the column is added, but not updated, the record doesn't reflect that it's updated + """ + self.add_fact_column("full_name", "varchar(200) default null") + date_add_expression = "date_add(date(updated_at), interval 1 day)" + self.update_fact_records( + { + "full_name": "first_name || ' ' || last_name", + "updated_at": date_add_expression, + }, + "id between 11 and 20", + ) + run_dbt(["snapshot"]) + self._assert_results( + ids_with_current_snapshot_records=range(1, 21), + ids_with_closed_out_snapshot_records=range(11, 21), + ) + + +class TestSnapshotCheck(BaseSnapshotCheck): + pass diff --git a/dbt-bigquery/tests/functional/adapter/test_string_literal_macro.py b/dbt-bigquery/tests/functional/adapter/test_string_literal_macro.py new file mode 100644 index 000000000..d67f4be71 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/test_string_literal_macro.py @@ -0,0 +1,17 @@ +import pytest +from dbt.tests.util import run_dbt + + +_MODEL_SQL = """ +select {{ dbt.string_literal('my multiline +string') }} as test +""" + + +class TestStringLiteralQuoting: + @pytest.fixture(scope="class") + def models(self): + return {"my_model.sql": _MODEL_SQL} + + def test_string_literal_quoting(self, project): + run_dbt() diff --git a/dbt-bigquery/tests/functional/adapter/unit_testing/test_unit_testing.py b/dbt-bigquery/tests/functional/adapter/unit_testing/test_unit_testing.py new file mode 100644 index 000000000..f4d4ef1e2 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/unit_testing/test_unit_testing.py @@ -0,0 +1,64 @@ +import pytest +from dbt.tests.adapter.unit_testing.test_types import BaseUnitTestingTypes +from dbt.tests.adapter.unit_testing.test_case_insensitivity import BaseUnitTestCaseInsensivity +from dbt.tests.adapter.unit_testing.test_invalid_input import BaseUnitTestInvalidInput + + +class TestBigQueryUnitTestingTypes(BaseUnitTestingTypes): + @pytest.fixture + def data_types(self): + # sql_value, yaml_value + return [ + ["1", "1"], + ["'1'", "1"], + ["cast('true' as boolean)", "true"], + ["1.0", "1.0"], + ["'string value'", "string value"], + ["cast(1.0 as numeric)", "1.0"], + ["cast(1 as bigint)", 1], + ["cast('2019-01-01' as date)", "2019-01-01"], + ["cast('2013-11-03 00:00:00-07' as timestamp)", "2013-11-03 00:00:00-07"], + ["st_geogpoint(75, 45)", "'st_geogpoint(75, 45)'"], + # arrays + ["cast(['a','b','c'] as array)", "['a','b','c']"], + ["cast([1,2,3] as array)", "[1,2,3]"], + ["cast([true,true,false] as array)", "[true,true,false]"], + # array of date + ["[date '2019-01-01']", "['2020-01-01']"], + ["[date '2019-01-01']", "[]"], + ["[date '2019-01-01']", "null"], + # array of timestamp + ["[timestamp '2019-01-01']", "['2020-01-01']"], + ["[timestamp '2019-01-01']", "[]"], + ["[timestamp '2019-01-01']", "null"], + # json + [ + """json '{"name": "Cooper", "forname": "Alice"}'""", + """{"name": "Cooper", "forname": "Alice"}""", + ], + ["""json '{"name": "Cooper", "forname": "Alice"}'""", "{}"], + # structs + ["struct('Isha' as name, 22 as age)", """'struct("Isha" as name, 22 as age)'"""], + [ + "struct('Kipketer' AS name, [23.2, 26.1, 27.3, 29.4] AS laps)", + """'struct("Kipketer" AS name, [23.2, 26.1, 27.3, 29.4] AS laps)'""", + ], + # struct of struct + [ + "struct(struct(1 as id, 'blue' as color) as my_struct)", + """'struct(struct(1 as id, "blue" as color) as my_struct)'""", + ], + # array of struct + [ + "[struct(st_geogpoint(75, 45) as my_point), struct(st_geogpoint(75, 35) as my_point)]", + "['struct(st_geogpoint(75, 45) as my_point)', 'struct(st_geogpoint(75, 35) as 
my_point)']", + ], + ] + + +class TestBigQueryUnitTestCaseInsensitivity(BaseUnitTestCaseInsensivity): + pass + + +class TestBigQueryUnitTestInvalidInput(BaseUnitTestInvalidInput): + pass diff --git a/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/csv/source.csv b/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/csv/source.csv new file mode 100644 index 000000000..a8f87412e --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/csv/source.csv @@ -0,0 +1,101 @@ +favorite_color,id,first_name,email,ip_address,updated_at +blue,1,Larry,lking0@miitbeian.gov.cn,'69.135.206.194',2008-09-12 19:08:31 +blue,2,Larry,lperkins1@toplist.cz,'64.210.133.162',1978-05-09 04:15:14 +blue,3,Anna,amontgomery2@miitbeian.gov.cn,'168.104.64.114',2011-10-16 04:07:57 +blue,4,Sandra,sgeorge3@livejournal.com,'229.235.252.98',1973-07-19 10:52:43 +blue,5,Fred,fwoods4@google.cn,'78.229.170.124',2012-09-30 16:38:29 +blue,6,Stephen,shanson5@livejournal.com,'182.227.157.105',1995-11-07 21:40:50 +blue,7,William,wmartinez6@upenn.edu,'135.139.249.50',1982-09-05 03:11:59 +blue,8,Jessica,jlong7@hao123.com,'203.62.178.210',1991-10-16 11:03:15 +blue,9,Douglas,dwhite8@tamu.edu,'178.187.247.1',1979-10-01 09:49:48 +blue,10,Lisa,lcoleman9@nydailynews.com,'168.234.128.249',2011-05-26 07:45:49 +blue,11,Ralph,rfieldsa@home.pl,'55.152.163.149',1972-11-18 19:06:11 +blue,12,Louise,lnicholsb@samsung.com,'141.116.153.154',2014-11-25 20:56:14 +blue,13,Clarence,cduncanc@sfgate.com,'81.171.31.133',2011-11-17 07:02:36 +blue,14,Daniel,dfranklind@omniture.com,'8.204.211.37',1980-09-13 00:09:04 +blue,15,Katherine,klanee@auda.org.au,'176.96.134.59',1997-08-22 19:36:56 +blue,16,Billy,bwardf@wikia.com,'214.108.78.85',2003-10-19 02:14:47 +blue,17,Annie,agarzag@ocn.ne.jp,'190.108.42.70',1988-10-28 15:12:35 +blue,18,Shirley,scolemanh@fastcompany.com,'109.251.164.84',1988-08-24 10:50:57 +blue,19,Roger,rfrazieri@scribd.com,'38.145.218.108',1985-12-31 15:17:15 +blue,20,Lillian,lstanleyj@goodreads.com,'47.57.236.17',1970-06-08 02:09:05 +blue,21,Aaron,arodriguezk@nps.gov,'205.245.118.221',1985-10-11 23:07:49 +blue,22,Patrick,pparkerl@techcrunch.com,'19.8.100.182',2006-03-29 12:53:56 +blue,23,Phillip,pmorenom@intel.com,'41.38.254.103',2011-11-07 15:35:43 +blue,24,Henry,hgarcian@newsvine.com,'1.191.216.252',2008-08-28 08:30:44 +blue,25,Irene,iturnero@opera.com,'50.17.60.190',1994-04-01 07:15:02 +blue,26,Andrew,adunnp@pen.io,'123.52.253.176',2000-11-01 06:03:25 +blue,27,David,dgutierrezq@wp.com,'238.23.203.42',1988-01-25 07:29:18 +blue,28,Henry,hsanchezr@cyberchimps.com,'248.102.2.185',1983-01-01 13:36:37 +blue,29,Evelyn,epetersons@gizmodo.com,'32.80.46.119',1979-07-16 17:24:12 +blue,30,Tammy,tmitchellt@purevolume.com,'249.246.167.88',2001-04-03 10:00:23 +blue,31,Jacqueline,jlittleu@domainmarket.com,'127.181.97.47',1986-02-11 21:35:50 +blue,32,Earl,eortizv@opera.com,'166.47.248.240',1996-07-06 08:16:27 +blue,33,Juan,jgordonw@sciencedirect.com,'71.77.2.200',1987-01-31 03:46:44 +blue,34,Diane,dhowellx@nyu.edu,'140.94.133.12',1994-06-11 02:30:05 +blue,35,Randy,rkennedyy@microsoft.com,'73.255.34.196',2005-05-26 20:28:39 +blue,36,Janice,jriveraz@time.com,'22.214.227.32',1990-02-09 04:16:52 +blue,37,Laura,lperry10@diigo.com,'159.148.145.73',2015-03-17 05:59:25 +blue,38,Gary,gray11@statcounter.com,'40.193.124.56',1970-01-27 10:04:51 +blue,39,Jesse,jmcdonald12@typepad.com,'31.7.86.103',2009-03-14 08:14:29 +blue,40,Sandra,sgonzalez13@goodreads.com,'223.80.168.239',1993-05-21 14:08:54 
+blue,41,Scott,smoore14@archive.org,'38.238.46.83',1980-08-30 11:16:56 +blue,42,Phillip,pevans15@cisco.com,'158.234.59.34',2011-12-15 23:26:31 +blue,43,Steven,sriley16@google.ca,'90.247.57.68',2011-10-29 19:03:28 +blue,44,Deborah,dbrown17@hexun.com,'179.125.143.240',1995-04-10 14:36:07 +blue,45,Lori,lross18@ow.ly,'64.80.162.180',1980-12-27 16:49:15 +blue,46,Sean,sjackson19@tumblr.com,'240.116.183.69',1988-06-12 21:24:45 +blue,47,Terry,tbarnes1a@163.com,'118.38.213.137',1997-09-22 16:43:19 +blue,48,Dorothy,dross1b@ebay.com,'116.81.76.49',2005-02-28 13:33:24 +blue,49,Samuel,swashington1c@house.gov,'38.191.253.40',1989-01-19 21:15:48 +blue,50,Ralph,rcarter1d@tinyurl.com,'104.84.60.174',2007-08-11 10:21:49 +green,51,Wayne,whudson1e@princeton.edu,'90.61.24.102',1983-07-03 16:58:12 +green,52,Rose,rjames1f@plala.or.jp,'240.83.81.10',1995-06-08 11:46:23 +green,53,Louise,lcox1g@theglobeandmail.com,'105.11.82.145',2016-09-19 14:45:51 +green,54,Kenneth,kjohnson1h@independent.co.uk,'139.5.45.94',1976-08-17 11:26:19 +green,55,Donna,dbrown1i@amazon.co.uk,'19.45.169.45',2006-05-27 16:51:40 +green,56,Johnny,jvasquez1j@trellian.com,'118.202.238.23',1975-11-17 08:42:32 +green,57,Patrick,pramirez1k@tamu.edu,'231.25.153.198',1997-08-06 11:51:09 +green,58,Helen,hlarson1l@prweb.com,'8.40.21.39',1993-08-04 19:53:40 +green,59,Patricia,pspencer1m@gmpg.org,'212.198.40.15',1977-08-03 16:37:27 +green,60,Joseph,jspencer1n@marriott.com,'13.15.63.238',2005-07-23 20:22:06 +green,61,Phillip,pschmidt1o@blogtalkradio.com,'177.98.201.190',1976-05-19 21:47:44 +green,62,Joan,jwebb1p@google.ru,'105.229.170.71',1972-09-07 17:53:47 +green,63,Phyllis,pkennedy1q@imgur.com,'35.145.8.244',2000-01-01 22:33:37 +green,64,Katherine,khunter1r@smh.com.au,'248.168.205.32',1991-01-09 06:40:24 +green,65,Laura,lvasquez1s@wiley.com,'128.129.115.152',1997-10-23 12:04:56 +green,66,Juan,jdunn1t@state.gov,'44.228.124.51',2004-11-10 05:07:35 +green,67,Judith,jholmes1u@wiley.com,'40.227.179.115',1977-08-02 17:01:45 +green,68,Beverly,bbaker1v@wufoo.com,'208.34.84.59',2016-03-06 20:07:23 +green,69,Lawrence,lcarr1w@flickr.com,'59.158.212.223',1988-09-13 06:07:21 +green,70,Gloria,gwilliams1x@mtv.com,'245.231.88.33',1995-03-18 22:32:46 +green,71,Steven,ssims1y@cbslocal.com,'104.50.58.255',2001-08-05 21:26:20 +green,72,Betty,bmills1z@arstechnica.com,'103.177.214.220',1981-12-14 21:26:54 +green,73,Mildred,mfuller20@prnewswire.com,'151.158.8.130',2000-04-19 10:13:55 +green,74,Donald,dday21@icq.com,'9.178.102.255',1972-12-03 00:58:24 +green,75,Eric,ethomas22@addtoany.com,'85.2.241.227',1992-11-01 05:59:30 +green,76,Joyce,jarmstrong23@sitemeter.com,'169.224.20.36',1985-10-24 06:50:01 +green,77,Maria,mmartinez24@amazonaws.com,'143.189.167.135',2005-10-05 05:17:42 +green,78,Harry,hburton25@youtube.com,'156.47.176.237',1978-03-26 05:53:33 +green,79,Kevin,klawrence26@hao123.com,'79.136.183.83',1994-10-12 04:38:52 +green,80,David,dhall27@prweb.com,'133.149.172.153',1976-12-15 16:24:24 +green,81,Kathy,kperry28@twitter.com,'229.242.72.228',1979-03-04 02:58:56 +green,82,Adam,aprice29@elegantthemes.com,'13.145.21.10',1982-11-07 11:46:59 +green,83,Brandon,bgriffin2a@va.gov,'73.249.128.212',2013-10-30 05:30:36 +green,84,Henry,hnguyen2b@discovery.com,'211.36.214.242',1985-01-09 06:37:27 +green,85,Eric,esanchez2c@edublogs.org,'191.166.188.251',2004-05-01 23:21:42 +green,86,Jason,jlee2d@jimdo.com,'193.92.16.182',1973-01-08 09:05:39 +green,87,Diana,drichards2e@istockphoto.com,'19.130.175.245',1994-10-05 22:50:49 +green,88,Andrea,awelch2f@abc.net.au,'94.155.233.96',2002-04-26 
08:41:44 +green,89,Louis,lwagner2g@miitbeian.gov.cn,'26.217.34.111',2003-08-25 07:56:39 +green,90,Jane,jsims2h@seesaa.net,'43.4.220.135',1987-03-20 20:39:04 +green,91,Larry,lgrant2i@si.edu,'97.126.79.34',2000-09-07 20:26:19 +green,92,Louis,ldean2j@prnewswire.com,'37.148.40.127',2011-09-16 20:12:14 +green,93,Jennifer,jcampbell2k@xing.com,'38.106.254.142',1988-07-15 05:06:49 +green,94,Wayne,wcunningham2l@google.com.hk,'223.28.26.187',2009-12-15 06:16:54 +green,95,Lori,lstevens2m@icq.com,'181.250.181.58',1984-10-28 03:29:19 +green,96,Judy,jsimpson2n@marriott.com,'180.121.239.219',1986-02-07 15:18:10 +green,97,Phillip,phoward2o@usa.gov,'255.247.0.175',2002-12-26 08:44:45 +green,98,Gloria,gwalker2p@usa.gov,'156.140.7.128',1997-10-04 07:58:58 +green,99,Paul,pjohnson2q@umn.edu,'183.59.198.197',1991-11-14 12:33:55 +green,100,Frank,fgreene2r@blogspot.com,'150.143.68.121',2010-06-12 23:55:39 diff --git a/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/ndjson/source.ndjson b/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/ndjson/source.ndjson new file mode 100644 index 000000000..709898645 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/ndjson/source.ndjson @@ -0,0 +1,100 @@ +{"favorite_color":"blue","id":1,"first_name":"Larry","email":"lking0@miitbeian.gov.cn","ip_address":"'69.135.206.194'","updated_at":"2008-09-12 19:08:31"} +{"favorite_color":"blue","id":2,"first_name":"Larry","email":"lperkins1@toplist.cz","ip_address":"'64.210.133.162'","updated_at":"1978-05-09 04:15:14"} +{"favorite_color":"blue","id":3,"first_name":"Anna","email":"amontgomery2@miitbeian.gov.cn","ip_address":"'168.104.64.114'","updated_at":"2011-10-16 04:07:57"} +{"favorite_color":"blue","id":4,"first_name":"Sandra","email":"sgeorge3@livejournal.com","ip_address":"'229.235.252.98'","updated_at":"1973-07-19 10:52:43"} +{"favorite_color":"blue","id":5,"first_name":"Fred","email":"fwoods4@google.cn","ip_address":"'78.229.170.124'","updated_at":"2012-09-30 16:38:29"} +{"favorite_color":"blue","id":6,"first_name":"Stephen","email":"shanson5@livejournal.com","ip_address":"'182.227.157.105'","updated_at":"1995-11-07 21:40:50"} +{"favorite_color":"blue","id":7,"first_name":"William","email":"wmartinez6@upenn.edu","ip_address":"'135.139.249.50'","updated_at":"1982-09-05 03:11:59"} +{"favorite_color":"blue","id":8,"first_name":"Jessica","email":"jlong7@hao123.com","ip_address":"'203.62.178.210'","updated_at":"1991-10-16 11:03:15"} +{"favorite_color":"blue","id":9,"first_name":"Douglas","email":"dwhite8@tamu.edu","ip_address":"'178.187.247.1'","updated_at":"1979-10-01 09:49:48"} +{"favorite_color":"blue","id":10,"first_name":"Lisa","email":"lcoleman9@nydailynews.com","ip_address":"'168.234.128.249'","updated_at":"2011-05-26 07:45:49"} +{"favorite_color":"blue","id":11,"first_name":"Ralph","email":"rfieldsa@home.pl","ip_address":"'55.152.163.149'","updated_at":"1972-11-18 19:06:11"} +{"favorite_color":"blue","id":12,"first_name":"Louise","email":"lnicholsb@samsung.com","ip_address":"'141.116.153.154'","updated_at":"2014-11-25 20:56:14"} +{"favorite_color":"blue","id":13,"first_name":"Clarence","email":"cduncanc@sfgate.com","ip_address":"'81.171.31.133'","updated_at":"2011-11-17 07:02:36"} +{"favorite_color":"blue","id":14,"first_name":"Daniel","email":"dfranklind@omniture.com","ip_address":"'8.204.211.37'","updated_at":"1980-09-13 00:09:04"} 
+{"favorite_color":"blue","id":15,"first_name":"Katherine","email":"klanee@auda.org.au","ip_address":"'176.96.134.59'","updated_at":"1997-08-22 19:36:56"} +{"favorite_color":"blue","id":16,"first_name":"Billy","email":"bwardf@wikia.com","ip_address":"'214.108.78.85'","updated_at":"2003-10-19 02:14:47"} +{"favorite_color":"blue","id":17,"first_name":"Annie","email":"agarzag@ocn.ne.jp","ip_address":"'190.108.42.70'","updated_at":"1988-10-28 15:12:35"} +{"favorite_color":"blue","id":18,"first_name":"Shirley","email":"scolemanh@fastcompany.com","ip_address":"'109.251.164.84'","updated_at":"1988-08-24 10:50:57"} +{"favorite_color":"blue","id":19,"first_name":"Roger","email":"rfrazieri@scribd.com","ip_address":"'38.145.218.108'","updated_at":"1985-12-31 15:17:15"} +{"favorite_color":"blue","id":20,"first_name":"Lillian","email":"lstanleyj@goodreads.com","ip_address":"'47.57.236.17'","updated_at":"1970-06-08 02:09:05"} +{"favorite_color":"blue","id":21,"first_name":"Aaron","email":"arodriguezk@nps.gov","ip_address":"'205.245.118.221'","updated_at":"1985-10-11 23:07:49"} +{"favorite_color":"blue","id":22,"first_name":"Patrick","email":"pparkerl@techcrunch.com","ip_address":"'19.8.100.182'","updated_at":"2006-03-29 12:53:56"} +{"favorite_color":"blue","id":23,"first_name":"Phillip","email":"pmorenom@intel.com","ip_address":"'41.38.254.103'","updated_at":"2011-11-07 15:35:43"} +{"favorite_color":"blue","id":24,"first_name":"Henry","email":"hgarcian@newsvine.com","ip_address":"'1.191.216.252'","updated_at":"2008-08-28 08:30:44"} +{"favorite_color":"blue","id":25,"first_name":"Irene","email":"iturnero@opera.com","ip_address":"'50.17.60.190'","updated_at":"1994-04-01 07:15:02"} +{"favorite_color":"blue","id":26,"first_name":"Andrew","email":"adunnp@pen.io","ip_address":"'123.52.253.176'","updated_at":"2000-11-01 06:03:25"} +{"favorite_color":"blue","id":27,"first_name":"David","email":"dgutierrezq@wp.com","ip_address":"'238.23.203.42'","updated_at":"1988-01-25 07:29:18"} +{"favorite_color":"blue","id":28,"first_name":"Henry","email":"hsanchezr@cyberchimps.com","ip_address":"'248.102.2.185'","updated_at":"1983-01-01 13:36:37"} +{"favorite_color":"blue","id":29,"first_name":"Evelyn","email":"epetersons@gizmodo.com","ip_address":"'32.80.46.119'","updated_at":"1979-07-16 17:24:12"} +{"favorite_color":"blue","id":30,"first_name":"Tammy","email":"tmitchellt@purevolume.com","ip_address":"'249.246.167.88'","updated_at":"2001-04-03 10:00:23"} +{"favorite_color":"blue","id":31,"first_name":"Jacqueline","email":"jlittleu@domainmarket.com","ip_address":"'127.181.97.47'","updated_at":"1986-02-11 21:35:50"} +{"favorite_color":"blue","id":32,"first_name":"Earl","email":"eortizv@opera.com","ip_address":"'166.47.248.240'","updated_at":"1996-07-06 08:16:27"} +{"favorite_color":"blue","id":33,"first_name":"Juan","email":"jgordonw@sciencedirect.com","ip_address":"'71.77.2.200'","updated_at":"1987-01-31 03:46:44"} +{"favorite_color":"blue","id":34,"first_name":"Diane","email":"dhowellx@nyu.edu","ip_address":"'140.94.133.12'","updated_at":"1994-06-11 02:30:05"} +{"favorite_color":"blue","id":35,"first_name":"Randy","email":"rkennedyy@microsoft.com","ip_address":"'73.255.34.196'","updated_at":"2005-05-26 20:28:39"} +{"favorite_color":"blue","id":36,"first_name":"Janice","email":"jriveraz@time.com","ip_address":"'22.214.227.32'","updated_at":"1990-02-09 04:16:52"} +{"favorite_color":"blue","id":37,"first_name":"Laura","email":"lperry10@diigo.com","ip_address":"'159.148.145.73'","updated_at":"2015-03-17 05:59:25"} 
+{"favorite_color":"blue","id":38,"first_name":"Gary","email":"gray11@statcounter.com","ip_address":"'40.193.124.56'","updated_at":"1970-01-27 10:04:51"} +{"favorite_color":"blue","id":39,"first_name":"Jesse","email":"jmcdonald12@typepad.com","ip_address":"'31.7.86.103'","updated_at":"2009-03-14 08:14:29"} +{"favorite_color":"blue","id":40,"first_name":"Sandra","email":"sgonzalez13@goodreads.com","ip_address":"'223.80.168.239'","updated_at":"1993-05-21 14:08:54"} +{"favorite_color":"blue","id":41,"first_name":"Scott","email":"smoore14@archive.org","ip_address":"'38.238.46.83'","updated_at":"1980-08-30 11:16:56"} +{"favorite_color":"blue","id":42,"first_name":"Phillip","email":"pevans15@cisco.com","ip_address":"'158.234.59.34'","updated_at":"2011-12-15 23:26:31"} +{"favorite_color":"blue","id":43,"first_name":"Steven","email":"sriley16@google.ca","ip_address":"'90.247.57.68'","updated_at":"2011-10-29 19:03:28"} +{"favorite_color":"blue","id":44,"first_name":"Deborah","email":"dbrown17@hexun.com","ip_address":"'179.125.143.240'","updated_at":"1995-04-10 14:36:07"} +{"favorite_color":"blue","id":45,"first_name":"Lori","email":"lross18@ow.ly","ip_address":"'64.80.162.180'","updated_at":"1980-12-27 16:49:15"} +{"favorite_color":"blue","id":46,"first_name":"Sean","email":"sjackson19@tumblr.com","ip_address":"'240.116.183.69'","updated_at":"1988-06-12 21:24:45"} +{"favorite_color":"blue","id":47,"first_name":"Terry","email":"tbarnes1a@163.com","ip_address":"'118.38.213.137'","updated_at":"1997-09-22 16:43:19"} +{"favorite_color":"blue","id":48,"first_name":"Dorothy","email":"dross1b@ebay.com","ip_address":"'116.81.76.49'","updated_at":"2005-02-28 13:33:24"} +{"favorite_color":"blue","id":49,"first_name":"Samuel","email":"swashington1c@house.gov","ip_address":"'38.191.253.40'","updated_at":"1989-01-19 21:15:48"} +{"favorite_color":"blue","id":50,"first_name":"Ralph","email":"rcarter1d@tinyurl.com","ip_address":"'104.84.60.174'","updated_at":"2007-08-11 10:21:49"} +{"favorite_color":"green","id":51,"first_name":"Wayne","email":"whudson1e@princeton.edu","ip_address":"'90.61.24.102'","updated_at":"1983-07-03 16:58:12"} +{"favorite_color":"green","id":52,"first_name":"Rose","email":"rjames1f@plala.or.jp","ip_address":"'240.83.81.10'","updated_at":"1995-06-08 11:46:23"} +{"favorite_color":"green","id":53,"first_name":"Louise","email":"lcox1g@theglobeandmail.com","ip_address":"'105.11.82.145'","updated_at":"2016-09-19 14:45:51"} +{"favorite_color":"green","id":54,"first_name":"Kenneth","email":"kjohnson1h@independent.co.uk","ip_address":"'139.5.45.94'","updated_at":"1976-08-17 11:26:19"} +{"favorite_color":"green","id":55,"first_name":"Donna","email":"dbrown1i@amazon.co.uk","ip_address":"'19.45.169.45'","updated_at":"2006-05-27 16:51:40"} +{"favorite_color":"green","id":56,"first_name":"Johnny","email":"jvasquez1j@trellian.com","ip_address":"'118.202.238.23'","updated_at":"1975-11-17 08:42:32"} +{"favorite_color":"green","id":57,"first_name":"Patrick","email":"pramirez1k@tamu.edu","ip_address":"'231.25.153.198'","updated_at":"1997-08-06 11:51:09"} +{"favorite_color":"green","id":58,"first_name":"Helen","email":"hlarson1l@prweb.com","ip_address":"'8.40.21.39'","updated_at":"1993-08-04 19:53:40"} +{"favorite_color":"green","id":59,"first_name":"Patricia","email":"pspencer1m@gmpg.org","ip_address":"'212.198.40.15'","updated_at":"1977-08-03 16:37:27"} +{"favorite_color":"green","id":60,"first_name":"Joseph","email":"jspencer1n@marriott.com","ip_address":"'13.15.63.238'","updated_at":"2005-07-23 20:22:06"} 
+{"favorite_color":"green","id":61,"first_name":"Phillip","email":"pschmidt1o@blogtalkradio.com","ip_address":"'177.98.201.190'","updated_at":"1976-05-19 21:47:44"} +{"favorite_color":"green","id":62,"first_name":"Joan","email":"jwebb1p@google.ru","ip_address":"'105.229.170.71'","updated_at":"1972-09-07 17:53:47"} +{"favorite_color":"green","id":63,"first_name":"Phyllis","email":"pkennedy1q@imgur.com","ip_address":"'35.145.8.244'","updated_at":"2000-01-01 22:33:37"} +{"favorite_color":"green","id":64,"first_name":"Katherine","email":"khunter1r@smh.com.au","ip_address":"'248.168.205.32'","updated_at":"1991-01-09 06:40:24"} +{"favorite_color":"green","id":65,"first_name":"Laura","email":"lvasquez1s@wiley.com","ip_address":"'128.129.115.152'","updated_at":"1997-10-23 12:04:56"} +{"favorite_color":"green","id":66,"first_name":"Juan","email":"jdunn1t@state.gov","ip_address":"'44.228.124.51'","updated_at":"2004-11-10 05:07:35"} +{"favorite_color":"green","id":67,"first_name":"Judith","email":"jholmes1u@wiley.com","ip_address":"'40.227.179.115'","updated_at":"1977-08-02 17:01:45"} +{"favorite_color":"green","id":68,"first_name":"Beverly","email":"bbaker1v@wufoo.com","ip_address":"'208.34.84.59'","updated_at":"2016-03-06 20:07:23"} +{"favorite_color":"green","id":69,"first_name":"Lawrence","email":"lcarr1w@flickr.com","ip_address":"'59.158.212.223'","updated_at":"1988-09-13 06:07:21"} +{"favorite_color":"green","id":70,"first_name":"Gloria","email":"gwilliams1x@mtv.com","ip_address":"'245.231.88.33'","updated_at":"1995-03-18 22:32:46"} +{"favorite_color":"green","id":71,"first_name":"Steven","email":"ssims1y@cbslocal.com","ip_address":"'104.50.58.255'","updated_at":"2001-08-05 21:26:20"} +{"favorite_color":"green","id":72,"first_name":"Betty","email":"bmills1z@arstechnica.com","ip_address":"'103.177.214.220'","updated_at":"1981-12-14 21:26:54"} +{"favorite_color":"green","id":73,"first_name":"Mildred","email":"mfuller20@prnewswire.com","ip_address":"'151.158.8.130'","updated_at":"2000-04-19 10:13:55"} +{"favorite_color":"green","id":74,"first_name":"Donald","email":"dday21@icq.com","ip_address":"'9.178.102.255'","updated_at":"1972-12-03 00:58:24"} +{"favorite_color":"green","id":75,"first_name":"Eric","email":"ethomas22@addtoany.com","ip_address":"'85.2.241.227'","updated_at":"1992-11-01 05:59:30"} +{"favorite_color":"green","id":76,"first_name":"Joyce","email":"jarmstrong23@sitemeter.com","ip_address":"'169.224.20.36'","updated_at":"1985-10-24 06:50:01"} +{"favorite_color":"green","id":77,"first_name":"Maria","email":"mmartinez24@amazonaws.com","ip_address":"'143.189.167.135'","updated_at":"2005-10-05 05:17:42"} +{"favorite_color":"green","id":78,"first_name":"Harry","email":"hburton25@youtube.com","ip_address":"'156.47.176.237'","updated_at":"1978-03-26 05:53:33"} +{"favorite_color":"green","id":79,"first_name":"Kevin","email":"klawrence26@hao123.com","ip_address":"'79.136.183.83'","updated_at":"1994-10-12 04:38:52"} +{"favorite_color":"green","id":80,"first_name":"David","email":"dhall27@prweb.com","ip_address":"'133.149.172.153'","updated_at":"1976-12-15 16:24:24"} +{"favorite_color":"green","id":81,"first_name":"Kathy","email":"kperry28@twitter.com","ip_address":"'229.242.72.228'","updated_at":"1979-03-04 02:58:56"} +{"favorite_color":"green","id":82,"first_name":"Adam","email":"aprice29@elegantthemes.com","ip_address":"'13.145.21.10'","updated_at":"1982-11-07 11:46:59"} 
+{"favorite_color":"green","id":83,"first_name":"Brandon","email":"bgriffin2a@va.gov","ip_address":"'73.249.128.212'","updated_at":"2013-10-30 05:30:36"} +{"favorite_color":"green","id":84,"first_name":"Henry","email":"hnguyen2b@discovery.com","ip_address":"'211.36.214.242'","updated_at":"1985-01-09 06:37:27"} +{"favorite_color":"green","id":85,"first_name":"Eric","email":"esanchez2c@edublogs.org","ip_address":"'191.166.188.251'","updated_at":"2004-05-01 23:21:42"} +{"favorite_color":"green","id":86,"first_name":"Jason","email":"jlee2d@jimdo.com","ip_address":"'193.92.16.182'","updated_at":"1973-01-08 09:05:39"} +{"favorite_color":"green","id":87,"first_name":"Diana","email":"drichards2e@istockphoto.com","ip_address":"'19.130.175.245'","updated_at":"1994-10-05 22:50:49"} +{"favorite_color":"green","id":88,"first_name":"Andrea","email":"awelch2f@abc.net.au","ip_address":"'94.155.233.96'","updated_at":"2002-04-26 08:41:44"} +{"favorite_color":"green","id":89,"first_name":"Louis","email":"lwagner2g@miitbeian.gov.cn","ip_address":"'26.217.34.111'","updated_at":"2003-08-25 07:56:39"} +{"favorite_color":"green","id":90,"first_name":"Jane","email":"jsims2h@seesaa.net","ip_address":"'43.4.220.135'","updated_at":"1987-03-20 20:39:04"} +{"favorite_color":"green","id":91,"first_name":"Larry","email":"lgrant2i@si.edu","ip_address":"'97.126.79.34'","updated_at":"2000-09-07 20:26:19"} +{"favorite_color":"green","id":92,"first_name":"Louis","email":"ldean2j@prnewswire.com","ip_address":"'37.148.40.127'","updated_at":"2011-09-16 20:12:14"} +{"favorite_color":"green","id":93,"first_name":"Jennifer","email":"jcampbell2k@xing.com","ip_address":"'38.106.254.142'","updated_at":"1988-07-15 05:06:49"} +{"favorite_color":"green","id":94,"first_name":"Wayne","email":"wcunningham2l@google.com.hk","ip_address":"'223.28.26.187'","updated_at":"2009-12-15 06:16:54"} +{"favorite_color":"green","id":95,"first_name":"Lori","email":"lstevens2m@icq.com","ip_address":"'181.250.181.58'","updated_at":"1984-10-28 03:29:19"} +{"favorite_color":"green","id":96,"first_name":"Judy","email":"jsimpson2n@marriott.com","ip_address":"'180.121.239.219'","updated_at":"1986-02-07 15:18:10"} +{"favorite_color":"green","id":97,"first_name":"Phillip","email":"phoward2o@usa.gov","ip_address":"'255.247.0.175'","updated_at":"2002-12-26 08:44:45"} +{"favorite_color":"green","id":98,"first_name":"Gloria","email":"gwalker2p@usa.gov","ip_address":"'156.140.7.128'","updated_at":"1997-10-04 07:58:58"} +{"favorite_color":"green","id":99,"first_name":"Paul","email":"pjohnson2q@umn.edu","ip_address":"'183.59.198.197'","updated_at":"1991-11-14 12:33:55"} +{"favorite_color":"green","id":100,"first_name":"Frank","email":"fgreene2r@blogspot.com","ip_address":"'150.143.68.121'","updated_at":"2010-06-12 23:55:39"} diff --git a/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/parquet/source.parquet b/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/parquet/source.parquet new file mode 100644 index 000000000..3022faa67 Binary files /dev/null and b/dbt-bigquery/tests/functional/adapter/upload_file_tests/data/parquet/source.parquet differ diff --git a/dbt-bigquery/tests/functional/adapter/upload_file_tests/test_upload_file.py b/dbt-bigquery/tests/functional/adapter/upload_file_tests/test_upload_file.py new file mode 100644 index 000000000..ba9a2289b --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/upload_file_tests/test_upload_file.py @@ -0,0 +1,119 @@ +import pytest +from dbt.tests.util import get_relation_columns, run_dbt, 
run_sql_with_adapter +from dbt.contracts.results import NodeStatus +import datetime +import yaml + +_UPLOAD_FILE_SQL = """ +{% macro upload_file(local_file_path, database, table_schema, table_name) %} + {% do adapter.upload_file(local_file_path, database, table_schema, table_name, kwargs=kwargs) %} +{% endmacro %} +""".lstrip() + + +class TestUploadFile: + @pytest.fixture(scope="class") + def schema(self): + return "upload_file" + + @pytest.fixture(scope="class") + def macros(self): + return { + "upload_file.sql": _UPLOAD_FILE_SQL, + } + + @staticmethod + def perform_uploaded_table_checks(table_schema, table_name, project): + # Test the column names, and data types of the created table + col_result = get_relation_columns(project.adapter, f"{table_schema}.{table_name}") + assert [col_obj[0] for col_obj in col_result] == [ + "email", + "favorite_color", + "first_name", + "id", + "ip_address", + "updated_at", + ] + assert [col_obj[1] for col_obj in col_result] == [ + "STRING", + "STRING", + "STRING", + "INT64", + "STRING", + "TIMESTAMP", + ] + + # Test the values of the created table + value_query = f""" + select + count(*) row_count, + count(distinct id) as num_distinct_ids, + max(updated_at) as max_updated_at + from `{table_schema}.{table_name}` + """ + value_results = run_sql_with_adapter(project.adapter, value_query) + + # There should be 100 rows in this table + assert value_results[0][0] == 100 + # There should be 100 distinct id values in this table + assert value_results[0][1] == 100 + # Maximum updated_at value should be 2016-09-19 14:45:51 + assert value_results[0][2] == datetime.datetime( + 2016, 9, 19, 14, 45, 51, tzinfo=datetime.timezone.utc + ) + + def test_bigquery_upload_file_csv(self, project): + # Create a table from an uploaded CSV file + upload_args = yaml.safe_dump( + { + "local_file_path": f"{project.test_data_dir}/csv/source.csv", + "database": project.database, + "table_schema": project.test_schema, + "table_name": "TestUploadFileCSV", + "skip_leading_rows": 1, + "autodetect": True, + "write_disposition": "WRITE_TRUNCATE", + } + ) + upload_result = run_dbt(["run-operation", "upload_file", "--args", upload_args]) + assert upload_result.results[0].status == NodeStatus.Success + + # Check if the uploaded table contains expected values and schema + self.perform_uploaded_table_checks(project.test_schema, "TestUploadFileCSV", project) + + def test_bigquery_upload_file_ndjson(self, project): + # Create a table from an uploaded NDJSON file + upload_args = yaml.safe_dump( + { + "local_file_path": f"{project.test_data_dir}/ndjson/source.ndjson", + "database": project.database, + "table_schema": project.test_schema, + "table_name": "TestUploadFileNDJSON", + "autodetect": True, + "source_format": "NEWLINE_DELIMITED_JSON", + "write_disposition": "WRITE_TRUNCATE", + } + ) + upload_result = run_dbt(["run-operation", "upload_file", "--args", upload_args]) + assert upload_result.results[0].status == NodeStatus.Success + + # Check if the uploaded table contains expected values and schema + self.perform_uploaded_table_checks(project.test_schema, "TestUploadFileNDJSON", project) + + def test_bigquery_upload_file_parquet(self, project): + # Create a table from an uploaded parquet file + upload_args = yaml.safe_dump( + { + "local_file_path": f"{project.test_data_dir}/parquet/source.parquet", + "database": project.database, + "table_schema": project.test_schema, + "table_name": "TestUploadFileParquet", + "source_format": "PARQUET", + "write_disposition": "WRITE_TRUNCATE", + } + ) + 
upload_result = run_dbt(["run-operation", "upload_file", "--args", upload_args]) + assert upload_result.results[0].status == NodeStatus.Success + + # Check if the uploaded table contains expected values and schema + self.perform_uploaded_table_checks(project.test_schema, "TestUploadFileParquet", project) diff --git a/dbt-bigquery/tests/functional/adapter/utils/fixture_array_append.py b/dbt-bigquery/tests/functional/adapter/utils/fixture_array_append.py new file mode 100644 index 000000000..0558d66e1 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/utils/fixture_array_append.py @@ -0,0 +1,13 @@ +# array_append + +# EXCEPT can't be used with ARRAYs in BigQuery, so convert to a string +models__array_append_expected_sql = """ +select 1 as id, {{ array_to_string(array_construct([1,2,3,4])) }} as array_col union all +select 2 as id, {{ array_to_string(array_construct([4])) }} as array_col +""" + + +models__array_append_actual_sql = """ +select 1 as id, {{ array_to_string(array_append(array_construct([1,2,3]), 4)) }} as array_col union all +select 2 as id, {{ array_to_string(array_append(array_construct([]), 4)) }} as array_col +""" diff --git a/dbt-bigquery/tests/functional/adapter/utils/fixture_array_concat.py b/dbt-bigquery/tests/functional/adapter/utils/fixture_array_concat.py new file mode 100644 index 000000000..51af8bf12 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/utils/fixture_array_concat.py @@ -0,0 +1,15 @@ +# array_concat + +# EXCEPT can't be used with ARRAYs in BigQuery, so convert to a string +models__array_concat_expected_sql = """ +select 1 as id, {{ array_to_string(array_construct([1,2,3,4,5,6])) }} as array_col union all +select 2 as id, {{ array_to_string(array_construct([2])) }} as array_col union all +select 3 as id, {{ array_to_string(array_construct([3])) }} as array_col +""" + + +models__array_concat_actual_sql = """ +select 1 as id, {{ array_to_string(array_concat(array_construct([1,2,3]), array_construct([4,5,6]))) }} as array_col union all +select 2 as id, {{ array_to_string(array_concat(array_construct([]), array_construct([2]))) }} as array_col union all +select 3 as id, {{ array_to_string(array_concat(array_construct([3]), array_construct([]))) }} as array_col +""" diff --git a/dbt-bigquery/tests/functional/adapter/utils/fixture_array_construct.py b/dbt-bigquery/tests/functional/adapter/utils/fixture_array_construct.py new file mode 100644 index 000000000..13d0bb2f3 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/utils/fixture_array_construct.py @@ -0,0 +1,20 @@ +# array_construct + +# EXCEPT can't be used with ARRAYs in BigQuery, so convert to a string +models__array_construct_expected_sql = """ +select 1 as id, {{ array_to_string(array_construct([1,2,3])) }} as array_col union all +select 2 as id, {{ array_to_string(array_construct([])) }} as array_col +""" + + +models__array_construct_actual_sql = """ +select 1 as id, {{ array_to_string(array_construct([1,2,3])) }} as array_col union all +select 2 as id, {{ array_to_string(array_construct([])) }} as array_col +""" + + +macros__array_to_string_sql = """ +{% macro array_to_string(array) %} + (select string_agg(cast(element as string), ',') from unnest({{ array }}) element) +{% endmacro %} +""" diff --git a/dbt-bigquery/tests/functional/adapter/utils/fixture_get_intervals_between.py b/dbt-bigquery/tests/functional/adapter/utils/fixture_get_intervals_between.py new file mode 100644 index 000000000..be25d6bd9 --- /dev/null +++ 
b/dbt-bigquery/tests/functional/adapter/utils/fixture_get_intervals_between.py @@ -0,0 +1,16 @@ +models__bq_test_get_intervals_between_sql = """ +SELECT + {{ get_intervals_between("'2023-09-01'", "'2023-09-12'", "day") }} as intervals, + 11 as expected + +""" + +models___bq_test_get_intervals_between_yml = """ +version: 2 +models: + - name: test_get_intervals_between + tests: + - assert_equal: + actual: intervals + expected: expected +""" diff --git a/dbt-bigquery/tests/functional/adapter/utils/test_data_types.py b/dbt-bigquery/tests/functional/adapter/utils/test_data_types.py new file mode 100644 index 000000000..722313dcc --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/utils/test_data_types.py @@ -0,0 +1,36 @@ +from dbt.tests.adapter.utils.data_types.test_type_bigint import BaseTypeBigInt +from dbt.tests.adapter.utils.data_types.test_type_float import BaseTypeFloat +from dbt.tests.adapter.utils.data_types.test_type_int import BaseTypeInt +from dbt.tests.adapter.utils.data_types.test_type_numeric import BaseTypeNumeric +from dbt.tests.adapter.utils.data_types.test_type_string import BaseTypeString +from dbt.tests.adapter.utils.data_types.test_type_timestamp import BaseTypeTimestamp +from dbt.tests.adapter.utils.data_types.test_type_boolean import BaseTypeBoolean + + +class TestTypeBigInt(BaseTypeBigInt): + pass + + +class TestTypeFloat(BaseTypeFloat): + pass + + +class TestTypeInt(BaseTypeInt): + pass + + +class TestTypeNumeric(BaseTypeNumeric): + def numeric_fixture_type(self): + return "numeric" + + +class TestTypeString(BaseTypeString): + pass + + +class TestTypeTimestamp(BaseTypeTimestamp): + pass + + +class TestTypeBoolean(BaseTypeBoolean): + pass diff --git a/dbt-bigquery/tests/functional/adapter/utils/test_timestamps.py b/dbt-bigquery/tests/functional/adapter/utils/test_timestamps.py new file mode 100644 index 000000000..40b5e0dce --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/utils/test_timestamps.py @@ -0,0 +1,18 @@ +import pytest +from dbt.tests.adapter.utils.test_timestamps import BaseCurrentTimestamps + + +class TestCurrentTimestampBigQuery(BaseCurrentTimestamps): + @pytest.fixture(scope="class") + def expected_schema(self): + return { + "current_timestamp": "TIMESTAMP", + "current_timestamp_in_utc_backcompat": "TIMESTAMP", + "current_timestamp_backcompat": "TIMESTAMP", + } + + @pytest.fixture(scope="class") + def expected_sql(self): + return """select current_timestamp() as current_timestamp, + current_timestamp as current_timestamp_in_utc_backcompat, + current_timestamp as current_timestamp_backcompat""" diff --git a/dbt-bigquery/tests/functional/adapter/utils/test_utils.py b/dbt-bigquery/tests/functional/adapter/utils/test_utils.py new file mode 100644 index 000000000..384b17108 --- /dev/null +++ b/dbt-bigquery/tests/functional/adapter/utils/test_utils.py @@ -0,0 +1,237 @@ +import random + +import pytest +from google.api_core.exceptions import NotFound + +from dbt.tests.adapter.utils.test_array_append import BaseArrayAppend +from dbt.tests.adapter.utils.test_array_concat import BaseArrayConcat +from dbt.tests.adapter.utils.test_array_construct import BaseArrayConstruct +from dbt.tests.adapter.utils.test_any_value import BaseAnyValue +from dbt.tests.adapter.utils.test_bool_or import BaseBoolOr +from dbt.tests.adapter.utils.test_cast import BaseCast +from dbt.tests.adapter.utils.test_cast_bool_to_text import BaseCastBoolToText +from dbt.tests.adapter.utils.test_concat import BaseConcat +from dbt.tests.adapter.utils.test_current_timestamp import 
BaseCurrentTimestampAware +from dbt.tests.adapter.utils.test_date import BaseDate +from dbt.tests.adapter.utils.test_dateadd import BaseDateAdd +from dbt.tests.adapter.utils.test_datediff import BaseDateDiff +from dbt.tests.adapter.utils.test_date_spine import BaseDateSpine +from dbt.tests.adapter.utils.test_date_trunc import BaseDateTrunc +from dbt.tests.adapter.utils.test_escape_single_quotes import BaseEscapeSingleQuotesBackslash +from dbt.tests.adapter.utils.test_except import BaseExcept +from dbt.tests.adapter.utils.test_generate_series import BaseGenerateSeries +from dbt.tests.adapter.utils.test_get_intervals_between import BaseGetIntervalsBetween +from dbt.tests.adapter.utils.test_get_powers_of_two import BaseGetPowersOfTwo +from dbt.tests.adapter.utils.test_hash import BaseHash +from dbt.tests.adapter.utils.test_intersect import BaseIntersect +from dbt.tests.adapter.utils.test_last_day import BaseLastDay +from dbt.tests.adapter.utils.test_length import BaseLength +from dbt.tests.adapter.utils.test_listagg import BaseListagg +from dbt.tests.adapter.utils.test_position import BasePosition +from dbt.tests.adapter.utils.test_replace import BaseReplace +from dbt.tests.adapter.utils.test_right import BaseRight +from dbt.tests.adapter.utils.test_safe_cast import BaseSafeCast +from dbt.tests.adapter.utils.test_split_part import BaseSplitPart +from dbt.tests.adapter.utils.test_string_literal import BaseStringLiteral +from dbt.tests.adapter.utils.test_validate_sql import BaseValidateSqlMethod +from tests.functional.adapter.utils.fixture_array_append import ( + models__array_append_actual_sql, + models__array_append_expected_sql, +) +from tests.functional.adapter.utils.fixture_array_concat import ( + models__array_concat_actual_sql, + models__array_concat_expected_sql, +) +from tests.functional.adapter.utils.fixture_array_construct import ( + models__array_construct_actual_sql, + models__array_construct_expected_sql, + macros__array_to_string_sql, +) +from tests.functional.adapter.utils.fixture_get_intervals_between import ( + models__bq_test_get_intervals_between_sql, + models___bq_test_get_intervals_between_yml, +) + + +class TestAnyValue(BaseAnyValue): + pass + + +class TestArrayAppend(BaseArrayAppend): + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": models__array_append_actual_sql, + "expected.sql": models__array_append_expected_sql, + } + + @pytest.fixture(scope="class") + def macros(self): + return { + "array_to_string.sql": macros__array_to_string_sql, + } + + +class TestArrayConcat(BaseArrayConcat): + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": models__array_concat_actual_sql, + "expected.sql": models__array_concat_expected_sql, + } + + @pytest.fixture(scope="class") + def macros(self): + return { + "array_to_string.sql": macros__array_to_string_sql, + } + + +class TestArrayConstruct(BaseArrayConstruct): + @pytest.fixture(scope="class") + def models(self): + return { + "actual.sql": models__array_construct_actual_sql, + "expected.sql": models__array_construct_expected_sql, + } + + @pytest.fixture(scope="class") + def macros(self): + return { + "array_to_string.sql": macros__array_to_string_sql, + } + + +class TestBoolOr(BaseBoolOr): + pass + + +class TestCast(BaseCast): + pass + + +class TestCastBoolToText(BaseCastBoolToText): + pass + + +class TestConcat(BaseConcat): + pass + + +# Use either BaseCurrentTimestampAware or BaseCurrentTimestampNaive but not both +class TestCurrentTimestamp(BaseCurrentTimestampAware): + 
pass + + +class TestDate(BaseDate): + pass + + +class TestDateAdd(BaseDateAdd): + pass + + +class TestDateDiff(BaseDateDiff): + pass + + +class TestDateSpine(BaseDateSpine): + pass + + +class TestDateTrunc(BaseDateTrunc): + pass + + +class TestEscapeSingleQuotes(BaseEscapeSingleQuotesBackslash): + pass + + +class TestExcept(BaseExcept): + pass + + +class TestGenerateSeries(BaseGenerateSeries): + pass + + +class TestGetIntervalsBetween(BaseGetIntervalsBetween): + @pytest.fixture(scope="class") + def models(self): + return { + "test_get_intervals_between.yml": models___bq_test_get_intervals_between_yml, + "test_get_intervals_between.sql": self.interpolate_macro_namespace( + models__bq_test_get_intervals_between_sql, "get_intervals_between" + ), + } + + +class TestGetPowersOfTwo(BaseGetPowersOfTwo): + pass + + +class TestHash(BaseHash): + pass + + +class TestIntersect(BaseIntersect): + pass + + +class TestLastDay(BaseLastDay): + pass + + +class TestLength(BaseLength): + pass + + +class TestListagg(BaseListagg): + pass + + +class TestPosition(BasePosition): + pass + + +class TestReplace(BaseReplace): + pass + + +class TestRight(BaseRight): + pass + + +class TestSafeCast(BaseSafeCast): + pass + + +class TestSplitPart(BaseSplitPart): + pass + + +class TestStringLiteral(BaseStringLiteral): + pass + + +class TestValidateSqlMethod(BaseValidateSqlMethod): + pass + + +class TestDryRunMethod: + """Test connection manager dry run method operation.""" + + def test_dry_run_method(self, project) -> None: + """Test dry run method on a DDL statement. + + This allows us to demonstrate that no SQL is executed. + """ + with project.adapter.connection_named("_test"): + client = project.adapter.connections.get_thread_connection().handle + random_suffix = "".join(random.choices([str(i) for i in range(10)], k=10)) + table_name = f"test_dry_run_{random_suffix}" + table_id = "{}.{}.{}".format(project.database, project.test_schema, table_name) + res = project.adapter.connections.dry_run(f"CREATE TABLE {table_id} (x INT64)") + assert res.code == "DRY RUN" + with pytest.raises(expected_exception=NotFound): + client.get_table(table_id) diff --git a/dbt-bigquery/tests/functional/python_model_tests/__init__.py b/dbt-bigquery/tests/functional/python_model_tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dbt-bigquery/tests/functional/python_model_tests/files.py b/dbt-bigquery/tests/functional/python_model_tests/files.py new file mode 100644 index 000000000..1cb95602a --- /dev/null +++ b/dbt-bigquery/tests/functional/python_model_tests/files.py @@ -0,0 +1,125 @@ +SINGLE_RECORD = """ +import pandas as pd + +def model(dbt, session): + + dbt.config( + submission_method="serverless", + materialized="table" + ) + + df = pd.DataFrame( + [ + {"column_name": {"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}}, + ] + ) + + return df +""" + + +MULTI_RECORD = """ +import pandas as pd + +def model(dbt, session): + + dbt.config( + submission_method="serverless", + materialized="table", + ) + + df = pd.DataFrame( + [ + {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]}, + ] + ) + + return df +""" + + +ORC_FORMAT = """ +import pandas as pd + +def model(dbt, session): + + dbt.config( + submission_method="serverless", + materialized="table", + intermediate_format="orc", + ) + + df = pd.DataFrame( + [ + {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]}, + ] + ) + + return df +""" + + +ENABLE_LIST_INFERENCE = """ +import pandas as pd + +def model(dbt, session): 
+ + dbt.config( + submission_method="serverless", + materialized="table", + enable_list_inference="true", + ) + + df = pd.DataFrame( + [ + {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]}, + ] + ) + + return df +""" + + +ENABLE_LIST_INFERENCE_PARQUET_FORMAT = """ +import pandas as pd + +def model(dbt, session): + + dbt.config( + submission_method="serverless", + materialized="table", + enable_list_inference="true", + intermediate_format="parquet", + ) + + df = pd.DataFrame( + [ + {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]}, + ] + ) + + return df +""" + + +DISABLE_LIST_INFERENCE_ORC_FORMAT = """ +import pandas as pd + +def model(dbt, session): + + dbt.config( + submission_method="serverless", + materialized="table", + enable_list_inference="false", + intermediate_format="orc", + ) + + df = pd.DataFrame( + [ + {"column_name": [{"name": "hello", "my_list": ["h", "e", "l", "l", "o"]}]}, + ] + ) + + return df + +""" diff --git a/dbt-bigquery/tests/functional/python_model_tests/test_list_inference.py b/dbt-bigquery/tests/functional/python_model_tests/test_list_inference.py new file mode 100644 index 000000000..88b1c4fa5 --- /dev/null +++ b/dbt-bigquery/tests/functional/python_model_tests/test_list_inference.py @@ -0,0 +1,37 @@ +""" +This test case addresses this regression: https://github.com/dbt-labs/dbt-bigquery/issues/1047 + +As the comments point out, the issue appears when the settings are: + - list inference: off + - intermediate format: parquet + +Adjusting either of these alleviates the issue. + +When the regression was first reported, `files.MULTI_RECORD` failed while the other models passed. +""" + +from dbt.tests.util import run_dbt_and_capture +import pytest + +from tests.functional.python_model_tests import files + + +class TestPythonListInference: + @pytest.fixture(scope="class") + def models(self): + return { + # this is what worked prior to this issue + "single_record.py": files.SINGLE_RECORD, + # this is the model that initially failed for this issue + "multi_record.py": files.MULTI_RECORD, + # these are explicit versions of the default settings + "enable_list_inference.py": files.ENABLE_LIST_INFERENCE, + "enable_list_inference_parquet_format.py": files.ENABLE_LIST_INFERENCE_PARQUET_FORMAT, + # orc format also resolves the issue, regardless of list inference + "orc_format.py": files.ORC_FORMAT, + "disable_list_inference_orc_format.py": files.DISABLE_LIST_INFERENCE_ORC_FORMAT, + } + + def test_models_success(self, project, models): + result, output = run_dbt_and_capture(["run"]) + assert len(result) == len(models) diff --git a/dbt-bigquery/tests/functional/test_cancel.py b/dbt-bigquery/tests/functional/test_cancel.py new file mode 100644 index 000000000..823687b52 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_cancel.py @@ -0,0 +1,134 @@ +import platform + +import time + +import os +import signal +import subprocess + +import pytest + +from dbt.tests.util import get_connection + +_SEED_CSV = """ +id, name, astrological_sign, moral_alignment +1, Alice, Aries, Lawful Good +2, Bob, Taurus, Neutral Good +3, Thaddeus, Gemini, Chaotic Neutral +4, Zebulon, Cancer, Lawful Evil +5, Yorick, Leo, True Neutral +6, Xavier, Virgo, Chaotic Evil +7, Wanda, Libra, Lawful Neutral +""" + +_LONG_RUNNING_MODEL_SQL = """ + {{ config(materialized='table') }} + with array_1 as ( + select generated_ids from UNNEST(GENERATE_ARRAY(1, 200000)) AS generated_ids + ), + array_2 as ( + select generated_ids from UNNEST(GENERATE_ARRAY(2, 200000)) AS 
generated_ids + ) + + SELECT array_1.generated_ids + FROM array_1 + LEFT JOIN array_1 as jnd on 1=1 + LEFT JOIN array_2 as jnd2 on 1=1 + LEFT JOIN array_1 as jnd3 on jnd3.generated_ids >= jnd2.generated_ids +""" + + +def _get_info_schema_jobs_query(project_id, dataset_id, table_id): + """ + Running this query requires roles/bigquery.resourceViewer on the project, + see: https://cloud.google.com/bigquery/docs/information-schema-jobs#required_role + :param project_id: + :param dataset_id: + :param table_id: + :return: a single job id that matches the model we tried to create and was cancelled + """ + return f""" + SELECT job_id + FROM `region-us`.`INFORMATION_SCHEMA.JOBS_BY_PROJECT` + WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 5 HOUR) + AND statement_type = 'CREATE_TABLE_AS_SELECT' + AND state = 'DONE' + AND job_id IS NOT NULL + AND project_id = '{project_id}' + AND error_result.reason = 'stopped' + AND error_result.message = 'Job execution was cancelled: User requested cancellation' + AND destination_table.table_id = '{table_id}' + AND destination_table.dataset_id = '{dataset_id}' + """ + + +def _run_dbt_in_subprocess(project, dbt_command): + + run_dbt_process = subprocess.Popen( + [ + "dbt", + dbt_command, + "--profiles-dir", + project.profiles_dir, + "--project-dir", + project.project_root, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=False, + env=os.environ.copy(), + ) + std_out_log = "" + while True: + std_out_line = run_dbt_process.stdout.readline().decode("utf-8") + std_out_log += std_out_line + if std_out_line != "": + print(std_out_line) + if "1 of 1 START" in std_out_line: + time.sleep(1) + run_dbt_process.send_signal(signal.SIGINT) + + if run_dbt_process.poll(): + break + + return std_out_log + + +def _get_job_id(project, table_name): + # Because we run this in a subprocess we have to actually call BigQuery and look up the job id + with get_connection(project.adapter): + job_id = project.run_sql( + _get_info_schema_jobs_query(project.database, project.test_schema, table_name) + ) + + return job_id + + +@pytest.mark.skipif( + platform.system() == "Windows", reason="sending SIGINT is unsupported on Windows." 
+) +class TestBigqueryCancelsQueriesOnKeyboardInterrupt: + @pytest.fixture(scope="class", autouse=True) + def models(self): + return { + "model.sql": _LONG_RUNNING_MODEL_SQL, + } + + @pytest.fixture(scope="class", autouse=True) + def seeds(self): + return { + "seed.csv": _SEED_CSV, + } + + def test_bigquery_cancels_queries_for_model_on_keyboard_interrupt(self, project): + std_out_log = _run_dbt_in_subprocess(project, "run") + + assert "CANCEL query model.test.model" in std_out_log + assert len(_get_job_id(project, "model")) == 1 + + @pytest.mark.skip(reason="cannot reliably cancel seed queries in time") + def test_bigquery_cancels_queries_for_seed_on_keyboard_interrupt(self, project): + std_out_log = _run_dbt_in_subprocess(project, "seed") + + assert "CANCEL query seed.test.seed" in std_out_log + # we can't assert the job id since we can't kill the seed process fast enough to cancel it diff --git a/dbt-bigquery/tests/functional/test_changing_partitions.py b/dbt-bigquery/tests/functional/test_changing_partitions.py new file mode 100644 index 000000000..908f1ddfe --- /dev/null +++ b/dbt-bigquery/tests/functional/test_changing_partitions.py @@ -0,0 +1,259 @@ +import pytest +import json +from dbt.tests.util import run_dbt + +_MACRO_SQL = """ +{% test number_partitions(model, expected) %} + + {%- set result = get_partitions_metadata(model) %} + + {% if result %} + {% set partitions = result.columns['partition_id'].values() %} + {% else %} + {% set partitions = () %} + {% endif %} + + {% set actual = partitions | length %} + {% set success = 1 if model and actual == expected else 0 %} + + select 'Expected {{ expected }}, but got {{ actual }}' as validation_error + from (select true) + where {{ success }} = 0 + +{% endtest %} +""" + +_MODEL_SQL = """ +{{ + config( + materialized="table", + partition_by=var('partition_by'), + cluster_by=var('cluster_by'), + partition_expiration_days=var('partition_expiration_days'), + require_partition_filter=var('require_partition_filter') + ) +}} + +select 1 as id, 'dr. bigquery' as name, current_timestamp() as cur_time, current_date() as cur_date +union all +select 2 as id, 'prof. 
bigquery' as name, current_timestamp() as cur_time, current_date() as cur_date +""" + +_SCHEMA_YML = """ +version: 2 +models: +- name: my_model + tests: + - number_partitions: + expected: "{{ var('expected', 1) }}" +""" + + +class BaseBigQueryChangingPartition: + @pytest.fixture(scope="class") + def macros(self): + return {"partition_metadata.sql": _MACRO_SQL} + + @pytest.fixture(scope="class") + def models(self): + return {"my_model.sql": _MODEL_SQL, "schema.yml": _SCHEMA_YML} + + def run_changes(self, before, after): + results = run_dbt(["run", "--vars", json.dumps(before)]) + assert len(results) == 1 + + results = run_dbt(["run", "--vars", json.dumps(after)]) + assert len(results) == 1 + + def partitions_test(self, expected): + test_results = run_dbt(["test", "--vars", json.dumps(expected)]) + + for result in test_results: + assert result.status == "pass" + assert not result.skipped + assert result.failures == 0 + + +class TestBigQueryChangingPartition(BaseBigQueryChangingPartition): + def test_bigquery_add_partition(self, project): + before = { + "partition_by": None, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_time", "data_type": "timestamp"}, + "cluster_by": None, + "partition_expiration_days": 7, + "require_partition_filter": True, + } + self.run_changes(before, after) + self.partitions_test({"expected": 1}) + + def test_bigquery_add_partition_year(self, project): + before = { + "partition_by": None, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_time", "data_type": "timestamp", "granularity": "year"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + self.partitions_test({"expected": 1}) + + def test_bigquery_add_partition_month(self, project): + before = { + "partition_by": None, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": { + "field": "cur_time", + "data_type": "timestamp", + "granularity": "month", + }, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + self.partitions_test({"expected": 1}) + + def test_bigquery_add_partition_hour(self, project): + before = { + "partition_by": {"field": "cur_time", "data_type": "timestamp", "granularity": "day"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_time", "data_type": "timestamp", "granularity": "hour"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + self.partitions_test({"expected": 1}) + + def test_bigquery_remove_partition(self, project): + before = { + "partition_by": {"field": "cur_time", "data_type": "timestamp"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": None, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + + def test_bigquery_change_partitions(self, project): + before = { + "partition_by": {"field": "cur_time", "data_type": "timestamp"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + 
} + after = { + "partition_by": {"field": "cur_date"}, + "cluster_by": None, + "partition_expiration_days": 7, + "require_partition_filter": True, + } + self.run_changes(before, after) + self.partitions_test({"expected": 1}) + self.run_changes(after, before) + self.partitions_test({"expected": 1}) + + def test_bigquery_change_partitions_from_int(self, project): + before = { + "partition_by": { + "field": "id", + "data_type": "int64", + "range": {"start": 0, "end": 10, "interval": 1}, + }, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_date", "data_type": "date"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + self.partitions_test({"expected": 1}) + self.run_changes(after, before) + self.partitions_test({"expected": 2}) + + def test_bigquery_add_clustering(self, project): + before = { + "partition_by": {"field": "cur_time", "data_type": "timestamp"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_date"}, + "cluster_by": "id", + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + + def test_bigquery_remove_clustering(self, project): + before = { + "partition_by": {"field": "cur_time", "data_type": "timestamp"}, + "cluster_by": "id", + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_date"}, + "cluster_by": None, + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + + def test_bigquery_change_clustering(self, project): + before = { + "partition_by": {"field": "cur_time", "data_type": "timestamp"}, + "cluster_by": "id", + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_date"}, + "cluster_by": "name", + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) + + def test_bigquery_change_clustering_strict(self, project): + before = { + "partition_by": {"field": "cur_time", "data_type": "timestamp"}, + "cluster_by": "id", + "partition_expiration_days": None, + "require_partition_filter": None, + } + after = { + "partition_by": {"field": "cur_date", "data_type": "date"}, + "cluster_by": "name", + "partition_expiration_days": None, + "require_partition_filter": None, + } + self.run_changes(before, after) diff --git a/dbt-bigquery/tests/functional/test_delete_column_policy.py b/dbt-bigquery/tests/functional/test_delete_column_policy.py new file mode 100644 index 000000000..73b1c1f28 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_delete_column_policy.py @@ -0,0 +1,73 @@ +import pytest +from dbt.tests.util import run_dbt, get_connection, relation_from_name, write_config_file + +from dbt.adapters.bigquery import BigQueryRelation + +_POLICY_TAG_MODEL = """{{ + config( + materialized='table', + persist_docs={ 'columns': true } + ) +}} + +select + struct( + 1 as field + ) as first_struct +""" + +_POLICY_TAG_YML = """version: 2 + +models: +- name: policy_tag_table + columns: + - name: first_struct + - name: first_struct.field + policy_tags: + - '{{ var("policy_tag") }}' +""" + +_POLICY_TAG_YML_NO_POLICY_TAGS = """version: 2 + +models: +- name: policy_tag_table + columns: + - name: first_struct + - name: 
first_struct.field +""" + +# Manually generated https://console.cloud.google.com/bigquery/policy-tags?project=dbt-test-env +_POLICY_TAG = "projects/dbt-test-env/locations/us/taxonomies/5785568062805976401/policyTags/135489647357012267" +_POLICY_TAG_MODEL_NAME = "policy_tag_table" + + +class TestBigqueryDeleteColumnPolicy: + """See BQ docs for more info on policy tags: + https://cloud.google.com/bigquery/docs/column-level-security#work_with_policy_tags""" + + @pytest.fixture(scope="class") + def project_config_update(self): + return {"config-version": 2, "vars": {"policy_tag": _POLICY_TAG}} + + @pytest.fixture(scope="class") + def models(self): + return {f"{_POLICY_TAG_MODEL_NAME}.sql": _POLICY_TAG_MODEL, "schema.yml": _POLICY_TAG_YML} + + def test_bigquery_delete_column_policy_tag(self, project): + results = run_dbt(["run", "-f", "--models", "policy_tag_table"]) + assert len(results) == 1 + write_config_file( + _POLICY_TAG_YML_NO_POLICY_TAGS, project.project_root + "/models", "schema.yml" + ) # update the model to remove the policy tag + new_results = run_dbt(["run", "-f", "--models", "policy_tag_table"]) + assert len(new_results) == 1 + relation: BigQueryRelation = relation_from_name(project.adapter, _POLICY_TAG_MODEL_NAME) + adapter = project.adapter + with get_connection(project.adapter) as conn: + table = conn.handle.get_table( + adapter.connections.get_bq_table( + relation.database, relation.schema, relation.table + ) + ) + for schema_field in table.schema: + assert schema_field.policy_tags is None diff --git a/dbt-bigquery/tests/functional/test_drop_temp_relation.py b/dbt-bigquery/tests/functional/test_drop_temp_relation.py new file mode 100644 index 000000000..4cdfaedae --- /dev/null +++ b/dbt-bigquery/tests/functional/test_drop_temp_relation.py @@ -0,0 +1,60 @@ +import pytest +from google.api_core.exceptions import NotFound +from dbt.adapters.bigquery.relation import BigQueryRelation +from dbt.tests.util import run_dbt, get_connection, relation_from_name + + +_INCREMENTAL_MODEL = """ +{{ + config( + materialized="incremental", + on_schema_change="sync_all_columns" + ) +}} + select 20 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 40 as id, cast('2020-01-01 02:00:00' as datetime) as date_hour +""" + +_INCREMENTAL_MODEL_YAML = """version: 2 +models: +- name: test_drop_relation + columns: + - name: id + type: int64 + - name: date_hour + type: datetime +""" + + +class BaseIncrementalModelConfig: + @pytest.fixture(scope="class") + def models(self): + return { + "test_drop_relation.sql": _INCREMENTAL_MODEL, + "schema.yml": _INCREMENTAL_MODEL_YAML, + } + + +class TestIncrementalModel(BaseIncrementalModelConfig): + def test_incremental_model_succeeds(self, project): + """ + Steps: + 1. Create the model + 2. Merge into the model using __dbt_tmp table + 3. 
Assert raises NotFound exception + """ + results = run_dbt(["run"]) + assert len(results) == 1 + results = run_dbt(["run"]) + assert len(results) == 1 + relation: BigQueryRelation = relation_from_name( + project.adapter, "test_drop_relation__dbt_tmp" + ) + adapter = project.adapter + with pytest.raises(NotFound): + with get_connection(project.adapter) as conn: + conn.handle.get_table( + adapter.connections.get_bq_table( + relation.database, relation.schema, relation.table + ) + ) diff --git a/dbt-bigquery/tests/functional/test_get_columns_incomplete_database.py b/dbt-bigquery/tests/functional/test_get_columns_incomplete_database.py new file mode 100644 index 000000000..4fd92cdb2 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_get_columns_incomplete_database.py @@ -0,0 +1,53 @@ +import pytest +from dbt.tests.util import run_dbt + +# This is to test an edge case found in https://github.com/dbt-labs/dbt-bigquery/pull/165/files + +tests__get_cols_in_sql = """ +{% test get_cols_in(model) %} + + {# The step which causes the issue #} + {%- set relation = api.Relation.create(identifier=model.table) if execute -%} + + {% set columns = adapter.get_columns_in_relation(relation) %} + + select + {% for col in columns %} + {{ col.name }} {{ "," if not loop.last }} + {% endfor %} + + from {{ model }} + limit 0 + +{% endtest %} +""" + +models__my_model = """select 1 as id, 'text' as another_col +""" + +properties__model_yml = """ +version: 2 +models: + - name: my_model + tests: + - get_cols_in +""" + + +class TestIncompleteRelationSetup: + @pytest.fixture(scope="class") + def properties(self): + return {"properties__model_yml.yml": properties__model_yml} + + @pytest.fixture(scope="class") + def macros(self): + return {"get_col_in.sql": tests__get_cols_in_sql} + + @pytest.fixture(scope="class") + def models(self): + return {"my_model.sql": models__my_model} + + +class TestIncompleteRelation(TestIncompleteRelationSetup): + def test_incomplete_relation(self, project): + run_dbt(["build"]) diff --git a/dbt-bigquery/tests/functional/test_hours_to_expiration.py b/dbt-bigquery/tests/functional/test_hours_to_expiration.py new file mode 100644 index 000000000..8dbc71149 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_hours_to_expiration.py @@ -0,0 +1,26 @@ +import pytest +from dbt.tests.util import run_dbt_and_capture + +_MODEL_SQL = """ +select 1 as id +""" + + +class BaseBigQueryHoursToExpiration: + @pytest.fixture(scope="class") + def models(self): + return { + "model.sql": _MODEL_SQL, + } + + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "models": {"test": {"materialized": "table", "model": {"hours_to_expiration": "4"}}} + } + + +class TestBigQueryHoursToExpiration(BaseBigQueryHoursToExpiration): + def test_bigquery_hours_to_expiration(self, project): + _, stdout = run_dbt_and_capture(["--debug", "run"]) + assert "expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 4 hour)" in stdout diff --git a/dbt-bigquery/tests/functional/test_incremental_materialization.py b/dbt-bigquery/tests/functional/test_incremental_materialization.py new file mode 100644 index 000000000..dce74db7c --- /dev/null +++ b/dbt-bigquery/tests/functional/test_incremental_materialization.py @@ -0,0 +1,41 @@ +import pytest +from dbt.tests.util import run_dbt + +# This is a short-term hack; we need to go back +# and make adapter implementations of: +# https://github.com/dbt-labs/dbt-core/pull/6330 + +_INCREMENTAL_MODEL = """ +{{ + config( + materialized="incremental", + ) +}} + +{% if 
not is_incremental() %} + + select 10 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 30 as id, cast('2020-01-01 02:00:00' as datetime) as date_hour + +{% else %} + + select 20 as id, cast('2020-01-01 01:00:00' as datetime) as date_hour union all + select 40 as id, cast('2020-01-01 02:00:00' as datetime) as date_hour + +{% endif %} +-- Test Comment To Prevent Recurrence of https://github.com/dbt-labs/dbt-core/issues/6485 +""" + + +class BaseIncrementalModelConfig: + @pytest.fixture(scope="class") + def models(self): + return {"test_incremental.sql": _INCREMENTAL_MODEL} + + +class TestIncrementalModel(BaseIncrementalModelConfig): + def test_incremental_model_succeeds(self, project): + results = run_dbt(["run"]) + assert len(results) == 1 + results = run_dbt(["run"]) + assert len(results) == 1 diff --git a/dbt-bigquery/tests/functional/test_job_timeout.py b/dbt-bigquery/tests/functional/test_job_timeout.py new file mode 100644 index 000000000..57172e133 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_job_timeout.py @@ -0,0 +1,63 @@ +import pytest + +from dbt.tests.util import run_dbt + +_REASONABLE_TIMEOUT = 300 +_SHORT_TIMEOUT = 1 + +_LONG_RUNNING_MODEL_SQL = """ + {{ config(materialized='table') }} + with array_1 as ( + select generated_ids from UNNEST(GENERATE_ARRAY(1, 200000)) AS generated_ids + ), + array_2 as ( + select generated_ids from UNNEST(GENERATE_ARRAY(2, 200000)) AS generated_ids + ) + + SELECT array_1.generated_ids + FROM array_1 + LEFT JOIN array_1 as jnd on 1=1 + LEFT JOIN array_2 as jnd2 on 1=1 + LEFT JOIN array_1 as jnd3 on jnd3.generated_ids >= jnd2.generated_ids +""" + +_SHORT_RUNNING_QUERY = """ + SELECT 1 as id + """ + + +class TestSuccessfulJobRun: + @pytest.fixture(scope="class") + def models(self): + return { + "model.sql": _SHORT_RUNNING_QUERY, + } + + @pytest.fixture(scope="class") + def profiles_config_update(self, dbt_profile_target): + outputs = {"default": dbt_profile_target} + outputs["default"]["job_execution_timeout_seconds"] = _REASONABLE_TIMEOUT + return {"test": {"outputs": outputs, "target": "default"}} + + def test_bigquery_job_run_succeeds_within_timeout(self, project): + result = run_dbt() + assert len(result) == 1 + + +class TestJobTimeout: + @pytest.fixture(scope="class") + def models(self): + return { + "model.sql": _LONG_RUNNING_MODEL_SQL, + } + + @pytest.fixture(scope="class") + def profiles_config_update(self, dbt_profile_target): + outputs = {"default": dbt_profile_target} + outputs["default"]["job_execution_timeout_seconds"] = _SHORT_TIMEOUT + return {"test": {"outputs": outputs, "target": "default"}} + + def test_job_timeout(self, project): + result = run_dbt(["run"], expect_pass=False)  # project setup will fail + expected_error = f"Operation did not complete within the designated timeout of {_SHORT_TIMEOUT} seconds." 
+ assert expected_error in result[0].message diff --git a/dbt-bigquery/tests/functional/test_location_change.py b/dbt-bigquery/tests/functional/test_location_change.py new file mode 100644 index 000000000..930c6b04d --- /dev/null +++ b/dbt-bigquery/tests/functional/test_location_change.py @@ -0,0 +1,40 @@ +import pytest +import os +from dbt.tests.util import run_dbt + +_MODEL_SQL = """ +select 1 as id +""" + +_INVALID_LOCATION = os.getenv("DBT_TEST_BIGQUERY_BAD_LOCATION", "northamerica-northeast1") +_VALID_LOCATION = os.getenv("DBT_TEST_BIGQUERY_INITIAL_LOCATION", "US") + + +class BaseBigQueryLocation: + @pytest.fixture(scope="class") + def models(self): + return { + "model.sql": _MODEL_SQL, + } + + +class TestBigqueryValidLocation(BaseBigQueryLocation): + def test_bigquery_valid_location(self, project): + results = run_dbt() + for result in results: + assert "US" == result.adapter_response["location"] + + +class TestBigqueryInvalidLocation(BaseBigQueryLocation): + @pytest.fixture(scope="class") + def profiles_config_update(self, dbt_profile_target): + outputs = {"default": dbt_profile_target} + outputs["default"]["location"] = _INVALID_LOCATION + yield + outputs = {"default": dbt_profile_target} + outputs["default"]["location"] = _VALID_LOCATION + + def test_bigquery_location_invalid(self, project): + results = run_dbt() + for result in results: + assert "northamerica-northeast1" == result.adapter_response["location"] diff --git a/dbt-bigquery/tests/functional/test_override_database/fixtures.py b/dbt-bigquery/tests/functional/test_override_database/fixtures.py new file mode 100644 index 000000000..470f42552 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_override_database/fixtures.py @@ -0,0 +1,73 @@ +import pytest +from dbt.tests.fixtures.project import write_project_files + + +models__view_2_sql = """ +{%- if target.type == 'bigquery' -%} + {{ config(project=var('alternate_db')) }} +{%- else -%} + {{ config(database=var('alternate_db')) }} +{%- endif -%} +select * from {{ ref('seed') }} + +""" + +models__view_1_sql = """ +{# + We are running against a database that must be quoted. 
+ These calls ensure that we trigger an error if we're failing to quote at parse-time +#} +{% do adapter.already_exists(this.schema, this.table) %} +{% do adapter.get_relation(this.database, this.schema, this.table) %} +select * from {{ ref('seed') }} + +""" + +models__subfolder__view_4_sql = """ +{{ + config(database=var('alternate_db')) +}} + +select * from {{ ref('seed') }} + +""" + +models__subfolder__view_3_sql = """ +select * from {{ ref('seed') }} + +""" + +seeds__seed_csv = """id,name +1,a +2,b +3,c +4,d +5,e +""" + + +@pytest.fixture(scope="class") +def models(): + return { + "view_2.sql": models__view_2_sql, + "view_1.sql": models__view_1_sql, + "subfolder": { + "view_4.sql": models__subfolder__view_4_sql, + "view_3.sql": models__subfolder__view_3_sql, + }, + } + + +@pytest.fixture(scope="class") +def seeds(): + return {"seed.csv": seeds__seed_csv} + + +@pytest.fixture(scope="class") +def project_files( + project_root, + models, + seeds, +): + write_project_files(project_root, "models", models) + write_project_files(project_root, "seeds", seeds) diff --git a/dbt-bigquery/tests/functional/test_override_database/test_override_database.py b/dbt-bigquery/tests/functional/test_override_database/test_override_database.py new file mode 100644 index 000000000..32af1fd19 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_override_database/test_override_database.py @@ -0,0 +1,175 @@ +import pytest +import os +from dbt.tests.util import run_dbt, check_relations_equal_with_relations + +from tests.functional.test_override_database.fixtures import ( # noqa: F401 + models, + seeds, + project_files, +) + +ALT_DATABASE = os.getenv("BIGQUERY_TEST_ALT_DATABASE") + + +class BaseOverrideDatabase: + @pytest.fixture(scope="class") + def model_path(self): + return "models" + + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "seed-paths": ["seeds"], + "vars": { + "alternate_db": ALT_DATABASE, + }, + "quoting": { + "database": True, + }, + "seeds": { + "quote_columns": False, + }, + } + + @pytest.fixture(scope="function") + def clean_up(self, project): + yield + relation = project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema + ) + project.adapter.drop_schema(relation) + + +class TestModelOverrideBigQuery(BaseOverrideDatabase): + def run_database_override(self, project): + run_dbt(["seed"]) + assert len(run_dbt(["run"])) == 4 + check_relations_equal_with_relations( + project.adapter, + [ + project.adapter.Relation.create(schema=project.test_schema, identifier="seed"), + project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema, identifier="view_2" + ), + project.adapter.Relation.create(schema=project.test_schema, identifier="view_1"), + project.adapter.Relation.create(schema=project.test_schema, identifier="view_3"), + project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema, identifier="view_4" + ), + ], + ) + + def test_bigquery_database_override(self, project, clean_up): + self.run_database_override(project) + + +class BaseTestProjectModelOverrideBigQuery(BaseOverrideDatabase): + def run_database_override(self, project): + run_dbt(["seed"]) + assert len(run_dbt(["run"])) == 4 + self.assertExpectedRelations(project) + + def assertExpectedRelations(self, project): + check_relations_equal_with_relations( + project.adapter, + [ + project.adapter.Relation.create(schema=project.test_schema, identifier="seed"), + project.adapter.Relation.create( + 
database=ALT_DATABASE, schema=project.test_schema, identifier="view_2" + ), + project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema, identifier="view_1" + ), + project.adapter.Relation.create(schema=project.test_schema, identifier="view_3"), + project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema, identifier="view_4" + ), + ], + ) + + +class TestProjectModelOverrideBigQuery(BaseTestProjectModelOverrideBigQuery): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "models": { + "database": ALT_DATABASE, + "test": {"subfolder": {"database": "{{ target.database }}"}}, + }, + "seed-paths": ["seeds"], + "vars": { + "alternate_db": ALT_DATABASE, + }, + "quoting": { + "database": True, + }, + "seeds": { + "quote_columns": False, + }, + } + + def test_bigquery_database_override(self, project, clean_up): + self.run_database_override(project) + + +class TestProjectModelAliasOverrideBigQuery(BaseTestProjectModelOverrideBigQuery): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "models": { + "project": ALT_DATABASE, + "test": {"subfolder": {"project": "{{ target.database }}"}}, + }, + "seed-paths": ["seeds"], + "vars": { + "alternate_db": ALT_DATABASE, + }, + "quoting": { + "database": True, + }, + "seeds": { + "quote_columns": False, + }, + } + + def test_bigquery_project_override(self, project, clean_up): + self.run_database_override(project) + + +class TestProjectSeedOverrideBigQuery(BaseOverrideDatabase): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "seed-paths": ["seeds"], + "vars": { + "alternate_db": ALT_DATABASE, + }, + "seeds": {"database": ALT_DATABASE}, + } + + def run_database_override(self, project): + run_dbt(["seed"]) + assert len(run_dbt(["run"])) == 4 + check_relations_equal_with_relations( + project.adapter, + [ + project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema, identifier="seed" + ), + project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema, identifier="view_2" + ), + project.adapter.Relation.create(schema=project.test_schema, identifier="view_1"), + project.adapter.Relation.create(schema=project.test_schema, identifier="view_3"), + project.adapter.Relation.create( + database=ALT_DATABASE, schema=project.test_schema, identifier="view_4" + ), + ], + ) + + def test_bigquery_database_override(self, project, clean_up): + self.run_database_override(project) diff --git a/dbt-bigquery/tests/functional/test_quota_project.py b/dbt-bigquery/tests/functional/test_quota_project.py new file mode 100644 index 000000000..0b4bb90c4 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_quota_project.py @@ -0,0 +1,27 @@ +import os + +import pytest + +from dbt.tests.util import run_dbt + +_QUOTA_PROJECT = os.getenv("BIGQUERY_TEST_ALT_DATABASE") + + +class TestNoQuotaProject: + def test_no_quota_project(self, project): + results = run_dbt() + for result in results: + assert None == result.adapter_response["quota_project"] + + +class TestQuotaProjectOption: + @pytest.fixture(scope="class") + def profiles_config_update(self, dbt_profile_target): + outputs = {"default": dbt_profile_target} + outputs["default"]["quota_project"] = _QUOTA_PROJECT + yield + + def test_quota_project_option(self, project): + results = run_dbt() + for result in results: + assert _QUOTA_PROJECT == result.adapter_response["quota_project"] 
diff --git a/dbt-bigquery/tests/functional/test_update_column_policy.py b/dbt-bigquery/tests/functional/test_update_column_policy.py new file mode 100644 index 000000000..bd7fb8f58 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_update_column_policy.py @@ -0,0 +1,56 @@ +import pytest +from dbt.tests.util import run_dbt, get_connection, relation_from_name + +from dbt.adapters.bigquery import BigQueryRelation + +_POLICY_TAG_MODEL = """{{ + config( + materialized='table', + persist_docs={ 'columns': true } + ) +}} + +select + 1 field +""" + +_POLICY_TAG_YML = """version: 2 + +models: +- name: policy_tag_table + columns: + - name: field + policy_tags: + - '{{ var("policy_tag") }}' +""" + +# Manually generated https://console.cloud.google.com/bigquery/policy-tags?project=dbt-test-env +_POLICY_TAG = "projects/dbt-test-env/locations/us/taxonomies/5785568062805976401/policyTags/135489647357012267" +_POLICY_TAG_MODEL_NAME = "policy_tag_table" + + +class TestBigqueryUpdateColumnPolicy: + """See BQ docs for more info on policy tags: + https://cloud.google.com/bigquery/docs/column-level-security#work_with_policy_tags""" + + @pytest.fixture(scope="class") + def project_config_update(self): + return {"config-version": 2, "vars": {"policy_tag": _POLICY_TAG}} + + @pytest.fixture(scope="class") + def models(self): + return {f"{_POLICY_TAG_MODEL_NAME}.sql": _POLICY_TAG_MODEL, "schema.yml": _POLICY_TAG_YML} + + def test_bigquery_update_column_policy_tag(self, project): + results = run_dbt(["run", "--models", "policy_tag_table"]) + assert len(results) == 1 + relation: BigQueryRelation = relation_from_name(project.adapter, _POLICY_TAG_MODEL_NAME) + adapter = project.adapter + with get_connection(project.adapter) as conn: + table = conn.handle.get_table( + adapter.connections.get_bq_table( + relation.database, relation.schema, relation.table + ) + ) + for schema_field in table.schema: + assert schema_field.policy_tags.names == (_POLICY_TAG,) diff --git a/dbt-bigquery/tests/functional/test_update_field_description.py b/dbt-bigquery/tests/functional/test_update_field_description.py new file mode 100644 index 000000000..56d7a1754 --- /dev/null +++ b/dbt-bigquery/tests/functional/test_update_field_description.py @@ -0,0 +1,55 @@ +import pytest +from dbt.tests.util import relation_from_name, get_connection, run_dbt + +from dbt.adapters.bigquery import BigQueryRelation + +_FIELD_DESCRIPTION_MODEL = """{{ + config( + materialized='table', + persist_docs={ 'columns': true } + ) +}} + +select + 1 field +""" +_FIELD_DESCRIPTION_MODEL_NAME = "field_description_model" +_FIELD_DESCRIPTION = "this is not a field" +_FIELD_DESCRIPTION_MODEL_YML = """ +version: 2 + +models: +- name: field_description_model + columns: + - name: field + description: '{{ var("field_description") }}' +""" + + +class TestBigqueryUpdateColumnDescription: + @pytest.fixture(scope="class") + def project_config_update(self): + return {"config-version": 2, "vars": {"field_description": _FIELD_DESCRIPTION}} + + @pytest.fixture(scope="class") + def models(self): + return { + f"{_FIELD_DESCRIPTION_MODEL_NAME}.sql": _FIELD_DESCRIPTION_MODEL, + "schema.yml": _FIELD_DESCRIPTION_MODEL_YML, + } + + def test_bigquery_update_column_description(self, project): + results = run_dbt(["run"]) + assert len(results) == 1 + relation: BigQueryRelation = relation_from_name( + project.adapter, _FIELD_DESCRIPTION_MODEL_NAME + ) + adapter = project.adapter + with get_connection(project.adapter) as conn: + table = conn.handle.get_table( + adapter.connections.get_bq_table( 
+ relation.database, relation.schema, relation.table + ) + ) + for schema_field in table.schema: + assert schema_field.description == _FIELD_DESCRIPTION diff --git a/dbt-bigquery/tests/unit/__init__.py b/dbt-bigquery/tests/unit/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dbt-bigquery/tests/unit/mock_adapter.py b/dbt-bigquery/tests/unit/mock_adapter.py new file mode 100644 index 000000000..885854261 --- /dev/null +++ b/dbt-bigquery/tests/unit/mock_adapter.py @@ -0,0 +1,84 @@ +from unittest import mock + +from dbt.adapters.base import BaseAdapter +from contextlib import contextmanager + + +def adapter_factory(): + class MockAdapter(BaseAdapter): + ConnectionManager = mock.MagicMock(TYPE="mock") + responder = mock.MagicMock() + # some convenient defaults + responder.quote.side_effect = lambda identifier: '"{}"'.format(identifier) + responder.date_function.side_effect = lambda: "unitdate()" + responder.is_cancelable.side_effect = lambda: False + + @contextmanager + def exception_handler(self, *args, **kwargs): + self.responder.exception_handler(*args, **kwargs) + yield + + def execute(self, *args, **kwargs): + return self.responder.execute(*args, **kwargs) + + def drop_relation(self, *args, **kwargs): + return self.responder.drop_relation(*args, **kwargs) + + def truncate_relation(self, *args, **kwargs): + return self.responder.truncate_relation(*args, **kwargs) + + def rename_relation(self, *args, **kwargs): + return self.responder.rename_relation(*args, **kwargs) + + def get_columns_in_relation(self, *args, **kwargs): + return self.responder.get_columns_in_relation(*args, **kwargs) + + def expand_column_types(self, *args, **kwargs): + return self.responder.expand_column_types(*args, **kwargs) + + def list_relations_without_caching(self, *args, **kwargs): + return self.responder.list_relations_without_caching(*args, **kwargs) + + def create_schema(self, *args, **kwargs): + return self.responder.create_schema(*args, **kwargs) + + def drop_schema(self, *args, **kwargs): + return self.responder.drop_schema(*args, **kwargs) + + @classmethod + def quote(cls, identifier): + return cls.responder.quote(identifier) + + def convert_text_type(self, *args, **kwargs): + return self.responder.convert_text_type(*args, **kwargs) + + def convert_number_type(self, *args, **kwargs): + return self.responder.convert_number_type(*args, **kwargs) + + def convert_integer_type(self, *args, **kwargs): + return self.responder.convert_integer_type(*args, **kwargs) + + def convert_boolean_type(self, *args, **kwargs): + return self.responder.convert_boolean_type(*args, **kwargs) + + def convert_datetime_type(self, *args, **kwargs): + return self.responder.convert_datetime_type(*args, **kwargs) + + def convert_date_type(self, *args, **kwargs): + return self.responder.convert_date_type(*args, **kwargs) + + def convert_time_type(self, *args, **kwargs): + return self.responder.convert_time_type(*args, **kwargs) + + def list_schemas(self, *args, **kwargs): + return self.responder.list_schemas(*args, **kwargs) + + @classmethod + def date_function(cls): + return cls.responder.date_function() + + @classmethod + def is_cancelable(cls): + return cls.responder.is_cancelable() + + return MockAdapter diff --git a/dbt-bigquery/tests/unit/test_bigquery_adapter.py b/dbt-bigquery/tests/unit/test_bigquery_adapter.py new file mode 100644 index 000000000..e57db9a62 --- /dev/null +++ b/dbt-bigquery/tests/unit/test_bigquery_adapter.py @@ -0,0 +1,995 @@ +from multiprocessing import get_context +from unittest import 
mock + +import agate +import decimal +import string +import random +import re +import pytest +import unittest +from unittest.mock import patch, MagicMock, create_autospec + +import dbt_common.dataclass_schema +import dbt_common.exceptions.base + +import dbt.adapters +from dbt.adapters.bigquery.relation_configs import PartitionConfig +from dbt.adapters.bigquery import BigQueryAdapter, BigQueryRelation +from google.cloud.bigquery.table import Table +from dbt.adapters.bigquery.connections import _sanitize_label, _VALIDATE_LABEL_LENGTH_LIMIT +from dbt_common.clients import agate_helper +import dbt_common.exceptions +from dbt.context.query_header import generate_query_header_context +from dbt.contracts.files import FileHash +from dbt.contracts.graph.manifest import ManifestStateCheck +from dbt.context.providers import RuntimeConfigObject, generate_runtime_macro_context + +from google.cloud.bigquery import AccessEntry + +from .utils import ( + config_from_parts_or_dicts, + inject_adapter, + TestAdapterConversions, + load_internal_manifest_macros, + mock_connection, +) + + +def _bq_conn(): + conn = MagicMock() + conn.get.side_effect = lambda x: "bigquery" if x == "type" else None + return conn + + +class BaseTestBigQueryAdapter(unittest.TestCase): + def setUp(self): + self.raw_profile = { + "outputs": { + "oauth": { + "type": "bigquery", + "method": "oauth", + "project": "dbt-unit-000000", + "schema": "dummy_schema", + "threads": 1, + }, + "service_account": { + "type": "bigquery", + "method": "service-account", + "project": "dbt-unit-000000", + "schema": "dummy_schema", + "keyfile": "/tmp/dummy-service-account.json", + "threads": 1, + }, + "loc": { + "type": "bigquery", + "method": "oauth", + "project": "dbt-unit-000000", + "schema": "dummy_schema", + "threads": 1, + "location": "Luna Station", + "priority": "batch", + "maximum_bytes_billed": 0, + }, + "impersonate": { + "type": "bigquery", + "method": "oauth", + "project": "dbt-unit-000000", + "schema": "dummy_schema", + "threads": 1, + "impersonate_service_account": "dummyaccount@dbt.iam.gserviceaccount.com", + }, + "oauth-credentials-token": { + "type": "bigquery", + "method": "oauth-secrets", + "token": "abc", + "project": "dbt-unit-000000", + "schema": "dummy_schema", + "threads": 1, + "location": "Luna Station", + "priority": "batch", + "maximum_bytes_billed": 0, + }, + "oauth-credentials": { + "type": "bigquery", + "method": "oauth-secrets", + "client_id": "abc", + "client_secret": "def", + "refresh_token": "ghi", + "token_uri": "jkl", + "project": "dbt-unit-000000", + "schema": "dummy_schema", + "threads": 1, + "location": "Luna Station", + "priority": "batch", + "maximum_bytes_billed": 0, + }, + "oauth-no-project": { + "type": "bigquery", + "method": "oauth", + "schema": "dummy_schema", + "threads": 1, + "location": "Solar Station", + }, + "dataproc-serverless-configured": { + "type": "bigquery", + "method": "oauth", + "schema": "dummy_schema", + "threads": 1, + "gcs_bucket": "dummy-bucket", + "dataproc_region": "europe-west1", + "submission_method": "serverless", + "dataproc_batch": { + "environment_config": { + "execution_config": { + "service_account": "dbt@dummy-project.iam.gserviceaccount.com", + "subnetwork_uri": "dataproc", + "network_tags": ["foo", "bar"], + } + }, + "labels": {"dbt": "rocks", "number": "1"}, + "runtime_config": { + "properties": { + "spark.executor.instances": "4", + "spark.driver.memory": "1g", + } + }, + }, + }, + "dataproc-serverless-default": { + "type": "bigquery", + "method": "oauth", + "schema": 
"dummy_schema", + "threads": 1, + "gcs_bucket": "dummy-bucket", + "dataproc_region": "europe-west1", + "submission_method": "serverless", + }, + }, + "target": "oauth", + } + + self.project_cfg = { + "name": "X", + "version": "0.1", + "project-root": "/tmp/dbt/does-not-exist", + "profile": "default", + "config-version": 2, + } + self.qh_patch = None + + @mock.patch("dbt.parser.manifest.ManifestLoader.build_manifest_state_check") + def _mock_state_check(self): + all_projects = self.all_projects + return ManifestStateCheck( + vars_hash=FileHash.from_contents("vars"), + project_hashes={name: FileHash.from_contents(name) for name in all_projects}, + profile_hash=FileHash.from_contents("profile"), + ) + + self.load_state_check = mock.patch( + "dbt.parser.manifest.ManifestLoader.build_manifest_state_check" + ) + self.mock_state_check = self.load_state_check.start() + self.mock_state_check.side_effect = _mock_state_check + + def tearDown(self): + if self.qh_patch: + self.qh_patch.stop() + super().tearDown() + + def get_adapter(self, target) -> BigQueryAdapter: + project = self.project_cfg.copy() + profile = self.raw_profile.copy() + profile["target"] = target + config = config_from_parts_or_dicts( + project=project, + profile=profile, + ) + adapter = BigQueryAdapter(config, get_context("spawn")) + adapter.set_macro_resolver(load_internal_manifest_macros(config)) + adapter.set_macro_context_generator(generate_runtime_macro_context) + adapter.connections.set_query_header( + generate_query_header_context(config, adapter.get_macro_resolver()) + ) + + self.qh_patch = patch.object(adapter.connections.query_header, "add") + self.mock_query_header_add = self.qh_patch.start() + self.mock_query_header_add.side_effect = lambda q: "/* dbt */\n{}".format(q) + + inject_adapter(adapter) + return adapter + + +class TestBigQueryAdapterAcquire(BaseTestBigQueryAdapter): + @patch( + "dbt.adapters.bigquery.credentials._create_bigquery_defaults", + return_value=("credentials", "project_id"), + ) + @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_oauth_no_project_validations( + self, mock_open_connection, mock_get_bigquery_defaults + ): + adapter = self.get_adapter("oauth-no-project") + mock_get_bigquery_defaults.assert_called_once() + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + except BaseException: + raise + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_oauth_validations(self, mock_open_connection): + adapter = self.get_adapter("oauth") + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + except BaseException: + raise + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + @patch( + "dbt.adapters.bigquery.credentials._create_bigquery_defaults", + return_value=("credentials", "project_id"), + ) + @patch( + "dbt.adapters.bigquery.connections.BigQueryConnectionManager.open", return_value=_bq_conn() + ) + def test_acquire_connection_dataproc_serverless( + self, 
mock_open_connection, mock_get_bigquery_defaults + ): + adapter = self.get_adapter("dataproc-serverless-configured") + mock_get_bigquery_defaults.assert_called_once() + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + + except dbt_common.exceptions.ValidationException as e: + self.fail("got ValidationException: {}".format(str(e))) + + except BaseException: + raise + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_service_account_validations(self, mock_open_connection): + adapter = self.get_adapter("service_account") + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + except BaseException: + raise + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_oauth_token_validations(self, mock_open_connection): + adapter = self.get_adapter("oauth-credentials-token") + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + except BaseException: + raise + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_oauth_credentials_validations(self, mock_open_connection): + adapter = self.get_adapter("oauth-credentials") + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + except BaseException: + raise + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_impersonated_service_account_validations( + self, mock_open_connection + ): + adapter = self.get_adapter("impersonate") + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + except BaseException: + raise + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + @patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_priority(self, mock_open_connection): + adapter = self.get_adapter("loc") + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + self.assertEqual(connection.credentials.priority, "batch") + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + 
@patch("dbt.adapters.bigquery.BigQueryConnectionManager.open", return_value=_bq_conn()) + def test_acquire_connection_maximum_bytes_billed(self, mock_open_connection): + adapter = self.get_adapter("loc") + try: + connection = adapter.acquire_connection("dummy") + self.assertEqual(connection.type, "bigquery") + self.assertEqual(connection.credentials.maximum_bytes_billed, 0) + + except dbt_common.exceptions.base.DbtValidationError as e: + self.fail("got DbtValidationError: {}".format(str(e))) + + mock_open_connection.assert_not_called() + connection.handle + mock_open_connection.assert_called_once() + + def test_cancel_open_connections_empty(self): + adapter = self.get_adapter("oauth") + self.assertEqual(len(list(adapter.cancel_open_connections())), 0) + + def test_cancel_open_connections_master(self): + adapter = self.get_adapter("oauth") + key = adapter.connections.get_thread_identifier() + adapter.connections.thread_connections[key] = mock_connection("master") + self.assertEqual(len(list(adapter.cancel_open_connections())), 0) + + def test_cancel_open_connections_single(self): + adapter = self.get_adapter("oauth") + master = mock_connection("master") + model = mock_connection("model") + key = adapter.connections.get_thread_identifier() + + adapter.connections.thread_connections.update({key: master, 1: model}) + self.assertEqual(len(list(adapter.cancel_open_connections())), 1) + + @patch("dbt.adapters.bigquery.clients.ClientOptions") + @patch("dbt.adapters.bigquery.credentials.default") + @patch("dbt.adapters.bigquery.clients.BigQueryClient") + def test_location_user_agent(self, MockClient, mock_auth_default, MockClientOptions): + creds = MagicMock() + mock_auth_default.return_value = (creds, MagicMock()) + adapter = self.get_adapter("loc") + + connection = adapter.acquire_connection("dummy") + mock_client_options = MockClientOptions.return_value + + MockClient.assert_not_called() + connection.handle + MockClient.assert_called_once_with( + "dbt-unit-000000", + creds, + location="Luna Station", + client_info=HasUserAgent(), + client_options=mock_client_options, + ) + + +class HasUserAgent: + PAT = re.compile(r"dbt-bigquery-\d+\.\d+\.\d+((a|b|rc)\d+)?") + + def __eq__(self, other): + compare = getattr(other, "user_agent", "") + return bool(self.PAT.match(compare)) + + +class TestConnectionNamePassthrough(BaseTestBigQueryAdapter): + def setUp(self): + super().setUp() + self._conn_patch = patch.object(BigQueryAdapter, "ConnectionManager") + self.conn_manager_cls = self._conn_patch.start() + + self._relation_patch = patch.object(BigQueryAdapter, "Relation") + self.relation_cls = self._relation_patch.start() + + self.mock_connection_manager = self.conn_manager_cls.return_value + self.mock_connection_manager.get_if_exists().name = "mock_conn_name" + self.conn_manager_cls.TYPE = "bigquery" + self.relation_cls.get_default_quote_policy.side_effect = ( + BigQueryRelation.get_default_quote_policy + ) + + self.adapter = self.get_adapter("oauth") + + def tearDown(self): + super().tearDown() + self._conn_patch.stop() + self._relation_patch.stop() + + def test_get_relation(self): + self.adapter.get_relation("db", "schema", "my_model") + self.mock_connection_manager.get_bq_table.assert_called_once_with( + "db", "schema", "my_model" + ) + + @patch.object(BigQueryAdapter, "check_schema_exists") + def test_drop_schema(self, mock_check_schema): + mock_check_schema.return_value = True + relation = BigQueryRelation.create(database="db", schema="schema") + self.adapter.drop_schema(relation) + 
self.mock_connection_manager.drop_dataset.assert_called_once_with("db", "schema") + + def test_get_columns_in_relation(self): + self.mock_connection_manager.get_bq_table.side_effect = ValueError + self.adapter.get_columns_in_relation( + MagicMock(database="db", schema="schema", identifier="ident"), + ) + self.mock_connection_manager.get_bq_table.assert_called_once_with( + database="db", schema="schema", identifier="ident" + ) + + +class TestBigQueryRelation(unittest.TestCase): + def setUp(self): + pass + + def test_view_temp_relation(self): + kwargs = { + "type": None, + "path": {"database": "test-project", "schema": "test_schema", "identifier": "my_view"}, + "quote_policy": {"identifier": False}, + } + BigQueryRelation.validate(kwargs) + + def test_view_relation(self): + kwargs = { + "type": "view", + "path": {"database": "test-project", "schema": "test_schema", "identifier": "my_view"}, + "quote_policy": {"identifier": True, "schema": True}, + } + BigQueryRelation.validate(kwargs) + + def test_table_relation(self): + kwargs = { + "type": "table", + "path": { + "database": "test-project", + "schema": "test_schema", + "identifier": "generic_table", + }, + "quote_policy": {"identifier": True, "schema": True}, + } + BigQueryRelation.validate(kwargs) + + def test_external_source_relation(self): + kwargs = { + "type": "external", + "path": {"database": "test-project", "schema": "test_schema", "identifier": "sheet"}, + "quote_policy": {"identifier": True, "schema": True}, + } + BigQueryRelation.validate(kwargs) + + def test_invalid_relation(self): + kwargs = { + "type": "invalid-type", + "path": { + "database": "test-project", + "schema": "test_schema", + "identifier": "my_invalid_id", + }, + "quote_policy": {"identifier": False, "schema": True}, + } + with self.assertRaises(dbt_common.dataclass_schema.ValidationError): + BigQueryRelation.validate(kwargs) + + +class TestBigQueryInformationSchema(unittest.TestCase): + def setUp(self): + pass + + def test_replace(self): + kwargs = { + "type": None, + "path": {"database": "test-project", "schema": "test_schema", "identifier": "my_view"}, + # test for #2188 + "quote_policy": {"database": False}, + "include_policy": { + "database": True, + "schema": True, + "identifier": True, + }, + } + BigQueryRelation.validate(kwargs) + relation = BigQueryRelation.from_dict(kwargs) + info_schema = relation.information_schema() + + tables_schema = info_schema.replace(information_schema_view="__TABLES__") + assert tables_schema.information_schema_view == "__TABLES__" + assert tables_schema.include_policy.schema is True + assert tables_schema.include_policy.identifier is False + assert tables_schema.include_policy.database is True + assert tables_schema.quote_policy.schema is True + assert tables_schema.quote_policy.identifier is False + assert tables_schema.quote_policy.database is False + + schemata_schema = info_schema.replace(information_schema_view="SCHEMATA") + assert schemata_schema.information_schema_view == "SCHEMATA" + assert schemata_schema.include_policy.schema is False + assert schemata_schema.include_policy.identifier is True + assert schemata_schema.include_policy.database is True + assert schemata_schema.quote_policy.schema is True + assert schemata_schema.quote_policy.identifier is False + assert schemata_schema.quote_policy.database is False + + other_schema = info_schema.replace(information_schema_view="SOMETHING_ELSE") + assert other_schema.information_schema_view == "SOMETHING_ELSE" + assert other_schema.include_policy.schema is True + assert 
other_schema.include_policy.identifier is True + assert other_schema.include_policy.database is True + assert other_schema.quote_policy.schema is True + assert other_schema.quote_policy.identifier is False + assert other_schema.quote_policy.database is False + + +class TestBigQueryAdapter(BaseTestBigQueryAdapter): + def test_copy_table_materialization_table(self): + adapter = self.get_adapter("oauth") + adapter.connections = MagicMock() + adapter.copy_table("source", "destination", "table") + adapter.connections.copy_bq_table.assert_called_once_with( + "source", "destination", dbt.adapters.bigquery.impl.WRITE_TRUNCATE + ) + + def test_copy_table_materialization_incremental(self): + adapter = self.get_adapter("oauth") + adapter.connections = MagicMock() + adapter.copy_table("source", "destination", "incremental") + adapter.connections.copy_bq_table.assert_called_once_with( + "source", "destination", dbt.adapters.bigquery.impl.WRITE_APPEND + ) + + def test_parse_partition_by(self): + adapter = self.get_adapter("oauth") + + with self.assertRaises(dbt_common.exceptions.base.DbtValidationError): + adapter.parse_partition_by("date(ts)") + + with self.assertRaises(dbt_common.exceptions.base.DbtValidationError): + adapter.parse_partition_by("ts") + + self.assertEqual( + adapter.parse_partition_by( + { + "field": "ts", + } + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "date", + "granularity": "day", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + { + "field": "ts", + "data_type": "date", + } + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "date", + "granularity": "day", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "date", "granularity": "MONTH"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "date", + "granularity": "month", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "date", "granularity": "YEAR"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "date", + "granularity": "year", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "timestamp", "granularity": "HOUR"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "timestamp", + "granularity": "hour", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "timestamp", "granularity": "MONTH"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "timestamp", + "granularity": "month", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "timestamp", "granularity": "YEAR"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "timestamp", + "granularity": "year", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "datetime", "granularity": "HOUR"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "datetime", + "granularity": "hour", + "time_ingestion_partitioning": False, + "copy_partitions": False, + 
}, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "datetime", "granularity": "MONTH"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "datetime", + "granularity": "month", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "data_type": "datetime", "granularity": "YEAR"} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "datetime", + "granularity": "year", + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + self.assertEqual( + adapter.parse_partition_by( + {"field": "ts", "time_ingestion_partitioning": True, "copy_partitions": True} + ).to_dict(omit_none=True), + { + "field": "ts", + "data_type": "date", + "granularity": "day", + "time_ingestion_partitioning": True, + "copy_partitions": True, + }, + ) + + # Invalid, should raise an error + with self.assertRaises(dbt_common.exceptions.base.DbtValidationError): + adapter.parse_partition_by({}) + + # passthrough + self.assertEqual( + adapter.parse_partition_by( + { + "field": "id", + "data_type": "int64", + "range": {"start": 1, "end": 100, "interval": 20}, + } + ).to_dict(omit_none=True), + { + "field": "id", + "data_type": "int64", + "granularity": "day", + "range": {"start": 1, "end": 100, "interval": 20}, + "time_ingestion_partitioning": False, + "copy_partitions": False, + }, + ) + + def test_hours_to_expiration(self): + adapter = self.get_adapter("oauth") + mock_config = create_autospec(RuntimeConfigObject) + config = {"hours_to_expiration": 4} + mock_config.get.side_effect = lambda name: config.get(name) + + expected = { + "expiration_timestamp": "TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 4 hour)", + } + actual = adapter.get_table_options(mock_config, node={}, temporary=False) + self.assertEqual(expected, actual) + + def test_hours_to_expiration_temporary(self): + adapter = self.get_adapter("oauth") + mock_config = create_autospec(RuntimeConfigObject) + config = {"hours_to_expiration": 4} + mock_config.get.side_effect = lambda name: config.get(name) + + expected = { + "expiration_timestamp": ("TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 12 hour)"), + } + actual = adapter.get_table_options(mock_config, node={}, temporary=True) + self.assertEqual(expected, actual) + + def test_table_kms_key_name(self): + adapter = self.get_adapter("oauth") + mock_config = create_autospec(RuntimeConfigObject) + config = {"kms_key_name": "some_key"} + mock_config.get.side_effect = lambda name: config.get(name) + + expected = {"kms_key_name": "'some_key'"} + actual = adapter.get_table_options(mock_config, node={}, temporary=False) + self.assertEqual(expected, actual) + + def test_view_kms_key_name(self): + adapter = self.get_adapter("oauth") + mock_config = create_autospec(RuntimeConfigObject) + config = {"kms_key_name": "some_key"} + mock_config.get.side_effect = lambda name: config.get(name) + + expected = {} + actual = adapter.get_view_options(mock_config, node={}) + self.assertEqual(expected, actual) + + +class TestBigQueryFilterCatalog(unittest.TestCase): + def test__catalog_filter_table(self): + used_schemas = [["a", "B"], ["a", "1234"]] + column_names = ["table_name", "table_database", "table_schema", "something"] + rows = [ + ["foo", "a", "b", "1234"], # include + ["foo", "a", "1234", "1234"], # include, w/ table schema as str + ["foo", "c", "B", "1234"], # skip + ["1234", "A", "B", "1234"], # include, w/ table name as str + ] + table = agate.Table(rows, 
column_names, agate_helper.DEFAULT_TYPE_TESTER) + + result = BigQueryAdapter._catalog_filter_table(table, used_schemas) + assert len(result) == 3 + for row in result.rows: + assert isinstance(row["table_schema"], str) + assert isinstance(row["table_database"], str) + assert isinstance(row["table_name"], str) + assert isinstance(row["something"], decimal.Decimal) + + +class TestBigQueryAdapterConversions(TestAdapterConversions): + def test_convert_text_type(self): + rows = [ + ["", "a1", "stringval1"], + ["", "a2", "stringvalasdfasdfasdfa"], + ["", "a3", "stringval3"], + ] + agate_table = self._make_table_of(rows, agate.Text) + expected = ["string", "string", "string"] + for col_idx, expect in enumerate(expected): + assert BigQueryAdapter.convert_text_type(agate_table, col_idx) == expect + + def test_convert_number_type(self): + rows = [ + ["", "23.98", "-1"], + ["", "12.78", "-2"], + ["", "79.41", "-3"], + ] + agate_table = self._make_table_of(rows, agate.Number) + expected = ["int64", "float64", "int64"] + for col_idx, expect in enumerate(expected): + assert BigQueryAdapter.convert_number_type(agate_table, col_idx) == expect + + def test_convert_boolean_type(self): + rows = [ + ["", "false", "true"], + ["", "false", "false"], + ["", "false", "true"], + ] + agate_table = self._make_table_of(rows, agate.Boolean) + expected = ["bool", "bool", "bool"] + for col_idx, expect in enumerate(expected): + assert BigQueryAdapter.convert_boolean_type(agate_table, col_idx) == expect + + def test_convert_datetime_type(self): + rows = [ + ["", "20190101T01:01:01Z", "2019-01-01 01:01:01"], + ["", "20190102T01:01:01Z", "2019-01-01 01:01:01"], + ["", "20190103T01:01:01Z", "2019-01-01 01:01:01"], + ] + agate_table = self._make_table_of( + rows, [agate.DateTime, agate_helper.ISODateTime, agate.DateTime] + ) + expected = ["datetime", "datetime", "datetime"] + for col_idx, expect in enumerate(expected): + assert BigQueryAdapter.convert_datetime_type(agate_table, col_idx) == expect + + def test_convert_date_type(self): + rows = [ + ["", "2019-01-01", "2019-01-04"], + ["", "2019-01-02", "2019-01-04"], + ["", "2019-01-03", "2019-01-04"], + ] + agate_table = self._make_table_of(rows, agate.Date) + expected = ["date", "date", "date"] + for col_idx, expect in enumerate(expected): + assert BigQueryAdapter.convert_date_type(agate_table, col_idx) == expect + + def test_convert_time_type(self): + # dbt's default type testers actually don't have a TimeDelta at all. 
+ agate.TimeDelta + rows = [ + ["", "120s", "10s"], + ["", "3m", "11s"], + ["", "1h", "12s"], + ] + agate_table = self._make_table_of(rows, agate.TimeDelta) + expected = ["time", "time", "time"] + for col_idx, expect in enumerate(expected): + assert BigQueryAdapter.convert_time_type(agate_table, col_idx) == expect + + # The casing in this case can't be enforced on the API side, + # so we have to validate that we have a case-insensitive comparison + def test_partitions_match(self): + table = Table.from_api_repr( + { + "tableReference": { + "projectId": "test-project", + "datasetId": "test_dataset", + "tableId": "test_table", + }, + "timePartitioning": {"type": "DAY", "field": "ts"}, + } + ) + partition_config = PartitionConfig.parse( + { + "field": "TS", + "data_type": "date", + "granularity": "day", + "time_ingestion_partitioning": False, + "copy_partitions": False, + } + ) + assert BigQueryAdapter._partitions_match(table, partition_config) is True + + +class TestBigQueryGrantAccessTo(BaseTestBigQueryAdapter): + entity = BigQueryRelation.from_dict( + { + "type": None, + "path": {"database": "test-project", "schema": "test_schema", "identifier": "my_view"}, + "quote_policy": {"identifier": False}, + } + ) + + def setUp(self): + super().setUp() + self.mock_dataset: MagicMock = MagicMock(name="GrantMockDataset") + self.mock_dataset.access_entries = [AccessEntry(None, "table", self.entity)] + self.mock_client: MagicMock = MagicMock(name="MockBQClient") + self.mock_client.get_dataset.return_value = self.mock_dataset + self.mock_connection = MagicMock(name="MockConn") + self.mock_connection.handle = self.mock_client + self.mock_connection_mgr = MagicMock( + name="GrantAccessMockMgr", + ) + self.mock_connection_mgr.get_thread_connection.return_value = self.mock_connection + _adapter = self.get_adapter("oauth") + _adapter.connections = self.mock_connection_mgr + self.adapter = _adapter + + def test_grant_access_to_calls_update_with_valid_access_entry(self): + a_different_entity = BigQueryRelation.from_dict( + { + "type": None, + "path": { + "database": "another-test-project", + "schema": "test_schema_2", + "identifier": "my_view", + }, + "quote_policy": {"identifier": True}, + } + ) + grant_target_dict = {"dataset": "someOtherDataset", "project": "someProject"} + self.adapter.grant_access_to( + entity=a_different_entity, + entity_type="view", + role=None, + grant_target_dict=grant_target_dict, + ) + self.mock_client.update_dataset.assert_called_once() + + +@pytest.mark.parametrize( + ["input", "output"], + [ + ("ABC", "abc"), + ("a c", "a_c"), + ("a ", "a"), + ], +) +def test_sanitize_label(input, output): + assert _sanitize_label(input) == output + + +@pytest.mark.parametrize( + "label_length", + [64, 65, 100], +) +def test_sanitize_label_length(label_length): + random_string = "".join( + random.choice(string.ascii_uppercase + string.digits) for i in range(label_length) + ) + assert len(_sanitize_label(random_string)) <= _VALIDATE_LABEL_LENGTH_LIMIT diff --git a/dbt-bigquery/tests/unit/test_bigquery_connection_manager.py b/dbt-bigquery/tests/unit/test_bigquery_connection_manager.py new file mode 100644 index 000000000..e7afd692f --- /dev/null +++ b/dbt-bigquery/tests/unit/test_bigquery_connection_manager.py @@ -0,0 +1,158 @@ +import json +import unittest +from requests.exceptions import ConnectionError +from unittest.mock import patch, MagicMock, Mock, ANY + +import dbt.adapters +import google.cloud.bigquery + +from dbt.adapters.bigquery import BigQueryCredentials +from dbt.adapters.bigquery 
import BigQueryRelation +from dbt.adapters.bigquery.connections import BigQueryConnectionManager +from dbt.adapters.bigquery.retry import RetryFactory + + +class TestBigQueryConnectionManager(unittest.TestCase): + def setUp(self): + self.credentials = Mock(BigQueryCredentials) + self.credentials.method = "oauth" + self.credentials.job_retries = 1 + self.credentials.job_retry_deadline_seconds = 1 + self.credentials.scopes = tuple() + + self.mock_client = Mock(google.cloud.bigquery.Client) + + self.mock_connection = MagicMock() + self.mock_connection.handle = self.mock_client + self.mock_connection.credentials = self.credentials + + self.connections = BigQueryConnectionManager( + profile=Mock(credentials=self.credentials, query_comment=None), + mp_context=Mock(), + ) + self.connections.get_thread_connection = lambda: self.mock_connection + + @patch( + "dbt.adapters.bigquery.retry.create_bigquery_client", + return_value=Mock(google.cloud.bigquery.Client), + ) + def test_retry_connection_reset(self, mock_client_factory): + new_mock_client = mock_client_factory.return_value + + @self.connections._retry.create_reopen_with_deadline(self.mock_connection) + def generate_connection_reset_error(): + raise ConnectionResetError + + assert self.mock_connection.handle is self.mock_client + + with self.assertRaises(ConnectionResetError): + # this will always raise the error, we just want to test that the connection was reopening in between + generate_connection_reset_error() + + assert self.mock_connection.handle is new_mock_client + assert new_mock_client is not self.mock_client + + def test_is_retryable(self): + _is_retryable = google.cloud.bigquery.retry._job_should_retry + exceptions = dbt.adapters.bigquery.impl.google.cloud.exceptions + internal_server_error = exceptions.InternalServerError("code broke") + bad_request_error = exceptions.BadRequest("code broke") + connection_error = ConnectionError("code broke") + client_error = exceptions.ClientError("bad code") + rate_limit_error = exceptions.Forbidden( + "code broke", errors=[{"reason": "rateLimitExceeded"}] + ) + service_unavailable_error = exceptions.ServiceUnavailable("service is unavailable") + + self.assertTrue(_is_retryable(internal_server_error)) + self.assertFalse( + _is_retryable(bad_request_error) + ) # this was removed after initially being included + self.assertTrue(_is_retryable(connection_error)) + self.assertFalse(_is_retryable(client_error)) + self.assertTrue(_is_retryable(rate_limit_error)) + self.assertTrue(_is_retryable(service_unavailable_error)) + + def test_drop_dataset(self): + mock_table = Mock() + mock_table.reference = "table1" + self.mock_client.list_tables.return_value = [mock_table] + + self.connections.drop_dataset("project", "dataset") + + self.mock_client.list_tables.assert_not_called() + self.mock_client.delete_table.assert_not_called() + self.mock_client.delete_dataset.assert_called_once() + + @patch("dbt.adapters.bigquery.connections.QueryJobConfig") + def test_query_and_results(self, MockQueryJobConfig): + self.connections._query_and_results( + self.mock_connection, + "sql", + {"dry_run": True}, + job_id=1, + ) + + MockQueryJobConfig.assert_called_once() + self.mock_client.query.assert_called_once_with( + query="sql", + job_config=MockQueryJobConfig(), + job_id=1, + timeout=self.credentials.job_creation_timeout_seconds, + ) + + def test_copy_bq_table_appends(self): + self._copy_table(write_disposition=dbt.adapters.bigquery.impl.WRITE_APPEND) + self.mock_client.copy_table.assert_called_once_with( + 
[self._table_ref("project", "dataset", "table1")], + self._table_ref("project", "dataset", "table2"), + job_config=ANY, + retry=ANY, + ) + args, kwargs = self.mock_client.copy_table.call_args + self.assertEqual( + kwargs["job_config"].write_disposition, dbt.adapters.bigquery.impl.WRITE_APPEND + ) + + def test_copy_bq_table_truncates(self): + self._copy_table(write_disposition=dbt.adapters.bigquery.impl.WRITE_TRUNCATE) + args, kwargs = self.mock_client.copy_table.call_args + self.mock_client.copy_table.assert_called_once_with( + [self._table_ref("project", "dataset", "table1")], + self._table_ref("project", "dataset", "table2"), + job_config=ANY, + retry=ANY, + ) + args, kwargs = self.mock_client.copy_table.call_args + self.assertEqual( + kwargs["job_config"].write_disposition, dbt.adapters.bigquery.impl.WRITE_TRUNCATE + ) + + def test_job_labels_valid_json(self): + expected = {"key": "value"} + labels = self.connections._labels_from_query_comment(json.dumps(expected)) + self.assertEqual(labels, expected) + + def test_job_labels_invalid_json(self): + labels = self.connections._labels_from_query_comment("not json") + self.assertEqual(labels, {"query_comment": "not_json"}) + + def test_list_dataset_correctly_calls_lists_datasets(self): + mock_dataset = Mock(dataset_id="d1") + mock_list_dataset = Mock(return_value=[mock_dataset]) + self.mock_client.list_datasets = mock_list_dataset + result = self.connections.list_dataset("project") + self.mock_client.list_datasets.assert_called_once_with( + project="project", max_results=10000, retry=ANY + ) + assert result == ["d1"] + + def _table_ref(self, proj, ds, table): + return self.connections.table_ref(proj, ds, table) + + def _copy_table(self, write_disposition): + source = BigQueryRelation.create(database="project", schema="dataset", identifier="table1") + destination = BigQueryRelation.create( + database="project", schema="dataset", identifier="table2" + ) + self.connections.copy_bq_table(source, destination, write_disposition) diff --git a/dbt-bigquery/tests/unit/test_column.py b/dbt-bigquery/tests/unit/test_column.py new file mode 100644 index 000000000..10f30594e --- /dev/null +++ b/dbt-bigquery/tests/unit/test_column.py @@ -0,0 +1,246 @@ +import pytest + +from dbt.adapters.bigquery.column import get_nested_column_data_types + + +@pytest.mark.parametrize( + ["columns", "constraints", "expected_nested_columns"], + [ + ({}, None, {}), + ({}, {"not_in_columns": "unique"}, {}), + # Flat column + ( + {"a": {"name": "a", "data_type": "string"}}, + None, + {"a": {"name": "a", "data_type": "string"}}, + ), + # Flat column - missing data_type + ( + {"a": {"name": "a"}}, + None, + {"a": {"name": "a", "data_type": None}}, + ), + # Flat column - with constraints + ( + {"a": {"name": "a", "data_type": "string"}}, + {"a": "not null"}, + {"a": {"name": "a", "data_type": "string not null"}}, + ), + # Flat column - with constraints + other keys + ( + {"a": {"name": "a", "data_type": "string", "quote": True}}, + {"a": "not null"}, + {"a": {"name": "a", "data_type": "string not null", "quote": True}}, + ), + # Single nested column, 1 level + ( + {"b.nested": {"name": "b.nested", "data_type": "string"}}, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - missing data_type + ( + {"b.nested": {"name": "b.nested"}}, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with constraints + ( + {"b.nested": {"name": "b.nested", "data_type": "string"}}, + {"b.nested": "not null"}, + 
{"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with constraints, missing data_type (constraints not valid without data_type) + ( + {"b.nested": {"name": "b.nested"}}, + {"b.nested": "not null"}, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with constraints + other keys + ( + {"b.nested": {"name": "b.nested", "data_type": "string", "other": "unpreserved"}}, + {"b.nested": "not null"}, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with corresponding parent column + ( + { + "b": {"name": "b", "data_type": "struct"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with corresponding parent column specified last + ( + { + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b": {"name": "b", "data_type": "struct"}, + }, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with corresponding parent column + parent constraint + ( + { + "b": {"name": "b", "data_type": "struct"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + {"b": "not null"}, + {"b": {"name": "b", "data_type": "struct not null"}}, + ), + # Single nested column, 1 level - with corresponding parent column as array + ( + { + "b": {"name": "b", "data_type": "array"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + None, + {"b": {"name": "b", "data_type": "array>"}}, + ), + # Single nested column, 1 level - with corresponding parent column as array + constraint + ( + { + "b": {"name": "b", "data_type": "array"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + {"b": "not null"}, + {"b": {"name": "b", "data_type": "array> not null"}}, + ), + # Multiple nested columns, 1 level + ( + { + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b.nested2": {"name": "b.nested2", "data_type": "int64"}, + }, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Multiple nested columns, 1 level - with constraints + ( + { + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b.nested2": {"name": "b.nested2", "data_type": "int64"}, + }, + {"b.nested": "not null"}, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Multiple nested columns, 1 level - with constraints + ( + { + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b.nested2": {"name": "b.nested2", "data_type": "int64"}, + }, + {"b.nested": "not null"}, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Mix of flat and nested columns, 1 level + ( + { + "a": {"name": "a", "data_type": "string"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b.nested2": {"name": "b.nested2", "data_type": "int64"}, + }, + None, + { + "b": {"name": "b", "data_type": "struct"}, + "a": {"name": "a", "data_type": "string"}, + }, + ), + # Nested columns, multiple levels + ( + { + "b.user.name.first": { + "name": "b.user.name.first", + "data_type": "string", + }, + "b.user.name.last": { + "name": "b.user.name.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country", "data_type": "string"}, + }, + None, + { + "b": { + "name": "b", + "data_type": "struct, id int64, country string>>", + }, + }, + ), + # Nested columns, multiple levels - missing data_type + ( + { + "b.user.name.first": { + "name": "b.user.name.first", + "data_type": 
"string", + }, + "b.user.name.last": { + "name": "b.user.name.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country"}, # missing data_type + }, + None, + { + "b": { + "name": "b", + "data_type": "struct, id int64, country>>", + }, + }, + ), + # Nested columns, multiple levels - with constraints! + ( + { + "b.user.name.first": { + "name": "b.user.name.first", + "data_type": "string", + }, + "b.user.name.last": { + "name": "b.user.name.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country", "data_type": "string"}, + }, + {"b.user.name.first": "not null", "b.user.id": "unique"}, + { + "b": { + "name": "b", + "data_type": "struct, id int64 unique, country string>>", + }, + }, + ), + # Nested columns, multiple levels - with parent arrays and constraints! + ( + { + "b.user.names": { + "name": "b.user.names", + "data_type": "array", + }, + "b.user.names.first": { + "name": "b.user.names.first", + "data_type": "string", + }, + "b.user.names.last": { + "name": "b.user.names.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country", "data_type": "string"}, + }, + {"b.user.names.first": "not null", "b.user.id": "unique"}, + { + "b": { + "name": "b", + "data_type": "struct>, id int64 unique, country string>>", + }, + }, + ), + ], +) +def test_get_nested_column_data_types(columns, constraints, expected_nested_columns): + actual_nested_columns = get_nested_column_data_types(columns, constraints) + assert expected_nested_columns == actual_nested_columns diff --git a/dbt-bigquery/tests/unit/test_configure_dataproc_batch.py b/dbt-bigquery/tests/unit/test_configure_dataproc_batch.py new file mode 100644 index 000000000..6e5757589 --- /dev/null +++ b/dbt-bigquery/tests/unit/test_configure_dataproc_batch.py @@ -0,0 +1,78 @@ +from unittest.mock import patch + +from dbt.adapters.bigquery.python_submissions import _update_batch_from_config +from google.cloud import dataproc_v1 + +from .test_bigquery_adapter import BaseTestBigQueryAdapter + + +# Test application of dataproc_batch configuration to a +# google.cloud.dataproc_v1.Batch object. 
+# This reuses the machinery from BaseTestBigQueryAdapter to get hold of the +# parsed credentials +class TestConfigureDataprocBatch(BaseTestBigQueryAdapter): + @patch( + "dbt.adapters.bigquery.credentials._create_bigquery_defaults", + return_value=("credentials", "project_id"), + ) + def test_update_dataproc_serverless_batch(self, mock_get_bigquery_defaults): + adapter = self.get_adapter("dataproc-serverless-configured") + mock_get_bigquery_defaults.assert_called_once() + + credentials = adapter.acquire_connection("dummy").credentials + self.assertIsNotNone(credentials) + + batchConfig = credentials.dataproc_batch + self.assertIsNotNone(batchConfig) + + raw_batch_config = self.raw_profile["outputs"]["dataproc-serverless-configured"][ + "dataproc_batch" + ] + raw_environment_config = raw_batch_config["environment_config"] + raw_execution_config = raw_environment_config["execution_config"] + raw_labels: dict[str, any] = raw_batch_config["labels"] + raw_rt_config = raw_batch_config["runtime_config"] + + raw_batch_config = self.raw_profile["outputs"]["dataproc-serverless-configured"][ + "dataproc_batch" + ] + + batch = dataproc_v1.Batch() + + batch = _update_batch_from_config(raw_batch_config, batch) + + def to_str_values(d): + """google's protobuf types expose maps as dict[str, str]""" + return dict([(k, str(v)) for (k, v) in d.items()]) + + self.assertEqual( + batch.environment_config.execution_config.service_account, + raw_execution_config["service_account"], + ) + self.assertFalse(batch.environment_config.execution_config.network_uri) + self.assertEqual( + batch.environment_config.execution_config.subnetwork_uri, + raw_execution_config["subnetwork_uri"], + ) + self.assertEqual( + batch.environment_config.execution_config.network_tags, + raw_execution_config["network_tags"], + ) + self.assertEqual(batch.labels, to_str_values(raw_labels)) + self.assertEqual( + batch.runtime_config.properties, to_str_values(raw_rt_config["properties"]) + ) + + @patch( + "dbt.adapters.bigquery.credentials._create_bigquery_defaults", + return_value=("credentials", "project_id"), + ) + def test_default_dataproc_serverless_batch(self, mock_get_bigquery_defaults): + adapter = self.get_adapter("dataproc-serverless-default") + mock_get_bigquery_defaults.assert_called_once() + + credentials = adapter.acquire_connection("dummy").credentials + self.assertIsNotNone(credentials) + + batchConfig = credentials.dataproc_batch + self.assertIsNone(batchConfig) diff --git a/dbt-bigquery/tests/unit/test_dataset.py b/dbt-bigquery/tests/unit/test_dataset.py new file mode 100644 index 000000000..adb3964c6 --- /dev/null +++ b/dbt-bigquery/tests/unit/test_dataset.py @@ -0,0 +1,90 @@ +from dbt.adapters.bigquery.dataset import add_access_entry_to_dataset, is_access_entry_in_dataset +from dbt.adapters.bigquery import BigQueryRelation + +from google.cloud.bigquery import Dataset, AccessEntry, DatasetReference + + +def test_add_access_entry_to_dataset_updates_dataset(): + database = "someDb" + dataset = "someDataset" + entity = BigQueryRelation.from_dict( + { + "type": None, + "path": { + "database": "test-project", + "schema": "test_schema", + "identifier": "my_table", + }, + "quote_policy": {"identifier": False}, + } + ).to_dict() + dataset_ref = DatasetReference(project=database, dataset_id=dataset) + dataset = Dataset(dataset_ref) + access_entry = AccessEntry(None, "table", entity) + dataset = add_access_entry_to_dataset(dataset, access_entry) + assert access_entry in dataset.access_entries + + +def 
test_add_access_entry_to_dataset_updates_with_pre_existing_entries(): + database = "someOtherDb" + dataset = "someOtherDataset" + entity_2 = BigQueryRelation.from_dict( + { + "type": None, + "path": { + "database": "test-project", + "schema": "test_schema", + "identifier": "some_other_view", + }, + "quote_policy": {"identifier": False}, + } + ).to_dict() + dataset_ref = DatasetReference(project=database, dataset_id=dataset) + dataset = Dataset(dataset_ref) + initial_entry = AccessEntry(None, "view", entity_2) + initial_entry._properties.pop("role") + dataset.access_entries = [initial_entry] + access_entry = AccessEntry(None, "view", entity_2) + dataset = add_access_entry_to_dataset(dataset, access_entry) + assert len(dataset.access_entries) == 2 + + +def test_is_access_entry_in_dataset_returns_true_if_entry_in_dataset(): + database = "someDb" + dataset = "someDataset" + entity = BigQueryRelation.from_dict( + { + "type": None, + "path": { + "database": "test-project", + "schema": "test_schema", + "identifier": "my_table", + }, + "quote_policy": {"identifier": False}, + } + ).to_dict() + dataset_ref = DatasetReference(project=database, dataset_id=dataset) + dataset = Dataset(dataset_ref) + access_entry = AccessEntry(None, "table", entity) + dataset = add_access_entry_to_dataset(dataset, access_entry) + assert is_access_entry_in_dataset(dataset, access_entry) + + +def test_is_access_entry_in_dataset_returns_false_if_entry_not_in_dataset(): + database = "someDb" + dataset = "someDataset" + entity = BigQueryRelation.from_dict( + { + "type": None, + "path": { + "database": "test-project", + "schema": "test_schema", + "identifier": "my_table", + }, + "quote_policy": {"identifier": False}, + } + ).to_dict() + dataset_ref = DatasetReference(project=database, dataset_id=dataset) + dataset = Dataset(dataset_ref) + access_entry = AccessEntry(None, "table", entity) + assert not is_access_entry_in_dataset(dataset, access_entry) diff --git a/dbt-bigquery/tests/unit/test_renamed_relations.py b/dbt-bigquery/tests/unit/test_renamed_relations.py new file mode 100644 index 000000000..8e787e6a3 --- /dev/null +++ b/dbt-bigquery/tests/unit/test_renamed_relations.py @@ -0,0 +1,16 @@ +from dbt.adapters.bigquery.relation import BigQueryRelation +from dbt.adapters.contracts.relation import RelationType + + +def test_renameable_relation(): + relation = BigQueryRelation.create( + database="my_db", + schema="my_schema", + identifier="my_table", + type=RelationType.Table, + ) + assert relation.renameable_relations == frozenset( + { + RelationType.Table, + } + ) diff --git a/dbt-bigquery/tests/unit/utils.py b/dbt-bigquery/tests/unit/utils.py new file mode 100644 index 000000000..633b6d565 --- /dev/null +++ b/dbt-bigquery/tests/unit/utils.py @@ -0,0 +1,384 @@ +"""Unit test utility functions. +Note that all imports should be inside the functions to avoid import/mocking +issues. +""" + +import string +import os +from unittest import mock +from unittest import TestCase + +import agate +import pytest + +from dbt_common.dataclass_schema import ValidationError +from dbt.config.project import PartialProject + + +def normalize(path): + """On windows, neither is enough on its own: + >>> normcase('C:\\documents/ALL CAPS/subdir\\..') + 'c:\\documents\\all caps\\subdir\\..' 
+ >>> normpath('C:\\documents/ALL CAPS/subdir\\..') + 'C:\\documents\\ALL CAPS' + >>> normpath(normcase('C:\\documents/ALL CAPS/subdir\\..')) + 'c:\\documents\\all caps' + """ + return os.path.normcase(os.path.normpath(path)) + + +class Obj: + which = "blah" + single_threaded = False + + +def mock_connection(name, state="open"): + conn = mock.MagicMock() + conn.name = name + conn.state = state + return conn + + +def profile_from_dict(profile, profile_name, cli_vars="{}"): + from dbt.config import Profile + from dbt.config.renderer import ProfileRenderer + from dbt.config.utils import parse_cli_vars + + if not isinstance(cli_vars, dict): + cli_vars = parse_cli_vars(cli_vars) + + renderer = ProfileRenderer(cli_vars) + + # in order to call dbt's internal profile rendering, we need to set the + # flags global. This is a bit of a hack, but it's the best way to do it. + from dbt.flags import set_from_args + from argparse import Namespace + + set_from_args(Namespace(), None) + return Profile.from_raw_profile_info( + profile, + profile_name, + renderer, + ) + + +def project_from_dict(project, profile, packages=None, selectors=None, cli_vars="{}"): + from dbt.config.renderer import DbtProjectYamlRenderer + from dbt.config.utils import parse_cli_vars + + if not isinstance(cli_vars, dict): + cli_vars = parse_cli_vars(cli_vars) + + renderer = DbtProjectYamlRenderer(profile, cli_vars) + + project_root = project.pop("project-root", os.getcwd()) + + partial = PartialProject.from_dicts( + project_root=project_root, + project_dict=project, + packages_dict=packages, + selectors_dict=selectors, + ) + return partial.render(renderer) + + +def config_from_parts_or_dicts(project, profile, packages=None, selectors=None, cli_vars="{}"): + from dbt.config import Project, Profile, RuntimeConfig + from copy import deepcopy + from dbt.config.utils import parse_cli_vars + + if not isinstance(cli_vars, dict): + cli_vars = parse_cli_vars(cli_vars) + + if isinstance(project, Project): + profile_name = project.profile_name + else: + profile_name = project.get("profile") + + if not isinstance(profile, Profile): + profile = profile_from_dict( + deepcopy(profile), + profile_name, + cli_vars, + ) + + if not isinstance(project, Project): + project = project_from_dict( + deepcopy(project), + profile, + packages, + selectors, + cli_vars, + ) + + args = Obj() + args.vars = cli_vars + args.profile_dir = "/dev/null" + return RuntimeConfig.from_parts(project=project, profile=profile, args=args) + + +def inject_plugin(plugin): + from dbt.adapters.factory import FACTORY + + key = plugin.adapter.type() + FACTORY.plugins[key] = plugin + + +def inject_plugin_for(config): + from dbt.adapters.factory import FACTORY + + FACTORY.load_plugin(config.credentials.type) + adapter = FACTORY.get_adapter(config) + return adapter + + +def inject_adapter(value): + """Inject the given adapter into the adapter factory, so your hand-crafted + artisanal adapter will be available from get_adapter() as if dbt loaded it. 
+ """ + from dbt.adapters.factory import FACTORY + + key = value.type() + FACTORY.adapters[key] = value + + +def clear_plugin(plugin): + from dbt.adapters.factory import FACTORY + + key = plugin.adapter.type() + FACTORY.plugins.pop(key, None) + FACTORY.adapters.pop(key, None) + + +class ContractTestCase(TestCase): + ContractType = None + + def setUp(self): + self.maxDiff = None + super().setUp() + + def assert_to_dict(self, obj, dct): + self.assertEqual(obj.to_dict(omit_none=True), dct) + + def assert_from_dict(self, obj, dct, cls=None): + if cls is None: + cls = self.ContractType + cls.validate(dct) + self.assertEqual(cls.from_dict(dct), obj) + + def assert_symmetric(self, obj, dct, cls=None): + self.assert_to_dict(obj, dct) + self.assert_from_dict(obj, dct, cls) + + def assert_fails_validation(self, dct, cls=None): + if cls is None: + cls = self.ContractType + + with self.assertRaises(ValidationError): + cls.validate(dct) + cls.from_dict(dct) + + +def compare_dicts(dict1, dict2): + first_set = set(dict1.keys()) + second_set = set(dict2.keys()) + print(f"--- Difference between first and second keys: {first_set.difference(second_set)}") + print(f"--- Difference between second and first keys: {second_set.difference(first_set)}") + common_keys = set(first_set).intersection(set(second_set)) + found_differences = False + for key in common_keys: + if dict1[key] != dict2[key]: + print(f"--- --- first dict: {key}: {str(dict1[key])}") + print(f"--- --- second dict: {key}: {str(dict2[key])}") + found_differences = True + if found_differences: + print("--- Found differences in dictionaries") + else: + print("--- Found no differences in dictionaries") + + +def assert_from_dict(obj, dct, cls=None): + if cls is None: + cls = obj.__class__ + cls.validate(dct) + obj_from_dict = cls.from_dict(dct) + if hasattr(obj, "created_at"): + obj_from_dict.created_at = 1 + obj.created_at = 1 + assert obj_from_dict == obj + + +def assert_to_dict(obj, dct): + obj_to_dict = obj.to_dict(omit_none=True) + if "created_at" in obj_to_dict: + obj_to_dict["created_at"] = 1 + if "created_at" in dct: + dct["created_at"] = 1 + assert obj_to_dict == dct + + +def assert_symmetric(obj, dct, cls=None): + assert_to_dict(obj, dct) + assert_from_dict(obj, dct, cls) + + +def assert_fails_validation(dct, cls): + with pytest.raises(ValidationError): + cls.validate(dct) + cls.from_dict(dct) + + +def generate_name_macros(package): + from dbt.contracts.graph.nodes import Macro + from dbt.node_types import NodeType + + name_sql = {} + for component in ("database", "schema", "alias"): + if component == "alias": + source = "node.name" + else: + source = f"target.{component}" + name = f"generate_{component}_name" + sql = f"{{% macro {name}(value, node) %}} {{% if value %}} {{{{ value }}}} {{% else %}} {{{{ {source} }}}} {{% endif %}} {{% endmacro %}}" + name_sql[name] = sql + + for name, sql in name_sql.items(): + pm = Macro( + name=name, + resource_type=NodeType.Macro, + unique_id=f"macro.{package}.{name}", + package_name=package, + original_file_path=normalize("macros/macro.sql"), + path=normalize("macros/macro.sql"), + macro_sql=sql, + ) + yield pm + + +class TestAdapterConversions(TestCase): + def _get_tester_for(self, column_type): + from dbt_common.clients import agate_helper + + if column_type is agate.TimeDelta: # dbt never makes this! 
+ return agate.TimeDelta() + + for instance in agate_helper.DEFAULT_TYPE_TESTER._possible_types: + if isinstance(instance, column_type): # include child types + return instance + + raise ValueError(f"no tester for {column_type}") + + def _make_table_of(self, rows, column_types): + column_names = list(string.ascii_letters[: len(rows[0])]) + if isinstance(column_types, type): + column_types = [self._get_tester_for(column_types) for _ in column_names] + else: + column_types = [self._get_tester_for(typ) for typ in column_types] + table = agate.Table(rows, column_names=column_names, column_types=column_types) + return table + + +def MockMacro(package, name="my_macro", **kwargs): + from dbt.contracts.graph.nodes import Macro + from dbt.node_types import NodeType + + mock_kwargs = dict( + resource_type=NodeType.Macro, + package_name=package, + unique_id=f"macro.{package}.{name}", + original_file_path="/dev/null", + ) + + mock_kwargs.update(kwargs) + + macro = mock.MagicMock(spec=Macro, **mock_kwargs) + macro.name = name + return macro + + +def MockMaterialization(package, name="my_materialization", adapter_type=None, **kwargs): + if adapter_type is None: + adapter_type = "default" + kwargs["adapter_type"] = adapter_type + return MockMacro(package, f"materialization_{name}_{adapter_type}", **kwargs) + + +def MockGenerateMacro(package, component="some_component", **kwargs): + name = f"generate_{component}_name" + return MockMacro(package, name=name, **kwargs) + + +def MockSource(package, source_name, name, **kwargs): + from dbt.node_types import NodeType + from dbt.contracts.graph.nodes import SourceDefinition + + src = mock.MagicMock( + __class__=SourceDefinition, + resource_type=NodeType.Source, + source_name=source_name, + package_name=package, + unique_id=f"source.{package}.{source_name}.{name}", + search_name=f"{source_name}.{name}", + **kwargs, + ) + src.name = name + return src + + +def MockNode(package, name, resource_type=None, **kwargs): + from dbt.node_types import NodeType + from dbt.contracts.graph.nodes import ModelNode, SeedNode + + if resource_type is None: + resource_type = NodeType.Model + if resource_type == NodeType.Model: + cls = ModelNode + elif resource_type == NodeType.Seed: + cls = SeedNode + else: + raise ValueError(f"I do not know how to handle {resource_type}") + node = mock.MagicMock( + __class__=cls, + resource_type=resource_type, + package_name=package, + unique_id=f"{str(resource_type)}.{package}.{name}", + search_name=name, + **kwargs, + ) + node.name = name + return node + + +def MockDocumentation(package, name, **kwargs): + from dbt.node_types import NodeType + from dbt.contracts.graph.nodes import Documentation + + doc = mock.MagicMock( + __class__=Documentation, + resource_type=NodeType.Documentation, + package_name=package, + search_name=name, + unique_id=f"{package}.{name}", + **kwargs, + ) + doc.name = name + return doc + + +def load_internal_manifest_macros(config, macro_hook=lambda m: None): + from dbt.parser.manifest import ManifestLoader + + return ManifestLoader.load_macros(config, macro_hook) + + +def dict_replace(dct, **kwargs): + dct = dct.copy() + dct.update(kwargs) + return dct + + +def replace_config(n, **kwargs): + return n.replace( + config=n.config.replace(**kwargs), + unrendered_config=dict_replace(n.unrendered_config, **kwargs), + ) diff --git a/dbt-bigquery/third-party-stubs/agate/__init__.pyi b/dbt-bigquery/third-party-stubs/agate/__init__.pyi new file mode 100644 index 000000000..c773cc7d7 --- /dev/null +++ 
b/dbt-bigquery/third-party-stubs/agate/__init__.pyi @@ -0,0 +1,89 @@ +from collections.abc import Sequence + +from typing import Any, Optional, Callable, Iterable, Dict, Union + +from . import data_types as data_types +from .data_types import ( + Text as Text, + Number as Number, + Boolean as Boolean, + DateTime as DateTime, + Date as Date, + TimeDelta as TimeDelta, +) + +class MappedSequence(Sequence): + def __init__(self, values: Any, keys: Optional[Any] = ...) -> None: ... + def __unicode__(self): ... + def __getitem__(self, key: Any): ... + def __setitem__(self, key: Any, value: Any) -> None: ... + def __iter__(self): ... + def __len__(self): ... + def __eq__(self, other: Any): ... + def __ne__(self, other: Any): ... + def __contains__(self, value: Any): ... + def keys(self): ... + def values(self): ... + def items(self): ... + def get(self, key: Any, default: Optional[Any] = ...): ... + def dict(self): ... + +class Row(MappedSequence): ... + +class Table: + def __init__( + self, + rows: Any, + column_names: Optional[Any] = ..., + column_types: Optional[Any] = ..., + row_names: Optional[Any] = ..., + _is_fork: bool = ..., + ) -> None: ... + def __len__(self): ... + def __iter__(self): ... + def __getitem__(self, key: Any): ... + @property + def column_types(self): ... + @property + def column_names(self): ... + @property + def row_names(self): ... + @property + def columns(self): ... + @property + def rows(self): ... + def print_csv(self, **kwargs: Any) -> None: ... + def print_json(self, **kwargs: Any) -> None: ... + def where(self, test: Callable[[Row], bool]) -> "Table": ... + def select(self, key: Union[Iterable[str], str]) -> "Table": ... + # these definitions are much narrower than what's actually accepted + @classmethod + def from_object( + cls, obj: Iterable[Dict[str, Any]], *, column_types: Optional["TypeTester"] = None + ) -> "Table": ... + @classmethod + def from_csv( + cls, path: Iterable[str], *, column_types: Optional["TypeTester"] = None + ) -> "Table": ... + @classmethod + def merge(cls, tables: Iterable["Table"]) -> "Table": ... + def rename( + self, + column_names: Optional[Iterable[str]] = None, + row_names: Optional[Any] = None, + slug_columns: bool = False, + slug_rows: bool = False, + **kwargs: Any, + ) -> "Table": ... + +class TypeTester: + def __init__( + self, force: Any = ..., limit: Optional[Any] = ..., types: Optional[Any] = ... + ) -> None: ... + def run(self, rows: Any, column_names: Any): ... + +class MaxPrecision: + def __init__(self, column_name: Any) -> None: ... + +# this is not strictly true, but it's all we care about. +def aggregate(self, aggregations: MaxPrecision) -> int: ... diff --git a/dbt-bigquery/third-party-stubs/agate/data_types.pyi b/dbt-bigquery/third-party-stubs/agate/data_types.pyi new file mode 100644 index 000000000..8114f7b55 --- /dev/null +++ b/dbt-bigquery/third-party-stubs/agate/data_types.pyi @@ -0,0 +1,71 @@ +from typing import Any, Optional + +DEFAULT_NULL_VALUES: Any + +class DataType: + null_values: Any = ... + def __init__(self, null_values: Any = ...) -> None: ... + def test(self, d: Any): ... + def cast(self, d: Any) -> None: ... + def csvify(self, d: Any): ... + def jsonify(self, d: Any): ... + +DEFAULT_TRUE_VALUES: Any +DEFAULT_FALSE_VALUES: Any + +class Boolean(DataType): + true_values: Any = ... + false_values: Any = ... + def __init__( + self, true_values: Any = ..., false_values: Any = ..., null_values: Any = ... + ) -> None: ... + def cast(self, d: Any): ... + def jsonify(self, d: Any): ... 
+ +ZERO_DT: Any + +class Date(DataType): + date_format: Any = ... + parser: Any = ... + def __init__(self, date_format: Optional[Any] = ..., **kwargs: Any) -> None: ... + def cast(self, d: Any): ... + def csvify(self, d: Any): ... + def jsonify(self, d: Any): ... + +class DateTime(DataType): + datetime_format: Any = ... + timezone: Any = ... + def __init__( + self, datetime_format: Optional[Any] = ..., timezone: Optional[Any] = ..., **kwargs: Any + ) -> None: ... + def cast(self, d: Any): ... + def csvify(self, d: Any): ... + def jsonify(self, d: Any): ... + +DEFAULT_CURRENCY_SYMBOLS: Any +POSITIVE: Any +NEGATIVE: Any + +class Number(DataType): + locale: Any = ... + currency_symbols: Any = ... + group_symbol: Any = ... + decimal_symbol: Any = ... + def __init__( + self, + locale: str = ..., + group_symbol: Optional[Any] = ..., + decimal_symbol: Optional[Any] = ..., + currency_symbols: Any = ..., + **kwargs: Any, + ) -> None: ... + def cast(self, d: Any): ... + def jsonify(self, d: Any): ... + +class TimeDelta(DataType): + def cast(self, d: Any): ... + +class Text(DataType): + cast_nulls: Any = ... + def __init__(self, cast_nulls: bool = ..., **kwargs: Any) -> None: ... + def cast(self, d: Any): ...
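For a quick sense of what the new test_column.py cases exercise, here is a small
usage sketch of get_nested_column_data_types. The inputs and the expected struct
type mirror the parametrized cases above; treat it as illustrative rather than as
documentation of the function's full contract.

    from dbt.adapters.bigquery.column import get_nested_column_data_types

    # Dotted column names describe fields nested under a parent column.
    columns = {
        "a": {"name": "a", "data_type": "string"},
        "b.nested": {"name": "b.nested", "data_type": "string"},
        "b.nested2": {"name": "b.nested2", "data_type": "int64"},
    }
    constraints = {"b.nested": "not null"}

    nested = get_nested_column_data_types(columns, constraints)
    assert nested["a"] == {"name": "a", "data_type": "string"}
    assert nested["b"]["data_type"] == "struct<nested string not null, nested2 int64>"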