From 03ba78be0cbc876ec2c91f1303d4018dcbdac922 Mon Sep 17 00:00:00 2001 From: matt garber Date: Wed, 15 May 2024 16:12:11 -0400 Subject: [PATCH] Replaced wildcard match with regex (#237) * Replaced wildcard match with regex * Better regex, wrapper function for like --- cumulus_library/databases.py | 15 ++++++++ .../core/core_templates/medication.sql.jinja | 5 +-- .../core/reference_sql/builder_condition.sql | 18 +++++----- .../builder_documentreference.sql | 2 +- .../core/reference_sql/builder_encounter.sql | 30 ++++++++-------- .../core/reference_sql/builder_medication.sql | 6 ++-- .../codeable_concept_denormalize.sql.jinja | 2 +- .../template_sql/coding_denormalize.sql.jinja | 2 +- .../shared_macros/syntax.sql.jinja | 30 +++++++++++++++- .../template_sql/show_tables.sql.jinja | 3 +- .../template_sql/show_views.sql.jinja | 3 +- tests/test_base_templates.py | 35 +++++++++++++++++-- 12 files changed, 114 insertions(+), 37 deletions(-) diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index b27ef003..717dcda7 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -13,6 +13,7 @@ import json import os import pathlib +import re import sys from pathlib import Path from typing import Any, Protocol @@ -355,6 +356,13 @@ def __init__(self, db_file: str): None, duckdb.typing.VARCHAR, ) + self.connection.create_function( + # DuckDB's version is regexp_matches. + "regexp_like", + self._compat_regexp_like, + None, + duckdb.typing.BOOLEAN, + ) self.connection.create_function( # We frequently use Athena's date() function because it's easier than # the more widely-supported way of CAST(x AS DATE). 
@@ -407,6 +415,13 @@ def _compat_array_join( return None return delimiter.join(v for v in value if v is not None) + @staticmethod + def _compat_regexp_like(string: str | None, pattern: str | None) -> bool: + if string is None or pattern is None: + return None + match = re.search(pattern, string) + return match is not None + @staticmethod def _compat_date( value: str | datetime.datetime | datetime.date | None, diff --git a/cumulus_library/studies/core/core_templates/medication.sql.jinja b/cumulus_library/studies/core/core_templates/medication.sql.jinja index 8502313f..8f4362d0 100644 --- a/cumulus_library/studies/core/core_templates/medication.sql.jinja +++ b/cumulus_library/studies/core/core_templates/medication.sql.jinja @@ -1,4 +1,5 @@ {% import 'core_utils.jinja' as utils -%} +{% import 'syntax.sql.jinja' as syntax -%} CREATE TABLE core__medication AS ( WITH @@ -32,7 +33,7 @@ CREATE TABLE core__medication AS ( mr.id, substring(mr.med_ref, 2) AS medication_id FROM mr_basics AS mr - WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE '#%' + WHERE mr.med_ref IS NOT NULL AND {{ syntax.like('mr.med_ref', '#%') }} ), external_refs AS ( @@ -40,7 +41,7 @@ CREATE TABLE core__medication AS ( mr.id, substring(mr.med_ref, 12) AS medication_id FROM mr_basics AS mr - WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE 'Medication/%' + WHERE mr.med_ref IS NOT NULL AND {{ syntax.like('mr.med_ref', 'Medication/%') }} ) {# Internal: medication data from inline ETL extraction. 
diff --git a/cumulus_library/studies/core/reference_sql/builder_condition.sql b/cumulus_library/studies/core/reference_sql/builder_condition.sql index 17a9592a..3dbd4f51 100644 --- a/cumulus_library/studies/core/reference_sql/builder_condition.sql +++ b/cumulus_library/studies/core/reference_sql/builder_condition.sql @@ -72,7 +72,7 @@ CREATE TABLE core__condition_dn_clinical_status AS ( condition AS s, UNNEST(s.clinicalStatus.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/condition-clinical' + REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/condition-clinical') ), --noqa: LT07 union_table AS ( @@ -137,7 +137,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( condition AS s, UNNEST(s.code.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://snomed.info/sct' + REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct') ), --noqa: LT07 system_code_1 AS ( @@ -153,7 +153,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( condition AS s, UNNEST(s.code.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-10-cm' + REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-10-cm') ), --noqa: LT07 system_code_2 AS ( @@ -169,7 +169,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( condition AS s, UNNEST(s.code.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm' + REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm') ), --noqa: LT07 system_code_3 AS ( @@ -185,7 +185,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( condition AS s, UNNEST(s.code.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm/diagnosis' + REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm/diagnosis') ), --noqa: LT07 system_code_4 AS ( @@ -201,7 +201,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( condition AS s, UNNEST(s.code.coding) AS u (coding) WHERE - 
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.728286' + REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.728286') ), --noqa: LT07 system_code_5 AS ( @@ -217,7 +217,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( condition AS s, UNNEST(s.code.coding) AS u (coding) WHERE - u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.4.698084.10375' + REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.4\.698084\.10375') ), --noqa: LT07 system_code_6 AS ( @@ -233,7 +233,7 @@ CREATE TABLE core__condition_codable_concepts_display AS ( condition AS s, UNNEST(s.code.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/data-absent-reason' + REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/data-absent-reason') ), --noqa: LT07 union_table AS ( @@ -397,7 +397,7 @@ CREATE TABLE core__condition_dn_verification_status AS ( condition AS s, UNNEST(s.verificationStatus.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/condition-ver-status' + REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/condition-ver-status') ), --noqa: LT07 union_table AS ( diff --git a/cumulus_library/studies/core/reference_sql/builder_documentreference.sql b/cumulus_library/studies/core/reference_sql/builder_documentreference.sql index 8f46aeeb..d80153a7 100644 --- a/cumulus_library/studies/core/reference_sql/builder_documentreference.sql +++ b/cumulus_library/studies/core/reference_sql/builder_documentreference.sql @@ -71,7 +71,7 @@ CREATE TABLE core__documentreference_dn_category AS ( flattened_rows AS s, UNNEST(s.category.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://hl7.org/fhir/us/core/ValueSet/us-core-documentreference-category' + REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/us/core/ValueSet/us-core-documentreference-category') ), --noqa: LT07 union_table AS ( diff --git 
a/cumulus_library/studies/core/reference_sql/builder_encounter.sql b/cumulus_library/studies/core/reference_sql/builder_encounter.sql index 0bbfac27..93b279b6 100644 --- a/cumulus_library/studies/core/reference_sql/builder_encounter.sql +++ b/cumulus_library/studies/core/reference_sql/builder_encounter.sql @@ -32,7 +32,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/encounter-type' + REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/encounter-type') ), --noqa: LT07 system_type_1 AS ( @@ -48,7 +48,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/v2-0004' + REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/v2-0004') ), --noqa: LT07 system_type_2 AS ( @@ -64,7 +64,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'urn:oid:2.16.840.1.113883.4.642.3.248' + REGEXP_LIKE(u.coding.system, 'urn:oid:2\.16\.840\.1\.113883\.4\.642\.3\.248') ), --noqa: LT07 system_type_3 AS ( @@ -80,7 +80,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://snomed.info/sct' + REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct') ), --noqa: LT07 system_type_4 AS ( @@ -96,7 +96,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'https://fhir.cerner.com/%/codeSet/71' + REGEXP_LIKE(u.coding.system, 'https://fhir\.cerner\.com/.*/codeSet/71') ), --noqa: LT07 system_type_5 AS ( @@ -112,7 +112,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 
'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.10110' + REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.10110') ), --noqa: LT07 system_type_6 AS ( @@ -128,7 +128,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.18875' + REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.18875') ), --noqa: LT07 system_type_7 AS ( @@ -144,7 +144,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.30' + REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.30') ), --noqa: LT07 system_type_8 AS ( @@ -160,7 +160,7 @@ CREATE TABLE core__encounter_dn_type AS ( flattened_rows AS s, UNNEST(s.type.coding) AS u (coding) WHERE - u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.808267' + REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.808267') ), --noqa: LT07 union_table AS ( @@ -340,7 +340,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( flattened_rows AS s, UNNEST(s.reasoncode.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/v3-ActPriority' + REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/v3-ActPriority') ), --noqa: LT07 system_reasoncode_1 AS ( @@ -356,7 +356,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( flattened_rows AS s, UNNEST(s.reasoncode.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://snomed.info/sct' + REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct') ), --noqa: LT07 system_reasoncode_2 AS ( @@ -372,7 +372,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( flattened_rows AS s, UNNEST(s.reasoncode.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-10-cm' + 
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-10-cm') ), --noqa: LT07 system_reasoncode_3 AS ( @@ -388,7 +388,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( flattened_rows AS s, UNNEST(s.reasoncode.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm' + REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm') ), --noqa: LT07 system_reasoncode_4 AS ( @@ -404,7 +404,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( flattened_rows AS s, UNNEST(s.reasoncode.coding) AS u (coding) WHERE - u.coding.system LIKE 'https://fhir.cerner.com/%/nomenclature' + REGEXP_LIKE(u.coding.system, 'https://fhir\.cerner\.com/.*/nomenclature') ), --noqa: LT07 system_reasoncode_5 AS ( @@ -420,7 +420,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS ( flattened_rows AS s, UNNEST(s.reasoncode.coding) AS u (coding) WHERE - u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.728286' + REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.728286') ), --noqa: LT07 union_table AS ( diff --git a/cumulus_library/studies/core/reference_sql/builder_medication.sql b/cumulus_library/studies/core/reference_sql/builder_medication.sql index c4997196..e1a3bbc7 100644 --- a/cumulus_library/studies/core/reference_sql/builder_medication.sql +++ b/cumulus_library/studies/core/reference_sql/builder_medication.sql @@ -86,7 +86,8 @@ CREATE TABLE core__medication AS ( mr.id, substring(mr.med_ref, 2) AS medication_id FROM mr_basics AS mr - WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE '#%' + WHERE mr.med_ref IS NOT NULL AND REGEXP_LIKE(mr.med_ref, '#.*') + ) ), external_refs AS ( @@ -94,7 +95,8 @@ CREATE TABLE core__medication AS ( mr.id, substring(mr.med_ref, 12) AS medication_id FROM mr_basics AS mr - WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE 'Medication/%' + WHERE mr.med_ref IS NOT NULL AND REGEXP_LIKE(mr.med_ref, 'Medication/.*') + ) ) diff --git 
a/cumulus_library/template_sql/codeable_concept_denormalize.sql.jinja b/cumulus_library/template_sql/codeable_concept_denormalize.sql.jinja index a764161d..8a9fe01b 100644 --- a/cumulus_library/template_sql/codeable_concept_denormalize.sql.jinja +++ b/cumulus_library/template_sql/codeable_concept_denormalize.sql.jinja @@ -79,7 +79,7 @@ CREATE TABLE {{ target_table }} AS ( UNNEST(s.{{ field_alias }}.coding) AS u (coding) {%- if filter_priority %} WHERE - u.coding.system LIKE '{{ system }}' + {{ syntax.like('u.coding.system', system) }} {%- endif %} ), --noqa: LT07 {%- endfor %} diff --git a/cumulus_library/template_sql/coding_denormalize.sql.jinja b/cumulus_library/template_sql/coding_denormalize.sql.jinja index 44c6ba65..a24c4773 100644 --- a/cumulus_library/template_sql/coding_denormalize.sql.jinja +++ b/cumulus_library/template_sql/coding_denormalize.sql.jinja @@ -17,7 +17,7 @@ CREATE TABLE {{ target_table }} AS ( UNNEST(s.{{ parent_field }}) AS u (parent_col) {%- if filter_priority %} WHERE - u.parent_col.{{ column_name }}.system LIKE '{{ system }}' + {{ syntax.like("u.parent_col." + column_name + ".system", system) }} {%- endif %} ), --noqa: LT07 {%- endfor %} diff --git a/cumulus_library/template_sql/shared_macros/syntax.sql.jinja b/cumulus_library/template_sql/shared_macros/syntax.sql.jinja index 59e26c96..f264a411 100644 --- a/cumulus_library/template_sql/shared_macros/syntax.sql.jinja +++ b/cumulus_library/template_sql/shared_macros/syntax.sql.jinja @@ -1,4 +1,4 @@ -{# Commonly used macros related to basic SQL syntax formatting #} +{# Commonly used macros related to basic SQL syntax formatting #}-- noqa:disable=JJ01 {%- macro comma_delineate(loop) -%} {%- if not loop.last -%} @@ -17,4 +17,32 @@ UNION {%- endif -%} {%- endmacro -%} + +{#- The intent of this macro is to convert sql wildcard format to regex syntax. 
+ So for example, a value of match_string like: + +https://fhir.cerner.com/%/codeSet/71 + +will be converted into an anchored, regex-escaped pattern where % matches anything: + +^https://fhir\.cerner\.com/.*/codeSet/71$ + +See https://trino.io/docs/current/functions/regexp.html for more information. We +are applying a limited ruleset currently, since this is based on common patterns +in coding system fields, where this is currently used; if needed this could be +made more robust. +-#} +{%- macro wildcard_to_regex(match_string) -%} +'^{{ match_string.replace("\'", "\\\''").replace(".", "\\.").replace("%", ".*") }}$' +{%- endmacro -%} + + +{#- This macro provides a SQL LIKE-esque interface to abstract away having to +worry about the slightly more performant regex based matching we want to use +in most cases. +-#} +{%- macro like(field, match_string) -%} +REGEXP_LIKE({{ field }}, {{ wildcard_to_regex(match_string) }}) +{%- endmacro -%} + --noqa: LT12 diff --git a/cumulus_library/template_sql/show_tables.sql.jinja b/cumulus_library/template_sql/show_tables.sql.jinja index a40dd66c..14a395b1 100644 --- a/cumulus_library/template_sql/show_tables.sql.jinja +++ b/cumulus_library/template_sql/show_tables.sql.jinja @@ -1,6 +1,7 @@ +{%- import 'syntax.sql.jinja' as syntax -%} SELECT table_name FROM information_schema.tables WHERE table_schema = '{{ schema_name }}' AND table_type = 'BASE TABLE' - AND table_name LIKE '{{ prefix }}%'; + AND {{ syntax.like('table_name', prefix +"%") }} diff --git a/cumulus_library/template_sql/show_views.sql.jinja b/cumulus_library/template_sql/show_views.sql.jinja index 2d4e5e48..774b2d4d 100644 --- a/cumulus_library/template_sql/show_views.sql.jinja +++ b/cumulus_library/template_sql/show_views.sql.jinja @@ -1,6 +1,7 @@ +{%- import 'syntax.sql.jinja' as syntax -%} SELECT table_name FROM information_schema.tables WHERE table_schema = '{{ schema_name }}' AND table_type = 'VIEW' - AND table_name LIKE '{{ prefix }}%'; + AND {{ 
syntax.like('table_name', prefix + "%") }} diff --git a/tests/test_base_templates.py b/tests/test_base_templates.py index 6b17e45b..7e77995d 100644 --- a/tests/test_base_templates.py +++ b/tests/test_base_templates.py @@ -79,7 +79,8 @@ def test_codeable_concept_denormalize_all_creation(): def test_codeable_concept_denormalize_filter_creation(): - expected = """CREATE TABLE target__concepts AS ( + # fmt: off + expected = r"""CREATE TABLE target__concepts AS ( WITH system_code_col_0 AS ( @@ -95,7 +96,7 @@ def test_codeable_concept_denormalize_filter_creation(): source AS s, UNNEST(s.code_col.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://snomed.info/sct' + REGEXP_LIKE(u.coding.system, '^http://snomed\.info/sct$') ), --noqa: LT07 system_code_col_1 AS ( @@ -111,7 +112,23 @@ def test_codeable_concept_denormalize_filter_creation(): source AS s, UNNEST(s.code_col.coding) AS u (coding) WHERE - u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-10-cm' + REGEXP_LIKE(u.coding.system, '^http://hl7\.org/fhir/sid/icd-10-cm$') + ), --noqa: LT07 + + system_code_col_2 AS ( + SELECT DISTINCT + s.id AS id, + 0 AS row, + '2' AS priority, + u.coding.code, + u.coding.display, + u.coding.system AS code_system, + u.coding.userSelected + FROM + source AS s, + UNNEST(s.code_col.coding) AS u (coding) + WHERE + REGEXP_LIKE(u.coding.system, '^https://fhir\.cerner\.com/.*/codeSet/71$') ), --noqa: LT07 union_table AS ( @@ -134,6 +151,16 @@ def test_codeable_concept_denormalize_filter_creation(): display, userSelected FROM system_code_col_1 + UNION + SELECT + id, + row, + priority, + code_system, + code, + display, + userSelected + FROM system_code_col_2 ), @@ -167,6 +194,7 @@ def test_codeable_concept_denormalize_filter_creation(): WHERE available_priority = 1 ); """ + # fmt: on config = sql_utils.CodeableConceptConfig( source_table="source", @@ -177,6 +205,7 @@ def test_codeable_concept_denormalize_filter_creation(): code_systems=[ "http://snomed.info/sct", 
"http://hl7.org/fhir/sid/icd-10-cm", + "https://fhir.cerner.com/%/codeSet/71", ], ) query = base_templates.get_codeable_concept_denormalize_query(config)