Skip to content

Commit

Permalink
Replaced wildcard match with regex (#237)
Browse files Browse the repository at this point in the history
* Replaced wildcard match with regex

* Better regex, wrapper function for like
  • Loading branch information
dogversioning authored May 15, 2024
1 parent eb4b818 commit 03ba78b
Show file tree
Hide file tree
Showing 12 changed files with 114 additions and 37 deletions.
15 changes: 15 additions & 0 deletions cumulus_library/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import json
import os
import pathlib
import re
import sys
from pathlib import Path
from typing import Any, Protocol
Expand Down Expand Up @@ -355,6 +356,13 @@ def __init__(self, db_file: str):
None,
duckdb.typing.VARCHAR,
)
self.connection.create_function(
# DuckDB's version is regexp_matches.
"regexp_like",
self._compat_regexp_like,
None,
duckdb.typing.BOOLEAN,
)
self.connection.create_function(
# We frequently use Athena's date() function because it's easier than
# the more widely-supported way of CAST(x AS DATE).
Expand Down Expand Up @@ -407,6 +415,13 @@ def _compat_array_join(
return None
return delimiter.join(v for v in value if v is not None)

@staticmethod
def _compat_regexp_like(string: str | None, pattern: str | None) -> bool:
if string is None or pattern is None:
return None
match = re.search(pattern, string)
return match is not None

@staticmethod
def _compat_date(
value: str | datetime.datetime | datetime.date | None,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{% import 'core_utils.jinja' as utils -%}
{% import 'syntax.sql.jinja' as syntax -%}

CREATE TABLE core__medication AS (
WITH
Expand Down Expand Up @@ -32,15 +33,15 @@ CREATE TABLE core__medication AS (
mr.id,
substring(mr.med_ref, 2) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE '#%'
WHERE mr.med_ref IS NOT NULL AND {{ syntax.like('mr.med_ref', '#%') }}
),

external_refs AS (
SELECT DISTINCT
mr.id,
substring(mr.med_ref, 12) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE 'Medication/%'
WHERE mr.med_ref IS NOT NULL AND {{ syntax.like('mr.med_ref', 'Medication/%') }}
)

{# Internal: medication data from inline ETL extraction.
Expand Down
18 changes: 9 additions & 9 deletions cumulus_library/studies/core/reference_sql/builder_condition.sql
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ CREATE TABLE core__condition_dn_clinical_status AS (
condition AS s,
UNNEST(s.clinicalStatus.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/condition-clinical'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/condition-clinical')
), --noqa: LT07

union_table AS (
Expand Down Expand Up @@ -137,7 +137,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://snomed.info/sct'
REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct')
), --noqa: LT07

system_code_1 AS (
Expand All @@ -153,7 +153,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-10-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-10-cm')
), --noqa: LT07

system_code_2 AS (
Expand All @@ -169,7 +169,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm')
), --noqa: LT07

system_code_3 AS (
Expand All @@ -185,7 +185,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm/diagnosis'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm/diagnosis')
), --noqa: LT07

system_code_4 AS (
Expand All @@ -201,7 +201,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.728286'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.728286')
), --noqa: LT07

system_code_5 AS (
Expand All @@ -217,7 +217,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.4.698084.10375'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.4\.698084\.10375')
), --noqa: LT07

system_code_6 AS (
Expand All @@ -233,7 +233,7 @@ CREATE TABLE core__condition_codable_concepts_display AS (
condition AS s,
UNNEST(s.code.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/data-absent-reason'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/data-absent-reason')
), --noqa: LT07

union_table AS (
Expand Down Expand Up @@ -397,7 +397,7 @@ CREATE TABLE core__condition_dn_verification_status AS (
condition AS s,
UNNEST(s.verificationStatus.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/condition-ver-status'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/condition-ver-status')
), --noqa: LT07

union_table AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ CREATE TABLE core__documentreference_dn_category AS (
flattened_rows AS s,
UNNEST(s.category.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/us/core/ValueSet/us-core-documentreference-category'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/us/core/ValueSet/us-core-documentreference-category')
), --noqa: LT07

union_table AS (
Expand Down
30 changes: 15 additions & 15 deletions cumulus_library/studies/core/reference_sql/builder_encounter.sql
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/encounter-type'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/encounter-type')
), --noqa: LT07

system_type_1 AS (
Expand All @@ -48,7 +48,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/v2-0004'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/v2-0004')
), --noqa: LT07

system_type_2 AS (
Expand All @@ -64,7 +64,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:2.16.840.1.113883.4.642.3.248'
REGEXP_LIKE(u.coding.system, 'urn:oid:2\.16\.840\.1\.113883\.4\.642\.3\.248')
), --noqa: LT07

system_type_3 AS (
Expand All @@ -80,7 +80,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://snomed.info/sct'
REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct')
), --noqa: LT07

system_type_4 AS (
Expand All @@ -96,7 +96,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'https://fhir.cerner.com/%/codeSet/71'
REGEXP_LIKE(u.coding.system, 'https://fhir\.cerner\.com/.*/codeSet/71')
), --noqa: LT07

system_type_5 AS (
Expand All @@ -112,7 +112,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.10110'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.10110')
), --noqa: LT07

system_type_6 AS (
Expand All @@ -128,7 +128,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.18875'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.18875')
), --noqa: LT07

system_type_7 AS (
Expand All @@ -144,7 +144,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.10.698084.30'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.10\.698084\.30')
), --noqa: LT07

system_type_8 AS (
Expand All @@ -160,7 +160,7 @@ CREATE TABLE core__encounter_dn_type AS (
flattened_rows AS s,
UNNEST(s.type.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.808267'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.808267')
), --noqa: LT07

union_table AS (
Expand Down Expand Up @@ -340,7 +340,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://terminology.hl7.org/CodeSystem/v3-ActPriority'
REGEXP_LIKE(u.coding.system, 'http://terminology\.hl7\.org/CodeSystem/v3-ActPriority')
), --noqa: LT07

system_reasoncode_1 AS (
Expand All @@ -356,7 +356,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://snomed.info/sct'
REGEXP_LIKE(u.coding.system, 'http://snomed\.info/sct')
), --noqa: LT07

system_reasoncode_2 AS (
Expand All @@ -372,7 +372,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-10-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-10-cm')
), --noqa: LT07

system_reasoncode_3 AS (
Expand All @@ -388,7 +388,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'http://hl7.org/fhir/sid/icd-9-cm'
REGEXP_LIKE(u.coding.system, 'http://hl7\.org/fhir/sid/icd-9-cm')
), --noqa: LT07

system_reasoncode_4 AS (
Expand All @@ -404,7 +404,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'https://fhir.cerner.com/%/nomenclature'
REGEXP_LIKE(u.coding.system, 'https://fhir\.cerner\.com/.*/nomenclature')
), --noqa: LT07

system_reasoncode_5 AS (
Expand All @@ -420,7 +420,7 @@ CREATE TABLE core__encounter_dn_reasoncode AS (
flattened_rows AS s,
UNNEST(s.reasoncode.coding) AS u (coding)
WHERE
u.coding.system LIKE 'urn:oid:1.2.840.114350.1.13.71.2.7.2.728286'
REGEXP_LIKE(u.coding.system, 'urn:oid:1\.2\.840\.114350\.1\.13\.71\.2\.7\.2\.728286')
), --noqa: LT07

union_table AS (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,17 @@ CREATE TABLE core__medication AS (
mr.id,
substring(mr.med_ref, 2) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE '#%'
WHERE mr.med_ref IS NOT NULL AND REGEXP_LIKE(mr.med_ref, '#.*')
)
),

external_refs AS (
SELECT DISTINCT
mr.id,
substring(mr.med_ref, 12) AS medication_id
FROM mr_basics AS mr
WHERE mr.med_ref IS NOT NULL AND mr.med_ref LIKE 'Medication/%'
WHERE mr.med_ref IS NOT NULL AND REGEXP_LIKE(mr.med_ref, 'Medication/.*')
)
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ CREATE TABLE {{ target_table }} AS (
UNNEST(s.{{ field_alias }}.coding) AS u (coding)
{%- if filter_priority %}
WHERE
u.coding.system LIKE '{{ system }}'
{{ syntax.like('u.coding.system', system) }}
{%- endif %}
), --noqa: LT07
{%- endfor %}
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/template_sql/coding_denormalize.sql.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ CREATE TABLE {{ target_table }} AS (
UNNEST(s.{{ parent_field }}) AS u (parent_col)
{%- if filter_priority %}
WHERE
u.parent_col.{{ column_name }}.system LIKE '{{ system }}'
{{ syntax.like("u.parent_col." + column_name + ".system", system) }}
{%- endif %}
), --noqa: LT07
{%- endfor %}
Expand Down
30 changes: 29 additions & 1 deletion cumulus_library/template_sql/shared_macros/syntax.sql.jinja
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{# Commonly used macros related to basic SQL syntax formatting #}
{# Commonly used macros related to basic SQL syntax formatting #}-- noqa:disable=JJ01

{%- macro comma_delineate(loop) -%}
{%- if not loop.last -%}
Expand All @@ -17,4 +17,32 @@
UNION
{%- endif -%}
{%- endmacro -%}

{#- The intent of this macro is to convert sql wildcard format to regex syntax.
So for example, a value of macro_string like:
https://fhir.cerner.com/%/codeSet/71
will be converted into a regex-escaped pattern with a regex match anything pattern:
https:\/\/fhir\.cerner\.com\/(.*)\/codeSet\/71
See https://trino.io/docs/current/functions/regexp.html for more information. We
are applying a limited ruleset currently, since this is based on common patterns
in coding system fields, where this is currently used; if needed this could be
made more robust.
-#}
{%- macro wildcard_to_regex(match_string) -%}
'^{{ match_string.replace("\'", "\\\''").replace(".", "\\.").replace("%", ".*") }}$'
{%- endmacro -%}


{#- This macro provides a SQL LIKE-esque interface to abstract away having to
worry about the slightly more performant regex based matching we want to use
in most cases.
-#}
{%- macro like(field, match_string) -%}
REGEXP_LIKE({{ field }}, {{ wildcard_to_regex(match_string) }})
{%- endmacro -%}

--noqa: LT12
3 changes: 2 additions & 1 deletion cumulus_library/template_sql/show_tables.sql.jinja
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{%- import 'syntax.sql.jinja' as syntax -%}
SELECT table_name
FROM information_schema.tables
WHERE
table_schema = '{{ schema_name }}'
AND table_type = 'BASE TABLE'
AND table_name LIKE '{{ prefix }}%';
AND {{ syntax.like('table_name', prefix +"%") }}
3 changes: 2 additions & 1 deletion cumulus_library/template_sql/show_views.sql.jinja
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{%- import 'syntax.sql.jinja' as syntax -%}
SELECT table_name
FROM information_schema.tables
WHERE
table_schema = '{{ schema_name }}'
AND table_type = 'VIEW'
AND table_name LIKE '{{ prefix }}%';
AND {{ syntax.like('table_name', prefix + "%") }}
Loading

0 comments on commit 03ba78b

Please sign in to comment.