Skip to content

Commit

Permalink
Hard coded env vars for test environment and fixed warehouse clash with existing tests in e2e.
Browse files Browse the repository at this point in the history
  • Loading branch information
wpfl-dbt committed Feb 28, 2025
1 parent a544e90 commit 5dbd08b
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 25 deletions.
18 changes: 9 additions & 9 deletions src/matchbox/client/helpers/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def select(
Args:
selection: Full source names and optionally a subset of columns to select
engine: the engine to connect to the data warehouse hosting the source.
engine: The engine to connect to the data warehouse hosting the source.
If not provided, will use a connection string from the
`MB__CLIENT__DEFAULT_WAREHOUSE` environment variable.
non_indexed: Whether you intend to select non-indexed columns. Will raise a
Expand Down Expand Up @@ -101,20 +101,20 @@ def query(
"""Runs queries against the selected backend.
Args:
selectors: each selector is the output of `select()`
selectors: Each selector is the output of `select()`.
This allows querying sources coming from different engines
resolution_name (optional): the name of the resolution point to query
resolution_name (optional): The name of the resolution point to query
If not set:
* If querying a single source, it will use the source resolution
* If querying 2 or more sources, it will look for a default resolution
return_type: the form to return data in, one of "pandas" or "arrow"
return_type: The form to return data in, one of "pandas" or "arrow"
Defaults to pandas for ease of use
threshold (optional): the threshold to use for creating clusters
threshold (optional): The threshold to use for creating clusters
If None, uses the resolutions' default threshold
If an integer, uses that threshold for the specified resolution, and the
resolution's cached thresholds for its ancestors
limit (optional): the number to use in a limit clause. Useful for testing
limit (optional): The number to use in a limit clause. Useful for testing
Returns:
Data in the requested return type
Expand Down Expand Up @@ -214,13 +214,13 @@ def match(
"""Matches IDs against the selected backend.
Args:
targets: each target is the output of `select()`
targets: Each target is the output of `select()`.
This allows matching against sources coming from different engines
source: The output of using `select()` on a single source.
source_pk: The primary key value to match from the source.
resolution_name (optional): the resolution name to use for filtering results.
resolution_name (optional): The resolution name to use for filtering results.
If not set, it will look for a default resolution.
threshold (optional): the threshold to use for creating clusters
threshold (optional): The threshold to use for creating clusters.
If None, uses the resolutions' default threshold
If an integer, uses that threshold for the specified resolution, and the
resolution's cached thresholds for its ancestors
Expand Down
Empty file removed test/e2e/.gitkeep
Empty file.
32 changes: 19 additions & 13 deletions test/e2e/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def setup_environment(self, matchbox_client: Client, warehouse_engine: Engine):
# Create source configurations that match our test fixtures
source_configs = (
SourceConfig(
full_name="test.crn",
full_name="e2e.crn",
engine=warehouse_engine,
features=(
features["company_name"].add_variations(
Expand All @@ -82,7 +82,7 @@ def setup_environment(self, matchbox_client: Client, warehouse_engine: Engine):
repetition=0, # No duplicates within the variations
),
SourceConfig(
full_name="test.duns",
full_name="e2e.duns",
engine=warehouse_engine,
features=(
features["company_name"],
Expand All @@ -92,7 +92,7 @@ def setup_environment(self, matchbox_client: Client, warehouse_engine: Engine):
repetition=0,
),
SourceConfig(
full_name="test.cdms",
full_name="e2e.cdms",
engine=warehouse_engine,
features=(
features["crn"],
Expand All @@ -109,6 +109,12 @@ def setup_environment(self, matchbox_client: Client, warehouse_engine: Engine):
seed=42, # For reproducibility
)

# Use a separate schema to avoid conflict with legacy test data
# TODO: Remove once legacy tests are refactored
with warehouse_engine.connect() as conn:
conn.execute(text("create schema e2e;"))
conn.commit()

# Setup code - Create tables in warehouse
for source_name, source_testkit in self.linked_testkit.sources.items():
# Use pandas to write the data to the warehouse
Expand Down Expand Up @@ -266,13 +272,13 @@ def _clean_company_name(df: DataFrame, prefix: str) -> DataFrame:
# Define linking pairs based on common fields
linking_pairs = [
(
self.linked_testkit.sources["test.crn"],
self.linked_testkit.sources["test.duns"],
self.linked_testkit.sources["e2e.crn"],
self.linked_testkit.sources["e2e.duns"],
"company_name",
), # CRN-DUNS via company_name
(
self.linked_testkit.sources["test.crn"],
self.linked_testkit.sources["test.cdms"],
self.linked_testkit.sources["e2e.crn"],
self.linked_testkit.sources["e2e.cdms"],
"crn",
), # CRN-CDMS via crn
]
Expand Down Expand Up @@ -375,9 +381,9 @@ def _clean_company_name(df: DataFrame, prefix: str) -> DataFrame:

# === FINAL LINKING PHASE ===
# Now link the first linked pair (crn-duns) with the third source (cdms)
crn_source = "test.crn"
duns_source = "test.duns"
cdms_source = "test.cdms"
crn_source = "e2e.crn"
duns_source = "e2e.duns"
cdms_source = "e2e.cdms"
first_pair = (crn_source, duns_source)

# Get prefixes for column names
Expand Down Expand Up @@ -469,9 +475,9 @@ def _clean_company_name(df: DataFrame, prefix: str) -> DataFrame:

# === FINAL VERIFICATION PHASE ===
# Query the final linked data with specific columns
crn_source = "test.crn"
duns_source = "test.duns"
cdms_source = "test.cdms"
crn_source = "e2e.crn"
duns_source = "e2e.duns"
cdms_source = "e2e.cdms"

# Get necessary columns from each source
final_df = query(
Expand Down
9 changes: 6 additions & 3 deletions test/fixtures/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from matchbox import index, make_model
from matchbox.client._handler import create_client
from matchbox.client._settings import ClientSettings
from matchbox.client._settings import settings as client_settings
from matchbox.common.sources import Source, SourceAddress
from matchbox.server.base import MatchboxDatastoreSettings, MatchboxDBAdapter
from matchbox.server.postgresql import MatchboxPostgres, MatchboxPostgresSettings
Expand Down Expand Up @@ -396,15 +395,19 @@ def s3(aws_credentials: None) -> Generator[S3Client, None, None]:
def matchbox_api() -> Generator[MockRouter, None, None]:
"""Client for the mocked Matchbox API."""
with respx.mock(
base_url=client_settings.api_root, assert_all_called=True
base_url="http://localhost:8000", assert_all_called=True
) as respx_mock:
yield respx_mock


@pytest.fixture(scope="session")
def matchbox_client_settings() -> ClientSettings:
"""Client settings for the Matchbox API running in Docker."""
return client_settings
return ClientSettings(
api_root="http://localhost:8000",
timeout=10,
retry_delay=2,
)


@pytest.fixture(scope="session")
Expand Down

0 comments on commit 5dbd08b

Please sign in to comment.