From 85e07ac67d72565a0100d520088d0caa94973265 Mon Sep 17 00:00:00 2001
From: Dejan Lozanovic
Date: Wed, 5 Feb 2025 11:26:20 +0000
Subject: [PATCH 1/5] Pre pr adbc ingestion

---
 .gitignore                                 |   2 +-
 src/matchbox/server/postgresql/utils/db.py | 108 +++++++++++++++++++++
 2 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 16df8831..b04cc875 100644
--- a/.gitignore
+++ b/.gitignore
@@ -190,4 +190,4 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
\ No newline at end of file
+.idea/
\ No newline at end of file

diff --git a/src/matchbox/server/postgresql/utils/db.py b/src/matchbox/server/postgresql/utils/db.py
index e07fffb8..85351382 100644
--- a/src/matchbox/server/postgresql/utils/db.py
+++ b/src/matchbox/server/postgresql/utils/db.py
@@ -4,6 +4,11 @@
 import pstats
 from itertools import islice
 from typing import Any, Callable, Iterable
+from datetime import datetime
+
+import pyarrow as pa
+import adbc_driver_postgresql.dbapi
+from adbc_driver_manager.dbapi import Connection as ADBCConnection
 
 from pg_bulk_ingest import Delete, Upsert, ingest
 from sqlalchemy import Engine, Index, MetaData, Table, func
@@ -162,3 +167,106 @@ def batch_ingest(
         upsert=Upsert.IF_PRIMARY_KEY,
         delete=Delete.OFF,
     )
+
+POSTGRESQL_URI = "postgresql://postgres:postgres@localhost:5432/postgres"
+def adbc_ingest_data(clusters:pa.Table, contains:pa.Table, probabilities:pa.Table) -> bool:
+    """Ingest data into PostgreSQL using PyArrow ADBC ingestion.
+    Args: clusters: pa.Table, contains: pa.Table, probabilities: pa.Table
+    """
+    suffix = datetime.now().strftime("%Y%m%d%H%M%S")
+    if _adbc_insert_data(clusters, contains, probabilities, suffix):
+        # the staging tables are created unqualified, i.e. in the default
+        # "public" schema, so that is the schema passed here
+        return _create_adbc_table_constraints("public", suffix)
+    else:
+        return False
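+# Assumed overall flow of the helpers below (staging-and-swap): bulk-load the
+# three Arrow tables into suffixed staging tables via ADBC, attach primary
+# keys, indexes and foreign keys to them, then drop the live tables and
+# rename the staged ones into place.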
+
+def _create_adbc_table_constraints(db_schema:str, sufix:str) -> bool:
+    """Create primary keys, indexes and foreign keys on the staging tables.
+    Args: db_schema: str, the name of the schema
+    """
+    # Cluster
+    _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD PRIMARY KEY (cluster_id)")
+    _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD PRIMARY KEY (resolution, "cluster")""")
+    _run_query(f"CREATE UNIQUE INDEX cluster_hash_index_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_hash)")
+    # _run_query(f"CREATE UNIQUE INDEX clusters_adbc_clusters_is_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_id)")
+    _run_query(f"CREATE INDEX ix_clusters_id_gin_{sufix} ON {db_schema}.clusters_{sufix} USING gin (source_pk)")
+    _run_query(f"CREATE INDEX ix_mb_clusters_source_pk_{sufix} ON {db_schema}.clusters_{sufix} USING btree (source_pk)")
+
+    # Contains
+    _run_query(f"CREATE UNIQUE INDEX ix_contains_child_parent_{sufix} ON {db_schema}.contains_{sufix} USING btree (child, parent)")
+    _run_query(f"CREATE UNIQUE INDEX ix_contains_parent_child_{sufix} ON {db_schema}.contains_{sufix} USING btree (parent, child)")
+
+    # Foreign keys
+    _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD CONSTRAINT clusters_dataset_fkey FOREIGN KEY (dataset) REFERENCES {db_schema}.sources(resolution_id)")
+    _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_child_fkey FOREIGN KEY (child) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""")
+    _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_parent_fkey FOREIGN KEY (parent) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""")
+    _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_cluster_fkey FOREIGN KEY ("cluster") REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""")
+    _run_query(f"ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_resolution_fkey FOREIGN KEY (resolution) REFERENCES {db_schema}.resolutions(resolution_id) ON DELETE CASCADE")
+
+    _run_queries([
+        f"""DROP TABLE IF EXISTS {db_schema}.clusters""",
+        f"""DROP TABLE IF EXISTS {db_schema}.contains""",
+        f"""DROP TABLE IF EXISTS {db_schema}.probabilities""",
+
+        f"""ALTER TABLE {db_schema}.clusters_{sufix} RENAME TO clusters""",
+        f"""ALTER TABLE {db_schema}.contains_{sufix} RENAME TO contains""",
+        f"""ALTER TABLE {db_schema}.probabilities_{sufix} RENAME TO probabilities"""
+    ])
+
+def _adbc_insert_data(clusters, contains, probabilities, suffix) -> bool:
+    # TODO: try except proper exception type from adbc
+    conn = adbc_driver_postgresql.dbapi.connect(POSTGRESQL_URI)
+    _run_query(f"CREATE TABLE clusters_{suffix} AS SELECT * FROM clusters")
+    _save_to_postgresql(
+        table=clusters,
+        conn=conn,
+        schema="",
+        table_name=f"clusters_{suffix}",
+    )
+    _run_query(f"CREATE TABLE contains_{suffix} AS SELECT * FROM contains")
+    _save_to_postgresql(
+        table=contains,
+        conn=conn,
+        schema="",
+        table_name=f"contains_{suffix}",
+    )
+    _run_query(f"CREATE TABLE probabilities_{suffix} AS SELECT * FROM probabilities")
+    _save_to_postgresql(
+        table=probabilities,
+        conn=conn,
+        schema="",
+        table_name=f"probabilities_{suffix}",
+    )
+    conn.commit()
+    return True
+
+def _run_query(query: str) -> None:
+    conn = get_engine()
+    conn.execute(text(query))
+    conn.commit()
+
+
+def _run_queries(queries: list[str]) -> None:
+    conn = get_engine()
+    conn.begin()
+    for query in queries:
+        conn.execute(text(query))
+    conn.commit()
+
+def _save_to_postgresql(
+    table: pa.Table, conn: ADBCConnection, schema: str, table_name: str
+):
+    """
+    Saves a PyArrow Table to PostgreSQL using ADBC.
+    """
+    with conn.cursor() as cursor:
+        # Convert PyArrow Table to Arrow RecordBatchStream for efficient transfer
+        batch_reader = pa.RecordBatchReader.from_batches(
+            table.schema, table.to_batches()
+        )
+        cursor.adbc_ingest(
+            table_name=table_name,
+            data=batch_reader,
+            mode="append",
+            db_schema_name=schema,
+        )
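For context between these two patches: `cursor.adbc_ingest` is the ADBC DBAPI
call that streams Arrow record batches directly into a PostgreSQL table, and it
is what `_save_to_postgresql` above wraps. A minimal, self-contained sketch of
the same call (the URI and table name here are placeholders, not values from
the patch):

    import pyarrow as pa
    import adbc_driver_postgresql.dbapi

    table = pa.table({"cluster_id": [1, 2], "cluster_hash": [b"a", b"b"]})
    uri = "postgresql://user:password@localhost:5432/postgres"  # placeholder
    with adbc_driver_postgresql.dbapi.connect(uri) as conn:
        with conn.cursor() as cursor:
            # mode="create" derives the table from the Arrow schema; the patch
            # itself uses mode="append" against tables created beforehand
            cursor.adbc_ingest("clusters_example", table, mode="create")
        conn.commit()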
From 6748dacbd112615e661851b8fa4e447b8f1e1389 Mon Sep 17 00:00:00 2001
From: Dejan Lozanovic
Date: Thu, 6 Feb 2025 09:06:36 +0000
Subject: [PATCH 2/5] Pre pr adbc ingestion

---
 pyproject.toml                             |   1 +
 src/matchbox/server/postgresql/utils/db.py | 126 ++++++++++--------
 .../server/postgresql/utils/insert.py      |  67 +---------
 uv.lock                                    |  96 ++++++++++---
 4 files changed, 150 insertions(+), 140 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b6b0a540..b8559262 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     "splink>=4.0.5,<4.1.0",
     "sqlalchemy>=2.0.35",
     "rich>=13.9.4",
+    "adbc-driver-postgresql>=1.4.0",
 ]
 
 [project.optional-dependencies]
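One caveat on the connection string the db.py hunk below assembles from the
MB__POSTGRES__* environment variables: a raw f-string breaks if the password
contains URL-reserved characters. A sketch of the same assembly with escaping
(variable names mirror what the hunk reads; this is a suggestion, not part of
the patch):

    import os
    from urllib.parse import quote_plus

    user = os.environ["MB__POSTGRES__USER"]
    password = quote_plus(os.environ["MB__POSTGRES__PASSWORD"])  # escape reserved chars
    host = os.environ["MB__POSTGRES__HOST"]
    port = os.environ["MB__POSTGRES__PORT"]
    database = os.environ["MB__POSTGRES__DATABASE"]
    POSTGRESQL_URI = f"postgresql://{user}:{password}@{host}:{port}/{database}"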
diff --git a/src/matchbox/server/postgresql/utils/db.py b/src/matchbox/server/postgresql/utils/db.py
index 85351382..5bef6999 100644
--- a/src/matchbox/server/postgresql/utils/db.py
+++ b/src/matchbox/server/postgresql/utils/db.py
@@ -1,6 +1,7 @@
 import contextlib
 import cProfile
 import io
+import os
 import pstats
 from itertools import islice
 from typing import Any, Callable, Iterable
@@ -9,11 +10,13 @@
 
 import pyarrow as pa
 import adbc_driver_postgresql.dbapi
 from adbc_driver_manager.dbapi import Connection as ADBCConnection
+from adbc_driver_manager import DatabaseError as ADBCDatabaseError
 
 from pg_bulk_ingest import Delete, Upsert, ingest
-from sqlalchemy import Engine, Index, MetaData, Table, func
+from sqlalchemy import Engine, Index, MetaData, Table, func, text
 from sqlalchemy.engine.base import Connection
 from sqlalchemy.orm import DeclarativeMeta, Session
+from sqlalchemy.exc import DatabaseError as AlchemyDatabaseError
 
 from matchbox.common.graph import (
     ResolutionEdge,
@@ -168,40 +171,51 @@ def batch_ingest(
         delete=Delete.OFF,
     )
 
-POSTGRESQL_URI = "postgresql://postgres:postgres@localhost:5432/postgres"
-def adbc_ingest_data(clusters:pa.Table, contains:pa.Table, probabilities:pa.Table) -> bool:
+
+MB__POSTGRES__PASSWORD = os.environ["MB__POSTGRES__PASSWORD"]
+MB__POSTGRES__PORT = os.environ["MB__POSTGRES__PORT"]
+MB__POSTGRES__USER = os.environ["MB__POSTGRES__USER"]
+MB__POSTGRES__DATABASE = os.environ["MB__POSTGRES__DATABASE"]
+MB__POSTGRES__HOST = os.environ["MB__POSTGRES__HOST"]
+MB__POSTGRES__SCHEMA = os.environ["MB__POSTGRES__DB_SCHEMA"]
+
+
+POSTGRESQL_URI = f"postgresql://{MB__POSTGRES__USER}:{MB__POSTGRES__PASSWORD}@{MB__POSTGRES__HOST}:{MB__POSTGRES__PORT}/{MB__POSTGRES__DATABASE}"
+
+def adbc_ingest_data(clusters:pa.Table, contains:pa.Table, probabilities:pa.Table, engine:Engine, resolution_id:int) -> bool:
     """Ingest data into PostgreSQL using PyArrow ADBC ingestion.
-    Args: clusters: pa.Table, contains: pa.Table, probabilities: pa.Table
+    Args: clusters: pa.Table, contains: pa.Table, probabilities: pa.Table, engine: Engine
     """
-    suffix = datetime.now().strftime("%Y%m%d%H%M%S")
-    if _adbc_insert_data(clusters, contains, probabilities, suffix):
-        # the staging tables are created unqualified, i.e. in the default
-        # "public" schema, so that is the schema passed here
-        return _create_adbc_table_constraints("public", suffix)
-    else:
-        return False
+
+    with engine.connect() as alchemy_conn:
+        suffix = datetime.now().strftime("%Y%m%d%H%M%S")
+        if _adbc_insert_data(clusters, contains, probabilities, suffix, alchemy_conn, resolution_id):
+            return _create_adbc_table_constraints(MB__POSTGRES__SCHEMA, suffix, alchemy_conn)
+        else:
+            return False
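+# Note on the design below (as understood from the code): DDL such as
+# CREATE TABLE ... AS runs on the SQLAlchemy connection, while a separate
+# ADBC connection streams the Arrow batches; the WHERE clause on the
+# probabilities copy drops the rows previously proposed by the resolution
+# being re-inserted.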
 
-def _create_adbc_table_constraints(db_schema:str, sufix:str) -> bool:
+def _create_adbc_table_constraints(db_schema:str, sufix:str, conn:Connection) -> bool:
     """Create primary keys, indexes and foreign keys on the staging tables.
     Args: db_schema: str, the name of the schema
     """
     # Cluster
-    _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD PRIMARY KEY (cluster_id)")
-    _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD PRIMARY KEY (resolution, "cluster")""")
-    _run_query(f"CREATE UNIQUE INDEX cluster_hash_index_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_hash)")
-    # _run_query(f"CREATE UNIQUE INDEX clusters_adbc_clusters_is_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_id)")
-    _run_query(f"CREATE INDEX ix_clusters_id_gin_{sufix} ON {db_schema}.clusters_{sufix} USING gin (source_pk)")
-    _run_query(f"CREATE INDEX ix_mb_clusters_source_pk_{sufix} ON {db_schema}.clusters_{sufix} USING btree (source_pk)")
+    _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD PRIMARY KEY (cluster_id)", conn)
+    _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD PRIMARY KEY (resolution, "cluster")""", conn)
+    _run_query(f"CREATE UNIQUE INDEX cluster_hash_index_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_hash)", conn)
+    # _run_query(f"CREATE UNIQUE INDEX clusters_adbc_clusters_is_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_id)", conn)
+    _run_query(f"CREATE INDEX ix_clusters_id_gin_{sufix} ON {db_schema}.clusters_{sufix} USING gin (source_pk)", conn)
+    _run_query(f"CREATE INDEX ix_mb_clusters_source_pk_{sufix} ON {db_schema}.clusters_{sufix} USING btree (source_pk)", conn)
 
     # Contains
-    _run_query(f"CREATE UNIQUE INDEX ix_contains_child_parent_{sufix} ON {db_schema}.contains_{sufix} USING btree (child, parent)")
-    _run_query(f"CREATE UNIQUE INDEX ix_contains_parent_child_{sufix} ON {db_schema}.contains_{sufix} USING btree (parent, child)")
+    _run_query(f"CREATE UNIQUE INDEX ix_contains_child_parent_{sufix} ON {db_schema}.contains_{sufix} USING btree (child, parent)", conn)
+    _run_query(f"CREATE UNIQUE INDEX ix_contains_parent_child_{sufix} ON {db_schema}.contains_{sufix} USING btree (parent, child)", conn)
 
     # Foreign keys
-    _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD CONSTRAINT clusters_dataset_fkey FOREIGN KEY (dataset) REFERENCES {db_schema}.sources(resolution_id)")
-    _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_child_fkey FOREIGN KEY (child) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""")
-    _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_parent_fkey FOREIGN KEY (parent) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""")
-    _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_cluster_fkey FOREIGN KEY ("cluster") REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""")
-    _run_query(f"ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_resolution_fkey FOREIGN KEY (resolution) REFERENCES {db_schema}.resolutions(resolution_id) ON DELETE CASCADE")
+    _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD CONSTRAINT clusters_dataset_fkey FOREIGN KEY (dataset) REFERENCES {db_schema}.sources(resolution_id)", conn)
+    _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_child_fkey FOREIGN KEY (child) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""", conn)
+    _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_parent_fkey FOREIGN KEY (parent) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""", conn)
+    _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_cluster_fkey FOREIGN KEY ("cluster") REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""", conn)
+    _run_query(f"ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_resolution_fkey FOREIGN KEY (resolution) REFERENCES {db_schema}.resolutions(resolution_id) ON DELETE CASCADE", conn)
 
     _run_queries([
         f"""DROP TABLE IF EXISTS {db_schema}.clusters""",
         f"""DROP TABLE IF EXISTS {db_schema}.contains""",
         f"""DROP TABLE IF EXISTS {db_schema}.probabilities""",
 
         f"""ALTER TABLE {db_schema}.clusters_{sufix} RENAME TO clusters""",
         f"""ALTER TABLE {db_schema}.contains_{sufix} RENAME TO contains""",
         f"""ALTER TABLE {db_schema}.probabilities_{sufix} RENAME TO probabilities"""
-    ])
-
-def _adbc_insert_data(clusters, contains, probabilities, suffix) -> bool:
-    # TODO: try except proper exception type from adbc
-    conn = adbc_driver_postgresql.dbapi.connect(POSTGRESQL_URI)
-    _run_query(f"CREATE TABLE clusters_{suffix} AS SELECT * FROM clusters")
-    _save_to_postgresql(
-        table=clusters,
-        conn=conn,
-        schema="",
-        table_name=f"clusters_{suffix}",
-    )
-    _run_query(f"CREATE TABLE contains_{suffix} AS SELECT * FROM contains")
-    _save_to_postgresql(
-        table=contains,
-        conn=conn,
-        schema="",
-        table_name=f"contains_{suffix}",
-    )
-    _run_query(f"CREATE TABLE probabilities_{suffix} AS SELECT * FROM probabilities")
-    _save_to_postgresql(
-        table=probabilities,
-        conn=conn,
-        schema="",
-        table_name=f"probabilities_{suffix}",
-    )
-    conn.commit()
+    ], conn)
     return True
 
-def _run_query(query: str) -> None:
-    conn = get_engine()
+def _adbc_insert_data(clusters:pa.Table, contains:pa.Table, probabilities:pa.Table, suffix:str, alchemy_conn:Connection, resolution_id:int) -> bool:
+    # TODO: try except proper exception type from adbc
+    with adbc_driver_postgresql.dbapi.connect(POSTGRESQL_URI) as conn:
+        try:
+            _run_query(f"CREATE TABLE clusters_{suffix} AS SELECT * FROM clusters", alchemy_conn)
+            _save_to_postgresql(
+                table=clusters,
+                conn=conn,
+                schema=MB__POSTGRES__SCHEMA,
+                table_name=f"clusters_{suffix}",
+            )
+            _run_query(f"CREATE TABLE contains_{suffix} AS SELECT * FROM contains", alchemy_conn)
+            _save_to_postgresql(
+                table=contains,
+                conn=conn,
+                schema=MB__POSTGRES__SCHEMA,
+                table_name=f"contains_{suffix}",
+            )
+            _run_query(f"CREATE TABLE probabilities_{suffix} AS SELECT * FROM probabilities WHERE resolution != {resolution_id}", alchemy_conn)
+            _save_to_postgresql(
+                table=probabilities,
+                conn=conn,
+                schema=MB__POSTGRES__SCHEMA,
+                table_name=f"probabilities_{suffix}",
+            )
+            conn.commit()
+            return True
+        except ADBCDatabaseError:
+            return False
+        except AlchemyDatabaseError:
+
return False + +def _run_query(query: str,conn:Connection) -> None: conn.execute(text(query)) conn.commit() -def _run_queries(queries: list[str]) -> None: - conn = get_engine() +def _run_queries(queries: list[str], conn:Connection) -> None: conn.begin() for query in queries: conn.execute(text(query)) diff --git a/src/matchbox/server/postgresql/utils/insert.py b/src/matchbox/server/postgresql/utils/insert.py index 773f5be7..5b9c800e 100644 --- a/src/matchbox/server/postgresql/utils/insert.py +++ b/src/matchbox/server/postgresql/utils/insert.py @@ -25,7 +25,7 @@ Resolutions, Sources, ) -from matchbox.server.postgresql.utils.db import batch_ingest, hash_to_hex_decode +from matchbox.server.postgresql.utils.db import batch_ingest, hash_to_hex_decode , adbc_ingest_data logic_logger = logging.getLogger("mb_logic") @@ -493,69 +493,6 @@ def insert_results( resolution=resolution, results=results, engine=engine ) - with Session(engine) as session: - try: - # Clear existing probabilities for this resolution - session.execute( - delete(Probabilities).where( - Probabilities.resolution == resolution.resolution_id - ) - ) - - session.commit() - logic_logger.info(f"[{resolution.name}] Removed old probabilities") - - except SQLAlchemyError as e: - session.rollback() - logic_logger.error( - f"[{resolution.name}] Failed to clear old probabilities: {str(e)}" - ) - raise - - with engine.connect() as conn: - try: - logic_logger.info( - f"[{resolution.name}] Inserting {clusters.shape[0]:,} results objects" - ) - - batch_ingest( - records=[tuple(c.values()) for c in clusters.to_pylist()], - table=Clusters, - conn=conn, - batch_size=batch_size, - ) - - logic_logger.info( - f"[{resolution.name}] Successfully inserted {clusters.shape[0]} " - "objects into Clusters table" - ) - - batch_ingest( - records=[tuple(c.values()) for c in contains.to_pylist()], - table=Contains, - conn=conn, - batch_size=batch_size, - ) - - logic_logger.info( - f"[{resolution.name}] Successfully inserted {contains.shape[0]} " - "objects into Contains table" - ) - - batch_ingest( - records=[tuple(c.values()) for c in probabilities.to_pylist()], - table=Probabilities, - conn=conn, - batch_size=batch_size, - ) - - logic_logger.info( - f"[{resolution.name}] Successfully inserted " - f"{probabilities.shape[0]} objects into Probabilities table" - ) - - except SQLAlchemyError as e: - logic_logger.error(f"[{resolution.name}] Failed to insert data: {str(e)}") - raise + adbc_ingest_data(clusters=clusters, contains=contains, probabilities=probabilities, engine=engine, resolution_id=resolution.resolution_id) logic_logger.info(f"[{resolution.name}] Insert operation complete!") diff --git a/uv.lock b/uv.lock index 2a33a846..6e30c485 100644 --- a/uv.lock +++ b/uv.lock @@ -7,6 +7,49 @@ resolution-markers = [ "python_full_version < '3.12' and platform_python_implementation == 'PyPy'", ] +[[package]] +name = "adbc-driver-manager" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/20/2c/580f0e2024ce0c6330c5e990eaeffd2b206355b5929d19889b3eac008064/adbc_driver_manager-1.4.0.tar.gz", hash = "sha256:fb8437f3e3aad9aa119ccbdfe05b83418ae552ca7e1e3f6d46d53e1f316610b8", size = 107712 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/07/72cfaec3fb1e5090e4495bb310e4ae62d4262ea2330ee7f7395909b3505d/adbc_driver_manager-1.4.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = 
"sha256:7bd274386c3decd52212d225cc46f934ce3503a3f9e0e823e4f8a40162d89b2b", size = 381818 }, + { url = "https://files.pythonhosted.org/packages/20/80/efb076dd9148f903a4e96f969dd7a0cdafeeabba8e14c1db9bf21452ce31/adbc_driver_manager-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:90d1055f2557d703fa2e7073d7b26cc6394159ff4b1422d2dae05d08b774f687", size = 368704 }, + { url = "https://files.pythonhosted.org/packages/0f/29/ed9525e46d474230a0fb310ab077a681b49c45c6e4e5a305e0c704702256/adbc_driver_manager-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db899b18caeb9e26b99c0baf17e239f35274d6c3dc3aa0e3afc4d278ef3c6747", size = 2150204 }, + { url = "https://files.pythonhosted.org/packages/a5/32/c00c7b5dd4c187003f0f6799090d17db42effc3396b5a6971228e0d1cbb4/adbc_driver_manager-1.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c1a7079c7f254c7010afe08872c4bfab6e716a507fc343c32204e8ce6bfd898", size = 2164967 }, + { url = "https://files.pythonhosted.org/packages/37/30/3b62800f5f7aad8c51e2e71fc8e9a87dadb007588289fddab09d180ea1ae/adbc_driver_manager-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:27a8943ba838a4038b4ec6466e11eafe336ca5d382adb7c5d2cc9c00dd44bd10", size = 538168 }, + { url = "https://files.pythonhosted.org/packages/59/30/e76d5bdb044b4126b4deed32ff5cf02b62d9e7eba4eec5a06c6005a3952f/adbc_driver_manager-1.4.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:3770180aa2cccc6c18ffd189d623ebbf35dccad75895f0fd4275b73d39297849", size = 381431 }, + { url = "https://files.pythonhosted.org/packages/c8/25/a96b04c0162253ff9e014b774789cc76e84e536f9ef874c9d2af240bfa42/adbc_driver_manager-1.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6ba26a0510d1e7596c827d1ef58185d32fb4f97837e463608801ec3b4850ce74", size = 365687 }, + { url = "https://files.pythonhosted.org/packages/1e/8d/caae84fceed7e2cff414ce9a17f62eee0ceca98058bb8b1fbde1a1941904/adbc_driver_manager-1.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50eedc6b173a80a0a8f46ef6352761a2c53063ac94327b1f45603db33176010d", size = 2129076 }, + { url = "https://files.pythonhosted.org/packages/fb/8b/7d4ce1622b2930205261c5b7dae7ded7f3328033fdfe02f213f2bb41720f/adbc_driver_manager-1.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:168a07ce237902dd9aa0e079d7e8c131ad6f61fb20f3b43020597b2a3c29d3ea", size = 2161148 }, + { url = "https://files.pythonhosted.org/packages/59/ab/991d5dc4d8b65adab2990f949524ce5ca504aa84727bc501fa6ba919288f/adbc_driver_manager-1.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1c165069fe3bcffc25e593cb933958b4a0563aff317cb6a37fc390c8dd0ed26f", size = 535674 }, + { url = "https://files.pythonhosted.org/packages/80/97/e33d558177e8dcbdaec2191bc37970e5068e46095a21dc157aaa65530e58/adbc_driver_manager-1.4.0-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:acfe7ca9c227a0835abc52289601337bde7a96c20befc443c9ba4eb78d673214", size = 379612 }, + { url = "https://files.pythonhosted.org/packages/1e/4f/11aacce2421902b9bed07970d5f4565c5948f58788392962ffeceadbac21/adbc_driver_manager-1.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b5fbf822e90fc6df5321067a25564cae11b404d4c9ba56fe4471c73c5b54e38f", size = 363412 }, + { url = "https://files.pythonhosted.org/packages/0d/bd/68672ab658dbcb154500cb668e2fe630861b3ac3348c5cdb6bf501ae10ab/adbc_driver_manager-1.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e1d255116003514088990cc5af5995cc7f5d2ebea96b00e5b2a1f4d922d7137", size = 
2123358 }, + { url = "https://files.pythonhosted.org/packages/f7/4a/5a966873541d580bf27711355ed9fd40cd46bea702eb092c6825306957a6/adbc_driver_manager-1.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682364a252de1029fa12a2f7526d4cb9e8e03e009d8d0f64bc3530a7539a74f6", size = 2156251 }, + { url = "https://files.pythonhosted.org/packages/ca/4b/19d32beccfcb647451db25cc2e40dbeb6b763bf274cdc85d22e68511baa4/adbc_driver_manager-1.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:f01441fb8cc7859037ae0c39692bffa26f2fa0da68664589ced5b3794540f402", size = 533846 }, +] + +[[package]] +name = "adbc-driver-postgresql" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "adbc-driver-manager" }, + { name = "importlib-resources" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2a/8b/5d088a3c4d739a3a62df081a819117df924318b5044ab3214d72723591ad/adbc_driver_postgresql-1.4.0.tar.gz", hash = "sha256:0bc9c34911b8d1a2e333ee5ef3d284a6517a8f20d5509b2c183cb4cf8337efc7", size = 18402 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/69/165a7155a9514aa472beae1e16fc93c51a363c7deed0db7d2d470c28788d/adbc_driver_postgresql-1.4.0-py3-none-macosx_10_15_x86_64.whl", hash = "sha256:ca62087f1496192c3131a30fc3e4d2710fa3b31b1982bf8374b6a05fdc446513", size = 2683068 }, + { url = "https://files.pythonhosted.org/packages/7c/37/d991e047b931a3f005156a43c8be92ffbfd234589d5969b38adee03faa80/adbc_driver_postgresql-1.4.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c75827a5cfdc8ea429c21292ac6bd598a64a42af53797c7ace3ad2b3044ddc71", size = 2996305 }, + { url = "https://files.pythonhosted.org/packages/2d/5e/dc25c82cf2055e500b2f8e47093cf6037fd636866e0e6c54ccadccda1e6a/adbc_driver_postgresql-1.4.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ce4e30ced25802acd6d0baad70e7b9c7394869043c1349206218d51d27613c1", size = 3190063 }, + { url = "https://files.pythonhosted.org/packages/42/bd/910e7b8e0d6e91f94348fe23a2f8d3dfc4e86a0b933b26d41a4298f14772/adbc_driver_postgresql-1.4.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d04b2ea0353bacca814fb15c5da0d407888d27e86b4f3f58d66ebe71a2e8d3da", size = 2846350 }, + { url = "https://files.pythonhosted.org/packages/7d/fc/19fb644ab228040a01971e21bb42c98c48a95d3701e68dd38fa1bfb0bc61/adbc_driver_postgresql-1.4.0-py3-none-win_amd64.whl", hash = "sha256:10ae2584e47928e15da8ca541bbf594de13d6c1afc8c1adad6af3f832cf0a5fc", size = 2656378 }, +] + [[package]] name = "altair" version = "5.5.0" @@ -822,6 +865,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/17/621d3a59430851a327421fdbec9ec8494d7fadaffc6dfdd42d4a95accbf2/igraph-0.11.8-cp39-abi3-win_amd64.whl", hash = "sha256:248831a6162130f16909c1f776cc246b48f692339ea4baca489cad4ed8dc0e13", size = 1976778 }, ] +[[package]] +name = "importlib-resources" +version = "6.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cf/8c/f834fbf984f691b4f7ff60f50b514cc3de5cc08abfc3295564dd89c5e2e7/importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c", size = 44693 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/ed/1f1afb2e9e7f38a545d628f864d562a5ae64fe6f7a10e28ffb9b185b4e89/importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec", size = 37461 }, +] + [[package]] name = "iniconfig" version = 
"2.0.0" @@ -1131,6 +1183,7 @@ name = "matchbox" version = "0.2.1" source = { editable = "." } dependencies = [ + { name = "adbc-driver-postgresql" }, { name = "click" }, { name = "connectorx" }, { name = "duckdb" }, @@ -1184,6 +1237,7 @@ typing = [ [package.metadata] requires-dist = [ + { name = "adbc-driver-postgresql", specifier = ">=1.4.0" }, { name = "boto3", marker = "extra == 'server'", specifier = ">=1.35.99" }, { name = "click", specifier = ">=8.1.7" }, { name = "connectorx", specifier = ">=0.3.3" }, @@ -2480,27 +2534,27 @@ wheels = [ [[package]] name = "ruff" -version = "0.9.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1e/7f/60fda2eec81f23f8aa7cbbfdf6ec2ca11eb11c273827933fb2541c2ce9d8/ruff-0.9.3.tar.gz", hash = "sha256:8293f89985a090ebc3ed1064df31f3b4b56320cdfcec8b60d3295bddb955c22a", size = 3586740 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f9/77/4fb790596d5d52c87fd55b7160c557c400e90f6116a56d82d76e95d9374a/ruff-0.9.3-py3-none-linux_armv6l.whl", hash = "sha256:7f39b879064c7d9670197d91124a75d118d00b0990586549949aae80cdc16624", size = 11656815 }, - { url = "https://files.pythonhosted.org/packages/a2/a8/3338ecb97573eafe74505f28431df3842c1933c5f8eae615427c1de32858/ruff-0.9.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a187171e7c09efa4b4cc30ee5d0d55a8d6c5311b3e1b74ac5cb96cc89bafc43c", size = 11594821 }, - { url = "https://files.pythonhosted.org/packages/8e/89/320223c3421962762531a6b2dd58579b858ca9916fb2674874df5e97d628/ruff-0.9.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c59ab92f8e92d6725b7ded9d4a31be3ef42688a115c6d3da9457a5bda140e2b4", size = 11040475 }, - { url = "https://files.pythonhosted.org/packages/b2/bd/1d775eac5e51409535804a3a888a9623e87a8f4b53e2491580858a083692/ruff-0.9.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dc153c25e715be41bb228bc651c1e9b1a88d5c6e5ed0194fa0dfea02b026439", size = 11856207 }, - { url = "https://files.pythonhosted.org/packages/7f/c6/3e14e09be29587393d188454064a4aa85174910d16644051a80444e4fd88/ruff-0.9.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:646909a1e25e0dc28fbc529eab8eb7bb583079628e8cbe738192853dbbe43af5", size = 11420460 }, - { url = "https://files.pythonhosted.org/packages/ef/42/b7ca38ffd568ae9b128a2fa76353e9a9a3c80ef19746408d4ce99217ecc1/ruff-0.9.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a5a46e09355695fbdbb30ed9889d6cf1c61b77b700a9fafc21b41f097bfbba4", size = 12605472 }, - { url = "https://files.pythonhosted.org/packages/a6/a1/3167023f23e3530fde899497ccfe239e4523854cb874458ac082992d206c/ruff-0.9.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:c4bb09d2bbb394e3730d0918c00276e79b2de70ec2a5231cd4ebb51a57df9ba1", size = 13243123 }, - { url = "https://files.pythonhosted.org/packages/d0/b4/3c600758e320f5bf7de16858502e849f4216cb0151f819fa0d1154874802/ruff-0.9.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:96a87ec31dc1044d8c2da2ebbed1c456d9b561e7d087734336518181b26b3aa5", size = 12744650 }, - { url = "https://files.pythonhosted.org/packages/be/38/266fbcbb3d0088862c9bafa8b1b99486691d2945a90b9a7316336a0d9a1b/ruff-0.9.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb7554aca6f842645022fe2d301c264e6925baa708b392867b7a62645304df4", size = 14458585 }, - { url = 
"https://files.pythonhosted.org/packages/63/a6/47fd0e96990ee9b7a4abda62de26d291bd3f7647218d05b7d6d38af47c30/ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cabc332b7075a914ecea912cd1f3d4370489c8018f2c945a30bcc934e3bc06a6", size = 12419624 }, - { url = "https://files.pythonhosted.org/packages/84/5d/de0b7652e09f7dda49e1a3825a164a65f4998175b6486603c7601279baad/ruff-0.9.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:33866c3cc2a575cbd546f2cd02bdd466fed65118e4365ee538a3deffd6fcb730", size = 11843238 }, - { url = "https://files.pythonhosted.org/packages/9e/be/3f341ceb1c62b565ec1fb6fd2139cc40b60ae6eff4b6fb8f94b1bb37c7a9/ruff-0.9.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:006e5de2621304c8810bcd2ee101587712fa93b4f955ed0985907a36c427e0c2", size = 11484012 }, - { url = "https://files.pythonhosted.org/packages/a3/c8/ff8acbd33addc7e797e702cf00bfde352ab469723720c5607b964491d5cf/ruff-0.9.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:ba6eea4459dbd6b1be4e6bfc766079fb9b8dd2e5a35aff6baee4d9b1514ea519", size = 12038494 }, - { url = "https://files.pythonhosted.org/packages/73/b1/8d9a2c0efbbabe848b55f877bc10c5001a37ab10aca13c711431673414e5/ruff-0.9.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:90230a6b8055ad47d3325e9ee8f8a9ae7e273078a66401ac66df68943ced029b", size = 12473639 }, - { url = "https://files.pythonhosted.org/packages/cb/44/a673647105b1ba6da9824a928634fe23186ab19f9d526d7bdf278cd27bc3/ruff-0.9.3-py3-none-win32.whl", hash = "sha256:eabe5eb2c19a42f4808c03b82bd313fc84d4e395133fb3fc1b1516170a31213c", size = 9834353 }, - { url = "https://files.pythonhosted.org/packages/c3/01/65cadb59bf8d4fbe33d1a750103e6883d9ef302f60c28b73b773092fbde5/ruff-0.9.3-py3-none-win_amd64.whl", hash = "sha256:040ceb7f20791dfa0e78b4230ee9dce23da3b64dd5848e40e3bf3ab76468dcf4", size = 10821444 }, - { url = "https://files.pythonhosted.org/packages/69/cb/b3fe58a136a27d981911cba2f18e4b29f15010623b79f0f2510fd0d31fd3/ruff-0.9.3-py3-none-win_arm64.whl", hash = "sha256:800d773f6d4d33b0a3c60e2c6ae8f4c202ea2de056365acfa519aa48acf28e0b", size = 10038168 }, +version = "0.9.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c0/17/529e78f49fc6f8076f50d985edd9a2cf011d1dbadb1cdeacc1d12afc1d26/ruff-0.9.4.tar.gz", hash = "sha256:6907ee3529244bb0ed066683e075f09285b38dd5b4039370df6ff06041ca19e7", size = 3599458 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b6/f8/3fafb7804d82e0699a122101b5bee5f0d6e17c3a806dcbc527bb7d3f5b7a/ruff-0.9.4-py3-none-linux_armv6l.whl", hash = "sha256:64e73d25b954f71ff100bb70f39f1ee09e880728efb4250c632ceed4e4cdf706", size = 11668400 }, + { url = "https://files.pythonhosted.org/packages/2e/a6/2efa772d335da48a70ab2c6bb41a096c8517ca43c086ea672d51079e3d1f/ruff-0.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6ce6743ed64d9afab4fafeaea70d3631b4d4b28b592db21a5c2d1f0ef52934bf", size = 11628395 }, + { url = "https://files.pythonhosted.org/packages/dc/d7/cd822437561082f1c9d7225cc0d0fbb4bad117ad7ac3c41cd5d7f0fa948c/ruff-0.9.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54499fb08408e32b57360f6f9de7157a5fec24ad79cb3f42ef2c3f3f728dfe2b", size = 11090052 }, + { url = "https://files.pythonhosted.org/packages/9e/67/3660d58e893d470abb9a13f679223368ff1684a4ef40f254a0157f51b448/ruff-0.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37c892540108314a6f01f105040b5106aeb829fa5fb0561d2dcaf71485021137", size = 11882221 }, + { url = 
"https://files.pythonhosted.org/packages/79/d1/757559995c8ba5f14dfec4459ef2dd3fcea82ac43bc4e7c7bf47484180c0/ruff-0.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de9edf2ce4b9ddf43fd93e20ef635a900e25f622f87ed6e3047a664d0e8f810e", size = 11424862 }, + { url = "https://files.pythonhosted.org/packages/c0/96/7915a7c6877bb734caa6a2af424045baf6419f685632469643dbd8eb2958/ruff-0.9.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87c90c32357c74f11deb7fbb065126d91771b207bf9bfaaee01277ca59b574ec", size = 12626735 }, + { url = "https://files.pythonhosted.org/packages/0e/cc/dadb9b35473d7cb17c7ffe4737b4377aeec519a446ee8514123ff4a26091/ruff-0.9.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:56acd6c694da3695a7461cc55775f3a409c3815ac467279dfa126061d84b314b", size = 13255976 }, + { url = "https://files.pythonhosted.org/packages/5f/c3/ad2dd59d3cabbc12df308cced780f9c14367f0321e7800ca0fe52849da4c/ruff-0.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0c93e7d47ed951b9394cf352d6695b31498e68fd5782d6cbc282425655f687a", size = 12752262 }, + { url = "https://files.pythonhosted.org/packages/c7/17/5f1971e54bd71604da6788efd84d66d789362b1105e17e5ccc53bba0289b/ruff-0.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1d4c8772670aecf037d1bf7a07c39106574d143b26cfe5ed1787d2f31e800214", size = 14401648 }, + { url = "https://files.pythonhosted.org/packages/30/24/6200b13ea611b83260501b6955b764bb320e23b2b75884c60ee7d3f0b68e/ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfc5f1d7afeda8d5d37660eeca6d389b142d7f2b5a1ab659d9214ebd0e025231", size = 12414702 }, + { url = "https://files.pythonhosted.org/packages/34/cb/f5d50d0c4ecdcc7670e348bd0b11878154bc4617f3fdd1e8ad5297c0d0ba/ruff-0.9.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faa935fc00ae854d8b638c16a5f1ce881bc3f67446957dd6f2af440a5fc8526b", size = 11859608 }, + { url = "https://files.pythonhosted.org/packages/d6/f4/9c8499ae8426da48363bbb78d081b817b0f64a9305f9b7f87eab2a8fb2c1/ruff-0.9.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a6c634fc6f5a0ceae1ab3e13c58183978185d131a29c425e4eaa9f40afe1e6d6", size = 11485702 }, + { url = "https://files.pythonhosted.org/packages/18/59/30490e483e804ccaa8147dd78c52e44ff96e1c30b5a95d69a63163cdb15b/ruff-0.9.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:433dedf6ddfdec7f1ac7575ec1eb9844fa60c4c8c2f8887a070672b8d353d34c", size = 12067782 }, + { url = "https://files.pythonhosted.org/packages/3d/8c/893fa9551760b2f8eb2a351b603e96f15af167ceaf27e27ad873570bc04c/ruff-0.9.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d612dbd0f3a919a8cc1d12037168bfa536862066808960e0cc901404b77968f0", size = 12483087 }, + { url = "https://files.pythonhosted.org/packages/23/15/f6751c07c21ca10e3f4a51ea495ca975ad936d780c347d9808bcedbd7182/ruff-0.9.4-py3-none-win32.whl", hash = "sha256:db1192ddda2200671f9ef61d9597fcef89d934f5d1705e571a93a67fb13a4402", size = 9852302 }, + { url = "https://files.pythonhosted.org/packages/12/41/2d2d2c6a72e62566f730e49254f602dfed23019c33b5b21ea8f8917315a1/ruff-0.9.4-py3-none-win_amd64.whl", hash = "sha256:05bebf4cdbe3ef75430d26c375773978950bbf4ee3c95ccb5448940dc092408e", size = 10850051 }, + { url = "https://files.pythonhosted.org/packages/c6/e6/3d6ec3bc3d254e7f005c543a661a41c3e788976d0e52a1ada195bd664344/ruff-0.9.4-py3-none-win_arm64.whl", hash = "sha256:585792f1e81509e38ac5123492f8875fbc36f3ede8185af0a26df348e5154f41", size = 10078251 }, ] [[package]] From 
3cb7937dfa1410f916790c68bc013a4b7e1597a3 Mon Sep 17 00:00:00 2001 From: Dejan Lozanovic Date: Mon, 10 Feb 2025 08:25:13 +0000 Subject: [PATCH 3/5] tests --- src/matchbox/server/postgresql/utils/db.py | 1 - test/unit/__init__.py | 0 test/unit/test_adbcingest.py | 130 +++++++++++++++++++++ 3 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 test/unit/__init__.py create mode 100644 test/unit/test_adbcingest.py diff --git a/src/matchbox/server/postgresql/utils/db.py b/src/matchbox/server/postgresql/utils/db.py index 5bef6999..e66abd98 100644 --- a/src/matchbox/server/postgresql/utils/db.py +++ b/src/matchbox/server/postgresql/utils/db.py @@ -229,7 +229,6 @@ def _create_adbc_table_constraints(db_schema:str, sufix:str, conn:Connection) -> return True def _adbc_insert_data(clusters:pa.Table, contains:pa.Table, probabilities:pa.Table, suffix:str, alchemy_conn:Connection, resolution_id:int) -> bool: - # TODO: try except proper exception type from adbc with adbc_driver_postgresql.dbapi.connect(POSTGRESQL_URI) as conn: try: _run_query(f"CREATE TABLE clusters_{suffix} AS SELECT * FROM clusters", alchemy_conn) diff --git a/test/unit/__init__.py b/test/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/unit/test_adbcingest.py b/test/unit/test_adbcingest.py new file mode 100644 index 00000000..0129c8ba --- /dev/null +++ b/test/unit/test_adbcingest.py @@ -0,0 +1,130 @@ +import unittest +from unittest.mock import patch, MagicMock +import pyarrow as pa +import pandas as pd +from sqlalchemy.engine import Engine, Connection + + +from matchbox.server.postgresql.utils.db import adbc_ingest_data, _adbc_insert_data, _save_to_postgresql, _run_query, _run_queries + + +class TestAdbcIngestData(unittest.TestCase): + + @patch('my_module._create_adbc_table_constraints') + @patch('my_module._adbc_insert_data') + @patch('my_module.datetime') + def test_adbc_ingest_data(self, mock_datetime, mock_adbc_insert_data, mock_create_adbc_table_constraints): + # Mock datetime + mock_datetime.now.return_value.strftime.return_value = "20250101123045" + + # Create mock arguments + clusters = pa.Table.from_pandas(pd.DataFrame({"column1": [1, 2], "column2": [3, 4]})) + contains = pa.Table.from_pandas(pd.DataFrame({"column1": [1, 2], "column2": [3, 4]})) + probabilities = pa.Table.from_pandas(pd.DataFrame({"column1": [1, 2], "column2": [3, 4]})) + engine = MagicMock(spec=Engine) + resolution_id = 1 + + # Mock the engine connection context manager + mock_connection = engine.connect.return_value.__enter__.return_value + + # Test when _adbc_insert_data returns True + mock_adbc_insert_data.return_value = True + mock_create_adbc_table_constraints.return_value = True + result = adbc_ingest_data(clusters, contains, probabilities, engine, resolution_id) + self.assertTrue(result) + + # Test when _adbc_insert_data returns False + mock_adbc_insert_data.return_value = False + result = adbc_ingest_data(clusters, contains, probabilities, engine, resolution_id) + self.assertFalse(result) + + +@patch('my_module._save_to_postgresql') +@patch('my_module._run_query') +@patch('my_module.adbc_driver_postgresql.dbapi.connect') +def test_adbc_insert_data(self, mock_connect, mock_run_query, mock_save_to_postgresql): + # Mock the connect method + mock_conn = mock_connect.return_value.__enter__.return_value + + # Create mock arguments + clusters = pa.Table.from_pandas(pd.DataFrame({"column1": [1, 2], "column2": [3, 4]})) + contains = pa.Table.from_pandas(pd.DataFrame({"column1": [1, 2], "column2": [3, 4]})) + 
probabilities = pa.Table.from_pandas(pd.DataFrame({"column1": [1, 2], "column2": [3, 4]})) + suffix = "20250101123045" + alchemy_conn = MagicMock() + resolution_id = 1 + + # Test when all queries and saves succeed + mock_run_query.side_effect = [None, None, None] + mock_save_to_postgresql.side_effect = [None, None, None] + result = _adbc_insert_data(clusters, contains, probabilities, suffix, alchemy_conn, resolution_id) + self.assertTrue(result) + + # Test when a query fails + mock_run_query.side_effect = Exception("Query failed") + result = _adbc_insert_data(clusters, contains, probabilities, suffix, alchemy_conn, resolution_id) + self.assertFalse(result) + + # Test when save_to_postgresql fails + mock_run_query.side_effect = [None, None, None] + mock_save_to_postgresql.side_effect = [None, Exception("Save failed"), None] + result = _adbc_insert_data(clusters, contains, probabilities, suffix, alchemy_conn, resolution_id) + self.assertFalse(result) + + @patch('my_module.pa.RecordBatchReader.from_batches') + def test_save_to_postgresql(self, mock_from_batches): + # Mock the from_batches method + mock_batch_reader = MagicMock() + mock_from_batches.return_value = mock_batch_reader + + # Create mock arguments + table = pa.Table.from_pandas(pd.DataFrame({"column1": [1, 2], "column2": [3, 4]})) + conn = MagicMock() + schema = "test_schema" + table_name = "test_table" + + # Mock the cursor context manager + mock_cursor = conn.cursor.return_value.__enter__.return_value + + # Call the function + _save_to_postgresql(table, conn, schema, table_name) + + # Verify the cursor method was called correctly + mock_cursor.adbc_ingest.assert_called_once_with( + table_name=table_name, + data=mock_batch_reader, + mode="append", + db_schema_name=schema, + ) + + @patch('my_module.text') + def test_run_query(self, mock_text): + # Create mock arguments + query = "SELECT * FROM test_table" + conn = MagicMock(spec=Connection) + + # Call the function + _run_query(query, conn) + + # Verify the execute method was called correctly + conn.execute.assert_called_once_with(mock_text(query)) + conn.commit.assert_called_once() + + @patch('my_module.text') + def test_run_queries(self, mock_text): + # Create mock arguments + queries = ["SELECT * FROM test_table", "DELETE FROM test_table"] + conn = MagicMock(spec=Connection) + + # Call the function + _run_queries(queries, conn) + + # Verify the execute method was called correctly for each query + conn.begin.assert_called_once() + self.assertEqual(conn.execute.call_count, len(queries)) + for query in queries: + conn.execute.assert_any_call(mock_text(query)) + + +if __name__ == '__main__': + unittest.main() From 75268ee77c466d6d9c79a9b9b8b028187f549911 Mon Sep 17 00:00:00 2001 From: Dejan Lozanovic Date: Thu, 13 Feb 2025 09:23:50 +0000 Subject: [PATCH 4/5] sql alchemy changes --- src/matchbox/server/postgresql/orm.py | 308 +++++++++++---------- src/matchbox/server/postgresql/utils/db.py | 17 -- test/unit/test_adbcingest.py | 18 +- uv.lock | 42 +-- 4 files changed, 197 insertions(+), 188 deletions(-) diff --git a/src/matchbox/server/postgresql/orm.py b/src/matchbox/server/postgresql/orm.py index 93a1f336..c38897c2 100644 --- a/src/matchbox/server/postgresql/orm.py +++ b/src/matchbox/server/postgresql/orm.py @@ -22,27 +22,28 @@ class ResolutionFrom(CountMixin, MBDB.MatchboxBase): """Resolution lineage closure table with cached truth values.""" - __tablename__ = "resolution_from" - - # Columns - parent = Column( - BIGINT, - ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), 
- primary_key=True, - ) - child = Column( - BIGINT, - ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), - primary_key=True, - ) - level = Column(INTEGER, nullable=False) - truth_cache = Column(FLOAT, nullable=True) - - # Constraints - __table_args__ = ( - CheckConstraint("parent != child", name="no_self_reference"), - CheckConstraint("level > 0", name="positive_level"), - ) + def __init__(self, suffix=""): + self.__tablename__ = f"resolution_from{suffix}" + + # Columns + self.parent = Column( + BIGINT, + ForeignKey(f"resolutions{suffix}.resolution_id", ondelete="CASCADE"), + primary_key=True, + ) + self.child = Column( + BIGINT, + ForeignKey(f"resolutions{suffix}.resolution_id", ondelete="CASCADE"), + primary_key=True, + ) + self.level = Column(INTEGER, nullable=False) + self.truth_cache = Column(FLOAT, nullable=True) + + # Constraints + self.__table_args__ = ( + CheckConstraint("parent != child", name="no_self_reference"), + CheckConstraint("level > 0", name="positive_level"), + ) class Resolutions(CountMixin, MBDB.MatchboxBase): @@ -51,38 +52,39 @@ class Resolutions(CountMixin, MBDB.MatchboxBase): Resolutions produce probabilities or own data in the clusters table. """ - __tablename__ = "resolutions" - - # Columns - resolution_id = Column(BIGINT, primary_key=True) - resolution_hash = Column(BYTEA, nullable=False) - type = Column(TEXT, nullable=False) - name = Column(TEXT, nullable=False) - description = Column(TEXT) - truth = Column(FLOAT) - - # Relationships - source = relationship("Sources", back_populates="dataset_resolution", uselist=False) - probabilities = relationship( - "Probabilities", back_populates="proposed_by", cascade="all, delete-orphan" - ) - children = relationship( - "Resolutions", - secondary=ResolutionFrom.__table__, - primaryjoin="Resolutions.resolution_id == ResolutionFrom.parent", - secondaryjoin="Resolutions.resolution_id == ResolutionFrom.child", - backref="parents", - ) - - # Constraints - __table_args__ = ( - CheckConstraint( - "type IN ('model', 'dataset', 'human')", - name="resolution_type_constraints", - ), - UniqueConstraint("resolution_hash", name="resolutions_hash_key"), - UniqueConstraint("name", name="resolutions_name_key"), - ) + def __init__(self, suffix=""): + self.__tablename__ = f"resolutions{suffix}" + + # Columns + self.resolution_id = Column(BIGINT, primary_key=True) + self.resolution_hash = Column(BYTEA, nullable=False) + self.type = Column(TEXT, nullable=False) + self.name = Column(TEXT, nullable=False) + self.description = Column(TEXT) + self.truth = Column(FLOAT) + + # Relationships + self.source = relationship("Sources", back_populates="dataset_resolution", uselist=False) + self.probabilities = relationship( + "Probabilities", back_populates="proposed_by", cascade="all, delete-orphan" + ) + self.children = relationship( + "Resolutions", + secondary=ResolutionFrom.__table__, + primaryjoin="Resolutions.resolution_id == ResolutionFrom.parent", + secondaryjoin="Resolutions.resolution_id == ResolutionFrom.child", + backref="parents", + ) + + # Constraints + self.__table_args__ = ( + CheckConstraint( + "type IN ('model', 'dataset', 'human')", + name="resolution_type_constraints", + ), + UniqueConstraint("resolution_hash", name="resolutions_hash_key"), + UniqueConstraint("name", name="resolutions_name_key"), + ) @property def ancestors(self) -> set["Resolutions"]: @@ -171,30 +173,31 @@ def next_id(cls) -> int: class Sources(CountMixin, MBDB.MatchboxBase): """Table of sources of data for Matchbox.""" - __tablename__ = "sources" - - # 
Columns
-    resolution_id = Column(
-        BIGINT,
-        ForeignKey("resolutions.resolution_id", ondelete="CASCADE"),
-        primary_key=True,
-    )
-    alias = Column(TEXT, nullable=False)
-    full_name = Column(TEXT, nullable=False)
-    warehouse_hash = Column(BYTEA, nullable=False)
-    id = Column(TEXT, nullable=False)
-    column_names = Column(ARRAY(TEXT), nullable=False)
-    column_aliases = Column(ARRAY(TEXT), nullable=False)
-    column_types = Column(ARRAY(TEXT), nullable=False)
-
-    # Relationships
-    dataset_resolution = relationship("Resolutions", back_populates="source")
-    clusters = relationship("Clusters", back_populates="source")
-
-    # Constraints
-    __table_args__ = (
-        UniqueConstraint("full_name", "warehouse_hash", name="unique_source_address"),
-    )
+    def __init__(self, suffix="", contains_temporary=False):
+        self.__tablename__ = "sources"
+
+        # Columns
+        self.resolution_id = Column(
+            BIGINT,
+            ForeignKey("resolutions.resolution_id", ondelete="CASCADE"),
+            primary_key=True,
+        )
+        self.alias = Column(TEXT, nullable=False)
+        self.full_name = Column(TEXT, nullable=False)
+        self.warehouse_hash = Column(BYTEA, nullable=False)
+        self.id = Column(TEXT, nullable=False)
+        self.column_names = Column(ARRAY(TEXT), nullable=False)
+        self.column_aliases = Column(ARRAY(TEXT), nullable=False)
+        self.column_types = Column(ARRAY(TEXT), nullable=False)
+
+        # Relationships
+        self.dataset_resolution = relationship("Resolutions", back_populates="source")
+        self.clusters = relationship("Clusters", back_populates="source")
+
+        # Constraints
+        self.__table_args__ = (
+            UniqueConstraint("full_name", "warehouse_hash", name="unique_source_address"),
+        )
 
     @classmethod
     def list(cls) -> list["Sources"]:
@@ -205,56 +208,69 @@ def list(cls) -> list["Sources"]:
 class Contains(CountMixin, MBDB.MatchboxBase):
     """Cluster lineage table."""
 
-    __tablename__ = "contains"
+    def __init__(self, suffix="", clusters_temporary=False):
+        self.__tablename__ = f"contains{suffix}"
 
-    # Columns
-    parent = Column(
-        BIGINT, ForeignKey("clusters.cluster_id", ondelete="CASCADE"), primary_key=True
-    )
-    child = Column(
-        BIGINT, ForeignKey("clusters.cluster_id", ondelete="CASCADE"), primary_key=True
-    )
+        # Columns
+        if clusters_temporary:
+            clusters_name = f"clusters{suffix}"
+        else:
+            clusters_name = "clusters"
 
-    # Constraints and indices
-    __table_args__ = (
-        CheckConstraint("parent != child", name="no_self_containment"),
-        Index("ix_contains_parent_child", "parent", "child"),
-        Index("ix_contains_child_parent", "child", "parent"),
-    )
+        self.parent = Column(
+            BIGINT, ForeignKey(f"{clusters_name}.cluster_id", ondelete="CASCADE"), primary_key=True
+        )
+        self.child = Column(
+            BIGINT, ForeignKey(f"{clusters_name}.cluster_id", ondelete="CASCADE"), primary_key=True
+        )
+
+        # Constraints and indices
+        self.__table_args__ = (
+            CheckConstraint("parent != child", name="no_self_containment"),
+            Index(f"ix_contains_parent_child{suffix}", "parent", "child"),
+            Index(f"ix_contains_child_parent{suffix}", "child", "parent"),
+        )
 
 
 class Clusters(CountMixin, MBDB.MatchboxBase):
     """Table of indexed data and clusters that match it."""
 
-    __tablename__ = "clusters"
-
-    # Columns
-    cluster_id = Column(BIGINT, primary_key=True)
-    cluster_hash = Column(BYTEA, nullable=False)
-    dataset = Column(BIGINT, ForeignKey("sources.resolution_id"), nullable=True)
-    # Uses array as source data may have identical rows.
We can't control this - # Must be indexed or PostgreSQL incorrectly tries to use nested joins - # when retrieving small datasets in query() -- extremely slow - source_pk = Column(ARRAY(TEXT), index=True, nullable=True) - - # Relationships - source = relationship("Sources", back_populates="clusters") - probabilities = relationship( - "Probabilities", back_populates="proposes", cascade="all, delete-orphan" - ) - children = relationship( - "Clusters", - secondary=Contains.__table__, - primaryjoin="Clusters.cluster_id == Contains.parent", - secondaryjoin="Clusters.cluster_id == Contains.child", - backref="parents", - ) - - # Constraints and indices - __table_args__ = ( - Index("ix_clusters_id_gin", source_pk, postgresql_using="gin"), - UniqueConstraint("cluster_hash", name="clusters_hash_key"), - ) + def __init__(self, suffix="", contains_temporary=False): + self.__tablename__ = f"clusters{suffix}" + + # Columns + self.cluster_id = Column(BIGINT, primary_key=True) + self.cluster_hash = Column(BYTEA, nullable=False) + + self.dataset = Column(BIGINT, ForeignKey("sources.resolution_id"), nullable=True) + # Uses array as source data may have identical rows. We can't control this + # Must be indexed or PostgreSQL incorrectly tries to use nested joins + # when retrieving small datasets in query() -- extremely slow + self.source_pk = Column(ARRAY(TEXT), index=True, nullable=True) + + # Relationships + self.source = relationship("Sources", back_populates="clusters") + self.probabilities = relationship( + "Probabilities", back_populates="proposes", cascade="all, delete-orphan" + ) + + if contains_temporary: + contains_name = f"{Contains.__table__}{suffix}" + else: + contains_name = Contains.__table__ + self.children = relationship( + "Clusters", + secondary=contains_name, + primaryjoin="Clusters.cluster_id == Contains.parent", + secondaryjoin="Clusters.cluster_id == Contains.child", + backref="parents", + ) + + # Constraints and indices + self.__table_args__ = ( + Index(f"ix_clusters_id_gin{suffix}", self.source_pk, postgresql_using="gin"), + UniqueConstraint("cluster_hash", name=f"clusters_hash_key{suffix}"), + ) @classmethod def next_id(cls) -> int: @@ -269,24 +285,34 @@ def next_id(cls) -> int: class Probabilities(CountMixin, MBDB.MatchboxBase): """Table of probabilities that a cluster is correct, according to a resolution.""" - __tablename__ = "probabilities" - - # Columns - resolution = Column( - BIGINT, - ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), - primary_key=True, - ) - cluster = Column( - BIGINT, ForeignKey("clusters.cluster_id", ondelete="CASCADE"), primary_key=True - ) - probability = Column(SMALLINT, nullable=False) - - # Relationships - proposed_by = relationship("Resolutions", back_populates="probabilities") - proposes = relationship("Clusters", back_populates="probabilities") - - # Constraints - __table_args__ = ( - CheckConstraint("probability BETWEEN 0 AND 100", name="valid_probability"), - ) + def __init__(self, suffix="", resolutions_temporary=False, clusters_temporary=False): + self.__tablename__ = f"probabilities{suffix}" + + # Columns + if resolutions_temporary: + resolutions_name = f"resolutions{suffix}" + else: + resolutions_name = "resolutions" + if clusters_temporary: + clusters_name = f"clusters{suffix}" + else: + clusters_name = "clusters" + + self.resolution = Column( + BIGINT, + ForeignKey(f"{resolutions_name}.resolution_id", ondelete="CASCADE"), + primary_key=True, + ) + self.cluster = Column( + BIGINT, ForeignKey(f"{clusters_name}.cluster_id", 
ondelete="CASCADE"), primary_key=True + ) + self.probability = Column(SMALLINT, nullable=False) + + # Relationships + self.proposed_by = relationship("Resolutions", back_populates="probabilities") + self.proposes = relationship("Clusters", back_populates="probabilities") + + # Constraints + self.__table_args__ = ( + CheckConstraint("probability BETWEEN 0 AND 100", name="valid_probability"), + ) diff --git a/src/matchbox/server/postgresql/utils/db.py b/src/matchbox/server/postgresql/utils/db.py index e66abd98..71c4144a 100644 --- a/src/matchbox/server/postgresql/utils/db.py +++ b/src/matchbox/server/postgresql/utils/db.py @@ -199,23 +199,6 @@ def _create_adbc_table_constraints(db_schema:str, sufix:str, conn:Connection) -> Args: db_schema: str, the name of the schema """ # Cluster - _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD PRIMARY KEY (cluster_id)", conn) - _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD PRIMARY KEY (resolution, "cluster")""", conn) - _run_query(f"CREATE UNIQUE INDEX cluster_hash_index_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_hash)", conn) - # _run_query(f"CREATE UNIQUE INDEX clusters_adbc_clusters_is_{sufix} ON {db_schema}.clusters_{sufix} USING btree (cluster_id)", conn) - _run_query(f"CREATE INDEX ix_clusters_id_gin_{sufix} ON {db_schema}.clusters_{sufix} USING gin (source_pk)", conn) - _run_query(f"CREATE INDEX ix_mb_clusters_source_pk_{sufix} ON {db_schema}.clusters_{sufix} USING btree (source_pk)", conn) - - # Contains - _run_query(f"CREATE UNIQUE INDEX ix_contains_child_parent_{sufix} ON {db_schema}.contains_{sufix} USING btree (child, parent)", conn) - _run_query(f"CREATE UNIQUE INDEX ix_contains_parent_child_{sufix} ON {db_schema}.contains_{sufix} USING btree (parent, child)", conn) - - # Foreign keys - _run_query(f"ALTER TABLE {db_schema}.clusters_{sufix} ADD CONSTRAINT clusters_dataset_fkey FOREIGN KEY (dataset) REFERENCES {db_schema}.sources(resolution_id)", conn) - _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_child_fkey FOREIGN KEY (child) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""", conn) - _run_query(f"""ALTER TABLE {db_schema}."contains_{sufix}" ADD CONSTRAINT contains_parent_fkey FOREIGN KEY (parent) REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""", conn) - _run_query(f"""ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_cluster_fkey FOREIGN KEY ("cluster") REFERENCES {db_schema}.clusters_{sufix}(cluster_id) ON DELETE CASCADE""", conn) - _run_query(f"ALTER TABLE {db_schema}.probabilities_{sufix} ADD CONSTRAINT probabilities_resolution_fkey FOREIGN KEY (resolution) REFERENCES {db_schema}.resolutions(resolution_id) ON DELETE CASCADE", conn) _run_queries([ f"""DROP TABLE IF EXISTS {db_schema}.clusters""", diff --git a/test/unit/test_adbcingest.py b/test/unit/test_adbcingest.py index 0129c8ba..c01b8b5f 100644 --- a/test/unit/test_adbcingest.py +++ b/test/unit/test_adbcingest.py @@ -10,9 +10,9 @@ class TestAdbcIngestData(unittest.TestCase): - @patch('my_module._create_adbc_table_constraints') - @patch('my_module._adbc_insert_data') - @patch('my_module.datetime') + @patch('matchbox.server.postgresql.utils.db._create_adbc_table_constraints') + @patch('matchbox.server.postgresql.utils.db._adbc_insert_data') + @patch('matchbox.server.postgresql.utils.db.datetime') def test_adbc_ingest_data(self, mock_datetime, mock_adbc_insert_data, mock_create_adbc_table_constraints): # Mock datetime 
mock_datetime.now.return_value.strftime.return_value = "20250101123045" @@ -39,9 +39,9 @@ def test_adbc_ingest_data(self, mock_datetime, mock_adbc_insert_data, mock_creat self.assertFalse(result) -@patch('my_module._save_to_postgresql') -@patch('my_module._run_query') -@patch('my_module.adbc_driver_postgresql.dbapi.connect') +@patch('matchbox.server.postgresql.utils.db._save_to_postgresql') +@patch('matchbox.server.postgresql.utils.db._run_query') +@patch('matchbox.server.postgresql.utils.db.adbc_driver_postgresql.dbapi.connect') def test_adbc_insert_data(self, mock_connect, mock_run_query, mock_save_to_postgresql): # Mock the connect method mock_conn = mock_connect.return_value.__enter__.return_value @@ -71,7 +71,7 @@ def test_adbc_insert_data(self, mock_connect, mock_run_query, mock_save_to_postg result = _adbc_insert_data(clusters, contains, probabilities, suffix, alchemy_conn, resolution_id) self.assertFalse(result) - @patch('my_module.pa.RecordBatchReader.from_batches') + @patch('matchbox.server.postgresql.utils.db.pa.RecordBatchReader.from_batches') def test_save_to_postgresql(self, mock_from_batches): # Mock the from_batches method mock_batch_reader = MagicMock() @@ -97,7 +97,7 @@ def test_save_to_postgresql(self, mock_from_batches): db_schema_name=schema, ) - @patch('my_module.text') + @patch('matchbox.server.postgresql.utils.db.text') def test_run_query(self, mock_text): # Create mock arguments query = "SELECT * FROM test_table" @@ -110,7 +110,7 @@ def test_run_query(self, mock_text): conn.execute.assert_called_once_with(mock_text(query)) conn.commit.assert_called_once() - @patch('my_module.text') + @patch('matchbox.server.postgresql.utils.db.text') def test_run_queries(self, mock_text): # Create mock arguments queries = ["SELECT * FROM test_table", "DELETE FROM test_table"] diff --git a/uv.lock b/uv.lock index 6e30c485..52cc2f04 100644 --- a/uv.lock +++ b/uv.lock @@ -2534,27 +2534,27 @@ wheels = [ [[package]] name = "ruff" -version = "0.9.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c0/17/529e78f49fc6f8076f50d985edd9a2cf011d1dbadb1cdeacc1d12afc1d26/ruff-0.9.4.tar.gz", hash = "sha256:6907ee3529244bb0ed066683e075f09285b38dd5b4039370df6ff06041ca19e7", size = 3599458 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/f8/3fafb7804d82e0699a122101b5bee5f0d6e17c3a806dcbc527bb7d3f5b7a/ruff-0.9.4-py3-none-linux_armv6l.whl", hash = "sha256:64e73d25b954f71ff100bb70f39f1ee09e880728efb4250c632ceed4e4cdf706", size = 11668400 }, - { url = "https://files.pythonhosted.org/packages/2e/a6/2efa772d335da48a70ab2c6bb41a096c8517ca43c086ea672d51079e3d1f/ruff-0.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6ce6743ed64d9afab4fafeaea70d3631b4d4b28b592db21a5c2d1f0ef52934bf", size = 11628395 }, - { url = "https://files.pythonhosted.org/packages/dc/d7/cd822437561082f1c9d7225cc0d0fbb4bad117ad7ac3c41cd5d7f0fa948c/ruff-0.9.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:54499fb08408e32b57360f6f9de7157a5fec24ad79cb3f42ef2c3f3f728dfe2b", size = 11090052 }, - { url = "https://files.pythonhosted.org/packages/9e/67/3660d58e893d470abb9a13f679223368ff1684a4ef40f254a0157f51b448/ruff-0.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37c892540108314a6f01f105040b5106aeb829fa5fb0561d2dcaf71485021137", size = 11882221 }, - { url = 
"https://files.pythonhosted.org/packages/79/d1/757559995c8ba5f14dfec4459ef2dd3fcea82ac43bc4e7c7bf47484180c0/ruff-0.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:de9edf2ce4b9ddf43fd93e20ef635a900e25f622f87ed6e3047a664d0e8f810e", size = 11424862 }, - { url = "https://files.pythonhosted.org/packages/c0/96/7915a7c6877bb734caa6a2af424045baf6419f685632469643dbd8eb2958/ruff-0.9.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:87c90c32357c74f11deb7fbb065126d91771b207bf9bfaaee01277ca59b574ec", size = 12626735 }, - { url = "https://files.pythonhosted.org/packages/0e/cc/dadb9b35473d7cb17c7ffe4737b4377aeec519a446ee8514123ff4a26091/ruff-0.9.4-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:56acd6c694da3695a7461cc55775f3a409c3815ac467279dfa126061d84b314b", size = 13255976 }, - { url = "https://files.pythonhosted.org/packages/5f/c3/ad2dd59d3cabbc12df308cced780f9c14367f0321e7800ca0fe52849da4c/ruff-0.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0c93e7d47ed951b9394cf352d6695b31498e68fd5782d6cbc282425655f687a", size = 12752262 }, - { url = "https://files.pythonhosted.org/packages/c7/17/5f1971e54bd71604da6788efd84d66d789362b1105e17e5ccc53bba0289b/ruff-0.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1d4c8772670aecf037d1bf7a07c39106574d143b26cfe5ed1787d2f31e800214", size = 14401648 }, - { url = "https://files.pythonhosted.org/packages/30/24/6200b13ea611b83260501b6955b764bb320e23b2b75884c60ee7d3f0b68e/ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfc5f1d7afeda8d5d37660eeca6d389b142d7f2b5a1ab659d9214ebd0e025231", size = 12414702 }, - { url = "https://files.pythonhosted.org/packages/34/cb/f5d50d0c4ecdcc7670e348bd0b11878154bc4617f3fdd1e8ad5297c0d0ba/ruff-0.9.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:faa935fc00ae854d8b638c16a5f1ce881bc3f67446957dd6f2af440a5fc8526b", size = 11859608 }, - { url = "https://files.pythonhosted.org/packages/d6/f4/9c8499ae8426da48363bbb78d081b817b0f64a9305f9b7f87eab2a8fb2c1/ruff-0.9.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:a6c634fc6f5a0ceae1ab3e13c58183978185d131a29c425e4eaa9f40afe1e6d6", size = 11485702 }, - { url = "https://files.pythonhosted.org/packages/18/59/30490e483e804ccaa8147dd78c52e44ff96e1c30b5a95d69a63163cdb15b/ruff-0.9.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:433dedf6ddfdec7f1ac7575ec1eb9844fa60c4c8c2f8887a070672b8d353d34c", size = 12067782 }, - { url = "https://files.pythonhosted.org/packages/3d/8c/893fa9551760b2f8eb2a351b603e96f15af167ceaf27e27ad873570bc04c/ruff-0.9.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:d612dbd0f3a919a8cc1d12037168bfa536862066808960e0cc901404b77968f0", size = 12483087 }, - { url = "https://files.pythonhosted.org/packages/23/15/f6751c07c21ca10e3f4a51ea495ca975ad936d780c347d9808bcedbd7182/ruff-0.9.4-py3-none-win32.whl", hash = "sha256:db1192ddda2200671f9ef61d9597fcef89d934f5d1705e571a93a67fb13a4402", size = 9852302 }, - { url = "https://files.pythonhosted.org/packages/12/41/2d2d2c6a72e62566f730e49254f602dfed23019c33b5b21ea8f8917315a1/ruff-0.9.4-py3-none-win_amd64.whl", hash = "sha256:05bebf4cdbe3ef75430d26c375773978950bbf4ee3c95ccb5448940dc092408e", size = 10850051 }, - { url = "https://files.pythonhosted.org/packages/c6/e6/3d6ec3bc3d254e7f005c543a661a41c3e788976d0e52a1ada195bd664344/ruff-0.9.4-py3-none-win_arm64.whl", hash = "sha256:585792f1e81509e38ac5123492f8875fbc36f3ede8185af0a26df348e5154f41", size = 10078251 }, +version = "0.9.5" 
+source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/02/74/6c359f6b9ed85b88df6ef31febce18faeb852f6c9855651dfb1184a46845/ruff-0.9.5.tar.gz", hash = "sha256:11aecd7a633932875ab3cb05a484c99970b9d52606ce9ea912b690b02653d56c", size = 3634177 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/4b/82b7c9ac874e72b82b19fd7eab57d122e2df44d2478d90825854f9232d02/ruff-0.9.5-py3-none-linux_armv6l.whl", hash = "sha256:d466d2abc05f39018d53f681fa1c0ffe9570e6d73cde1b65d23bb557c846f442", size = 11681264 }, + { url = "https://files.pythonhosted.org/packages/27/5c/f5ae0a9564e04108c132e1139d60491c0abc621397fe79a50b3dc0bd704b/ruff-0.9.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:38840dbcef63948657fa7605ca363194d2fe8c26ce8f9ae12eee7f098c85ac8a", size = 11657554 }, + { url = "https://files.pythonhosted.org/packages/2a/83/c6926fa3ccb97cdb3c438bb56a490b395770c750bf59f9bc1fe57ae88264/ruff-0.9.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d56ba06da53536b575fbd2b56517f6f95774ff7be0f62c80b9e67430391eeb36", size = 11088959 }, + { url = "https://files.pythonhosted.org/packages/af/a7/42d1832b752fe969ffdbfcb1b4cb477cb271bed5835110fb0a16ef31ab81/ruff-0.9.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7cb2a01da08244c50b20ccfaeb5972e4228c3c3a1989d3ece2bc4b1f996001", size = 11902041 }, + { url = "https://files.pythonhosted.org/packages/53/cf/1fffa09fb518d646f560ccfba59f91b23c731e461d6a4dedd21a393a1ff1/ruff-0.9.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:96d5c76358419bc63a671caac70c18732d4fd0341646ecd01641ddda5c39ca0b", size = 11421069 }, + { url = "https://files.pythonhosted.org/packages/09/27/bb8f1b7304e2a9431f631ae7eadc35550fe0cf620a2a6a0fc4aa3d736f94/ruff-0.9.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:deb8304636ed394211f3a6d46c0e7d9535b016f53adaa8340139859b2359a070", size = 12625095 }, + { url = "https://files.pythonhosted.org/packages/d7/ce/ab00bc9d3df35a5f1b64f5117458160a009f93ae5caf65894ebb63a1842d/ruff-0.9.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:df455000bf59e62b3e8c7ba5ed88a4a2bc64896f900f311dc23ff2dc38156440", size = 13257797 }, + { url = "https://files.pythonhosted.org/packages/88/81/c639a082ae6d8392bc52256058ec60f493c6a4d06d5505bccface3767e61/ruff-0.9.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de92170dfa50c32a2b8206a647949590e752aca8100a0f6b8cefa02ae29dce80", size = 12763793 }, + { url = "https://files.pythonhosted.org/packages/b3/d0/0a3d8f56d1e49af466dc770eeec5c125977ba9479af92e484b5b0251ce9c/ruff-0.9.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d28532d73b1f3f627ba88e1456f50748b37f3a345d2be76e4c653bec6c3e393", size = 14386234 }, + { url = "https://files.pythonhosted.org/packages/04/70/e59c192a3ad476355e7f45fb3a87326f5219cc7c472e6b040c6c6595c8f0/ruff-0.9.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c746d7d1df64f31d90503ece5cc34d7007c06751a7a3bbeee10e5f2463d52d2", size = 12437505 }, + { url = "https://files.pythonhosted.org/packages/55/4e/3abba60a259d79c391713e7a6ccabf7e2c96e5e0a19100bc4204f1a43a51/ruff-0.9.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:11417521d6f2d121fda376f0d2169fb529976c544d653d1d6044f4c5562516ee", size = 11884799 }, + { url = "https://files.pythonhosted.org/packages/a3/db/b0183a01a9f25b4efcae919c18fb41d32f985676c917008620ad692b9d5f/ruff-0.9.5-py3-none-musllinux_1_2_armv7l.whl", hash = 
"sha256:5b9d71c3879eb32de700f2f6fac3d46566f644a91d3130119a6378f9312a38e1", size = 11527411 }, + { url = "https://files.pythonhosted.org/packages/0a/e4/3ebfcebca3dff1559a74c6becff76e0b64689cea02b7aab15b8b32ea245d/ruff-0.9.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:2e36c61145e70febcb78483903c43444c6b9d40f6d2f800b5552fec6e4a7bb9a", size = 12078868 }, + { url = "https://files.pythonhosted.org/packages/ec/b2/5ab808833e06c0a1b0d046a51c06ec5687b73c78b116e8d77687dc0cd515/ruff-0.9.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:2f71d09aeba026c922aa7aa19a08d7bd27c867aedb2f74285a2639644c1c12f5", size = 12524374 }, + { url = "https://files.pythonhosted.org/packages/e0/51/1432afcc3b7aa6586c480142caae5323d59750925c3559688f2a9867343f/ruff-0.9.5-py3-none-win32.whl", hash = "sha256:134f958d52aa6fdec3b294b8ebe2320a950d10c041473c4316d2e7d7c2544723", size = 9853682 }, + { url = "https://files.pythonhosted.org/packages/b7/ad/c7a900591bd152bb47fc4882a27654ea55c7973e6d5d6396298ad3fd6638/ruff-0.9.5-py3-none-win_amd64.whl", hash = "sha256:78cc6067f6d80b6745b67498fb84e87d32c6fc34992b52bffefbdae3442967d6", size = 10865744 }, + { url = "https://files.pythonhosted.org/packages/75/d9/fde7610abd53c0c76b6af72fc679cb377b27c617ba704e25da834e0a0608/ruff-0.9.5-py3-none-win_arm64.whl", hash = "sha256:18a29f1a005bddb229e580795627d297dfa99f16b30c7039e73278cf6b5f9fa9", size = 10064595 }, ] [[package]] From 3b049244e008a80c9e072efac92f49aefb98ca30 Mon Sep 17 00:00:00 2001 From: Dejan Lozanovic Date: Fri, 14 Feb 2025 10:00:59 +0000 Subject: [PATCH 5/5] sql alchemy creating constrains --- src/matchbox/server/postgresql/orm.py | 308 ++++++++++----------- src/matchbox/server/postgresql/utils/db.py | 16 +- 2 files changed, 155 insertions(+), 169 deletions(-) diff --git a/src/matchbox/server/postgresql/orm.py b/src/matchbox/server/postgresql/orm.py index c38897c2..93a1f336 100644 --- a/src/matchbox/server/postgresql/orm.py +++ b/src/matchbox/server/postgresql/orm.py @@ -22,28 +22,27 @@ class ResolutionFrom(CountMixin, MBDB.MatchboxBase): """Resolution lineage closure table with cached truth values.""" - def __init__(self, suffix=""): - self.__tablename__ = f"resolution_from{suffix}" - - # Columns - self.parent = Column( - BIGINT, - ForeignKey(f"resolutions{suffix}.resolution_id", ondelete="CASCADE"), - primary_key=True, - ) - self.child = Column( - BIGINT, - ForeignKey(f"resolutions{suffix}.resolution_id", ondelete="CASCADE"), - primary_key=True, - ) - self.level = Column(INTEGER, nullable=False) - self.truth_cache = Column(FLOAT, nullable=True) - - # Constraints - self.__table_args__ = ( - CheckConstraint("parent != child", name="no_self_reference"), - CheckConstraint("level > 0", name="positive_level"), - ) + __tablename__ = "resolution_from" + + # Columns + parent = Column( + BIGINT, + ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), + primary_key=True, + ) + child = Column( + BIGINT, + ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), + primary_key=True, + ) + level = Column(INTEGER, nullable=False) + truth_cache = Column(FLOAT, nullable=True) + + # Constraints + __table_args__ = ( + CheckConstraint("parent != child", name="no_self_reference"), + CheckConstraint("level > 0", name="positive_level"), + ) class Resolutions(CountMixin, MBDB.MatchboxBase): @@ -52,39 +51,38 @@ class Resolutions(CountMixin, MBDB.MatchboxBase): Resolutions produce probabilities or own data in the clusters table. 
""" - def __init__(self, suffix=""): - self.__tablename__ = f"resolutions{suffix}" - - # Columns - self.resolution_id = Column(BIGINT, primary_key=True) - self.resolution_hash = Column(BYTEA, nullable=False) - self.type = Column(TEXT, nullable=False) - self.name = Column(TEXT, nullable=False) - self.description = Column(TEXT) - self.truth = Column(FLOAT) - - # Relationships - self.source = relationship("Sources", back_populates="dataset_resolution", uselist=False) - self.probabilities = relationship( - "Probabilities", back_populates="proposed_by", cascade="all, delete-orphan" - ) - self.children = relationship( - "Resolutions", - secondary=ResolutionFrom.__table__, - primaryjoin="Resolutions.resolution_id == ResolutionFrom.parent", - secondaryjoin="Resolutions.resolution_id == ResolutionFrom.child", - backref="parents", - ) - - # Constraints - self.__table_args__ = ( - CheckConstraint( - "type IN ('model', 'dataset', 'human')", - name="resolution_type_constraints", - ), - UniqueConstraint("resolution_hash", name="resolutions_hash_key"), - UniqueConstraint("name", name="resolutions_name_key"), - ) + __tablename__ = "resolutions" + + # Columns + resolution_id = Column(BIGINT, primary_key=True) + resolution_hash = Column(BYTEA, nullable=False) + type = Column(TEXT, nullable=False) + name = Column(TEXT, nullable=False) + description = Column(TEXT) + truth = Column(FLOAT) + + # Relationships + source = relationship("Sources", back_populates="dataset_resolution", uselist=False) + probabilities = relationship( + "Probabilities", back_populates="proposed_by", cascade="all, delete-orphan" + ) + children = relationship( + "Resolutions", + secondary=ResolutionFrom.__table__, + primaryjoin="Resolutions.resolution_id == ResolutionFrom.parent", + secondaryjoin="Resolutions.resolution_id == ResolutionFrom.child", + backref="parents", + ) + + # Constraints + __table_args__ = ( + CheckConstraint( + "type IN ('model', 'dataset', 'human')", + name="resolution_type_constraints", + ), + UniqueConstraint("resolution_hash", name="resolutions_hash_key"), + UniqueConstraint("name", name="resolutions_name_key"), + ) @property def ancestors(self) -> set["Resolutions"]: @@ -173,31 +171,30 @@ def next_id(cls) -> int: class Sources(CountMixin, MBDB.MatchboxBase): """Table of sources of data for Matchbox.""" - def __init__(self, suffix="", contains_temporary=False): - self.__tablename__ = "sources" - - # Columns - self.resolution_id = Column( - BIGINT, - ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), - primary_key=True, - ) - self.alias = Column(TEXT, nullable=False) - self.full_name = Column(TEXT, nullable=False) - self.warehouse_hash = Column(BYTEA, nullable=False) - self.id = Column(TEXT, nullable=False) - self.column_names = Column(ARRAY(TEXT), nullable=False) - self.column_aliases = Column(ARRAY(TEXT), nullable=False) - self.column_types = Column(ARRAY(TEXT), nullable=False) - - # Relationships - self.dataset_resolution = relationship("Resolutions", back_populates="source") - self.clusters = relationship("Clusters", back_populates="source") - - # Constraints - self.__table_args__ = ( - UniqueConstraint("full_name", "warehouse_hash", name="unique_source_address"), - ) + __tablename__ = "sources" + + # Columns + resolution_id = Column( + BIGINT, + ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), + primary_key=True, + ) + alias = Column(TEXT, nullable=False) + full_name = Column(TEXT, nullable=False) + warehouse_hash = Column(BYTEA, nullable=False) + id = Column(TEXT, nullable=False) + 
column_names = Column(ARRAY(TEXT), nullable=False) + column_aliases = Column(ARRAY(TEXT), nullable=False) + column_types = Column(ARRAY(TEXT), nullable=False) + + # Relationships + dataset_resolution = relationship("Resolutions", back_populates="source") + clusters = relationship("Clusters", back_populates="source") + + # Constraints + __table_args__ = ( + UniqueConstraint("full_name", "warehouse_hash", name="unique_source_address"), + ) @classmethod def list(cls) -> list["Sources"]: @@ -208,69 +205,56 @@ def list(cls) -> list["Sources"]: class Contains(CountMixin, MBDB.MatchboxBase): """Cluster lineage table.""" - def __init__(self, suffix="", clusters_temporary=False): - self.__tablename__ = f"contains{suffix}" + __tablename__ = "contains" - # Columns - if clusters_temporary: - clusters_name = f"clusters{suffix}" - else: - clusters_name = "clusters" + # Columns + parent = Column( + BIGINT, ForeignKey("clusters.cluster_id", ondelete="CASCADE"), primary_key=True + ) + child = Column( + BIGINT, ForeignKey("clusters.cluster_id", ondelete="CASCADE"), primary_key=True + ) - self.parent = Column( - BIGINT, ForeignKey(f"{clusters_name}.cluster_id", ondelete="CASCADE"), primary_key=True - ) - self.child = Column( - BIGINT, ForeignKey(f"{clusters_name}.cluster_id", ondelete="CASCADE"), primary_key=True - ) - - # Constraints and indices - self.__table_args__ = ( - CheckConstraint("parent != child", name="no_self_containment"), - Index(f"ix_contains_parent_child{suffix}", "parent", "child"), - Index(f"ix_contains_child_parent", "child", "parent"), - ) + # Constraints and indices + __table_args__ = ( + CheckConstraint("parent != child", name="no_self_containment"), + Index("ix_contains_parent_child", "parent", "child"), + Index("ix_contains_child_parent", "child", "parent"), + ) class Clusters(CountMixin, MBDB.MatchboxBase): """Table of indexed data and clusters that match it.""" - def __init__(self, suffix="", contains_temporary=False): - self.__tablename__ = f"clusters{suffix}" - - # Columns - self.cluster_id = Column(BIGINT, primary_key=True) - self.cluster_hash = Column(BYTEA, nullable=False) - - self.dataset = Column(BIGINT, ForeignKey("sources.resolution_id"), nullable=True) - # Uses array as source data may have identical rows. We can't control this - # Must be indexed or PostgreSQL incorrectly tries to use nested joins - # when retrieving small datasets in query() -- extremely slow - self.source_pk = Column(ARRAY(TEXT), index=True, nullable=True) - - # Relationships - self.source = relationship("Sources", back_populates="clusters") - self.probabilities = relationship( - "Probabilities", back_populates="proposes", cascade="all, delete-orphan" - ) - - if contains_temporary: - contains_name = f"{Contains.__table__}{suffix}" - else: - contains_name = Contains.__table__ - self.children = relationship( - "Clusters", - secondary=contains_name, - primaryjoin="Clusters.cluster_id == Contains.parent", - secondaryjoin="Clusters.cluster_id == Contains.child", - backref="parents", - ) - - # Constraints and indices - self.__table_args__ = ( - Index(f"ix_clusters_id_gin{suffix}", self.source_pk, postgresql_using="gin"), - UniqueConstraint("cluster_hash", name=f"clusters_hash_key{suffix}"), - ) + __tablename__ = "clusters" + + # Columns + cluster_id = Column(BIGINT, primary_key=True) + cluster_hash = Column(BYTEA, nullable=False) + dataset = Column(BIGINT, ForeignKey("sources.resolution_id"), nullable=True) + # Uses array as source data may have identical rows. 
We can't control this + # Must be indexed or PostgreSQL incorrectly tries to use nested joins + # when retrieving small datasets in query() -- extremely slow + source_pk = Column(ARRAY(TEXT), index=True, nullable=True) + + # Relationships + source = relationship("Sources", back_populates="clusters") + probabilities = relationship( + "Probabilities", back_populates="proposes", cascade="all, delete-orphan" + ) + children = relationship( + "Clusters", + secondary=Contains.__table__, + primaryjoin="Clusters.cluster_id == Contains.parent", + secondaryjoin="Clusters.cluster_id == Contains.child", + backref="parents", + ) + + # Constraints and indices + __table_args__ = ( + Index("ix_clusters_id_gin", source_pk, postgresql_using="gin"), + UniqueConstraint("cluster_hash", name="clusters_hash_key"), + ) @classmethod def next_id(cls) -> int: @@ -285,34 +269,24 @@ def next_id(cls) -> int: class Probabilities(CountMixin, MBDB.MatchboxBase): """Table of probabilities that a cluster is correct, according to a resolution.""" - def __init__(self, suffix="", resolutions_temporary=False, clusters_temporary=False): - self.__tablename__ = f"probabilities{suffix}" - - # Columns - if resolutions_temporary: - resolutions_name = f"resolutions{suffix}" - else: - resolutions_name = "resolutions" - if clusters_temporary: - clusters_name = f"clusters{suffix}" - else: - clusters_name = "clusters" - - self.resolution = Column( - BIGINT, - ForeignKey(f"{resolutions_name}.resolution_id", ondelete="CASCADE"), - primary_key=True, - ) - self.cluster = Column( - BIGINT, ForeignKey(f"{clusters_name}.cluster_id", ondelete="CASCADE"), primary_key=True - ) - self.probability = Column(SMALLINT, nullable=False) - - # Relationships - self.proposed_by = relationship("Resolutions", back_populates="probabilities") - self.proposes = relationship("Clusters", back_populates="probabilities") - - # Constraints - self.__table_args__ = ( - CheckConstraint("probability BETWEEN 0 AND 100", name="valid_probability"), - ) + __tablename__ = "probabilities" + + # Columns + resolution = Column( + BIGINT, + ForeignKey("resolutions.resolution_id", ondelete="CASCADE"), + primary_key=True, + ) + cluster = Column( + BIGINT, ForeignKey("clusters.cluster_id", ondelete="CASCADE"), primary_key=True + ) + probability = Column(SMALLINT, nullable=False) + + # Relationships + proposed_by = relationship("Resolutions", back_populates="probabilities") + proposes = relationship("Clusters", back_populates="probabilities") + + # Constraints + __table_args__ = ( + CheckConstraint("probability BETWEEN 0 AND 100", name="valid_probability"), + ) diff --git a/src/matchbox/server/postgresql/utils/db.py b/src/matchbox/server/postgresql/utils/db.py index 71c4144a..1db04c12 100644 --- a/src/matchbox/server/postgresql/utils/db.py +++ b/src/matchbox/server/postgresql/utils/db.py @@ -18,6 +18,8 @@ from sqlalchemy.orm import DeclarativeMeta, Session from sqlalchemy.exc import DatabaseError as AlchemyDatabaseError +from matchbox.server.postgresql.db import MBDB + from matchbox.common.graph import ( ResolutionEdge, ResolutionGraph, @@ -200,7 +202,7 @@ def _create_adbc_table_constraints(db_schema:str, sufix:str, conn:Connection) -> """ # Cluster - _run_queries([ + statements = [ f"""DROP TABLE IF EXISTS {db_schema}.clusters""", f"""DROP TABLE IF EXISTS {db_schema}.contains""", f"""DROP TABLE IF EXISTS {db_schema}.probabilities""", @@ -208,7 +210,17 @@ def _create_adbc_table_constraints(db_schema:str, sufix:str, conn:Connection) -> f"""ALTER TABLE {db_schema}.clusters_{sufix} RENAME 
TO clusters""",
         f"""ALTER TABLE {db_schema}.contains_{sufix} RENAME TO contains""",
         f"""ALTER TABLE {db_schema}.probabilities_{sufix} RENAME TO probabilities"""
-    ], conn)
+    ]
+    # Start the transaction: the drops, renames and constraint creation
+    # below commit together or not at all
+    conn.begin()
+    for query in statements:
+        conn.execute(text(query))
+
+    MBDB.MatchboxBase.metadata.create_all(conn)
+
+    conn.commit()
+
+    return True
 
 def _adbc_insert_data(clusters:pa.Table, contains:pa.Table, probabilities:pa.Table, suffix:str, alchemy_conn:Connection, resolution_id:int) -> bool:
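
The rewritten tail of _create_adbc_table_constraints above runs the DROP and RENAME statements and the metadata.create_all() call on one connection between begin() and commit(), so the staged tables replace the live ones atomically. A minimal sketch of that swap in isolation, assuming a SQLAlchemy 2.x Engine and staged tables named <table>_<suffix> as in the earlier patches:

from sqlalchemy import Engine, text

TABLES = ("clusters", "contains", "probabilities")


def swap_staged_tables(engine: Engine, suffix: str) -> bool:
    # engine.begin() opens a transaction that commits on clean exit and
    # rolls back if any statement fails, so the live tables are never
    # left half-replaced.
    with engine.begin() as conn:
        for name in TABLES:
            conn.execute(text(f"DROP TABLE IF EXISTS {name}"))
            conn.execute(text(f"ALTER TABLE {name}_{suffix} RENAME TO {name}"))
    return True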
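The point of the PATCH 5/5 ORM changes is that constraints and indexes live on the declarative models again, so MBDB.MatchboxBase.metadata.create_all(conn) can emit them instead of the hand-written ALTER TABLE / CREATE INDEX statements removed earlier in the series. A toy demonstration of that mechanism with an in-memory SQLite engine and an invented model; generic types stand in for the postgres-specific BIGINT/BYTEA:

from sqlalchemy import (BigInteger, CheckConstraint, Column, LargeBinary,
                        UniqueConstraint, create_engine)
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class DemoClusters(Base):
    __tablename__ = "demo_clusters"

    cluster_id = Column(BigInteger, primary_key=True)
    cluster_hash = Column(LargeBinary, nullable=False)

    # Declared once on the model; emitted as DDL by create_all().
    __table_args__ = (
        UniqueConstraint("cluster_hash", name="demo_clusters_hash_key"),
        CheckConstraint("cluster_id >= 0", name="demo_positive_id"),
    )


engine = create_engine("sqlite://", echo=True)
# Emits CREATE TABLE with the primary key, UNIQUE and CHECK constraints
# attached, so no hand-written ADD CONSTRAINT statements are needed.
Base.metadata.create_all(engine)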
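On the ingestion side exercised by test_save_to_postgresql, the data travels as Arrow record batches over ADBC. A rough sketch of that path, assuming adbc-driver-postgresql is installed; the URI mirrors the POSTGRESQL_URI constant from the first patch, and the function name and batching details are illustrative rather than the real helper:

import adbc_driver_postgresql.dbapi
import pyarrow as pa

URI = "postgresql://postgres:postgres@localhost:5432/postgres"


def stream_table(table: pa.Table, table_name: str) -> None:
    # Wrap the table's batches in a reader so rows stream to the server
    # instead of being materialised in a single client-side buffer.
    reader = pa.RecordBatchReader.from_batches(table.schema, table.to_batches())
    conn = adbc_driver_postgresql.dbapi.connect(URI)
    try:
        cursor = conn.cursor()
        # mode="append" expects the target table to already exist,
        # matching the CREATE TABLE ... AS SELECT staging step.
        cursor.adbc_ingest(table_name, reader, mode="append")
        conn.commit()
    finally:
        conn.close()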
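Finally, on the test changes in test_adbcingest.py: unittest.mock.patch replaces a name in the namespace where the code under test looks it up, which is why the placeholder 'my_module' targets had to become 'matchbox.server.postgresql.utils.db'. A self-contained sketch of that rule; the module and helper below are invented for illustration:

# patch_target_sketch.py -- run directly. Patching must target the
# namespace where the name is looked up at call time, mirroring the
# switch from 'my_module' to 'matchbox.server.postgresql.utils.db'.
import datetime as real_datetime
from unittest.mock import patch


def make_suffix() -> str:
    # Resolves `real_datetime` in this module's globals when called.
    return real_datetime.datetime.now().strftime("%Y%m%d%H%M%S")


if __name__ == "__main__":
    with patch("__main__.real_datetime") as fake:
        fake.datetime.now.return_value.strftime.return_value = "20250101123045"
        assert make_suffix() == "20250101123045"  # the fake is seen
    assert make_suffix() != "20250101123045"  # the real datetime is back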