diff --git a/pyproject.toml b/pyproject.toml
index 4ed88b0d..5de023a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ server = [
 [dependency-groups]
 dev = [
     "ipykernel>=6.29.5",
+    "ipywidgets>=8.1.5",
     "pre-commit>=3.8.0",
     "pytest>=8.3.3",
     "pytest-cov>=5.0.0",
diff --git a/src/matchbox/common/factories.py b/src/matchbox/common/factories.py
new file mode 100644
index 00000000..71be3427
--- /dev/null
+++ b/src/matchbox/common/factories.py
@@ -0,0 +1,295 @@
+from collections import Counter
+from textwrap import dedent
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import rustworkx as rx
+
+from matchbox.common.transform import graph_results
+
+
+def verify_components(all_nodes: list[Any], table: pa.Table) -> dict:
+    """
+    Fast verification of connected components using rustworkx.
+
+    Args:
+        all_nodes: list of identities of inputs being matched
+        table: PyArrow table with 'left', 'right' columns
+
+    Returns:
+        dictionary containing basic component statistics
+    """
+    graph, _, _ = graph_results(table, all_nodes)
+    components = rx.connected_components(graph)
+    component_sizes = Counter(len(component) for component in components)
+
+    return {
+        "num_components": len(components),
+        "total_nodes": graph.num_nodes(),
+        "total_edges": graph.num_edges(),
+        "component_sizes": component_sizes,
+        "min_component_size": min(component_sizes.keys()),
+        "max_component_size": max(component_sizes.keys()),
+    }
+
+
+def _min_edges_component(left: int, right: int, deduplicate: bool) -> int:
+    """
+    Calculate the minimum number of edges for a component to be connected.
+    Does so by assuming a spanning tree.
+
+    Args:
+        left: number of nodes of component on the left
+        right: number of nodes of component on the right (for linking)
+        deduplicate: whether edges are for deduplication
+
+    Returns:
+        Minimum number of edges
+    """
+    if not deduplicate:
+        return left + right - 1
+
+    return left - 1
+
+
+def _max_edges_component(left: int, right: int, deduplicate: bool) -> int:
+    """
+    Calculate the maximum number of edges for a component without duplicate edges.
+    Considers complete graph for deduping, and complete bipartite graph for linking.
+
+    Args:
+        left: number of nodes of component on the left
+        right: number of nodes of component on the right (for linking)
+        deduplicate: whether edges are for deduplication
+
+    Returns:
+        Maximum number of edges
+    """
+    if not deduplicate:
+        return left * right
+    # n*(n-1) is always divisible by 2
+    return left * (left - 1) // 2
+
+
+def calculate_min_max_edges(
+    left_nodes: int, right_nodes: int, num_components: int, deduplicate: bool
+) -> tuple[int, int]:
+    """
+    Calculate min and max edges for a graph.
+
+    Args:
+        left_nodes: number of nodes in left source
+        right_nodes: number of nodes in right source
+        num_components: number of requested components
+        deduplicate: whether edges are for deduplication
+
+    Returns:
+        Two-tuple representing min and max edges
+    """
+    left_mod, right_mod = left_nodes % num_components, right_nodes % num_components
+    left_div, right_div = left_nodes // num_components, right_nodes // num_components
+
+    min_mod, max_mod = sorted([left_mod, right_mod])
+
+    min_edges, max_edges = 0, 0
+    # components where both sides have maximum nodes
+    min_edges += (
+        _min_edges_component(left_div + 1, right_div + 1, deduplicate) * min_mod
+    )
+    max_edges += (
+        _max_edges_component(left_div + 1, right_div + 1, deduplicate) * min_mod
+    )
+    # components where one side has maximum nodes
+    left_after_min_mod, right_after_min_mod = left_div + 1, right_div
+    if left_mod == min_mod:
+        left_after_min_mod, right_after_min_mod = left_div, right_div + 1
+    min_edges += _min_edges_component(
+        left_after_min_mod, right_after_min_mod, deduplicate
+    ) * (max_mod - min_mod)
+    max_edges += _max_edges_component(
+        left_after_min_mod, right_after_min_mod, deduplicate
+    ) * (max_mod - min_mod)
+    # components where both sides have minimum nodes
+    min_edges += _min_edges_component(left_div, right_div, deduplicate) * (
+        num_components - max_mod
+    )
+    max_edges += _max_edges_component(left_div, right_div, deduplicate) * (
+        num_components - max_mod
+    )
+
+    return min_edges, max_edges
+
+
+def generate_dummy_probabilities(
+    left_values: list[int],
+    right_values: list[int] | None,
+    prob_range: tuple[float, float],
+    num_components: int,
+    total_rows: int,
+) -> pa.Table:
+    """
+    Generate dummy Arrow probabilities data with guaranteed isolated components.
+
+    Args:
+        left_values: List of integers to use for left column
+        right_values: List of integers to use for right column. If None, assume we
+            are generating probabilities for deduplication
+        prob_range: Tuple of (min_prob, max_prob) to constrain probabilities
+        num_components: Number of distinct connected components to generate
+        total_rows: Total number of rows to generate
+
+    Returns:
+        PyArrow Table with 'left', 'right', and 'probability' columns
+    """
+    # Validate inputs
+    deduplicate = False
+    if right_values is None:
+        right_values = left_values
+        deduplicate = True
+
+    if len(left_values) < 2 or len(right_values) < 2:
+        raise ValueError("Need at least 2 possible values for both left and right")
+    if num_components > min(len(left_values), len(right_values)):
+        raise ValueError(
+            "Cannot have more components than minimum of left/right values"
+        )
+
+    left_nodes, right_nodes = len(left_values), len(right_values)
+    min_possible_edges, max_possible_edges = calculate_min_max_edges(
+        left_nodes, right_nodes, num_components, deduplicate
+    )
+
+    mode = "dedupe" if deduplicate else "link"
+
+    if total_rows == 0:
+        raise ValueError("At least one edge must be generated")
+    if total_rows < min_possible_edges:
+        raise ValueError(
+            dedent(f"""
+            Cannot generate {total_rows:,} {mode} edges with {num_components:,}
+            components.
+            Min edges is {min_possible_edges:,} for nodes given.
+            Either decrease the number of nodes, increase the number of components,
+            or increase the total edges requested.
+            """)
+        )
+    if total_rows > max_possible_edges:
+        raise ValueError(
+            dedent(f"""
+            Cannot generate {total_rows:,} {mode} edges with {num_components:,}
+            components.
+            Max edges is {max_possible_edges:,} for nodes given.
+            Either increase the number of nodes, decrease the number of components,
+            or decrease the total edges requested.
+            """)
+        )
+
+    n_extra_edges = total_rows - min_possible_edges
+
+    # Convert probability range to integers (60-80 for 0.60-0.80)
+    prob_min = int(prob_range[0] * 100)
+    prob_max = int(prob_range[1] * 100)
+
+    # Split values into completely separate groups for each component
+    left_components = np.array_split(np.array(left_values), num_components)
+    right_components = np.array_split(np.array(right_values), num_components)
+    # For each left-right component pair, the right equals the left rotated by one
+    right_components = [np.roll(c, -1) for c in right_components]
+
+    all_edges = []
+
+    # Generate edges for each component
+    for comp_idx in range(num_components):
+        comp_left_values = left_components[comp_idx]
+        comp_right_values = right_components[comp_idx]
+
+        min_comp_nodes, max_comp_nodes = sorted(
+            [len(comp_left_values), len(comp_right_values)]
+        )
+
+        # Ensure basic connectivity within the component by creating a spanning-tree
+        base_edges = set()
+        # For deduping (A B C) you just need (A - B) (B - C) (C - A)
+        # which just needs matching pairwise the data and its rotated version.
+        # For deduping, `min_comp_nodes` == `max_comp_nodes`
+        if deduplicate:
+            for i in range(min_comp_nodes - 1):
+                small_n, large_n = sorted([comp_left_values[i], comp_right_values[i]])
+                base_edges.add((small_n, large_n))
+        else:
+            # For linking (A B) and (C D E), we begin by adding (A - C) and (B - D)
+            for i in range(min_comp_nodes):
+                base_edges.add((comp_left_values[i], comp_right_values[i]))
+            # we now add (C - B)
+            for i in range(min_comp_nodes - 1):
+                base_edges.add((comp_left_values[i + 1], comp_right_values[i]))
+            # we now add (A - D)
+            left_right_diff = max_comp_nodes - min_comp_nodes
+            for i in range(left_right_diff):
+                left_i, right_i = 0, min_comp_nodes + i
+                if len(comp_right_values) < len(comp_left_values):
+                    left_i, right_i = min_comp_nodes + i, 0
+
+                base_edges.add((comp_left_values[left_i], comp_right_values[right_i]))
+
+        component_edges = list(base_edges)
+
+        if n_extra_edges > 0:
+            # Generate remaining random edges strictly within this component
+            # TODO: this can certainly be optimised
+            if deduplicate:
+                all_possible_edges = list(
+                    {
+                        tuple(sorted([x, y]))
+                        for x in comp_left_values
+                        for y in comp_right_values
+                        if x != y and tuple(sorted([x, y])) not in base_edges
+                    }
+                )
+            else:
+                all_possible_edges = list(
+                    {
+                        (x, y)
+                        for x in comp_left_values
+                        for y in comp_right_values
+                        if x != y and (x, y) not in base_edges
+                    }
+                )
+            max_new_edges = len(all_possible_edges)
+            if max_new_edges >= n_extra_edges:
+                edges_required = n_extra_edges
+                n_extra_edges = 0
+            else:
+                edges_required = max_new_edges
+                n_extra_edges -= max_new_edges
+
+            extra_edges_idx = np.random.choice(
+                len(all_possible_edges), size=edges_required, replace=False
+            )
+            extra_edges = [
+                e for i, e in enumerate(all_possible_edges) if i in extra_edges_idx
+            ]
+            component_edges += extra_edges
+        random_probs = np.random.randint(
+            prob_min, prob_max + 1, size=len(component_edges)
+        )
+
+        component_edges = [
+            (le, ri, pr)
+            for (le, ri), pr in zip(component_edges, random_probs, strict=True)
+        ]
+
+        all_edges.extend(component_edges)
+
+    # Convert to arrays
+    lefts, rights, probs = zip(*all_edges, strict=True)
+
+    # Create PyArrow arrays
+    left_array = pa.array(lefts, type=pa.uint64())
+    right_array = pa.array(rights, type=pa.uint64())
+    prob_array = pa.array(probs, type=pa.uint8())
+
+    return pa.table(
[left_array, right_array, prob_array], names=["left", "right", "probability"] + ) diff --git a/src/matchbox/common/transform.py b/src/matchbox/common/transform.py index 1e257a48..1903bfbb 100644 --- a/src/matchbox/common/transform.py +++ b/src/matchbox/common/transform.py @@ -2,7 +2,7 @@ import multiprocessing from collections import defaultdict from concurrent.futures import ProcessPoolExecutor -from typing import Callable, Generic, Hashable, Literal, TypeVar +from typing import Callable, Generic, Hashable, Iterable, Literal, TypeVar from uuid import uuid4 import numpy as np @@ -87,19 +87,22 @@ def to_clusters( ) -def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: +def graph_results( + probabilities: pa.Table, all_nodes: Iterable[int] | None = None +) -> tuple[rx.PyDiGraph, np.ndarray, np.ndarray]: """ - Takes an Arrow table of probabilities and adds a component column. - - Expects an Arrow table of column, left, right, probability. + Convert probability table to graph representation. - Returns a table with an additional column, component. + Args: + probabilities: PyArrow table with 'left', 'right' columns + all_nodes: superset of node identities figuring in probabilities table. + Used to optionally add isolated nodes to the graph. + Returns: + A tuple containing: + - Rustwork directed graph + - A list mapping the 'left' probabilities column to node indices in the graph + - A list mapping the 'right' probabilities column to node indices in the graph """ - # Handle empty probabilities - if len(probabilities) == 0: - empty_components = pa.array([], type=pa.int64()) - return probabilities.append_column("component", empty_components) - # Create index to use in graph unique = pc.unique( pa.concat_arrays( @@ -109,8 +112,9 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: ] ) ) - left_indices = pc.index_in(probabilities["left"], unique) - right_indices = pc.index_in(probabilities["right"], unique) + + left_indices = pc.index_in(probabilities["left"], unique).to_numpy() + right_indices = pc.index_in(probabilities["right"], unique).to_numpy() # Create and process graph n_nodes = len(unique) @@ -119,9 +123,30 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: graph = rx.PyGraph(node_count_hint=n_nodes, edge_count_hint=n_edges) graph.add_nodes_from(range(n_nodes)) - edges = tuple(zip(left_indices.to_numpy(), right_indices.to_numpy(), strict=True)) + if all_nodes is not None: + isolated_nodes = len(set(all_nodes) - set(unique.to_pylist())) + graph.add_nodes_from(range(isolated_nodes)) + + edges = tuple(zip(left_indices, right_indices, strict=True)) graph.add_edges_from_no_data(edges) + return graph, left_indices, right_indices + + +def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: + """ + Takes an Arrow table of probabilities and adds a component column. + + Expects an Arrow table of column, left, right, probability. + + Returns a table with an additional column, component. 
+ """ + # Handle empty probabilities + if len(probabilities) == 0: + empty_components = pa.array([], type=pa.int64()) + return probabilities.append_column("component", empty_components) + + graph, left_indices, _ = graph_results(probabilities) components = rx.connected_components(graph) # Convert components to arrays, map back to input to join, and reattach @@ -130,10 +155,10 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: np.arange(len(components)), [len(c) for c in components] ) - node_to_component = np.zeros(len(unique), dtype=np.int64) + node_to_component = np.zeros(graph.num_nodes(), dtype=np.int64) node_to_component[component_indices] = component_labels - edge_components = pa.array(node_to_component[left_indices.to_numpy()]) + edge_components = pa.array(node_to_component[left_indices]) return probabilities.append_column("component", edge_components).sort_by( [("component", "ascending"), ("probability", "descending")] @@ -201,11 +226,15 @@ def component_to_hierarchy( Returns: Arrow Table with columns ['parent', 'child', 'probability'] """ - probs = np.sort(pc.unique(table["probability"]).to_numpy())[::-1] + ascending_probs = np.sort( + pc.unique(table["probability"]).to_numpy(zero_copy_only=False) + ) + probs = ascending_probs[::-1] djs = DisjointSet[int]() # implements connected components current_roots: dict[int, set[int]] = defaultdict(set) # tracks ultimate parents hierarchy: list[tuple[int, int, float]] = [] # the output of this function + seen_components: set[frozenset[int]] = set() # track previously seen component sets for threshold in probs: # Get current probability rows @@ -228,6 +257,13 @@ def component_to_hierarchy( if len(children) <= 2: continue # Skip pairs already handled by pairwise probabilities + # Skip if we've seen this exact component before + frozen_children = frozenset(children) + if frozen_children in seen_components: + continue + + seen_components.add(frozen_children) + parent = hash_func(*children) prev_roots: set[int] = set() for child in children: @@ -261,7 +297,6 @@ def to_hierarchical_clusters( probabilities: Arrow table with columns ['component', 'left', 'right', 'probability'] proc_func: Function to process each component - dtype: Arrow data type for parent/child columns timeout: Maximum seconds to wait for each component to process Returns: @@ -317,7 +352,9 @@ def to_hierarchical_clusters( with ProcessPoolExecutor(max_workers=n_cores) as executor: futures = [ - executor.submit(proc_func, component_table, dtype, hash_func) + executor.submit( + proc_func, component_table, hash_func=hash_func, dtype=dtype + ) for component_table in component_tables ] diff --git a/src/matchbox/server/postgresql/README.md b/src/matchbox/server/postgresql/README.md index c4183f18..72542530 100644 --- a/src/matchbox/server/postgresql/README.md +++ b/src/matchbox/server/postgresql/README.md @@ -32,7 +32,7 @@ erDiagram bigint child PK,FK } Probabilities { - bigint model PK,FK + bigint resolution PK,FK bigint cluster PK,FK float probability } diff --git a/src/matchbox/server/postgresql/benchmark/__init__.py b/src/matchbox/server/postgresql/benchmark/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/matchbox/server/postgresql/benchmark/cluster_pipeline.py b/src/matchbox/server/postgresql/benchmark/cluster_pipeline.py new file mode 100644 index 00000000..6843b7c5 --- /dev/null +++ b/src/matchbox/server/postgresql/benchmark/cluster_pipeline.py @@ -0,0 +1,138 @@ +import logging +import time +from contextlib import 
contextmanager +from pathlib import Path + +import pyarrow.parquet as pq +from rich.logging import RichHandler + +from matchbox.common.hash import HASH_FUNC +from matchbox.common.transform import ( + attach_components_to_probabilities, + to_hierarchical_clusters, +) + +logging.basicConfig( + level=logging.INFO, + format="%(message)s", + handlers=[RichHandler(rich_tracebacks=True)], +) +pipeline_logger = logging.getLogger("mb_pipeline") + +ROOT = Path(__file__).parent.parent + + +@contextmanager +def timer(description: str): + start = time.time() + yield + elapsed = time.time() - start + + if elapsed >= 60: + minutes = int(elapsed // 60) + seconds = elapsed % 60 + time_str = f"{minutes} min {seconds:.1f} sec" + else: + time_str = f"{elapsed:.2f} seconds" + + pipeline_logger.info(f"{description} in {time_str}") + + +INPUT_NAME = "hierarchical_cc200k" +OUTPUT_PREFIX = "large" + +if __name__ == "__main__": + with timer("Full pipeline completed"): + with timer("Read table"): + table = pq.read_table(Path.cwd() / f"data/{INPUT_NAME}.parquet") + + pipeline_logger.info(f"Processing {len(table):,} records") + + with timer("Added components"): + cc = attach_components_to_probabilities(table) + + with timer("Built hierarchical clusters"): + hierarchy = to_hierarchical_clusters(cc) + + with timer("Created output tables"): + fake_resolution_hash = HASH_FUNC( + "ceci n'est pas un model".encode("utf-8") + ).digest() + + parents_im, children_im, thresholds = ( + hierarchy.column("parent").to_numpy(), + hierarchy.column("child").to_numpy(), + hierarchy.column("probability").to_numpy(), + ) + import numpy as np + import pyarrow as pa + from pyarrow.parquet import write_table + + im_to_pos = dict() + next_int = max(max(parents_im), 0) + parents = [] + children = [] + for pim in parents_im: + if pim >= 0: + parents.append(pim) + elif pim in im_to_pos: + parents.append(im_to_pos[pim]) + else: + im_to_pos[pim] = next_int + parents.append(next_int) + next_int += 1 + + for cim in children_im: + if cim >= 0: + children.append(cim) + elif cim in im_to_pos: + children.append(im_to_pos[cim]) + else: + im_to_pos[cim] = next_int + children.append(next_int) + next_int += 1 + + unique_clusters = np.unique(parents) + + out_clusters = pa.table( + { + "id": pa.array(unique_clusters, type=pa.uint64()), + "dataset_id": pa.array( + [None] * len(unique_clusters), type=pa.uint64() + ), + "id_in_dataset": pa.array( + [None] * len(unique_clusters), type=pa.string() + ), + } + ) + + out_contains = pa.table( + { + "parent": pa.array(parents, type=pa.uint64()), + "child": pa.array(children, type=pa.uint64()), + } + ) + + out_probabilities = pa.table( + { + "model": pa.array( + [fake_resolution_hash] * len(parents), type=pa.binary() + ), + "cluster": pa.array(parents, type=pa.uint64()), + "probability": pa.array(thresholds, type=pa.uint64()), + } + ) + + write_table( + out_clusters, + Path.cwd() / "data" / f"{OUTPUT_PREFIX}_ingest_clusters.parquet", + ) + + write_table( + out_contains, Path.cwd() / "data" / f"{OUTPUT_PREFIX}_contains.parquet" + ) + + write_table( + out_probabilities, + Path.cwd() / "data" / f"{OUTPUT_PREFIX}_ingest_probabilities.parquet", + ) diff --git a/src/matchbox/server/postgresql/benchmark/generate_tables.py b/src/matchbox/server/postgresql/benchmark/generate_tables.py new file mode 100644 index 00000000..c18dadb1 --- /dev/null +++ b/src/matchbox/server/postgresql/benchmark/generate_tables.py @@ -0,0 +1,407 @@ +import json +from pathlib import Path +from typing import Iterable + +import click +import pyarrow as 
pa +import pyarrow.parquet as pq + +from matchbox.common.factories import generate_dummy_probabilities +from matchbox.common.hash import HASH_FUNC +from matchbox.common.transform import ( + attach_components_to_probabilities, + to_hierarchical_clusters, +) + + +class IDCreator: + """ + A generator of incremental integer IDs from positive and negative integers. + + Positive integers will be returned as they are, while a new ID will be generated + for each negative integer. + """ + + def __init__(self, start: int): + self.id_map = dict() + self._next_int = start + + def create(self, temp_ids: list[int]) -> list[int]: + results = [] + for ti in temp_ids: + if ti >= 0: + results.append(ti) + elif ti in self.id_map: + results.append(self.id_map[ti]) + else: + self.id_map[ti] = self._next_int + results.append(self._next_int) + self._next_int += 1 + + return results + + def reset_mapping(self): + self.__init__(self._next_int) + + return self + + +def _hash_list_int(li: list[int]) -> list[bytes]: + return [HASH_FUNC(str(i).encode("utf-8")).digest() for i in li] + + +def _unique_clusters( + all_parents: Iterable[int], all_probabilities: Iterable[int] +) -> tuple[list[int], list[float]]: + ll = set() + clusters = [] + probabilities = [] + for parent, prob in zip(all_parents, all_probabilities, strict=True): + if parent in ll: + continue + else: + ll.add(parent) + clusters.append(parent) + probabilities.append(prob / 100) + return clusters, probabilities + + +def generate_sources() -> pa.Table: + """ + Generate sources table. + + Returns: + PyArrow sources table + """ + sources_resolution_id = [1, 2] + sources_alias = ["alias1", "alias2"] + sources_schema = ["dbt", "dbt"] + sources_table = ["companies_house", "hmrc_exporters"] + sources_id = ["company_number", "id"] + sources_indices = [ + { + "literal": ["col1", "col2", "col3"], + "alias": ["col1", "col2", "col3"], + }, + { + "literal": ["col1", "col2", "col3"], + "alias": ["col1", "col2", "col3"], + }, + ] + sources_indices = [json.dumps(si) for si in sources_indices] + return pa.table( + { + "resolution_id": pa.array(sources_resolution_id, type=pa.uint64()), + "alias": pa.array(sources_alias, type=pa.string()), + "schema": pa.array(sources_schema, type=pa.string()), + "table": pa.array(sources_table, type=pa.string()), + "id": pa.array(sources_id, type=pa.string()), + "indices": pa.array(sources_indices, type=pa.string()), + } + ) + + +def generate_resolutions() -> pa.Table: + """ + Generate resolutions table. + + Returns: + PyArrow resolutions table + """ + resolutions_resolution_id = [1, 2, 3, 4, 5] + resolutions_name = ["source1", "source2", "dedupe1", "dedupe2", "link"] + resolutions_resolution_hash = [ + HASH_FUNC(rid.encode("utf-8")).digest() for rid in resolutions_name + ] + resolutions_type = ["dataset", "dataset", "model", "model", "model"] + resolutions_float = [None, None, 0.8, 0.8, 0.9] + + return pa.table( + { + "resolution_id": pa.array(resolutions_resolution_id, type=pa.uint64()), + "resolution_hash": pa.array(resolutions_resolution_hash, type=pa.binary()), + "type": pa.array(resolutions_type, type=pa.string()), + "name": pa.array(resolutions_name, type=pa.string()), + "description": pa.array(resolutions_name, type=pa.string()), + "truth": pa.array(resolutions_float, type=pa.float64()), + } + ) + + +def generate_resolution_from() -> pa.Table: + """ + Generate resolution_from table. 
+ + Returns: + PyArrow resolution_from table + """ + # 1 and 2 are sources; 3 and 4 are dedupers; 5 is a linker + resolution_parent = [1, 1, 3, 2, 2, 4] + resolution_child = [3, 5, 5, 4, 5, 5] + resolution_level = [1, 2, 1, 1, 2, 1] + resolution_truth_cache = [None, None, 0.7, None, None, 0.7] + + return pa.table( + { + "parent": pa.array(resolution_parent, type=pa.uint64()), + "child": pa.array(resolution_child, type=pa.uint64()), + "level": pa.array(resolution_level, type=pa.uint32()), + "truth_cache": pa.array(resolution_truth_cache, type=pa.float64()), + } + ) + + +def generate_cluster_source(range_left: int, range_right: int) -> pa.Table: + """ + Generate cluster table containing rows for source rows. + + Args: + range_left: first ID to generate + range_right: last ID to generate, plus one + Returns: + PyArrow cluster table + """ + + def create_source_pk(li: list[int]) -> list[list[str]]: + return [[str(i)] for i in li] + + source = list(range(range_left, range_right)) + + return pa.table( + { + "cluster_id": pa.array(source, type=pa.uint64()), + "cluster_hash": pa.array(_hash_list_int(source), type=pa.binary()), + "dataset": pa.array([1] * len(source), type=pa.uint64()), + "source_pk": pa.array(create_source_pk(source), type=pa.list_(pa.string())), + } + ) + + +def generate_result_tables( + left_ids: Iterable[int], + right_ids: Iterable[int] | None, + resolution_id: int, + id_creator: IDCreator, + n_components: int, + n_probs: int, + prob_min: float = 0.6, + prob_max: float = 1, +) -> tuple[list[int], pa.Table, pa.Table, pa.Table]: + """ + Generate probabilities, contains and clusters tables. + + Args: + left_ids: list of IDs for rows to dedupe, or for left rows to link + right_ids: list of IDs for right rows to link + resolution_id: ID of resolution for this dedupe or link model + id_creator: an IDCreator instance + n_components: number of implied connected components + n_probs: total number of probability edges to be generated + prob_min: minimum value for probabilities to be generated + prob_max: maximum value for probabilities to be generated + + Returns: + Tuple with 1 list of top-level clusters and 3 PyArrow tables, for probabilities, + contains and clusters + """ + probs = generate_dummy_probabilities( + left_ids, right_ids, [prob_min, prob_max], n_components, n_probs + ) + + clusters = to_hierarchical_clusters(attach_components_to_probabilities(probs)) + + indexed_parents = id_creator.create(clusters["parent"].to_pylist()) + indexed_children = id_creator.create(clusters["child"].to_pylist()) + + final_clusters, final_probs = _unique_clusters( + indexed_parents, clusters["probability"].to_numpy() + ) + + source_entries = left_ids if right_ids is None else left_ids + right_ids + set_children = set(indexed_children) + top_clusters = [c for c in final_clusters + source_entries if c not in set_children] + + probabilities_table = pa.table( + { + "resolution": pa.array( + [resolution_id] * len(final_clusters), type=pa.uint64() + ), + "cluster": pa.array(final_clusters, type=pa.uint64()), + "probability": pa.array(final_probs, type=pa.float64()), + } + ) + + contains_table = pa.table( + { + "parent": pa.array(indexed_parents, type=pa.uint64()), + "child": pa.array(indexed_children, type=pa.uint64()), + } + ) + + clusters_table = pa.table( + { + "cluster_id": pa.array(final_clusters, type=pa.uint64()), + "cluster_hash": pa.array(_hash_list_int(final_clusters), type=pa.binary()), + "dataset": pa.array([None] * len(final_clusters), type=pa.uint64()), + "source_pk": pa.array( + [None] * 
len(final_clusters), type=pa.list_(pa.string()) + ), + } + ) + + return (top_clusters, probabilities_table, contains_table, clusters_table) + + +def generate_all_tables( + source_len: int, + dedupe_components: int, + dedupe_len: int, + link_components: int, + link_len: int, +) -> dict[str, pa.Table]: + """ + Make all 6 backend tables. It will create two sources, one deduper for each, + and one linker from each deduper. + + Args: + source_len: length of each data source + dedupe_components: number of connected components implied by each deduper + dedupe_len: probabilities generated by each deduper + link_components: number of connected components implied by each linker + link_len: probabilities generated by each linker + Returns: + A dictionary where keys are table names and values are PyArrow tables + """ + resolutions = generate_resolutions() + resolution_from = generate_resolution_from() + sources = generate_sources() + + clusters_source1 = generate_cluster_source(0, source_len) + clusters_source2 = generate_cluster_source(source_len, source_len * 2) + + id_creator = IDCreator(source_len * 2) + top_clusters1, probabilities_dedupe1, contains_dedupe1, clusters_dedupe1 = ( + generate_result_tables( + clusters_source1["cluster_id"].to_pylist(), + None, + 3, + id_creator, + dedupe_components, + dedupe_len, + ) + ) + + top_clusters2, probabilities_dedupe2, contains_dedupe2, clusters_dedupe2 = ( + generate_result_tables( + clusters_source2["cluster_id"].to_pylist(), + None, + 4, + id_creator.reset_mapping(), + dedupe_components, + dedupe_len, + ) + ) + + _, probabilities_link, contains_link, clusters_link = generate_result_tables( + top_clusters1, + top_clusters2, + 5, + id_creator.reset_mapping(), + link_components, + link_len, + ) + + probabilities = pa.concat_tables( + [probabilities_dedupe1, probabilities_dedupe2, probabilities_link] + ) + contains = pa.concat_tables([contains_dedupe1, contains_dedupe2, contains_link]) + clusters = pa.concat_tables( + [ + clusters_source1, + clusters_source2, + clusters_dedupe1, + clusters_dedupe2, + clusters_link, + ] + ) + + return { + "resolutions": resolutions, + "resolution_from": resolution_from, + "sources": sources, + "probabilities": probabilities, + "contains": contains, + "clusters": clusters, + } + + +@click.command() +@click.option("-s", "--settings", type=str, required=True) +@click.option("-o", "--output_dir", type=click.Path(exists=True, path_type=Path)) +def main(settings, output_dir): + PRESETS = { + "xs": { + "source_len": 10_000, + "dedupe_components": 8000, + "dedupe_len": 2000, + "link_components": 6000, + "link_len": 10_000, + }, + "s": { + "source_len": 100_000, + "dedupe_components": 80_000, + "dedupe_len": 20_000, + "link_components": 60_000, + "link_len": 100_000, + }, + "m": { + "source_len": 1_000_000, + "dedupe_components": 800_000, + "dedupe_len": 200_000, + "link_components": 600_000, + "link_len": 1_000_000, + }, + "l": { + "source_len": 10_000_000, + "dedupe_components": 8_000_000, + "dedupe_len": 2_000_000, + "link_components": 6_000_000, + "link_len": 10_000_000, + }, + "xl": { + "source_len": 100_000_000, + "dedupe_components": 80_000_000, + "dedupe_len": 20_000_000, + "link_components": 60_000_000, + "link_len": 100_000_000, + }, + } + + if not output_dir: + output_dir = Path.cwd() / "data" / "all_tables" + if settings not in PRESETS: + raise ValueError(f"Settings {settings} are invalid") + + config = PRESETS[settings] + source_len = config["source_len"] + dedupe_components = config["dedupe_components"] + dedupe_len = 
config["dedupe_len"] + link_len = config["link_len"] + link_components = config["link_components"] + + all_tables = generate_all_tables( + source_len=source_len, + dedupe_components=dedupe_components, + dedupe_len=dedupe_len, + link_components=link_components, + link_len=link_len, + ) + + output_dir /= settings + output_dir.mkdir(parents=True, exist_ok=True) + for name, table in all_tables.items(): + pq.write_table(table, output_dir / f"{name}.parquet") + + +if __name__ == "__main__": + main() diff --git a/src/matchbox/server/postgresql/benchmark/init_schema.py b/src/matchbox/server/postgresql/benchmark/init_schema.py new file mode 100644 index 00000000..b6b3a177 --- /dev/null +++ b/src/matchbox/server/postgresql/benchmark/init_schema.py @@ -0,0 +1,50 @@ +from textwrap import dedent + +from sqlalchemy.dialects import postgresql +from sqlalchemy.schema import CreateTable + +from matchbox.server.postgresql.db import MBDB +from matchbox.server.postgresql.orm import ( + Clusters, + Contains, + Probabilities, + ResolutionFrom, + Resolutions, + Sources, +) + + +def empty_schema() -> str: + schema = MBDB.MatchboxBase.metadata.schema + sql = dedent(f""" + DROP SCHEMA IF EXISTS {schema} CASCADE; + CREATE SCHEMA {schema}; + """) + + return sql + + +def create_tables() -> str: + sql = "" + # Order matters + for table_class in ( + Resolutions, + ResolutionFrom, + Sources, + Clusters, + Contains, + Probabilities, + ): + sql += ( + str( + CreateTable(table_class.__table__).compile(dialect=postgresql.dialect()) + ) + + "; \n" + ) + + return sql + + +if __name__ == "__main__": + print(empty_schema()) + print(create_tables()) diff --git a/test/common/test_factories.py b/test/common/test_factories.py new file mode 100644 index 00000000..e48aa3e3 --- /dev/null +++ b/test/common/test_factories.py @@ -0,0 +1,181 @@ +from typing import Any + +import numpy as np +import pyarrow.compute as pc +import pytest + +from matchbox.common.factories import ( + calculate_min_max_edges, + generate_dummy_probabilities, + verify_components, +) + + +@pytest.mark.parametrize( + ("left_n", "right_n", "n_components", "true_min", "true_max"), + [ + (10, None, 2, 8, 20), + (11, None, 2, 9, 25), + (9, 9, 3, 15, 27), + (8, 4, 3, 9, 11), + (4, 8, 3, 9, 11), + (8, 8, 3, 13, 22), + ], + ids=[ + "dedupe_no_mod", + "dedup_mod", + "link_no_mod", + "link_left_mod", + "link_right_mod", + "link_same_mod", + ], +) +def test_calculate_min_max_edges( + left_n: int, right_n: int | None, n_components: int, true_min: int, true_max: int +): + deduplicate = False + if not right_n: + deduplicate = True + right_n = left_n + min_edges, max_edges = calculate_min_max_edges( + left_n, right_n, n_components, deduplicate + ) + + assert true_min == min_edges + assert true_max == max_edges + + +@pytest.mark.parametrize( + ("parameters"), + [ + { + "left_count": 5, + "right_count": None, + "prob_range": (0.6, 0.8), + "num_components": 3, + "total_rows": 2, + }, + { + "left_count": 1000, + "right_count": None, + "prob_range": (0.6, 0.8), + "num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, True)[0], + }, + { + "left_count": 1_000, + "right_count": None, + "prob_range": (0.6, 0.8), + "num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, True)[1], + }, + { + "left_count": 1_000, + "right_count": 1_000, + "prob_range": (0.6, 0.8), + "num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, False)[0], + }, + { + "left_count": 1_000, + "right_count": 1_000, + "prob_range": (0.6, 0.8), + 
"num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, False)[1], + }, + ], + ids=[ + "dedupe_no_edges", + "dedupe_min", + "dedupe_max", + "link_min", + "link_max", + ], +) +def test_generate_dummy_probabilities(parameters: dict[str, Any]): + len_left = parameters["left_count"] + len_right = parameters["right_count"] + if len_right: + total_len = len_left + len_right + len_right = parameters["right_count"] + rand_vals = np.random.choice(a=total_len, replace=False, size=total_len) + left_values = list(rand_vals[:len_left]) + right_values = list(rand_vals[len_left:]) + else: + rand_vals = np.random.choice(a=len_left, replace=False, size=len_left) + left_values = list(rand_vals[:len_left]) + right_values = None + + n_components = parameters["num_components"] + total_rows = parameters["total_rows"] + + probabilities = generate_dummy_probabilities( + left_values=left_values, + right_values=right_values, + prob_range=parameters["prob_range"], + num_components=n_components, + total_rows=total_rows, + ) + report = verify_components(table=probabilities, all_nodes=rand_vals) + p_left = probabilities["left"].to_pylist() + p_right = probabilities["right"].to_pylist() + + assert report["num_components"] == n_components + + # Link job + if right_values: + assert set(p_left) <= set(left_values) + assert set(p_right) <= set(right_values) + # Dedupe + else: + assert set(p_left) | set(p_right) <= set(left_values) + + assert ( + pc.max(probabilities["probability"]).as_py() / 100 + <= parameters["prob_range"][1] + ) + assert ( + pc.min(probabilities["probability"]).as_py() / 100 + >= parameters["prob_range"][0] + ) + + assert len(probabilities) == total_rows + + edges = zip(p_left, p_right, strict=True) + edges_set = {tuple(sorted(e)) for e in edges} + assert len(edges_set) == total_rows + + self_references = [e for e in edges if e[0] == e[1]] + assert len(self_references) == 0 + + +@pytest.mark.parametrize( + ("parameters"), + [ + { + "left_range": (0, 10_000), + "right_range": (10_000, 20_000), + "num_components": 2, + "total_rows": 1, + }, + { + "left_range": (0, 10), + "right_range": (10, 20), + "num_components": 2, + "total_rows": 8_000, + }, + ], + ids=["lower_than_min", "higher_than_max"], +) +def test_generate_dummy_probabilities_errors(parameters: dict[str, Any]): + left_values = range(*parameters["left_range"]) + right_values = range(*parameters["right_range"]) + + with pytest.raises(ValueError): + generate_dummy_probabilities( + left_values=left_values, + right_values=right_values, + prob_range=(0.6, 0.8), + num_components=parameters["num_components"], + total_rows=parameters["total_rows"], + ) diff --git a/test/client/test_transform.py b/test/common/test_transform.py similarity index 84% rename from test/client/test_transform.py rename to test/common/test_transform.py index 6b79d25c..88b46273 100644 --- a/test/client/test_transform.py +++ b/test/common/test_transform.py @@ -6,18 +6,19 @@ import pyarrow.compute as pc import pytest +from matchbox.common.factories import generate_dummy_probabilities from matchbox.common.transform import ( attach_components_to_probabilities, component_to_hierarchy, to_hierarchical_clusters, ) -from test.fixtures.factories import generate_dummy_probabilities, verify_components @lru_cache(maxsize=None) def _combine_strings(*n: str) -> str: """ - Combine n strings into a single string. + Combine n strings into a single string, with a cache. 
+ Meant to replace `matchbox.common.hash.IntMap.index` Args: *args: Variable number of strings to combine @@ -29,45 +30,6 @@ def _combine_strings(*n: str) -> str: return "".join(sorted(letters)) -@pytest.mark.parametrize( - ("parameters"), - [ - { - "left_range": (0, 1_000), - "right_range": (1_000, 2_000), - "prob_range": (0.6, 0.8), - "num_components": 10, - "total_rows": 100_000, - }, - ], - ids=["simple"], -) -def test_probabilities_factory(parameters: dict[str, Any]): - left_values = range(*parameters["left_range"]) - right_values = range(*parameters["right_range"]) - - probabilities = generate_dummy_probabilities( - left_values=left_values, - right_values=right_values, - prob_range=parameters["prob_range"], - num_components=parameters["num_components"], - total_rows=parameters["total_rows"], - ) - report = verify_components(table=probabilities) - - assert report["num_components"] == parameters["num_components"] - assert set(pc.unique(probabilities["left"]).to_pylist()) == set(left_values) - assert set(pc.unique(probabilities["right"]).to_pylist()) == set(right_values) - assert ( - pc.max(probabilities["probability"]).as_py() / 100 - <= parameters["prob_range"][1] - ) - assert ( - pc.min(probabilities["probability"]).as_py() / 100 - >= parameters["prob_range"][0] - ) - - @pytest.mark.parametrize( ("parameters"), [ @@ -166,8 +128,27 @@ def test_empty_attach_components_to_probabilities(): ("xy", "y", 90), }, ), + # Test case 4: A component larger than two remains unchanged + # at a successive threshold + ( + { + "left": ["x", "y", "a"], + "right": ["y", "z", "b"], + "probability": [90, 90, 85], + }, + { + ("xy", "x", 90), + ("xy", "y", 90), + ("yz", "y", 90), + ("yz", "z", 90), + ("xyz", "xy", 90), + ("xyz", "yz", 90), + ("ab", "a", 85), + ("ab", "b", 85), + }, + ), ], - ids=["equal", "asymmetric", "single"], + ids=["equal", "asymmetric", "single", "unchanged"], ) def test_component_to_hierarchy( probabilities: dict[str, list[str | float]], hierarchy: set[tuple[str, str, int]] diff --git a/test/fixtures/factories.py b/test/fixtures/factories.py deleted file mode 100644 index 612f6e2b..00000000 --- a/test/fixtures/factories.py +++ /dev/null @@ -1,187 +0,0 @@ -from collections import Counter - -import numpy as np -import pyarrow as pa -import rustworkx as rx - - -def verify_components(table) -> dict: - """ - Fast verification of connected components using rustworkx. - - Args: - table: PyArrow table with 'left', 'right' columns - - Returns: - dictionary containing basic component statistics - """ - graph = rx.PyGraph() - - unique_nodes = set(table["left"].to_numpy()) | set(table["right"].to_numpy()) - graph.add_nodes_from(range(len(unique_nodes))) - - node_to_idx = {node: idx for idx, node in enumerate(unique_nodes)} - edges = [ - (node_to_idx[left], node_to_idx[right]) - for left, right in zip( - table["left"].to_numpy(), - table["right"].to_numpy(), - strict=False, - ) - ] - - graph.add_edges_from_no_data(edges) - - components = rx.connected_components(graph) - component_sizes = Counter(len(component) for component in components) - - return { - "num_components": len(components), - "total_nodes": len(unique_nodes), - "total_edges": len(edges), - "component_sizes": component_sizes, - "min_component_size": min(component_sizes.keys()), - "max_component_size": max(component_sizes.keys()), - } - - -def _calculate_max_possible_edges(n_nodes: int, num_components: int) -> int: - """ - Calculate the max possible number of edges given n nodes split into k components. 
- - Args: - n_nodes: Total number of nodes - num_components: Number of components to split into - - Returns: - Maximum possible number of edges - """ - nodes_per_component = n_nodes // num_components - max_edges_per_component = ( - nodes_per_component * nodes_per_component - ) # Complete bipartite graph - return max_edges_per_component * num_components - - -def _split_values_into_components( - values: list[int], num_components: int -) -> list[np.ndarray]: - """ - Split values into non-overlapping groups for each component. - - Args: - values: List of values to split - num_components: Number of components to create - - Returns: - List of arrays, one for each component - """ - values = np.array(values) - np.random.shuffle(values) - return np.array_split(values, num_components) - - -def generate_dummy_probabilities( - left_values: list[int], - right_values: list[int], - prob_range: tuple[float, float], - num_components: int, - total_rows: int, -) -> pa.Table: - """ - Generate dummy Arrow probabilities data with guaranteed isolated components. - - Args: - left_values: List of integers to use for left column - right_values: List of integers to use for right column - prob_range: Tuple of (min_prob, max_prob) to constrain probabilities - num_components: Number of distinct connected components to generate - total_rows: Total number of rows to generate - - Returns: - PyArrow Table with 'left', 'right', and 'probability' columns - """ - # Validate inputs - if len(left_values) < 2 or len(right_values) < 2: - raise ValueError("Need at least 2 possible values for both left and right") - if num_components > min(len(left_values), len(right_values)): - raise ValueError( - "Cannot have more components than minimum of left/right values" - ) - - min_nodes = min(len(left_values), len(right_values)) - max_possible_edges = _calculate_max_possible_edges(min_nodes, num_components) - - if total_rows > max_possible_edges: - raise ValueError( - f"Cannot generate {total_rows:,} edges with {num_components:,} components. " - f"Max possible edges is {max_possible_edges:,} given {min_nodes:,} nodes. " - "Either increase the number of nodes, decrease the number of components, " - "or decrease the total edges requested." 
- ) - - # Convert probability range to integers (60-80 for 0.60-0.80) - prob_min = int(prob_range[0] * 100) - prob_max = int(prob_range[1] * 100) - - # Split values into completely separate groups for each component - left_components = _split_values_into_components(left_values, num_components) - right_components = _split_values_into_components(right_values, num_components) - - # Calculate base number of edges per component - base_edges_per_component = total_rows // num_components - remaining_edges = total_rows % num_components - - all_edges = [] - - # Generate edges for each component - for comp_idx in range(num_components): - comp_left_values = left_components[comp_idx] - comp_right_values = right_components[comp_idx] - - # Calculate edges for this component - edges_in_component = base_edges_per_component - if comp_idx < remaining_edges: # Distribute remaining edges - edges_in_component += 1 - - # Ensure basic connectivity within the component - base_edges = [] - - # Create a spanning tree-like structure - for i in range(len(comp_left_values)): - base_edges.append( - ( - comp_left_values[i], - comp_right_values[i % len(comp_right_values)], - np.random.randint(prob_min, prob_max + 1), - ) - ) - - # Generate remaining random edges strictly within this component - remaining_edges = edges_in_component - len(base_edges) - if remaining_edges > 0: - random_lefts = np.random.choice(comp_left_values, size=remaining_edges) - random_rights = np.random.choice(comp_right_values, size=remaining_edges) - random_probs = np.random.randint( - prob_min, prob_max + 1, size=remaining_edges - ) - - component_edges = base_edges + list( - zip(random_lefts, random_rights, random_probs, strict=False) - ) - else: - component_edges = base_edges - - all_edges.extend(component_edges) - - # Convert to arrays - lefts, rights, probs = zip(*all_edges, strict=False) - - # Create PyArrow arrays - left_array = pa.array(lefts, type=pa.uint64()) - right_array = pa.array(rights, type=pa.uint64()) - prob_array = pa.array(probs, type=pa.uint8()) - - return pa.table( - [left_array, right_array, prob_array], names=["left", "right", "probability"] - ) diff --git a/test/pipeline.py b/test/pipeline.py deleted file mode 100644 index 95d8b367..00000000 --- a/test/pipeline.py +++ /dev/null @@ -1,51 +0,0 @@ -import logging -import time -from contextlib import contextmanager -from pathlib import Path - -import pyarrow.parquet as pq -from rich.logging import RichHandler - -from matchbox.common.transform import ( - attach_components_to_probabilities, - to_hierarchical_clusters, -) - -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - handlers=[RichHandler(rich_tracebacks=True)], -) -pipeline_logger = logging.getLogger("mb_pipeline") - -ROOT = Path(__file__).parent.parent - - -@contextmanager -def timer(description: str): - start = time.time() - yield - elapsed = time.time() - start - - if elapsed >= 60: - minutes = int(elapsed // 60) - seconds = elapsed % 60 - time_str = f"{minutes} min {seconds:.1f} sec" - else: - time_str = f"{elapsed:.2f} seconds" - - pipeline_logger.info(f"{description} in {time_str}") - - -if __name__ == "__main__": - with timer("Full pipeline completed"): - with timer("Read table"): - table = pq.read_table(Path.cwd() / "data/hierarchical_cc20k.parquet") - - pipeline_logger.info(f"Processing {len(table):,} records") - - with timer("Added components"): - cc = attach_components_to_probabilities(table) - - with timer("Built hierarchical clusters"): - out = to_hierarchical_clusters(cc) diff --git a/uv.lock 
b/uv.lock index 07a7cb3d..a3c3f534 100644 --- a/uv.lock +++ b/uv.lock @@ -734,6 +734,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/a2/6c725958e6f135d8e5de081e69841bb2c1d84b3fc259d02eb092b8fc203a/ipython-8.27.0-py3-none-any.whl", hash = "sha256:f68b3cb8bde357a5d7adc9598d57e22a45dfbea19eb6b98286fa3b288c9cd55c", size = 818986 }, ] +[[package]] +name = "ipywidgets" +version = "8.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "comm" }, + { name = "ipython" }, + { name = "jupyterlab-widgets" }, + { name = "traitlets" }, + { name = "widgetsnbextension" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/4c/dab2a281b07596a5fc220d49827fe6c794c66f1493d7a74f1df0640f2cc5/ipywidgets-8.1.5.tar.gz", hash = "sha256:870e43b1a35656a80c18c9503bbf2d16802db1cb487eec6fab27d683381dde17", size = 116723 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/2d/9c0b76f2f9cc0ebede1b9371b6f317243028ed60b90705863d493bae622e/ipywidgets-8.1.5-py3-none-any.whl", hash = "sha256:3290f526f87ae6e77655555baba4f36681c555b8bdbbff430b70e52c34c86245", size = 139767 }, +] + [[package]] name = "jedi" version = "0.19.1" @@ -815,6 +831,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/fb/108ecd1fe961941959ad0ee4e12ee7b8b1477247f30b1fdfd83ceaf017f0/jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409", size = 28965 }, ] +[[package]] +name = "jupyterlab-widgets" +version = "3.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/59/73/fa26bbb747a9ea4fca6b01453aa22990d52ab62dd61384f1ac0dc9d4e7ba/jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed", size = 203556 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/93/858e87edc634d628e5d752ba944c2833133a28fa87bb093e6832ced36a3e/jupyterlab_widgets-3.0.13-py3-none-any.whl", hash = "sha256:e3cda2c233ce144192f1e29914ad522b2f4c40e77214b0cc97377ca3d323db54", size = 214392 }, +] + [[package]] name = "kiwisolver" version = "1.4.7" @@ -944,6 +969,7 @@ server = [ dev = [ { name = "docker" }, { name = "ipykernel" }, + { name = "ipywidgets" }, { name = "pre-commit" }, { name = "pytest" }, { name = "pytest-cov" }, @@ -982,6 +1008,7 @@ requires-dist = [ dev = [ { name = "docker", specifier = ">=7.1.0" }, { name = "ipykernel", specifier = ">=6.29.5" }, + { name = "ipywidgets", specifier = ">=8.1.5" }, { name = "pre-commit", specifier = ">=3.8.0" }, { name = "pytest", specifier = ">=8.3.3" }, { name = "pytest-cov", specifier = ">=5.0.0" }, @@ -1461,6 +1488,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/39/5a9a229bb5414abeb86e33b8fc8143ab0aecce5a7f698a53e31367d30caa/psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4", size = 1163736 }, { url = "https://files.pythonhosted.org/packages/3d/16/4623fad6076448df21c1a870c93a9774ad8a7b4dd1660223b59082dd8fec/psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067", size = 1025113 }, { url = "https://files.pythonhosted.org/packages/66/de/baed128ae0fc07460d9399d82e631ea31a1f171c0c4ae18f9808ac6759e3/psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e", size = 1163951 }, + { url = 
"https://files.pythonhosted.org/packages/ae/49/a6cfc94a9c483b1fa401fbcb23aca7892f60c7269c5ffa2ac408364f80dc/psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2", size = 2569060 }, ] [[package]] @@ -2346,6 +2374,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/0b/c7e5d11020242984d9d37990310520ed663b942333b83a033c2f20191113/websockets-14.1-py3-none-any.whl", hash = "sha256:4d4fc827a20abe6d544a119896f6b78ee13fe81cbfef416f3f2ddf09a03f0e2e", size = 156277 }, ] +[[package]] +name = "widgetsnbextension" +version = "4.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/fc/238c424fd7f4ebb25f8b1da9a934a3ad7c848286732ae04263661eb0fc03/widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6", size = 1164730 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71", size = 2335872 }, +] + [[package]] name = "wrapt" version = "1.17.0"