diff --git a/pyproject.toml b/pyproject.toml
index 4ed88b0d..5de023a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,7 @@ server = [
 [dependency-groups]
 dev = [
     "ipykernel>=6.29.5",
+    "ipywidgets>=8.1.5",
     "pre-commit>=3.8.0",
     "pytest>=8.3.3",
     "pytest-cov>=5.0.0",
diff --git a/src/matchbox/common/factories.py b/src/matchbox/common/factories.py
new file mode 100644
index 00000000..71be3427
--- /dev/null
+++ b/src/matchbox/common/factories.py
@@ -0,0 +1,295 @@
+from collections import Counter
+from textwrap import dedent
+from typing import Any
+
+import numpy as np
+import pyarrow as pa
+import rustworkx as rx
+
+from matchbox.common.transform import graph_results
+
+
+def verify_components(all_nodes: list[Any], table: pa.Table) -> dict:
+    """
+    Fast verification of connected components using rustworkx.
+
+    Args:
+        all_nodes: list of identities of inputs being matched
+        table: PyArrow table with 'left', 'right' columns
+
+    Returns:
+        dictionary containing basic component statistics
+    """
+    graph, _, _ = graph_results(table, all_nodes)
+    components = rx.connected_components(graph)
+    component_sizes = Counter(len(component) for component in components)
+
+    return {
+        "num_components": len(components),
+        "total_nodes": graph.num_nodes(),
+        "total_edges": graph.num_edges(),
+        "component_sizes": component_sizes,
+        "min_component_size": min(component_sizes.keys()),
+        "max_component_size": max(component_sizes.keys()),
+    }
+
+
+def _min_edges_component(left: int, right: int, deduplicate: bool) -> int:
+    """
+    Calculate the minimum number of edges for a component to be connected.
+    Does so by assuming a spanning tree.
+
+    Args:
+        left: number of nodes of component on the left
+        right: number of nodes of component on the right (for linking)
+        deduplicate: whether edges are for deduplication
+
+    Returns:
+        Minimum number of edges
+    """
+    if not deduplicate:
+        return left + right - 1
+
+    return left - 1
+
+
+def _max_edges_component(left: int, right: int, deduplicate: bool) -> int:
+    """
+    Calculate the maximum number of edges for a component without duplicate edges.
+    Considers complete graph for deduping, and complete bipartite graph for linking.
+
+    Args:
+        left: number of nodes of component on the left
+        right: number of nodes of component on the right (for linking)
+        deduplicate: whether edges are for deduplication
+
+    Returns:
+        Maximum number of edges
+    """
+    if not deduplicate:
+        return left * right
+    # n*(n-1) is always divisible by 2
+    return left * (left - 1) // 2
+
+
+def calculate_min_max_edges(
+    left_nodes: int, right_nodes: int, num_components: int, deduplicate: bool
+) -> tuple[int, int]:
+    """
+    Calculate min and max edges for a graph.
+
+    Args:
+        left_nodes: number of nodes in left source
+        right_nodes: number of nodes in right source
+        num_components: number of requested components
+        deduplicate: whether edges are for deduplication
+
+    Returns:
+        Two-tuple representing min and max edges
+    """
+    left_mod, right_mod = left_nodes % num_components, right_nodes % num_components
+    left_div, right_div = left_nodes // num_components, right_nodes // num_components
+
+    min_mod, max_mod = sorted([left_mod, right_mod])
+
+    min_edges, max_edges = 0, 0
+    # components where both sides have maximum nodes
+    min_edges += (
+        _min_edges_component(left_div + 1, right_div + 1, deduplicate) * min_mod
+    )
+    max_edges += (
+        _max_edges_component(left_div + 1, right_div + 1, deduplicate) * min_mod
+    )
+    # components where one side has maximum nodes
+    left_after_min_mod, right_after_min_mod = left_div + 1, right_div
+    if left_mod == min_mod:
+        left_after_min_mod, right_after_min_mod = left_div, right_div + 1
+    min_edges += _min_edges_component(
+        left_after_min_mod, right_after_min_mod, deduplicate
+    ) * (max_mod - min_mod)
+    max_edges += _max_edges_component(
+        left_after_min_mod, right_after_min_mod, deduplicate
+    ) * (max_mod - min_mod)
+    # components where both sides have minimum nodes
+    min_edges += _min_edges_component(left_div, right_div, deduplicate) * (
+        num_components - max_mod
+    )
+    max_edges += _max_edges_component(left_div, right_div, deduplicate) * (
+        num_components - max_mod
+    )
+
+    return min_edges, max_edges
+
+
+def generate_dummy_probabilities(
+    left_values: list[int],
+    right_values: list[int] | None,
+    prob_range: tuple[float, float],
+    num_components: int,
+    total_rows: int,
+) -> pa.Table:
+    """
+    Generate dummy Arrow probabilities data with guaranteed isolated components.
+
+    Args:
+        left_values: List of integers to use for left column
+        right_values: List of integers to use for right column. If None, assume we
+            are generating probabilities for deduplication
+        prob_range: Tuple of (min_prob, max_prob) to constrain probabilities
+        num_components: Number of distinct connected components to generate
+        total_rows: Total number of rows to generate
+
+    Returns:
+        PyArrow Table with 'left', 'right', and 'probability' columns
+    """
+    # Validate inputs
+    deduplicate = False
+    if right_values is None:
+        right_values = left_values
+        deduplicate = True
+
+    if len(left_values) < 2 or len(right_values) < 2:
+        raise ValueError("Need at least 2 possible values for both left and right")
+    if num_components > min(len(left_values), len(right_values)):
+        raise ValueError(
+            "Cannot have more components than minimum of left/right values"
+        )
+
+    left_nodes, right_nodes = len(left_values), len(right_values)
+    min_possible_edges, max_possible_edges = calculate_min_max_edges(
+        left_nodes, right_nodes, num_components, deduplicate
+    )
+
+    mode = "dedupe" if deduplicate else "link"
+
+    if total_rows == 0:
+        raise ValueError("At least one edge must be generated")
+    if total_rows < min_possible_edges:
+        raise ValueError(
+            dedent(f"""
+            Cannot generate {total_rows:,} {mode} edges with {num_components:,}
+            components.
+            Min edges is {min_possible_edges:,} for nodes given.
+            Either decrease the number of nodes, increase the number of components,
+            or increase the total edges requested.
+            """)
+        )
+    if total_rows > max_possible_edges:
+        raise ValueError(
+            dedent(f"""
+            Cannot generate {total_rows:,} {mode} edges with {num_components:,}
+            components.
+            Max edges is {max_possible_edges:,} for nodes given.
+            Either increase the number of nodes, decrease the number of components,
+            or decrease the total edges requested.
+            """)
+        )
+
+    n_extra_edges = total_rows - min_possible_edges
+
+    # Convert probability range to integers (60-80 for 0.60-0.80)
+    prob_min = int(prob_range[0] * 100)
+    prob_max = int(prob_range[1] * 100)
+
+    # Split values into completely separate groups for each component
+    left_components = np.array_split(np.array(left_values), num_components)
+    right_components = np.array_split(np.array(right_values), num_components)
+    # For each left-right component pair, the right equals the left rotated by one
+    right_components = [np.roll(c, -1) for c in right_components]
+
+    all_edges = []
+
+    # Generate edges for each component
+    for comp_idx in range(num_components):
+        comp_left_values = left_components[comp_idx]
+        comp_right_values = right_components[comp_idx]
+
+        min_comp_nodes, max_comp_nodes = sorted(
+            [len(comp_left_values), len(comp_right_values)]
+        )
+
+        # Ensure basic connectivity within the component by creating a spanning-tree
+        base_edges = set()
+        # For deduping (A B C) you just need (A - B) (B - C) (C - A)
+        # which just needs matching pairwise the data and its rotated version.
+        # For deduping, `min_comp_nodes` == `max_comp_nodes`
+        if deduplicate:
+            for i in range(min_comp_nodes - 1):
+                small_n, large_n = sorted([comp_left_values[i], comp_right_values[i]])
+                base_edges.add((small_n, large_n))
+        else:
+            # For linking (A B) and (C D E), we begin by adding (A - C) and (B - D)
+            for i in range(min_comp_nodes):
+                base_edges.add((comp_left_values[i], comp_right_values[i]))
+            # we now add (C - B)
+            for i in range(min_comp_nodes - 1):
+                base_edges.add((comp_left_values[i + 1], comp_right_values[i]))
+            # we now add (A - D)
+            left_right_diff = max_comp_nodes - min_comp_nodes
+            for i in range(left_right_diff):
+                left_i, right_i = 0, min_comp_nodes + i
+                if len(comp_right_values) < len(comp_left_values):
+                    left_i, right_i = min_comp_nodes + i, 0
+
+                base_edges.add((comp_left_values[left_i], comp_right_values[right_i]))
+
+        component_edges = list(base_edges)
+
+        if n_extra_edges > 0:
+            # Generate remaining random edges strictly within this component
+            # TODO: this can certainly be optimised
+            if deduplicate:
+                all_possible_edges = list(
+                    {
+                        tuple(sorted([x, y]))
+                        for x in comp_left_values
+                        for y in comp_right_values
+                        if x != y and tuple(sorted([x, y])) not in base_edges
+                    }
+                )
+            else:
+                all_possible_edges = list(
+                    {
+                        (x, y)
+                        for x in comp_left_values
+                        for y in comp_right_values
+                        if x != y and (x, y) not in base_edges
+                    }
+                )
+            max_new_edges = len(all_possible_edges)
+            if max_new_edges >= n_extra_edges:
+                edges_required = n_extra_edges
+                n_extra_edges = 0
+            else:
+                edges_required = max_new_edges
+                n_extra_edges -= max_new_edges
+
+            extra_edges_idx = np.random.choice(
+                len(all_possible_edges), size=edges_required, replace=False
+            )
+            extra_edges = [
+                e for i, e in enumerate(all_possible_edges) if i in extra_edges_idx
+            ]
+            component_edges += extra_edges
+        random_probs = np.random.randint(
+            prob_min, prob_max + 1, size=len(component_edges)
+        )
+
+        component_edges = [
+            (le, ri, pr)
+            for (le, ri), pr in zip(component_edges, random_probs, strict=True)
+        ]
+
+        all_edges.extend(component_edges)
+
+    # Convert to arrays
+    lefts, rights, probs = zip(*all_edges, strict=True)
+
+    # Create PyArrow arrays
+    left_array = pa.array(lefts, type=pa.uint64())
+    right_array = pa.array(rights, type=pa.uint64())
+    prob_array = pa.array(probs, type=pa.uint8())
+
+    return pa.table(
[left_array, right_array, prob_array], names=["left", "right", "probability"] + ) diff --git a/src/matchbox/common/transform.py b/src/matchbox/common/transform.py index 1e257a48..1903bfbb 100644 --- a/src/matchbox/common/transform.py +++ b/src/matchbox/common/transform.py @@ -2,7 +2,7 @@ import multiprocessing from collections import defaultdict from concurrent.futures import ProcessPoolExecutor -from typing import Callable, Generic, Hashable, Literal, TypeVar +from typing import Callable, Generic, Hashable, Iterable, Literal, TypeVar from uuid import uuid4 import numpy as np @@ -87,19 +87,22 @@ def to_clusters( ) -def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: +def graph_results( + probabilities: pa.Table, all_nodes: Iterable[int] | None = None +) -> tuple[rx.PyDiGraph, np.ndarray, np.ndarray]: """ - Takes an Arrow table of probabilities and adds a component column. - - Expects an Arrow table of column, left, right, probability. + Convert probability table to graph representation. - Returns a table with an additional column, component. + Args: + probabilities: PyArrow table with 'left', 'right' columns + all_nodes: superset of node identities figuring in probabilities table. + Used to optionally add isolated nodes to the graph. + Returns: + A tuple containing: + - Rustwork directed graph + - A list mapping the 'left' probabilities column to node indices in the graph + - A list mapping the 'right' probabilities column to node indices in the graph """ - # Handle empty probabilities - if len(probabilities) == 0: - empty_components = pa.array([], type=pa.int64()) - return probabilities.append_column("component", empty_components) - # Create index to use in graph unique = pc.unique( pa.concat_arrays( @@ -109,8 +112,9 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: ] ) ) - left_indices = pc.index_in(probabilities["left"], unique) - right_indices = pc.index_in(probabilities["right"], unique) + + left_indices = pc.index_in(probabilities["left"], unique).to_numpy() + right_indices = pc.index_in(probabilities["right"], unique).to_numpy() # Create and process graph n_nodes = len(unique) @@ -119,9 +123,30 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: graph = rx.PyGraph(node_count_hint=n_nodes, edge_count_hint=n_edges) graph.add_nodes_from(range(n_nodes)) - edges = tuple(zip(left_indices.to_numpy(), right_indices.to_numpy(), strict=True)) + if all_nodes is not None: + isolated_nodes = len(set(all_nodes) - set(unique.to_pylist())) + graph.add_nodes_from(range(isolated_nodes)) + + edges = tuple(zip(left_indices, right_indices, strict=True)) graph.add_edges_from_no_data(edges) + return graph, left_indices, right_indices + + +def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: + """ + Takes an Arrow table of probabilities and adds a component column. + + Expects an Arrow table of column, left, right, probability. + + Returns a table with an additional column, component. 
+ """ + # Handle empty probabilities + if len(probabilities) == 0: + empty_components = pa.array([], type=pa.int64()) + return probabilities.append_column("component", empty_components) + + graph, left_indices, _ = graph_results(probabilities) components = rx.connected_components(graph) # Convert components to arrays, map back to input to join, and reattach @@ -130,10 +155,10 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table: np.arange(len(components)), [len(c) for c in components] ) - node_to_component = np.zeros(len(unique), dtype=np.int64) + node_to_component = np.zeros(graph.num_nodes(), dtype=np.int64) node_to_component[component_indices] = component_labels - edge_components = pa.array(node_to_component[left_indices.to_numpy()]) + edge_components = pa.array(node_to_component[left_indices]) return probabilities.append_column("component", edge_components).sort_by( [("component", "ascending"), ("probability", "descending")] @@ -201,11 +226,15 @@ def component_to_hierarchy( Returns: Arrow Table with columns ['parent', 'child', 'probability'] """ - probs = np.sort(pc.unique(table["probability"]).to_numpy())[::-1] + ascending_probs = np.sort( + pc.unique(table["probability"]).to_numpy(zero_copy_only=False) + ) + probs = ascending_probs[::-1] djs = DisjointSet[int]() # implements connected components current_roots: dict[int, set[int]] = defaultdict(set) # tracks ultimate parents hierarchy: list[tuple[int, int, float]] = [] # the output of this function + seen_components: set[frozenset[int]] = set() # track previously seen component sets for threshold in probs: # Get current probability rows @@ -228,6 +257,13 @@ def component_to_hierarchy( if len(children) <= 2: continue # Skip pairs already handled by pairwise probabilities + # Skip if we've seen this exact component before + frozen_children = frozenset(children) + if frozen_children in seen_components: + continue + + seen_components.add(frozen_children) + parent = hash_func(*children) prev_roots: set[int] = set() for child in children: @@ -261,7 +297,6 @@ def to_hierarchical_clusters( probabilities: Arrow table with columns ['component', 'left', 'right', 'probability'] proc_func: Function to process each component - dtype: Arrow data type for parent/child columns timeout: Maximum seconds to wait for each component to process Returns: @@ -317,7 +352,9 @@ def to_hierarchical_clusters( with ProcessPoolExecutor(max_workers=n_cores) as executor: futures = [ - executor.submit(proc_func, component_table, dtype, hash_func) + executor.submit( + proc_func, component_table, hash_func=hash_func, dtype=dtype + ) for component_table in component_tables ] diff --git a/src/matchbox/server/postgresql/README.md b/src/matchbox/server/postgresql/README.md index c4183f18..72542530 100644 --- a/src/matchbox/server/postgresql/README.md +++ b/src/matchbox/server/postgresql/README.md @@ -32,7 +32,7 @@ erDiagram bigint child PK,FK } Probabilities { - bigint model PK,FK + bigint resolution PK,FK bigint cluster PK,FK float probability } diff --git a/src/matchbox/server/postgresql/benchmark/__init__.py b/src/matchbox/server/postgresql/benchmark/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/matchbox/server/postgresql/benchmark/cluster_pipeline.py b/src/matchbox/server/postgresql/benchmark/cluster_pipeline.py new file mode 100644 index 00000000..6843b7c5 --- /dev/null +++ b/src/matchbox/server/postgresql/benchmark/cluster_pipeline.py @@ -0,0 +1,138 @@ +import logging +import time +from contextlib import 
contextmanager +from pathlib import Path + +import pyarrow.parquet as pq +from rich.logging import RichHandler + +from matchbox.common.hash import HASH_FUNC +from matchbox.common.transform import ( + attach_components_to_probabilities, + to_hierarchical_clusters, +) + +logging.basicConfig( + level=logging.INFO, + format="%(message)s", + handlers=[RichHandler(rich_tracebacks=True)], +) +pipeline_logger = logging.getLogger("mb_pipeline") + +ROOT = Path(__file__).parent.parent + + +@contextmanager +def timer(description: str): + start = time.time() + yield + elapsed = time.time() - start + + if elapsed >= 60: + minutes = int(elapsed // 60) + seconds = elapsed % 60 + time_str = f"{minutes} min {seconds:.1f} sec" + else: + time_str = f"{elapsed:.2f} seconds" + + pipeline_logger.info(f"{description} in {time_str}") + + +INPUT_NAME = "hierarchical_cc200k" +OUTPUT_PREFIX = "large" + +if __name__ == "__main__": + with timer("Full pipeline completed"): + with timer("Read table"): + table = pq.read_table(Path.cwd() / f"data/{INPUT_NAME}.parquet") + + pipeline_logger.info(f"Processing {len(table):,} records") + + with timer("Added components"): + cc = attach_components_to_probabilities(table) + + with timer("Built hierarchical clusters"): + hierarchy = to_hierarchical_clusters(cc) + + with timer("Created output tables"): + fake_resolution_hash = HASH_FUNC( + "ceci n'est pas un model".encode("utf-8") + ).digest() + + parents_im, children_im, thresholds = ( + hierarchy.column("parent").to_numpy(), + hierarchy.column("child").to_numpy(), + hierarchy.column("probability").to_numpy(), + ) + import numpy as np + import pyarrow as pa + from pyarrow.parquet import write_table + + im_to_pos = dict() + next_int = max(max(parents_im), 0) + parents = [] + children = [] + for pim in parents_im: + if pim >= 0: + parents.append(pim) + elif pim in im_to_pos: + parents.append(im_to_pos[pim]) + else: + im_to_pos[pim] = next_int + parents.append(next_int) + next_int += 1 + + for cim in children_im: + if cim >= 0: + children.append(cim) + elif cim in im_to_pos: + children.append(im_to_pos[cim]) + else: + im_to_pos[cim] = next_int + children.append(next_int) + next_int += 1 + + unique_clusters = np.unique(parents) + + out_clusters = pa.table( + { + "id": pa.array(unique_clusters, type=pa.uint64()), + "dataset_id": pa.array( + [None] * len(unique_clusters), type=pa.uint64() + ), + "id_in_dataset": pa.array( + [None] * len(unique_clusters), type=pa.string() + ), + } + ) + + out_contains = pa.table( + { + "parent": pa.array(parents, type=pa.uint64()), + "child": pa.array(children, type=pa.uint64()), + } + ) + + out_probabilities = pa.table( + { + "model": pa.array( + [fake_resolution_hash] * len(parents), type=pa.binary() + ), + "cluster": pa.array(parents, type=pa.uint64()), + "probability": pa.array(thresholds, type=pa.uint64()), + } + ) + + write_table( + out_clusters, + Path.cwd() / "data" / f"{OUTPUT_PREFIX}_ingest_clusters.parquet", + ) + + write_table( + out_contains, Path.cwd() / "data" / f"{OUTPUT_PREFIX}_contains.parquet" + ) + + write_table( + out_probabilities, + Path.cwd() / "data" / f"{OUTPUT_PREFIX}_ingest_probabilities.parquet", + ) diff --git a/src/matchbox/server/postgresql/benchmark/generate_tables.py b/src/matchbox/server/postgresql/benchmark/generate_tables.py new file mode 100644 index 00000000..c18dadb1 --- /dev/null +++ b/src/matchbox/server/postgresql/benchmark/generate_tables.py @@ -0,0 +1,407 @@ +import json +from pathlib import Path +from typing import Iterable + +import click +import pyarrow as 
pa +import pyarrow.parquet as pq + +from matchbox.common.factories import generate_dummy_probabilities +from matchbox.common.hash import HASH_FUNC +from matchbox.common.transform import ( + attach_components_to_probabilities, + to_hierarchical_clusters, +) + + +class IDCreator: + """ + A generator of incremental integer IDs from positive and negative integers. + + Positive integers will be returned as they are, while a new ID will be generated + for each negative integer. + """ + + def __init__(self, start: int): + self.id_map = dict() + self._next_int = start + + def create(self, temp_ids: list[int]) -> list[int]: + results = [] + for ti in temp_ids: + if ti >= 0: + results.append(ti) + elif ti in self.id_map: + results.append(self.id_map[ti]) + else: + self.id_map[ti] = self._next_int + results.append(self._next_int) + self._next_int += 1 + + return results + + def reset_mapping(self): + self.__init__(self._next_int) + + return self + + +def _hash_list_int(li: list[int]) -> list[bytes]: + return [HASH_FUNC(str(i).encode("utf-8")).digest() for i in li] + + +def _unique_clusters( + all_parents: Iterable[int], all_probabilities: Iterable[int] +) -> tuple[list[int], list[float]]: + ll = set() + clusters = [] + probabilities = [] + for parent, prob in zip(all_parents, all_probabilities, strict=True): + if parent in ll: + continue + else: + ll.add(parent) + clusters.append(parent) + probabilities.append(prob / 100) + return clusters, probabilities + + +def generate_sources() -> pa.Table: + """ + Generate sources table. + + Returns: + PyArrow sources table + """ + sources_resolution_id = [1, 2] + sources_alias = ["alias1", "alias2"] + sources_schema = ["dbt", "dbt"] + sources_table = ["companies_house", "hmrc_exporters"] + sources_id = ["company_number", "id"] + sources_indices = [ + { + "literal": ["col1", "col2", "col3"], + "alias": ["col1", "col2", "col3"], + }, + { + "literal": ["col1", "col2", "col3"], + "alias": ["col1", "col2", "col3"], + }, + ] + sources_indices = [json.dumps(si) for si in sources_indices] + return pa.table( + { + "resolution_id": pa.array(sources_resolution_id, type=pa.uint64()), + "alias": pa.array(sources_alias, type=pa.string()), + "schema": pa.array(sources_schema, type=pa.string()), + "table": pa.array(sources_table, type=pa.string()), + "id": pa.array(sources_id, type=pa.string()), + "indices": pa.array(sources_indices, type=pa.string()), + } + ) + + +def generate_resolutions() -> pa.Table: + """ + Generate resolutions table. + + Returns: + PyArrow resolutions table + """ + resolutions_resolution_id = [1, 2, 3, 4, 5] + resolutions_name = ["source1", "source2", "dedupe1", "dedupe2", "link"] + resolutions_resolution_hash = [ + HASH_FUNC(rid.encode("utf-8")).digest() for rid in resolutions_name + ] + resolutions_type = ["dataset", "dataset", "model", "model", "model"] + resolutions_float = [None, None, 0.8, 0.8, 0.9] + + return pa.table( + { + "resolution_id": pa.array(resolutions_resolution_id, type=pa.uint64()), + "resolution_hash": pa.array(resolutions_resolution_hash, type=pa.binary()), + "type": pa.array(resolutions_type, type=pa.string()), + "name": pa.array(resolutions_name, type=pa.string()), + "description": pa.array(resolutions_name, type=pa.string()), + "truth": pa.array(resolutions_float, type=pa.float64()), + } + ) + + +def generate_resolution_from() -> pa.Table: + """ + Generate resolution_from table. 
+ + Returns: + PyArrow resolution_from table + """ + # 1 and 2 are sources; 3 and 4 are dedupers; 5 is a linker + resolution_parent = [1, 1, 3, 2, 2, 4] + resolution_child = [3, 5, 5, 4, 5, 5] + resolution_level = [1, 2, 1, 1, 2, 1] + resolution_truth_cache = [None, None, 0.7, None, None, 0.7] + + return pa.table( + { + "parent": pa.array(resolution_parent, type=pa.uint64()), + "child": pa.array(resolution_child, type=pa.uint64()), + "level": pa.array(resolution_level, type=pa.uint32()), + "truth_cache": pa.array(resolution_truth_cache, type=pa.float64()), + } + ) + + +def generate_cluster_source(range_left: int, range_right: int) -> pa.Table: + """ + Generate cluster table containing rows for source rows. + + Args: + range_left: first ID to generate + range_right: last ID to generate, plus one + Returns: + PyArrow cluster table + """ + + def create_source_pk(li: list[int]) -> list[list[str]]: + return [[str(i)] for i in li] + + source = list(range(range_left, range_right)) + + return pa.table( + { + "cluster_id": pa.array(source, type=pa.uint64()), + "cluster_hash": pa.array(_hash_list_int(source), type=pa.binary()), + "dataset": pa.array([1] * len(source), type=pa.uint64()), + "source_pk": pa.array(create_source_pk(source), type=pa.list_(pa.string())), + } + ) + + +def generate_result_tables( + left_ids: Iterable[int], + right_ids: Iterable[int] | None, + resolution_id: int, + id_creator: IDCreator, + n_components: int, + n_probs: int, + prob_min: float = 0.6, + prob_max: float = 1, +) -> tuple[list[int], pa.Table, pa.Table, pa.Table]: + """ + Generate probabilities, contains and clusters tables. + + Args: + left_ids: list of IDs for rows to dedupe, or for left rows to link + right_ids: list of IDs for right rows to link + resolution_id: ID of resolution for this dedupe or link model + id_creator: an IDCreator instance + n_components: number of implied connected components + n_probs: total number of probability edges to be generated + prob_min: minimum value for probabilities to be generated + prob_max: maximum value for probabilities to be generated + + Returns: + Tuple with 1 list of top-level clusters and 3 PyArrow tables, for probabilities, + contains and clusters + """ + probs = generate_dummy_probabilities( + left_ids, right_ids, [prob_min, prob_max], n_components, n_probs + ) + + clusters = to_hierarchical_clusters(attach_components_to_probabilities(probs)) + + indexed_parents = id_creator.create(clusters["parent"].to_pylist()) + indexed_children = id_creator.create(clusters["child"].to_pylist()) + + final_clusters, final_probs = _unique_clusters( + indexed_parents, clusters["probability"].to_numpy() + ) + + source_entries = left_ids if right_ids is None else left_ids + right_ids + set_children = set(indexed_children) + top_clusters = [c for c in final_clusters + source_entries if c not in set_children] + + probabilities_table = pa.table( + { + "resolution": pa.array( + [resolution_id] * len(final_clusters), type=pa.uint64() + ), + "cluster": pa.array(final_clusters, type=pa.uint64()), + "probability": pa.array(final_probs, type=pa.float64()), + } + ) + + contains_table = pa.table( + { + "parent": pa.array(indexed_parents, type=pa.uint64()), + "child": pa.array(indexed_children, type=pa.uint64()), + } + ) + + clusters_table = pa.table( + { + "cluster_id": pa.array(final_clusters, type=pa.uint64()), + "cluster_hash": pa.array(_hash_list_int(final_clusters), type=pa.binary()), + "dataset": pa.array([None] * len(final_clusters), type=pa.uint64()), + "source_pk": pa.array( + [None] * 
len(final_clusters), type=pa.list_(pa.string()) + ), + } + ) + + return (top_clusters, probabilities_table, contains_table, clusters_table) + + +def generate_all_tables( + source_len: int, + dedupe_components: int, + dedupe_len: int, + link_components: int, + link_len: int, +) -> dict[str, pa.Table]: + """ + Make all 6 backend tables. It will create two sources, one deduper for each, + and one linker from each deduper. + + Args: + source_len: length of each data source + dedupe_components: number of connected components implied by each deduper + dedupe_len: probabilities generated by each deduper + link_components: number of connected components implied by each linker + link_len: probabilities generated by each linker + Returns: + A dictionary where keys are table names and values are PyArrow tables + """ + resolutions = generate_resolutions() + resolution_from = generate_resolution_from() + sources = generate_sources() + + clusters_source1 = generate_cluster_source(0, source_len) + clusters_source2 = generate_cluster_source(source_len, source_len * 2) + + id_creator = IDCreator(source_len * 2) + top_clusters1, probabilities_dedupe1, contains_dedupe1, clusters_dedupe1 = ( + generate_result_tables( + clusters_source1["cluster_id"].to_pylist(), + None, + 3, + id_creator, + dedupe_components, + dedupe_len, + ) + ) + + top_clusters2, probabilities_dedupe2, contains_dedupe2, clusters_dedupe2 = ( + generate_result_tables( + clusters_source2["cluster_id"].to_pylist(), + None, + 4, + id_creator.reset_mapping(), + dedupe_components, + dedupe_len, + ) + ) + + _, probabilities_link, contains_link, clusters_link = generate_result_tables( + top_clusters1, + top_clusters2, + 5, + id_creator.reset_mapping(), + link_components, + link_len, + ) + + probabilities = pa.concat_tables( + [probabilities_dedupe1, probabilities_dedupe2, probabilities_link] + ) + contains = pa.concat_tables([contains_dedupe1, contains_dedupe2, contains_link]) + clusters = pa.concat_tables( + [ + clusters_source1, + clusters_source2, + clusters_dedupe1, + clusters_dedupe2, + clusters_link, + ] + ) + + return { + "resolutions": resolutions, + "resolution_from": resolution_from, + "sources": sources, + "probabilities": probabilities, + "contains": contains, + "clusters": clusters, + } + + +@click.command() +@click.option("-s", "--settings", type=str, required=True) +@click.option("-o", "--output_dir", type=click.Path(exists=True, path_type=Path)) +def main(settings, output_dir): + PRESETS = { + "xs": { + "source_len": 10_000, + "dedupe_components": 8000, + "dedupe_len": 2000, + "link_components": 6000, + "link_len": 10_000, + }, + "s": { + "source_len": 100_000, + "dedupe_components": 80_000, + "dedupe_len": 20_000, + "link_components": 60_000, + "link_len": 100_000, + }, + "m": { + "source_len": 1_000_000, + "dedupe_components": 800_000, + "dedupe_len": 200_000, + "link_components": 600_000, + "link_len": 1_000_000, + }, + "l": { + "source_len": 10_000_000, + "dedupe_components": 8_000_000, + "dedupe_len": 2_000_000, + "link_components": 6_000_000, + "link_len": 10_000_000, + }, + "xl": { + "source_len": 100_000_000, + "dedupe_components": 80_000_000, + "dedupe_len": 20_000_000, + "link_components": 60_000_000, + "link_len": 100_000_000, + }, + } + + if not output_dir: + output_dir = Path.cwd() / "data" / "all_tables" + if settings not in PRESETS: + raise ValueError(f"Settings {settings} are invalid") + + config = PRESETS[settings] + source_len = config["source_len"] + dedupe_components = config["dedupe_components"] + dedupe_len = 
config["dedupe_len"] + link_len = config["link_len"] + link_components = config["link_components"] + + all_tables = generate_all_tables( + source_len=source_len, + dedupe_components=dedupe_components, + dedupe_len=dedupe_len, + link_components=link_components, + link_len=link_len, + ) + + output_dir /= settings + output_dir.mkdir(parents=True, exist_ok=True) + for name, table in all_tables.items(): + pq.write_table(table, output_dir / f"{name}.parquet") + + +if __name__ == "__main__": + main() diff --git a/src/matchbox/server/postgresql/benchmark/init_schema.py b/src/matchbox/server/postgresql/benchmark/init_schema.py new file mode 100644 index 00000000..b6b3a177 --- /dev/null +++ b/src/matchbox/server/postgresql/benchmark/init_schema.py @@ -0,0 +1,50 @@ +from textwrap import dedent + +from sqlalchemy.dialects import postgresql +from sqlalchemy.schema import CreateTable + +from matchbox.server.postgresql.db import MBDB +from matchbox.server.postgresql.orm import ( + Clusters, + Contains, + Probabilities, + ResolutionFrom, + Resolutions, + Sources, +) + + +def empty_schema() -> str: + schema = MBDB.MatchboxBase.metadata.schema + sql = dedent(f""" + DROP SCHEMA IF EXISTS {schema} CASCADE; + CREATE SCHEMA {schema}; + """) + + return sql + + +def create_tables() -> str: + sql = "" + # Order matters + for table_class in ( + Resolutions, + ResolutionFrom, + Sources, + Clusters, + Contains, + Probabilities, + ): + sql += ( + str( + CreateTable(table_class.__table__).compile(dialect=postgresql.dialect()) + ) + + "; \n" + ) + + return sql + + +if __name__ == "__main__": + print(empty_schema()) + print(create_tables()) diff --git a/test/common/test_factories.py b/test/common/test_factories.py new file mode 100644 index 00000000..e48aa3e3 --- /dev/null +++ b/test/common/test_factories.py @@ -0,0 +1,181 @@ +from typing import Any + +import numpy as np +import pyarrow.compute as pc +import pytest + +from matchbox.common.factories import ( + calculate_min_max_edges, + generate_dummy_probabilities, + verify_components, +) + + +@pytest.mark.parametrize( + ("left_n", "right_n", "n_components", "true_min", "true_max"), + [ + (10, None, 2, 8, 20), + (11, None, 2, 9, 25), + (9, 9, 3, 15, 27), + (8, 4, 3, 9, 11), + (4, 8, 3, 9, 11), + (8, 8, 3, 13, 22), + ], + ids=[ + "dedupe_no_mod", + "dedup_mod", + "link_no_mod", + "link_left_mod", + "link_right_mod", + "link_same_mod", + ], +) +def test_calculate_min_max_edges( + left_n: int, right_n: int | None, n_components: int, true_min: int, true_max: int +): + deduplicate = False + if not right_n: + deduplicate = True + right_n = left_n + min_edges, max_edges = calculate_min_max_edges( + left_n, right_n, n_components, deduplicate + ) + + assert true_min == min_edges + assert true_max == max_edges + + +@pytest.mark.parametrize( + ("parameters"), + [ + { + "left_count": 5, + "right_count": None, + "prob_range": (0.6, 0.8), + "num_components": 3, + "total_rows": 2, + }, + { + "left_count": 1000, + "right_count": None, + "prob_range": (0.6, 0.8), + "num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, True)[0], + }, + { + "left_count": 1_000, + "right_count": None, + "prob_range": (0.6, 0.8), + "num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, True)[1], + }, + { + "left_count": 1_000, + "right_count": 1_000, + "prob_range": (0.6, 0.8), + "num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, False)[0], + }, + { + "left_count": 1_000, + "right_count": 1_000, + "prob_range": (0.6, 0.8), + 
"num_components": 10, + "total_rows": calculate_min_max_edges(1000, 1000, 10, False)[1], + }, + ], + ids=[ + "dedupe_no_edges", + "dedupe_min", + "dedupe_max", + "link_min", + "link_max", + ], +) +def test_generate_dummy_probabilities(parameters: dict[str, Any]): + len_left = parameters["left_count"] + len_right = parameters["right_count"] + if len_right: + total_len = len_left + len_right + len_right = parameters["right_count"] + rand_vals = np.random.choice(a=total_len, replace=False, size=total_len) + left_values = list(rand_vals[:len_left]) + right_values = list(rand_vals[len_left:]) + else: + rand_vals = np.random.choice(a=len_left, replace=False, size=len_left) + left_values = list(rand_vals[:len_left]) + right_values = None + + n_components = parameters["num_components"] + total_rows = parameters["total_rows"] + + probabilities = generate_dummy_probabilities( + left_values=left_values, + right_values=right_values, + prob_range=parameters["prob_range"], + num_components=n_components, + total_rows=total_rows, + ) + report = verify_components(table=probabilities, all_nodes=rand_vals) + p_left = probabilities["left"].to_pylist() + p_right = probabilities["right"].to_pylist() + + assert report["num_components"] == n_components + + # Link job + if right_values: + assert set(p_left) <= set(left_values) + assert set(p_right) <= set(right_values) + # Dedupe + else: + assert set(p_left) | set(p_right) <= set(left_values) + + assert ( + pc.max(probabilities["probability"]).as_py() / 100 + <= parameters["prob_range"][1] + ) + assert ( + pc.min(probabilities["probability"]).as_py() / 100 + >= parameters["prob_range"][0] + ) + + assert len(probabilities) == total_rows + + edges = zip(p_left, p_right, strict=True) + edges_set = {tuple(sorted(e)) for e in edges} + assert len(edges_set) == total_rows + + self_references = [e for e in edges if e[0] == e[1]] + assert len(self_references) == 0 + + +@pytest.mark.parametrize( + ("parameters"), + [ + { + "left_range": (0, 10_000), + "right_range": (10_000, 20_000), + "num_components": 2, + "total_rows": 1, + }, + { + "left_range": (0, 10), + "right_range": (10, 20), + "num_components": 2, + "total_rows": 8_000, + }, + ], + ids=["lower_than_min", "higher_than_max"], +) +def test_generate_dummy_probabilities_errors(parameters: dict[str, Any]): + left_values = range(*parameters["left_range"]) + right_values = range(*parameters["right_range"]) + + with pytest.raises(ValueError): + generate_dummy_probabilities( + left_values=left_values, + right_values=right_values, + prob_range=(0.6, 0.8), + num_components=parameters["num_components"], + total_rows=parameters["total_rows"], + ) diff --git a/test/client/test_transform.py b/test/common/test_transform.py similarity index 84% rename from test/client/test_transform.py rename to test/common/test_transform.py index 6b79d25c..88b46273 100644 --- a/test/client/test_transform.py +++ b/test/common/test_transform.py @@ -6,18 +6,19 @@ import pyarrow.compute as pc import pytest +from matchbox.common.factories import generate_dummy_probabilities from matchbox.common.transform import ( attach_components_to_probabilities, component_to_hierarchy, to_hierarchical_clusters, ) -from test.fixtures.factories import generate_dummy_probabilities, verify_components @lru_cache(maxsize=None) def _combine_strings(*n: str) -> str: """ - Combine n strings into a single string. + Combine n strings into a single string, with a cache. 
+ Meant to replace `matchbox.common.hash.IntMap.index` Args: *args: Variable number of strings to combine @@ -29,45 +30,6 @@ def _combine_strings(*n: str) -> str: return "".join(sorted(letters)) -@pytest.mark.parametrize( - ("parameters"), - [ - { - "left_range": (0, 1_000), - "right_range": (1_000, 2_000), - "prob_range": (0.6, 0.8), - "num_components": 10, - "total_rows": 100_000, - }, - ], - ids=["simple"], -) -def test_probabilities_factory(parameters: dict[str, Any]): - left_values = range(*parameters["left_range"]) - right_values = range(*parameters["right_range"]) - - probabilities = generate_dummy_probabilities( - left_values=left_values, - right_values=right_values, - prob_range=parameters["prob_range"], - num_components=parameters["num_components"], - total_rows=parameters["total_rows"], - ) - report = verify_components(table=probabilities) - - assert report["num_components"] == parameters["num_components"] - assert set(pc.unique(probabilities["left"]).to_pylist()) == set(left_values) - assert set(pc.unique(probabilities["right"]).to_pylist()) == set(right_values) - assert ( - pc.max(probabilities["probability"]).as_py() / 100 - <= parameters["prob_range"][1] - ) - assert ( - pc.min(probabilities["probability"]).as_py() / 100 - >= parameters["prob_range"][0] - ) - - @pytest.mark.parametrize( ("parameters"), [ @@ -166,8 +128,27 @@ def test_empty_attach_components_to_probabilities(): ("xy", "y", 90), }, ), + # Test case 4: A component larger than two remains unchanged + # at a successive threshold + ( + { + "left": ["x", "y", "a"], + "right": ["y", "z", "b"], + "probability": [90, 90, 85], + }, + { + ("xy", "x", 90), + ("xy", "y", 90), + ("yz", "y", 90), + ("yz", "z", 90), + ("xyz", "xy", 90), + ("xyz", "yz", 90), + ("ab", "a", 85), + ("ab", "b", 85), + }, + ), ], - ids=["equal", "asymmetric", "single"], + ids=["equal", "asymmetric", "single", "unchanged"], ) def test_component_to_hierarchy( probabilities: dict[str, list[str | float]], hierarchy: set[tuple[str, str, int]] diff --git a/test/fixtures/factories.py b/test/fixtures/factories.py deleted file mode 100644 index 612f6e2b..00000000 --- a/test/fixtures/factories.py +++ /dev/null @@ -1,187 +0,0 @@ -from collections import Counter - -import numpy as np -import pyarrow as pa -import rustworkx as rx - - -def verify_components(table) -> dict: - """ - Fast verification of connected components using rustworkx. - - Args: - table: PyArrow table with 'left', 'right' columns - - Returns: - dictionary containing basic component statistics - """ - graph = rx.PyGraph() - - unique_nodes = set(table["left"].to_numpy()) | set(table["right"].to_numpy()) - graph.add_nodes_from(range(len(unique_nodes))) - - node_to_idx = {node: idx for idx, node in enumerate(unique_nodes)} - edges = [ - (node_to_idx[left], node_to_idx[right]) - for left, right in zip( - table["left"].to_numpy(), - table["right"].to_numpy(), - strict=False, - ) - ] - - graph.add_edges_from_no_data(edges) - - components = rx.connected_components(graph) - component_sizes = Counter(len(component) for component in components) - - return { - "num_components": len(components), - "total_nodes": len(unique_nodes), - "total_edges": len(edges), - "component_sizes": component_sizes, - "min_component_size": min(component_sizes.keys()), - "max_component_size": max(component_sizes.keys()), - } - - -def _calculate_max_possible_edges(n_nodes: int, num_components: int) -> int: - """ - Calculate the max possible number of edges given n nodes split into k components. 
- - Args: - n_nodes: Total number of nodes - num_components: Number of components to split into - - Returns: - Maximum possible number of edges - """ - nodes_per_component = n_nodes // num_components - max_edges_per_component = ( - nodes_per_component * nodes_per_component - ) # Complete bipartite graph - return max_edges_per_component * num_components - - -def _split_values_into_components( - values: list[int], num_components: int -) -> list[np.ndarray]: - """ - Split values into non-overlapping groups for each component. - - Args: - values: List of values to split - num_components: Number of components to create - - Returns: - List of arrays, one for each component - """ - values = np.array(values) - np.random.shuffle(values) - return np.array_split(values, num_components) - - -def generate_dummy_probabilities( - left_values: list[int], - right_values: list[int], - prob_range: tuple[float, float], - num_components: int, - total_rows: int, -) -> pa.Table: - """ - Generate dummy Arrow probabilities data with guaranteed isolated components. - - Args: - left_values: List of integers to use for left column - right_values: List of integers to use for right column - prob_range: Tuple of (min_prob, max_prob) to constrain probabilities - num_components: Number of distinct connected components to generate - total_rows: Total number of rows to generate - - Returns: - PyArrow Table with 'left', 'right', and 'probability' columns - """ - # Validate inputs - if len(left_values) < 2 or len(right_values) < 2: - raise ValueError("Need at least 2 possible values for both left and right") - if num_components > min(len(left_values), len(right_values)): - raise ValueError( - "Cannot have more components than minimum of left/right values" - ) - - min_nodes = min(len(left_values), len(right_values)) - max_possible_edges = _calculate_max_possible_edges(min_nodes, num_components) - - if total_rows > max_possible_edges: - raise ValueError( - f"Cannot generate {total_rows:,} edges with {num_components:,} components. " - f"Max possible edges is {max_possible_edges:,} given {min_nodes:,} nodes. " - "Either increase the number of nodes, decrease the number of components, " - "or decrease the total edges requested." 
- ) - - # Convert probability range to integers (60-80 for 0.60-0.80) - prob_min = int(prob_range[0] * 100) - prob_max = int(prob_range[1] * 100) - - # Split values into completely separate groups for each component - left_components = _split_values_into_components(left_values, num_components) - right_components = _split_values_into_components(right_values, num_components) - - # Calculate base number of edges per component - base_edges_per_component = total_rows // num_components - remaining_edges = total_rows % num_components - - all_edges = [] - - # Generate edges for each component - for comp_idx in range(num_components): - comp_left_values = left_components[comp_idx] - comp_right_values = right_components[comp_idx] - - # Calculate edges for this component - edges_in_component = base_edges_per_component - if comp_idx < remaining_edges: # Distribute remaining edges - edges_in_component += 1 - - # Ensure basic connectivity within the component - base_edges = [] - - # Create a spanning tree-like structure - for i in range(len(comp_left_values)): - base_edges.append( - ( - comp_left_values[i], - comp_right_values[i % len(comp_right_values)], - np.random.randint(prob_min, prob_max + 1), - ) - ) - - # Generate remaining random edges strictly within this component - remaining_edges = edges_in_component - len(base_edges) - if remaining_edges > 0: - random_lefts = np.random.choice(comp_left_values, size=remaining_edges) - random_rights = np.random.choice(comp_right_values, size=remaining_edges) - random_probs = np.random.randint( - prob_min, prob_max + 1, size=remaining_edges - ) - - component_edges = base_edges + list( - zip(random_lefts, random_rights, random_probs, strict=False) - ) - else: - component_edges = base_edges - - all_edges.extend(component_edges) - - # Convert to arrays - lefts, rights, probs = zip(*all_edges, strict=False) - - # Create PyArrow arrays - left_array = pa.array(lefts, type=pa.uint64()) - right_array = pa.array(rights, type=pa.uint64()) - prob_array = pa.array(probs, type=pa.uint8()) - - return pa.table( - [left_array, right_array, prob_array], names=["left", "right", "probability"] - ) diff --git a/test/pipeline.py b/test/pipeline.py deleted file mode 100644 index 95d8b367..00000000 --- a/test/pipeline.py +++ /dev/null @@ -1,51 +0,0 @@ -import logging -import time -from contextlib import contextmanager -from pathlib import Path - -import pyarrow.parquet as pq -from rich.logging import RichHandler - -from matchbox.common.transform import ( - attach_components_to_probabilities, - to_hierarchical_clusters, -) - -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - handlers=[RichHandler(rich_tracebacks=True)], -) -pipeline_logger = logging.getLogger("mb_pipeline") - -ROOT = Path(__file__).parent.parent - - -@contextmanager -def timer(description: str): - start = time.time() - yield - elapsed = time.time() - start - - if elapsed >= 60: - minutes = int(elapsed // 60) - seconds = elapsed % 60 - time_str = f"{minutes} min {seconds:.1f} sec" - else: - time_str = f"{elapsed:.2f} seconds" - - pipeline_logger.info(f"{description} in {time_str}") - - -if __name__ == "__main__": - with timer("Full pipeline completed"): - with timer("Read table"): - table = pq.read_table(Path.cwd() / "data/hierarchical_cc20k.parquet") - - pipeline_logger.info(f"Processing {len(table):,} records") - - with timer("Added components"): - cc = attach_components_to_probabilities(table) - - with timer("Built hierarchical clusters"): - out = to_hierarchical_clusters(cc) diff --git a/uv.lock 
b/uv.lock index 07a7cb3d..a3c3f534 100644 --- a/uv.lock +++ b/uv.lock @@ -734,6 +734,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/a2/6c725958e6f135d8e5de081e69841bb2c1d84b3fc259d02eb092b8fc203a/ipython-8.27.0-py3-none-any.whl", hash = "sha256:f68b3cb8bde357a5d7adc9598d57e22a45dfbea19eb6b98286fa3b288c9cd55c", size = 818986 }, ] +[[package]] +name = "ipywidgets" +version = "8.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "comm" }, + { name = "ipython" }, + { name = "jupyterlab-widgets" }, + { name = "traitlets" }, + { name = "widgetsnbextension" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/4c/dab2a281b07596a5fc220d49827fe6c794c66f1493d7a74f1df0640f2cc5/ipywidgets-8.1.5.tar.gz", hash = "sha256:870e43b1a35656a80c18c9503bbf2d16802db1cb487eec6fab27d683381dde17", size = 116723 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/2d/9c0b76f2f9cc0ebede1b9371b6f317243028ed60b90705863d493bae622e/ipywidgets-8.1.5-py3-none-any.whl", hash = "sha256:3290f526f87ae6e77655555baba4f36681c555b8bdbbff430b70e52c34c86245", size = 139767 }, +] + [[package]] name = "jedi" version = "0.19.1" @@ -815,6 +831,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/fb/108ecd1fe961941959ad0ee4e12ee7b8b1477247f30b1fdfd83ceaf017f0/jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409", size = 28965 }, ] +[[package]] +name = "jupyterlab-widgets" +version = "3.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/59/73/fa26bbb747a9ea4fca6b01453aa22990d52ab62dd61384f1ac0dc9d4e7ba/jupyterlab_widgets-3.0.13.tar.gz", hash = "sha256:a2966d385328c1942b683a8cd96b89b8dd82c8b8f81dda902bb2bc06d46f5bed", size = 203556 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/93/858e87edc634d628e5d752ba944c2833133a28fa87bb093e6832ced36a3e/jupyterlab_widgets-3.0.13-py3-none-any.whl", hash = "sha256:e3cda2c233ce144192f1e29914ad522b2f4c40e77214b0cc97377ca3d323db54", size = 214392 }, +] + [[package]] name = "kiwisolver" version = "1.4.7" @@ -944,6 +969,7 @@ server = [ dev = [ { name = "docker" }, { name = "ipykernel" }, + { name = "ipywidgets" }, { name = "pre-commit" }, { name = "pytest" }, { name = "pytest-cov" }, @@ -982,6 +1008,7 @@ requires-dist = [ dev = [ { name = "docker", specifier = ">=7.1.0" }, { name = "ipykernel", specifier = ">=6.29.5" }, + { name = "ipywidgets", specifier = ">=8.1.5" }, { name = "pre-commit", specifier = ">=3.8.0" }, { name = "pytest", specifier = ">=8.3.3" }, { name = "pytest-cov", specifier = ">=5.0.0" }, @@ -1461,6 +1488,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/39/5a9a229bb5414abeb86e33b8fc8143ab0aecce5a7f698a53e31367d30caa/psycopg2-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:0435034157049f6846e95103bd8f5a668788dd913a7c30162ca9503fdf542cb4", size = 1163736 }, { url = "https://files.pythonhosted.org/packages/3d/16/4623fad6076448df21c1a870c93a9774ad8a7b4dd1660223b59082dd8fec/psycopg2-2.9.10-cp312-cp312-win32.whl", hash = "sha256:65a63d7ab0e067e2cdb3cf266de39663203d38d6a8ed97f5ca0cb315c73fe067", size = 1025113 }, { url = "https://files.pythonhosted.org/packages/66/de/baed128ae0fc07460d9399d82e631ea31a1f171c0c4ae18f9808ac6759e3/psycopg2-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:4a579d6243da40a7b3182e0430493dbd55950c493d8c68f4eec0b302f6bbf20e", size = 1163951 }, + { url = 
"https://files.pythonhosted.org/packages/ae/49/a6cfc94a9c483b1fa401fbcb23aca7892f60c7269c5ffa2ac408364f80dc/psycopg2-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:91fd603a2155da8d0cfcdbf8ab24a2d54bca72795b90d2a3ed2b6da8d979dee2", size = 2569060 }, ] [[package]] @@ -2346,6 +2374,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/0b/c7e5d11020242984d9d37990310520ed663b942333b83a033c2f20191113/websockets-14.1-py3-none-any.whl", hash = "sha256:4d4fc827a20abe6d544a119896f6b78ee13fe81cbfef416f3f2ddf09a03f0e2e", size = 156277 }, ] +[[package]] +name = "widgetsnbextension" +version = "4.0.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/56/fc/238c424fd7f4ebb25f8b1da9a934a3ad7c848286732ae04263661eb0fc03/widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6", size = 1164730 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/21/02/88b65cc394961a60c43c70517066b6b679738caf78506a5da7b88ffcb643/widgetsnbextension-4.0.13-py3-none-any.whl", hash = "sha256:74b2692e8500525cc38c2b877236ba51d34541e6385eeed5aec15a70f88a6c71", size = 2335872 }, +] + [[package]] name = "wrapt" version = "1.17.0"