diff --git a/src/matchbox/common/transform.py b/src/matchbox/common/transform.py index 26d9a732..fc32a7ca 100644 --- a/src/matchbox/common/transform.py +++ b/src/matchbox/common/transform.py @@ -23,7 +23,7 @@ def to_clusters( results: pa.Table, - dtype: pa.DataType = pa.large_binary, + dtype: pa.DataType = pa.binary, hash_func: Callable[[*tuple[T, ...]], T] = hash_values, ) -> pa.Table: """ @@ -216,7 +216,7 @@ def get_components(self) -> list[set[T]]: def component_to_hierarchy( table: pa.Table, - dtype: pa.DataType = pa.large_binary, + dtype: pa.DataType = pa.binary, hash_func: Callable[[*tuple[T, ...]], T] = hash_values, ) -> pa.Table: """ @@ -293,7 +293,7 @@ def to_hierarchical_clusters( probabilities: pa.Table, proc_func: Callable[[pa.Table, pa.DataType], pa.Table] = component_to_hierarchy, hash_func: Callable[[*tuple[T, ...]], T] = hash_values, - dtype: pa.DataType = pa.large_binary, + dtype: pa.DataType = pa.binary, timeout: int = 300, ) -> pa.Table: """ diff --git a/src/matchbox/server/postgresql/benchmark/generate_tables.py b/src/matchbox/server/postgresql/benchmark/generate_tables.py index 5dcf237f..56c86faa 100644 --- a/src/matchbox/server/postgresql/benchmark/generate_tables.py +++ b/src/matchbox/server/postgresql/benchmark/generate_tables.py @@ -73,9 +73,7 @@ def generate_resolutions() -> pa.Table: return pa.table( { "resolution_id": pa.array(resolutions_resolution_id, type=pa.uint64()), - "resolution_hash": pa.array( - resolutions_resolution_hash, type=pa.large_binary() - ), + "resolution_hash": pa.array(resolutions_resolution_hash, type=pa.binary()), "type": pa.array(resolutions_type, type=pa.string()), "name": pa.array(resolutions_name, type=pa.string()), "description": pa.array(resolutions_name, type=pa.string()), @@ -126,7 +124,7 @@ def create_source_pk(li: list[int]) -> list[list[str]]: return pa.table( { "cluster_id": pa.array(source, type=pa.uint64()), - "cluster_hash": pa.array(_hash_list_int(source), type=pa.large_binary()), + "cluster_hash": pa.array(_hash_list_int(source), type=pa.binary()), "dataset": pa.array([1] * len(source), type=pa.uint64()), "source_pk": pa.array(create_source_pk(source), type=pa.list_(pa.string())), } @@ -172,7 +170,7 @@ def generate_result_tables( { "id": all_probs, "hash": pa.array( - [hash_data(p) for p in all_probs.to_pylist()], type=pa.large_binary() + [hash_data(p) for p in all_probs.to_pylist()], type=pa.binary() ), } ) @@ -194,7 +192,7 @@ def generate_result_tables( hierarchy = to_hierarchical_clusters( probabilities=probs_with_ccs, hash_func=hash_values, - dtype=pa.large_binary, + dtype=pa.binary, ) # Shape into tables diff --git a/src/matchbox/server/postgresql/utils/insert.py b/src/matchbox/server/postgresql/utils/insert.py index 6bb99923..9e9273e9 100644 --- a/src/matchbox/server/postgresql/utils/insert.py +++ b/src/matchbox/server/postgresql/utils/insert.py @@ -49,7 +49,7 @@ def __init__(self, start: int, lookup: pa.Table = None): self.lookup = pa.Table.from_arrays( [ pa.array([], type=pa.uint64()), - pa.array([], type=pa.large_binary()), + pa.array([], type=pa.binary()), pa.array([], type=pa.bool_()), ], names=["id", "hash", "new"], @@ -405,7 +405,7 @@ def _results_to_insert_tables( engine=engine, return_type="arrow", ) - lookup = lookup.cast(pa.schema([("hash", pa.large_binary()), ("id", pa.uint64())])) + lookup = lookup.cast(pa.schema([("hash", pa.binary()), ("id", pa.uint64())])) hm = HashIDMap(start=Clusters.next_id(), lookup=lookup) @@ -424,7 +424,7 @@ def _results_to_insert_tables( hierarchy = to_hierarchical_clusters( probabilities=probs_with_ccs, hash_func=hash_values, - dtype=pa.large_binary, + dtype=pa.binary, ) # Create Probabilities Arrow table to insert, containing all generated probabilities