Replaced large_binary with binary
wpfl-dbt committed Jan 9, 2025
1 parent 3086d64 commit f42e66a
Showing 3 changed files with 10 additions and 12 deletions.
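
A minimal sketch, assuming only stock PyArrow and nothing beyond this commit, of what the swap means: binary stores values with 32-bit offsets while large_binary uses 64-bit offsets, so fixed-width hash digests fit either layout and the two can be cast into one another.

import pyarrow as pa

digests = [b"\x00" * 32, b"\xff" * 32]

small = pa.array(digests, type=pa.binary())        # 32-bit offsets
large = pa.array(digests, type=pa.large_binary())  # 64-bit offsets

print(small.type, large.type)                       # binary large_binary
print(small.cast(pa.large_binary()).equals(large))  # True: same values, wider offsets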
6 changes: 3 additions & 3 deletions src/matchbox/common/transform.py
@@ -23,7 +23,7 @@

 def to_clusters(
     results: pa.Table,
-    dtype: pa.DataType = pa.large_binary,
+    dtype: pa.DataType = pa.binary,
     hash_func: Callable[[*tuple[T, ...]], T] = hash_values,
 ) -> pa.Table:
     """
@@ -216,7 +216,7 @@ def get_components(self) -> list[set[T]]:

 def component_to_hierarchy(
     table: pa.Table,
-    dtype: pa.DataType = pa.large_binary,
+    dtype: pa.DataType = pa.binary,
     hash_func: Callable[[*tuple[T, ...]], T] = hash_values,
 ) -> pa.Table:
     """
@@ -293,7 +293,7 @@ def to_hierarchical_clusters(
     probabilities: pa.Table,
     proc_func: Callable[[pa.Table, pa.DataType], pa.Table] = component_to_hierarchy,
     hash_func: Callable[[*tuple[T, ...]], T] = hash_values,
-    dtype: pa.DataType = pa.large_binary,
+    dtype: pa.DataType = pa.binary,
     timeout: int = 300,
 ) -> pa.Table:
     """
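dtype stays a keyword argument in all three functions, so callers that still need the old layout can pass pa.large_binary explicitly. A minimal sketch with a hypothetical helper (hash_column is illustrative only, not part of matchbox):

import pyarrow as pa

def hash_column(hashes: list[bytes], dtype: pa.DataType = pa.binary()) -> pa.Array:
    # Mirrors the new default above; the old behaviour is one keyword away.
    return pa.array(hashes, type=dtype)

print(hash_column([b"\x01" * 32]).type)                           # binary
print(hash_column([b"\x01" * 32], dtype=pa.large_binary()).type)  # large_binary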
10 changes: 4 additions & 6 deletions src/matchbox/server/postgresql/benchmark/generate_tables.py
@@ -73,9 +73,7 @@ def generate_resolutions() -> pa.Table:
     return pa.table(
         {
             "resolution_id": pa.array(resolutions_resolution_id, type=pa.uint64()),
-            "resolution_hash": pa.array(
-                resolutions_resolution_hash, type=pa.large_binary()
-            ),
+            "resolution_hash": pa.array(resolutions_resolution_hash, type=pa.binary()),
             "type": pa.array(resolutions_type, type=pa.string()),
             "name": pa.array(resolutions_name, type=pa.string()),
             "description": pa.array(resolutions_name, type=pa.string()),
@@ -126,7 +124,7 @@ def create_source_pk(li: list[int]) -> list[list[str]]:
     return pa.table(
         {
             "cluster_id": pa.array(source, type=pa.uint64()),
-            "cluster_hash": pa.array(_hash_list_int(source), type=pa.large_binary()),
+            "cluster_hash": pa.array(_hash_list_int(source), type=pa.binary()),
             "dataset": pa.array([1] * len(source), type=pa.uint64()),
             "source_pk": pa.array(create_source_pk(source), type=pa.list_(pa.string())),
         }
@@ -172,7 +170,7 @@ def generate_result_tables(
         {
             "id": all_probs,
             "hash": pa.array(
-                [hash_data(p) for p in all_probs.to_pylist()], type=pa.large_binary()
+                [hash_data(p) for p in all_probs.to_pylist()], type=pa.binary()
             ),
         }
     )
@@ -194,7 +192,7 @@ def generate_result_tables(
     hierarchy = to_hierarchical_clusters(
         probabilities=probs_with_ccs,
         hash_func=hash_values,
-        dtype=pa.large_binary,
+        dtype=pa.binary,
     )

     # Shape into tables
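The benchmark tables now hold their digests in plain binary columns. A minimal sketch of the pattern, with hashlib.sha256 standing in for hash_data (which is assumed to return bytes):

import hashlib

import pyarrow as pa

all_probs = pa.array([10, 11, 12], type=pa.uint64())

hashes = pa.table(
    {
        "id": all_probs,
        "hash": pa.array(
            [hashlib.sha256(str(p).encode()).digest() for p in all_probs.to_pylist()],
            type=pa.binary(),
        ),
    }
)
print(hashes.schema)  # id: uint64, hash: binary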
6 changes: 3 additions & 3 deletions src/matchbox/server/postgresql/utils/insert.py
@@ -49,7 +49,7 @@ def __init__(self, start: int, lookup: pa.Table = None):
         self.lookup = pa.Table.from_arrays(
             [
                 pa.array([], type=pa.uint64()),
-                pa.array([], type=pa.large_binary()),
+                pa.array([], type=pa.binary()),
                 pa.array([], type=pa.bool_()),
             ],
             names=["id", "hash", "new"],
@@ -405,7 +405,7 @@ def _results_to_insert_tables(
         engine=engine,
         return_type="arrow",
     )
-    lookup = lookup.cast(pa.schema([("hash", pa.large_binary()), ("id", pa.uint64())]))
+    lookup = lookup.cast(pa.schema([("hash", pa.binary()), ("id", pa.uint64())]))

     hm = HashIDMap(start=Clusters.next_id(), lookup=lookup)

@@ -424,7 +424,7 @@ def _results_to_insert_tables(
     hierarchy = to_hierarchical_clusters(
         probabilities=probs_with_ccs,
         hash_func=hash_values,
-        dtype=pa.large_binary,
+        dtype=pa.binary,
     )

     # Create Probabilities Arrow table to insert, containing all generated probabilities
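A lookup that was previously materialised with a large_binary hash column can be narrowed to the new schema with the same kind of cast used in _results_to_insert_tables above. A minimal sketch with hypothetical data:

import pyarrow as pa

old_lookup = pa.table(
    {
        "hash": pa.array([b"\xaa" * 32, b"\xbb" * 32], type=pa.large_binary()),
        "id": pa.array([1, 2], type=pa.uint64()),
    }
)

# Narrowing large_binary -> binary only fails if a column exceeds the 32-bit
# offset range (~2 GiB of values), which fixed-width hashes never approach.
new_lookup = old_lookup.cast(pa.schema([("hash", pa.binary()), ("id", pa.uint64())]))
print(new_lookup.schema)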
