Skip to content

Commit

Permalink
#4 added dtype parameterization; added more one-liners for bigann and ssnpp datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
DmitryKey committed Oct 24, 2021
1 parent 71dcac5 commit 90790be
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 6 deletions.
25 changes: 20 additions & 5 deletions src/algorithms/sharding/kanndi/shard_by_distance.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys
from enum import Enum
from util.utils import read_bin, get_total_nvecs_fbin, add_points, Shard, display_top
from util.utils import read_bin, get_total_nvecs_fbin, add_points, Shard, display_top, read_fbin
from numpy import linalg
from statistics import median
import numpy as np
Expand Down Expand Up @@ -77,12 +77,13 @@ class DistMethod(Enum):


# objective function | loss function like in K-Means
def shard_by_dist(data_file: str, dist: float, output_index_path: str, shards_m: int = M):
def shard_by_dist(data_file: str, dist: float, output_index_path: str, dtype:np.dtype, shards_m: int = M):
tracemalloc.start()
# set of integer order ids of each point that was already placed into a shard => processed
processed_point_ids = set()
complete_shards = 0


total_num_elements = get_total_nvecs_fbin(data_file)
# dimensionality = get_total_dim_fbin(data_file)
print(f"Total number of points to process: {total_num_elements}", flush=True)
Expand All @@ -100,7 +101,7 @@ def shard_by_dist(data_file: str, dist: float, output_index_path: str, shards_m:
print("Expected shard size: {}".format(expected_shard_size), flush=True)

# get the seed point, which initially is the first point of the dataset
points = read_bin(data_file, dtype=np.uint8, start_idx=0, chunk_size=1)
points = read_bin(data_file, dtype=dtype, start_idx=0, chunk_size=1)
seed_point_id = 0
seed_point = points[seed_point_id]
print("Seed point for shard {}: {}".format(seed_point_id, seed_point), flush=True)
Expand Down Expand Up @@ -285,6 +286,7 @@ def add_shard(output_index_path, shard):
parser.add_argument('--input_file', help='input file with the multidimensional points', required=True)
parser.add_argument('--output_dir', help='where to store the index', required=True)
parser.add_argument('-M', type=int, help="expected number of shards, say 1000", required=True)
parser.add_argument('--dtype', type=str, help="dataset dtype: uint8, float32, int8", required=True)

args = parser.parse_args()
print(args)
Expand All @@ -295,9 +297,22 @@ def add_shard(output_index_path, shard):
points_file = args.input_file
output_index_path = args.output_dir
shards_number = args.M
dtype = args.dtype

# Translate the --dtype string into a numpy dtype and load the sample of
# SAMPLE_SIZE points with the reader that matches it. req_dtype is later
# handed to shard_by_dist so the sharding pass uses the same element type.
# NOTE(review): the --dtype help text also advertises int8, but no branch
# handles it here, so int8 currently ends up on the unsupported path.
if dtype == "float32":
    req_dtype = np.float32
    points = read_fbin(points_file, start_idx=0, chunk_size=SAMPLE_SIZE)
elif dtype == "uint8":
    req_dtype = np.uint8
    points = read_bin(points_file, dtype=req_dtype, start_idx=0, chunk_size=SAMPLE_SIZE)
else:
    # Report on stderr and exit non-zero so calling shell scripts can detect
    # the failure (the previous exit(0) signalled success).
    print("Unsupported data type.", file=sys.stderr)
    sys.exit(1)



points = read_bin(points_file, dtype=np.uint8, start_idx=0, chunk_size=SAMPLE_SIZE)
computed_dist_max = compute_median_dist(points)
print(f"computed {computed_dist_max}", flush=True)

shard_by_dist(points_file, computed_dist_max, output_index_path, shards_m=shards_number)
shard_by_dist(points_file, computed_dist_max, output_index_path, dtype=req_dtype, shards_m=shards_number)
18 changes: 17 additions & 1 deletion src/run_kanndi.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
# fix the util package reading issue
export PYTHONPATH=.
###
# BIGANN
###

# 100M points
#python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/bigann.bak/base.1B.u8bin.crop_nb_100000000 --output_dir /datadrive/big-ann/data -M 100
# 1B points
python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/bigann/base.1B.u8bin --output_dir /datadrive/big-ann/data.1B/ -M 1000
python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/bigann/base.1B.u8bin --output_dir /datadrive/big-ann/index/bigann/data.1B -M 100 --dtype uint8


###
# Text2Image
###
#python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/text2image1B/base.1B.fbin.crop_nb_100000000 --output_dir /datadrive/big-ann/text2image/data.1B/ -M 100

###
# SSNPP: Facebook SimSearchNet++
###
# CONVERGED
# python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/FB_ssnpp/FB_ssnpp_database.u8bin.crop_nb_100000000 --output_dir /datadrive/big-ann/index/ssnpp/data.100M/ -M 100 --dtype uint8

0 comments on commit 90790be

Please sign in to comment.