#4 added dtype parameterization; added more one-liners for bigann and ssnpp datasets
DmitryKey · Oct 24, 2021 · 90790be · 90790be
1 parent 71dcac5
commit 90790be
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 6 deletions.
diff --git a/src/algorithms/sharding/kanndi/shard_by_distance.py b/src/algorithms/sharding/kanndi/shard_by_distance.py
@@ -1,6 +1,6 @@
 import sys
 from enum import Enum
-from util.utils import read_bin, get_total_nvecs_fbin, add_points, Shard, display_top
+from util.utils import read_bin, get_total_nvecs_fbin, add_points, Shard, display_top, read_fbin
 from numpy import linalg
 from statistics import median
 import numpy as np
@@ -77,12 +77,13 @@ class DistMethod(Enum):
 
 
 # objective function | loss function like in K-Means
-def shard_by_dist(data_file: str, dist: float, output_index_path: str, shards_m: int = M):
+def shard_by_dist(data_file: str, dist: float, output_index_path: str, dtype:np.dtype, shards_m: int = M):
     tracemalloc.start()
     # set of integer order ids of each point that was already placed into a shard => processed
     processed_point_ids = set()
     complete_shards = 0
 
+
     total_num_elements = get_total_nvecs_fbin(data_file)
     # dimensionality = get_total_dim_fbin(data_file)
     print(f"Total number of points to process: {total_num_elements}", flush=True)
@@ -100,7 +101,7 @@ def shard_by_dist(data_file: str, dist: float, output_index_path: str, shards_m:
     print("Expected shard size: {}".format(expected_shard_size), flush=True)
 
     # get the seed point, which initially is the first point of the dataset
-    points = read_bin(data_file, dtype=np.uint8, start_idx=0, chunk_size=1)
+    points = read_bin(data_file, dtype=dtype, start_idx=0, chunk_size=1)
     seed_point_id = 0
     seed_point = points[seed_point_id]
     print("Seed point for shard {}: {}".format(seed_point_id, seed_point), flush=True)
@@ -285,6 +286,7 @@ def add_shard(output_index_path, shard):
     parser.add_argument('--input_file', help='input file with the multidimensional points', required=True)
     parser.add_argument('--output_dir', help='where to store the index', required=True)
     parser.add_argument('-M', type=int, help="expected number of shards, say 1000", required=True)
+    parser.add_argument('--dtype', type=str, help="dataset dtype: uint8, float32, int8", required=True)
 
     args = parser.parse_args()
     print(args)
@@ -295,9 +297,22 @@ def add_shard(output_index_path, shard):
     points_file = args.input_file
     output_index_path = args.output_dir
     shards_number = args.M
+    dtype = args.dtype
+    req_type = None
+
+    if dtype == "float32":
+        req_dtype = np.float32
+        points = read_fbin(points_file, start_idx=0, chunk_size=SAMPLE_SIZE)
+    elif dtype == "uint8":
+        req_dtype = np.uint8
+        points = read_bin(points_file, dtype=req_dtype, start_idx=0, chunk_size=SAMPLE_SIZE)
+    else:
+        print("Unsupported data type.")
+        exit(0)
+
+
 
-    points = read_bin(points_file, dtype=np.uint8, start_idx=0, chunk_size=SAMPLE_SIZE)
     computed_dist_max = compute_median_dist(points)
     print(f"computed {computed_dist_max}", flush=True)
 
-    shard_by_dist(points_file, computed_dist_max, output_index_path, shards_m=shards_number)
+    shard_by_dist(points_file, computed_dist_max, output_index_path, dtype=req_dtype, shards_m=shards_number)
diff --git a/src/run_kanndi.sh b/src/run_kanndi.sh
@@ -1,6 +1,22 @@
 # fix the util package reading issue
 export PYTHONPATH=.
+###
+# BIGANN
+###
+
 # 100M points
 #python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/bigann.bak/base.1B.u8bin.crop_nb_100000000 --output_dir /datadrive/big-ann/data -M 100 --dtype uint8
 # 1B points
-python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/bigann/base.1B.u8bin --output_dir /datadrive/big-ann/data.1B/ -M 1000
+python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/bigann/base.1B.u8bin --output_dir /datadrive/big-ann/index/bigann/data.1B -M 100 --dtype uint8
+
+
+###
+# Text2Image
+###
+#python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/text2image1B/base.1B.fbin.crop_nb_100000000 --output_dir /datadrive/big-ann/text2image/data.1B/ -M 100 --dtype float32
+
+###
+# SSNPP: Facebook SimSearchNet++
+###
+# CONVERGED
+# python algorithms/sharding/kanndi/shard_by_distance.py --input_file /datadrive/big-ann-benchmarks/data/FB_ssnpp/FB_ssnpp_database.u8bin.crop_nb_100000000 --output_dir /datadrive/big-ann/index/ssnpp/data.100M/ -M 100 --dtype uint8