Skip to content
Permalink

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks, or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: harsha-simhadri/big-ann-benchmarks
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: main
Choose a base ref
...
head repository: DmitryKey/big-ann-benchmarks
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: main
Choose a head ref
Can’t automatically merge. Don’t worry, you can still create the pull request.
  • 6 commits
  • 3 files changed
  • 2 contributors

Commits on Sep 27, 2021

  1. #1 T1 FAISS customized for own quantized dataset

    Ubuntu committed Sep 27, 2021
    Copy the full SHA
    c7acd84 View commit details
  2. Copy the full SHA
    920e6f6 View commit details

Commits on Oct 1, 2021

  1. Copy the full SHA
    adc134f View commit details

Commits on Oct 5, 2021

  1. Copy the full SHA
    cc1d327 View commit details

Commits on Oct 22, 2021

  1. Copy the full SHA
    76d145d View commit details

Commits on Oct 31, 2021

  1. Copy the full SHA
    03bd0b2 View commit details
Showing with 77 additions and 0 deletions.
  1. +29 −0 benchmark/datasets.py
  2. +37 −0 run_t1_faiss_baseline_eval.sh
  3. +11 −0 run_t1_faiss_baseline_index.sh
29 changes: 29 additions & 0 deletions benchmark/datasets.py
Original file line number Diff line number Diff line change
@@ -294,6 +294,7 @@ def get_dataset_iterator(self, bs=512, split=(1,0)):
i0, i1 = self.nb * rank // nsplit, self.nb * (rank + 1) // nsplit
filename = self.get_dataset_fn()
x = xbin_mmap(filename, dtype=self.dtype, maxn=self.nb)
print(f"x.shape={x.shape} self.nb={self.nb} self.d={self.d}")
assert x.shape == (self.nb, self.d)
for j0 in range(i0, i1, bs):
j1 = min(j0 + bs, i1)
@@ -385,6 +386,33 @@ def __init__(self, nb_M=1000):
def distance(self):
return "euclidean"


class BigANNDimReducedDataset(DatasetCompetitionFormat):
    """BigANN dataset whose base vectors have been dimensionality-reduced to 32 components.

    The on-disk file names keep the original BigANN uint8 naming
    (``*.u8bin``), but — per the original in-code notes — the reduced
    vectors are indexed as float32, so the dtype differs from the raw
    BigANN dataset.
    """

    def __init__(self, nb_M=1000, dtype="float32"):
        """
        Args:
            nb_M: database size in millions of vectors (1000, 100 or 10).
            dtype: element type used when memory-mapping the base file.
                Use ``"float32"`` for indexing the dimensionality-reduced
                dataset; use ``"uint8"`` for search over an index built
                from it.  (Previously this was switched by commenting and
                uncommenting an assignment; it is now a parameter with the
                same default behavior.)
        """
        self.nb_M = nb_M
        self.nb = 10**6 * nb_M
        self.d = 32  # reduced dimensionality
        self.nq = 10000
        self.dtype = dtype
        self.ds_fn = "base.1B.u8bin"
        self.qs_fn = "query.public.10K.u8bin"
        # Ground truth: local file for the full 1B set, hosted subset files
        # for the 100M/10M slices, None for any other size.
        if nb_M == 1000:
            self.gt_fn = "GT.public.1B.ibin"
        elif nb_M == 100:
            self.gt_fn = subset_url + "GT_100M/bigann-100M"
        elif nb_M == 10:
            self.gt_fn = subset_url + "GT_10M/bigann-10M"
        else:
            self.gt_fn = None
        self.base_url = "https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/bigann/"
        self.basedir = os.path.join(BASEDIR, "bigann")

    def distance(self):
        """Distance metric used by this dataset."""
        return "euclidean"


class Deep1BDataset(DatasetCompetitionFormat):
def __init__(self, nb_M=1000):
self.nb_M = nb_M
@@ -616,6 +644,7 @@ def default_count(self):
'bigann-1B': lambda : BigANNDataset(1000),
'bigann-100M': lambda : BigANNDataset(100),
'bigann-10M': lambda : BigANNDataset(10),
'bigann-dim-reduced-100M': lambda: BigANNDimReducedDataset(100),

'deep-1B': lambda : Deep1BDataset(),
'deep-100M': lambda : Deep1BDataset(100),
37 changes: 37 additions & 0 deletions run_t1_faiss_baseline_eval.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/bin/bash
# Evaluate the T1 FAISS baseline on the dimensionality-reduced BigANN 100M
# dataset across a sweep of IVF search-time parameters.
set -eu

# One parameter combination per line; the whole (unquoted) expansion is
# passed so each combination becomes a separate --searchparams argument.
params="
nprobe=1,quantizer_efSearch=4
nprobe=2,quantizer_efSearch=4
nprobe=4,quantizer_efSearch=4
nprobe=4,quantizer_efSearch=8
nprobe=8,quantizer_efSearch=4
nprobe=8,quantizer_efSearch=8
nprobe=8,quantizer_efSearch=16
nprobe=8,quantizer_efSearch=32
nprobe=16,quantizer_efSearch=16
nprobe=16,quantizer_efSearch=32
nprobe=16,quantizer_efSearch=64
nprobe=32,quantizer_efSearch=8
nprobe=32,quantizer_efSearch=32
nprobe=32,quantizer_efSearch=64
nprobe=32,quantizer_efSearch=128
nprobe=64,quantizer_efSearch=16
nprobe=64,quantizer_efSearch=32
nprobe=64,quantizer_efSearch=64
nprobe=64,quantizer_efSearch=128
nprobe=64,quantizer_efSearch=256
nprobe=128,quantizer_efSearch=32
nprobe=128,quantizer_efSearch=64
nprobe=128,quantizer_efSearch=128
nprobe=128,quantizer_efSearch=256
nprobe=128,quantizer_efSearch=512
nprobe=256,quantizer_efSearch=64
nprobe=256,quantizer_efSearch=128
nprobe=256,quantizer_efSearch=512
nprobe=512,quantizer_efSearch=256
nprobe=512,quantizer_efSearch=512
nprobe=1024,quantizer_efSearch=256
"

# NOTE(review): this reads the index from data/track1_baseline_faiss/, but
# run_t1_faiss_baseline_index.sh writes its index to
# data/track1_baseline_faiss_dim_reduction/ — confirm which index is intended.
# $params is deliberately left unquoted so it expands to one argument per line.
python track1_baseline_faiss/baseline_faiss.py \
    --dataset bigann-dim-reduced-100M --indexfile data/track1_baseline_faiss/bigann-100M.IVF1M_2level_PQ64x4fsr.faissindex \
    --search --searchparams $params
11 changes: 11 additions & 0 deletions run_t1_faiss_baseline_index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Build the T1 FAISS baseline index for the dimensionality-reduced BigANN
# 100M dataset (OPQ + 1M-centroid IVF with HNSW quantizer + PQ64x4fsr codes).
set -eu

# -u: unbuffered output so build progress is visible in logs as it happens.
python -u track1_baseline_faiss/baseline_faiss.py --dataset bigann-dim-reduced-100M \
    --indexkey OPQ64_128,IVF1048576_HNSW32,PQ64x4fsr \
    --maxtrain 10000000 \
    --two_level_clustering \
    --build \
    --add_splits 30 \
    --indexfile data/track1_baseline_faiss_dim_reduction/bigann-100M.IVF1M_2level_PQ64x4fsr.faissindex \
    --quantizer_efConstruction 200 \
    --quantizer_add_efSearch 80