From 72b61f6c9a26b4f992a7e0e4dc0ad042ef76a7fa Mon Sep 17 00:00:00 2001
From: magdalendobson <58752279+magdalendobson@users.noreply.github.com>
Date: Sat, 28 Oct 2023 20:39:53 -0400
Subject: [PATCH] [Neurips23] ParlayANN Submission for OOD track (#186)

* initial commit

* added default alpha

* fixed bad dockerfile

* cache bust

* fixed timeout

* added additional search configs to get past .9

* one more query config

* added two pass arg

* fixing arg in diskann dockerfile

* committing to switch branches

* committing to switch branches

* committing to switch branches

* added vamana.py

* fixed issue in file detection

* finalizing before PR

* changes requested for PR

* changes for PR

* initial commit

* added two pass arg

* added default alpha

* cache bust

* added additional search configs to get past .9

* one more query config

* committing to switch branches

* committing to switch branches

* committing to switch branches

* added vamana.py

* fixed issue in file detection

* finalizing before PR

* changes requested for PR

---------

Co-authored-by: Ben Landrum <8enlandrum@gmail.com>
Co-authored-by: Magdalen Dobson
---
 .github/workflows/neurips23.yml  |   3 +
 benchmark/dataset_io.py          |   1 +
 neurips23/ood/vamana/Dockerfile  |  24 +++
 neurips23/ood/vamana/config.yaml |  54 +++++++++++
 neurips23/ood/vamana/vamana.py   | 150 +++++++++++++++++++++++++++++++
 5 files changed, 232 insertions(+)
 create mode 100644 neurips23/ood/vamana/Dockerfile
 create mode 100644 neurips23/ood/vamana/config.yaml
 create mode 100644 neurips23/ood/vamana/vamana.py

diff --git a/.github/workflows/neurips23.yml b/.github/workflows/neurips23.yml
index d0675bcf..1869062c 100644
--- a/.github/workflows/neurips23.yml
+++ b/.github/workflows/neurips23.yml
@@ -24,6 +24,9 @@ jobs:
         - algorithm: diskann
           dataset: random-xs
           track: ood
+        - algorithm: vamana
+          dataset: random-xs
+          track: ood
       fail-fast: false
 
     steps:
diff --git a/benchmark/dataset_io.py b/benchmark/dataset_io.py
index 1aeaa9c7..1ec562f8 100644
--- a/benchmark/dataset_io.py
+++ b/benchmark/dataset_io.py
@@ -16,6 +16,7 @@ def download(src, dst=None, max_size=None):
     """ download an URL, possibly cropped """
     if os.path.exists(dst):
+        print("Already exists")
         return
     print('downloading %s -> %s...'
           % (src, dst))
     if max_size is not None:
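Note: the early return added to `download` above makes repeated dataset fetches
no-ops once the destination file exists. A minimal sketch of the resulting
behavior, with a hypothetical URL and destination path:

    from benchmark.dataset_io import download

    # The first call fetches the file (cropped to max_size bytes when given);
    # the second finds the destination on disk, prints "Already exists",
    # and returns without touching the network.
    download("https://example.com/queries.fbin", "data/queries.fbin", max_size=1 << 20)
    download("https://example.com/queries.fbin", "data/queries.fbin", max_size=1 << 20)
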
diff --git a/neurips23/ood/vamana/Dockerfile b/neurips23/ood/vamana/Dockerfile
new file mode 100644
index 00000000..61ed0566
--- /dev/null
+++ b/neurips23/ood/vamana/Dockerfile
@@ -0,0 +1,24 @@
+FROM neurips23
+
+RUN apt update
+RUN apt install -y software-properties-common
+RUN add-apt-repository -y ppa:git-core/ppa
+RUN apt update
+RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10
+
+
+ARG CACHEBUST=1
+RUN git clone -b ood_v2 https://github.com/cmuparlay/ParlayANN.git && cd ParlayANN && git submodule update --init --recursive && cd python && pip install pybind11 && bash compile.sh
+# WORKDIR /home/app/ParlayANN
+# RUN git submodule update --init --recursive
+# WORKDIR /home/app/ParlayANN/python
+
+# RUN pip install pybind11
+
+# RUN bash compile.sh
+
+ENV PYTHONPATH=$PYTHONPATH:/home/app/ParlayANN/python
+
+# ENV PARLAY_NUM_THREADS=8
+
+WORKDIR /home/app
\ No newline at end of file
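Note: the image clones ParlayANN's ood_v2 branch, pulls its submodules, and
compiles the pybind11 bindings in a single cache-busted layer (bumping
CACHEBUST forces a fresh clone). The ENV line is what lets vamana.py below do
`import wrapper as pann`. A quick sanity check, assuming the build succeeded
and this is run inside the container:

    # PYTHONPATH includes /home/app/ParlayANN/python, so the compiled
    # binding module is importable directly by name.
    import wrapper as pann
    print(pann.__file__)  # expect a path under /home/app/ParlayANN/python
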
diff --git a/neurips23/ood/vamana/config.yaml b/neurips23/ood/vamana/config.yaml
new file mode 100644
index 00000000..82663028
--- /dev/null
+++ b/neurips23/ood/vamana/config.yaml
@@ -0,0 +1,54 @@
+random-xs:
+  vamana:
+    docker-tag: neurips23-ood-vamana
+    module: neurips23.ood.vamana.vamana
+    constructor: vamana
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":30, "L":50, "alpha":1.2}]
+        query-args: |
+          [{"Ls":50, "T":8}]
+text2image-10M:
+  vamana:
+    docker-tag: neurips23-ood-vamana
+    module: neurips23.ood.vamana.vamana
+    constructor: vamana
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":55, "L":500, "alpha":1.0, "two_pass":1, "use_query_data":1, "compress":1}]
+        query-args: |
+          [
+            {"Ls":70, "T":8},
+            {"Ls":80, "T":8},
+            {"Ls":90, "T":8},
+            {"Ls":95, "T":8},
+            {"Ls":100, "T":8},
+            {"Ls":105, "T":8},
+            {"Ls":110, "T":8},
+            {"Ls":120, "T":8},
+            {"Ls":125, "T":8},
+            {"Ls":150, "T":8}]
+  vamana-singlepass:
+    docker-tag: neurips23-ood-vamana
+    module: neurips23.ood.vamana.vamana
+    constructor: vamana
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":64, "L":500}]
+        query-args: |
+          [{"Ls":30, "T":8},
+            {"Ls":50, "T":8},
+            {"Ls":70, "T":8},
+            {"Ls":100, "T":8},
+            {"Ls":113, "T":8},
+            {"Ls":125, "T":8},
+            {"Ls":150, "T":8},
+            {"Ls":175, "T":8},
+            {"Ls":200, "T":8}]
+
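Note: in each run group above, every JSON object in `args` becomes the
`index_params` dict handed to the class named by `constructor`, every object
in `query-args` is passed to `set_query_arguments`, and `base-args:
["@metric"]` is expanded by the harness to the dataset's distance metric. A
hand-built sketch of that mapping for the random-xs group, bypassing the
harness and assuming random-xs is a euclidean dataset:

    from neurips23.ood.vamana.vamana import vamana

    # "@metric" resolves to the dataset metric; the dict is one `args` entry.
    algo = vamana("euclidean", {"R": 30, "L": 50, "alpha": 1.2})
    # One `query-args` entry: search list size Ls and thread count T.
    algo.set_query_arguments({"Ls": 50, "T": 8})
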
+ """ + ds = DATASETS[dataset]() + d = ds.d + + index_dir = self.create_index_dir(ds) + + self.prepare_sample_info(index_dir) + self.prepare_compressed_info() + + if hasattr(self, 'index'): + print("Index already exists") + return + else: + start = time.time() + # ds.ds_fn is the name of the dataset file but probably needs a prefix + pann.build_vamana_index(self._metric, self.translate_dtype(ds.dtype), ds.get_dataset_fn(), self.sample_points_path, + self.compressed_vectors_path, index_dir, self.secondary_index_dir, self.secondary_gt_dir, self.R, self.L, self.alpha, + self.two_pass) + end = time.time() + print("Indexing time: ", end - start) + print(f"Wrote index to {index_dir}") + + self.index = pann.load_vamana_index(self._metric, self.translate_dtype(ds.dtype), ds.get_dataset_fn(), self.compressed_vectors_path, + self.sample_points_path, index_dir, self.secondary_index_dir, self.secondary_gt_dir, ds.nb, d) + print("Index loaded") + + def query(self, X, k): + nq, d = X.shape + self.res, self.query_dists = self.index.batch_search(X, nq, k, self.Ls) + + def set_query_arguments(self, query_args): + self._query_args = query_args + self.Ls = 0 if query_args.get("Ls") is None else query_args.get("Ls") + self.search_threads = self._query_args.get("T", 16) + os.environ["PARLAY_NUM_THREADS"] = str(self.search_threads) + + def load_index(self, dataset): + ds = DATASETS[dataset]() + d = ds.d + + index_dir = self.create_index_dir(ds) + self.prepare_sample_info(index_dir) + self.prepare_compressed_info() + + print("Trying to load...") + + try: + file_size = os.path.getsize(index_dir) + print(f"File Size in Bytes is {file_size}") + except FileNotFoundError: + file_size = 0 + print("File not found.") + + if file_size != 0: + try: + self.index = pann.load_vamana_index(self._metric, self.translate_dtype(ds.dtype), ds.get_dataset_fn(), + self.compressed_vectors_path, self.sample_points_path, index_dir, + self.secondary_index_dir, self.secondary_gt_dir, ds.nb, d) + print("Index loaded") + return True + except: + print("Index not found") + return False + else: + print("Index not found") + return False \ No newline at end of file