diff --git a/neurips23/filter/base.py b/neurips23/filter/base.py deleted file mode 100644 index 9ac933d1..00000000 --- a/neurips23/filter/base.py +++ /dev/null @@ -1,12 +0,0 @@ -from benchmark.algorithms.base import BaseANN - -class BaseFilterANN(BaseANN): - def filtered_query(self, X, filter, k): - """ - Carry out a batch query for k-NN of query set X with associated filter. - Query X[i] has asks for k-NN in the index that pass all filters in filter[i]. - """ - raise NotImplementedError() - - def track(self): - return "filter" \ No newline at end of file diff --git a/neurips23/filter/faiss/Dockerfile b/neurips23/filter/faiss/Dockerfile deleted file mode 100644 index 163391a8..00000000 --- a/neurips23/filter/faiss/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM neurips23 - -RUN apt update && apt install -y wget swig -RUN wget https://repo.anaconda.com/archive/Anaconda3-2023.03-0-Linux-x86_64.sh -RUN bash Anaconda3-2023.03-0-Linux-x86_64.sh -b - -ENV PATH /root/anaconda3/bin:$PATH -ENV CONDA_PREFIX /root/anaconda3/ - -RUN conda install -c pytorch faiss-cpu -COPY install/requirements_conda.txt ./ -# conda doesn't like some of our packages, use pip -RUN python3 -m pip install -r requirements_conda.txt - -COPY neurips23/filter/faiss/bow_id_selector.swig ./ - -RUN swig -c++ -python -I$CONDA_PREFIX/include -Ifaiss bow_id_selector.swig -RUN g++ -shared -O3 -g -fPIC bow_id_selector_wrap.cxx -o _bow_id_selector.so \ - -I $( python -c "import distutils.sysconfig ; print(distutils.sysconfig.get_python_inc())" ) \ - -I $CONDA_PREFIX/include $CONDA_PREFIX/lib/libfaiss_avx2.so -Ifaiss - -RUN python3 -c 'import faiss; print(faiss.IndexFlatL2); print(faiss.__version__)' - - - diff --git a/neurips23/filter/faiss/README.md b/neurips23/filter/faiss/README.md deleted file mode 100644 index c834af51..00000000 --- a/neurips23/filter/faiss/README.md +++ /dev/null @@ -1,102 +0,0 @@ - -# Faiss baseline for the Filtered search track - -The database of size $N=10^7$ can be seen as the combination of: - -- a matrix $M$ of size $N \times d$ of embedding vectors (called `xb` in the code). $d=192$. -- a sparse matrix $M_\mathrm{meta}$ of size $N \times v$, entry $i,j$ is set to 1 iff word $j$ is applicable to vector $i$. $v=200386$, called `meta_b` in the code (a [CSR matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html)) - -The Faiss basleline for the filtered search track is based on two distinct data structures, a word-based inverted file and a Faiss `IndexIVFFlat`. -Both data structured allow to peform filtered searches in two different ways. - -The search is based on a query vector $q\in \mathbb{R}^d$ and associated query words $w_1, w_2$ (there are one or two query words). -The search results are the database vectors that include /all/ query words and that are nearest to $q$ in $L_2$ distance. - -## Word-based inverted file - -This is term-based inverted file that maps each word to the vectors (docs) that contain that term. -In the code it is a CSR matrix called `docs_per_word` (it's just the transposed version of `meta_b`). - -At search time, the subset (`subset`) of vectors eligible for results depends on the number of query words: - -- if there is a single word $w_1$ then it's just the set of non-0 entries in row $w_1$ of the `docs_per_word` matrix. -This can be extracted at no cost - -- if there are two words $w_1$ and $w_2$ then the sets of non-0 entries of rows $w_1$ and $w_2$ are intersected. 
-This is done with `np.intersect1d` or the C++ function `intersect_sorted`, that is faster (linear in nb of non-0 entries of the two rows). - -When this subset is selected, the result is found by searching the top-k vectors in this subset of rows of $M$. -The result is exact and the search is most efficient when the subset is small (ie. the words are discriminative enough to filter the results well). - -## IndexIVFFlat structure - -This is a Faiss [`IndexIVFFlat`](https://github.com/facebookresearch/faiss/wiki/The-index-factory#encodings) called `index`. - -By default the index performs unfiltered search, ie. the nearest vectors to $q$ can be retrieved. -The accuracy of this search depends on the number of visited centroids of the `IndexIVFFlat` (parameter `nprobe`, the larger the more accurate and the slower). - -One solution would be to over-fetch vectors and perform filtering post-hoc using the words in the result list. -However, it is unclear /how much/ we should overfetch. - -Therefore, another solution is to use the Faiss [filtering functionality](https://github.com/facebookresearch/faiss/wiki/Setting-search-parameters-for-one-query#searching-in-a-subset-of-elements), ie. provide a callback function that is called for each vector id to decide if it should be considered as a result or not. - -The callback function is implemented in C++ in the class `IDSelectorBOW`. -For vector id $i$ it looks up the row $i$ of $M_\mathrm{meta}$ and peforms a binary search on $w_1$ to check of that word belongs to the words associated to vector $i$. -If $w_2$ is also provided, it does the same for $w_2$. -The callback returns true only if all terms are present. - -### Binary filtering - -The issue is that this callback is relatively slow because (1) it requires to access the $M_\mathrm{meta}$ matrix which causes cache misses and (2) it performs an iterative binary search. -Since the callback is called in the tightest inner loop of the search function, and since the IVF search tends to perform many vector comparisons, this has non negligible performance impact. - -To speed up this test, we can use a nifty piece of bit manipulation. -The idea is that the vector ids are 63 bits long (64 bits integers but negative values are reserved, so we cannot use the sign bit). -However, since $N=10^7$ we use only $\lceil \log_2 N \rceil = 24$ bits of these, leaving 63-24 = 39 bits that are always 0. - -Now, we associate to each word $j$ a 39-bit signature $S[j]$, and the to each set of words the binary `or` of these signatures. -The query is represented by $s_\mathrm{q} = S[w_1] \vee S[w_2]$. -Database entry $i$ with words $W_i$ is represented by $s_i = \vee_{w\in W_i} S[w]$. - -Then we have the following implication: if $\\{w_1, w_2\\} \subset W_i$ then all 1 bits of $s_\mathrm{q}$ are also set to 1 in $s_i$. - -$$\\{w_1, w_2\\} \subset W_i \Rightarrow \neg s_i \wedge s_\mathrm{q} = 0$$ - -Which is equivalent to: - -$$\neg s_i \wedge s_\mathrm{q} \neq 0 \Rightarrow \\{w_1, w_2\\} \not\subset W_i $$ - -Of course, this is an implication, not an equivalence. -Therefore, it can only rule out database vectors. -However, the binary test is very cheap to perform (uses a few machine instructions on data that is already in machine registers), so it can be used as a pre-filter to apply the full membership test on candidates. -This is implemented in the `IDSelectorBOWBin` object. 
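For illustration, a minimal NumPy sketch of this signature test, using the constants from this README ($N=10^7$, $v=200386$, probability 0.1 for 1 bits); the helper names are illustrative, not the classes used in the actual code:

```python
import numpy as np

rng = np.random.default_rng(123)

N, nword = 10_000_000, 200_386          # database and vocabulary sizes from this README
id_bits = int(np.ceil(np.log2(N)))      # 24 bits reserved for the vector id
sig_bits = 63 - id_bits                 # 39 bits left for the signature

# one random 39-bit signature per word, each bit set with probability 0.1
word_sig = np.zeros(nword, dtype=np.int64)
for b in range(sig_bits):
    word_sig |= (rng.random(nword) < 0.1).astype(np.int64) << b

def entry_signature(words):
    """signature of a database entry: OR of the signatures of its words (illustrative helper)"""
    return np.bitwise_or.reduce(word_sig[list(words)])

def may_contain(entry_sig, query_words):
    """cheap pre-filter: False guarantees the entry lacks at least one query word"""
    q_sig = np.bitwise_or.reduce(word_sig[list(query_words)])
    return (~entry_sig & q_sig) == 0

# an entry tagged with words {3, 17} can never be rejected for the query word 3
assert may_contain(entry_signature({3, 17}), {3})
```

In the actual index the signature is stored in the 39 unused high bits of each 64-bit vector id, so the rejection test only costs a few bitwise instructions per candidate.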
- -The remaining degree of freedom is how to choose the binary signatures, because this rule is always valid, but its filtering ability depends on the choice of the signatures $S$. -After a few tests (see [this notebook](https://gist.github.com/mdouze/75103e4cef436510ac9b834f9a77496f#file-eval_binary_signatures-ipynb) ) it seems that a random signature with 0.1 probability for 1s filters our 80% of negative tests. -Asjuting this to the frequency of the words did not seem to yield better results. - -## Choosing between the two implementations - -The two implementations are complementary: the word-first implementation gives exact results, and has a strong filtering ability for rare words. -The `IndexIVFFlat` implementation gives approximate results and is more relevant for words that are more common, where a significant subset of vectors are indeed relevant. - -Therefore, there should be a rule to choose between the two, and the relevant metric is the size of the subset of vectors to consider. -We can use statistics on the words, ie. $\mathrm{nocc}[j]$ is the number of times word $j$ appears in the dataset (this is just the column-wise sum of the $M_\mathrm{meta}$). - -For a single query word $w_1$, the fraction of relevant indices is just $f = \mathrm{nocc}[w_1] / N$. -For two query words, it is more complicated to compute but an estimate is given by $f = \mathrm{nocc}[w_1] \times \mathrm{nocc}[w_2] / N^2$ (this estimate assumes words are independent, which is incorrect). - -Therefore, the rule that we use is based on a threshold $\tau$ (called `metadata_threshold` in the code) : - -- if $f < \tau$ then use the word-first search - -- otherwise use the IVFFlat based index - -Note that the optimal threshold also depends on the target accuracy (since the IVFFlat is not exact, when a higher accuracy is desired), see https://github.com/harsha-simhadri/big-ann-benchmarks/pull/105#issuecomment-1539842223 . - - -## Code layout - -The code is in faiss.py, with performance critical parts implemented in C++ and wrapped with SWIG in `bow_id_selector.swig`. -SWIG directly exposes the C++ classes and functions in Python. - diff --git a/neurips23/filter/faiss/bow_id_selector.swig b/neurips23/filter/faiss/bow_id_selector.swig deleted file mode 100644 index 6712aa25..00000000 --- a/neurips23/filter/faiss/bow_id_selector.swig +++ /dev/null @@ -1,183 +0,0 @@ - -%module bow_id_selector - -/* -To compile when Faiss is installed via conda: - -swig -c++ -python -I$CONDA_PREFIX/include bow_id_selector.swig && \ -g++ -shared -O3 -g -fPIC bow_id_selector_wrap.cxx -o _bow_id_selector.so \ - -I $( python -c "import distutils.sysconfig ; print(distutils.sysconfig.get_python_inc())" ) \ - -I $CONDA_PREFIX/include $CONDA_PREFIX/lib/libfaiss_avx2.so - -*/ - - -// Put C++ includes here -%{ - -#include -#include - -%} - -// to get uint32_t and friends -%include - -// This means: assume what's declared in these .h files is provided -// by the Faiss module. -%import(module="faiss") "faiss/MetricType.h" -%import(module="faiss") "faiss/impl/IDSelector.h" - -// functions to be parsed here - -// This is important to release GIL and do Faiss exception handing -%exception { - Py_BEGIN_ALLOW_THREADS - try { - $action - } catch(faiss::FaissException & e) { - PyEval_RestoreThread(_save); - - if (PyErr_Occurred()) { - // some previous code already set the error type. 
- } else { - PyErr_SetString(PyExc_RuntimeError, e.what()); - } - SWIG_fail; - } catch(std::bad_alloc & ba) { - PyEval_RestoreThread(_save); - PyErr_SetString(PyExc_MemoryError, "std::bad_alloc"); - SWIG_fail; - } - Py_END_ALLOW_THREADS -} - - -// any class or function declared below will be made available -// in the module. -%inline %{ - -struct IDSelectorBOW : faiss::IDSelector { - size_t nb; - using TL = int32_t; - const TL *lims; - const int32_t *indices; - int32_t w1 = -1, w2 = -1; - - IDSelectorBOW( - size_t nb, const TL *lims, const int32_t *indices): - nb(nb), lims(lims), indices(indices) {} - - void set_query_words(int32_t w1, int32_t w2) { - this->w1 = w1; - this->w2 = w2; - } - - // binary search in the indices array - bool find_sorted(TL l0, TL l1, int32_t w) const { - while (l1 > l0 + 1) { - TL lmed = (l0 + l1) / 2; - if (indices[lmed] > w) { - l1 = lmed; - } else { - l0 = lmed; - } - } - return indices[l0] == w; - } - - bool is_member(faiss::idx_t id) const { - TL l0 = lims[id], l1 = lims[id + 1]; - if (l1 <= l0) { - return false; - } - if(!find_sorted(l0, l1, w1)) { - return false; - } - if(w2 >= 0 && !find_sorted(l0, l1, w2)) { - return false; - } - return true; - } - - ~IDSelectorBOW() override {} -}; - - -struct IDSelectorBOWBin : IDSelectorBOW { - /** with additional binary filtering */ - faiss::idx_t id_mask; - - IDSelectorBOWBin( - size_t nb, const TL *lims, const int32_t *indices, faiss::idx_t id_mask): - IDSelectorBOW(nb, lims, indices), id_mask(id_mask) {} - - faiss::idx_t q_mask = 0; - - void set_query_words_mask(int32_t w1, int32_t w2, faiss::idx_t q_mask) { - set_query_words(w1, w2); - this->q_mask = q_mask; - } - - bool is_member(faiss::idx_t id) const { - if (q_mask & ~id) { - return false; - } - return IDSelectorBOW::is_member(id & id_mask); - } - - ~IDSelectorBOWBin() override {} -}; - - -size_t intersect_sorted_c( - size_t n1, const int32_t *a1, - size_t n2, const int32_t *a2, - int32_t *res) -{ - if (n1 == 0 || n2 == 0) { - return 0; - } - size_t i1 = 0, i2 = 0, i = 0; - for(;;) { - if (a1[i1] < a2[i2]) { - i1++; - if (i1 >= n1) { - return i; - } - } else if (a1[i1] > a2[i2]) { - i2++; - if (i2 >= n2) { - return i; - } - } else { // equal - res[i++] = a1[i1++]; - i2++; - if (i1 >= n1 || i2 >= n2) { - return i; - } - } - } -} - -%} - - -%pythoncode %{ - -import numpy as np - -# example additional function that converts the passed-in numpy arrays to -# C++ pointers -def intersect_sorted(a1, a2): - n1, = a1.shape - n2, = a2.shape - res = np.empty(n1 + n2, dtype=a1.dtype) - nres = intersect_sorted_c( - n1, faiss.swig_ptr(a1), - n2, faiss.swig_ptr(a2), - faiss.swig_ptr(res) - ) - return res[:nres] - -%} \ No newline at end of file diff --git a/neurips23/filter/faiss/config.yaml b/neurips23/filter/faiss/config.yaml deleted file mode 100644 index 62cc5b24..00000000 --- a/neurips23/filter/faiss/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -random-filter-s: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8"}] - query-args: | - [{"nprobe": 1}, - {"nprobe":2}, - {"nprobe":4}] -random-s: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8"}] - query-args: | - [{"nprobe": 1}, - {"nprobe":2}, - {"nprobe":4}] -yfcc-10M-unfiltered: - faiss: - docker-tag: neurips23-filter-faiss - module: 
neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF16384,SQ8", "binarysig": true, "threads": 16}] - query-args: | - [{"nprobe": 1}, {"nprobe": 4}, {"nprobe": 16}, {"nprobe": 64}] -yfcc-10M: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF16384,SQ8", - "binarysig": true, - "threads": 16 - }] - query-args: | - [{"nprobe": 1, "mt_threshold":0.0003}, - {"nprobe": 4, "mt_threshold":0.0003}, - {"nprobe": 16, "mt_threshold":0.0003}, - {"nprobe": 32, "mt_threshold":0.0003}, - {"nprobe": 64, "mt_threshold":0.0003}, - {"nprobe": 96, "mt_threshold":0.0003}, - {"nprobe": 1, "mt_threshold":0.0001}, - {"nprobe": 4, "mt_threshold":0.0001}, - {"nprobe": 16, "mt_threshold":0.0001}, - {"nprobe": 32, "mt_threshold":0.0001}, - {"nprobe": 64, "mt_threshold":0.0001}, - {"nprobe": 96, "mt_threshold":0.0001}, - {"nprobe": 1, "mt_threshold":0.01}, - {"nprobe": 4, "mt_threshold":0.01}, - {"nprobe": 16, "mt_threshold":0.01}, - {"nprobe": 32, "mt_threshold":0.01}, - {"nprobe": 64, "mt_threshold":0.01}, - {"nprobe": 96, "mt_threshold":0.01} - ] - diff --git a/neurips23/filter/faiss/faiss.py b/neurips23/filter/faiss/faiss.py deleted file mode 100644 index 02980d12..00000000 --- a/neurips23/filter/faiss/faiss.py +++ /dev/null @@ -1,287 +0,0 @@ -import pdb -import pickle -import numpy as np -import os - -from multiprocessing.pool import ThreadPool - -import faiss - -from neurips23.filter.base import BaseFilterANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated - -import bow_id_selector - -def csr_get_row_indices(m, i): - """ get the non-0 column indices for row i in matrix m """ - return m.indices[m.indptr[i] : m.indptr[i + 1]] - -def make_bow_id_selector(mat, id_mask=0): - sp = faiss.swig_ptr - if id_mask == 0: - return bow_id_selector.IDSelectorBOW(mat.shape[0], sp(mat.indptr), sp(mat.indices)) - else: - return bow_id_selector.IDSelectorBOWBin( - mat.shape[0], sp(mat.indptr), sp(mat.indices), id_mask - ) - -def set_invlist_ids(invlists, l, ids): - n, = ids.shape - ids = np.ascontiguousarray(ids, dtype='int64') - assert invlists.list_size(l) == n - faiss.memcpy( - invlists.get_ids(l), - faiss.swig_ptr(ids), n * 8 - ) - - - -def csr_to_bitcodes(matrix, bitsig): - """ Compute binary codes for the rows of the matrix: each binary code is - the OR of bitsig for non-0 entries of the row. - """ - indptr = matrix.indptr - indices = matrix.indices - n = matrix.shape[0] - bit_codes = np.zeros(n, dtype='int64') - for i in range(n): - # print(bitsig[indices[indptr[i]:indptr[i + 1]]]) - bit_codes[i] = np.bitwise_or.reduce(bitsig[indices[indptr[i]:indptr[i + 1]]]) - return bit_codes - - -class BinarySignatures: - """ binary signatures that encode vectors """ - - def __init__(self, meta_b, proba_1): - nvec, nword = meta_b.shape - # number of bits reserved for the vector ids - self.id_bits = int(np.ceil(np.log2(nvec))) - # number of bits for the binary signature - self.sig_bits = nbits = 63 - self.id_bits - - # select binary signatures for the vocabulary - rs = np.random.RandomState(123) # we rely on this to be reproducible! 
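- # draw random bits with probability proba_1, pack them into bytes, pad each row to 8 bytes, and view it as a single int64 signature per word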
- bitsig = np.packbits(rs.rand(nword, nbits) < proba_1, axis=1) - bitsig = np.pad(bitsig, ((0, 0), (0, 8 - bitsig.shape[1]))).view("int64").ravel() - self.bitsig = bitsig - - # signatures for all the metadata matrix - self.db_sig = csr_to_bitcodes(meta_b, bitsig) << self.id_bits - - # mask to keep only the ids - self.id_mask = (1 << self.id_bits) - 1 - - def query_signature(self, w1, w2): - """ compute the query signature for 1 or 2 words """ - sig = self.bitsig[w1] - if w2 != -1: - sig |= self.bitsig[w2] - return int(sig << self.id_bits) - -class FAISS(BaseFilterANN): - - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - print(index_params) - self.indexkey = index_params.get("indexkey", "IVF32768,SQ8") - self.binarysig = index_params.get("binarysig", True) - self.binarysig_proba1 = index_params.get("binarysig_proba1", 0.1) - self.metadata_threshold = 1e-3 - self.nt = index_params.get("threads", 1) - - - def fit(self, dataset): - ds = DATASETS[dataset]() - if ds.search_type() == "knn_filtered" and self.binarysig: - print("preparing binary signatures") - meta_b = ds.get_dataset_metadata() - self.binsig = BinarySignatures(meta_b, self.binarysig_proba1) - print("writing to", self.binarysig_name(dataset)) - pickle.dump(self.binsig, open(self.binarysig_name(dataset), "wb"), -1) - else: - self.binsig = None - - if ds.search_type() == "knn_filtered": - self.meta_b = ds.get_dataset_metadata() - self.meta_b.sort_indices() - - index = faiss.index_factory(ds.d, self.indexkey) - xb = ds.get_dataset() - print("train") - index.train(xb) - print("populate") - if self.binsig is None: - index.add(xb) - else: - ids = np.arange(ds.nb) | self.binsig.db_sig - index.add_with_ids(xb, ids) - - self.index = index - self.nb = ds.nb - self.xb = xb - self.ps = faiss.ParameterSpace() - self.ps.initialize(self.index) - print("store", self.index_name(dataset)) - faiss.write_index(index, self.index_name(dataset)) - - - def index_name(self, name): - return f"data/{name}.{self.indexkey}.faissindex" - - def binarysig_name(self, name): - return f"data/{name}.{self.indexkey}.binarysig" - - - def load_index(self, dataset): - """ - Load the index for dataset. Returns False if index - is not available, True otherwise. - - Checking the index usually involves the dataset name - and the index build paramters passed during construction. - """ - if not os.path.exists(self.index_name(dataset)): - if 'url' not in self._index_params: - return False - - print('Downloading index in background. 
This can take a while.') - download_accelerated(self._index_params['url'], self.index_name(dataset), quiet=True) - - print("Loading index") - - self.index = faiss.read_index(self.index_name(dataset)) - - self.ps = faiss.ParameterSpace() - self.ps.initialize(self.index) - - ds = DATASETS[dataset]() - - if ds.search_type() == "knn_filtered" and self.binarysig: - if not os.path.exists(self.binarysig_name(dataset)): - print("preparing binary signatures") - meta_b = ds.get_dataset_metadata() - self.binsig = BinarySignatures(meta_b, self.binarysig_proba1) - else: - print("loading binary signatures") - self.binsig = pickle.load(open(self.binarysig_name(dataset), "rb")) - else: - self.binsig = None - - if ds.search_type() == "knn_filtered": - self.meta_b = ds.get_dataset_metadata() - self.meta_b.sort_indices() - - self.nb = ds.nb - self.xb = ds.get_dataset() - - return True - - def index_files_to_store(self, dataset): - """ - Specify a triplet with the local directory path of index files, - the common prefix name of index component(s) and a list of - index components that need to be uploaded to (after build) - or downloaded from (for search) cloud storage. - - For local directory path under docker environment, please use - a directory under - data/indices/track(T1 or T2)/algo.__str__()/DATASETS[dataset]().short_name() - """ - raise NotImplementedError() - - def query(self, X, k): - nq = X.shape[0] - self.I = -np.ones((nq, k), dtype='int32') - bs = 1024 - for i0 in range(0, nq, bs): - _, self.I[i0:i0+bs] = self.index.search(X[i0:i0+bs], k) - - - def filtered_query(self, X, filter, k): - print('running filtered query') - nq = X.shape[0] - self.I = -np.ones((nq, k), dtype='int32') - meta_b = self.meta_b - meta_q = filter - docs_per_word = meta_b.T.tocsr() - ndoc_per_word = docs_per_word.indptr[1:] - docs_per_word.indptr[:-1] - freq_per_word = ndoc_per_word / self.nb - - def process_one_row(q): - faiss.omp_set_num_threads(1) - qwords = csr_get_row_indices(meta_q, q) - assert qwords.size in (1, 2) - w1 = qwords[0] - freq = freq_per_word[w1] - if qwords.size == 2: - w2 = qwords[1] - freq *= freq_per_word[w2] - else: - w2 = -1 - if freq < self.metadata_threshold: - # metadata first - docs = csr_get_row_indices(docs_per_word, w1) - if w2 != -1: - docs = bow_id_selector.intersect_sorted( - docs, csr_get_row_indices(docs_per_word, w2)) - - assert len(docs) >= k, pdb.set_trace() - xb_subset = self.xb[docs] - _, Ii = faiss.knn(X[q : q + 1], xb_subset, k=k) - - self.I[q, :] = docs[Ii.ravel()] - else: - # IVF first, filtered search - sel = make_bow_id_selector(meta_b, self.binsig.id_mask if self.binsig else 0) - if self.binsig is None: - sel.set_query_words(int(w1), int(w2)) - else: - sel.set_query_words_mask( - int(w1), int(w2), self.binsig.query_signature(w1, w2)) - - params = faiss.SearchParametersIVF(sel=sel, nprobe=self.nprobe) - - _, Ii = self.index.search( - X[q:q+1], k, params=params - ) - Ii = Ii.ravel() - if self.binsig is None: - self.I[q] = Ii - else: - # we'll just assume there are enough results - # valid = Ii != -1 - # I[q, valid] = Ii[valid] & binsig.id_mask - self.I[q] = Ii & self.binsig.id_mask - - - if self.nt <= 1: - for q in range(nq): - process_one_row(q) - else: - faiss.omp_set_num_threads(self.nt) - pool = ThreadPool(self.nt) - list(pool.map(process_one_row, range(nq))) - - def get_results(self): - return self.I - - def set_query_arguments(self, query_args): - faiss.cvar.indexIVF_stats.reset() - if "nprobe" in query_args: - self.nprobe = query_args['nprobe'] - 
self.ps.set_index_parameters(self.index, f"nprobe={query_args['nprobe']}") - self.qas = query_args - else: - self.nprobe = 1 - if "mt_threshold" in query_args: - self.metadata_threshold = query_args['mt_threshold'] - else: - self.metadata_threshold = 1e-3 - - def __str__(self): - return f'Faiss({self.indexkey, self.qas})' - - \ No newline at end of file diff --git a/neurips23/filter/puck/Dockerfile b/neurips23/filter/puck/Dockerfile deleted file mode 100755 index d2db5e83..00000000 --- a/neurips23/filter/puck/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt-get update -RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip -#swig -RUN apt-get update && apt-get install -y swig cmake -RUN pip3 install pybind11 numpy -RUN cat /etc/ld.so.conf -RUN ls /etc/ld.so.conf.d/ -##cmake -RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh -RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake -ENV PATH /home/app/cmake/bin:$PATH - -#mkl -RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh -RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s - -RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf -RUN ldconfig -RUN touch /etc/profile.d/intel.sh -RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh -RUN . /etc/profile.d/intel.sh - -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" -#RUN git config --global http.sslVerify false - -RUN git clone -b filter https://github.com/baidu/puck.git -RUN cd puck && . 
/etc/profile.d/intel.sh && python3 setup.py install -RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/filter/puck/config.yaml b/neurips23/filter/puck/config.yaml deleted file mode 100755 index f71d438a..00000000 --- a/neurips23/filter/puck/config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -random-filter-s: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8"}] - query-args: | - [{"nprobe": 1}, - {"nprobe":2}, - {"nprobe":4}] -random-s: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8"}] - query-args: | - [{"nprobe": 1}, - {"nprobe":2}, - {"nprobe":4}] -yfcc-10M-unfiltered: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF16384,SQ8", "binarysig": true, "threads": 16}] - query-args: | - [{"nprobe": 1}, {"nprobe": 4}, {"nprobe": 16}, {"nprobe": 64}] -yfcc-10M: - puck: - docker-tag: neurips23-filter-puck - module: neurips23.filter.puck.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: | - [{ "index_type": 3, "C":1000, "F":1000, "FN":16, "N":0}] - query-args: | - [ - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":135}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":105}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":110}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":115}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":120 } - ] diff --git a/neurips23/filter/puck/puck.py b/neurips23/filter/puck/puck.py deleted file mode 100755 index e13072e8..00000000 --- a/neurips23/filter/puck/puck.py +++ /dev/null @@ -1,222 +0,0 @@ -# !/usr/bin/env python3 -#-*- coding:utf-8 -*- -################################################################################ -# -# Copyright (c) 2021 Baidu.com, Inc. 
All Rights Reserved -# -################################################################################ -""" -@file: puck_inmem.py -@author: yinjie06(yinjie06@baidu.com) -@date: 2021-10-06 13:44 -@brief: -""" - -from neurips23.filter.base import BaseFilterANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated -from puck import py_puck_api -import multiprocessing.pool -from multiprocessing import Process -import os -import numpy as np -import time -import math -import struct - -CPU_LIMIT = multiprocessing.cpu_count() -swig_ptr = py_puck_api.swig_ptr - -class Puck(BaseFilterANN): - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - self.indexkey = index_params.get("indexkey", "NA") - - self.index = py_puck_api.PySearcher() - self.topk = 10 - self.n = 0 - self.build_memory_usage = -1 - print("after init") - - def init_dataset_key(self, dataset): - - #更新gflags - ds = DATASETS[dataset]() - d = ds.d - #特征纬度 - py_puck_api.update_gflag('feature_dim', "%d"%(d)) - - #根据距离计算方式调整 - self.whether_norm = False - py_puck_api.update_gflag('whether_norm', 'false') - self.ip2cos = 0 - if ds.distance() == "angular": - self.whether_norm = True - py_puck_api.update_gflag('whether_norm', 'true') - elif ds.distance() == "ip": - self.ip2cos = 1 - py_puck_api.update_gflag('ip2cos', '%d'%(self.ip2cos)) - - self.init_indexkey() - #测试 - py_puck_api.update_gflag('kmeans_iterations_count',"1") - - def check_feature(self, dataset): - self.init_dataset_key(dataset) - ds = DATASETS[dataset]() - d = ds.d - #索引存储目录 - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - if not os.path.exists(self.index_name(dataset)): - index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - - meta_indices_file_name = self.index_name(dataset) + "/indices.dat" - meta_indices_file = open(meta_indices_file_name, 'wb') - meta_indptr_file_name = self.index_name(dataset) + "/indptr.dat" - meta_indptr_file = open(meta_indptr_file_name, 'wb') - - meta_to_write = ds.get_dataset_metadata() - buf = struct.pack('i', len(meta_to_write.indices)) - meta_indices_file.write(buf) - buf = struct.pack('i' * len(meta_to_write.indices), *(meta_to_write.indices)) - meta_indices_file.write(buf) - meta_indices_file.close() - - buf = struct.pack('i', len(meta_to_write.indptr)) - meta_indptr_file.write(buf) - buf = struct.pack('i' * len(meta_to_write.indptr), *(meta_to_write.indptr)) - meta_indptr_file.write(buf) - meta_indptr_file.close() - - #训练用目录 - if not os.path.exists('mid-data'): - os.mkdir('mid-data') - #格式化文件数据,将来可不要,后续可修改训练接口 - all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') - - add_part=100000 - i0 = 0 - t0 = time.time() - for xblock in ds.get_dataset_iterator(bs=add_part): - i1 = i0 + len(xblock) - i0 = i1 - for x in xblock: - feat = x.astype(np.float32) - if(self.whether_norm): - feat = feat / np.sqrt(np.dot(feat, feat)) - elif(self.ip2cos > 0): - norm = np.dot(feat, feat) - if norm > 1.0: - print("not support, please contact yinjie06") - return False - feat = np.append(feat, math.sqrt(1.0 - norm)) - - buf = struct.pack('i', len(feat)) - all_feature_file.write(buf) - buf = struct.pack('f' * len(feat), *feat) - all_feature_file.write(buf) - print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) - all_feature_file.close() - - def fit(self, dataset): - print("start fit") - #self.check_feature(dataset) - p = 
Process(target=self.check_feature, args=(dataset,)) - p.start() - p.join() - self.init_dataset_key(dataset) - ds = DATASETS[dataset]() - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - #训练数据采样 - py_puck_api.update_gflag('train_points_count', "5000000") - py_puck_api.update_gflag('pq_train_points_count', "500000") - print(self.index_name(dataset)) - print("start to train") - self.index.build(ds.nb) - self.load_index(dataset) - - def init_indexkey(self): - #一级聚类中心 - if "C" in self._index_params: - py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) - self.indexkey = "C%s"%(self._index_params['C']) - #二级聚类中心 - if "F" in self._index_params: - py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) - self.indexkey += "_F%s"%(self._index_params['F']) - #filter - if "FN" in self._index_params: - py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) - self.indexkey += "_FN%s"%(self._index_params['FN']) - #量化 - if "N" in self._index_params: - if int(self._index_params['N']) > 1: - py_puck_api.update_gflag('whether_pq', 'true') - py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) - self.indexkey += "_N%s"%(self._index_params['N']) - else: - py_puck_api.update_gflag('whether_pq', 'false') - self.indexkey += "_Flat" - - if "tinker_neighborhood" in self._index_params: - py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) - self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) - if "tinker_construction" in self._index_params: - py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) - self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) - if "index_type" in self._index_params: - py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) - - def index_name(self, name): - return f"data/{name}.{self.indexkey}.puckindex" - - def index_tag_name(self, name): - return f"{name}.{self.indexkey}.puckindex" - - def load_index(self, dataset): - print("Loading index") - self.init_indexkey() - ds = DATASETS[dataset]() - self.topk = ds.default_count() - print("self.topk=%d"%self.topk) - py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - py_puck_api.update_gflag('context_initial_pool_size', "%d"%(CPU_LIMIT)) - py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) - print(self.index_name(dataset)) - ret = self.index.init() - print("ret = ",ret) - if ret != 0: - return False - self.index.show() - self.n = ds.nq - return True - - def set_query_arguments(self, query_args): - for key, value in query_args.items(): - py_puck_api.update_gflag(key, "%s"%value) - #query_args_list = query_args.strip().split(',') - #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) - self.index.init() - #topk是作为检索参数传入puck - self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) - self.qas = query_args - - def query(self, X, topK): - - n, d = X.shape - self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) - #print(self.res[0]) - print(self.res[1]) - def get_results(self): - return self.res[1] - def filtered_query(self, X, filter, k): - n, d = X.shape - x_float = X.astype(np.float32) - meta_q = filter - self.index.filter_search(n, swig_ptr(x_float), k, 
swig_ptr(self.res[0]), swig_ptr(self.res[1]), swig_ptr(filter.indptr), swig_ptr(filter.indices)) - - def __str__(self): - return f'Puck{self.indexkey, self.qas}' diff --git a/neurips23/filter/run.py b/neurips23/filter/run.py deleted file mode 100644 index d5d56dde..00000000 --- a/neurips23/filter/run.py +++ /dev/null @@ -1,54 +0,0 @@ -from benchmark.algorithms.base_runner import BaseRunner -import time - -class FilterRunner(BaseRunner): - def run_task(algo, ds, distance, count, run_count, search_type, private_query): - best_search_time = float('inf') - search_times = [] - - if not private_query: - X = ds.get_queries() - else: - X = ds.get_private_queries() - - print(fr"Got {X.shape[0]} queries") - - for i in range(run_count): - print('Run %d/%d...' % (i + 1, run_count)) - - start = time.time() - if search_type == "knn": - algo.query(X, count) - total = (time.time() - start) - results = algo.get_results() - assert results.shape[0] == X.shape[0] - elif search_type == "knn_filtered": - if not private_query: - metadata = ds.get_queries_metadata() - else: - metadata = ds.get_private_queries_metadata() - algo.filtered_query(X, metadata, count) - total = (time.time() - start) - results = algo.get_results() - assert results.shape[0] == X.shape[0] - else: - raise NotImplementedError() - - search_time = total - best_search_time = min(best_search_time, search_time) - search_times.append( search_time ) - - attrs = { - "best_search_time": best_search_time, - "name": str(algo), - "run_count": run_count, - "distance": distance, - "type": search_type, - "count": int(count), - "search_times": search_times - } - additional = algo.get_additional() - for k in additional: - attrs[k] = additional[k] - return (attrs, results) - diff --git a/neurips23/filter/wm_filter/Dockerfile b/neurips23/filter/wm_filter/Dockerfile deleted file mode 100644 index cf63f4a4..00000000 --- a/neurips23/filter/wm_filter/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM neurips23 - -RUN apt-get update; DEBIAN_FRONTEND=noninteractive apt install intel-mkl python3-setuptools wget python3-matplotlib build-essential checkinstall libssl-dev swig4.0 python3-dev python3-numpy python3-numpy-dev -y -COPY install/requirements_conda.txt ./ -# conda doesn't like some of our packages, use pip -RUN python3 -m pip install -r requirements_conda.txt - - -# CMAKE with good enough version -RUN mkdir /build && wget https://github.com/Kitware/CMake/archive/refs/tags/v3.27.1.tar.gz && mv v3.27.1.tar.gz /build -RUN cd /build; tar -zxvf v3.27.1.tar.gz -RUN cd /build/CMake-3.27.1 && ./bootstrap && make && make install - - -RUN cd / && git clone https://github.com/alemagnani/faiss.git && cd /faiss && git pull && git checkout wm_filter - -RUN cd /faiss && rm -rf ./build -RUN cd /faiss/; cmake -B build /faiss/ -DFAISS_ENABLE_GPU=OFF -DFAISS_ENABLE_PYTHON=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DFAISS_OPT_LEVEL=avx2 -DBLA_VENDOR=Intel10_64_dyn -DBUILD_TESTING=ON -DPython_EXECUTABLE=/usr/bin/python3 -DMKL_LIBRARIES=/usr/lib/x86_64-linux-gnu/libmkl_rt.so -RUN cd /faiss/; make -C build -j faiss faiss_avx2 swigfaiss swigfaiss_avx2 -RUN (cd /faiss/build/faiss/python && python3 setup.py install) - -#RUN pip install tritonclient[all] -ENV PYTHONPATH=/faiss/build/faiss/python/build/lib/ - -RUN python3 -c 'import faiss; print(faiss.IndexFlatL2); print(faiss.__version__)' - - - diff --git a/neurips23/filter/wm_filter/README.md b/neurips23/filter/wm_filter/README.md deleted file mode 100644 index 0d451065..00000000 --- a/neurips23/filter/wm_filter/README.md +++ 
/dev/null @@ -1,7 +0,0 @@ - -### Submission for Neurips23 Filter track of WM_filter team -This submission leverages the IVF index to run the filter in a fast way. - -More info to come... - - diff --git a/neurips23/filter/wm_filter/config.yaml b/neurips23/filter/wm_filter/config.yaml deleted file mode 100644 index 4397152e..00000000 --- a/neurips23/filter/wm_filter/config.yaml +++ /dev/null @@ -1,50 +0,0 @@ -random-filter-s: - wm_filter: - docker-tag: neurips23-filter-wm_filter - module: neurips23.filter.wm_filter.wm_filter - constructor: FAISS - base-args: [ "@metric" ] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8", - "threads": 8, - "train_size": 2000000, - "type": "direct" - }] - query-args: | - [ - {"nprobe": 80, "max_codes": 100, "selector_probe_limit": 80}, - {"nprobe": 100, "max_codes": 500, "selector_probe_limit": 100}, - {"nprobe": 120, "max_codes": 1000, "selector_probe_limit": 120}, - {"nprobe": 140, "max_codes": 1800, "selector_probe_limit": 140}, - {"nprobe": 160, "max_codes": 500, "selector_probe_limit": 160}, - {"nprobe": 70, "max_codes": 1000, "selector_probe_limit": 70} - ] -yfcc-10M: - wm_filter: - docker-tag: neurips23-filter-wm_filter - module: neurips23.filter.wm_filter.wm_filter - constructor: FAISS - base-args: [ "@metric" ] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8", - "threads": 8, - "train_size": 2000000, - "type": "direct" - }] - query-args: | - [ - {"nprobe": 80, "max_codes": 1800, "selector_probe_limit": 80}, - {"nprobe": 100, "max_codes": 1800, "selector_probe_limit": 100}, - {"nprobe": 120, "max_codes": 1800, "selector_probe_limit": 120}, - {"nprobe": 140, "max_codes": 1800, "selector_probe_limit": 140}, - {"nprobe": 160, "max_codes": 1800, "selector_probe_limit": 160}, - {"nprobe": 70, "max_codes": 2100, "selector_probe_limit": 70}, - {"nprobe": 100, "max_codes": 2100, "selector_probe_limit": 100}, - {"nprobe": 130, "max_codes": 2100, "selector_probe_limit": 130}, - {"nprobe": 160, "max_codes": 2100, "selector_probe_limit": 160}, - {"nprobe": 200, "max_codes": 2100, "selector_probe_limit": 200} - ] diff --git a/neurips23/filter/wm_filter/wm_filter.py b/neurips23/filter/wm_filter/wm_filter.py deleted file mode 100644 index 671b19de..00000000 --- a/neurips23/filter/wm_filter/wm_filter.py +++ /dev/null @@ -1,572 +0,0 @@ -import pdb -import pickle -import numpy as np -import os - -from multiprocessing.pool import ThreadPool -from threading import current_thread - -import faiss - - -from faiss.contrib.inspect_tools import get_invlist -from neurips23.filter.base import BaseFilterANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated -from math import log10, pow - - -def csr_get_row_indices(m, i): - """ get the non-0 column indices for row i in matrix m """ - return m.indices[m.indptr[i] : m.indptr[i + 1]] - -def make_id_selector_ivf_two(docs_per_word): - sp = faiss.swig_ptr - return faiss.IDSelectorIVFTwo(sp(docs_per_word.indices), sp(docs_per_word.indptr)) - -def make_id_selector_cluster_aware(indices, limits, clusters, cluster_limits): - sp = faiss.swig_ptr - return faiss.IDSelectorIVFClusterAware(sp(indices), sp(limits), sp(clusters), sp(cluster_limits)) - -def make_id_selector_cluster_aware_intersect(indices, limits, clusters, cluster_limits, tmp_size): - sp = faiss.swig_ptr - return faiss.IDSelectorIVFClusterAwareIntersect(sp(indices), sp(limits), sp(clusters), sp(cluster_limits), int(tmp_size)) - -def make_id_selector_cluster_aware_direct(id_position_in_cluster, limits, clusters, 
cluster_limits, tmp_size): - sp = faiss.swig_ptr - return faiss.IDSelectorIVFClusterAwareIntersectDirect(sp(id_position_in_cluster), sp(limits), sp(clusters), sp(cluster_limits), int(tmp_size)) - -def make_id_selector_cluster_aware_direct_exp(id_position_in_cluster, limits, nprobes, tmp_size): - sp = faiss.swig_ptr - return faiss.IDSelectorIVFClusterAwareIntersectDirectExp(sp(id_position_in_cluster), sp(limits), int(nprobes), int(tmp_size)) - - -def find_invlists(index): - try: - inverted_lists = index.invlists - except: - base_index = faiss.downcast_index(index.base_index) - print('cannot find the inverted list trying one level down') - print('type of index', type(base_index)) - inverted_lists = base_index.invlists - return inverted_lists - -def print_stats(): - m = 1000000. - intersection = faiss.cvar.IDSelectorMy_Stats.intersection/m - find_cluster = faiss.cvar.IDSelectorMy_Stats.find_cluster/m - set_list_time = faiss.cvar.IDSelectorMy_Stats.set_list_time/m - scan_codes = faiss.cvar.IDSelectorMy_Stats.scan_codes/m - one_list = faiss.cvar.IDSelectorMy_Stats.one_list/m - extra = faiss.cvar.IDSelectorMy_Stats.extra / m - inter_plus_find = intersection + find_cluster - print('intersection: {}, find_cluster: {}, intersection+ find cluster: {}, set list time: {}, scan_codes: {}, one list: {}, extra: {}'.format(intersection, find_cluster, inter_plus_find, set_list_time, scan_codes, one_list, extra)) - - -def spot_check_filter(docs_per_word, index, indices, limits, clusters, cluster_limits): - print('running spot check') - - - inverted_lists = find_invlists(index) - - from_id_to_map = dict() - for i in range(inverted_lists.nlist): - list_ids, _ = get_invlist(inverted_lists, i) - for id in list_ids: - from_id_to_map[id] = i - - indptr = docs_per_word.indptr - - ## lets' run some spot check - for word in [0, 5, 7]: - #for word in range(docs_per_word.shape[0]): - #for word in [docs_per_word.shape[0]-1 ]: - c_start = cluster_limits[word] - c_end = cluster_limits[word + 1] - assert c_end >= c_start - - start = indptr[word] - end = indptr[word + 1] - ids_in_word = {id for id in docs_per_word.indices[start:end]} - - cluster_base = -1 - for pos, cluster in enumerate(clusters[c_start: c_end]): - if cluster_base == -1: - cluster_base = cluster - else: - assert cluster != cluster_base - cluster_base = cluster - for id in indices[limits[c_start + pos]: limits[c_start + pos + 1]]: - assert from_id_to_map[id] == cluster - assert id in ids_in_word - ids_in_word.remove(id) - assert len(ids_in_word) == 0 # we should have covered all the ids in the word with the clusters - - -def spot_check_filter_exp(docs_per_word, index, indices, limits): - print('running spot check') - - - inverted_lists = find_invlists(index) - - from_id_to_map = dict() - for i in range(inverted_lists.nlist): - list_ids, _ = get_invlist(inverted_lists, i) - for id in list_ids: - from_id_to_map[id] = i - - indptr = docs_per_word.indptr - - nprobes = inverted_lists.nlist - - ## lets' run some spot check - for word in [0, 5000, 12124, 151123, 198000]: - #for word in range(docs_per_word.shape[0]): - #for word in [docs_per_word.shape[0]-1 ]: - local_ids_to_cluster = dict() - #print(limits[nprobes * word: nprobes * word + nprobes]) - for cluster in range(nprobes): - c_start = limits[word * nprobes + cluster] - c_end = limits[word * nprobes + cluster+1] - - if c_end >=0 and c_start >=0 and c_end > c_start: - for id in indices[c_start: c_end]: - local_ids_to_cluster[id] = cluster - - - - start = indptr[word] - end = indptr[word + 1] - ids_in_word = 
{id for id in docs_per_word.indices[start:end]} - print(len(ids_in_word), len(local_ids_to_cluster)) - assert len(ids_in_word) == len(local_ids_to_cluster) - for id in ids_in_word: - cluster_found = from_id_to_map[id] - assert cluster_found == local_ids_to_cluster[id] - print('done checking word ', word) - - print('done spot check') - - -def find_max_interval(limits): - - out = -1 - for i in range(len(limits)-1): - delta = limits[i+1] - limits[i] - if delta > out: - out = delta - return out - - -def prepare_filter_by_cluster(docs_per_word, index): - print('creating filter cluster') - inverted_lists = find_invlists(index) - from_id_to_map = dict() - from_id_to_pos = dict() - for i in range(inverted_lists.nlist): - list_ids, _ = get_invlist(inverted_lists, i) - for pos, id in enumerate(list_ids): - #print('list: ', i, "id: ", id, "pos: ",pos) - from_id_to_map[id] = i - from_id_to_pos[id] = pos - print('loaded the mapping with {} entries'.format(len(from_id_to_map))) - - ## reorganize the docs per word - # - cluster_limits = [0] - clusters = list() - limits = list() - id_position_in_cluster = list() - - indices = np.array(docs_per_word.indices) - indptr = docs_per_word.indptr - for word in range(docs_per_word.shape[0]): - start = indptr[word] - end = indptr[word + 1] - if word % 10000 == 0: - print('processed {} words'.format(word)) - array_ind_cluster = [(id, from_id_to_map[id]) for id in indices[start:end]] - array_ind_cluster.sort(key=lambda x: x[1]) - - if len(array_ind_cluster) == 0: - pass - local_clusters = [] - local_limits = [] - current_cluster = -1 - for pos, arr in enumerate(array_ind_cluster): - id, cluster = arr - if current_cluster == -1 or cluster != current_cluster: - current_cluster = cluster - local_clusters.append(cluster) - local_limits.append(start + pos) - indices[start + pos] = id - id_position_in_cluster.append(from_id_to_pos[id]) - - clusters.extend(local_clusters) - limits.extend(local_limits) - new_cluster_limit = len(local_clusters) + cluster_limits[-1] - cluster_limits.append( new_cluster_limit) - limits.append(len(indices)) - - clusters = np.array(clusters, dtype=np.int16) - limits = np.array(limits, dtype=np.int32) - cluster_limits = np.array(cluster_limits, dtype=np.int32) - id_position_in_cluster = np.array(id_position_in_cluster, dtype=np.int32) - - return indices, limits, clusters, cluster_limits, id_position_in_cluster - - -def prepare_filter_by_cluster_exp(docs_per_word, index): - print('creating filter cluster expanded') - inverted_lists = find_invlists(index) - from_id_to_map = dict() - from_id_to_pos = dict() - - nprobes = inverted_lists.nlist - for i in range(inverted_lists.nlist): - list_ids, _ = get_invlist(inverted_lists, i) - for pos, id in enumerate(list_ids): - #print('list: ', i, "id: ", id, "pos: ",pos) - from_id_to_map[id] = i - from_id_to_pos[id] = pos - print('loaded the mapping with {} entries'.format(len(from_id_to_map))) - - ## reorganize the docs per word - # - - limits = -np.ones( (docs_per_word.shape[0] * nprobes + 1,), dtype=np.int32) - id_position_in_cluster = list() - - indices = np.array(docs_per_word.indices) - indptr = docs_per_word.indptr - for word in range(docs_per_word.shape[0]): - start = indptr[word] - end = indptr[word + 1] - if word % 10000 == 0: - print('processed {} words'.format(word)) - array_ind_cluster = [(id, from_id_to_map[id]) for id in indices[start:end]] - array_ind_cluster.sort(key=lambda x: x[1]) - - - - local_limits = [] - current_cluster = -1 - - for pos, arr in enumerate(array_ind_cluster): - id, cluster 
= arr - if current_cluster == -1 or cluster != current_cluster: - - if current_cluster != -1: - limits[word * nprobes + current_cluster + 1] = start + pos - - - current_cluster = cluster - local_limits.append(start + pos) - - limits[word * nprobes + current_cluster] = start + pos - - indices[start + pos] = id - id_position_in_cluster.append(from_id_to_pos[id]) - - limits[word * nprobes + current_cluster + 1] = start + len(array_ind_cluster) - - - limits = np.array(limits, dtype=np.int32) - - id_position_in_cluster = np.array(id_position_in_cluster, dtype=np.int32) - - return indices, limits, id_position_in_cluster, nprobes - - -class FAISS(BaseFilterANN): - - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - - self.train_size = index_params.get('train_size', None) - self.indexkey = index_params.get("indexkey", "IVF32768,SQ8") - self.metadata_threshold = 1e-3 - self.nt = index_params.get("threads", 1) - self.type = index_params.get("type", "intersect") - - self.clustet_dist = [] - - - def fit(self, dataset): - faiss.omp_set_num_threads(self.nt) - ds = DATASETS[dataset]() - - print('the size of the index', ds.d) - index = faiss.index_factory(ds.d, self.indexkey) - xb = ds.get_dataset() - - print("train") - print('train_size', self.train_size) - if self.train_size is not None: - x_train = xb[:self.train_size] - else: - x_train = xb - index.train(x_train) - print("populate") - - bs = 1024 - for i0 in range(0, ds.nb, bs): - index.add(xb[i0: i0 + bs]) - - - print('ids added') - self.index = index - self.nb = ds.nb - self.xb = xb - self.ps = faiss.ParameterSpace() - self.ps.initialize(self.index) - print("store", self.index_name(dataset)) - faiss.write_index(index, self.index_name(dataset)) - - if ds.search_type() == "knn_filtered": - words_per_doc = ds.get_dataset_metadata() - words_per_doc.sort_indices() - self.docs_per_word = words_per_doc.T.tocsr() - self.docs_per_word.sort_indices() - self.ndoc_per_word = self.docs_per_word.indptr[1:] - self.docs_per_word.indptr[:-1] - self.freq_per_word = self.ndoc_per_word / self.nb - del words_per_doc - - if self.type == 'exp': - self.indices, self.limits, self.id_position_in_cluster, self.total_clusters = prepare_filter_by_cluster_exp( - self.docs_per_word, self.index) - pickle.dump( - (self.indices, self.limits, self.id_position_in_cluster, self.total_clusters ), - open(self.cluster_sig_name(dataset), "wb"), -1) - #spot_check_filter_exp(self.docs_per_word, self.index, self.indices, self.limits) - else: - self.indices, self.limits, self.clusters, self.cluster_limits, self.id_position_in_cluster = prepare_filter_by_cluster(self.docs_per_word, self.index) - print('dumping cluster map') - pickle.dump((self.indices, self.limits, self.clusters, self.cluster_limits, self.id_position_in_cluster), open(self.cluster_sig_name(dataset), "wb"), -1) - #spot_check_filter(self.docs_per_word, self.index, self.indices, self.limits, self.clusters, - # self.cluster_limits) - - self.max_range = find_max_interval(self.limits) - print('the max range is {}'.format(self.max_range)) - - def index_name(self, name): - - if self.type == 'exp': - return f"data/{name}.{self.indexkey}_exp_wm.faissindex" - else: - return f"data/{name}.{self.indexkey}_wm.faissindex" - - - def cluster_sig_name(self, name): - if self.type == 'exp': - return f"data/{name}.{self.indexkey}_exp_cluster_wm.pickle" - return f"data/{name}.{self.indexkey}_cluster_wm.pickle" - - - def get_probes(self, freq, a, b, min_prob = 4, max_prob=256): - #print("b: ", b) 
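- # heuristic probe count: grows with -log10(freq) (so, for a > 0, rarer words get more probes), clamped to [min_prob, max_prob]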
- probes = int( pow(2, - a * log10(freq )+ b)) - probes = max(min_prob, probes) - probes = min(max_prob, probes) - return probes - - def load_index(self, dataset): - """ - Load the index for dataset. Returns False if index - is not available, True otherwise. - - Checking the index usually involves the dataset name - and the index build paramters passed during construction. - """ - if not os.path.exists(self.index_name(dataset)): - if 'url' not in self._index_params: - return False - - print('Downloading index in background. This can take a while.') - download_accelerated(self._index_params['url'], self.index_name(dataset), quiet=True) - - print("Loading index") - ds = DATASETS[dataset]() - self.nb = ds.nb - self.xb = ds.get_dataset() - - if ds.search_type() == "knn_filtered": - words_per_doc = ds.get_dataset_metadata() - words_per_doc.sort_indices() - self.docs_per_word = words_per_doc.T.tocsr() - self.docs_per_word.sort_indices() - self.ndoc_per_word = self.docs_per_word.indptr[1:] - self.docs_per_word.indptr[:-1] - self.freq_per_word = self.ndoc_per_word / self.nb - del words_per_doc - - self.index = faiss.read_index(self.index_name(dataset)) - - if ds.search_type() == "knn_filtered": - if os.path.isfile( self.cluster_sig_name(dataset)): - print('loading cluster file') - if self.type == 'exp': - self.indices, self.limits, self.id_position_in_cluster, self.total_clusters = pickle.load( - open(self.cluster_sig_name(dataset), "rb")) - #spot_check_filter_exp(self.docs_per_word, self.index, self.indices, self.limits) - - else: - self.indices, self.limits, self.clusters, self.cluster_limits, self.id_position_in_cluster = pickle.load(open(self.cluster_sig_name(dataset), "rb")) - else: - print('cluster file not found') - if self.type == 'exp': - self.indices, self.limits, self.id_position_in_cluster, self.total_clusters = prepare_filter_by_cluster_exp( - self.docs_per_word, self.index) - pickle.dump( - (self.indices, self.limits, self.id_position_in_cluster, self.total_clusters ), - open(self.cluster_sig_name(dataset), "wb"), -1) - #spot_check_filter_exp(self.docs_per_word, self.index, self.indices, self.limits) - - else: - self.indices, self.limits, self.clusters, self.cluster_limits, self.id_position_in_cluster = prepare_filter_by_cluster(self.docs_per_word, self.index) - pickle.dump((self.indices, self.limits, self.clusters, self.cluster_limits, self.id_position_in_cluster), open(self.cluster_sig_name(dataset), "wb"), -1) - - #spot_check_filter(self.docs_per_word, self.index, self.indices, self.limits, self.clusters, self.cluster_limits) - - self.max_range = find_max_interval(self.limits) - print('the max range is {}'.format(self.max_range)) - - self.ps = faiss.ParameterSpace() - self.ps.initialize(self.index) - - - # delete not necessary data - del self.xb - del ds - if self.type == "exp" or self.type == 'direct': - print(" deleting indices") - del self.indices - #del self.docs_per_word - return True - - def index_files_to_store(self, dataset): - """ - Specify a triplet with the local directory path of index files, - the common prefix name of index component(s) and a list of - index components that need to be uploaded to (after build) - or downloaded from (for search) cloud storage. 
- - For local directory path under docker environment, please use - a directory under - data/indices/track(T1 or T2)/algo.__str__()/DATASETS[dataset]().short_name() - """ - raise NotImplementedError() - - def query(self, X, k): - nq = X.shape[0] - self.I = -np.ones((nq, k), dtype='int32') - bs = 1024 - - try: - print('k_factor', self.index.k_factor) - self.index.k_factor = self.k_factor - except Exception as e: - print(e) - pass - for i0 in range(0, nq, bs): - _, self.I[i0:i0+bs] = self.index.search(X[i0:i0+bs], k) - - - - def filtered_query(self, X, filter, k): - - # try: - # self.index.k_factor = self.k_factor - # except Exception as e: - # pass - - nq = X.shape[0] - self.I = -np.ones((nq, k), dtype='int32') - - meta_q = filter - selector_by_thread = dict() - - def process_one_row(q): - faiss.omp_set_num_threads(1) - thread = current_thread() - - qwords = csr_get_row_indices(meta_q, q) - w1 = qwords[0] - if qwords.size == 2: - w2 = qwords[1] - else: - w2 = -1 - - if thread not in selector_by_thread: - - sel = make_id_selector_cluster_aware_direct(self.id_position_in_cluster, self.limits, self.clusters, - self.cluster_limits, self.max_range) - # # IVF first, filtered search - # if self.type == 'simple': - # sel = make_id_selector_ivf_two(self.docs_per_word) - # elif self.type == "aware": - # sel = make_id_selector_cluster_aware(self.indices, self.limits, self.clusters, self.cluster_limits) - # elif self.type == 'intersect': - # sel = make_id_selector_cluster_aware_intersect(self.indices, self.limits, self.clusters, self.cluster_limits, self.max_range) - # elif self.type == 'direct': - # sel = make_id_selector_cluster_aware_direct(self.id_position_in_cluster, self.limits, self.clusters, - # self.cluster_limits, self.max_range) - # elif self.type == 'exp': - # sel = make_id_selector_cluster_aware_direct_exp(self.id_position_in_cluster, self.limits, self.total_clusters, self.max_range) - # else: - # raise Exception('unknown type ', self.type) - selector_by_thread[thread] = sel - else: - sel = selector_by_thread.get(thread) - - sel.set_words(int(w1), int(w2)) - - params = faiss.SearchParametersIVF(sel=sel, nprobe=self.nprobe, max_codes=self.max_codes, selector_probe_limit=self.selector_probe_limit) - _, Ii = self.index.search( X[q:q+1], k, params=params) - Ii = Ii.ravel() - self.I[q] = Ii - - if self.nt <= 1: - for q in range(nq): - process_one_row(q) - else: - faiss.omp_set_num_threads(self.nt) - - pool = ThreadPool(self.nt) - list(pool.map(process_one_row, range(nq))) - - def get_results(self): - return self.I - - def set_query_arguments(self, query_args): - #faiss.cvar.indexIVF_stats.reset() - #faiss.cvar.IDSelectorMy_Stats.reset() - if "nprobe" in query_args: - self.nprobe = query_args['nprobe'] - self.ps.set_index_parameters(self.index, f"nprobe={query_args['nprobe']}") - self.qas = query_args - else: - self.nprobe = 1 - if "max_codes" in query_args: - self.max_codes = query_args["max_codes"] - self.ps.set_index_parameters(self.index, f"max_codes={query_args['max_codes']}") - self.qas = query_args - else: - self.max_codes = -1 - if "selector_probe_limit" in query_args: - self.selector_probe_limit = query_args['selector_probe_limit'] - self.ps.set_index_parameters(self.index, f"selector_probe_limit={query_args['selector_probe_limit']}") - self.qas = query_args - else: - self.selector_probe_limit = 0 - - if "k_factor" in query_args: - self.k_factor = query_args['k_factor'] - self.qas = query_args - - - - def __str__(self): - return f'Faiss({self.indexkey,self.type, self.qas})' - - \ No 
newline at end of file diff --git a/neurips23/ood/base.py b/neurips23/ood/base.py deleted file mode 100644 index f5df16cd..00000000 --- a/neurips23/ood/base.py +++ /dev/null @@ -1,5 +0,0 @@ -from benchmark.algorithms.base import BaseANN - -class BaseOODANN(BaseANN): - def track(self): - return "ood" \ No newline at end of file diff --git a/neurips23/ood/diskann/Dockerfile b/neurips23/ood/diskann/Dockerfile deleted file mode 100644 index a61f7e86..00000000 --- a/neurips23/ood/diskann/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt install -y software-properties-common -RUN add-apt-repository -y ppa:git-core/ppa -RUN apt update -RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10 - -RUN git clone https://github.com/microsoft/DiskANN.git --branch 0.5.0.rc3.post1 -WORKDIR /home/app/DiskANN -RUN pip3 install virtualenv build -RUN python3 -m build -RUN pip install dist/diskannpy-0.5.0rc3.post1-cp310-cp310-linux_x86_64.whl -WORKDIR /home/app diff --git a/neurips23/ood/diskann/config.yaml b/neurips23/ood/diskann/config.yaml deleted file mode 100644 index a996f916..00000000 --- a/neurips23/ood/diskann/config.yaml +++ /dev/null @@ -1,27 +0,0 @@ -random-xs: - diskann: - docker-tag: neurips23-ood-diskann - module: neurips23.ood.diskann.diskann-in-mem - constructor: diskann - base-args: ["@metric"] - run-groups: - base: - args: | - [{"R":32, "L":50, "buildthreads":32}] - query-args: | - [{"Ls":50, "T":8}] -text2image-10M: - diskann: - docker-tag: neurips23-ood-diskann - module: neurips23.ood.diskann.diskann-in-mem - constructor: diskann - base-args: ["@metric"] - run-groups: - base: - args: | - [{"R":64, "L":500, "buildthreads":32}] - query-args: | - [{"Ls":30, "T":8}, - {"Ls":50, "T":8}, - {"Ls":70, "T":8}, - {"Ls":100, "T":8}] diff --git a/neurips23/ood/diskann/diskann-in-mem.py b/neurips23/ood/diskann/diskann-in-mem.py deleted file mode 100755 index 5a387259..00000000 --- a/neurips23/ood/diskann/diskann-in-mem.py +++ /dev/null @@ -1,172 +0,0 @@ -from __future__ import absolute_import -import psutil -import os -import time -import numpy as np -import diskannpy - -from neurips23.ood.base import BaseOODANN -from benchmark.datasets import DATASETS, download_accelerated - -class diskann(BaseOODANN): - def __init__(self, metric, index_params): - self.name = "diskann" - if (index_params.get("R")==None): - print("Error: missing parameter R") - return - if (index_params.get("L")==None): - print("Error: missing parameter L") - return - self._index_params = index_params - self._metric = metric - - self.R = index_params.get("R") - self.L = index_params.get("L") - - def index_name(self): - return f"R{self.R}_L{self.L}" - - def create_index_dir(self, dataset): - index_dir = os.path.join(os.getcwd(), "data", "indices", "ood") - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, 'diskann') - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, dataset.short_name()) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, self.index_name()) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - return index_dir - - def translate_dist_fn(self, metric): - if metric == 'euclidean': - return 'l2' - elif metric == 'ip': - return 'mips' - else: - raise Exception('Invalid metric') - - def translate_dtype(self, dtype:str): - 
if dtype == 'uint8': - return np.uint8 - elif dtype == 'int8': - return np.int8 - elif dtype == 'float32': - return np.float32 - else: - raise Exception('Invalid data type') - - def fit(self, dataset): - """ - Build the index for the data points given in dataset name. - """ - - ds = DATASETS[dataset]() - d = ds.d - - buildthreads = self._index_params.get("buildthreads", -1) - print(buildthreads) - if buildthreads == -1: - buildthreads = 0 - - index_dir = self.create_index_dir(ds) - - if hasattr(self, 'index'): - print('Index object exists already') - return - - print(ds.get_dataset_fn()) - - start = time.time() - diskannpy.build_memory_index( - data = ds.get_dataset_fn(), - distance_metric = self.translate_dist_fn(ds.distance()), - vector_dtype = self.translate_dtype(ds.dtype), - index_directory = index_dir, - index_prefix = self.index_name(), - complexity=self.L, - graph_degree=self.R, - num_threads = buildthreads, - alpha=1.2, - use_pq_build=False, - num_pq_bytes=0, #irrelevant given use_pq_build=False - use_opq=False - ) - end = time.time() - print("DiskANN index built in %.3f s" % (end - start)) - - - print('Loading index..') - self.index = diskannpy.StaticMemoryIndex( - distance_metric = self.translate_dist_fn(ds.distance()), - vector_dtype = self.translate_dtype(ds.dtype), - index_directory = index_dir, - index_prefix = self.index_name(), - num_threads = 64, #to allocate scratch space for up to 64 search threads - initial_search_complexity = 100 - ) - print('Index ready for search') - - def get_index_components(self, dataset): - index_components = ['', '.data'] - ds = DATASETS[dataset]() - if ds.distance() == "ip": - index_components = index_components + [] - return index_components - - def index_files_to_store(self, dataset): - return [self.create_index_dir(DATASETS[dataset]()), self.index_name(), self.get_index_components(dataset)] - - def load_index(self, dataset): - """ - Load the index for dataset. Returns False if index - is not available, True otherwise. - - Checking the index usually involves the dataset name - and the index build paramters passed during construction. - """ - ds = DATASETS[dataset]() - - index_dir = self.create_index_dir(ds) - if not (os.path.exists(index_dir)) and 'url' not in self._index_params: - return False - - index_path = os.path.join(index_dir, self.index_name()) - index_components = self.get_index_components(dataset) - - for component in index_components: - index_file = index_path + component - if not (os.path.exists(index_file)): - if 'url' in self._index_params: - index_file_source = self._index_params['url'] + '/' + self.index_name() + component - print(f"Downloading index in background. 
This can take a while.") - download_accelerated(index_file_source, index_file, quiet=True) - else: - return False - - print("Loading index") - - self.index = diskannpy.StaticMemoryIndex( - distance_metric = self.translate_dist_fn(ds.distance()), - vector_dtype = self.translate_dtype(ds.dtype), - index_directory = index_dir, - index_prefix = self.index_name(), - num_threads = 64, #to allocate scratch space for up to 64 search threads - initial_search_complexity = 100 - ) - print ("Load index success.") - return True - - def query(self, X, k): - """Carry out a batch query for k-NN of query set X.""" - nq, dim = (np.shape(X)) - self.res, self.query_dists = self.index.batch_search( - X, k, self.Ls, self.search_threads) - - - def set_query_arguments(self, query_args): - self._query_args = query_args - self.Ls = 0 if query_args.get("Ls") == None else query_args.get("Ls") - self.search_threads = self._query_args.get("T") - - def __str__(self): - return f'diskann({self.index_name(), self._query_args})' \ No newline at end of file diff --git a/neurips23/ood/puck-fizz/Dockerfile b/neurips23/ood/puck-fizz/Dockerfile deleted file mode 100755 index 1695b160..00000000 --- a/neurips23/ood/puck-fizz/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt-get update -RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip -#swig -RUN apt-get update && apt-get install -y swig cmake -RUN pip3 install pybind11 numpy -RUN cat /etc/ld.so.conf -RUN ls /etc/ld.so.conf.d/ -##cmake -# COPY cmake-3.22.0-linux-x86_64.sh . -RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh -RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake -ENV PATH /home/app/cmake/bin:$PATH - -#mkl -# COPY l_onemkl_p_2023.2.0.49497_offline.sh . -RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh -RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s - -RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf -RUN ldconfig -RUN touch /etc/profile.d/intel.sh -RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh -RUN . /etc/profile.d/intel.sh - -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" -#RUN git config --global http.sslVerify false - -RUN git clone -b ood-try https://github.com/baidu/puck.git -# COPY puck-ood-feature.tar.gz . -# RUN tar zxvf puck-ood-feature.tar.gz -RUN cd puck && . 
/etc/profile.d/intel.sh && python3 setup.py install -RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/ood/puck-fizz/config.yaml b/neurips23/ood/puck-fizz/config.yaml deleted file mode 100755 index 018d49d0..00000000 --- a/neurips23/ood/puck-fizz/config.yaml +++ /dev/null @@ -1,33 +0,0 @@ -random-xs: - puck: - docker-tag: neurips23-ood-puck-fizz - module: neurips23.ood.puck-fizz.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] - query-args: | - [ - {"search_coarse_count":50, "tinker_search_range": 100}, - {"search_coarse_count":50, "tinker_search_range": 200}, - {"search_coarse_count":50, "tinker_search_range": 300} - ] - - -text2image-10M: - puck-fizz: - docker-tag: neurips23-ood-puck-fizz - module: neurips23.ood.puck-fizz.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] - query-args: | - [ - {"search_coarse_count":10, "tinker_search_range": 160}, - {"search_coarse_count":10, "tinker_search_range": 170}, - {"search_coarse_count":10, "tinker_search_range": 180}, - {"search_coarse_count":10, "tinker_search_range": 190} - ] \ No newline at end of file diff --git a/neurips23/ood/puck-fizz/puck.py b/neurips23/ood/puck-fizz/puck.py deleted file mode 100755 index 347075bd..00000000 --- a/neurips23/ood/puck-fizz/puck.py +++ /dev/null @@ -1,217 +0,0 @@ -# !/usr/bin/env python3 -#-*- coding:utf-8 -*- -################################################################################ -# -# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved -# -################################################################################ -""" -@file: puck_inmem.py -@author: heaoxiang(heaoxiang@baidu.com) -@date: 2023-10-29 13:44 -@brief: -""" -import ctypes - -from neurips23.ood.base import BaseOODANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated,xbin_mmap,xbin_write -# from neurips23.ood.puck.puck_lib import py_puck_api -from puck import py_puck_api -import multiprocessing.pool -import multiprocessing -import gc -import os -import numpy as np -import time -import math -import struct - -CPU_LIMIT = multiprocessing.cpu_count() -swig_ptr = py_puck_api.swig_ptr -class Puck(BaseOODANN): - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - self.indexkey = index_params.get("indexkey", "NA") - - self.index = py_puck_api.PySearcher() - self.topk = 10 - self.n = 0 - self.build_memory_usage = -1 - - def track(self): - #T1 means in memory - return "T1 for 10M & 100M" - - - def check_feature(self, dataset): - #更新gflags - ds = DATASETS[dataset]() - d = ds.d - #特征纬度 - py_puck_api.update_gflag('feature_dim', "%d"%(d)) - - #根据距离计算方式调整 - whether_norm = False - py_puck_api.update_gflag('whether_norm', 'false') - ip2cos = 0 - if ds.distance() == "angular": - whether_norm = True - py_puck_api.update_gflag('whether_norm', 'true') - elif ds.distance() == "ip": - ip2cos = 1 - py_puck_api.update_gflag('ip2cos', '%d'%(ip2cos)) - - self.init_indexkey() - - #测试 - py_puck_api.update_gflag('kmeans_iterations_count',"1") - - #索引存储目录 - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - if not os.path.exists(self.index_name(dataset)): - index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - - #训练用目录 - if not 
os.path.exists('mid-data'): - os.mkdir('mid-data') - - #格式化文件数据,将来可不要,后续可修改训练接口 - all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') - - add_part=100000 - i0 = 0 - t0 = time.time() - for xblock in ds.get_dataset_iterator(bs=add_part): - i1 = i0 + len(xblock) - i0 = i1 - for x in xblock: - feat = x - if(whether_norm): - feat = feat / np.sqrt(np.dot(feat, feat)) - elif(ip2cos > 0): - norm = np.dot(feat, feat) - if norm > 1.0: - print("not support, please contact yinjie06") - return False - feat = np.append(feat, math.sqrt(1.0 - norm)) - - buf = struct.pack('i', len(feat)) - all_feature_file.write(buf) - buf = struct.pack('f' * len(feat), *feat) - all_feature_file.write(buf) - print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) - all_feature_file.close() - - print(" init help query ") - filename = os.path.join(ds.basedir, ds.qs_fn) - if os.path.exists(filename): - read_x = xbin_mmap(filename, dtype=ds.dtype) - write_x = np.append(read_x, np.zeros((read_x.shape[0], 1)), axis=1) - print("help query shape nrows = %d , ncols = %d "%(write_x.shape[0],write_x.shape[1])) - xbin_write(write_x,"%s/help_query.feat.bin"%(self.index_name(dataset))) - - return True - - def fit(self, dataset): - self.check_feature(dataset) - ds = DATASETS[dataset]() - #训练数据采样 - py_puck_api.update_gflag('train_points_count', "5000000") - py_puck_api.update_gflag('pq_train_points_count', "500000") - print(self.index_name(dataset)) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - - self.index.build(ds.nb) - self.load_index(dataset) - - #index = py_puck_api.PySearcher() - #p = multiprocessing.Process(group=None,target=index.build,args=(ds.nb,)) - #self.index.build(ds.nb) - #p.start() - #p.join() - - def init_indexkey(self): - #一级聚类中心 - if "C" in self._index_params: - py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) - self.indexkey = "C%s"%(self._index_params['C']) - #二级聚类中心 - if "F" in self._index_params: - py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) - self.indexkey += "_F%s"%(self._index_params['F']) - #filter - if "FN" in self._index_params: - py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) - self.indexkey += "_FN%s"%(self._index_params['FN']) - #量化 - if "N" in self._index_params: - if int(self._index_params['N']) > 1: - py_puck_api.update_gflag('whether_pq', 'true') - py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) - self.indexkey += "_N%s"%(self._index_params['N']) - else: - py_puck_api.update_gflag('whether_pq', 'false') - self.indexkey += "_Flat" - - if "tinker_neighborhood" in self._index_params: - py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) - self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) - if "tinker_construction" in self._index_params: - py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) - self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) - if "index_type" in self._index_params: - py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) - if "radius_rate" in self._index_params: - py_puck_api.update_gflag('radius_rate', "%f"%(self._index_params['radius_rate'])) - self.indexkey += "_RadiusRate%s"%(self._index_params['radius_rate']) - - - def index_name(self, name): - return f"data/{name}.{self.indexkey}.puckindex" - - def index_tag_name(self, name): - return 
f"{name}.{self.indexkey}.puckindex" - - def load_index(self, dataset): - print("Loading index") - self.init_indexkey() - ds = DATASETS[dataset]() - self.topk = ds.default_count() - print("self.topk=%d"%self.topk) - py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - py_puck_api.update_gflag('context_initial_pool_size', "%d"%(2 * CPU_LIMIT)) - py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) - print(self.index_name(dataset)) - ret = self.index.init() - print("ret = ",ret) - if ret != 0: - return False - self.index.show() - self.n = ds.nq - return True - - def set_query_arguments(self, query_args): - for key, value in query_args.items(): - py_puck_api.update_gflag(key, "%s"%value) - #query_args_list = query_args.strip().split(',') - #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) - self.index.init() - #topk是作为检索参数传入puck - self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) - self.qas = query_args - - def query(self, X, topK): - n, d = X.shape - - self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) - #print(self.res[0]) - # print(self.res[1]) - def get_results(self): - return self.res[1] - - def __str__(self): - return f'Puck{self.indexkey, self.qas}' \ No newline at end of file diff --git a/neurips23/ood/puck/Dockerfile b/neurips23/ood/puck/Dockerfile deleted file mode 100755 index ff088505..00000000 --- a/neurips23/ood/puck/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt-get update -RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip -#swig -RUN apt-get update && apt-get install -y swig cmake -RUN pip3 install pybind11 numpy -RUN cat /etc/ld.so.conf -RUN ls /etc/ld.so.conf.d/ -##cmake -RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh -RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake -ENV PATH /home/app/cmake/bin:$PATH - -#mkl -RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh -RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s - -RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf -RUN ldconfig -RUN touch /etc/profile.d/intel.sh -RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh -RUN . /etc/profile.d/intel.sh - -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" -#RUN git config --global http.sslVerify false - -RUN git clone -b ood https://github.com/baidu/puck.git -RUN cd puck && . 
/etc/profile.d/intel.sh && python3 setup.py install -RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/ood/puck/config.yaml b/neurips23/ood/puck/config.yaml deleted file mode 100755 index 933f963a..00000000 --- a/neurips23/ood/puck/config.yaml +++ /dev/null @@ -1,34 +0,0 @@ -random-xs: - puck: - docker-tag: neurips23-ood-puck - module: neurips23.ood.puck.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] - query-args: | - [ - {"search_coarse_count":50, "tinker_search_range": 100}, - {"search_coarse_count":50, "tinker_search_range": 200}, - {"search_coarse_count":50, "tinker_search_range": 300} - ] - - -text2image-10M: - puck: - docker-tag: neurips23-ood-puck - module: neurips23.ood.puck.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] - query-args: | - [ - {"search_coarse_count":10, "tinker_search_range": 190}, - {"search_coarse_count":10, "tinker_search_range": 160}, - {"search_coarse_count":10, "tinker_search_range": 165}, - {"search_coarse_count":10, "tinker_search_range": 170}, - {"search_coarse_count":10, "tinker_search_range": 175} - ] diff --git a/neurips23/ood/puck/puck.py b/neurips23/ood/puck/puck.py deleted file mode 100755 index 6c407800..00000000 --- a/neurips23/ood/puck/puck.py +++ /dev/null @@ -1,216 +0,0 @@ -# !/usr/bin/env python3 -#-*- coding:utf-8 -*- -################################################################################ -# -# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved -# -################################################################################ -""" -@file: puck_inmem.py -@author: yinjie06(yinjie06@baidu.com) -@date: 2021-10-06 13:44 -@brief: -""" -import ctypes - -from neurips23.ood.base import BaseOODANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated,xbin_mmap,xbin_write -from puck import py_puck_api -import multiprocessing.pool -import multiprocessing -import gc -import os -import numpy as np -import time -import math -import struct - -CPU_LIMIT = multiprocessing.cpu_count() -swig_ptr = py_puck_api.swig_ptr -class Puck(BaseOODANN): - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - self.indexkey = index_params.get("indexkey", "NA") - - self.index = py_puck_api.PySearcher() - self.topk = 10 - self.n = 0 - self.build_memory_usage = -1 - - def track(self): - #T1 means in memory - return "T1 for 10M & 100M" - - - def check_feature(self, dataset): - #更新gflags - ds = DATASETS[dataset]() - d = ds.d - #特征纬度 - py_puck_api.update_gflag('feature_dim', "%d"%(d)) - - #根据距离计算方式调整 - whether_norm = False - py_puck_api.update_gflag('whether_norm', 'false') - ip2cos = 0 - if ds.distance() == "angular": - whether_norm = True - py_puck_api.update_gflag('whether_norm', 'true') - elif ds.distance() == "ip": - ip2cos = 1 - py_puck_api.update_gflag('ip2cos', '%d'%(ip2cos)) - - self.init_indexkey() - - #测试 - py_puck_api.update_gflag('kmeans_iterations_count',"1") - - #索引存储目录 - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - if not os.path.exists(self.index_name(dataset)): - index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - - #训练用目录 - if not os.path.exists('mid-data'): - os.mkdir('mid-data') - - #格式化文件数据,将来可不要,后续可修改训练接口 - 
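The `whether_norm` / `ip2cos` flags set in `check_feature` above control how Puck maps the dataset's metric onto its internal cosine/L2 machinery: angular data is L2-normalized, while inner-product data gets one extra coordinate, sqrt(1 - ||x||^2), appended in the ingestion loop that follows, and query vectors are padded with a zero so inner products are unchanged (the `help_query` handling elsewhere in this diff does exactly that). This turns maximum inner product into nearest-neighbor search over unit-norm vectors, which is why the code insists on ||x||^2 <= 1. A small illustration of that reduction, on synthetic data:

```python
# Sketch only: the "ip2cos" augmentation applied by the ingestion loop.
# Base vectors must satisfy ||x||^2 <= 1 (the loop aborts otherwise).
import numpy as np

rng = np.random.default_rng(0)
xb = rng.normal(size=(1000, 16))
norms = rng.uniform(0.2, 0.9, size=(1000, 1))
xb = xb / np.linalg.norm(xb, axis=1, keepdims=True) * norms   # ||x|| <= 1
q = rng.normal(size=16)

# Augment: base vectors get sqrt(1 - ||x||^2), the query gets a zero coordinate.
xb_aug = np.hstack([xb, np.sqrt(1.0 - (xb ** 2).sum(1, keepdims=True))])
q_aug = np.append(q, 0.0)

ip_order = np.argsort(-xb @ q)                          # max inner product
l2_order = np.argsort(((xb_aug - q_aug) ** 2).sum(1))   # min L2 on augmented vectors
assert (ip_order[:10] == l2_order[:10]).all()           # identical ranking
```

The identity behind it: ||q' - x'||^2 = ||q||^2 + 1 - 2<q, x> for the augmented pair, so minimizing L2 distance on the augmented vectors is the same as maximizing the original inner product.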
all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') - - add_part=100000 - i0 = 0 - t0 = time.time() - for xblock in ds.get_dataset_iterator(bs=add_part): - i1 = i0 + len(xblock) - i0 = i1 - for x in xblock: - feat = x - if(whether_norm): - feat = feat / np.sqrt(np.dot(feat, feat)) - elif(ip2cos > 0): - norm = np.dot(feat, feat) - if norm > 1.0: - print("not support, please contact yinjie06") - return False - feat = np.append(feat, math.sqrt(1.0 - norm)) - - buf = struct.pack('i', len(feat)) - all_feature_file.write(buf) - buf = struct.pack('f' * len(feat), *feat) - all_feature_file.write(buf) - print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) - all_feature_file.close() - - # print(" init help query ") - # filename = os.path.join(ds.basedir, ds.qs_fn) - # read_x = xbin_mmap(filename, dtype=ds.dtype) - - # write_x = np.append(read_x, np.zeros((read_x.shape[0], 1)), axis=1) - # print("help query shape nrows = %d , ncols = %d "%(write_x.shape[0],write_x.shape[1])) - # xbin_write(write_x,"%s/help_query.feat.bin"%(self.index_name(dataset))) - - return True - - def fit(self, dataset): - self.check_feature(dataset) - ds = DATASETS[dataset]() - #训练数据采样 - py_puck_api.update_gflag('train_points_count', "5000000") - py_puck_api.update_gflag('pq_train_points_count', "500000") - print(self.index_name(dataset)) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - - self.index.build(ds.nb) - self.load_index(dataset) - - #index = py_puck_api.PySearcher() - #p = multiprocessing.Process(group=None,target=index.build,args=(ds.nb,)) - #self.index.build(ds.nb) - #p.start() - #p.join() - - def init_indexkey(self): - #一级聚类中心 - if "C" in self._index_params: - py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) - self.indexkey = "C%s"%(self._index_params['C']) - #二级聚类中心 - if "F" in self._index_params: - py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) - self.indexkey += "_F%s"%(self._index_params['F']) - #filter - if "FN" in self._index_params: - py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) - self.indexkey += "_FN%s"%(self._index_params['FN']) - #量化 - if "N" in self._index_params: - if int(self._index_params['N']) > 1: - py_puck_api.update_gflag('whether_pq', 'true') - py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) - self.indexkey += "_N%s"%(self._index_params['N']) - else: - py_puck_api.update_gflag('whether_pq', 'false') - self.indexkey += "_Flat" - - if "tinker_neighborhood" in self._index_params: - py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) - self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) - if "tinker_construction" in self._index_params: - py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) - self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) - if "index_type" in self._index_params: - py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) - if "radius_rate" in self._index_params: - py_puck_api.update_gflag('radius_rate', "%f"%(self._index_params['radius_rate'])) - self.indexkey += "_RadiusRate%s"%(self._index_params['radius_rate']) - - - def index_name(self, name): - return f"data/{name}.{self.indexkey}.puckindex" - - def index_tag_name(self, name): - return f"{name}.{self.indexkey}.puckindex" - - def load_index(self, dataset): - print("Loading index") - 
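The ingestion loop in `check_feature` above serializes every (possibly augmented) vector as a small length-prefixed record: a 4-byte int holding the dimension followed by that many float32 values, so the stored dimension may legitimately differ from the raw dataset when `ip2cos` adds a coordinate. A minimal reader for that layout, useful for spot-checking the `all_data.feat.bin` file the build step consumes (illustrative only; the path pattern follows `index_name` above):

```python
# Sketch only: read back the <int32 dim><float32 x dim> records written above.
import struct
import numpy as np

def iter_feat_bin(path):
    """Yield one numpy vector per record from an all_data.feat.bin-style file."""
    with open(path, "rb") as f:
        while True:
            header = f.read(4)
            if len(header) < 4:
                return                      # clean EOF
            (dim,) = struct.unpack("i", header)
            payload = f.read(4 * dim)
            yield np.frombuffer(payload, dtype=np.float32, count=dim)

# Example usage (hypothetical path, matching the f"data/{name}.{indexkey}.puckindex" layout):
# vecs = list(iter_feat_bin("data/<dataset>.<indexkey>.puckindex/all_data.feat.bin"))
# print(len(vecs), vecs[0].shape)
```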
self.init_indexkey() - ds = DATASETS[dataset]() - self.topk = ds.default_count() - print("self.topk=%d"%self.topk) - py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - py_puck_api.update_gflag('context_initial_pool_size', "%d"%(2 * CPU_LIMIT)) - py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) - print(self.index_name(dataset)) - ret = self.index.init() - print("ret = ",ret) - if ret != 0: - return False - self.index.show() - self.n = ds.nq - return True - - def set_query_arguments(self, query_args): - for key, value in query_args.items(): - py_puck_api.update_gflag(key, "%s"%value) - #query_args_list = query_args.strip().split(',') - #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) - self.index.init() - #topk是作为检索参数传入puck - self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) - self.qas = query_args - - def query(self, X, topK): - n, d = X.shape - - self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) - #print(self.res[0]) - # print(self.res[1]) - def get_results(self): - return self.res[1] - - def __str__(self): - return f'Puck{self.indexkey, self.qas}' \ No newline at end of file diff --git a/neurips23/ood/run.py b/neurips23/ood/run.py deleted file mode 100644 index 8117c989..00000000 --- a/neurips23/ood/run.py +++ /dev/null @@ -1,5 +0,0 @@ -from benchmark.algorithms.base_runner import BaseRunner -import time - -class OODRunner(BaseRunner): - pass \ No newline at end of file diff --git a/neurips23/ood/sustech-ood/Dockerfile b/neurips23/ood/sustech-ood/Dockerfile deleted file mode 100644 index 994cd723..00000000 --- a/neurips23/ood/sustech-ood/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt install -y libssl-dev -RUN wget https://cmake.org/files/v3.23/cmake-3.23.1.tar.gz -RUN tar -zxvf cmake-3.23.1.tar.gz -WORKDIR /home/app/cmake-3.23.1 -RUN ./bootstrap --parallel=8 -RUN make -j4 -RUN make install -WORKDIR /home/app -RUN git clone -b faiss https://github.com/whateveraname/SUSTech-OOD.git --recursive -WORKDIR /home/app/SUSTech-OOD -RUN cmake -DCMAKE_BUILD_TYPE=Release . 
&& make -j4 -WORKDIR /home/app \ No newline at end of file diff --git a/neurips23/ood/sustech-ood/SUSTech-OOD.py b/neurips23/ood/sustech-ood/SUSTech-OOD.py deleted file mode 100755 index 1e407cfa..00000000 --- a/neurips23/ood/sustech-ood/SUSTech-OOD.py +++ /dev/null @@ -1,114 +0,0 @@ -from __future__ import absolute_import -import psutil -import os -import time -import numpy as np -import sys -sys.path.append("/home/app/SUSTech-OOD") -import graphood - -from neurips23.ood.base import BaseOODANN -from benchmark.datasets import DATASETS, download_accelerated - -class IndexGraphOOD(BaseOODANN): - def __init__(self, metric, index_params): - self.name = "graphood" - self._index_params = index_params - self._metric = metric - - self.M = index_params.get("M") - self.ef = index_params.get("ef") - self.cluster_num = index_params.get("cluster_num") - self.hnsw_n = f'M{self.M}_ef{self.ef}.hnsw' - self.ivf_n = f'{self.cluster_num}.ivf' - self.sq_n = f"sq" - - def index_name(self): - return f"M{self.M}_ef{self.ef}_c{self.cluster_num}" - - def create_index_dir(self, dataset): - index_dir = os.path.join(os.getcwd(), "data", "indices", "ood") - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, 'final') - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, dataset.short_name()) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, self.index_name()) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - print(index_dir) - return index_dir - - def fit(self, dataset): - """ - Build the index for the data points given in dataset name. - """ - - ds = DATASETS[dataset]() - d = ds.d - - index_dir = self.create_index_dir(ds) - self.hnsw_fn = os.path.join(index_dir, self.hnsw_n) - self.ivf_fn = os.path.join(index_dir, self.ivf_n) - self.sq_fn = os.path.join(index_dir, self.sq_n) - - if hasattr(self, 'index'): - print('Index object exists already') - return - - start = time.time() - graphood.build_index(ds.get_dataset_fn(), self.hnsw_fn, self.ivf_fn, self.sq_fn, self.M, self.ef, self.cluster_num) - end = time.time() - print("index built in %.3f s" % (end - start)) - - print('Loading index..') - self.index = graphood.IndexGraphOOD(d, ds.nb, self.hnsw_fn, self.ivf_fn, self.sq_fn) - print('Index ready for search') - - def load_index(self, dataset): - """ - Load the index for dataset. Returns False if index - is not available, True otherwise. - - Checking the index usually involves the dataset name - and the index build paramters passed during construction. - """ - ds = DATASETS[dataset]() - d = ds.d - - index_dir = self.create_index_dir(ds) - if not (os.path.exists(index_dir)) and 'url' not in self._index_params: - return False - - self.hnsw_fn = os.path.join(index_dir, self.hnsw_n) - self.ivf_fn = os.path.join(index_dir, self.ivf_n) - self.sq_fn = os.path.join(index_dir, self.sq_n) - - for component in [self.hnsw_fn, self.ivf_fn, self.sq_fn]: - if not (os.path.exists(component)): - if 'url' in self._index_params: - index_file_source = self._index_params['url'] + '/' + self.index_name() + component - print(f"Downloading index in background. 
This can take a while.") - download_accelerated(index_file_source, component, quiet=True) - else: - return False - - print("Loading index") - - self.index = graphood.IndexGraphOOD(d, ds.nb, self.hnsw_fn, self.ivf_fn, self.sq_fn) - print ("Load index success.") - print(sys.getsizeof(self.index)) - return True - - def query(self, X, k): - """Carry out a batch query for k-NN of query set X.""" - nq, dim = (np.shape(X)) - self.res = self.index.batch_search(nq, X, k, self.ef, self.nprobe) - - - def set_query_arguments(self, query_args): - self._query_args = query_args - self.ef = query_args.get("ef") - self.nprobe = query_args.get("nprobe") - - def __str__(self): - return f'{self._query_args})' \ No newline at end of file diff --git a/neurips23/ood/sustech-ood/config.yaml b/neurips23/ood/sustech-ood/config.yaml deleted file mode 100644 index d0f7acca..00000000 --- a/neurips23/ood/sustech-ood/config.yaml +++ /dev/null @@ -1,32 +0,0 @@ -random-xs: - sustech-ood: - docker-tag: neurips23-ood-sustech-ood - module: neurips23.ood.sustech-ood.SUSTech-OOD - constructor: IndexGraphOOD - base-args: ["@metric"] - run-groups: - base: - args: | - [{"M":5, "ef":100, "cluster_num":1}] - query-args: | - [{"ef":20, "nprobe":5}] -text2image-10M: - sustech-ood: - docker-tag: neurips23-ood-sustech-ood - module: neurips23.ood.sustech-ood.SUSTech-OOD - constructor: IndexGraphOOD - base-args: ["@metric"] - run-groups: - base: - args: | - [{"M":20, "ef":1200, "cluster_num":1000}] - query-args: | - [{"ef":95, "nprobe":30}, - {"ef":115, "nprobe":30}, - {"ef":125, "nprobe":30}, - {"ef":130, "nprobe":30}, - {"ef":135, "nprobe":30}, - {"ef":140, "nprobe":30}, - {"ef":145, "nprobe":30}, - {"ef":155, "nprobe":30}, - {"ef":175, "nprobe":30}] diff --git a/neurips23/ood/vamana/Dockerfile b/neurips23/ood/vamana/Dockerfile deleted file mode 100644 index 61ed0566..00000000 --- a/neurips23/ood/vamana/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt install -y software-properties-common -RUN add-apt-repository -y ppa:git-core/ppa -RUN apt update -RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10 - - -ARG CACHEBUST=1 -RUN git clone -b ood_v2 https://github.com/cmuparlay/ParlayANN.git && cd ParlayANN && git submodule update --init --recursive && cd python && pip install pybind11 && bash compile.sh -# WORKDIR /home/app/ParlayANN -# RUN git submodule update --init --recursive -# WORKDIR /home/app/ParlayANN/python - -# RUN pip install pybind11 - -# RUN bash compile.sh - -ENV PYTHONPATH=$PYTHONPATH:/home/app/ParlayANN/python - -# ENV PARLAY_NUM_THREADS=8 - -WORKDIR /home/app \ No newline at end of file diff --git a/neurips23/ood/vamana/config.yaml b/neurips23/ood/vamana/config.yaml deleted file mode 100644 index 82663028..00000000 --- a/neurips23/ood/vamana/config.yaml +++ /dev/null @@ -1,54 +0,0 @@ -random-xs: - vamana: - docker-tag: neurips23-ood-vamana - module: neurips23.ood.vamana.vamana - constructor: vamana - base-args: ["@metric"] - run-groups: - base: - args: | - [{"R":30, "L":50, "alpha":1.2}] - query-args: | - [{"Ls":50, "T":8}] -text2image-10M: - vamana: - docker-tag: neurips23-ood-vamana - module: neurips23.ood.vamana.vamana - constructor: vamana - base-args: ["@metric"] - run-groups: - base: - args: | - [{"R":55, "L":500, "alpha":1.0, "two_pass":1, "use_query_data":1, "compress":1}] - query-args: | - [ - {"Ls":70, "T":8}, - 
{"Ls":80, "T":8}, - {"Ls":90, "T":8}, - {"Ls":95, "T":8}, - {"Ls":100, "T":8}, - {"Ls":105, "T":8}, - {"Ls":110, "T":8}, - {"Ls":120, "T":8}, - {"Ls":125, "T":8}, - {"Ls":150, "T":8}] - vamana-singlepass: - docker-tag: neurips23-ood-vamana - module: neurips23.ood.vamana.vamana - constructor: vamana - base-args: ["@metric"] - run-groups: - base: - args: | - [{"R":64, "L":500}] - query-args: | - [{"Ls":30, "T":8}, - {"Ls":50, "T":8}, - {"Ls":70, "T":8}, - {"Ls":100, "T":8}, - {"Ls":113, "T":8}, - {"Ls":125, "T":8}, - {"Ls":150, "T":8}, - {"Ls":175, "T":8}, - {"Ls":200, "T":8}] - diff --git a/neurips23/ood/vamana/vamana.py b/neurips23/ood/vamana/vamana.py deleted file mode 100644 index 9ed922e7..00000000 --- a/neurips23/ood/vamana/vamana.py +++ /dev/null @@ -1,150 +0,0 @@ -from __future__ import absolute_import -import psutil -import os -import time -import numpy as np -import wrapper as pann - -from neurips23.ood.base import BaseOODANN -from benchmark.datasets import DATASETS, download_accelerated, BASEDIR -from benchmark.dataset_io import download - -class vamana(BaseOODANN): - def __init__(self, metric, index_params): - self.name = "vamana" - if (index_params.get("R")==None): - print("Error: missing parameter R") - return - if (index_params.get("L")==None): - print("Error: missing parameter L") - return - self._index_params = index_params - self._metric = self.translate_dist_fn(metric) - - self.R = int(index_params.get("R")) - self.L = int(index_params.get("L")) - self.alpha = float(index_params.get("alpha", 1.0)) - self.two_pass = bool(index_params.get("two_pass", False)) - self.use_query_data = bool(index_params.get("use_query_data", False)) - self.compress_vectors = bool(index_params.get("compress", False)) - - def index_name(self): - return f"R{self.R}_L{self.L}_alpha{self.alpha}" - - def create_index_dir(self, dataset): - index_dir = os.path.join(os.getcwd(), "data", "indices", "ood") - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, 'vamana') - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, dataset.short_name()) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - index_dir = os.path.join(index_dir, self.index_name()) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - return os.path.join(index_dir, self.index_name()) - - def translate_dist_fn(self, metric): - if metric == 'euclidean': - return 'Euclidian' - elif metric == 'ip': - return 'mips' - else: - raise Exception('Invalid metric') - - def translate_dtype(self, dtype:str): - if dtype == 'float32': - return 'float' - else: - return dtype - - def prepare_sample_info(self, index_dir): - if(self.use_query_data): - #download the additional sample points for the ood index - self.sample_points_path = "data/text2image1B/query_sample_200000.fbin" - sample_qs_large_url = "https://storage.yandexcloud.net/yr-secret-share/ann-datasets-5ac0659e27/T2I/query.private.1M.fbin" - bytes_to_download = 8 + 200000*4*200 - download(sample_qs_large_url, self.sample_points_path, bytes_to_download) - header = np.memmap(self.sample_points_path, shape=2, dtype='uint32', mode="r+") - header[0] = 200000 - - self.secondary_index_dir = index_dir + ".secondary" - self.secondary_gt_dir = self.secondary_index_dir + ".gt" - else: - self.sample_points_path = "" - self.secondary_index_dir = "" - self.secondary_gt_dir = "" - - def prepare_compressed_info(self): - if(self.compress_vectors): - self.compressed_vectors_path = "data/text2image1B/compressed_10M.fbin" - else: - 
self.compressed_vectors_path = "" - - def fit(self, dataset): - """ - Build the index for the data points given in dataset name. - """ - ds = DATASETS[dataset]() - d = ds.d - - index_dir = self.create_index_dir(ds) - - self.prepare_sample_info(index_dir) - self.prepare_compressed_info() - - if hasattr(self, 'index'): - print("Index already exists") - return - else: - start = time.time() - # ds.ds_fn is the name of the dataset file but probably needs a prefix - pann.build_vamana_index(self._metric, self.translate_dtype(ds.dtype), ds.get_dataset_fn(), self.sample_points_path, - self.compressed_vectors_path, index_dir, self.secondary_index_dir, self.secondary_gt_dir, self.R, self.L, self.alpha, - self.two_pass) - end = time.time() - print("Indexing time: ", end - start) - print(f"Wrote index to {index_dir}") - - self.index = pann.load_vamana_index(self._metric, self.translate_dtype(ds.dtype), ds.get_dataset_fn(), self.compressed_vectors_path, - self.sample_points_path, index_dir, self.secondary_index_dir, self.secondary_gt_dir, ds.nb, d) - print("Index loaded") - - def query(self, X, k): - nq, d = X.shape - self.res, self.query_dists = self.index.batch_search(X, nq, k, self.Ls) - - def set_query_arguments(self, query_args): - self._query_args = query_args - self.Ls = 0 if query_args.get("Ls") is None else query_args.get("Ls") - self.search_threads = self._query_args.get("T", 16) - os.environ["PARLAY_NUM_THREADS"] = str(self.search_threads) - - def load_index(self, dataset): - ds = DATASETS[dataset]() - d = ds.d - - index_dir = self.create_index_dir(ds) - self.prepare_sample_info(index_dir) - self.prepare_compressed_info() - - print("Trying to load...") - - try: - file_size = os.path.getsize(index_dir) - print(f"File Size in Bytes is {file_size}") - except FileNotFoundError: - file_size = 0 - print("File not found.") - - if file_size != 0: - try: - self.index = pann.load_vamana_index(self._metric, self.translate_dtype(ds.dtype), ds.get_dataset_fn(), - self.compressed_vectors_path, self.sample_points_path, index_dir, - self.secondary_index_dir, self.secondary_gt_dir, ds.nb, d) - print("Index loaded") - return True - except: - print("Index not found") - return False - else: - print("Index not found") - return False \ No newline at end of file diff --git a/neurips23/streaming/puck/config.yaml b/neurips23/streaming/puck/config.yaml index 28a4e4a0..f46aabe4 100755 --- a/neurips23/streaming/puck/config.yaml +++ b/neurips23/streaming/puck/config.yaml @@ -1,3 +1,54 @@ +random-xs: + puck: + docker-tag: neurips23-streaming-puck + module: neurips23.streaming.puck.puck + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: | + [ + { "index_type": 1, "C":200, "F":200, "FN":8, "N":0} + ] + query-args: | + [ + {"radius_rate":1.00 ,"search_coarse_count":200, "filter_topk":1000 } + ] + +random-xs-clustered: + puck: + docker-tag: neurips23-streaming-puck + module: neurips23.streaming.puck.puck + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: | + [ + { "index_type": 1, "C":200, "F":200, "FN":8, "N":0} + ] + query-args: | + [ + {"radius_rate":1.00 ,"search_coarse_count":200, "filter_topk":1000 } + ] + +msturing-10M-clustered: + puck: + docker-tag: neurips23-streaming-puck + module: neurips23.streaming.puck.puck + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: | + [ + { "index_type": 1, "C":200, "F":200, "FN":8, "N":0} + ] + query-args: | + [ + {"radius_rate":1.00 ,"search_coarse_count":200, "filter_topk":1000 } + ] + msturing-30M-clustered: 
puck: docker-tag: neurips23-streaming-puck
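For reference, each `config.yaml` entry added above follows the same contract as the OOD configs earlier in this diff: the harness imports `module`, instantiates `constructor` with the `base-args` (here the dataset metric) plus one dict from `args`, then sweeps every dict in `query-args` through `set_query_arguments` before querying. A hedged sketch of that wiring, using the streaming puck entry as an example; the harness internals are paraphrased rather than quoted, and the real driver also handles Docker, dataset download, and result storage:

```python
# Sketch only: how a run-group entry is (roughly) consumed by the benchmark driver.
import importlib

entry = {
    "module": "neurips23.streaming.puck.puck",
    "constructor": "Puck",
    "base_args": ["euclidean"],          # "@metric" resolved for the chosen dataset
    "args": {"index_type": 1, "C": 200, "F": 200, "FN": 8, "N": 0},
    "query_args": [
        {"radius_rate": 1.00, "search_coarse_count": 200, "filter_topk": 1000},
    ],
}

def run_entry(entry, dataset="msturing-10M-clustered", k=10, queries=None):
    cls = getattr(importlib.import_module(entry["module"]), entry["constructor"])
    algo = cls(*entry["base_args"], entry["args"])     # Puck(metric, index_params)
    if not algo.load_index(dataset):
        algo.fit(dataset)
    for qa in entry["query_args"]:
        algo.set_query_arguments(qa)
        if queries is not None:
            algo.query(queries, k)
            print(qa, algo.get_results().shape)
```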