From 4742123458c63a1a1674d8bf3659b21b22161d92 Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 10:51:12 +0800 Subject: [PATCH 1/8] NIPS 2023 of Puck 1.ood 2.streaming 3.filter --- neurips23/filter/puck/Dockerfile | 40 +++++ neurips23/filter/puck/config.yaml | 57 +++++++ neurips23/filter/puck/puck.py | 222 +++++++++++++++++++++++++++ neurips23/ood/puck/Dockerfile | 31 ++++ neurips23/ood/puck/config.yaml | 33 ++++ neurips23/ood/puck/puck.py | 216 ++++++++++++++++++++++++++ neurips23/streaming/puck/Dockerfile | 31 ++++ neurips23/streaming/puck/config.yaml | 20 +++ neurips23/streaming/puck/puck.py | 141 +++++++++++++++++ 9 files changed, 791 insertions(+) create mode 100755 neurips23/filter/puck/Dockerfile create mode 100755 neurips23/filter/puck/config.yaml create mode 100755 neurips23/filter/puck/puck.py create mode 100755 neurips23/ood/puck/Dockerfile create mode 100755 neurips23/ood/puck/config.yaml create mode 100755 neurips23/ood/puck/puck.py create mode 100755 neurips23/streaming/puck/Dockerfile create mode 100755 neurips23/streaming/puck/config.yaml create mode 100755 neurips23/streaming/puck/puck.py diff --git a/neurips23/filter/puck/Dockerfile b/neurips23/filter/puck/Dockerfile new file mode 100755 index 00000000..1f5aa431 --- /dev/null +++ b/neurips23/filter/puck/Dockerfile @@ -0,0 +1,40 @@ +FROM neurips23 + +#COPY apt-source /etc/apt/sources.list +RUN apt update +RUN apt install -y software-properties-common +#RUN add-apt-repository -y ppa:git-core/ppa +RUN apt update --fix-missing +RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10 + +#swig +RUN apt-get update && apt-get install -y swig +RUN pip3 install pybind11 numpy +RUN cat /etc/ld.so.conf +RUN ls /etc/ld.so.conf.d/ +##cmake +RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh +# COPY cmake-3.22.0-linux-x86_64.sh . +RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake +ENV PATH /home/app/cmake/bin:$PATH + +#mkl +RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh +# COPY l_onemkl_p_2023.2.0.49497_offline.sh . +RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s + +RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf +RUN ldconfig +RUN touch /etc/profile.d/intel.sh +RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh +RUN . /etc/profile.d/intel.sh + +ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON -DGIT_URL=https://github.baidu-int.com " +RUN git config --global http.sslVerify false + +RUN git clone -b filter https://github.com/baidu/puck.git + +RUN mv puck puck_build && cd puck_build && . /etc/profile.d/intel.sh && python3 setup.py install +RUN python3 -c 'from puck import py_puck_api' + +WORKDIR /home/app diff --git a/neurips23/filter/puck/config.yaml b/neurips23/filter/puck/config.yaml new file mode 100755 index 00000000..c810afd8 --- /dev/null +++ b/neurips23/filter/puck/config.yaml @@ -0,0 +1,57 @@ +random-filter-s: + faiss: + docker-tag: neurips23-filter-faiss + module: neurips23.filter.faiss.faiss + constructor: FAISS + base-args: ["@metric"] + run-groups: + base: + args: | + [{"indexkey": "IVF1024,SQ8"}] + query-args: | + [{"nprobe": 1}, + {"nprobe":2}, + {"nprobe":4}] +random-s: + faiss: + docker-tag: neurips23-filter-faiss + module: neurips23.filter.faiss.faiss + constructor: FAISS + base-args: ["@metric"] + run-groups: + base: + args: | + [{"indexkey": "IVF1024,SQ8"}] + query-args: | + [{"nprobe": 1}, + {"nprobe":2}, + {"nprobe":4}] +yfcc-10M-unfiltered: + faiss: + docker-tag: neurips23-filter-faiss + module: neurips23.filter.faiss.faiss + constructor: FAISS + base-args: ["@metric"] + run-groups: + base: + args: | + [{"indexkey": "IVF16384,SQ8", "binarysig": true, "threads": 16}] + query-args: | + [{"nprobe": 1}, {"nprobe": 4}, {"nprobe": 16}, {"nprobe": 64}] +yfcc-10M: + puck: + docker-tag: neurips23-filter-puck + module: neurips23.filter.puck.puck + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: | + [{ "index_type": 3, "C":1000, "F":1000, "FN":16, "N":0}] + query-args: | + [ + {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":105}, + {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":110}, + {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":115}, + {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":120 } + ] \ No newline at end of file diff --git a/neurips23/filter/puck/puck.py b/neurips23/filter/puck/puck.py new file mode 100755 index 00000000..e13072e8 --- /dev/null +++ b/neurips23/filter/puck/puck.py @@ -0,0 +1,222 @@ +# !/usr/bin/env python3 +#-*- coding:utf-8 -*- +################################################################################ +# +# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved +# +################################################################################ +""" +@file: puck_inmem.py +@author: yinjie06(yinjie06@baidu.com) +@date: 2021-10-06 13:44 +@brief: +""" + +from neurips23.filter.base import BaseFilterANN +from benchmark.datasets import DATASETS +from benchmark.dataset_io import download_accelerated +from puck import py_puck_api +import multiprocessing.pool +from multiprocessing import Process +import os +import numpy as np +import time +import math +import struct + +CPU_LIMIT = multiprocessing.cpu_count() +swig_ptr = py_puck_api.swig_ptr + +class Puck(BaseFilterANN): + def __init__(self, metric, index_params): + self._index_params = index_params + self._metric = metric + self.indexkey = index_params.get("indexkey", "NA") + + self.index = py_puck_api.PySearcher() + self.topk = 10 + self.n = 0 + self.build_memory_usage = -1 + print("after init") + + def init_dataset_key(self, dataset): + + #更新gflags + ds = DATASETS[dataset]() + d = ds.d + #特征纬度 + py_puck_api.update_gflag('feature_dim', "%d"%(d)) + + #根据距离计算方式调整 + self.whether_norm = False + py_puck_api.update_gflag('whether_norm', 'false') + self.ip2cos = 0 + if ds.distance() == "angular": + self.whether_norm = True + py_puck_api.update_gflag('whether_norm', 'true') + elif ds.distance() == "ip": + self.ip2cos = 1 + py_puck_api.update_gflag('ip2cos', '%d'%(self.ip2cos)) + + self.init_indexkey() + #测试 + py_puck_api.update_gflag('kmeans_iterations_count',"1") + + def check_feature(self, dataset): + self.init_dataset_key(dataset) + ds = DATASETS[dataset]() + d = ds.d + #索引存储目录 + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + if not os.path.exists(self.index_name(dataset)): + index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) + os.makedirs(index_dir, mode=0o777, exist_ok=True) + + meta_indices_file_name = self.index_name(dataset) + "/indices.dat" + meta_indices_file = open(meta_indices_file_name, 'wb') + meta_indptr_file_name = self.index_name(dataset) + "/indptr.dat" + meta_indptr_file = open(meta_indptr_file_name, 'wb') + + meta_to_write = ds.get_dataset_metadata() + buf = struct.pack('i', len(meta_to_write.indices)) + meta_indices_file.write(buf) + buf = struct.pack('i' * len(meta_to_write.indices), *(meta_to_write.indices)) + meta_indices_file.write(buf) + meta_indices_file.close() + + buf = struct.pack('i', len(meta_to_write.indptr)) + meta_indptr_file.write(buf) + buf = struct.pack('i' * len(meta_to_write.indptr), *(meta_to_write.indptr)) + meta_indptr_file.write(buf) + meta_indptr_file.close() + + #训练用目录 + if not os.path.exists('mid-data'): + os.mkdir('mid-data') + #格式化文件数据,将来可不要,后续可修改训练接口 + all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') + + add_part=100000 + i0 = 0 + t0 = time.time() + for xblock in ds.get_dataset_iterator(bs=add_part): + i1 = i0 + len(xblock) + i0 = i1 + for x in xblock: + feat = x.astype(np.float32) + if(self.whether_norm): + feat = feat / np.sqrt(np.dot(feat, feat)) + elif(self.ip2cos > 0): + norm = np.dot(feat, feat) + if norm > 1.0: + print("not support, please contact yinjie06") + return False + feat = np.append(feat, math.sqrt(1.0 - norm)) + + buf = struct.pack('i', len(feat)) + all_feature_file.write(buf) + buf = struct.pack('f' * len(feat), *feat) + all_feature_file.write(buf) + print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) + all_feature_file.close() + + def fit(self, dataset): + print("start fit") + #self.check_feature(dataset) + p = Process(target=self.check_feature, args=(dataset,)) + p.start() + p.join() + self.init_dataset_key(dataset) + ds = DATASETS[dataset]() + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + #训练数据采样 + py_puck_api.update_gflag('train_points_count', "5000000") + py_puck_api.update_gflag('pq_train_points_count', "500000") + print(self.index_name(dataset)) + print("start to train") + self.index.build(ds.nb) + self.load_index(dataset) + + def init_indexkey(self): + #一级聚类中心 + if "C" in self._index_params: + py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) + self.indexkey = "C%s"%(self._index_params['C']) + #二级聚类中心 + if "F" in self._index_params: + py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) + self.indexkey += "_F%s"%(self._index_params['F']) + #filter + if "FN" in self._index_params: + py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) + self.indexkey += "_FN%s"%(self._index_params['FN']) + #量化 + if "N" in self._index_params: + if int(self._index_params['N']) > 1: + py_puck_api.update_gflag('whether_pq', 'true') + py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) + self.indexkey += "_N%s"%(self._index_params['N']) + else: + py_puck_api.update_gflag('whether_pq', 'false') + self.indexkey += "_Flat" + + if "tinker_neighborhood" in self._index_params: + py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) + self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) + if "tinker_construction" in self._index_params: + py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) + self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) + if "index_type" in self._index_params: + py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) + + def index_name(self, name): + return f"data/{name}.{self.indexkey}.puckindex" + + def index_tag_name(self, name): + return f"{name}.{self.indexkey}.puckindex" + + def load_index(self, dataset): + print("Loading index") + self.init_indexkey() + ds = DATASETS[dataset]() + self.topk = ds.default_count() + print("self.topk=%d"%self.topk) + py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + py_puck_api.update_gflag('context_initial_pool_size', "%d"%(CPU_LIMIT)) + py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) + print(self.index_name(dataset)) + ret = self.index.init() + print("ret = ",ret) + if ret != 0: + return False + self.index.show() + self.n = ds.nq + return True + + def set_query_arguments(self, query_args): + for key, value in query_args.items(): + py_puck_api.update_gflag(key, "%s"%value) + #query_args_list = query_args.strip().split(',') + #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) + self.index.init() + #topk是作为检索参数传入puck + self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) + self.qas = query_args + + def query(self, X, topK): + + n, d = X.shape + self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) + #print(self.res[0]) + print(self.res[1]) + def get_results(self): + return self.res[1] + def filtered_query(self, X, filter, k): + n, d = X.shape + x_float = X.astype(np.float32) + meta_q = filter + self.index.filter_search(n, swig_ptr(x_float), k, swig_ptr(self.res[0]), swig_ptr(self.res[1]), swig_ptr(filter.indptr), swig_ptr(filter.indices)) + + def __str__(self): + return f'Puck{self.indexkey, self.qas}' diff --git a/neurips23/ood/puck/Dockerfile b/neurips23/ood/puck/Dockerfile new file mode 100755 index 00000000..ff088505 --- /dev/null +++ b/neurips23/ood/puck/Dockerfile @@ -0,0 +1,31 @@ +FROM neurips23 + +RUN apt update +RUN apt-get update +RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip +#swig +RUN apt-get update && apt-get install -y swig cmake +RUN pip3 install pybind11 numpy +RUN cat /etc/ld.so.conf +RUN ls /etc/ld.so.conf.d/ +##cmake +RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh +RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake +ENV PATH /home/app/cmake/bin:$PATH + +#mkl +RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh +RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s + +RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf +RUN ldconfig +RUN touch /etc/profile.d/intel.sh +RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh +RUN . /etc/profile.d/intel.sh + +ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" +#RUN git config --global http.sslVerify false + +RUN git clone -b ood https://github.com/baidu/puck.git +RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install +RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/ood/puck/config.yaml b/neurips23/ood/puck/config.yaml new file mode 100755 index 00000000..055c8ac0 --- /dev/null +++ b/neurips23/ood/puck/config.yaml @@ -0,0 +1,33 @@ +random-xs: + puck: + docker-tag: neurips23-ood-puck + module: neurips23.ood.puck.puck_inmem + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] + query-args: | + [ + {"search_coarse_count":50, "tinker_search_range": 100}, + {"search_coarse_count":50, "tinker_search_range": 200}, + {"search_coarse_count":50, "tinker_search_range": 300} + ] + + +text2image-10M: + puck: + docker-tag: neurips23-ood-puck + module: neurips23.ood.puck.puck_inmem + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] + query-args: | + [ + {"search_coarse_count":10, "tinker_search_range": 160}, + {"search_coarse_count":10, "tinker_search_range": 165}, + {"search_coarse_count":10, "tinker_search_range": 170}, + {"search_coarse_count":10, "tinker_search_range": 175} + ] \ No newline at end of file diff --git a/neurips23/ood/puck/puck.py b/neurips23/ood/puck/puck.py new file mode 100755 index 00000000..6c407800 --- /dev/null +++ b/neurips23/ood/puck/puck.py @@ -0,0 +1,216 @@ +# !/usr/bin/env python3 +#-*- coding:utf-8 -*- +################################################################################ +# +# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved +# +################################################################################ +""" +@file: puck_inmem.py +@author: yinjie06(yinjie06@baidu.com) +@date: 2021-10-06 13:44 +@brief: +""" +import ctypes + +from neurips23.ood.base import BaseOODANN +from benchmark.datasets import DATASETS +from benchmark.dataset_io import download_accelerated,xbin_mmap,xbin_write +from puck import py_puck_api +import multiprocessing.pool +import multiprocessing +import gc +import os +import numpy as np +import time +import math +import struct + +CPU_LIMIT = multiprocessing.cpu_count() +swig_ptr = py_puck_api.swig_ptr +class Puck(BaseOODANN): + def __init__(self, metric, index_params): + self._index_params = index_params + self._metric = metric + self.indexkey = index_params.get("indexkey", "NA") + + self.index = py_puck_api.PySearcher() + self.topk = 10 + self.n = 0 + self.build_memory_usage = -1 + + def track(self): + #T1 means in memory + return "T1 for 10M & 100M" + + + def check_feature(self, dataset): + #更新gflags + ds = DATASETS[dataset]() + d = ds.d + #特征纬度 + py_puck_api.update_gflag('feature_dim', "%d"%(d)) + + #根据距离计算方式调整 + whether_norm = False + py_puck_api.update_gflag('whether_norm', 'false') + ip2cos = 0 + if ds.distance() == "angular": + whether_norm = True + py_puck_api.update_gflag('whether_norm', 'true') + elif ds.distance() == "ip": + ip2cos = 1 + py_puck_api.update_gflag('ip2cos', '%d'%(ip2cos)) + + self.init_indexkey() + + #测试 + py_puck_api.update_gflag('kmeans_iterations_count',"1") + + #索引存储目录 + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + if not os.path.exists(self.index_name(dataset)): + index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) + os.makedirs(index_dir, mode=0o777, exist_ok=True) + + #训练用目录 + if not os.path.exists('mid-data'): + os.mkdir('mid-data') + + #格式化文件数据,将来可不要,后续可修改训练接口 + all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') + + add_part=100000 + i0 = 0 + t0 = time.time() + for xblock in ds.get_dataset_iterator(bs=add_part): + i1 = i0 + len(xblock) + i0 = i1 + for x in xblock: + feat = x + if(whether_norm): + feat = feat / np.sqrt(np.dot(feat, feat)) + elif(ip2cos > 0): + norm = np.dot(feat, feat) + if norm > 1.0: + print("not support, please contact yinjie06") + return False + feat = np.append(feat, math.sqrt(1.0 - norm)) + + buf = struct.pack('i', len(feat)) + all_feature_file.write(buf) + buf = struct.pack('f' * len(feat), *feat) + all_feature_file.write(buf) + print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) + all_feature_file.close() + + # print(" init help query ") + # filename = os.path.join(ds.basedir, ds.qs_fn) + # read_x = xbin_mmap(filename, dtype=ds.dtype) + + # write_x = np.append(read_x, np.zeros((read_x.shape[0], 1)), axis=1) + # print("help query shape nrows = %d , ncols = %d "%(write_x.shape[0],write_x.shape[1])) + # xbin_write(write_x,"%s/help_query.feat.bin"%(self.index_name(dataset))) + + return True + + def fit(self, dataset): + self.check_feature(dataset) + ds = DATASETS[dataset]() + #训练数据采样 + py_puck_api.update_gflag('train_points_count', "5000000") + py_puck_api.update_gflag('pq_train_points_count', "500000") + print(self.index_name(dataset)) + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + + self.index.build(ds.nb) + self.load_index(dataset) + + #index = py_puck_api.PySearcher() + #p = multiprocessing.Process(group=None,target=index.build,args=(ds.nb,)) + #self.index.build(ds.nb) + #p.start() + #p.join() + + def init_indexkey(self): + #一级聚类中心 + if "C" in self._index_params: + py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) + self.indexkey = "C%s"%(self._index_params['C']) + #二级聚类中心 + if "F" in self._index_params: + py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) + self.indexkey += "_F%s"%(self._index_params['F']) + #filter + if "FN" in self._index_params: + py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) + self.indexkey += "_FN%s"%(self._index_params['FN']) + #量化 + if "N" in self._index_params: + if int(self._index_params['N']) > 1: + py_puck_api.update_gflag('whether_pq', 'true') + py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) + self.indexkey += "_N%s"%(self._index_params['N']) + else: + py_puck_api.update_gflag('whether_pq', 'false') + self.indexkey += "_Flat" + + if "tinker_neighborhood" in self._index_params: + py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) + self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) + if "tinker_construction" in self._index_params: + py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) + self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) + if "index_type" in self._index_params: + py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) + if "radius_rate" in self._index_params: + py_puck_api.update_gflag('radius_rate', "%f"%(self._index_params['radius_rate'])) + self.indexkey += "_RadiusRate%s"%(self._index_params['radius_rate']) + + + def index_name(self, name): + return f"data/{name}.{self.indexkey}.puckindex" + + def index_tag_name(self, name): + return f"{name}.{self.indexkey}.puckindex" + + def load_index(self, dataset): + print("Loading index") + self.init_indexkey() + ds = DATASETS[dataset]() + self.topk = ds.default_count() + print("self.topk=%d"%self.topk) + py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + py_puck_api.update_gflag('context_initial_pool_size', "%d"%(2 * CPU_LIMIT)) + py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) + print(self.index_name(dataset)) + ret = self.index.init() + print("ret = ",ret) + if ret != 0: + return False + self.index.show() + self.n = ds.nq + return True + + def set_query_arguments(self, query_args): + for key, value in query_args.items(): + py_puck_api.update_gflag(key, "%s"%value) + #query_args_list = query_args.strip().split(',') + #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) + self.index.init() + #topk是作为检索参数传入puck + self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) + self.qas = query_args + + def query(self, X, topK): + n, d = X.shape + + self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) + #print(self.res[0]) + # print(self.res[1]) + def get_results(self): + return self.res[1] + + def __str__(self): + return f'Puck{self.indexkey, self.qas}' \ No newline at end of file diff --git a/neurips23/streaming/puck/Dockerfile b/neurips23/streaming/puck/Dockerfile new file mode 100755 index 00000000..0af82acc --- /dev/null +++ b/neurips23/streaming/puck/Dockerfile @@ -0,0 +1,31 @@ +FROM neurips23 + +RUN apt update +RUN apt-get update +RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip +#swig +RUN apt-get update && apt-get install -y swig cmake +RUN pip3 install pybind11 numpy +RUN cat /etc/ld.so.conf +RUN ls /etc/ld.so.conf.d/ +##cmake +RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh +RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake +ENV PATH /home/app/cmake/bin:$PATH + +#mkl +RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh +RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s + +RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf +RUN ldconfig +RUN touch /etc/profile.d/intel.sh +RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh +RUN . /etc/profile.d/intel.sh + +ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" +#RUN git config --global http.sslVerify false + +RUN git clone -b streaming https://github.com/baidu/puck.git +RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install +RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/streaming/puck/config.yaml b/neurips23/streaming/puck/config.yaml new file mode 100755 index 00000000..10e5989d --- /dev/null +++ b/neurips23/streaming/puck/config.yaml @@ -0,0 +1,20 @@ +msturing-30M-clustered: + puck: + docker-tag: neurips23-streaming-puck + module: neurips23.streaming.puck.puck + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: | + [ + { "index_type": 1, "C":100, "F":100, "FN":8, "N":0}, + { "index_type": 1, "C":200, "F":200, "FN":8, "N":0}, + { "index_type": 1, "C":200, "F":500, "FN":8, "N":0} + ] + query-args: | + [ + {"radius_rate":1.00 ,"search_coarse_count":200, "filter_topk":1000 } + ] + + diff --git a/neurips23/streaming/puck/puck.py b/neurips23/streaming/puck/puck.py new file mode 100755 index 00000000..ccd4e9f6 --- /dev/null +++ b/neurips23/streaming/puck/puck.py @@ -0,0 +1,141 @@ +# !/usr/bin/env python3 +#-*- coding:utf-8 -*- +################################################################################ +# +# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved +# +################################################################################ +""" +@file: puck_inmem.py +@author: yinjie06(yinjie06@baidu.com) +@date: 2021-10-06 13:44 +@brief: +""" +import ctypes + +from neurips23.streaming.base import BaseStreamingANN +from benchmark.datasets import DATASETS +from benchmark.dataset_io import download_accelerated +from puck import py_puck_api +import multiprocessing.pool +import multiprocessing +import gc +import os +import numpy as np +import time +import math +import struct +import sys +print(sys.version) +CPU_LIMIT = 16 #multiprocessing.cpu_count() +swig_ptr = py_puck_api.swig_ptr +class Puck(BaseStreamingANN): + def __init__(self, metric, index_params): + self._index_params = index_params + self._metric = metric + self.indexkey = index_params.get("indexkey", "NA") + + self.index = py_puck_api.PySearcher() + self.topk = 10 + self.n = 0 + self.build_memory_usage = -1 + + def track(self): + #T1 means in memory + return "T1 for 10M & 100M" + + + def init_indexkey(self): + #一级聚类中心 + if "C" in self._index_params: + py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) + self.indexkey = "C%s"%(self._index_params['C']) + #二级聚类中心 + if "F" in self._index_params: + py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) + self.indexkey += "_F%s"%(self._index_params['F']) + #filter + if "FN" in self._index_params: + py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) + self.indexkey += "_FN%s"%(self._index_params['FN']) + #量化 + if "N" in self._index_params: + if int(self._index_params['N']) > 1: + py_puck_api.update_gflag('whether_pq', 'true') + py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) + self.indexkey += "_N%s"%(self._index_params['N']) + else: + py_puck_api.update_gflag('whether_pq', 'false') + self.indexkey += "_Flat" + + if "tinker_neighborhood" in self._index_params: + py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) + self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) + if "tinker_construction" in self._index_params: + py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) + self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) + if "index_type" in self._index_params: + py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) + if "radius_rate" in self._index_params: + py_puck_api.update_gflag('radius_rate', "%f"%(self._index_params['radius_rate'])) + self.indexkey += "_RadiusRate%s"%(self._index_params['radius_rate']) + + + def index_name(self, name): + return f"data/{name}.{self.indexkey}.puckindex" + + def index_tag_name(self, name): + return f"{name}.{self.indexkey}.puckindex" + + def setup(self, dtype, max_pts, ndim): + print("setup") + py_puck_api.update_gflag('max_point_stored', '%d'%(max_pts)) + py_puck_api.update_gflag('whether_norm', 'false') + py_puck_api.update_gflag('max_point_stored', "%d"%(max_pts)) + py_puck_api.update_gflag('feature_dim', "%d"%(ndim)) + self.init_indexkey() + #索引存储目录 + dataset = "streaming" + py_puck_api.update_gflag('kmeans_iterations_count',"1") + py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) + py_puck_api.update_gflag('context_initial_pool_size', "%d"%(2 * CPU_LIMIT)) + print(self.index_name(dataset)) + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + + if not os.path.exists(self.index_name(dataset)): + index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) + os.makedirs(index_dir, mode=0o777, exist_ok=True) + + #训练用目录 + if not os.path.exists('mid-data'): + os.mkdir('mid-data') + + print(1) + self.index.init() + + def set_query_arguments(self, query_args): + for key, value in query_args.items(): + py_puck_api.update_gflag(key, "%s"%value) + #query_args_list = query_args.strip().split(',') + #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) + py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) + self.index.init() + #topk是作为检索参数传入puck + self.qas = query_args + def insert(self, X, ids): + n, d = X.shape + self.index.batch_add(n, d, swig_ptr(X), swig_ptr(ids)) + def delete(self, ids): + n = len(ids) + self.index.batch_delete(n, swig_ptr(ids)) + def query(self, X, topK): + n, d = X.shape + self.res = (np.empty((n, topK), dtype='float32'), np.empty((n, topK), dtype='uint32')) + self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) + #print(self.res[0]) + #print(self.res[1]) + def get_results(self): + return self.res[1] + + def __str__(self): + return f'Puck{self.indexkey, self.qas}' From 18dedd64abb7a5f23fe0e1a4ba33e43cbb7783c7 Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 11:12:54 +0800 Subject: [PATCH 2/8] update update --- neurips23/filter/puck/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neurips23/filter/puck/Dockerfile b/neurips23/filter/puck/Dockerfile index 1f5aa431..bfe8c8b9 100755 --- a/neurips23/filter/puck/Dockerfile +++ b/neurips23/filter/puck/Dockerfile @@ -29,7 +29,7 @@ RUN touch /etc/profile.d/intel.sh RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh RUN . /etc/profile.d/intel.sh -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON -DGIT_URL=https://github.baidu-int.com " +ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON " RUN git config --global http.sslVerify false RUN git clone -b filter https://github.com/baidu/puck.git From 4a461c993a54b5f208904a649a030dca91d22e7a Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 11:35:04 +0800 Subject: [PATCH 3/8] update --- neurips23/ood/puck/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neurips23/ood/puck/config.yaml b/neurips23/ood/puck/config.yaml index 055c8ac0..445b7d6b 100755 --- a/neurips23/ood/puck/config.yaml +++ b/neurips23/ood/puck/config.yaml @@ -1,7 +1,7 @@ random-xs: puck: docker-tag: neurips23-ood-puck - module: neurips23.ood.puck.puck_inmem + module: neurips23.ood.puck.puck constructor: Puck base-args: ["@metric"] run-groups: @@ -18,7 +18,7 @@ random-xs: text2image-10M: puck: docker-tag: neurips23-ood-puck - module: neurips23.ood.puck.puck_inmem + module: neurips23.ood.puck.puck constructor: Puck base-args: ["@metric"] run-groups: From 4535a671f0e2e4582656c154f66943c6dcf986bb Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 14:15:13 +0800 Subject: [PATCH 4/8] update --- neurips23/filter/puck/Dockerfile | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/neurips23/filter/puck/Dockerfile b/neurips23/filter/puck/Dockerfile index bfe8c8b9..d2db5e83 100755 --- a/neurips23/filter/puck/Dockerfile +++ b/neurips23/filter/puck/Dockerfile @@ -1,26 +1,20 @@ FROM neurips23 -#COPY apt-source /etc/apt/sources.list RUN apt update -RUN apt install -y software-properties-common -#RUN add-apt-repository -y ppa:git-core/ppa -RUN apt update --fix-missing -RUN DEBIAN_FRONTEND=noninteractive apt install -y git make cmake g++ libaio-dev libgoogle-perftools-dev libunwind-dev clang-format libboost-dev libboost-program-options-dev libmkl-full-dev libcpprest-dev python3.10 - +RUN apt-get update +RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip #swig -RUN apt-get update && apt-get install -y swig -RUN pip3 install pybind11 numpy +RUN apt-get update && apt-get install -y swig cmake +RUN pip3 install pybind11 numpy RUN cat /etc/ld.so.conf RUN ls /etc/ld.so.conf.d/ ##cmake RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh -# COPY cmake-3.22.0-linux-x86_64.sh . RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake ENV PATH /home/app/cmake/bin:$PATH #mkl RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh -# COPY l_onemkl_p_2023.2.0.49497_offline.sh . RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf @@ -29,12 +23,9 @@ RUN touch /etc/profile.d/intel.sh RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh RUN . /etc/profile.d/intel.sh -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON " -RUN git config --global http.sslVerify false - -RUN git clone -b filter https://github.com/baidu/puck.git +ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" +#RUN git config --global http.sslVerify false -RUN mv puck puck_build && cd puck_build && . /etc/profile.d/intel.sh && python3 setup.py install +RUN git clone -b filter https://github.com/baidu/puck.git +RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install RUN python3 -c 'from puck import py_puck_api' - -WORKDIR /home/app From 464f49c2d20b41a70cde341dff5c3a8b5e3cc51d Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 14:57:42 +0800 Subject: [PATCH 5/8] Update config.yaml --- neurips23/streaming/puck/config.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/neurips23/streaming/puck/config.yaml b/neurips23/streaming/puck/config.yaml index 10e5989d..28a4e4a0 100755 --- a/neurips23/streaming/puck/config.yaml +++ b/neurips23/streaming/puck/config.yaml @@ -8,9 +8,7 @@ msturing-30M-clustered: base: args: | [ - { "index_type": 1, "C":100, "F":100, "FN":8, "N":0}, - { "index_type": 1, "C":200, "F":200, "FN":8, "N":0}, - { "index_type": 1, "C":200, "F":500, "FN":8, "N":0} + { "index_type": 1, "C":200, "F":200, "FN":8, "N":0} ] query-args: | [ From d772f74979e9b9c180283234d262fc1c3cc0088a Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 15:03:27 +0800 Subject: [PATCH 6/8] add puck-fizz --- neurips23/ood/puck-fizz/Dockerfile | 35 +++++ neurips23/ood/puck-fizz/config.yaml | 33 +++++ neurips23/ood/puck-fizz/puck.py | 217 ++++++++++++++++++++++++++++ 3 files changed, 285 insertions(+) create mode 100755 neurips23/ood/puck-fizz/Dockerfile create mode 100755 neurips23/ood/puck-fizz/config.yaml create mode 100755 neurips23/ood/puck-fizz/puck.py diff --git a/neurips23/ood/puck-fizz/Dockerfile b/neurips23/ood/puck-fizz/Dockerfile new file mode 100755 index 00000000..1695b160 --- /dev/null +++ b/neurips23/ood/puck-fizz/Dockerfile @@ -0,0 +1,35 @@ +FROM neurips23 + +RUN apt update +RUN apt-get update +RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip +#swig +RUN apt-get update && apt-get install -y swig cmake +RUN pip3 install pybind11 numpy +RUN cat /etc/ld.so.conf +RUN ls /etc/ld.so.conf.d/ +##cmake +# COPY cmake-3.22.0-linux-x86_64.sh . +RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh +RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake +ENV PATH /home/app/cmake/bin:$PATH + +#mkl +# COPY l_onemkl_p_2023.2.0.49497_offline.sh . +RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh +RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s + +RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf +RUN ldconfig +RUN touch /etc/profile.d/intel.sh +RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh +RUN . /etc/profile.d/intel.sh + +ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" +#RUN git config --global http.sslVerify false + +RUN git clone -b ood-try https://github.com/baidu/puck.git +# COPY puck-ood-feature.tar.gz . +# RUN tar zxvf puck-ood-feature.tar.gz +RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install +RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/ood/puck-fizz/config.yaml b/neurips23/ood/puck-fizz/config.yaml new file mode 100755 index 00000000..018d49d0 --- /dev/null +++ b/neurips23/ood/puck-fizz/config.yaml @@ -0,0 +1,33 @@ +random-xs: + puck: + docker-tag: neurips23-ood-puck-fizz + module: neurips23.ood.puck-fizz.puck + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] + query-args: | + [ + {"search_coarse_count":50, "tinker_search_range": 100}, + {"search_coarse_count":50, "tinker_search_range": 200}, + {"search_coarse_count":50, "tinker_search_range": 300} + ] + + +text2image-10M: + puck-fizz: + docker-tag: neurips23-ood-puck-fizz + module: neurips23.ood.puck-fizz.puck + constructor: Puck + base-args: ["@metric"] + run-groups: + base: + args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] + query-args: | + [ + {"search_coarse_count":10, "tinker_search_range": 160}, + {"search_coarse_count":10, "tinker_search_range": 170}, + {"search_coarse_count":10, "tinker_search_range": 180}, + {"search_coarse_count":10, "tinker_search_range": 190} + ] \ No newline at end of file diff --git a/neurips23/ood/puck-fizz/puck.py b/neurips23/ood/puck-fizz/puck.py new file mode 100755 index 00000000..347075bd --- /dev/null +++ b/neurips23/ood/puck-fizz/puck.py @@ -0,0 +1,217 @@ +# !/usr/bin/env python3 +#-*- coding:utf-8 -*- +################################################################################ +# +# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved +# +################################################################################ +""" +@file: puck_inmem.py +@author: heaoxiang(heaoxiang@baidu.com) +@date: 2023-10-29 13:44 +@brief: +""" +import ctypes + +from neurips23.ood.base import BaseOODANN +from benchmark.datasets import DATASETS +from benchmark.dataset_io import download_accelerated,xbin_mmap,xbin_write +# from neurips23.ood.puck.puck_lib import py_puck_api +from puck import py_puck_api +import multiprocessing.pool +import multiprocessing +import gc +import os +import numpy as np +import time +import math +import struct + +CPU_LIMIT = multiprocessing.cpu_count() +swig_ptr = py_puck_api.swig_ptr +class Puck(BaseOODANN): + def __init__(self, metric, index_params): + self._index_params = index_params + self._metric = metric + self.indexkey = index_params.get("indexkey", "NA") + + self.index = py_puck_api.PySearcher() + self.topk = 10 + self.n = 0 + self.build_memory_usage = -1 + + def track(self): + #T1 means in memory + return "T1 for 10M & 100M" + + + def check_feature(self, dataset): + #更新gflags + ds = DATASETS[dataset]() + d = ds.d + #特征纬度 + py_puck_api.update_gflag('feature_dim', "%d"%(d)) + + #根据距离计算方式调整 + whether_norm = False + py_puck_api.update_gflag('whether_norm', 'false') + ip2cos = 0 + if ds.distance() == "angular": + whether_norm = True + py_puck_api.update_gflag('whether_norm', 'true') + elif ds.distance() == "ip": + ip2cos = 1 + py_puck_api.update_gflag('ip2cos', '%d'%(ip2cos)) + + self.init_indexkey() + + #测试 + py_puck_api.update_gflag('kmeans_iterations_count',"1") + + #索引存储目录 + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + if not os.path.exists(self.index_name(dataset)): + index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) + os.makedirs(index_dir, mode=0o777, exist_ok=True) + + #训练用目录 + if not os.path.exists('mid-data'): + os.mkdir('mid-data') + + #格式化文件数据,将来可不要,后续可修改训练接口 + all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') + + add_part=100000 + i0 = 0 + t0 = time.time() + for xblock in ds.get_dataset_iterator(bs=add_part): + i1 = i0 + len(xblock) + i0 = i1 + for x in xblock: + feat = x + if(whether_norm): + feat = feat / np.sqrt(np.dot(feat, feat)) + elif(ip2cos > 0): + norm = np.dot(feat, feat) + if norm > 1.0: + print("not support, please contact yinjie06") + return False + feat = np.append(feat, math.sqrt(1.0 - norm)) + + buf = struct.pack('i', len(feat)) + all_feature_file.write(buf) + buf = struct.pack('f' * len(feat), *feat) + all_feature_file.write(buf) + print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) + all_feature_file.close() + + print(" init help query ") + filename = os.path.join(ds.basedir, ds.qs_fn) + if os.path.exists(filename): + read_x = xbin_mmap(filename, dtype=ds.dtype) + write_x = np.append(read_x, np.zeros((read_x.shape[0], 1)), axis=1) + print("help query shape nrows = %d , ncols = %d "%(write_x.shape[0],write_x.shape[1])) + xbin_write(write_x,"%s/help_query.feat.bin"%(self.index_name(dataset))) + + return True + + def fit(self, dataset): + self.check_feature(dataset) + ds = DATASETS[dataset]() + #训练数据采样 + py_puck_api.update_gflag('train_points_count', "5000000") + py_puck_api.update_gflag('pq_train_points_count', "500000") + print(self.index_name(dataset)) + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + + self.index.build(ds.nb) + self.load_index(dataset) + + #index = py_puck_api.PySearcher() + #p = multiprocessing.Process(group=None,target=index.build,args=(ds.nb,)) + #self.index.build(ds.nb) + #p.start() + #p.join() + + def init_indexkey(self): + #一级聚类中心 + if "C" in self._index_params: + py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) + self.indexkey = "C%s"%(self._index_params['C']) + #二级聚类中心 + if "F" in self._index_params: + py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) + self.indexkey += "_F%s"%(self._index_params['F']) + #filter + if "FN" in self._index_params: + py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) + self.indexkey += "_FN%s"%(self._index_params['FN']) + #量化 + if "N" in self._index_params: + if int(self._index_params['N']) > 1: + py_puck_api.update_gflag('whether_pq', 'true') + py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) + self.indexkey += "_N%s"%(self._index_params['N']) + else: + py_puck_api.update_gflag('whether_pq', 'false') + self.indexkey += "_Flat" + + if "tinker_neighborhood" in self._index_params: + py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) + self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) + if "tinker_construction" in self._index_params: + py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) + self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) + if "index_type" in self._index_params: + py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) + if "radius_rate" in self._index_params: + py_puck_api.update_gflag('radius_rate', "%f"%(self._index_params['radius_rate'])) + self.indexkey += "_RadiusRate%s"%(self._index_params['radius_rate']) + + + def index_name(self, name): + return f"data/{name}.{self.indexkey}.puckindex" + + def index_tag_name(self, name): + return f"{name}.{self.indexkey}.puckindex" + + def load_index(self, dataset): + print("Loading index") + self.init_indexkey() + ds = DATASETS[dataset]() + self.topk = ds.default_count() + print("self.topk=%d"%self.topk) + py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) + py_puck_api.update_gflag('index_path', self.index_name(dataset)) + py_puck_api.update_gflag('context_initial_pool_size', "%d"%(2 * CPU_LIMIT)) + py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) + print(self.index_name(dataset)) + ret = self.index.init() + print("ret = ",ret) + if ret != 0: + return False + self.index.show() + self.n = ds.nq + return True + + def set_query_arguments(self, query_args): + for key, value in query_args.items(): + py_puck_api.update_gflag(key, "%s"%value) + #query_args_list = query_args.strip().split(',') + #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) + self.index.init() + #topk是作为检索参数传入puck + self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) + self.qas = query_args + + def query(self, X, topK): + n, d = X.shape + + self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) + #print(self.res[0]) + # print(self.res[1]) + def get_results(self): + return self.res[1] + + def __str__(self): + return f'Puck{self.indexkey, self.qas}' \ No newline at end of file From e5527fbdd1a6d58263f0d55702af7acb20d9a6a6 Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 15:42:33 +0800 Subject: [PATCH 7/8] update params --- neurips23/filter/puck/config.yaml | 3 ++- neurips23/ood/puck/config.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/neurips23/filter/puck/config.yaml b/neurips23/filter/puck/config.yaml index c810afd8..f71d438a 100755 --- a/neurips23/filter/puck/config.yaml +++ b/neurips23/filter/puck/config.yaml @@ -50,8 +50,9 @@ yfcc-10M: [{ "index_type": 3, "C":1000, "F":1000, "FN":16, "N":0}] query-args: | [ + {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":135}, {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":105}, {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":110}, {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":115}, {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":120 } - ] \ No newline at end of file + ] diff --git a/neurips23/ood/puck/config.yaml b/neurips23/ood/puck/config.yaml index 445b7d6b..933f963a 100755 --- a/neurips23/ood/puck/config.yaml +++ b/neurips23/ood/puck/config.yaml @@ -26,8 +26,9 @@ text2image-10M: args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] query-args: | [ + {"search_coarse_count":10, "tinker_search_range": 190}, {"search_coarse_count":10, "tinker_search_range": 160}, {"search_coarse_count":10, "tinker_search_range": 165}, {"search_coarse_count":10, "tinker_search_range": 170}, {"search_coarse_count":10, "tinker_search_range": 175} - ] \ No newline at end of file + ] From 5885f8c9645cdab92155a6bf1df1740be3f78e20 Mon Sep 17 00:00:00 2001 From: nk2014yj Date: Mon, 30 Oct 2023 17:06:38 +0800 Subject: [PATCH 8/8] only keep puck-fizz of OOD --- neurips23/filter/puck/Dockerfile | 31 ---- neurips23/filter/puck/config.yaml | 58 ------- neurips23/filter/puck/puck.py | 222 --------------------------- neurips23/ood/puck/Dockerfile | 31 ---- neurips23/ood/puck/config.yaml | 34 ---- neurips23/ood/puck/puck.py | 216 -------------------------- neurips23/streaming/puck/Dockerfile | 31 ---- neurips23/streaming/puck/config.yaml | 18 --- neurips23/streaming/puck/puck.py | 141 ----------------- 9 files changed, 782 deletions(-) delete mode 100755 neurips23/filter/puck/Dockerfile delete mode 100755 neurips23/filter/puck/config.yaml delete mode 100755 neurips23/filter/puck/puck.py delete mode 100755 neurips23/ood/puck/Dockerfile delete mode 100755 neurips23/ood/puck/config.yaml delete mode 100755 neurips23/ood/puck/puck.py delete mode 100755 neurips23/streaming/puck/Dockerfile delete mode 100755 neurips23/streaming/puck/config.yaml delete mode 100755 neurips23/streaming/puck/puck.py diff --git a/neurips23/filter/puck/Dockerfile b/neurips23/filter/puck/Dockerfile deleted file mode 100755 index d2db5e83..00000000 --- a/neurips23/filter/puck/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt-get update -RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip -#swig -RUN apt-get update && apt-get install -y swig cmake -RUN pip3 install pybind11 numpy -RUN cat /etc/ld.so.conf -RUN ls /etc/ld.so.conf.d/ -##cmake -RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh -RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake -ENV PATH /home/app/cmake/bin:$PATH - -#mkl -RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh -RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s - -RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf -RUN ldconfig -RUN touch /etc/profile.d/intel.sh -RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh -RUN . /etc/profile.d/intel.sh - -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" -#RUN git config --global http.sslVerify false - -RUN git clone -b filter https://github.com/baidu/puck.git -RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install -RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/filter/puck/config.yaml b/neurips23/filter/puck/config.yaml deleted file mode 100755 index f71d438a..00000000 --- a/neurips23/filter/puck/config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -random-filter-s: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8"}] - query-args: | - [{"nprobe": 1}, - {"nprobe":2}, - {"nprobe":4}] -random-s: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF1024,SQ8"}] - query-args: | - [{"nprobe": 1}, - {"nprobe":2}, - {"nprobe":4}] -yfcc-10M-unfiltered: - faiss: - docker-tag: neurips23-filter-faiss - module: neurips23.filter.faiss.faiss - constructor: FAISS - base-args: ["@metric"] - run-groups: - base: - args: | - [{"indexkey": "IVF16384,SQ8", "binarysig": true, "threads": 16}] - query-args: | - [{"nprobe": 1}, {"nprobe": 4}, {"nprobe": 16}, {"nprobe": 64}] -yfcc-10M: - puck: - docker-tag: neurips23-filter-puck - module: neurips23.filter.puck.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: | - [{ "index_type": 3, "C":1000, "F":1000, "FN":16, "N":0}] - query-args: | - [ - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":135}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":105}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":110}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":115}, - {"radius_rate":1.00 ,"search_coarse_count":30, "filter_topk":120 } - ] diff --git a/neurips23/filter/puck/puck.py b/neurips23/filter/puck/puck.py deleted file mode 100755 index e13072e8..00000000 --- a/neurips23/filter/puck/puck.py +++ /dev/null @@ -1,222 +0,0 @@ -# !/usr/bin/env python3 -#-*- coding:utf-8 -*- -################################################################################ -# -# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved -# -################################################################################ -""" -@file: puck_inmem.py -@author: yinjie06(yinjie06@baidu.com) -@date: 2021-10-06 13:44 -@brief: -""" - -from neurips23.filter.base import BaseFilterANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated -from puck import py_puck_api -import multiprocessing.pool -from multiprocessing import Process -import os -import numpy as np -import time -import math -import struct - -CPU_LIMIT = multiprocessing.cpu_count() -swig_ptr = py_puck_api.swig_ptr - -class Puck(BaseFilterANN): - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - self.indexkey = index_params.get("indexkey", "NA") - - self.index = py_puck_api.PySearcher() - self.topk = 10 - self.n = 0 - self.build_memory_usage = -1 - print("after init") - - def init_dataset_key(self, dataset): - - #更新gflags - ds = DATASETS[dataset]() - d = ds.d - #特征纬度 - py_puck_api.update_gflag('feature_dim', "%d"%(d)) - - #根据距离计算方式调整 - self.whether_norm = False - py_puck_api.update_gflag('whether_norm', 'false') - self.ip2cos = 0 - if ds.distance() == "angular": - self.whether_norm = True - py_puck_api.update_gflag('whether_norm', 'true') - elif ds.distance() == "ip": - self.ip2cos = 1 - py_puck_api.update_gflag('ip2cos', '%d'%(self.ip2cos)) - - self.init_indexkey() - #测试 - py_puck_api.update_gflag('kmeans_iterations_count',"1") - - def check_feature(self, dataset): - self.init_dataset_key(dataset) - ds = DATASETS[dataset]() - d = ds.d - #索引存储目录 - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - if not os.path.exists(self.index_name(dataset)): - index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - - meta_indices_file_name = self.index_name(dataset) + "/indices.dat" - meta_indices_file = open(meta_indices_file_name, 'wb') - meta_indptr_file_name = self.index_name(dataset) + "/indptr.dat" - meta_indptr_file = open(meta_indptr_file_name, 'wb') - - meta_to_write = ds.get_dataset_metadata() - buf = struct.pack('i', len(meta_to_write.indices)) - meta_indices_file.write(buf) - buf = struct.pack('i' * len(meta_to_write.indices), *(meta_to_write.indices)) - meta_indices_file.write(buf) - meta_indices_file.close() - - buf = struct.pack('i', len(meta_to_write.indptr)) - meta_indptr_file.write(buf) - buf = struct.pack('i' * len(meta_to_write.indptr), *(meta_to_write.indptr)) - meta_indptr_file.write(buf) - meta_indptr_file.close() - - #训练用目录 - if not os.path.exists('mid-data'): - os.mkdir('mid-data') - #格式化文件数据,将来可不要,后续可修改训练接口 - all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') - - add_part=100000 - i0 = 0 - t0 = time.time() - for xblock in ds.get_dataset_iterator(bs=add_part): - i1 = i0 + len(xblock) - i0 = i1 - for x in xblock: - feat = x.astype(np.float32) - if(self.whether_norm): - feat = feat / np.sqrt(np.dot(feat, feat)) - elif(self.ip2cos > 0): - norm = np.dot(feat, feat) - if norm > 1.0: - print("not support, please contact yinjie06") - return False - feat = np.append(feat, math.sqrt(1.0 - norm)) - - buf = struct.pack('i', len(feat)) - all_feature_file.write(buf) - buf = struct.pack('f' * len(feat), *feat) - all_feature_file.write(buf) - print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) - all_feature_file.close() - - def fit(self, dataset): - print("start fit") - #self.check_feature(dataset) - p = Process(target=self.check_feature, args=(dataset,)) - p.start() - p.join() - self.init_dataset_key(dataset) - ds = DATASETS[dataset]() - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - #训练数据采样 - py_puck_api.update_gflag('train_points_count', "5000000") - py_puck_api.update_gflag('pq_train_points_count', "500000") - print(self.index_name(dataset)) - print("start to train") - self.index.build(ds.nb) - self.load_index(dataset) - - def init_indexkey(self): - #一级聚类中心 - if "C" in self._index_params: - py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) - self.indexkey = "C%s"%(self._index_params['C']) - #二级聚类中心 - if "F" in self._index_params: - py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) - self.indexkey += "_F%s"%(self._index_params['F']) - #filter - if "FN" in self._index_params: - py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) - self.indexkey += "_FN%s"%(self._index_params['FN']) - #量化 - if "N" in self._index_params: - if int(self._index_params['N']) > 1: - py_puck_api.update_gflag('whether_pq', 'true') - py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) - self.indexkey += "_N%s"%(self._index_params['N']) - else: - py_puck_api.update_gflag('whether_pq', 'false') - self.indexkey += "_Flat" - - if "tinker_neighborhood" in self._index_params: - py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) - self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) - if "tinker_construction" in self._index_params: - py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) - self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) - if "index_type" in self._index_params: - py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) - - def index_name(self, name): - return f"data/{name}.{self.indexkey}.puckindex" - - def index_tag_name(self, name): - return f"{name}.{self.indexkey}.puckindex" - - def load_index(self, dataset): - print("Loading index") - self.init_indexkey() - ds = DATASETS[dataset]() - self.topk = ds.default_count() - print("self.topk=%d"%self.topk) - py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - py_puck_api.update_gflag('context_initial_pool_size', "%d"%(CPU_LIMIT)) - py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) - print(self.index_name(dataset)) - ret = self.index.init() - print("ret = ",ret) - if ret != 0: - return False - self.index.show() - self.n = ds.nq - return True - - def set_query_arguments(self, query_args): - for key, value in query_args.items(): - py_puck_api.update_gflag(key, "%s"%value) - #query_args_list = query_args.strip().split(',') - #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) - self.index.init() - #topk是作为检索参数传入puck - self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) - self.qas = query_args - - def query(self, X, topK): - - n, d = X.shape - self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) - #print(self.res[0]) - print(self.res[1]) - def get_results(self): - return self.res[1] - def filtered_query(self, X, filter, k): - n, d = X.shape - x_float = X.astype(np.float32) - meta_q = filter - self.index.filter_search(n, swig_ptr(x_float), k, swig_ptr(self.res[0]), swig_ptr(self.res[1]), swig_ptr(filter.indptr), swig_ptr(filter.indices)) - - def __str__(self): - return f'Puck{self.indexkey, self.qas}' diff --git a/neurips23/ood/puck/Dockerfile b/neurips23/ood/puck/Dockerfile deleted file mode 100755 index ff088505..00000000 --- a/neurips23/ood/puck/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt-get update -RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip -#swig -RUN apt-get update && apt-get install -y swig cmake -RUN pip3 install pybind11 numpy -RUN cat /etc/ld.so.conf -RUN ls /etc/ld.so.conf.d/ -##cmake -RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh -RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake -ENV PATH /home/app/cmake/bin:$PATH - -#mkl -RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh -RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s - -RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf -RUN ldconfig -RUN touch /etc/profile.d/intel.sh -RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh -RUN . /etc/profile.d/intel.sh - -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" -#RUN git config --global http.sslVerify false - -RUN git clone -b ood https://github.com/baidu/puck.git -RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install -RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/ood/puck/config.yaml b/neurips23/ood/puck/config.yaml deleted file mode 100755 index 933f963a..00000000 --- a/neurips23/ood/puck/config.yaml +++ /dev/null @@ -1,34 +0,0 @@ -random-xs: - puck: - docker-tag: neurips23-ood-puck - module: neurips23.ood.puck.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] - query-args: | - [ - {"search_coarse_count":50, "tinker_search_range": 100}, - {"search_coarse_count":50, "tinker_search_range": 200}, - {"search_coarse_count":50, "tinker_search_range": 300} - ] - - -text2image-10M: - puck: - docker-tag: neurips23-ood-puck - module: neurips23.ood.puck.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: [{"index_type":2, "C":1000, "F":1000,"tinker_neighborhood":16,"tinker_construction":200}] - query-args: | - [ - {"search_coarse_count":10, "tinker_search_range": 190}, - {"search_coarse_count":10, "tinker_search_range": 160}, - {"search_coarse_count":10, "tinker_search_range": 165}, - {"search_coarse_count":10, "tinker_search_range": 170}, - {"search_coarse_count":10, "tinker_search_range": 175} - ] diff --git a/neurips23/ood/puck/puck.py b/neurips23/ood/puck/puck.py deleted file mode 100755 index 6c407800..00000000 --- a/neurips23/ood/puck/puck.py +++ /dev/null @@ -1,216 +0,0 @@ -# !/usr/bin/env python3 -#-*- coding:utf-8 -*- -################################################################################ -# -# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved -# -################################################################################ -""" -@file: puck_inmem.py -@author: yinjie06(yinjie06@baidu.com) -@date: 2021-10-06 13:44 -@brief: -""" -import ctypes - -from neurips23.ood.base import BaseOODANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated,xbin_mmap,xbin_write -from puck import py_puck_api -import multiprocessing.pool -import multiprocessing -import gc -import os -import numpy as np -import time -import math -import struct - -CPU_LIMIT = multiprocessing.cpu_count() -swig_ptr = py_puck_api.swig_ptr -class Puck(BaseOODANN): - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - self.indexkey = index_params.get("indexkey", "NA") - - self.index = py_puck_api.PySearcher() - self.topk = 10 - self.n = 0 - self.build_memory_usage = -1 - - def track(self): - #T1 means in memory - return "T1 for 10M & 100M" - - - def check_feature(self, dataset): - #更新gflags - ds = DATASETS[dataset]() - d = ds.d - #特征纬度 - py_puck_api.update_gflag('feature_dim', "%d"%(d)) - - #根据距离计算方式调整 - whether_norm = False - py_puck_api.update_gflag('whether_norm', 'false') - ip2cos = 0 - if ds.distance() == "angular": - whether_norm = True - py_puck_api.update_gflag('whether_norm', 'true') - elif ds.distance() == "ip": - ip2cos = 1 - py_puck_api.update_gflag('ip2cos', '%d'%(ip2cos)) - - self.init_indexkey() - - #测试 - py_puck_api.update_gflag('kmeans_iterations_count',"1") - - #索引存储目录 - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - if not os.path.exists(self.index_name(dataset)): - index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - - #训练用目录 - if not os.path.exists('mid-data'): - os.mkdir('mid-data') - - #格式化文件数据,将来可不要,后续可修改训练接口 - all_feature_file = open("%s/all_data.feat.bin"%(self.index_name(dataset)), 'wb') - - add_part=100000 - i0 = 0 - t0 = time.time() - for xblock in ds.get_dataset_iterator(bs=add_part): - i1 = i0 + len(xblock) - i0 = i1 - for x in xblock: - feat = x - if(whether_norm): - feat = feat / np.sqrt(np.dot(feat, feat)) - elif(ip2cos > 0): - norm = np.dot(feat, feat) - if norm > 1.0: - print("not support, please contact yinjie06") - return False - feat = np.append(feat, math.sqrt(1.0 - norm)) - - buf = struct.pack('i', len(feat)) - all_feature_file.write(buf) - buf = struct.pack('f' * len(feat), *feat) - all_feature_file.write(buf) - print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0)) - all_feature_file.close() - - # print(" init help query ") - # filename = os.path.join(ds.basedir, ds.qs_fn) - # read_x = xbin_mmap(filename, dtype=ds.dtype) - - # write_x = np.append(read_x, np.zeros((read_x.shape[0], 1)), axis=1) - # print("help query shape nrows = %d , ncols = %d "%(write_x.shape[0],write_x.shape[1])) - # xbin_write(write_x,"%s/help_query.feat.bin"%(self.index_name(dataset))) - - return True - - def fit(self, dataset): - self.check_feature(dataset) - ds = DATASETS[dataset]() - #训练数据采样 - py_puck_api.update_gflag('train_points_count', "5000000") - py_puck_api.update_gflag('pq_train_points_count', "500000") - print(self.index_name(dataset)) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - - self.index.build(ds.nb) - self.load_index(dataset) - - #index = py_puck_api.PySearcher() - #p = multiprocessing.Process(group=None,target=index.build,args=(ds.nb,)) - #self.index.build(ds.nb) - #p.start() - #p.join() - - def init_indexkey(self): - #一级聚类中心 - if "C" in self._index_params: - py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) - self.indexkey = "C%s"%(self._index_params['C']) - #二级聚类中心 - if "F" in self._index_params: - py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) - self.indexkey += "_F%s"%(self._index_params['F']) - #filter - if "FN" in self._index_params: - py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) - self.indexkey += "_FN%s"%(self._index_params['FN']) - #量化 - if "N" in self._index_params: - if int(self._index_params['N']) > 1: - py_puck_api.update_gflag('whether_pq', 'true') - py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) - self.indexkey += "_N%s"%(self._index_params['N']) - else: - py_puck_api.update_gflag('whether_pq', 'false') - self.indexkey += "_Flat" - - if "tinker_neighborhood" in self._index_params: - py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) - self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) - if "tinker_construction" in self._index_params: - py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) - self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) - if "index_type" in self._index_params: - py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) - if "radius_rate" in self._index_params: - py_puck_api.update_gflag('radius_rate', "%f"%(self._index_params['radius_rate'])) - self.indexkey += "_RadiusRate%s"%(self._index_params['radius_rate']) - - - def index_name(self, name): - return f"data/{name}.{self.indexkey}.puckindex" - - def index_tag_name(self, name): - return f"{name}.{self.indexkey}.puckindex" - - def load_index(self, dataset): - print("Loading index") - self.init_indexkey() - ds = DATASETS[dataset]() - self.topk = ds.default_count() - print("self.topk=%d"%self.topk) - py_puck_api.update_gflag('topk', "%s"%(ds.default_count())) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - py_puck_api.update_gflag('context_initial_pool_size', "%d"%(2 * CPU_LIMIT)) - py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) - print(self.index_name(dataset)) - ret = self.index.init() - print("ret = ",ret) - if ret != 0: - return False - self.index.show() - self.n = ds.nq - return True - - def set_query_arguments(self, query_args): - for key, value in query_args.items(): - py_puck_api.update_gflag(key, "%s"%value) - #query_args_list = query_args.strip().split(',') - #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) - self.index.init() - #topk是作为检索参数传入puck - self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32')) - self.qas = query_args - - def query(self, X, topK): - n, d = X.shape - - self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) - #print(self.res[0]) - # print(self.res[1]) - def get_results(self): - return self.res[1] - - def __str__(self): - return f'Puck{self.indexkey, self.qas}' \ No newline at end of file diff --git a/neurips23/streaming/puck/Dockerfile b/neurips23/streaming/puck/Dockerfile deleted file mode 100755 index 0af82acc..00000000 --- a/neurips23/streaming/puck/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -FROM neurips23 - -RUN apt update -RUN apt-get update -RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip -#swig -RUN apt-get update && apt-get install -y swig cmake -RUN pip3 install pybind11 numpy -RUN cat /etc/ld.so.conf -RUN ls /etc/ld.so.conf.d/ -##cmake -RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh -RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake -ENV PATH /home/app/cmake/bin:$PATH - -#mkl -RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh -RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s - -RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf -RUN ldconfig -RUN touch /etc/profile.d/intel.sh -RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh -RUN . /etc/profile.d/intel.sh - -ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON" -#RUN git config --global http.sslVerify false - -RUN git clone -b streaming https://github.com/baidu/puck.git -RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install -RUN python3 -c 'from puck import py_puck_api' diff --git a/neurips23/streaming/puck/config.yaml b/neurips23/streaming/puck/config.yaml deleted file mode 100755 index 28a4e4a0..00000000 --- a/neurips23/streaming/puck/config.yaml +++ /dev/null @@ -1,18 +0,0 @@ -msturing-30M-clustered: - puck: - docker-tag: neurips23-streaming-puck - module: neurips23.streaming.puck.puck - constructor: Puck - base-args: ["@metric"] - run-groups: - base: - args: | - [ - { "index_type": 1, "C":200, "F":200, "FN":8, "N":0} - ] - query-args: | - [ - {"radius_rate":1.00 ,"search_coarse_count":200, "filter_topk":1000 } - ] - - diff --git a/neurips23/streaming/puck/puck.py b/neurips23/streaming/puck/puck.py deleted file mode 100755 index ccd4e9f6..00000000 --- a/neurips23/streaming/puck/puck.py +++ /dev/null @@ -1,141 +0,0 @@ -# !/usr/bin/env python3 -#-*- coding:utf-8 -*- -################################################################################ -# -# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved -# -################################################################################ -""" -@file: puck_inmem.py -@author: yinjie06(yinjie06@baidu.com) -@date: 2021-10-06 13:44 -@brief: -""" -import ctypes - -from neurips23.streaming.base import BaseStreamingANN -from benchmark.datasets import DATASETS -from benchmark.dataset_io import download_accelerated -from puck import py_puck_api -import multiprocessing.pool -import multiprocessing -import gc -import os -import numpy as np -import time -import math -import struct -import sys -print(sys.version) -CPU_LIMIT = 16 #multiprocessing.cpu_count() -swig_ptr = py_puck_api.swig_ptr -class Puck(BaseStreamingANN): - def __init__(self, metric, index_params): - self._index_params = index_params - self._metric = metric - self.indexkey = index_params.get("indexkey", "NA") - - self.index = py_puck_api.PySearcher() - self.topk = 10 - self.n = 0 - self.build_memory_usage = -1 - - def track(self): - #T1 means in memory - return "T1 for 10M & 100M" - - - def init_indexkey(self): - #一级聚类中心 - if "C" in self._index_params: - py_puck_api.update_gflag('coarse_cluster_count', "%d"%(self._index_params['C'])) - self.indexkey = "C%s"%(self._index_params['C']) - #二级聚类中心 - if "F" in self._index_params: - py_puck_api.update_gflag('fine_cluster_count', "%d"%(self._index_params['F'])) - self.indexkey += "_F%s"%(self._index_params['F']) - #filter - if "FN" in self._index_params: - py_puck_api.update_gflag('filter_nsq', "%d"%(self._index_params['FN'])) - self.indexkey += "_FN%s"%(self._index_params['FN']) - #量化 - if "N" in self._index_params: - if int(self._index_params['N']) > 1: - py_puck_api.update_gflag('whether_pq', 'true') - py_puck_api.update_gflag('nsq', "%d"%(self._index_params['N'])) - self.indexkey += "_N%s"%(self._index_params['N']) - else: - py_puck_api.update_gflag('whether_pq', 'false') - self.indexkey += "_Flat" - - if "tinker_neighborhood" in self._index_params: - py_puck_api.update_gflag('tinker_neighborhood', "%d"%(self._index_params['tinker_neighborhood'])) - self.indexkey += "_Neighborhood%s"%(self._index_params['tinker_neighborhood']) - if "tinker_construction" in self._index_params: - py_puck_api.update_gflag('tinker_construction', "%d"%(self._index_params['tinker_construction'])) - self.indexkey += "_Construction%s"%(self._index_params['tinker_construction']) - if "index_type" in self._index_params: - py_puck_api.update_gflag('index_type', "%d"%(self._index_params['index_type'])) - if "radius_rate" in self._index_params: - py_puck_api.update_gflag('radius_rate', "%f"%(self._index_params['radius_rate'])) - self.indexkey += "_RadiusRate%s"%(self._index_params['radius_rate']) - - - def index_name(self, name): - return f"data/{name}.{self.indexkey}.puckindex" - - def index_tag_name(self, name): - return f"{name}.{self.indexkey}.puckindex" - - def setup(self, dtype, max_pts, ndim): - print("setup") - py_puck_api.update_gflag('max_point_stored', '%d'%(max_pts)) - py_puck_api.update_gflag('whether_norm', 'false') - py_puck_api.update_gflag('max_point_stored', "%d"%(max_pts)) - py_puck_api.update_gflag('feature_dim', "%d"%(ndim)) - self.init_indexkey() - #索引存储目录 - dataset = "streaming" - py_puck_api.update_gflag('kmeans_iterations_count',"1") - py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) - py_puck_api.update_gflag('context_initial_pool_size', "%d"%(2 * CPU_LIMIT)) - print(self.index_name(dataset)) - py_puck_api.update_gflag('index_path', self.index_name(dataset)) - - if not os.path.exists(self.index_name(dataset)): - index_dir = os.path.join(os.getcwd(), self.index_name(dataset)) - os.makedirs(index_dir, mode=0o777, exist_ok=True) - - #训练用目录 - if not os.path.exists('mid-data'): - os.mkdir('mid-data') - - print(1) - self.index.init() - - def set_query_arguments(self, query_args): - for key, value in query_args.items(): - py_puck_api.update_gflag(key, "%s"%value) - #query_args_list = query_args.strip().split(',') - #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3])) - py_puck_api.update_gflag('threads_count', "%d"%(CPU_LIMIT)) - self.index.init() - #topk是作为检索参数传入puck - self.qas = query_args - def insert(self, X, ids): - n, d = X.shape - self.index.batch_add(n, d, swig_ptr(X), swig_ptr(ids)) - def delete(self, ids): - n = len(ids) - self.index.batch_delete(n, swig_ptr(ids)) - def query(self, X, topK): - n, d = X.shape - self.res = (np.empty((n, topK), dtype='float32'), np.empty((n, topK), dtype='uint32')) - self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1])) - #print(self.res[0]) - #print(self.res[1]) - def get_results(self): - return self.res[1] - - def __str__(self): - return f'Puck{self.indexkey, self.qas}'