[NeurIPS 2023 OOD Track] Puck-fizz #203

Closed · wants to merge 9 commits
35 changes: 35 additions & 0 deletions neurips23/ood/puck-fizz/Dockerfile
@@ -0,0 +1,35 @@
FROM neurips23

RUN apt-get update
RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip
# swig
RUN apt-get install -y swig
RUN pip3 install pybind11 numpy
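# Debug: inspect the dynamic-loader search paths.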
RUN cat /etc/ld.so.conf
RUN ls /etc/ld.so.conf.d/
# cmake
# COPY cmake-3.22.0-linux-x86_64.sh .
RUN wget https://cmake.org/files/v3.22/cmake-3.22.0-linux-x86_64.sh
RUN mkdir cmake && sh cmake-3.22.0-linux-x86_64.sh --skip-license --prefix=cmake
ENV PATH /home/app/cmake/bin:$PATH

# Intel oneMKL
# COPY l_onemkl_p_2023.2.0.49497_offline.sh .
RUN wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/adb8a02c-4ee7-4882-97d6-a524150da358/l_onemkl_p_2023.2.0.49497_offline.sh
RUN sh l_onemkl_p_2023.2.0.49497_offline.sh -a --components intel.oneapi.lin.mkl.devel --action install --eula accept -s

RUN echo "/opt/intel/oneapi/mkl/latest/lib/intel64" > /etc/ld.so.conf.d/mkl.conf
RUN ldconfig
RUN touch /etc/profile.d/intel.sh
RUN echo ". /opt/intel/oneapi/mkl/latest/env/vars.sh" >> /etc/profile.d/intel.sh
RUN . /etc/profile.d/intel.sh

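# CMake flags: link puck against the static, sequential MKL BLAS.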
ENV CMAKE_ARGS "-DMKLROOT=/opt/intel/oneapi/mkl/latest/ -DBLA_VENDOR=Intel10_64lp_seq -DBLA_STATIC=ON"
#RUN git config --global http.sslVerify false

RUN git clone -b ood-try https://github.com/baidu/puck.git
# COPY puck-ood-feature.tar.gz .
# RUN tar zxvf puck-ood-feature.tar.gz
RUN cd puck && . /etc/profile.d/intel.sh && python3 setup.py install
RUN python3 -c 'from puck import py_puck_api'
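
With the Dockerfile in place, the image is built through the standard big-ann-benchmarks install script (a sketch assuming the stock NeurIPS'23 harness flags):

python install.py --neurips23track ood --algorithm puck-fizz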
33 changes: 33 additions & 0 deletions neurips23/ood/puck-fizz/config.yaml
@@ -0,0 +1,33 @@
random-xs:
  puck-fizz:
    docker-tag: neurips23-ood-puck-fizz
    module: neurips23.ood.puck-fizz.puck
    constructor: Puck
    base-args: ["@metric"]
    run-groups:
      base:
        args: [{"index_type": 2, "C": 1000, "F": 1000, "tinker_neighborhood": 16, "tinker_construction": 200}]
        query-args: |
          [
            {"search_coarse_count": 50, "tinker_search_range": 100},
            {"search_coarse_count": 50, "tinker_search_range": 200},
            {"search_coarse_count": 50, "tinker_search_range": 300}
          ]


text2image-10M:
  puck-fizz:
    docker-tag: neurips23-ood-puck-fizz
    module: neurips23.ood.puck-fizz.puck
    constructor: Puck
    base-args: ["@metric"]
    run-groups:
      base:
        args: [{"index_type": 2, "C": 1000, "F": 1000, "tinker_neighborhood": 16, "tinker_construction": 200}]
        query-args: |
          [
            {"search_coarse_count": 10, "tinker_search_range": 160},
            {"search_coarse_count": 10, "tinker_search_range": 170},
            {"search_coarse_count": 10, "tinker_search_range": 180},
            {"search_coarse_count": 10, "tinker_search_range": 190}
          ]
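
Each dict under query-args is passed to set_query_arguments in puck.py below, one per query run. With the image built, a run follows the usual harness invocation (again assuming the stock run.py flags):

python run.py --neurips23track ood --algorithm puck-fizz --dataset text2image-10M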
217 changes: 217 additions & 0 deletions neurips23/ood/puck-fizz/puck.py
@@ -0,0 +1,217 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
################################################################################
#
# Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
#
################################################################################
"""
@file: puck.py
@author: heaoxiang([email protected])
@date: 2023-10-29 13:44
@brief: Puck wrapper for the NeurIPS'23 OOD track of big-ann-benchmarks.
"""
import ctypes

from neurips23.ood.base import BaseOODANN
from benchmark.datasets import DATASETS
from benchmark.dataset_io import download_accelerated, xbin_mmap, xbin_write
# from neurips23.ood.puck.puck_lib import py_puck_api
from puck import py_puck_api
import multiprocessing.pool
import multiprocessing
import gc
import os
import numpy as np
import time
import math
import struct

CPU_LIMIT = multiprocessing.cpu_count()
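# swig_ptr wraps a contiguous numpy buffer as a raw pointer for the C++ searcher.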
swig_ptr = py_puck_api.swig_ptr
class Puck(BaseOODANN):
    def __init__(self, metric, index_params):
        self._index_params = index_params
        self._metric = metric
        self.indexkey = index_params.get("indexkey", "NA")

        self.index = py_puck_api.PySearcher()
        self.topk = 10
        self.n = 0
        self.build_memory_usage = -1

    def track(self):
        # T1 means in memory
        return "T1 for 10M & 100M"


    def check_feature(self, dataset):
        # update gflags
        ds = DATASETS[dataset]()
        d = ds.d
        # feature dimension
        py_puck_api.update_gflag('feature_dim', "%d" % (d))

        # adjust for the distance metric
        whether_norm = False
        py_puck_api.update_gflag('whether_norm', 'false')
        ip2cos = 0
        if ds.distance() == "angular":
            whether_norm = True
            py_puck_api.update_gflag('whether_norm', 'true')
        elif ds.distance() == "ip":
            ip2cos = 1
        py_puck_api.update_gflag('ip2cos', '%d' % (ip2cos))

        self.init_indexkey()

        # for testing: run a single k-means iteration
        py_puck_api.update_gflag('kmeans_iterations_count', "1")

        # index storage directory
        py_puck_api.update_gflag('index_path', self.index_name(dataset))
        if not os.path.exists(self.index_name(dataset)):
            index_dir = os.path.join(os.getcwd(), self.index_name(dataset))
            os.makedirs(index_dir, mode=0o777, exist_ok=True)

        # directory for training data
        if not os.path.exists('mid-data'):
            os.mkdir('mid-data')

        # reformat the raw data; this step can be dropped once the training interface is updated
        all_feature_file = open("%s/all_data.feat.bin" % (self.index_name(dataset)), 'wb')

        add_part = 100000
        i0 = 0
        t0 = time.time()
        for xblock in ds.get_dataset_iterator(bs=add_part):
            i1 = i0 + len(xblock)
            for x in xblock:
                feat = x
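                # Angular: L2-normalize so inner product equals cosine similarity.
                # Inner product (ip2cos): append sqrt(1 - ||x||^2) as one extra
                # dimension, which reduces MIPS to cosine/L2 search over the
                # augmented vectors; valid only while ||x|| <= 1 (checked below).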
                if whether_norm:
                    feat = feat / np.sqrt(np.dot(feat, feat))
                elif ip2cos > 0:
                    norm = np.dot(feat, feat)
                    if norm > 1.0:
                        print("vector norm > 1 is not supported with ip2cos, please contact yinjie06")
                        return False
                    feat = np.append(feat, math.sqrt(1.0 - norm))

                buf = struct.pack('i', len(feat))
                all_feature_file.write(buf)
                buf = struct.pack('f' * len(feat), *feat)
                all_feature_file.write(buf)
            print(" adding %d:%d / %d [%.3f s] " % (i0, i1, ds.nb, time.time() - t0))
            i0 = i1
        all_feature_file.close()

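        # The OOD sample ("help") query file qs_fn is padded with one zero column so
        # it matches the ip2cos-augmented base-vector dimensionality used above.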
print(" init help query ")
filename = os.path.join(ds.basedir, ds.qs_fn)
if os.path.exists(filename):
read_x = xbin_mmap(filename, dtype=ds.dtype)
write_x = np.append(read_x, np.zeros((read_x.shape[0], 1)), axis=1)
print("help query shape nrows = %d , ncols = %d "%(write_x.shape[0],write_x.shape[1]))
xbin_write(write_x,"%s/help_query.feat.bin"%(self.index_name(dataset)))

return True

    def fit(self, dataset):
        self.check_feature(dataset)
        ds = DATASETS[dataset]()
        # sampling counts for training data
        py_puck_api.update_gflag('train_points_count', "5000000")
        py_puck_api.update_gflag('pq_train_points_count', "500000")
        print(self.index_name(dataset))
        py_puck_api.update_gflag('index_path', self.index_name(dataset))

        self.index.build(ds.nb)
        self.load_index(dataset)

        #index = py_puck_api.PySearcher()
        #p = multiprocessing.Process(group=None,target=index.build,args=(ds.nb,))
        #self.index.build(ds.nb)
        #p.start()
        #p.join()

    def init_indexkey(self):
        # first-level (coarse) cluster count
        if "C" in self._index_params:
            py_puck_api.update_gflag('coarse_cluster_count', "%d" % (self._index_params['C']))
            self.indexkey = "C%s" % (self._index_params['C'])
        # second-level (fine) cluster count
        if "F" in self._index_params:
            py_puck_api.update_gflag('fine_cluster_count', "%d" % (self._index_params['F']))
            self.indexkey += "_F%s" % (self._index_params['F'])
        # filter
        if "FN" in self._index_params:
            py_puck_api.update_gflag('filter_nsq', "%d" % (self._index_params['FN']))
            self.indexkey += "_FN%s" % (self._index_params['FN'])
        # quantization (PQ)
        if "N" in self._index_params:
            if int(self._index_params['N']) > 1:
                py_puck_api.update_gflag('whether_pq', 'true')
                py_puck_api.update_gflag('nsq', "%d" % (self._index_params['N']))
                self.indexkey += "_N%s" % (self._index_params['N'])
            else:
                py_puck_api.update_gflag('whether_pq', 'false')
                self.indexkey += "_Flat"

        if "tinker_neighborhood" in self._index_params:
            py_puck_api.update_gflag('tinker_neighborhood', "%d" % (self._index_params['tinker_neighborhood']))
            self.indexkey += "_Neighborhood%s" % (self._index_params['tinker_neighborhood'])
        if "tinker_construction" in self._index_params:
            py_puck_api.update_gflag('tinker_construction', "%d" % (self._index_params['tinker_construction']))
            self.indexkey += "_Construction%s" % (self._index_params['tinker_construction'])
        if "index_type" in self._index_params:
            py_puck_api.update_gflag('index_type', "%d" % (self._index_params['index_type']))
        if "radius_rate" in self._index_params:
            py_puck_api.update_gflag('radius_rate', "%f" % (self._index_params['radius_rate']))
            self.indexkey += "_RadiusRate%s" % (self._index_params['radius_rate'])


    def index_name(self, name):
        return f"data/{name}.{self.indexkey}.puckindex"

    def index_tag_name(self, name):
        return f"{name}.{self.indexkey}.puckindex"

    def load_index(self, dataset):
        print("Loading index")
        self.init_indexkey()
        ds = DATASETS[dataset]()
        self.topk = ds.default_count()
        print("self.topk=%d" % self.topk)
        py_puck_api.update_gflag('topk', "%s" % (ds.default_count()))
        py_puck_api.update_gflag('index_path', self.index_name(dataset))
        py_puck_api.update_gflag('context_initial_pool_size', "%d" % (2 * CPU_LIMIT))
        py_puck_api.update_gflag('threads_count', "%d" % (CPU_LIMIT))
        print(self.index_name(dataset))
        ret = self.index.init()
        print("ret = ", ret)
        if ret != 0:
            return False
        self.index.show()
        self.n = ds.nq
        return True

    def set_query_arguments(self, query_args):
        for key, value in query_args.items():
            py_puck_api.update_gflag(key, "%s" % value)
        #query_args_list = query_args.strip().split(',')
        #self.index.update_params(int(self.topk), int(query_args_list[1]), int(query_args_list[2]),int(query_args_list[3]))
        self.index.init()
        # topk is passed to puck as a search-time parameter
        self.res = (np.empty((self.n, self.topk), dtype='float32'), np.empty((self.n, self.topk), dtype='uint32'))
        self.qas = query_args

    def query(self, X, topK):
        n, d = X.shape

        self.index.search(n, swig_ptr(X), topK, swig_ptr(self.res[0]), swig_ptr(self.res[1]))
        #print(self.res[0])
        #print(self.res[1])

    def get_results(self):
        return self.res[1]

    def __str__(self):
        return f'Puck({self.indexkey},{self.qas})'
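
For reference, a minimal standalone driver sketching the wrapper's lifecycle. This is illustrative only: demo is a hypothetical helper, and in practice the harness's run.py instantiates Puck, supplying @metric and the parameter dicts from config.yaml.

# Hypothetical standalone driver; normally run.py drives this class.
from benchmark.datasets import DATASETS

def demo(dataset="random-xs"):
    algo = Puck("euclidean", {"index_type": 2, "C": 1000, "F": 1000,
                              "tinker_neighborhood": 16, "tinker_construction": 200})
    algo.fit(dataset)                                  # build the index, then load it
    algo.set_query_arguments({"search_coarse_count": 50, "tinker_search_range": 100})
    X = DATASETS[dataset]().get_queries()              # query matrix of shape (nq, d)
    algo.query(X, 10)                                  # fills algo.res in place
    return algo.get_results()                          # uint32 neighbor ids, (nq, topk)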