diff --git a/include/CANDY/ThresholdIndex.h b/include/CANDY/ThresholdIndex.h
new file mode 100644
index 000000000..b8c24d15b
--- /dev/null
+++ b/include/CANDY/ThresholdIndex.h
@@ -0,0 +1,212 @@
+/*! \file ThresholdIndex.h*/
+//
+// Created by tony on 04/01/24.
+//
+
+#ifndef CANDY_INCLUDE_CANDY_THRESHOLDINDEX_H_
+#define CANDY_INCLUDE_CANDY_THRESHOLDINDEX_H_
+
+#include <CANDY/AbstractIndex.h>
+#include <Utils/ConfigMap.hpp>
+#include <Utils/IntelliTensorOP.hpp>
+#include <faiss/IndexFlat.h>
+#include <torch/torch.h>
+#include <memory>
+#include <string>
+#include <vector>
+namespace CANDY {
+
+/**
+ * @ingroup CANDY_lib_bottom The main body and interfaces of library function
+ * @{
+ */
+/**
+ * @class ThresholdIndex CANDY/ThresholdIndex.h
+ * @brief The threshold-based index container, which splits incoming data across multiple sub-indexes once a volume threshold is reached
+ */
+class ThresholdIndex : public AbstractIndex {
+ protected:
+  faiss::MetricType faissMetric = faiss::METRIC_L2;
+  int64_t containerTier = 0;
+  int64_t dataThreshold = 100;
+  int64_t dataVolume = 0;
+  std::string indexAlgorithm;
+  //std::vector<faiss::Index *> indices;
+  std::vector<CANDY::AbstractIndexPtr> indices;
+
+ public:
+  bool isHPCStarted = false;
+  ThresholdIndex() {
+
+  }
+
+  ~ThresholdIndex() {
+
+  }
+  /**
+   * @brief set the tier of this indexing, 0 refers to the entry indexing
+   * @param tie the setting of tier number
+   * @note The tier index affects nothing now, but will do something later
+   */
+  /*virtual void setTier(int64_t tie) {
+    containerTier = tie;
+  }*/
+  /**
+   * @brief reset this index to the initial status
+   */
+  virtual void reset();
+  /**
+   * @brief set the index-specific config related to one index, using a raw config class
+   * @param cfg the config of this class, using the raw class
+   * @note If there are any pre-built data structures, please load them when implementing this
+   * @note If there are any initial tensors to be stored, please load them after this by @ref loadInitialTensor
+   * @return bool whether the configuration is successful
+   */
+  virtual bool setConfigClass(INTELLI::ConfigMap cfg);
+  /**
+   * @brief set the index-specific config related to one index
+   * @param cfg the config of this class
+   * @note If there are any pre-built data structures, please load them when implementing this
+   * @note If there are any initial tensors to be stored, please load them after this by @ref loadInitialTensor
+   * @return bool whether the configuration is successful
+   */
+  virtual bool setConfig(INTELLI::ConfigMapPtr cfg);
+  /**
+   * @brief some extra set-ups if the index has HPC features
+   * @return bool whether the HPC set-up is successful
+   */
+  //virtual bool startHPC();
+  /**
+   * @brief insert a tensor into the sub-index selected by the threshold
+   * @note This is majorly an online function
+   * @param t the tensor, some indexes need it to be a single row
+   * @param nameTag the name tag of the underlying index algorithm
+   * @return bool whether the insertion is successful
+   */
+  virtual bool insertTensor_th(torch::Tensor &t, std::string nameTag);
+
+  virtual void createThresholdIndex(int64_t dimension, std::string nameTag);
+  //CANDY::AbstractIndexPtr createIndex(const std::string& nameTag);
+
+  //CANDY::AbstractIndexPtr createIndex(std::string nameTag);
+
+  /**
+   * @brief load the initial tensors of a database, use this BEFORE @ref insertTensor
+   * @note This is majorly an offline function, and may be different from @ref insertTensor for some indexes
+   * @param t the tensor, some indexes need it to be a single row
+   * @return bool whether the loading is successful
+   */
+  // virtual bool loadInitialTensor(torch::Tensor &t);
+  /**
+   * @brief delete a tensor, also an online function
+   * @param t the tensor, some indexes need it to be a single row
+   * @param k the number of nearest neighbors
+   * @return bool whether the deletion is successful
+   */
+  //virtual bool deleteTensor(torch::Tensor &t, int64_t k = 1);
+
+  /**
+   * @brief revise a tensor
+   * @param t the tensor to be revised
+   * @param w the revised value
+   * @return bool whether the revising is successful
+   */
+  //virtual bool reviseTensor(torch::Tensor &t, torch::Tensor &w);
+  /**
+   * @brief search the k-NN of a query tensor, return their indexes
+   * @param q the query tensor, multiple rows are allowed
+   * @param k the number of returned neighbors
+   * @return std::vector<faiss::idx_t> the indexes, following faiss's order
+   */
+  virtual std::vector<faiss::idx_t> searchIndex(torch::Tensor q, int64_t k);
+
+  /**
+   * @brief search the k-NN of a query tensor, return the result tensors
+   * @param q the query tensor, multiple rows are allowed
+   * @param k the number of returned neighbors
+   * @return std::vector<torch::Tensor> the result tensor for each row of the query
+   */
+  virtual std::vector<torch::Tensor> searchTensor_th(torch::Tensor &q, int64_t k);
+  /**
+   * @brief some extra termination if the index has HPC features
+   * @return bool whether the HPC termination is successful
+   */
+  // virtual bool endHPC();
+  /**
+   * @brief set the frozen level of online updating the internal state
+   * @param frozenLv the frozen level, 0 means freezing any online update of the internal state
+   * @return whether the setting is successful
+   */
+  // virtual bool setFrozenLevel(int64_t frozenLv);
+  /**
+   * @brief offline build phase
+   * @param t the tensor for offline build
+   * @param nameTag the name tag of the underlying index algorithm
+   * @note This is to generate some offline data structures, NOT to load offline tensors
+   * @note Please use @ref loadInitialTensor for loading initial tensors
+   * @return whether the building is successful
+   */
+  virtual bool offlineBuild(torch::Tensor &t, std::string nameTag);
+  /**
+   * @brief a busy waiting for all pending operations to be done
+   * @return bool, whether the waiting is actually done
+   */
+  //virtual bool waitPendingOperations();
+
+  /**
+   * @brief load the initial tensors of a database along with their string objects, use this BEFORE @ref insertTensor
+   * @note This is majorly an offline function, and may be different from @ref insertTensor for some indexes
+   * @param t the tensor, some indexes need it to be a single row
+   * @param strs the corresponding list of strings
+   * @return bool whether the loading is successful
+   */
+  //virtual bool loadInitialStringObject(torch::Tensor &t, std::vector<std::string> &strs);
+  /**
+   * @brief insert a string object
+   * @note This is majorly an online function
+   * @param t the tensor, some indexes need it to be a single row
+   * @param strs the corresponding list of strings
+   * @return bool whether the insertion is successful
+   */
+  // virtual bool insertStringObject(torch::Tensor &t, std::vector<std::string> &strs);
+
+  /**
+   * @brief delete a tensor along with its corresponding string object
+   * @note This is majorly an online function
+   * @param t the tensor, some indexes need it to be a single row
+   * @param k the number of nearest neighbors
+   * @return bool whether the deletion is successful
+   */
+  // virtual bool deleteStringObject(torch::Tensor &t, int64_t k = 1);
+
+  /**
+   * @brief search the k-NN of a query tensor, return the linked string objects
+   * @param q the query tensor, multiple rows are allowed
+   * @param k the number of returned neighbors
+   * @return std::vector<std::vector<std::string>> the result objects for each row of the query
+   */
+  //virtual std::vector<std::vector<std::string>> searchStringObject(torch::Tensor &q, int64_t k);
+
+};
+
+/**
+ * @ingroup CANDY_lib_bottom
+ * @typedef ThresholdIndexPtr
+ * @brief The class to describe a shared pointer to @ref ThresholdIndex
+ */
+typedef std::shared_ptr<class CANDY::ThresholdIndex> ThresholdIndexPtr;
+/**
+ * @ingroup CANDY_lib_bottom
+ * @def newThresholdIndex
+ * @brief (Macro) To create a new @ref ThresholdIndex shared pointer.
+ */
+#define newThresholdIndex std::make_shared<CANDY::ThresholdIndex>
+}
+/**
+ * @}
+ */
+
+#endif //CANDY_INCLUDE_CANDY_THRESHOLDINDEX_H_
diff --git a/src/CANDY/CMakeLists.txt b/src/CANDY/CMakeLists.txt
index 1c3163b10..6e3ff3b36 100644
--- a/src/CANDY/CMakeLists.txt
+++ b/src/CANDY/CMakeLists.txt
@@ -20,6 +20,7 @@ add_sources(
         NNDescentIndex.cpp
         FlannIndex.cpp
         DPGIndex.cpp
+        ThresholdIndex.cpp
         LSHAPGIndex.cpp
         FlatGPUIndex.cpp
)
diff --git a/src/CANDY/IndexTable.cpp b/src/CANDY/IndexTable.cpp
index 42552d331..b2f9d2c1f 100644
--- a/src/CANDY/IndexTable.cpp
+++ b/src/CANDY/IndexTable.cpp
@@ -23,8 +23,10 @@
 #include
 #include
 #include
+#include <CANDY/ThresholdIndex.h>
 #include
 #include
+
 #include
 #if CANDY_CL == 1
//#include
 #endif
@@ -57,8 +59,11 @@ CANDY::IndexTable::IndexTable() {
  indexMap["nnDescent"] = newNNDescentIndex();
  indexMap["Flann"] = newFlannIndex();
  indexMap["DPG"] = newDPGIndex();
+ indexMap["threshold"] = newThresholdIndex();
+
  indexMap["LSHAPG"] = newLSHAPGIndex();
  indexMap["flatGPU"] = newFlatGPUIndex();
+
#if CANDY_CL == 1
// indexMap["cl"] = newCLMMCPPAlgo();
#endif
diff --git a/src/CANDY/ThresholdIndex.cpp b/src/CANDY/ThresholdIndex.cpp
new file mode 100644
index 000000000..b74d63f39
--- /dev/null
+++ b/src/CANDY/ThresholdIndex.cpp
@@ -0,0 +1,235 @@
+/*! \file ThresholdIndex.cpp*/
+//
+// Created by tony on 25/05/23.
+//
+
+#include <CANDY/ThresholdIndex.h>
+#include <CANDY/IndexTable.h>
+#include <Utils/IntelliLog.h>
+#include <faiss/IndexFlat.h>
+#include <algorithm>
+#include <vector>
+
+void CANDY::ThresholdIndex::reset() {
+
+}
+
+bool CANDY::ThresholdIndex::offlineBuild(torch::Tensor &t, std::string nameTag) {
+  assert(t.size(1));
+  if (indices.empty() || dataVolume >= dataThreshold) {
+    createThresholdIndex(t.size(1), nameTag);
+  }
+  //auto index = new faiss::IndexFlatL2(t.size(1));
+  //index->add(t.size(0), t.data_ptr());
+  //indices.push_back(index);
+
+  indices.back()->insertTensor(t);
+  // count every row of t towards the volume of the current sub-index
+  dataVolume += t.size(0);
+
+  return true;
+}
+
+bool CANDY::ThresholdIndex::setConfig(INTELLI::ConfigMapPtr cfg) {
+  assert(cfg);
+  std::string metricType = cfg->tryString("metricType", "L2", true);
+  faissMetric = faiss::METRIC_L2;
+  if (metricType == "dot" || metricType == "IP" || metricType == "cossim") {
+    faissMetric = faiss::METRIC_INNER_PRODUCT;
+  }
+
+  dataThreshold = cfg->tryI64("dataThreshold", 100);
+  indexAlgorithm = cfg->tryString("indexAlgorithm", "HNSW");
+
+  return true;
+}
+
+bool CANDY::ThresholdIndex::setConfigClass(INTELLI::ConfigMap cfg) {
+  INTELLI::ConfigMapPtr cfgPtr = newConfigMap();
+  cfgPtr->loadFrom(cfg);
+  return setConfig(cfgPtr);
+}
+/*
+bool CANDY::ThresholdIndex::setFrozenLevel(int64_t frozenLv) {
+  assert(frozenLv >= 0);
+  return false;
+}*/
+bool CANDY::ThresholdIndex::insertTensor_th(torch::Tensor &t, std::string nameTag) {
+  assert(t.size(1));
+  if (indices.empty() || dataVolume >= dataThreshold) {
+    createThresholdIndex(t.size(1), nameTag);
+  }
+  indices.back()->insertTensor(t);
+
+  dataVolume += t.size(0);
+  return true;
+}
+
+void CANDY::ThresholdIndex::createThresholdIndex(int64_t dimension, std::string nameTag) {
+  //auto index = new faiss::IndexFlatL2(dimension);
+  //int M=32;
+  //auto index = AbstractIndexPtr::createIndex(nameTag);
+
+  IndexTable tab;
+  auto ru = tab.getIndex(nameTag);
+
+  if (ru == nullptr) {
+    INTELLI_ERROR("No index named " + nameTag + ", return flat");
+    nameTag = "flat";
+    ru = tab.getIndex(nameTag);
+  }
+  INTELLI::ConfigMapPtr cfg_new = newConfigMap();
+  cfg_new->edit("vecDim", dimension);
+ cfg_new->edit("M", (int64_t) 4); + ru->setConfig(cfg_new); + dataVolume=0; + indices.push_back(ru); + + return; +} +/* +bool CANDY::ThresholdIndex::insertStringObject(torch::Tensor &t, std::vector &strs) { + assert(t.size(1)); + assert (strs.size()); + return false; +} +*/ + +std::vector CANDY::ThresholdIndex::searchIndex(torch::Tensor q, int64_t k) { + assert(k > 0); + assert(q.size(1)); + std::vector ru(1); + return ru; +} + +float getDist(const torch::Tensor& a, const torch::Tensor& b) { + return torch::norm(a - b).item(); +} + +std::vector CANDY::ThresholdIndex::searchTensor_th(torch::Tensor &q, int64_t k) { + assert(k > 0); + assert(q.size(1)); + + //auto idx = searchIndex(q, k); + std::vector Indicesx ; + + for (int64_t i = 0; i < indices.size(); ++i) { + int64_t querySize = q.size(0); + auto inx = indices[i]->searchTensor(q,k); + //std::cout << "Index " << i << " returned " << inx.size() << " results" << endl; + for(int64_t j=0; j< k; j++) + {//std::cout << "Result tensor from index " << i << ": " << inx[j] << std::endl; + Indicesx.push_back(inx[0].slice(0, j, j + 1) ); + } + //Indicesx.insert(Indicesx.end(), inx.begin(), inx.end()) + +} + + if(Indicesx.size()>k) + { + std::vector> dist; + for (int64_t i = 0; i < Indicesx.size(); ++i) { + float distance = getDist(q, Indicesx[i]); + dist.push_back(std::make_pair(distance, i)); + } + + std::sort(dist.begin(), dist.end()); + std::vector topK; + + for (int64_t i = 0; i < k; ++i) { + topK.push_back(Indicesx[dist[i].second]); + } + return topK; + } + + return Indicesx; + +} + + /*std::vector> knn; + for (size_t i = 0; i < Indicesx.size(); ++i) { + knn.emplace_back(Distances[i], Indicesx[i]); + } + + std::sort(knn.begin(), knn.end(), [](const auto &a, const auto &b) { + return a.first < b.first; + }); + + std::vector topK; + for (int64_t i = 0; i < k && i < knn.size(); ++i) { + topK.push_back(torch::tensor(knn[i].second)); + } + */ + + +/* +torch::Tensor CANDY::ThresholdIndex::rawData() { + return torch::rand({1, 1}); +} + +bool CANDY::ThresholdIndex::startHPC() { + return false; +} + +bool CANDY::ThresholdIndex::endHPC() { + return false; +} +bool CANDY::ThresholdIndex::waitPendingOperations() { + return true; +} +std::tuple, std::vector>> CANDY::ThresholdIndex::searchTensorAndStringObject(torch::Tensor &q, int64_t k) { + auto ruT = searchTensor(q, k); + auto ruS = searchStringObject(q, k); + std::tuple, std::vector>> ru(ruT, ruS); + return ru; +} +bool CANDY::ThresholdIndex::loadInitialTensorAndQueryDistribution(torch::Tensor &t, torch::Tensor &query) { + assert(query.size(0) > 0); + return loadInitialTensor(t); +} + +std::vector> CANDY::ThresholdIndex::searchStringObject(torch::Tensor &q, int64_t k) { + assert(k > 0); + assert(q.size(1)); + std::vector> ru(1); + ru[0] = std::vector(1); + ru[0][0] = ""; + return ru; +} + +std::vector CANDY::ThresholdIndex::getTensorByIndex(std::vector &idx, int64_t k) { + assert(k > 0); + assert(idx.size()); + std::vector ru(1); + ru[0] = torch::rand({1, 1}); + return ru; +} + +bool CANDY::ThresholdIndex::loadInitialStringObject(torch::Tensor &t, std::vector &strs) { + return insertStringObject(t, strs); +} +bool CANDY::ThresholdIndex::deleteTensor(torch::Tensor &t, int64_t k) { + assert(t.size(1)); + assert(k > 0); + return false; +} + +bool CANDY::ThresholdIndex::deleteStringObject(torch::Tensor &t, int64_t k) { + assert(t.size(1)); + assert(k > 0); + return false; +} +bool CANDY::ThresholdIndex::reviseTensor(torch::Tensor &t, torch::Tensor &w) { + assert(t.size(1) == w.size(1)); + return false; +} 
+*/
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c1a393aff..2dfaf183b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -35,6 +35,9 @@ macro(add_catch_test_ray appName SOURCE_FILES SOURCE_LIBS)
             COMMAND $<TARGET_FILE:${appName}> --success
     )
 endmacro()
+
+
+
 add_catch_test(hnsw_test SystemTest/HNSWTest.cpp CANDYBENCH)
 add_catch_test(cpp_test SystemTest/SimpleTest.cpp CANDYBENCH)
 add_catch_test(flatIndex_test SystemTest/FlatIndexTest.cpp CANDYBENCH)
@@ -49,11 +52,13 @@ add_catch_test(yinYangSimple_test SystemTest/YinYangGraphSimpleIndexTest.cpp CANDYBENCH)
 add_catch_test(nnDescent_test SystemTest/NNDescentIndexTest.cpp CANDYBENCH)
 add_catch_test(kdTree_test SystemTest/KdTreeTest.cpp CANDYBENCH)
 add_catch_test(dpgIndex_test SystemTest/DPGIndexTest.cpp CANDYBENCH)
+add_catch_test(threshold_test SystemTest/ThresholdTest.cpp CANDYBENCH)
 add_catch_test(lshAPGIndex_test SystemTest/LSHAPGIndexTest.cpp CANDYBENCH)
 if (ENABLE_SPTAG)
     add_catch_test(sptagIndex_test SystemTest/SPTAGIndexTest.cpp CANDYBENCH)
 endif ()
+
 if (ENABLE_OPENCL)
     add_catch_test(cl_test SystemTest/CLTest.cpp CANDYBENCH)
 endif ()
diff --git a/test/SystemTest/ThresholdTest.cpp b/test/SystemTest/ThresholdTest.cpp
new file mode 100644
index 000000000..60d308055
--- /dev/null
+++ b/test/SystemTest/ThresholdTest.cpp
@@ -0,0 +1,55 @@
+//
+// Created by tony on 05/01/24.
+//
+#include <vector>
+
+#define CATCH_CONFIG_MAIN
+
+#include "catch.hpp"
+#include <CANDY/ThresholdIndex.h>
+#include <CANDY/IndexTable.h>
+#include <Utils/ConfigMap.hpp>
+#include <torch/torch.h>
+#include <iostream>
+using namespace std;
+using namespace INTELLI;
+using namespace torch;
+using namespace CANDY;
+TEST_CASE("Test threshold index", "[short]") {
+  torch::manual_seed(114514);
+  INTELLI::ConfigMapPtr cfg = newConfigMap();
+  CANDY::ThresholdIndex thresholdIndex;
+
+  cfg->edit("metricType", "L2");
+  cfg->edit("dataThreshold", (int64_t) 60);
+
+  thresholdIndex.setConfig(cfg);
+
+  auto ta = torch::rand({1, 3});
+  thresholdIndex.insertTensor_th(ta, "HNSWNaive");
+  auto tb = torch::rand({1, 3});
+  thresholdIndex.insertTensor_th(tb, "HNSWNaive");
+  auto tc = torch::rand({1, 3});
+  thresholdIndex.insertTensor_th(tc, "HNSWNaive");
+
+  for (int i = 0; i < 100; i++) {
+    auto x_in = torch::rand({1, 3});
+    thresholdIndex.insertTensor_th(x_in, "HNSWNaive");
+  }
+
+  thresholdIndex.insertTensor_th(ta, "HNSWNaive");
+  thresholdIndex.insertTensor_th(tb, "HNSWNaive");
+  thresholdIndex.insertTensor_th(tb, "HNSWNaive");
+
+  std::cout << "Insertion finished" << std::endl;
+
+  // Search for 'ta'
+  std::cout << "1. Now, do the query\n" << ta << std::endl;
+  auto ruTensors = thresholdIndex.searchTensor_th(ta, 2);
+  std::cout << "Get tensor\n";
+
+  for (const auto &tensor : ruTensors) {
+    std::cout << tensor << std::endl;
+  }
+  std::cout << "Total results: " << ruTensors.size() << std::endl;
+}
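Usage sketch: the snippet below illustrates how the threshold-sharded index introduced by this patch is expected to be driven, mirroring the flow of ThresholdTest.cpp above. It is a minimal illustration, not part of the patch; the include paths, the "HNSWNaive" name tag, and the chosen dataThreshold are assumptions carried over from the test.

#include <CANDY/ThresholdIndex.h>
#include <Utils/ConfigMap.hpp>
#include <torch/torch.h>
#include <iostream>

int main() {
  CANDY::ThresholdIndex idx;
  INTELLI::ConfigMapPtr cfg = newConfigMap();
  cfg->edit("metricType", "L2");
  cfg->edit("dataThreshold", (int64_t) 60);  // each sub-index absorbs at most 60 rows
  idx.setConfig(cfg);

  // Stream rows in; a fresh "HNSWNaive" sub-index is opened whenever the threshold is reached.
  for (int i = 0; i < 200; i++) {
    auto row = torch::rand({1, 3});
    idx.insertTensor_th(row, "HNSWNaive");
  }

  // Query all sub-indexes; candidates are merged and re-ranked by distance to the query.
  auto q = torch::rand({1, 3});
  auto res = idx.searchTensor_th(q, 2);
  for (const auto &t : res) {
    std::cout << t << std::endl;
  }
  return 0;
}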