From f64bdad6ff525b9464f43a8ac1bbb57bd686d4fd Mon Sep 17 00:00:00 2001 From: kushaljain-apra Date: Thu, 11 Apr 2024 07:44:49 +0530 Subject: [PATCH 01/13] add llama vcpkg port --- base/CMakeLists.txt | 3 +- base/fix-vcpkg-json.ps1 | 4 ++ base/fix-vcpkg-json.sh | 4 ++ base/vcpkg.json | 9 +++- .../custom-overlay/llama/portfile.cmake | 46 +++++++++++++++++++ thirdparty/custom-overlay/llama/usage | 4 ++ thirdparty/custom-overlay/llama/vcpkg.json | 28 +++++++++++ 7 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 thirdparty/custom-overlay/llama/portfile.cmake create mode 100644 thirdparty/custom-overlay/llama/usage create mode 100644 thirdparty/custom-overlay/llama/vcpkg.json diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 9f2cd1470..af75e41b3 100755 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -52,6 +52,7 @@ find_package(ZXing CONFIG REQUIRED) find_package(bigint CONFIG REQUIRED) find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED) find_package(whisper CONFIG REQUIRED) +find_package(llama CONFIG REQUIRED) IF(ENABLE_CUDA) if((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL "")) @@ -617,4 +618,4 @@ IF(ENABLE_WINDOWS) IF(GHA) file(COPY ${RUNTIME_DLLS} DESTINATION RelWithDebInfo/) ENDIF(GHA) -ENDIF(ENABLE_WINDOWS) +ENDIF(ENABLE_WINDOWS) \ No newline at end of file diff --git a/base/fix-vcpkg-json.ps1 b/base/fix-vcpkg-json.ps1 index 634ef9c83..e34f2d38e 100644 --- a/base/fix-vcpkg-json.ps1 +++ b/base/fix-vcpkg-json.ps1 @@ -12,6 +12,10 @@ if ($removeCUDA.IsPresent) $v.dependencies | Where-Object { $_.name -eq 'whisper' } | ForEach-Object { $_.features = $_.features -ne 'cuda' } + + $v.dependencies | + Where-Object { $_.name -eq 'llama' } | + ForEach-Object { $_.features = $_.features -ne 'cuda' } } if($removeOpenCV.IsPresent) diff --git a/base/fix-vcpkg-json.sh b/base/fix-vcpkg-json.sh index 74bfc0aa3..380da8912 100644 --- a/base/fix-vcpkg-json.sh +++ b/base/fix-vcpkg-json.sh @@ -25,6 
+25,10 @@ if $removeCUDA; then # Remove "cuda" features for this "whisper" instance v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\"))") fi + if [ "$name" == "llama" ]; then + # Remove "cuda" features for this "llama" instance + v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\"))") + fi done fi diff --git a/base/vcpkg.json b/base/vcpkg.json index 4df4664c0..94ead0353 100644 --- a/base/vcpkg.json +++ b/base/vcpkg.json @@ -2,7 +2,7 @@ "$schema": "https://raw.githubusercontent.com/microsoft/vcpkg/master/scripts/vcpkg.schema.json", "name": "apra-pipes-cuda", "version": "0.0.1", - "builtin-baseline": "eac79fc7bda260819c646d10c97dec825305aecd", + "builtin-baseline": "16c5b44823b085e7b3b30b393ea8081a65a93256", "dependencies": [ { "name": "whisper", @@ -11,6 +11,13 @@ "cuda" ] }, + { + "name": "llama", + "default-features": false, + "features": [ + "cuda" + ] + }, { "name": "opencv4", "default-features": false, diff --git a/thirdparty/custom-overlay/llama/portfile.cmake b/thirdparty/custom-overlay/llama/portfile.cmake new file mode 100644 index 000000000..bab45f3b5 --- /dev/null +++ b/thirdparty/custom-overlay/llama/portfile.cmake @@ -0,0 +1,46 @@ +vcpkg_check_linkage(ONLY_STATIC_LIBRARY) + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO Apra-Labs/llama.cpp + REF e5bd6e1abb146b38649236429c22ed6b4db0f3da + SHA512 f36a0731e7b5044b1d75297fdd806cf19206a439bc9996bba1ee36b0b2e692e4482d5fac9b7dcd111c7d69bbd900b99ed38b301c572c450a48ad6fd484b3322f + HEAD_REF kj/vcpkg-port +) + +vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS + FEATURES + "cuda" LLAMA_CUBLAS +) + +set(LLAMA_CUBLAS OFF) +if("cuda" IN_LIST FEATURES) + set(LLAMA_CUBLAS ON) +endif() + + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + ${FEATURE_OPTIONS} + -DLLAMA_CUBLAS=${LLAMA_CUBLAS} + DISABLE_PARALLEL_CONFIGURE +) + +vcpkg_cmake_install() +vcpkg_cmake_config_fixup( + CONFIG_PATH lib/cmake/llama + PACKAGE_NAME llama + )
+vcpkg_copy_pdbs() + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") + +file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright) +configure_file("${CMAKE_CURRENT_LIST_DIR}/usage" "${CURRENT_PACKAGES_DIR}/share/${PORT}/usage" COPYONLY) + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share") + +if(VCPKG_LIBRARY_LINKAGE STREQUAL "static") + file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/bin" "${CURRENT_PACKAGES_DIR}/debug/bin") +endif() \ No newline at end of file diff --git a/thirdparty/custom-overlay/llama/usage b/thirdparty/custom-overlay/llama/usage new file mode 100644 index 000000000..bfd8500ea --- /dev/null +++ b/thirdparty/custom-overlay/llama/usage @@ -0,0 +1,4 @@ +llama provides CMake targets: + +find_package(llama CONFIG REQUIRED) +target_link_libraries(main PRIVATE common_llama llama llavalib) \ No newline at end of file diff --git a/thirdparty/custom-overlay/llama/vcpkg.json b/thirdparty/custom-overlay/llama/vcpkg.json new file mode 100644 index 000000000..721a1dda3 --- /dev/null +++ b/thirdparty/custom-overlay/llama/vcpkg.json @@ -0,0 +1,28 @@ +{ + "name": "llama", + "version-string": "b1708", + "homepage": "https://github.com/Apra-Labs/llama.cpp", + "description": "Fork of llama.cpp", + "license": "MIT", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ], + "default-features": [ + "default-features" + ], + "features": { + "cuda": { + "description": "Build Llama with CUDA support", + "dependencies": [ + "cuda" + ] + } + } +} \ No newline at end of file From 0f9f19db891df00bc500a1ac5c872f5974fde369 Mon Sep 17 00:00:00 2001 From: kushaljain-apra Date: Thu, 11 Apr 2024 07:44:59 +0530 Subject: [PATCH 02/13] update whisper portfile to support cuda --- thirdparty/custom-overlay/whisper/portfile.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/thirdparty/custom-overlay/whisper/portfile.cmake 
b/thirdparty/custom-overlay/whisper/portfile.cmake index 1ff52d319..7416ac3b7 100644 --- a/thirdparty/custom-overlay/whisper/portfile.cmake +++ b/thirdparty/custom-overlay/whisper/portfile.cmake @@ -22,6 +22,9 @@ endif() vcpkg_cmake_configure( SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + ${FEATURE_OPTIONS} + -DWHISPER_CUBLAS=${WHISPER_CUBLAS} DISABLE_PARALLEL_CONFIGURE ) From c864c13caa946fd8e9133e54d10e67dbea2698d0 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:13:35 +0530 Subject: [PATCH 03/13] add llm model abstract class --- base/include/LlmModelAbstract.h | 91 +++++++++++++++++++++++++++++++++ base/src/LlmModelAbstract.cpp | 33 ++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 base/include/LlmModelAbstract.h create mode 100644 base/src/LlmModelAbstract.cpp diff --git a/base/include/LlmModelAbstract.h b/base/include/LlmModelAbstract.h new file mode 100644 index 000000000..c803e413e --- /dev/null +++ b/base/include/LlmModelAbstract.h @@ -0,0 +1,91 @@ +#pragma once + +#include "Module.h" + +class LlmModelAbstractProps { +public: + enum ModelArchitectureType { + TRANSFORMER = 0, + ENCODERDECODER, + CASUALDECODER, + PREFIXDECODER + }; + + enum DataType { TEXT = 0, IMAGE, AUDIO, TEXT_EMBEDDING, IMAGE_EMBEDDING, AUDIO_EMBEDDING }; + + enum UseCase { TEXT_TO_TEXT = 0, SCENE_DESCRIPTOR, OCR }; + + LlmModelAbstractProps() { + modelArchitecture = ModelArchitectureType::TRANSFORMER; + inputTypes = {DataType::TEXT}; + outputTypes = {DataType::TEXT}; + useCases = {UseCase::TEXT_TO_TEXT}; + qlen = 20; + } + + LlmModelAbstractProps(ModelArchitectureType _modelArchitecture, + std::vector _inputTypes, + std::vector _outputTypes, + std::vector _useCases) { + modelArchitecture = _modelArchitecture; + inputTypes = _inputTypes; + outputTypes = _outputTypes; + useCases = _useCases; + qlen = 20; + } + + size_t getSerializeSize() { + return sizeof(modelArchitecture) + sizeof(inputTypes) + sizeof(outputTypes) + sizeof(useCases) + sizeof(qlen); + } + + 
ModelArchitectureType modelArchitecture; + std::vector inputTypes; + std::vector outputTypes; + std::vector useCases; + size_t qlen; + +private: + friend class boost::serialization::access; + + template + void serialize(Archive &ar, const unsigned int version) { + ar &boost::serialization::base_object(*this); + ar & modelArchitecture; + ar & inputTypes; + ar & outputTypes; + ar & useCases; + ar & qlen; + } +}; + +class LlmModelAbstract { +public: + LlmModelAbstract(std::string name, LlmModelAbstractProps props); + ~LlmModelAbstract(); + + std::string getMyName() { + return myName; + } + + boost::shared_ptr getQue() { + return mQue; + } + + virtual bool modelInit() = 0; + virtual bool modelTerm() = 0; + virtual bool modelInference(frame_container& frameContainer) {return false;} + virtual size_t getFrameSize() = 0; + virtual void getFrames(frame_sp& frame) = 0; + + virtual bool validateUseCase(LlmModelAbstractProps::UseCase useCase) = 0; + + bool init(); + bool term(); + bool step(); + bool push(frame_container& frameContainer); + +private: + std::string myName; + boost::shared_ptr mQue; + boost::shared_ptr mProps; +}; \ No newline at end of file diff --git a/base/src/LlmModelAbstract.cpp b/base/src/LlmModelAbstract.cpp new file mode 100644 index 000000000..058520048 --- /dev/null +++ b/base/src/LlmModelAbstract.cpp @@ -0,0 +1,33 @@ +#include "LlmModelAbstract.h" +#include "FrameContainerQueue.h" + +LlmModelAbstract::LlmModelAbstract(std::string name, LlmModelAbstractProps _props) : myName(name) { + mQue.reset(new FrameContainerQueue(_props.qlen)); + mProps.reset(new LlmModelAbstractProps(_props)); +} + +LlmModelAbstract::~LlmModelAbstract() {} + +bool LlmModelAbstract::init() { + mQue->accept(); + return true; +} + +bool LlmModelAbstract::term() { + mQue->clear(); + return true; +} + +bool LlmModelAbstract::step() { + auto frames = mQue->pop(); + if (frames.size() == 0) { + return true; + } + bool ret = modelInference(frames); + return ret; +} + +bool 
LlmModelAbstract::push(frame_container& frameContainer) { + mQue->push(frameContainer); + return true; +} \ No newline at end of file From 5524ad9c439925b14e2ee82c3e3932d8a5d6cbe8 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:14:00 +0530 Subject: [PATCH 04/13] add encoder model abstract class --- base/include/EncoderModelAbstract.h | 91 +++++++++++++++++++++++++++++ base/src/EncoderModelAbstract.cpp | 34 +++++++++++ 2 files changed, 125 insertions(+) create mode 100644 base/include/EncoderModelAbstract.h create mode 100644 base/src/EncoderModelAbstract.cpp diff --git a/base/include/EncoderModelAbstract.h b/base/include/EncoderModelAbstract.h new file mode 100644 index 000000000..6f2afe030 --- /dev/null +++ b/base/include/EncoderModelAbstract.h @@ -0,0 +1,91 @@ +#pragma once + +#include "Module.h" + +class EncoderModelAbstractProps { +public: + enum ModelArchitectureType { + BERT= 0, // Vision Transformer + VIT, // Bidirectional Encoder Representations from Transformer + AST, // Audio Spectrogram Transformer + VIVIT // Video Vision Transformer + }; + + enum DataType { TEXT = 0, IMAGE, AUDIO, TEXT_EMBEDDING, IMAGE_EMBEDDING, AUDIO_EMBEDDING }; + + enum UseCase { TEXT_TO_TEXT = 0, SCENE_DESCRIPTOR, OCR }; + + EncoderModelAbstractProps() { + modelArchitecture = ModelArchitectureType::BERT; + inputTypes = {DataType::TEXT}; + outputTypes = {DataType::TEXT_EMBEDDING}; + useCases = {UseCase::TEXT_TO_TEXT}; + qlen = 20; + } + + EncoderModelAbstractProps(ModelArchitectureType _modelArchitecture, + std::vector _inputTypes, + std::vector _outputTypes, + std::vector _useCases) { + modelArchitecture = _modelArchitecture; + inputTypes = _inputTypes; + outputTypes = _outputTypes; + useCases = _useCases; + qlen = 20; + } + + size_t getSerializeSize() { + return sizeof(modelArchitecture) + sizeof(inputTypes) + sizeof(outputTypes) + sizeof(useCases) + sizeof(qlen); + } + + ModelArchitectureType modelArchitecture; + std::vector inputTypes; + std::vector 
outputTypes; + std::vector useCases; + size_t qlen; + +private: + friend class boost::serialization::access; + + template + void serialize(Archive &ar, const unsigned int version) { + ar &boost::serialization::base_object(*this); + ar & modelArchitecture; + ar & inputTypes; + ar & outputTypes; + ar & useCases; + ar & qlen; + } +}; + +class EncoderModelAbstract { +public: + EncoderModelAbstract(std::string name, EncoderModelAbstractProps props); + ~EncoderModelAbstract(); + + std::string getMyName() { + return myName; + } + + boost::shared_ptr getQue() { + return mQue; + } + + virtual bool modelInit() = 0; + virtual bool modelTerm() = 0; + virtual bool modelInference(frame_container& frameContainer) {return false;} + virtual size_t getFrameSize() = 0; + virtual void getFrames(frame_sp& frame) = 0; + + virtual bool validateUseCase(EncoderModelAbstractProps::UseCase useCase) = 0; + + bool init(); + bool term(); + bool step(); + bool push(frame_container& frameContainer); + +private: + std::string myName; + boost::shared_ptr mQue; + boost::shared_ptr mProps; +}; \ No newline at end of file diff --git a/base/src/EncoderModelAbstract.cpp b/base/src/EncoderModelAbstract.cpp new file mode 100644 index 000000000..a57bc5ada --- /dev/null +++ b/base/src/EncoderModelAbstract.cpp @@ -0,0 +1,34 @@ +#include "EncoderModelAbstract.h" +#include "FrameContainerQueue.h" +#include "Logger.h" + +EncoderModelAbstract::EncoderModelAbstract(std::string name, EncoderModelAbstractProps _props) : myName(name) { + mQue.reset(new FrameContainerQueue(_props.qlen)); + mProps.reset(new EncoderModelAbstractProps(_props)); +} + +EncoderModelAbstract::~EncoderModelAbstract() { } + +bool EncoderModelAbstract::init() { + mQue->accept(); + return true; +} + +bool EncoderModelAbstract::term() { + mQue->clear(); + return true; +} + +bool EncoderModelAbstract::step() { + auto frames = mQue->pop(); + if (frames.size() == 0) { + return true; + } + bool ret = modelInference(frames); + return ret; +} + +bool 
EncoderModelAbstract::push(frame_container& frameContainer) { + mQue->push(frameContainer); + return true; +} \ No newline at end of file From 4365203889905b035b7fbd6a18b58c28db9d18e7 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:14:29 +0530 Subject: [PATCH 05/13] add Llava Model class --- base/include/Llava.h | 65 +++++++++++++++ base/src/Llava.cpp | 189 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+) create mode 100644 base/include/Llava.h create mode 100644 base/src/Llava.cpp diff --git a/base/include/Llava.h b/base/include/Llava.h new file mode 100644 index 000000000..fae64d377 --- /dev/null +++ b/base/include/Llava.h @@ -0,0 +1,65 @@ +#pragma once + +#include "LlmModelAbstract.h" + +class LlavaProps : public LlmModelAbstractProps { +public: + LlavaProps(std::string _modelPath, std::string _prompt, + int _contextSize, int _batchSize, float _degreeOfRandomness, int _gpuLayers, int _predictionLength) { + + /* Set LLM Model Base Class Properties for each model*/ + modelArchitecture = ModelArchitectureType::TRANSFORMER; + inputTypes = {DataType::TEXT, DataType::IMAGE_EMBEDDING}; + outputTypes = {DataType::TEXT}; + useCases = {UseCase::TEXT_TO_TEXT, UseCase::OCR, UseCase::SCENE_DESCRIPTOR}; + + /*Unique Model Properties*/ + modelPath = _modelPath; + prompt = _prompt; + degreeOfRandomness = _degreeOfRandomness; + contextSize = _contextSize; + batchSize = _batchSize; + gpuLayers = _gpuLayers; + predictionLength = _predictionLength; + } + + std::string modelPath; + std::string prompt; + int contextSize; + int batchSize; + float degreeOfRandomness; + int gpuLayers; + int predictionLength; + + size_t getSerializeSize() { + return LlmModelAbstractProps::getSerializeSize() + sizeof(modelPath) + + sizeof(prompt) + sizeof(float) + 4 * sizeof(int); + } + +private: + friend class boost::serialization::access; + + template + void serialize(Archive &ar, const unsigned int version) { + ar 
&boost::serialization::base_object(*this); + ar & modelPath & prompt; + ar & degreeOfRandomness; + ar & contextSize & batchSize & gpuLayers & predictionLength; + } +}; + +class Llava : public LlmModelAbstract { +public: + Llava(LlavaProps _props); + virtual ~Llava(); + bool modelInit() override; + bool modelTerm() override; + bool modelInference(frame_container& frames) override; + bool validateUseCase(LlmModelAbstractProps::UseCase useCase) override; + size_t getFrameSize() override; + void getFrames(frame_sp& frame) override; + +private: + class Detail; + boost::shared_ptr mDetail; +}; \ No newline at end of file diff --git a/base/src/Llava.cpp b/base/src/Llava.cpp new file mode 100644 index 000000000..77e5494e8 --- /dev/null +++ b/base/src/Llava.cpp @@ -0,0 +1,189 @@ +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "Utils.h" + +#include "llama/common.h" +#include "llama/llama.h" +#include "llama/llava.h" +#include "Llava.h" + +class Llava::Detail { +public: + Detail(LlavaProps &_props) : mProps(_props) { + setContextSize(_props); + setBatchSize(_props); + setDegreeOfRandomness(_props); + setGpuLayers(_props); + } + + ~Detail() { + + } + + void setProps(LlavaProps &_props) { + mProps = _props; + updateProps(_props); + } + + void updateProps(LlavaProps &_props) { + setContextSize(_props); + setBatchSize(_props); + setDegreeOfRandomness(_props); + setGpuLayers(_props); + } + + void setContextSize(LlavaProps &_props) { + mLlavaContextParams.n_ctx = _props.contextSize; + } + + void setBatchSize(LlavaProps &_props) { + mLlavaContextParams.n_batch = _props.batchSize; + } + + void setDegreeOfRandomness(LlavaProps &_props) { + mLlavaSamplingParams.temp = _props.degreeOfRandomness; + } + + void setGpuLayers(LlavaProps &_props) { + mLlavaModelParams.n_gpu_layers = _props.gpuLayers; + } + + void compute(llama_context * llamaContext, std::vector tokens, int nBatch, int * nPast) { + int N = (int) tokens.size(); 
+ for (int i = 0; i < N; i += nBatch) { + int nEval = (int) tokens.size() - i; + if (nEval > nBatch) { + nEval = nBatch; + } + if (llama_decode(llamaContext, llama_batch_get_one(&tokens[i], nEval, *nPast, 0))) { + LOG_ERROR << "LLAMA DECODE ERROR"; + break; + } + *nPast += nEval; + } + } + +public: + LlavaProps mProps; + llama_model *mLlavaModel; + llama_context *mLlavaContext = NULL; + llama_model_params mLlavaModelParams; + llama_context_params mLlavaContextParams; + llama_sampling_params mLlavaSamplingParams; + std::string storedData; +}; + +Llava::Llava(LlavaProps _props) : LlmModelAbstract("Llava", _props) { + mDetail.reset(new Detail(_props)); +} + +Llava::~Llava() {} + +bool Llava::validateUseCase(LlavaProps::UseCase useCase) { + for(auto validUseCase: mDetail->mProps.useCases) { + if(validUseCase == useCase) { + return true; + } + } + return false; +} + +bool Llava::modelInit() { + llama_backend_init(false /*NUMA Architecure set to false*/); + + mDetail->mLlavaModelParams = llama_model_default_params(); + mDetail->mLlavaContextParams = llama_context_default_params(); + mDetail->updateProps(mDetail->mProps); + + mDetail->mLlavaModel = llama_load_model_from_file( + mDetail->mProps.modelPath.c_str(), mDetail->mLlavaModelParams); + mDetail->mLlavaContext = llama_new_context_with_model( + mDetail->mLlavaModel, mDetail->mLlavaContextParams); + return LlmModelAbstract::init(); +} + +bool Llava::modelTerm() { + llama_free(mDetail->mLlavaContext); + llama_free_model(mDetail->mLlavaModel); + llama_backend_free(); + return LlmModelAbstract::term(); +} + +bool Llava::modelInference(frame_container& frames) { + /*Parameter Declaration*/ + auto frame = frames.begin()->second; + auto frameType = frame->getMetadata()->getFrameType(); + int nPast = 0; + std::string systemPrompt = "A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; + std::string userPrompt = mDetail->mProps.prompt; + const bool add_bos = llama_should_add_bos_token(llama_get_model(mDetail->mLlavaContext)); + int nPredict = mDetail->mProps.predictionLength; + int nBatch = mDetail->mProps.batchSize; + + /*System Prompt Tokenization*/ + std::vector systemPromptTokens = ::llama_tokenize(mDetail->mLlavaContext, systemPrompt, add_bos); + mDetail->compute(mDetail->mLlavaContext, systemPromptTokens, nBatch, &nPast); + + if(frameType == FrameMetadata::FrameType::IMAGE_EMBEDDING){ + /*Image Embed Tokenization*/ + auto imageEmbed = static_cast(frame->data()); + llava_eval_image_embed(mDetail->mLlavaContext, imageEmbed, nBatch, &nPast); + } + else if(frameType == FrameMetadata::FrameType::TEXT){ + /*Text Embed Tokenization*/ + auto textEmbed = static_cast(frame->data()); + std::string textEmbedPrompt(textEmbed); + std::vector textEmbedTokens = ::llama_tokenize(mDetail->mLlavaContext, textEmbedPrompt, false); + mDetail->compute(mDetail->mLlavaContext, textEmbedTokens, nBatch, &nPast); + } + + /*User Prompt Tokenization*/ + std::vector userPromptTokens = ::llama_tokenize(mDetail->mLlavaContext, (userPrompt + "\nASSISTANT:").c_str(), false); + mDetail->compute(mDetail->mLlavaContext, userPromptTokens, nBatch, &nPast); + + std::string output = ""; + + std::cout << "\n"; + + /*Prediction token by token*/ + for(int i = 0; i < nPredict; i++) { + llama_token id = 0; + auto logits = llama_get_logits(mDetail->mLlavaContext); + auto n_vocab = llama_n_vocab(llama_get_model(mDetail->mLlavaContext)); + + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + id = llama_sample_token_greedy(mDetail->mLlavaContext, 
&candidates_p); + + if (id == llama_token_eos(llama_get_model(mDetail->mLlavaContext))) { + break; + } + + std::string ret = llama_token_to_piece(mDetail->mLlavaContext, id); + output += ret; + + std::cout << ret; + + std::vector tokens; + tokens.push_back(id); + mDetail->compute(mDetail->mLlavaContext, tokens, 1, &nPast); + } + + mDetail->storedData = output; + return true; +} + +size_t Llava::getFrameSize() { + return (mDetail->storedData.length() + 1); /* Add 1 more byte for /0 for conversion from std::string to char* */ +} + +void Llava::getFrames(frame_sp& frame) { + memcpy(frame->data(), mDetail->storedData.c_str(), frame->size()); +} \ No newline at end of file From 7b00c687765118949599494083d84dc6f3803f11 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:14:46 +0530 Subject: [PATCH 06/13] add Clip Encoder Class --- base/include/ClipEncoder.h | 48 ++++++++++++++++++++++++ base/src/ClipEncoder.cpp | 76 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 base/include/ClipEncoder.h create mode 100644 base/src/ClipEncoder.cpp diff --git a/base/include/ClipEncoder.h b/base/include/ClipEncoder.h new file mode 100644 index 000000000..c11653ddb --- /dev/null +++ b/base/include/ClipEncoder.h @@ -0,0 +1,48 @@ +#pragma once + +#include "EncoderModelAbstract.h" + +class ClipEncoderProps : public EncoderModelAbstractProps { +public: + ClipEncoderProps(std::string _modelPath) { + /* Set LLM Model Base Class Properties for each model*/ + modelArchitecture = ModelArchitectureType::VIT; + inputTypes = {DataType::TEXT, DataType::IMAGE}; + outputTypes = {DataType::IMAGE_EMBEDDING}; + useCases = {UseCase::OCR, UseCase::SCENE_DESCRIPTOR}; + + /*Unique Model Properties*/ + modelPath = _modelPath; + } + + std::string modelPath; + + size_t getSerializeSize() { + return EncoderModelAbstractProps::getSerializeSize() + sizeof(modelPath); + } + +private: + friend class boost::serialization::access; + + template + void 
serialize(Archive &ar, const unsigned int version) { + ar &boost::serialization::base_object(*this); + ar & modelPath; + } +}; + +class ClipEncoder : public EncoderModelAbstract { +public: + ClipEncoder(ClipEncoderProps _props); + virtual ~ClipEncoder(); + bool modelInit() override; + bool modelTerm() override; + bool modelInference(frame_container& frames) override; + bool validateUseCase(EncoderModelAbstractProps::UseCase useCase) override; + size_t getFrameSize() override; + void getFrames(frame_sp& frame) override; + +private: + class Detail; + boost::shared_ptr mDetail; +}; \ No newline at end of file diff --git a/base/src/ClipEncoder.cpp b/base/src/ClipEncoder.cpp new file mode 100644 index 000000000..054014940 --- /dev/null +++ b/base/src/ClipEncoder.cpp @@ -0,0 +1,76 @@ +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "Utils.h" + +#include "llama/common.h" +#include "llama/llama.h" +#include "llama/clip.h" +#include "llama/llava.h" +#include "ClipEncoder.h" + +class ClipEncoder::Detail { +public: + Detail(ClipEncoderProps &_props) : mProps(_props) { + + } + + ~Detail() { } + + void setProps(ClipEncoderProps &_props) { + mProps = _props; + } + +public: + ClipEncoderProps mProps; + clip_ctx * mClipContext = NULL; + llava_image_embed * storedData; +}; + +ClipEncoder::ClipEncoder(ClipEncoderProps _props) : EncoderModelAbstract("ClipEncoder", _props) { + mDetail.reset(new Detail(_props)); +} + +ClipEncoder::~ClipEncoder() {} + +bool ClipEncoder::validateUseCase(ClipEncoderProps::UseCase useCase) { + for(auto validUseCase: mDetail->mProps.useCases) { + if(validUseCase == useCase) { + return true; + } + } + return false; +} + +bool ClipEncoder::modelInit() { + mDetail->mClipContext = clip_model_load(mDetail->mProps.modelPath.c_str(), 1); + return EncoderModelAbstract::init(); +} + +bool ClipEncoder::modelTerm() { + if(mDetail->mClipContext){ + clip_free(mDetail->mClipContext); + mDetail->mClipContext 
= NULL; + } + return EncoderModelAbstract::term(); +} + +bool ClipEncoder::modelInference(frame_container& frames) { + auto frame = frames.begin()->second; + auto imageBytes = static_cast(frame->data()); + auto imageBytesLength = frame->size(); + mDetail->storedData = llava_image_embed_make_with_bytes(mDetail->mClipContext, 8, imageBytes, imageBytesLength); + return true; +} + +size_t ClipEncoder::getFrameSize() { + return (clip_embd_nbytes(mDetail->mClipContext) + sizeof(llava_image_embed)); +} + +void ClipEncoder::getFrames(frame_sp& frame) { + memcpy(frame->data(), mDetail->storedData, sizeof(llava_image_embed)); + unsigned char *char_buffer = (unsigned char *)frame->data(); + char_buffer += sizeof(llava_image_embed); + memcpy(char_buffer, mDetail->storedData->embed, clip_embd_nbytes(mDetail->mClipContext)); +} \ No newline at end of file From bee2754ef295040de735578ba9f515ab0bfad723 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:15:11 +0530 Subject: [PATCH 07/13] add Model Strategy Class --- base/include/ModelStrategy.h | 43 +++++++++++++++++++++++ base/src/ModelStrategy.cpp | 66 ++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 base/include/ModelStrategy.h create mode 100644 base/src/ModelStrategy.cpp diff --git a/base/include/ModelStrategy.h b/base/include/ModelStrategy.h new file mode 100644 index 000000000..e31a7f1cd --- /dev/null +++ b/base/include/ModelStrategy.h @@ -0,0 +1,43 @@ +# pragma once + +#include "Module.h" +#include "ClipEncoder.h" +#include "Llava.h" + +class ModelStrategy { +public: + enum ModelStrategyType { + LLAMA_TEXT_TO_TEXT = 0, + LLAVA_TEXT_TO_TEXT, + LLAVA_SCENE_DESCRIPTOR + }; + + static boost::shared_ptr create(ModelStrategyType type); + + ModelStrategy(); + virtual ~ModelStrategy(); + + virtual bool initStrategy() = 0; + virtual bool termStrategy() = 0; +public: + boost::shared_ptr encoderModel; + boost::shared_ptr llmModel; +}; + +class SceneDescriptorModelStrategy : public ModelStrategy { +public:
+ SceneDescriptorModelStrategy(); + ~SceneDescriptorModelStrategy(); + + bool initStrategy() override; + bool termStrategy() override; +}; + +class LlavaTextToTextModelStrategy : public ModelStrategy { +public: + LlavaTextToTextModelStrategy(); + ~LlavaTextToTextModelStrategy(); + + bool initStrategy() override; + bool termStrategy() override; +}; \ No newline at end of file diff --git a/base/src/ModelStrategy.cpp b/base/src/ModelStrategy.cpp new file mode 100644 index 000000000..592cc6552 --- /dev/null +++ b/base/src/ModelStrategy.cpp @@ -0,0 +1,66 @@ +#include "ModelStrategy.h" + +ModelStrategy::ModelStrategy() { + +} + +ModelStrategy::~ModelStrategy() { + +} + +boost::shared_ptr ModelStrategy::create(ModelStrategyType type) { + switch (type) { + case ModelStrategyType::LLAVA_SCENE_DESCRIPTOR: + return boost::make_shared(); + case ModelStrategyType::LLAVA_TEXT_TO_TEXT: + return boost::make_shared(); + default: + return boost::make_shared(); + break; + } +} + +/*LLAVA SCENE-DESCRIPTOR STRATEGY*/ +SceneDescriptorModelStrategy::SceneDescriptorModelStrategy() : ModelStrategy() { + auto clipProps = ClipEncoderProps("./data/llm/llava/llava-v1.6-7b/mmproj-model-f16.gguf"); + auto llavaProps = LlavaProps("./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", "Describe the image", 2048, 512, 0.8, 10, 256); + + encoderModel = boost::shared_ptr(new ClipEncoder(clipProps)); + llmModel = boost::shared_ptr(new Llava(llavaProps)); +} + +SceneDescriptorModelStrategy::~SceneDescriptorModelStrategy() { + +} + +bool SceneDescriptorModelStrategy::initStrategy() { + encoderModel->modelInit(); + llmModel->modelInit(); + return true; +} + +bool SceneDescriptorModelStrategy::termStrategy() { + encoderModel->modelTerm(); + llmModel->modelTerm(); + return true; +} + +/*LLAVE TEXT-TO-TEXT STRATEGY*/ +LlavaTextToTextModelStrategy::LlavaTextToTextModelStrategy() : ModelStrategy() { + auto llavaProps = LlavaProps("./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", 
"Tell me a story", 2048, 512, 0.8, 10, 256); + llmModel = boost::shared_ptr(new Llava(llavaProps)); +} + +LlavaTextToTextModelStrategy::~LlavaTextToTextModelStrategy() { + +} + +bool LlavaTextToTextModelStrategy::initStrategy() { + llmModel->modelInit(); + return true; +} + +bool LlavaTextToTextModelStrategy::termStrategy() { + llmModel->modelTerm(); + return true; +} \ No newline at end of file From efd40825f516b7367e0d64ea677e76d48fad5aac Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:15:39 +0530 Subject: [PATCH 08/13] add SceneDescriptorXform Module --- base/include/SceneDescriptorXForm.h | 51 ++++++++ base/src/SceneDescriptorXForm.cpp | 174 ++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 base/include/SceneDescriptorXForm.h create mode 100644 base/src/SceneDescriptorXForm.cpp diff --git a/base/include/SceneDescriptorXForm.h b/base/include/SceneDescriptorXForm.h new file mode 100644 index 000000000..3a2ada07f --- /dev/null +++ b/base/include/SceneDescriptorXForm.h @@ -0,0 +1,51 @@ +# pragma once + +#include "Module.h" + +class SceneDescriptorXFormProps : public ModuleProps { +public: + enum SceneDescriptorStrategy { + LLAVA = 0 + }; + + SceneDescriptorXFormProps(SceneDescriptorStrategy _modelStrategyType) { + modelStrategyType = _modelStrategyType; + } + + size_t getSerializeSize() { + return ModuleProps::getSerializeSize() + sizeof(modelStrategyType); + } + + SceneDescriptorStrategy modelStrategyType; +private: + friend class boost::serialization::access; + + template + void serialize(Archive& ar, const unsigned int version) { + ar &boost::serialization::base_object(*this); + ar & modelStrategyType; + } +}; + +class SceneDescriptorXForm : public Module { +public: + SceneDescriptorXForm(SceneDescriptorXFormProps _props); + virtual ~SceneDescriptorXForm(); + bool init(); + bool term(); + void setProps(SceneDescriptorXFormProps& props); + SceneDescriptorXFormProps getProps(); + +protected: + bool 
process(frame_container& frames); + bool processSOS(frame_sp& frame); + bool validateInputPins(); + bool validateOutputPins(); + void addInputPin(framemetadata_sp& metadata, string& pinId); + bool handlePropsChange(frame_sp& frame); + +private: + void setMetadata(framemetadata_sp& metadata); + class Detail; + boost::shared_ptr mDetail; +}; \ No newline at end of file diff --git a/base/src/SceneDescriptorXForm.cpp b/base/src/SceneDescriptorXForm.cpp new file mode 100644 index 000000000..ece582031 --- /dev/null +++ b/base/src/SceneDescriptorXForm.cpp @@ -0,0 +1,174 @@ +#include "SceneDescriptorXForm.h" +#include "ClipEncoder.h" +#include "Llava.h" +#include "ModelStrategy.h" + +class SceneDescriptorXForm::Detail { +public: + Detail(SceneDescriptorXFormProps &_props) : mProps(_props) { + setModelStrategy(_props); + } + ~Detail() {} + + void setProps(SceneDescriptorXFormProps &props) { mProps = props; } + + void setModelStrategy(SceneDescriptorXFormProps &props) { + switch (props.modelStrategyType) { + case SceneDescriptorXFormProps::SceneDescriptorStrategy::LLAVA: + modelStrategyType = ModelStrategy::ModelStrategyType::LLAVA_SCENE_DESCRIPTOR; + break; + default: + LOG_ERROR << "Please choose a valid model strategy!!!\n"; + break; + } + + modelStrategy = ModelStrategy::create(modelStrategyType); + } + +public: + framemetadata_sp mOutputMetadata; + std::string mOutputPinId; + SceneDescriptorXFormProps mProps; + ModelStrategy::ModelStrategyType modelStrategyType; + boost::shared_ptr modelStrategy; +}; + +SceneDescriptorXForm::SceneDescriptorXForm(SceneDescriptorXFormProps _props) + : Module(TRANSFORM, "SceneDescriptorXForm", _props) { + mDetail.reset(new Detail(_props)); +} + +SceneDescriptorXForm::~SceneDescriptorXForm() {} + +bool SceneDescriptorXForm::validateInputPins() { + if (getNumberOfInputPins() != 1) { + LOG_ERROR << "<" << getId() + << ">::validateInputPins size is expected to be 1. 
Actual<" + << getNumberOfInputPins() << ">"; + return false; + } + + framemetadata_sp metadata = getFirstInputMetadata(); + + FrameMetadata::FrameType frameType = metadata->getFrameType(); + + if (frameType != FrameMetadata::FrameType::ENCODED_IMAGE) { + LOG_ERROR << "<" << getId() + << ">::validateInputPins input frameType is expected to be " + "Audio. Actual<" + << frameType << ">"; + return false; + } + + FrameMetadata::MemType memType = metadata->getMemType(); + if (memType != FrameMetadata::MemType::HOST) { + LOG_ERROR + << "<" << getId() + << ">::validateInputPins input memType is expected to be HOST. Actual<" + << memType << ">"; + return false; + } + return true; +} + +bool SceneDescriptorXForm::validateOutputPins() { + if (getNumberOfOutputPins() != 1) { + LOG_ERROR << "<" << getId() + << ">::validateOutputPins size is expected to be 1. Actual<" + << getNumberOfOutputPins() << ">"; + return false; + } + + framemetadata_sp metadata = getFirstOutputMetadata(); + FrameMetadata::FrameType frameType = metadata->getFrameType(); + if (frameType != FrameMetadata::FrameType::TEXT) { + LOG_ERROR << "<" << getId() + << ">::validateOutputPins input frameType is expected to be " + "TEXT. 
Actual<" + << frameType << ">"; + return false; + } + + return true; +} + +void SceneDescriptorXForm::addInputPin(framemetadata_sp &metadata, + string &pinId) { + Module::addInputPin(metadata, pinId); + mDetail->mOutputMetadata = + framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::TEXT)); + mDetail->mOutputMetadata->copyHint(*metadata.get()); + mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata); +} + +bool SceneDescriptorXForm::init() { + bool ret = mDetail->modelStrategy->initStrategy(); + if (!ret) { + return false; + } + return Module::init(); +} + +bool SceneDescriptorXForm::term() { + bool ret = mDetail->modelStrategy->termStrategy(); + if (!ret) { + return false; + } + return Module::term(); +} + +bool SceneDescriptorXForm::process(frame_container &frames) { + /*Encoder Model*/ + mDetail->modelStrategy->encoderModel->push(frames); + mDetail->modelStrategy->encoderModel->step(); + auto clipFrame = + makeFrame(mDetail->modelStrategy->encoderModel->getFrameSize()); + auto clipMetaData = boost::shared_ptr( + new FrameMetadata(FrameMetadata::FrameType::IMAGE_EMBEDDING)); + clipFrame->setMetadata(clipMetaData); + mDetail->modelStrategy->encoderModel->getFrames(clipFrame); + + frame_container clipFrames; + clipFrames.insert(make_pair(mDetail->mOutputPinId, clipFrame)); + + /*LLM Model*/ + mDetail->modelStrategy->llmModel->push(clipFrames); + mDetail->modelStrategy->llmModel->step(); + auto outFrame = makeFrame(mDetail->modelStrategy->llmModel->getFrameSize()); + mDetail->modelStrategy->llmModel->getFrames(outFrame); + + frames.insert(make_pair(mDetail->mOutputPinId, outFrame)); + send(frames); + return true; +} + +void SceneDescriptorXForm::setMetadata(framemetadata_sp &metadata) { + if (!metadata->isSet()) { + return; + } +} + +bool SceneDescriptorXForm::processSOS(frame_sp &frame) { + auto metadata = frame->getMetadata(); + setMetadata(metadata); + return true; +} + +SceneDescriptorXFormProps SceneDescriptorXForm::getProps() { + 
fillProps(mDetail->mProps); + return mDetail->mProps; +} + +bool SceneDescriptorXForm::handlePropsChange(frame_sp &frame) { + SceneDescriptorXFormProps props(mDetail->mProps.modelStrategyType); + auto ret = Module::handlePropsChange(frame, props); + mDetail->setProps(props); + return ret; +} + +void SceneDescriptorXForm::setProps(SceneDescriptorXFormProps &props) { + if (props.modelStrategyType != mDetail->mProps.modelStrategyType) { + throw AIPException(AIP_FATAL, "Model Strategy Type dynamic change not handled"); + } + Module::addPropsToQueue(props); +} \ No newline at end of file From c60e9ec84c70ddeb23ed90847aaa86144436d1b0 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:16:29 +0530 Subject: [PATCH 09/13] add unit tests --- base/test/llavamodel_tests.cpp | 30 +++++++++++++ base/test/sceneDescriptorXForm_tests.cpp | 56 ++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 base/test/llavamodel_tests.cpp create mode 100644 base/test/sceneDescriptorXForm_tests.cpp diff --git a/base/test/llavamodel_tests.cpp b/base/test/llavamodel_tests.cpp new file mode 100644 index 000000000..5fad3ddfa --- /dev/null +++ b/base/test/llavamodel_tests.cpp @@ -0,0 +1,30 @@ +#include +#include "stdafx.h" +#include +#include + +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "test_utils.h" +#include "PipeLine.h" +#include "FileWriterModule.h" +#include "FileReaderModule.h" +#include "FileWriterModule.h" +#include "LlmModelAbstract.h" +#include "Llava.h" +#include "Module.h" +#include "ExternalSinkModule.h" + +BOOST_AUTO_TEST_SUITE(llavamodel_test) + +BOOST_AUTO_TEST_CASE(llava_init) +{ + auto llavaProps = LlavaProps("C:/Users/developer/ws_kushal/llm-integration-branch/ApraPipes/data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", "Tell a story", 2048, 512, 0.8, 5, 50); + auto llavaModel = boost::shared_ptr(new Llava(llavaProps)); + llavaModel->modelInit(); + 
llavaModel->modelTerm(); +} + +BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/base/test/sceneDescriptorXForm_tests.cpp b/base/test/sceneDescriptorXForm_tests.cpp new file mode 100644 index 000000000..99b4c099d --- /dev/null +++ b/base/test/sceneDescriptorXForm_tests.cpp @@ -0,0 +1,56 @@ +#include +#include "stdafx.h" +#include +#include + +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "test_utils.h" +#include "PipeLine.h" +#include "FileWriterModule.h" +#include "FileReaderModule.h" +#include "FileWriterModule.h" +#include "SceneDescriptorXForm.h" +#include "ModelStrategy.h" +#include "Module.h" +#include "ExternalSinkModule.h" + +BOOST_AUTO_TEST_SUITE(sceneDescriptorXForm_tests) + +BOOST_AUTO_TEST_CASE(testing) +{ + std::vector sceneDescriptorOutText = { "./data/sceneDescriptor_out.txt" }; + Test_Utils::FileCleaner f(sceneDescriptorOutText); + + Logger::setLogLevel(boost::log::trivial::severity_level::info); + + auto fileReaderProps = FileReaderModuleProps("./data/1280x960.jpg"); + fileReaderProps.readLoop = false; + auto fileReader = boost::shared_ptr(new FileReaderModule(fileReaderProps)); + auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::ENCODED_IMAGE)); + auto pinId = fileReader->addOutputPin(metadata); + + auto sceneDescriptorProps = SceneDescriptorXFormProps(SceneDescriptorXFormProps::SceneDescriptorStrategy::LLAVA); + auto sceneDescriptor = boost::shared_ptr(new SceneDescriptorXForm(sceneDescriptorProps)); + fileReader->setNext(sceneDescriptor); + + auto outputFile = boost::shared_ptr(new FileWriterModule(FileWriterModuleProps(sceneDescriptorOutText[0], false))); + sceneDescriptor->setNext(outputFile); + + BOOST_TEST(fileReader->init()); + BOOST_TEST(sceneDescriptor->init()); + BOOST_TEST(outputFile->init()); + + fileReader->step(); + sceneDescriptor->step(); + outputFile->step(); + + std::ifstream in_file_text(sceneDescriptorOutText[0]); + 
std::ostringstream buffer; + buffer << in_file_text.rdbuf(); + in_file_text.close(); +} + +BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file From d0997263968f7a467db3e66681e4747cb0485171 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:16:59 +0530 Subject: [PATCH 10/13] update cmakelists.txt --- base/CMakeLists.txt | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index af75e41b3..eecd054ff 100755 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -283,7 +283,13 @@ SET(IP_FILES src/OverlayFactory.h src/OverlayFactory.cpp src/TestSignalGeneratorSrc.cpp - src/AudioToTextXForm.cpp + src/AudioToTextXForm.cpp + src/ModelStrategy.cpp + src/LlmModelAbstract.cpp + src/EncoderModelAbstract.cpp + src/Llava.cpp + src/ClipEncoder.cpp + src/SceneDescriptorXForm.cpp ) SET(IP_FILES_H @@ -309,6 +315,12 @@ SET(IP_FILES_H include/ColorConversionXForm.h include/Overlay.h include/AudioToTextXForm.h + include/ModelStrategy.h + include/LlmModelAbstract.h + include/EncoderModelAbstract.h + include/Llava.h + include/ClipEncoder.h + include/SceneDescriptorXForm.h ) SET(CUDA_CORE_FILES @@ -563,6 +575,8 @@ SET(UT_FILES test/overlaymodule_tests.cpp test/testSignalGeneratorSrc_tests.cpp test/audioToTextXform_tests.cpp + test/llavamodel_tests.cpp + test/SceneDescriptorXForm_tests.cpp ${ARM64_UT_FILES} ${CUDA_UT_FILES} ) @@ -588,8 +602,10 @@ ENDIF (ENABLE_CUDA) find_library(OPENH264_LIB NAMES openh264.lib libopenh264.a REQUIRED) find_library(LIBMP4_LIB NAMES mp4lib.lib libmp4lib.a REQUIRED) +find_library(COMMON_LIB NAMES common_llama.lib REQUIRED) +find_library(LLAVA_LIB NAMES llavalib.lib REQUIRED) -target_link_libraries(aprapipesut +target_link_libraries(aprapipesut aprapipes ${JPEG_LIBRARIES} ${LIBMP4_LIB} @@ -609,6 +625,9 @@ target_link_libraries(aprapipesut liblzma::liblzma bigint::bigint sfml-audio + ${COMMON_LIB} + llama + ${LLAVA_LIB} whisper::whisper ) From 
76d37ac703cabebfa5e499d1ffada1fb67101e57 Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Wed, 17 Apr 2024 19:17:20 +0530 Subject: [PATCH 11/13] update framemetadata.h --- base/include/FrameMetadata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/base/include/FrameMetadata.h b/base/include/FrameMetadata.h index ebddf592b..0ea2eb674 100755 --- a/base/include/FrameMetadata.h +++ b/base/include/FrameMetadata.h @@ -51,7 +51,8 @@ class FrameMetadata { MOTION_VECTOR_DATA, OVERLAY_INFO_IMAGE, FACE_LANDMARKS_INFO, - TEXT + TEXT, + IMAGE_EMBEDDING }; enum MemType From 926a07ff23612d5d851555b6fa57db741bf43a9b Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Thu, 2 May 2024 15:33:19 +0530 Subject: [PATCH 12/13] Updated code to resolve PR conversations --- base/CMakeLists.txt | 15 +- base/include/ClipEncoder.h | 32 +-- base/include/EncoderModelAbstract.h | 110 ++++---- base/include/FrameMetadata.h | 254 +++++++++--------- base/include/Llava.h | 39 +-- base/include/LlmModelAbstract.h | 108 ++++---- base/include/ModelEnums.h | 20 ++ base/include/ModelStrategy.h | 47 +++- base/include/SceneDescriptorXForm.h | 92 ++++--- base/src/ClipEncoder.cpp | 102 ++++--- base/src/EncoderModelAbstract.cpp | 65 ++++- base/src/Llava.cpp | 232 ++++++++++------ base/src/LlmModelAbstract.cpp | 63 ++++- base/src/ModelStrategy.cpp | 69 +++-- base/src/SceneDescriptorXForm.cpp | 127 +++++---- base/test/llavamodel_tests.cpp | 12 +- base/test/sceneDescriptorXForm_tests.cpp | 83 +++--- .../custom-overlay/llama/portfile.cmake | 6 +- 18 files changed, 872 insertions(+), 604 deletions(-) mode change 100755 => 100644 base/include/FrameMetadata.h create mode 100644 base/include/ModelEnums.h diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index eecd054ff..d278156c9 100755 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -52,7 +52,7 @@ find_package(ZXing CONFIG REQUIRED) find_package(bigint CONFIG REQUIRED) find_package(SFML COMPONENTS system window audio graphics CONFIG 
REQUIRED) find_package(whisper CONFIG REQUIRED) -find_package(llama CONFIG REQUIRED) +find_package(Llama CONFIG REQUIRED) IF(ENABLE_CUDA) if((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL "")) @@ -315,6 +315,7 @@ SET(IP_FILES_H include/ColorConversionXForm.h include/Overlay.h include/AudioToTextXForm.h + include/ModelEnums.h include/ModelStrategy.h include/LlmModelAbstract.h include/EncoderModelAbstract.h @@ -576,7 +577,7 @@ SET(UT_FILES test/testSignalGeneratorSrc_tests.cpp test/audioToTextXform_tests.cpp test/llavamodel_tests.cpp - test/SceneDescriptorXForm_tests.cpp + test/sceneDescriptorXForm_tests.cpp ${ARM64_UT_FILES} ${CUDA_UT_FILES} ) @@ -602,8 +603,8 @@ ENDIF (ENABLE_CUDA) find_library(OPENH264_LIB NAMES openh264.lib libopenh264.a REQUIRED) find_library(LIBMP4_LIB NAMES mp4lib.lib libmp4lib.a REQUIRED) -find_library(COMMON_LIB NAMES common_llama.lib REQUIRED) -find_library(LLAVA_LIB NAMES llavalib.lib REQUIRED) +find_library(COMMON_LIB NAMES common_llama.lib libcommon_llama.a REQUIRED) +find_library(LLAVA_LIB NAMES llavalib.lib libllavalib.a REQUIRED) target_link_libraries(aprapipesut aprapipes @@ -625,9 +626,9 @@ target_link_libraries(aprapipesut liblzma::liblzma bigint::bigint sfml-audio - ${COMMON_LIB} - llama - ${LLAVA_LIB} + ${COMMON_LIB} + llama + ${LLAVA_LIB} whisper::whisper ) diff --git a/base/include/ClipEncoder.h b/base/include/ClipEncoder.h index c11653ddb..6c91dccd2 100644 --- a/base/include/ClipEncoder.h +++ b/base/include/ClipEncoder.h @@ -2,22 +2,15 @@ #include "EncoderModelAbstract.h" -class ClipEncoderProps : public EncoderModelAbstractProps { +class ClipEncoderProps : public EncoderModelAbstractProps +{ public: - ClipEncoderProps(std::string _modelPath) { - /* Set LLM Model Base Class Properties for each model*/ - modelArchitecture = ModelArchitectureType::VIT; - inputTypes = {DataType::TEXT, DataType::IMAGE}; - outputTypes = {DataType::IMAGE_EMBEDDING}; - useCases = {UseCase::OCR, 
UseCase::SCENE_DESCRIPTOR}; - - /*Unique Model Properties*/ - modelPath = _modelPath; - } + ClipEncoderProps(std::string _modelPath); std::string modelPath; - size_t getSerializeSize() { + size_t getSerializeSize() + { return EncoderModelAbstractProps::getSerializeSize() + sizeof(modelPath); } @@ -25,22 +18,25 @@ class ClipEncoderProps : public EncoderModelAbstractProps { friend class boost::serialization::access; template - void serialize(Archive &ar, const unsigned int version) { - ar &boost::serialization::base_object(*this); + void serialize(Archive &ar, const unsigned int version) + { + ar &boost::serialization::base_object(*this); ar & modelPath; } }; -class ClipEncoder : public EncoderModelAbstract { +class ClipEncoder : public EncoderModelAbstract +{ public: ClipEncoder(ClipEncoderProps _props); virtual ~ClipEncoder(); bool modelInit() override; bool modelTerm() override; - bool modelInference(frame_container& frames) override; - bool validateUseCase(EncoderModelAbstractProps::UseCase useCase) override; + bool modelInference(frame_container &inputFrameContainer, + frame_container &outputFrameContainer, std::function makeFrame) override; + bool validateUseCase(UseCase useCase) override; size_t getFrameSize() override; - void getFrames(frame_sp& frame) override; + void storeFrames(frame_sp &frame); private: class Detail; diff --git a/base/include/EncoderModelAbstract.h b/base/include/EncoderModelAbstract.h index 6f2afe030..f471ae3f7 100644 --- a/base/include/EncoderModelAbstract.h +++ b/base/include/EncoderModelAbstract.h @@ -1,46 +1,42 @@ #pragma once +#include "stdafx.h" +#include +#include +#include +#include +#include +#include +#include "Frame.h" +#include +#include "BoundBuffer.h" +#include "FrameFactory.h" +#include "CommonDefs.h" +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Command.h" +#include "BufferMaker.h" +#include "ModelEnums.h" +#include "FrameContainerQueue.h" -#include "Module.h" - -class EncoderModelAbstractProps 
{ +class EncoderModelAbstractProps +{ public: - enum ModelArchitectureType { - BERT= 0, // Vision Transformer - VIT, // Bidirectional Encoder Representations from Transformer - AST, // Audio Spectrogram Transformer - VIVIT // Video Vision Transformer - }; - - enum DataType { TEXT = 0, IMAGE, AUDIO, TEXT_EMBEDDING, IMAGE_EMBEDDING, AUDIO_EMBEDDING }; - - enum UseCase { TEXT_TO_TEXT = 0, SCENE_DESCRIPTOR, OCR }; - - EncoderModelAbstractProps() { - modelArchitecture = ModelArchitectureType::BERT; - inputTypes = {DataType::TEXT}; - outputTypes = {DataType::TEXT_EMBEDDING}; - useCases = {UseCase::TEXT_TO_TEXT}; - qlen = 20; - } + EncoderModelAbstractProps(); EncoderModelAbstractProps(ModelArchitectureType _modelArchitecture, - std::vector _inputTypes, - std::vector _outputTypes, - std::vector _useCases) { - modelArchitecture = _modelArchitecture; - inputTypes = _inputTypes; - outputTypes = _outputTypes; - useCases = _useCases; - qlen = 20; - } + std::vector _inputTypes, + std::vector _outputTypes, + std::vector _useCases); - size_t getSerializeSize() { - return sizeof(modelArchitecture) + sizeof(inputTypes) + sizeof(outputTypes) + sizeof(useCases) + sizeof(qlen); + size_t getSerializeSize() + { + return sizeof(modelArchitecture) + sizeof(inputTypes) + + sizeof(outputTypes) + sizeof(useCases) + sizeof(qlen); } ModelArchitectureType modelArchitecture; - std::vector inputTypes; - std::vector outputTypes; + std::vector inputTypes; + std::vector outputTypes; std::vector useCases; size_t qlen; @@ -48,8 +44,9 @@ class EncoderModelAbstractProps { friend class boost::serialization::access; template - void serialize(Archive &ar, const unsigned int version) { - ar &boost::serialization::base_object(*this); + void serialize(Archive &ar, const unsigned int version) + { + ar &boost::serialization::base_object(*this); ar & modelArchitecture; ar & inputTypes; ar & outputTypes; @@ -58,34 +55,35 @@ class EncoderModelAbstractProps { } }; -class EncoderModelAbstract { +class 
EncoderModelAbstract +{ public: - EncoderModelAbstract(std::string name, EncoderModelAbstractProps props); + EncoderModelAbstract(std::string _modelName, EncoderModelAbstractProps props); ~EncoderModelAbstract(); - std::string getMyName() { - return myName; - } + std::string getMyName() { return modelName; } - boost::shared_ptr getQue() { - return mQue; - } + boost::shared_ptr getQue() { return mQue; } - virtual bool modelInit() = 0; - virtual bool modelTerm() = 0; - virtual bool modelInference(frame_container& frameContainer) {return false;} + virtual bool modelInit() = 0; + virtual bool modelTerm() = 0; + virtual bool modelInference(frame_container &inputFrameContainer, + frame_container &outputFrameContainer, std::function makeFrame) + { + return false; + } virtual size_t getFrameSize() = 0; - virtual void getFrames(frame_sp& frame) = 0; - virtual bool validateUseCase(EncoderModelAbstractProps::UseCase useCase) = 0; - - bool init(); - bool term(); - bool step(); - bool push(frame_container& frameContainer); - + virtual bool validateUseCase(UseCase useCase) = 0; + + bool init(); + bool term(); + bool step(frame_container &outputFrameContaine, std::function makeFrame); + bool push(frame_container &inputFrameContainer, + frame_container &outputFrameContainer, std::function _makeFrame); + private: - std::string myName; + std::string modelName; boost::shared_ptr mQue; boost::shared_ptr mProps; }; \ No newline at end of file diff --git a/base/include/FrameMetadata.h b/base/include/FrameMetadata.h old mode 100755 new mode 100644 index 0ea2eb674..99330f588 --- a/base/include/FrameMetadata.h +++ b/base/include/FrameMetadata.h @@ -7,143 +7,131 @@ class RawImageMetadata; -class FrameMetadata { -public: - static size_t getPaddingLength(size_t length, size_t alignLength) - { - if (!alignLength) - { - return 0; - } - - auto rem = length % alignLength; - if (rem == 0) - { - return 0; - } - - return (alignLength - rem); - } +class FrameMetadata +{ +public: + static size_t 
getPaddingLength(size_t length, size_t alignLength) + { + if (!alignLength) + { + return 0; + } + + auto rem = length % alignLength; + if (rem == 0) + { + return 0; + } + + return (alignLength - rem); + } public: - enum FrameType { - GENERAL = 0, - ENCODED_IMAGE, - RAW_IMAGE, - RAW_IMAGE_PLANAR, - AUDIO, - ARRAY, - CHANGE_DETECTION, - EDGEDEFECT_ANALYSIS_INFO, - PROPS_CHANGE, - PAUSE_PLAY, - COMMAND, - H264_DATA, - GPIO, - APRA_LINES, - LINE, - ROI, - DEFECTS_INFO, - FACEDETECTS_INFO, - BMP_IMAGE, - MP4_VIDEO_METADATA, - HEVC_DATA, //H265 - MOTION_VECTOR_DATA, - OVERLAY_INFO_IMAGE, - FACE_LANDMARKS_INFO, - TEXT, - IMAGE_EMBEDDING - }; - - enum MemType - { - HOST = 1, + enum FrameType + { + GENERAL = 0, + ENCODED_IMAGE, + RAW_IMAGE, + RAW_IMAGE_PLANAR, + AUDIO, + ARRAY, + CHANGE_DETECTION, + EDGEDEFECT_ANALYSIS_INFO, + PROPS_CHANGE, + PAUSE_PLAY, + COMMAND, + H264_DATA, + GPIO, + APRA_LINES, + LINE, + ROI, + DEFECTS_INFO, + FACEDETECTS_INFO, + BMP_IMAGE, + MP4_VIDEO_METADATA, + HEVC_DATA, // H265 + MOTION_VECTOR_DATA, + OVERLAY_INFO_IMAGE, + FACE_LANDMARKS_INFO, + TEXT, + IMAGE_EMBEDDING, + TEXT_EMBEDDING + }; + + enum MemType + { + HOST = 1, #ifdef APRA_CUDA_ENABLED - HOST_PINNED = 2, - CUDA_DEVICE = 3, - DMABUF = 4 + HOST_PINNED = 2, + CUDA_DEVICE = 3, + DMABUF = 4 #endif - }; - - FrameMetadata(FrameType _frameType) - { - frameType = _frameType; - memType = MemType::HOST; - hint = ""; - } - - FrameMetadata(FrameType _frameType, std::string _hint) - { - frameType = _frameType; - memType = MemType::HOST; - hint = _hint; - } - - FrameMetadata(FrameType _frameType, MemType _memType) - { - frameType = _frameType; - memType = _memType; - hint = ""; - } - - virtual ~FrameMetadata() { } - - virtual void reset() - { - dataSize = NOT_SET_NUM; - } - - virtual bool isSet() - { - return true; - } - - FrameType getFrameType() - { - return frameType; - } - - MemType getMemType() - { - return memType; - } - - virtual size_t getDataSize() - { - return dataSize; - } - - std::string 
getHint() { return hint; } - - void setHint(std::string _hint) { hint = _hint; } - void copyHint(FrameMetadata& metadata) - { - if(!hint.empty()) - { - return; - } - - auto _hint = metadata.getHint(); - if(_hint.empty()) - { - return; - } - - setHint(_hint); - } - - void setData(FrameMetadata& metadata) - { - // dont set memType - // assuming frameType is same so no need to set - - // hint I am still undecided whether to copy or not - } + }; + + FrameMetadata(FrameType _frameType) + { + frameType = _frameType; + memType = MemType::HOST; + hint = ""; + } + + FrameMetadata(FrameType _frameType, std::string _hint) + { + frameType = _frameType; + memType = MemType::HOST; + hint = _hint; + } + + FrameMetadata(FrameType _frameType, MemType _memType) + { + frameType = _frameType; + memType = _memType; + hint = ""; + } + + virtual ~FrameMetadata() {} + + virtual void reset() { dataSize = NOT_SET_NUM; } + + virtual bool isSet() { return true; } + + FrameType getFrameType() { return frameType; } + + MemType getMemType() { return memType; } + + virtual size_t getDataSize() { return dataSize; } + + std::string getHint() { return hint; } + + void setHint(std::string _hint) { hint = _hint; } + void copyHint(FrameMetadata &metadata) + { + if (!hint.empty()) + { + return; + } + + auto _hint = metadata.getHint(); + if (_hint.empty()) + { + return; + } + + setHint(_hint); + } + + void setData(FrameMetadata &metadata) + { + // dont set memType + // assuming frameType is same so no need to set + + // hint I am still undecided whether to copy or not + } protected: - FrameType frameType; - MemType memType; - std::string hint; - - size_t dataSize = NOT_SET_NUM; + FrameType frameType; + MemType memType; + std::string hint; + + size_t dataSize = NOT_SET_NUM; }; diff --git a/base/include/Llava.h b/base/include/Llava.h index fae64d377..de5954543 100644 --- a/base/include/Llava.h +++ b/base/include/Llava.h @@ -4,27 +4,13 @@ class LlavaProps : public LlmModelAbstractProps { public: - 
LlavaProps(std::string _modelPath, std::string _prompt, - int _contextSize, int _batchSize, float _degreeOfRandomness, int _gpuLayers, int _predictionLength) { - - /* Set LLM Model Base Class Properties for each model*/ - modelArchitecture = ModelArchitectureType::TRANSFORMER; - inputTypes = {DataType::TEXT, DataType::IMAGE_EMBEDDING}; - outputTypes = {DataType::TEXT}; - useCases = {UseCase::TEXT_TO_TEXT, UseCase::OCR, UseCase::SCENE_DESCRIPTOR}; - - /*Unique Model Properties*/ - modelPath = _modelPath; - prompt = _prompt; - degreeOfRandomness = _degreeOfRandomness; - contextSize = _contextSize; - batchSize = _batchSize; - gpuLayers = _gpuLayers; - predictionLength = _predictionLength; - } + LlavaProps(std::string _modelPath, std::string _systemPrompt, + std::string _userPrompt, int _contextSize, int _batchSize, + float _degreeOfRandomness, int _gpuLayers, int _predictionLength); std::string modelPath; - std::string prompt; + std::string systemPrompt; + std::string userPrompt; int contextSize; int batchSize; float degreeOfRandomness; @@ -33,7 +19,8 @@ class LlavaProps : public LlmModelAbstractProps { size_t getSerializeSize() { return LlmModelAbstractProps::getSerializeSize() + sizeof(modelPath) + - sizeof(prompt) + sizeof(float) + 4 * sizeof(int); + sizeof(systemPrompt) + sizeof(userPrompt) + sizeof(float) + + 4 * sizeof(int); } private: @@ -41,8 +28,8 @@ class LlavaProps : public LlmModelAbstractProps { template void serialize(Archive &ar, const unsigned int version) { - ar &boost::serialization::base_object(*this); - ar & modelPath & prompt; + ar &boost::serialization::base_object(*this); + ar & modelPath & systemPrompt & userPrompt; ar & degreeOfRandomness; ar & contextSize & batchSize & gpuLayers & predictionLength; } @@ -54,10 +41,12 @@ class Llava : public LlmModelAbstract { virtual ~Llava(); bool modelInit() override; bool modelTerm() override; - bool modelInference(frame_container& frames) override; - bool validateUseCase(LlmModelAbstractProps::UseCase 
useCase) override; + bool modelInference(frame_container &inputFrameContainer, + frame_container &outputFrameContainer, + std::function makeFrame) override; + bool validateUseCase(UseCase useCase) override; size_t getFrameSize() override; - void getFrames(frame_sp& frame) override; + void storeFrames(frame_sp &frame); private: class Detail; diff --git a/base/include/LlmModelAbstract.h b/base/include/LlmModelAbstract.h index c803e413e..107813dd8 100644 --- a/base/include/LlmModelAbstract.h +++ b/base/include/LlmModelAbstract.h @@ -1,46 +1,42 @@ #pragma once - -#include "Module.h" - -class LlmModelAbstractProps { +#include "stdafx.h" +#include +#include +#include +#include +#include +#include +#include "Frame.h" +#include +#include "BoundBuffer.h" +#include "FrameFactory.h" +#include "CommonDefs.h" +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Command.h" +#include "BufferMaker.h" +#include "ModelEnums.h" +#include "FrameContainerQueue.h" + +class LlmModelAbstractProps +{ public: - enum ModelArchitectureType { - TRANSFORMER = 0, - ENCODERDECODER, - CASUALDECODER, - PREFIXDECODER - }; - - enum DataType { TEXT = 0, IMAGE, AUDIO, TEXT_EMBEDDING, IMAGE_EMBEDDING, AUDIO_EMBEDDING }; - - enum UseCase { TEXT_TO_TEXT = 0, SCENE_DESCRIPTOR, OCR }; - - LlmModelAbstractProps() { - modelArchitecture = ModelArchitectureType::TRANSFORMER; - inputTypes = {DataType::TEXT}; - outputTypes = {DataType::TEXT}; - useCases = {UseCase::TEXT_TO_TEXT}; - qlen = 20; - } + LlmModelAbstractProps(); LlmModelAbstractProps(ModelArchitectureType _modelArchitecture, - std::vector _inputTypes, - std::vector _outputTypes, - std::vector _useCases) { - modelArchitecture = _modelArchitecture; - inputTypes = _inputTypes; - outputTypes = _outputTypes; - useCases = _useCases; - qlen = 20; - } - - size_t getSerializeSize() { - return sizeof(modelArchitecture) + sizeof(inputTypes) + sizeof(outputTypes) + sizeof(useCases) + sizeof(qlen); + std::vector _inputTypes, + std::vector 
_outputTypes, + std::vector _useCases); + + size_t getSerializeSize() + { + return sizeof(modelArchitecture) + sizeof(inputTypes) + + sizeof(outputTypes) + sizeof(useCases) + sizeof(qlen); } ModelArchitectureType modelArchitecture; - std::vector inputTypes; - std::vector outputTypes; + std::vector inputTypes; + std::vector outputTypes; std::vector useCases; size_t qlen; @@ -48,8 +44,9 @@ class LlmModelAbstractProps { friend class boost::serialization::access; template - void serialize(Archive &ar, const unsigned int version) { - ar &boost::serialization::base_object(*this); + void serialize(Archive &ar, const unsigned int version) + { + ar &boost::serialization::base_object(*this); ar & modelArchitecture; ar & inputTypes; ar & outputTypes; @@ -58,34 +55,33 @@ class LlmModelAbstractProps { } }; -class LlmModelAbstract { +class LlmModelAbstract +{ public: - LlmModelAbstract(std::string name, LlmModelAbstractProps props); + LlmModelAbstract(std::string _modelName, LlmModelAbstractProps props); ~LlmModelAbstract(); - std::string getMyName() { - return myName; - } + std::string getMyName() { return modelName; } - boost::shared_ptr getQue() { - return mQue; - } + boost::shared_ptr getQue() { return mQue; } - virtual bool modelInit() = 0; - virtual bool modelTerm() = 0; - virtual bool modelInference(frame_container& frameContainer) {return false;} + virtual bool modelInit() = 0; + virtual bool modelTerm() = 0; + virtual bool modelInference(frame_container &inputFrameContainer, frame_container &outputFrameContainer, std::function makeFrame) + { + return false; + } virtual size_t getFrameSize() = 0; - virtual void getFrames(frame_sp& frame) = 0; - virtual bool validateUseCase(LlmModelAbstractProps::UseCase useCase) = 0; + virtual bool validateUseCase(UseCase useCase) = 0; - bool init(); - bool term(); - bool step(); - bool push(frame_container& frameContainer); + bool init(); + bool term(); + bool step(frame_container &outputFrameContainer, std::function makeFrame); + bool 
push(frame_container &inputFrameContainer, frame_container &outputFrameContainer, std::function makeFrame); private: - std::string myName; + std::string modelName; boost::shared_ptr mQue; boost::shared_ptr mProps; }; \ No newline at end of file diff --git a/base/include/ModelEnums.h b/base/include/ModelEnums.h new file mode 100644 index 000000000..f84126e6b --- /dev/null +++ b/base/include/ModelEnums.h @@ -0,0 +1,20 @@ +#pragma once + +enum ModelArchitectureType +{ + TRANSFORMER = 0, + ENCODERDECODER, + CASUALDECODER, + PREFIXDECODER, + BERT, // Vision Transformer + VIT, // Bidirectional Encoder Representations from Transformer + AST, // Audio Spectrogram Transformer + VIVIT // Video Vision Transformer +}; + +enum UseCase +{ + TEXT_TO_TEXT = 0, + SCENE_DESCRIPTOR, + OCR +}; \ No newline at end of file diff --git a/base/include/ModelStrategy.h b/base/include/ModelStrategy.h index e31a7f1cd..cea24ffe0 100644 --- a/base/include/ModelStrategy.h +++ b/base/include/ModelStrategy.h @@ -1,43 +1,66 @@ -# pragma once +#pragma once -#include "Module.h" -#include "ClipEncoder.h" -#include "Llava.h" +#include "SceneDescriptorXForm.h" +#include "EncoderModelAbstract.h" +#include "LlmModelAbstract.h" -class ModelStrategy { +class ModelStrategy +{ public: - enum ModelStrategyType { + enum ModelStrategyType + { LLAMA_TEXT_TO_TEXT = 0, LLAVA_TEXT_TO_TEXT, LLAVA_SCENE_DESCRIPTOR }; - static boost::shared_ptr create(ModelStrategyType type); + template + static boost::shared_ptr create(ModelStrategyType type, + T &props); ModelStrategy(); virtual ~ModelStrategy(); virtual bool initStrategy() = 0; virtual bool termStrategy() = 0; + public: boost::shared_ptr encoderModel; - boost::shared_ptr llmModel; + boost::shared_ptr llmModel; }; -class SceneDescriptorModelStrategy : public ModelStrategy { +class SceneDescriptorModelStrategy : public ModelStrategy +{ public: - SceneDescriptorModelStrategy(); + SceneDescriptorModelStrategy(SceneDescriptorXFormProps props); 
~SceneDescriptorModelStrategy(); bool initStrategy() override; bool termStrategy() override; }; -class LlavaTextToTextModelStrategy : public ModelStrategy { +class LlavaTextToTextModelStrategy : public ModelStrategy +{ public: LlavaTextToTextModelStrategy(); ~LlavaTextToTextModelStrategy(); bool initStrategy() override; bool termStrategy() override; -}; \ No newline at end of file +}; + +template +boost::shared_ptr ModelStrategy::create(ModelStrategyType type, + T &props) +{ + switch (type) + { + case ModelStrategyType::LLAVA_SCENE_DESCRIPTOR: + return boost::make_shared(props); + case ModelStrategyType::LLAVA_TEXT_TO_TEXT: + return boost::make_shared(); + default: + return boost::make_shared(); + break; + } +} \ No newline at end of file diff --git a/base/include/SceneDescriptorXForm.h b/base/include/SceneDescriptorXForm.h index 3a2ada07f..90d59bd9c 100644 --- a/base/include/SceneDescriptorXForm.h +++ b/base/include/SceneDescriptorXForm.h @@ -1,51 +1,69 @@ -# pragma once +#pragma once #include "Module.h" -class SceneDescriptorXFormProps : public ModuleProps { +class SceneDescriptorXFormProps : public ModuleProps +{ public: - enum SceneDescriptorStrategy { - LLAVA = 0 - }; - - SceneDescriptorXFormProps(SceneDescriptorStrategy _modelStrategyType) { - modelStrategyType = _modelStrategyType; - } - - size_t getSerializeSize() { - return ModuleProps::getSerializeSize() + sizeof(modelStrategyType); - } - - SceneDescriptorStrategy modelStrategyType; + enum ModelStrategyType + { + LLAVA = 0 + }; + + SceneDescriptorXFormProps(ModelStrategyType _modelStrategyType, + std::string _encoderModelPath, + std::string _llmModelPath, + std::string _systemPrompt, std::string _userPrompt, + int _gpuLayers); + + size_t getSerializeSize() + { + return ModuleProps::getSerializeSize() + sizeof(modelStrategyType) + + sizeof(encoderModelPath) + sizeof(llmModelPath) + + sizeof(systemPrompt) + sizeof(userPrompt) + sizeof(gpuLayers); + } + + ModelStrategyType modelStrategyType; + std::string 
encoderModelPath; + std::string llmModelPath; + std::string systemPrompt; + std::string userPrompt; + int gpuLayers; + private: - friend class boost::serialization::access; + friend class boost::serialization::access; - template - void serialize(Archive& ar, const unsigned int version) { - ar &boost::serialization::base_object(*this); - ar & modelStrategyType; - } + template + void serialize(Archive &ar, const unsigned int version) + { + ar &boost::serialization::base_object(*this); + ar & modelStrategyType; + ar & encoderModelPath & llmModelPath; + ar & systemPrompt & userPrompt; + ar & gpuLayers; + } }; -class SceneDescriptorXForm : public Module { +class SceneDescriptorXForm : public Module +{ public: - SceneDescriptorXForm(SceneDescriptorXFormProps _props); - virtual ~SceneDescriptorXForm(); - bool init(); - bool term(); - void setProps(SceneDescriptorXFormProps& props); - SceneDescriptorXFormProps getProps(); + SceneDescriptorXForm(SceneDescriptorXFormProps _props); + virtual ~SceneDescriptorXForm(); + bool init(); + bool term(); + void setProps(SceneDescriptorXFormProps &props); + SceneDescriptorXFormProps getProps(); protected: - bool process(frame_container& frames); - bool processSOS(frame_sp& frame); - bool validateInputPins(); - bool validateOutputPins(); - void addInputPin(framemetadata_sp& metadata, string& pinId); - bool handlePropsChange(frame_sp& frame); + bool process(frame_container &frames); + bool processSOS(frame_sp &frame); + bool validateInputPins(); + bool validateOutputPins(); + void addInputPin(framemetadata_sp &metadata, string &pinId); + bool handlePropsChange(frame_sp &frame); private: - void setMetadata(framemetadata_sp& metadata); - class Detail; - boost::shared_ptr mDetail; + void setMetadata(framemetadata_sp &metadata); + class Detail; + boost::shared_ptr mDetail; }; \ No newline at end of file diff --git a/base/src/ClipEncoder.cpp b/base/src/ClipEncoder.cpp index 054014940..73f45f58e 100644 --- a/base/src/ClipEncoder.cpp +++ 
b/base/src/ClipEncoder.cpp @@ -1,76 +1,114 @@ +#include "ClipEncoder.h" +#include "Frame.h" #include "FrameMetadata.h" #include "FrameMetadataFactory.h" -#include "Frame.h" #include "Logger.h" #include "Utils.h" -#include "llama/common.h" -#include "llama/llama.h" -#include "llama/clip.h" -#include "llama/llava.h" -#include "ClipEncoder.h" +#include +#include +#include + +ClipEncoderProps::ClipEncoderProps(std::string _modelPath) +{ + /* Set LLM Model Base Class Properties for each model*/ + modelArchitecture = ModelArchitectureType::VIT; + inputTypes = {FrameMetadata::FrameType::TEXT, + FrameMetadata::FrameType::ENCODED_IMAGE}; + outputTypes = {FrameMetadata::FrameType::IMAGE_EMBEDDING}; + useCases = {UseCase::OCR, UseCase::SCENE_DESCRIPTOR}; -class ClipEncoder::Detail { + /*Unique Model Properties*/ + modelPath = _modelPath; +} + +class ClipEncoder::Detail +{ public: - Detail(ClipEncoderProps &_props) : mProps(_props) { - - } + Detail(ClipEncoderProps &_props) : mProps(_props) { mClipContext = NULL; } - ~Detail() { } + ~Detail() {} - void setProps(ClipEncoderProps &_props) { - mProps = _props; - } + void setProps(ClipEncoderProps &_props) { mProps = _props; } public: ClipEncoderProps mProps; - clip_ctx * mClipContext = NULL; - llava_image_embed * storedData; + clip_ctx *mClipContext; + llava_image_embed *storedData; }; -ClipEncoder::ClipEncoder(ClipEncoderProps _props) : EncoderModelAbstract("ClipEncoder", _props) { +ClipEncoder::ClipEncoder(ClipEncoderProps _props) + : EncoderModelAbstract("ClipEncoder", _props) +{ mDetail.reset(new Detail(_props)); } ClipEncoder::~ClipEncoder() {} -bool ClipEncoder::validateUseCase(ClipEncoderProps::UseCase useCase) { - for(auto validUseCase: mDetail->mProps.useCases) { - if(validUseCase == useCase) { +bool ClipEncoder::validateUseCase(UseCase useCase) +{ + for (auto validUseCase : mDetail->mProps.useCases) + { + if (validUseCase == useCase) + { return true; } } + throw AIPException(AIP_FATAL, "Model cannot be used for the this 
use case"); return false; } -bool ClipEncoder::modelInit() { - mDetail->mClipContext = clip_model_load(mDetail->mProps.modelPath.c_str(), 1); +bool ClipEncoder::modelInit() +{ + int verbosity = 1; + mDetail->mClipContext = + clip_model_load(mDetail->mProps.modelPath.c_str(), verbosity); + if (!mDetail->mClipContext) + { + LOG_ERROR << "Cannot Load Clip Model"; + return false; + } return EncoderModelAbstract::init(); } -bool ClipEncoder::modelTerm() { - if(mDetail->mClipContext){ +bool ClipEncoder::modelTerm() +{ + if (mDetail->mClipContext) + { clip_free(mDetail->mClipContext); mDetail->mClipContext = NULL; } return EncoderModelAbstract::term(); } -bool ClipEncoder::modelInference(frame_container& frames) { - auto frame = frames.begin()->second; - auto imageBytes = static_cast(frame->data()); - auto imageBytesLength = frame->size(); - mDetail->storedData = llava_image_embed_make_with_bytes(mDetail->mClipContext, 8, imageBytes, imageBytesLength); +bool ClipEncoder::modelInference(frame_container &inputFrameContainer, + frame_container &outputFrameContainer, std::function makeFrame) +{ + auto outputPinId = inputFrameContainer.begin()->first; + auto inputFrame = inputFrameContainer.begin()->second; + mDetail->storedData = llava_image_embed_make_with_bytes( + mDetail->mClipContext, 8, + static_cast(inputFrame->data()), inputFrame->size()); + + auto outputFrame = makeFrame(getFrameSize()); + auto metaData = boost::shared_ptr( + new FrameMetadata(FrameMetadata::FrameType::IMAGE_EMBEDDING)); + outputFrame->setMetadata(metaData); + storeFrames(outputFrame); + outputFrameContainer.insert(make_pair(outputPinId, outputFrame)); return true; } -size_t ClipEncoder::getFrameSize() { +size_t ClipEncoder::getFrameSize() +{ return (clip_embd_nbytes(mDetail->mClipContext) + sizeof(int)); } -void ClipEncoder::getFrames(frame_sp& frame) { +void ClipEncoder::storeFrames(frame_sp &frame) +{ memcpy(frame->data(), mDetail->storedData, sizeof(llava_image_embed)); float *char_buffer = (float 
*)frame->data(); char_buffer += sizeof(llava_image_embed); - memcpy(char_buffer, mDetail->storedData->embed, clip_embd_nbytes(mDetail->mClipContext)); + memcpy(char_buffer, mDetail->storedData->embed, + clip_embd_nbytes(mDetail->mClipContext)); } \ No newline at end of file diff --git a/base/src/EncoderModelAbstract.cpp b/base/src/EncoderModelAbstract.cpp index a57bc5ada..f95b42234 100644 --- a/base/src/EncoderModelAbstract.cpp +++ b/base/src/EncoderModelAbstract.cpp @@ -1,34 +1,71 @@ #include "EncoderModelAbstract.h" -#include "FrameContainerQueue.h" -#include "Logger.h" -EncoderModelAbstract::EncoderModelAbstract(std::string name, EncoderModelAbstractProps _props) : myName(name) { - mQue.reset(new FrameContainerQueue(_props.qlen)); - mProps.reset(new EncoderModelAbstractProps(_props)); +EncoderModelAbstractProps::EncoderModelAbstractProps() +{ + modelArchitecture = ModelArchitectureType::BERT; + inputTypes = {FrameMetadata::FrameType::TEXT}; + outputTypes = {FrameMetadata::FrameType::TEXT_EMBEDDING}; + useCases = {UseCase::TEXT_TO_TEXT}; + qlen = 20; } -EncoderModelAbstract::~EncoderModelAbstract() { } +EncoderModelAbstractProps::EncoderModelAbstractProps( + ModelArchitectureType _modelArchitecture, + std::vector _inputTypes, + std::vector _outputTypes, + std::vector _useCases) +{ + modelArchitecture = _modelArchitecture; + inputTypes = _inputTypes; + outputTypes = _outputTypes; + useCases = _useCases; + qlen = 20; +} + +EncoderModelAbstract::EncoderModelAbstract(std::string _modelName, + EncoderModelAbstractProps _props) + : modelName(_modelName) +{ + mQue.reset(new FrameContainerQueue(_props.qlen)); + mProps.reset(new EncoderModelAbstractProps(_props)); +} -bool EncoderModelAbstract::init() { +EncoderModelAbstract::~EncoderModelAbstract() {} + +bool EncoderModelAbstract::init() +{ mQue->accept(); return true; } -bool EncoderModelAbstract::term() { +bool EncoderModelAbstract::term() +{ mQue->clear(); return true; } -bool EncoderModelAbstract::step() { - auto 
frames = mQue->pop(); - if (frames.size() == 0) { +bool EncoderModelAbstract::step(frame_container &outputFrameContainer, std::function makeFrame) +{ + auto inputFrameContainer = mQue->pop(); + if (inputFrameContainer.size() == 0) + { return true; } - bool ret = modelInference(frames); + bool ret = modelInference(inputFrameContainer, outputFrameContainer, makeFrame); return ret; } -bool EncoderModelAbstract::push(frame_container& frameContainer) { - mQue->push(frameContainer); +bool EncoderModelAbstract::push(frame_container &inputFrameContainer, + frame_container &outputFrameContainer, std::function makeFrame) +{ + mQue->push(inputFrameContainer); + while (mQue->size() != 0) + { + if (!step(outputFrameContainer, makeFrame)) + { + LOG_ERROR << "Step failed"; + return false; + } + } return true; } \ No newline at end of file diff --git a/base/src/Llava.cpp b/base/src/Llava.cpp index 77e5494e8..f965e18f0 100644 --- a/base/src/Llava.cpp +++ b/base/src/Llava.cpp @@ -1,189 +1,261 @@ +#include "Llava.h" +#include "Frame.h" #include "FrameMetadata.h" #include "FrameMetadataFactory.h" -#include "Frame.h" #include "Logger.h" #include "Utils.h" -#include "llama/common.h" -#include "llama/llama.h" -#include "llama/llava.h" -#include "Llava.h" +#include +#include +#include -class Llava::Detail { +LlavaProps::LlavaProps(std::string _modelPath, std::string _systemPrompt, std::string _userPrompt, int _contextSize, + int _batchSize, float _degreeOfRandomness, int _gpuLayers, + int _predictionLength) +{ + + /* Set LLM Model Base Class Properties for each model*/ + modelArchitecture = ModelArchitectureType::TRANSFORMER; + inputTypes = {FrameMetadata::FrameType::TEXT, + FrameMetadata::FrameType::IMAGE_EMBEDDING}; + outputTypes = {FrameMetadata::FrameType::TEXT}; + useCases = {UseCase::TEXT_TO_TEXT, UseCase::OCR, UseCase::SCENE_DESCRIPTOR}; + + /*Unique Model Properties*/ + modelPath = _modelPath; + systemPrompt = _systemPrompt; + userPrompt = _userPrompt; + degreeOfRandomness = 
_degreeOfRandomness; + contextSize = _contextSize; + batchSize = _batchSize; + gpuLayers = _gpuLayers; + predictionLength = _predictionLength; +} + +class Llava::Detail +{ public: - Detail(LlavaProps &_props) : mProps(_props) { - setContextSize(_props); - setBatchSize(_props); - setDegreeOfRandomness(_props); - setGpuLayers(_props); + Detail(LlavaProps &_props) : mProps(_props) + { + mLlavaContext = NULL; + systemPromptFlag = true; + nPast = 0; } - ~Detail() { - - } + ~Detail() {} - void setProps(LlavaProps &_props) { + void setProps(LlavaProps &_props) + { mProps = _props; - updateProps(_props); + setModelProps(_props); } - void updateProps(LlavaProps &_props) { + void setModelProps(LlavaProps &_props) + { + mLlavaModelParams = llama_model_default_params(); + mLlavaContextParams = llama_context_default_params(); setContextSize(_props); setBatchSize(_props); setDegreeOfRandomness(_props); setGpuLayers(_props); } - void setContextSize(LlavaProps &_props) { + void setContextSize(LlavaProps &_props) + { mLlavaContextParams.n_ctx = _props.contextSize; } - void setBatchSize(LlavaProps &_props) { + void setBatchSize(LlavaProps &_props) + { mLlavaContextParams.n_batch = _props.batchSize; } - void setDegreeOfRandomness(LlavaProps &_props) { + void setDegreeOfRandomness(LlavaProps &_props) + { mLlavaSamplingParams.temp = _props.degreeOfRandomness; } - void setGpuLayers(LlavaProps &_props) { + void setGpuLayers(LlavaProps &_props) + { mLlavaModelParams.n_gpu_layers = _props.gpuLayers; } - void compute(llama_context * llamaContext, std::vector tokens, int nBatch, int * nPast) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += nBatch) { - int nEval = (int) tokens.size() - i; - if (nEval > nBatch) { - nEval = nBatch; - } - if (llama_decode(llamaContext, llama_batch_get_one(&tokens[i], nEval, *nPast, 0))) { - LOG_ERROR << "LLAMA DECODE ERROR"; - break; - } - *nPast += nEval; + void compute(llama_context *llamaContext, std::vector tokens, + int nBatch, int *nPast) + { 
+ int N = (int)tokens.size(); + for (int i = 0; i < N; i += nBatch) + { + int nEval = (int)tokens.size() - i; + if (nEval > nBatch) + { + nEval = nBatch; + } + if (llama_decode(llamaContext, + llama_batch_get_one(&tokens[i], nEval, *nPast, 0))) + { + LOG_ERROR << "LLAMA DECODE ERROR"; + break; + } + *nPast += nEval; } } public: LlavaProps mProps; llama_model *mLlavaModel; - llama_context *mLlavaContext = NULL; + llama_context *mLlavaContext; llama_model_params mLlavaModelParams; llama_context_params mLlavaContextParams; llama_sampling_params mLlavaSamplingParams; std::string storedData; + bool systemPromptFlag; + int nPast; }; -Llava::Llava(LlavaProps _props) : LlmModelAbstract("Llava", _props) { +Llava::Llava(LlavaProps _props) : LlmModelAbstract("Llava", _props) +{ mDetail.reset(new Detail(_props)); } Llava::~Llava() {} -bool Llava::validateUseCase(LlavaProps::UseCase useCase) { - for(auto validUseCase: mDetail->mProps.useCases) { - if(validUseCase == useCase) { +bool Llava::validateUseCase(UseCase useCase) +{ + for (auto validUseCase : mDetail->mProps.useCases) + { + if (validUseCase == useCase) + { return true; } } + throw AIPException(AIP_FATAL, "Model cannot be used for the this use case"); return false; } -bool Llava::modelInit() { +bool Llava::modelInit() +{ llama_backend_init(false /*NUMA Architecure set to false*/); - - mDetail->mLlavaModelParams = llama_model_default_params(); - mDetail->mLlavaContextParams = llama_context_default_params(); - mDetail->updateProps(mDetail->mProps); - + mDetail->setModelProps(mDetail->mProps); mDetail->mLlavaModel = llama_load_model_from_file( mDetail->mProps.modelPath.c_str(), mDetail->mLlavaModelParams); mDetail->mLlavaContext = llama_new_context_with_model( mDetail->mLlavaModel, mDetail->mLlavaContextParams); + + if (!mDetail->mLlavaContext) + { + LOG_ERROR << "Cannot Load Llava Model"; + return false; + } return LlmModelAbstract::init(); } -bool Llava::modelTerm() { +bool Llava::modelTerm() +{ 
llama_free(mDetail->mLlavaContext); llama_free_model(mDetail->mLlavaModel); llama_backend_free(); return LlmModelAbstract::term(); } -bool Llava::modelInference(frame_container& frames) { +bool Llava::modelInference(frame_container &inputFrameContainer, frame_container &outputFrameContainer, std::function makeFrame) +{ /*Parameter Declaration*/ - auto frame = frames.begin()->second; - auto frameType = frame->getMetadata()->getFrameType(); - int nPast = 0; - std::string systemPrompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; - std::string userPrompt = mDetail->mProps.prompt; - const bool add_bos = llama_should_add_bos_token(llama_get_model(mDetail->mLlavaContext)); + auto outputPinId = inputFrameContainer.begin()->first; + auto inputFrame = inputFrameContainer.begin()->second; + auto frameType = inputFrame->getMetadata()->getFrameType(); + + std::string systemPrompt = mDetail->mProps.systemPrompt; + std::string userPrompt = mDetail->mProps.userPrompt; + const bool add_bos = + llama_should_add_bos_token(llama_get_model(mDetail->mLlavaContext)); int nPredict = mDetail->mProps.predictionLength; int nBatch = mDetail->mProps.batchSize; /*System Prompt Tokenization*/ - std::vector systemPromptTokens = ::llama_tokenize(mDetail->mLlavaContext, systemPrompt, add_bos); - mDetail->compute(mDetail->mLlavaContext, systemPromptTokens, nBatch, &nPast); - - if(frameType == FrameMetadata::FrameType::IMAGE_EMBEDDING){ + if(mDetail->systemPromptFlag){ + std::vector systemPromptTokens = + ::llama_tokenize(mDetail->mLlavaContext, systemPrompt, add_bos); + mDetail->compute(mDetail->mLlavaContext, systemPromptTokens, nBatch, &mDetail->nPast); + mDetail->systemPromptFlag = false; + LOG_ERROR << "Loaded System Prompt"; + } + + if (frameType == FrameMetadata::FrameType::IMAGE_EMBEDDING) + { /*Image Embed Tokenization*/ - auto imageEmbed = static_cast(frame->data()); - 
llava_eval_image_embed(mDetail->mLlavaContext, imageEmbed, nBatch, &nPast); + auto imageEmbedding = static_cast(inputFrame->data()); + llava_eval_image_embed(mDetail->mLlavaContext, imageEmbedding, nBatch, &mDetail->nPast); } - else if(frameType == FrameMetadata::FrameType::TEXT){ + else if (frameType == FrameMetadata::FrameType::TEXT) + { /*Text Embed Tokenization*/ - auto textEmbed = static_cast(frame->data()); + auto textEmbed = static_cast(inputFrame->data()); std::string textEmbedPrompt(textEmbed); - std::vector textEmbedTokens = ::llama_tokenize(mDetail->mLlavaContext, textEmbedPrompt, false); - mDetail->compute(mDetail->mLlavaContext, textEmbedTokens, nBatch, &nPast); + std::vector textEmbedTokens = + ::llama_tokenize(mDetail->mLlavaContext, textEmbedPrompt, false); + mDetail->compute(mDetail->mLlavaContext, textEmbedTokens, nBatch, &mDetail->nPast); } - + /*User Prompt Tokenization*/ - std::vector userPromptTokens = ::llama_tokenize(mDetail->mLlavaContext, (userPrompt + "\nASSISTANT:").c_str(), false); - mDetail->compute(mDetail->mLlavaContext, userPromptTokens, nBatch, &nPast); + std::vector userPromptTokens = ::llama_tokenize( + mDetail->mLlavaContext, (userPrompt + "\nASSISTANT:").c_str(), false); + mDetail->compute(mDetail->mLlavaContext, userPromptTokens, nBatch, &mDetail->nPast); std::string output = ""; - + std::cout << "\n"; /*Prediction token by token*/ - for(int i = 0; i < nPredict; i++) { + for (int i = 0; i < nPredict; i++) + { llama_token id = 0; - auto logits = llama_get_logits(mDetail->mLlavaContext); - auto n_vocab = llama_n_vocab(llama_get_model(mDetail->mLlavaContext)); + auto logits = llama_get_logits(mDetail->mLlavaContext); + auto nVocab = llama_n_vocab(llama_get_model(mDetail->mLlavaContext)); std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + candidates.reserve(nVocab); + for 
(llama_token tokenId = 0; tokenId < nVocab; tokenId++) + { + candidates.emplace_back( + llama_token_data{tokenId, logits[tokenId], 0.0f}); } - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - id = llama_sample_token_greedy(mDetail->mLlavaContext, &candidates_p); + llama_token_data_array candidatesP = {candidates.data(), candidates.size(), + false}; + id = llama_sample_token_greedy(mDetail->mLlavaContext, &candidatesP); - if (id == llama_token_eos(llama_get_model(mDetail->mLlavaContext))) { - break; + if (id == llama_token_eos(llama_get_model(mDetail->mLlavaContext))) + { + break; } std::string ret = llama_token_to_piece(mDetail->mLlavaContext, id); output += ret; - std::cout << ret; + std::cout << ret << std::flush; std::vector tokens; tokens.push_back(id); - mDetail->compute(mDetail->mLlavaContext, tokens, 1, &nPast); + mDetail->compute(mDetail->mLlavaContext, tokens, 1, &mDetail->nPast); } mDetail->storedData = output; + auto outputFrame = makeFrame(getFrameSize()); + auto metaData = boost::shared_ptr( + new FrameMetadata(FrameMetadata::FrameType::TEXT)); + outputFrame->setMetadata(metaData); + storeFrames(outputFrame); + outputFrameContainer.insert(make_pair(outputPinId, outputFrame)); return true; } -size_t Llava::getFrameSize() { +size_t Llava::getFrameSize() +{ return (mDetail->storedData.length() + 1); /* Add 1 more byte for /0 for conversion from std::string to char* */ } -void Llava::getFrames(frame_sp& frame) { +void Llava::storeFrames(frame_sp &frame) +{ memcpy(frame->data(), mDetail->storedData.c_str(), frame->size()); } \ No newline at end of file diff --git a/base/src/LlmModelAbstract.cpp b/base/src/LlmModelAbstract.cpp index 058520048..50b849f8f 100644 --- a/base/src/LlmModelAbstract.cpp +++ b/base/src/LlmModelAbstract.cpp @@ -1,33 +1,74 @@ #include "LlmModelAbstract.h" -#include "FrameContainerQueue.h" -LlmModelAbstract::LlmModelAbstract(std::string name, LlmModelAbstractProps _props) : myName(name) { 
+LlmModelAbstractProps::LlmModelAbstractProps() +{ + modelArchitecture = ModelArchitectureType::TRANSFORMER; + inputTypes = {FrameMetadata::FrameType::TEXT}; + outputTypes = {FrameMetadata::FrameType::TEXT}; + useCases = {UseCase::TEXT_TO_TEXT}; + qlen = 20; +} + +LlmModelAbstractProps::LlmModelAbstractProps( + ModelArchitectureType _modelArchitecture, + std::vector _inputTypes, + std::vector _outputTypes, + std::vector _useCases) +{ + modelArchitecture = _modelArchitecture; + inputTypes = _inputTypes; + outputTypes = _outputTypes; + useCases = _useCases; + qlen = 20; +} + +LlmModelAbstract::LlmModelAbstract(std::string _modelName, + LlmModelAbstractProps _props) + : modelName(_modelName) +{ mQue.reset(new FrameContainerQueue(_props.qlen)); - mProps.reset(new LlmModelAbstractProps(_props)); + mProps.reset(new LlmModelAbstractProps(_props)); } LlmModelAbstract::~LlmModelAbstract() {} -bool LlmModelAbstract::init() { +bool LlmModelAbstract::init() +{ mQue->accept(); return true; } -bool LlmModelAbstract::term() { +bool LlmModelAbstract::term() +{ mQue->clear(); return true; } -bool LlmModelAbstract::step() { - auto frames = mQue->pop(); - if (frames.size() == 0) { +bool LlmModelAbstract::step(frame_container &outputFrameContainer, + std::function makeFrame) +{ + auto inputFrameContainer = mQue->pop(); + if (inputFrameContainer.size() == 0) + { return true; } - bool ret = modelInference(frames); + bool ret = + modelInference(inputFrameContainer, outputFrameContainer, makeFrame); return ret; } -bool LlmModelAbstract::push(frame_container& frameContainer) { - mQue->push(frameContainer); +bool LlmModelAbstract::push(frame_container &inputFrameContainer, + frame_container &outputFrameContainer, + std::function makeFrame) +{ + mQue->push(inputFrameContainer); + while (mQue->size() != 0) + { + if (!step(outputFrameContainer, makeFrame)) + { + LOG_ERROR << "Step failed"; + return false; + } + } return true; } \ No newline at end of file diff --git 
a/base/src/ModelStrategy.cpp b/base/src/ModelStrategy.cpp index 592cc6552..3939df69b 100644 --- a/base/src/ModelStrategy.cpp +++ b/base/src/ModelStrategy.cpp @@ -1,66 +1,65 @@ #include "ModelStrategy.h" +#include "ClipEncoder.h" +#include "Llava.h" -ModelStrategy::ModelStrategy() { +ModelStrategy::ModelStrategy() {} -} - -ModelStrategy::~ModelStrategy() { - -} - -boost::shared_ptr ModelStrategy::create(ModelStrategyType type) { - switch (type) { - case ModelStrategyType::LLAVA_SCENE_DESCRIPTOR: - return boost::make_shared(); - case ModelStrategyType::LLAVA_TEXT_TO_TEXT: - return boost::make_shared(); - default: - return boost::make_shared(); - break; - } -} +ModelStrategy::~ModelStrategy() {} /*LLAVA SCENE-DESCRIPTOR STRATEGY*/ -SceneDescriptorModelStrategy::SceneDescriptorModelStrategy() : ModelStrategy() { - auto clipProps = ClipEncoderProps("./data/llm/llava/llava-v1.6-7b/mmproj-model-f16.gguf"); - auto llavaProps = LlavaProps("./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", "Describe the image", 2048, 512, 0.8, 10, 256); - - encoderModel = boost::shared_ptr(new ClipEncoder(clipProps)); +SceneDescriptorModelStrategy::SceneDescriptorModelStrategy( + SceneDescriptorXFormProps props) + : ModelStrategy() +{ + auto clipProps = ClipEncoderProps(props.encoderModelPath); + auto llavaProps = + LlavaProps(props.llmModelPath, props.systemPrompt, props.userPrompt, 4096, + 512, 0.8, props.gpuLayers, 256); + + encoderModel = + boost::shared_ptr(new ClipEncoder(clipProps)); llmModel = boost::shared_ptr(new Llava(llavaProps)); } -SceneDescriptorModelStrategy::~SceneDescriptorModelStrategy() { - -} +SceneDescriptorModelStrategy::~SceneDescriptorModelStrategy() {} -bool SceneDescriptorModelStrategy::initStrategy() { +bool SceneDescriptorModelStrategy::initStrategy() +{ encoderModel->modelInit(); llmModel->modelInit(); return true; } -bool SceneDescriptorModelStrategy::termStrategy() { +bool SceneDescriptorModelStrategy::termStrategy() +{ 
encoderModel->modelTerm(); llmModel->modelTerm(); return true; } /*LLAVE TEXT-TO-TEXT STRATEGY*/ -LlavaTextToTextModelStrategy::LlavaTextToTextModelStrategy() : ModelStrategy() { - auto llavaProps = LlavaProps("./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", "Tell me a story", 2048, 512, 0.8, 10, 256); +LlavaTextToTextModelStrategy::LlavaTextToTextModelStrategy() : ModelStrategy() +{ + auto llavaProps = LlavaProps( + "./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", + "A chat between a curious human and an artificial intelligence " + "assistant. The assistant gives helpful, detailed, and polite answers " + "to the human's questions.\nUSER:", + "Tell me a story", 2048, 512, 0.8, 10, 256); + ; llmModel = boost::shared_ptr(new Llava(llavaProps)); } -LlavaTextToTextModelStrategy::~LlavaTextToTextModelStrategy() { - -} +LlavaTextToTextModelStrategy::~LlavaTextToTextModelStrategy() {} -bool LlavaTextToTextModelStrategy::initStrategy() { +bool LlavaTextToTextModelStrategy::initStrategy() +{ llmModel->modelInit(); return true; } -bool LlavaTextToTextModelStrategy::termStrategy() { +bool LlavaTextToTextModelStrategy::termStrategy() +{ llmModel->modelTerm(); return true; } \ No newline at end of file diff --git a/base/src/SceneDescriptorXForm.cpp b/base/src/SceneDescriptorXForm.cpp index ece582031..54fcaf7f9 100644 --- a/base/src/SceneDescriptorXForm.cpp +++ b/base/src/SceneDescriptorXForm.cpp @@ -1,28 +1,44 @@ #include "SceneDescriptorXForm.h" -#include "ClipEncoder.h" -#include "Llava.h" #include "ModelStrategy.h" -class SceneDescriptorXForm::Detail { +SceneDescriptorXFormProps::SceneDescriptorXFormProps( + ModelStrategyType _modelStrategyType, std::string _encoderModelPath, + std::string _llmModelPath, std::string _systemPrompt, + std::string _userPrompt, int _gpuLayers) +{ + modelStrategyType = _modelStrategyType; + encoderModelPath = _encoderModelPath; + llmModelPath = _llmModelPath; + systemPrompt = _systemPrompt; + userPrompt = 
_userPrompt; + gpuLayers = _gpuLayers; +} + +class SceneDescriptorXForm::Detail +{ public: - Detail(SceneDescriptorXFormProps &_props) : mProps(_props) { + Detail(SceneDescriptorXFormProps &_props) : mProps(_props) + { setModelStrategy(_props); } ~Detail() {} - void setProps(SceneDescriptorXFormProps &props) { mProps = props; } + void setProps(SceneDescriptorXFormProps &_props) { mProps = _props; } - void setModelStrategy(SceneDescriptorXFormProps &props) { - switch (props.modelStrategyType) { - case SceneDescriptorXFormProps::SceneDescriptorStrategy::LLAVA: - modelStrategyType = ModelStrategy::ModelStrategyType::LLAVA_SCENE_DESCRIPTOR; + void setModelStrategy(SceneDescriptorXFormProps &_props) + { + switch (_props.modelStrategyType) + { + case SceneDescriptorXFormProps::ModelStrategyType::LLAVA: + modelStrategyType = + ModelStrategy::ModelStrategyType::LLAVA_SCENE_DESCRIPTOR; break; default: LOG_ERROR << "Please choose a valid model strategy!!!\n"; break; } - modelStrategy = ModelStrategy::create(modelStrategyType); + modelStrategy = ModelStrategy::create(modelStrategyType, _props); } public: @@ -34,14 +50,17 @@ class SceneDescriptorXForm::Detail { }; SceneDescriptorXForm::SceneDescriptorXForm(SceneDescriptorXFormProps _props) - : Module(TRANSFORM, "SceneDescriptorXForm", _props) { + : Module(TRANSFORM, "SceneDescriptorXForm", _props) +{ mDetail.reset(new Detail(_props)); } SceneDescriptorXForm::~SceneDescriptorXForm() {} -bool SceneDescriptorXForm::validateInputPins() { - if (getNumberOfInputPins() != 1) { +bool SceneDescriptorXForm::validateInputPins() +{ + if (getNumberOfInputPins() != 1) + { LOG_ERROR << "<" << getId() << ">::validateInputPins size is expected to be 1. 
Actual<" << getNumberOfInputPins() << ">"; @@ -52,7 +71,8 @@ bool SceneDescriptorXForm::validateInputPins() { FrameMetadata::FrameType frameType = metadata->getFrameType(); - if (frameType != FrameMetadata::FrameType::ENCODED_IMAGE) { + if (frameType != FrameMetadata::FrameType::ENCODED_IMAGE) + { LOG_ERROR << "<" << getId() << ">::validateInputPins input frameType is expected to be " "Audio. Actual<" @@ -61,7 +81,8 @@ bool SceneDescriptorXForm::validateInputPins() { } FrameMetadata::MemType memType = metadata->getMemType(); - if (memType != FrameMetadata::MemType::HOST) { + if (memType != FrameMetadata::MemType::HOST) + { LOG_ERROR << "<" << getId() << ">::validateInputPins input memType is expected to be HOST. Actual<" @@ -71,8 +92,10 @@ bool SceneDescriptorXForm::validateInputPins() { return true; } -bool SceneDescriptorXForm::validateOutputPins() { - if (getNumberOfOutputPins() != 1) { +bool SceneDescriptorXForm::validateOutputPins() +{ + if (getNumberOfOutputPins() != 1) + { LOG_ERROR << "<" << getId() << ">::validateOutputPins size is expected to be 1. Actual<" << getNumberOfOutputPins() << ">"; @@ -81,7 +104,8 @@ bool SceneDescriptorXForm::validateOutputPins() { framemetadata_sp metadata = getFirstOutputMetadata(); FrameMetadata::FrameType frameType = metadata->getFrameType(); - if (frameType != FrameMetadata::FrameType::TEXT) { + if (frameType != FrameMetadata::FrameType::TEXT) + { LOG_ERROR << "<" << getId() << ">::validateOutputPins input frameType is expected to be " "TEXT. 
Actual<" @@ -93,7 +117,8 @@ bool SceneDescriptorXForm::validateOutputPins() { } void SceneDescriptorXForm::addInputPin(framemetadata_sp &metadata, - string &pinId) { + string &pinId) +{ Module::addInputPin(metadata, pinId); mDetail->mOutputMetadata = framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::TEXT)); @@ -101,74 +126,80 @@ void SceneDescriptorXForm::addInputPin(framemetadata_sp &metadata, mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata); } -bool SceneDescriptorXForm::init() { +bool SceneDescriptorXForm::init() +{ bool ret = mDetail->modelStrategy->initStrategy(); - if (!ret) { + if (!ret) + { return false; } return Module::init(); } -bool SceneDescriptorXForm::term() { +bool SceneDescriptorXForm::term() +{ bool ret = mDetail->modelStrategy->termStrategy(); - if (!ret) { + if (!ret) + { return false; } return Module::term(); } -bool SceneDescriptorXForm::process(frame_container &frames) { +bool SceneDescriptorXForm::process(frame_container &frames) +{ /*Encoder Model*/ - mDetail->modelStrategy->encoderModel->push(frames); - mDetail->modelStrategy->encoderModel->step(); - auto clipFrame = - makeFrame(mDetail->modelStrategy->encoderModel->getFrameSize()); - auto clipMetaData = boost::shared_ptr( - new FrameMetadata(FrameMetadata::FrameType::IMAGE_EMBEDDING)); - clipFrame->setMetadata(clipMetaData); - mDetail->modelStrategy->encoderModel->getFrames(clipFrame); - frame_container clipFrames; - clipFrames.insert(make_pair(mDetail->mOutputPinId, clipFrame)); + mDetail->modelStrategy->encoderModel->push(frames, clipFrames, [&](size_t size) -> frame_sp {return makeFrame(size, mDetail->mOutputPinId); }); /*LLM Model*/ - mDetail->modelStrategy->llmModel->push(clipFrames); - mDetail->modelStrategy->llmModel->step(); - auto outFrame = makeFrame(mDetail->modelStrategy->llmModel->getFrameSize()); - mDetail->modelStrategy->llmModel->getFrames(outFrame); + frame_container llavaFrames; + mDetail->modelStrategy->llmModel->push(clipFrames, llavaFrames, 
[&](size_t size) -> frame_sp {return makeFrame(size, mDetail->mOutputPinId); }); + auto outFrame = llavaFrames.begin()->second; frames.insert(make_pair(mDetail->mOutputPinId, outFrame)); send(frames); return true; } -void SceneDescriptorXForm::setMetadata(framemetadata_sp &metadata) { - if (!metadata->isSet()) { +void SceneDescriptorXForm::setMetadata(framemetadata_sp &metadata) +{ + if (!metadata->isSet()) + { return; } } -bool SceneDescriptorXForm::processSOS(frame_sp &frame) { +bool SceneDescriptorXForm::processSOS(frame_sp &frame) +{ auto metadata = frame->getMetadata(); setMetadata(metadata); return true; } -SceneDescriptorXFormProps SceneDescriptorXForm::getProps() { +SceneDescriptorXFormProps SceneDescriptorXForm::getProps() +{ fillProps(mDetail->mProps); return mDetail->mProps; } -bool SceneDescriptorXForm::handlePropsChange(frame_sp &frame) { - SceneDescriptorXFormProps props(mDetail->mProps.modelStrategyType); +bool SceneDescriptorXForm::handlePropsChange(frame_sp &frame) +{ + SceneDescriptorXFormProps props( + mDetail->mProps.modelStrategyType, mDetail->mProps.encoderModelPath, + mDetail->mProps.llmModelPath, mDetail->mProps.systemPrompt, + mDetail->mProps.userPrompt, mDetail->mProps.gpuLayers); auto ret = Module::handlePropsChange(frame, props); mDetail->setProps(props); return ret; } -void SceneDescriptorXForm::setProps(SceneDescriptorXFormProps &props) { - if (props.modelStrategyType != mDetail->mProps.modelStrategyType) { - throw AIPException(AIP_FATAL, "Model Strategy Type dynamic change not handled"); - } +void SceneDescriptorXForm::setProps(SceneDescriptorXFormProps &props) +{ + if (props.modelStrategyType != mDetail->mProps.modelStrategyType) + { + throw AIPException(AIP_FATAL, + "Model Strategy Type dynamic change not handled"); + } Module::addPropsToQueue(props); } \ No newline at end of file diff --git a/base/test/llavamodel_tests.cpp b/base/test/llavamodel_tests.cpp index 5fad3ddfa..a22aefb07 100644 --- a/base/test/llavamodel_tests.cpp +++ 
b/base/test/llavamodel_tests.cpp @@ -1,7 +1,7 @@ #include #include "stdafx.h" -#include -#include +#include +#include #include "FrameMetadata.h" #include "FrameMetadataFactory.h" @@ -11,7 +11,6 @@ #include "PipeLine.h" #include "FileWriterModule.h" #include "FileReaderModule.h" -#include "FileWriterModule.h" #include "LlmModelAbstract.h" #include "Llava.h" #include "Module.h" @@ -21,7 +20,12 @@ BOOST_AUTO_TEST_SUITE(llavamodel_test) BOOST_AUTO_TEST_CASE(llava_init) { - auto llavaProps = LlavaProps("C:/Users/developer/ws_kushal/llm-integration-branch/ApraPipes/data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", "Tell a story", 2048, 512, 0.8, 5, 50); + auto llavaProps = LlavaProps( + "./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", + "A chat between a curious human and an artificial intelligence " + "assistant. The assistant gives helpful, detailed, and polite answers " + "to the human's questions.\nUSER:", + "Describe the image", 2048, 512, 0.8, 10, 256); auto llavaModel = boost::shared_ptr(new Llava(llavaProps)); llavaModel->modelInit(); llavaModel->modelTerm(); diff --git a/base/test/sceneDescriptorXForm_tests.cpp b/base/test/sceneDescriptorXForm_tests.cpp index 99b4c099d..ea0db6519 100644 --- a/base/test/sceneDescriptorXForm_tests.cpp +++ b/base/test/sceneDescriptorXForm_tests.cpp @@ -1,7 +1,7 @@ #include #include "stdafx.h" -#include -#include +#include +#include #include "FrameMetadata.h" #include "FrameMetadataFactory.h" @@ -11,7 +11,6 @@ #include "PipeLine.h" #include "FileWriterModule.h" #include "FileReaderModule.h" -#include "FileWriterModule.h" #include "SceneDescriptorXForm.h" #include "ModelStrategy.h" #include "Module.h" @@ -21,36 +20,54 @@ BOOST_AUTO_TEST_SUITE(sceneDescriptorXForm_tests) BOOST_AUTO_TEST_CASE(testing) { - std::vector sceneDescriptorOutText = { "./data/sceneDescriptor_out.txt" }; - Test_Utils::FileCleaner f(sceneDescriptorOutText); - - Logger::setLogLevel(boost::log::trivial::severity_level::info); - - auto 
fileReaderProps = FileReaderModuleProps("./data/1280x960.jpg"); - fileReaderProps.readLoop = false; - auto fileReader = boost::shared_ptr(new FileReaderModule(fileReaderProps)); - auto metadata = framemetadata_sp(new FrameMetadata(FrameMetadata::ENCODED_IMAGE)); - auto pinId = fileReader->addOutputPin(metadata); - - auto sceneDescriptorProps = SceneDescriptorXFormProps(SceneDescriptorXFormProps::SceneDescriptorStrategy::LLAVA); - auto sceneDescriptor = boost::shared_ptr(new SceneDescriptorXForm(sceneDescriptorProps)); - fileReader->setNext(sceneDescriptor); - - auto outputFile = boost::shared_ptr(new FileWriterModule(FileWriterModuleProps(sceneDescriptorOutText[0], false))); - sceneDescriptor->setNext(outputFile); - - BOOST_TEST(fileReader->init()); - BOOST_TEST(sceneDescriptor->init()); - BOOST_TEST(outputFile->init()); - - fileReader->step(); - sceneDescriptor->step(); - outputFile->step(); - - std::ifstream in_file_text(sceneDescriptorOutText[0]); - std::ostringstream buffer; - buffer << in_file_text.rdbuf(); - in_file_text.close(); + std::vector sceneDescriptorOutText = { + "./data/sceneDescriptor_out.txt"}; + Test_Utils::FileCleaner f(sceneDescriptorOutText); + + Logger::setLogLevel(boost::log::trivial::severity_level::info); + + auto fileReaderProps = FileReaderModuleProps("./data/theft/Image_???.jpeg"); + fileReaderProps.readLoop = false; + auto fileReader = boost::shared_ptr( + new FileReaderModule(fileReaderProps)); + auto metadata = + framemetadata_sp(new FrameMetadata(FrameMetadata::ENCODED_IMAGE)); + auto pinId = fileReader->addOutputPin(metadata); + + auto sceneDescriptorProps = SceneDescriptorXFormProps( + SceneDescriptorXFormProps::ModelStrategyType::LLAVA, + "./data/llm/llava/llava-v1.6-7b/mmproj-model-f16.gguf", + "./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", + "You are a Spy AI. Your task is to analyze the provided images and " + "identify any potential suspicious activity. 
Look for anomalies, unusual " + "behavior, or anything that raises concerns. Describe what you find and " + "explain why it might be considered suspicious. Your analysis should " + "consider various factors such as context, environmental cues, and human " + "behavior patterns. Be detailed and provide insights into your thought " + "process as you assess the image and don't hallucinate.\nUSER:", + "Tell me the percentage of suspicious activity based on the context of the images", 10); + auto sceneDescriptor = boost::shared_ptr( + new SceneDescriptorXForm(sceneDescriptorProps)); + fileReader->setNext(sceneDescriptor); + + auto outputFile = boost::shared_ptr(new FileWriterModule( + FileWriterModuleProps(sceneDescriptorOutText[0], false))); + sceneDescriptor->setNext(outputFile); + + BOOST_TEST(fileReader->init()); + BOOST_TEST(sceneDescriptor->init()); + BOOST_TEST(outputFile->init()); + + for(int i = 0; i < 4; i++){ + fileReader->step(); + sceneDescriptor->step(); + outputFile->step(); + } + + std::ifstream in_file_text(sceneDescriptorOutText[0]); + std::ostringstream buffer; + buffer << in_file_text.rdbuf(); + in_file_text.close(); } BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/thirdparty/custom-overlay/llama/portfile.cmake b/thirdparty/custom-overlay/llama/portfile.cmake index bab45f3b5..b43ac2ac4 100644 --- a/thirdparty/custom-overlay/llama/portfile.cmake +++ b/thirdparty/custom-overlay/llama/portfile.cmake @@ -29,9 +29,9 @@ vcpkg_cmake_configure( vcpkg_cmake_install() vcpkg_cmake_config_fixup( - CONFIG_PATH lib/cmake/llama - PACKAGE_NAME llama - ) + CONFIG_PATH lib/cmake/Llama + PACKAGE_NAME Llama +) vcpkg_copy_pdbs() file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include") From 0a42a706c8e4cbbb0c2aa252157534ac675241ac Mon Sep 17 00:00:00 2001 From: Kushal Jain Date: Fri, 3 May 2024 18:33:51 +0530 Subject: [PATCH 13/13] change SceneDescriptorXForm Module name to ImageToTextXForm Module --- base/CMakeLists.txt | 6 +- 
...neDescriptorXForm.h => ImageToTextXForm.h} | 14 ++-- base/include/ModelStrategy.h | 10 +-- ...scriptorXForm.cpp => ImageToTextXForm.cpp} | 46 ++++++------ base/src/Llava.cpp | 12 ++- base/src/ModelStrategy.cpp | 10 +-- base/test/imageToTextXForm_tests.cpp | 65 +++++++++++++++++ base/test/sceneDescriptorXForm_tests.cpp | 73 ------------------- 8 files changed, 116 insertions(+), 120 deletions(-) rename base/include/{SceneDescriptorXForm.h => ImageToTextXForm.h} (81%) rename base/src/{SceneDescriptorXForm.cpp => ImageToTextXForm.cpp} (77%) create mode 100644 base/test/imageToTextXForm_tests.cpp delete mode 100644 base/test/sceneDescriptorXForm_tests.cpp diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index d278156c9..1a0e29319 100755 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -289,7 +289,7 @@ SET(IP_FILES src/EncoderModelAbstract.cpp src/Llava.cpp src/ClipEncoder.cpp - src/SceneDescriptorXForm.cpp + src/ImageToTextXForm.cpp ) SET(IP_FILES_H @@ -321,7 +321,7 @@ SET(IP_FILES_H include/EncoderModelAbstract.h include/Llava.h include/ClipEncoder.h - include/SceneDescriptorXForm.h + include/ImageToTextXForm.h ) SET(CUDA_CORE_FILES @@ -577,7 +577,7 @@ SET(UT_FILES test/testSignalGeneratorSrc_tests.cpp test/audioToTextXform_tests.cpp test/llavamodel_tests.cpp - test/sceneDescriptorXForm_tests.cpp + test/imageToTextXForm_tests.cpp ${ARM64_UT_FILES} ${CUDA_UT_FILES} ) diff --git a/base/include/SceneDescriptorXForm.h b/base/include/ImageToTextXForm.h similarity index 81% rename from base/include/SceneDescriptorXForm.h rename to base/include/ImageToTextXForm.h index 90d59bd9c..dff717531 100644 --- a/base/include/SceneDescriptorXForm.h +++ b/base/include/ImageToTextXForm.h @@ -2,7 +2,7 @@ #include "Module.h" -class SceneDescriptorXFormProps : public ModuleProps +class ImageToTextXFormProps : public ModuleProps { public: enum ModelStrategyType @@ -10,7 +10,7 @@ class SceneDescriptorXFormProps : public ModuleProps LLAVA = 0 }; - 
SceneDescriptorXFormProps(ModelStrategyType _modelStrategyType, + ImageToTextXFormProps(ModelStrategyType _modelStrategyType, std::string _encoderModelPath, std::string _llmModelPath, std::string _systemPrompt, std::string _userPrompt, @@ -44,15 +44,15 @@ class SceneDescriptorXFormProps : public ModuleProps } }; -class SceneDescriptorXForm : public Module +class ImageToTextXForm : public Module { public: - SceneDescriptorXForm(SceneDescriptorXFormProps _props); - virtual ~SceneDescriptorXForm(); + ImageToTextXForm(ImageToTextXFormProps _props); + virtual ~ImageToTextXForm(); bool init(); bool term(); - void setProps(SceneDescriptorXFormProps &props); - SceneDescriptorXFormProps getProps(); + void setProps(ImageToTextXFormProps &props); + ImageToTextXFormProps getProps(); protected: bool process(frame_container &frames); diff --git a/base/include/ModelStrategy.h b/base/include/ModelStrategy.h index cea24ffe0..8df66a1f4 100644 --- a/base/include/ModelStrategy.h +++ b/base/include/ModelStrategy.h @@ -1,6 +1,6 @@ #pragma once -#include "SceneDescriptorXForm.h" +#include "ImageToTextXForm.h" #include "EncoderModelAbstract.h" #include "LlmModelAbstract.h" @@ -29,11 +29,11 @@ class ModelStrategy boost::shared_ptr llmModel; }; -class SceneDescriptorModelStrategy : public ModelStrategy +class ImageToTextModelStrategy : public ModelStrategy { public: - SceneDescriptorModelStrategy(SceneDescriptorXFormProps props); - ~SceneDescriptorModelStrategy(); + ImageToTextModelStrategy(ImageToTextXFormProps props); + ~ImageToTextModelStrategy(); bool initStrategy() override; bool termStrategy() override; @@ -56,7 +56,7 @@ boost::shared_ptr ModelStrategy::create(ModelStrategyType type, switch (type) { case ModelStrategyType::LLAVA_SCENE_DESCRIPTOR: - return boost::make_shared(props); + return boost::make_shared(props); case ModelStrategyType::LLAVA_TEXT_TO_TEXT: return boost::make_shared(); default: diff --git a/base/src/SceneDescriptorXForm.cpp b/base/src/ImageToTextXForm.cpp 
similarity index 77% rename from base/src/SceneDescriptorXForm.cpp rename to base/src/ImageToTextXForm.cpp index 54fcaf7f9..6968c4117 100644 --- a/base/src/SceneDescriptorXForm.cpp +++ b/base/src/ImageToTextXForm.cpp @@ -1,7 +1,7 @@ -#include "SceneDescriptorXForm.h" +#include "ImageToTextXForm.h" #include "ModelStrategy.h" -SceneDescriptorXFormProps::SceneDescriptorXFormProps( +ImageToTextXFormProps::ImageToTextXFormProps( ModelStrategyType _modelStrategyType, std::string _encoderModelPath, std::string _llmModelPath, std::string _systemPrompt, std::string _userPrompt, int _gpuLayers) @@ -14,22 +14,22 @@ SceneDescriptorXFormProps::SceneDescriptorXFormProps( gpuLayers = _gpuLayers; } -class SceneDescriptorXForm::Detail +class ImageToTextXForm::Detail { public: - Detail(SceneDescriptorXFormProps &_props) : mProps(_props) + Detail(ImageToTextXFormProps &_props) : mProps(_props) { setModelStrategy(_props); } ~Detail() {} - void setProps(SceneDescriptorXFormProps &_props) { mProps = _props; } + void setProps(ImageToTextXFormProps &_props) { mProps = _props; } - void setModelStrategy(SceneDescriptorXFormProps &_props) + void setModelStrategy(ImageToTextXFormProps &_props) { switch (_props.modelStrategyType) { - case SceneDescriptorXFormProps::ModelStrategyType::LLAVA: + case ImageToTextXFormProps::ModelStrategyType::LLAVA: modelStrategyType = ModelStrategy::ModelStrategyType::LLAVA_SCENE_DESCRIPTOR; break; @@ -44,20 +44,20 @@ class SceneDescriptorXForm::Detail public: framemetadata_sp mOutputMetadata; std::string mOutputPinId; - SceneDescriptorXFormProps mProps; + ImageToTextXFormProps mProps; ModelStrategy::ModelStrategyType modelStrategyType; boost::shared_ptr modelStrategy; }; -SceneDescriptorXForm::SceneDescriptorXForm(SceneDescriptorXFormProps _props) - : Module(TRANSFORM, "SceneDescriptorXForm", _props) +ImageToTextXForm::ImageToTextXForm(ImageToTextXFormProps _props) + : Module(TRANSFORM, "ImageToTextXForm", _props) { mDetail.reset(new Detail(_props)); } 
-SceneDescriptorXForm::~SceneDescriptorXForm() {} +ImageToTextXForm::~ImageToTextXForm() {} -bool SceneDescriptorXForm::validateInputPins() +bool ImageToTextXForm::validateInputPins() { if (getNumberOfInputPins() != 1) { @@ -92,7 +92,7 @@ bool SceneDescriptorXForm::validateInputPins() return true; } -bool SceneDescriptorXForm::validateOutputPins() +bool ImageToTextXForm::validateOutputPins() { if (getNumberOfOutputPins() != 1) { @@ -116,7 +116,7 @@ bool SceneDescriptorXForm::validateOutputPins() return true; } -void SceneDescriptorXForm::addInputPin(framemetadata_sp &metadata, +void ImageToTextXForm::addInputPin(framemetadata_sp &metadata, string &pinId) { Module::addInputPin(metadata, pinId); @@ -126,7 +126,7 @@ void SceneDescriptorXForm::addInputPin(framemetadata_sp &metadata, mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata); } -bool SceneDescriptorXForm::init() +bool ImageToTextXForm::init() { bool ret = mDetail->modelStrategy->initStrategy(); if (!ret) @@ -136,7 +136,7 @@ bool SceneDescriptorXForm::init() return Module::init(); } -bool SceneDescriptorXForm::term() +bool ImageToTextXForm::term() { bool ret = mDetail->modelStrategy->termStrategy(); if (!ret) @@ -146,7 +146,7 @@ bool SceneDescriptorXForm::term() return Module::term(); } -bool SceneDescriptorXForm::process(frame_container &frames) +bool ImageToTextXForm::process(frame_container &frames) { /*Encoder Model*/ frame_container clipFrames; @@ -162,7 +162,7 @@ bool SceneDescriptorXForm::process(frame_container &frames) return true; } -void SceneDescriptorXForm::setMetadata(framemetadata_sp &metadata) +void ImageToTextXForm::setMetadata(framemetadata_sp &metadata) { if (!metadata->isSet()) { @@ -170,22 +170,22 @@ void SceneDescriptorXForm::setMetadata(framemetadata_sp &metadata) } } -bool SceneDescriptorXForm::processSOS(frame_sp &frame) +bool ImageToTextXForm::processSOS(frame_sp &frame) { auto metadata = frame->getMetadata(); setMetadata(metadata); return true; } -SceneDescriptorXFormProps 
SceneDescriptorXForm::getProps() +ImageToTextXFormProps ImageToTextXForm::getProps() { fillProps(mDetail->mProps); return mDetail->mProps; } -bool SceneDescriptorXForm::handlePropsChange(frame_sp &frame) +bool ImageToTextXForm::handlePropsChange(frame_sp &frame) { - SceneDescriptorXFormProps props( + ImageToTextXFormProps props( mDetail->mProps.modelStrategyType, mDetail->mProps.encoderModelPath, mDetail->mProps.llmModelPath, mDetail->mProps.systemPrompt, mDetail->mProps.userPrompt, mDetail->mProps.gpuLayers); @@ -194,7 +194,7 @@ bool SceneDescriptorXForm::handlePropsChange(frame_sp &frame) return ret; } -void SceneDescriptorXForm::setProps(SceneDescriptorXFormProps &props) +void ImageToTextXForm::setProps(ImageToTextXFormProps &props) { if (props.modelStrategyType != mDetail->mProps.modelStrategyType) { diff --git a/base/src/Llava.cpp b/base/src/Llava.cpp index f965e18f0..f9d3b51ad 100644 --- a/base/src/Llava.cpp +++ b/base/src/Llava.cpp @@ -175,10 +175,15 @@ bool Llava::modelInference(frame_container &inputFrameContainer, frame_container /*System Prompt Tokenization*/ if(mDetail->systemPromptFlag){ std::vector systemPromptTokens = - ::llama_tokenize(mDetail->mLlavaContext, systemPrompt, add_bos); + ::llama_tokenize(mDetail->mLlavaContext, systemPrompt, add_bos, true); mDetail->compute(mDetail->mLlavaContext, systemPromptTokens, nBatch, &mDetail->nPast); mDetail->systemPromptFlag = false; - LOG_ERROR << "Loaded System Prompt"; + } + else{ + systemPrompt = "\nUSER:"; + std::vector systemPromptTokens = + ::llama_tokenize(mDetail->mLlavaContext, systemPrompt, false); + mDetail->compute(mDetail->mLlavaContext, systemPromptTokens, nBatch, &mDetail->nPast); } if (frameType == FrameMetadata::FrameType::IMAGE_EMBEDDING) @@ -199,9 +204,8 @@ bool Llava::modelInference(frame_container &inputFrameContainer, frame_container /*User Prompt Tokenization*/ std::vector userPromptTokens = ::llama_tokenize( - mDetail->mLlavaContext, (userPrompt + "\nASSISTANT:").c_str(), false); + 
mDetail->mLlavaContext, (userPrompt + "\nASSISTANT:\n").c_str(), false); mDetail->compute(mDetail->mLlavaContext, userPromptTokens, nBatch, &mDetail->nPast); - std::string output = ""; std::cout << "\n"; diff --git a/base/src/ModelStrategy.cpp b/base/src/ModelStrategy.cpp index 3939df69b..8f141fbdb 100644 --- a/base/src/ModelStrategy.cpp +++ b/base/src/ModelStrategy.cpp @@ -7,8 +7,8 @@ ModelStrategy::ModelStrategy() {} ModelStrategy::~ModelStrategy() {} /*LLAVA SCENE-DESCRIPTOR STRATEGY*/ -SceneDescriptorModelStrategy::SceneDescriptorModelStrategy( - SceneDescriptorXFormProps props) +ImageToTextModelStrategy::ImageToTextModelStrategy( + ImageToTextXFormProps props) : ModelStrategy() { auto clipProps = ClipEncoderProps(props.encoderModelPath); @@ -21,16 +21,16 @@ SceneDescriptorModelStrategy::SceneDescriptorModelStrategy( llmModel = boost::shared_ptr(new Llava(llavaProps)); } -SceneDescriptorModelStrategy::~SceneDescriptorModelStrategy() {} +ImageToTextModelStrategy::~ImageToTextModelStrategy() {} -bool SceneDescriptorModelStrategy::initStrategy() +bool ImageToTextModelStrategy::initStrategy() { encoderModel->modelInit(); llmModel->modelInit(); return true; } -bool SceneDescriptorModelStrategy::termStrategy() +bool ImageToTextModelStrategy::termStrategy() { encoderModel->modelTerm(); llmModel->modelTerm(); diff --git a/base/test/imageToTextXForm_tests.cpp b/base/test/imageToTextXForm_tests.cpp new file mode 100644 index 000000000..571e709be --- /dev/null +++ b/base/test/imageToTextXForm_tests.cpp @@ -0,0 +1,65 @@ +#include +#include "stdafx.h" +#include +#include + +#include "FrameMetadata.h" +#include "FrameMetadataFactory.h" +#include "Frame.h" +#include "Logger.h" +#include "test_utils.h" +#include "PipeLine.h" +#include "FileWriterModule.h" +#include "FileReaderModule.h" +#include "ImageToTextXForm.h" +#include "ModelStrategy.h" +#include "Module.h" +#include "ExternalSinkModule.h" + +BOOST_AUTO_TEST_SUITE(imageToTextXForm_tests) + +BOOST_AUTO_TEST_CASE(testing) 
+{ + std::vector imageToTextOutText = { + "./data/imageToText_out.txt"}; + Test_Utils::FileCleaner f(imageToTextOutText); + + Logger::setLogLevel(boost::log::trivial::severity_level::info); + + auto fileReaderProps = FileReaderModuleProps("./data/1280x960.jpg"); + fileReaderProps.readLoop = false; + auto fileReader = boost::shared_ptr( + new FileReaderModule(fileReaderProps)); + auto metadata = + framemetadata_sp(new FrameMetadata(FrameMetadata::ENCODED_IMAGE)); + auto pinId = fileReader->addOutputPin(metadata); + + auto imageToTextProps = ImageToTextXFormProps( + ImageToTextXFormProps::ModelStrategyType::LLAVA, + "./data/llm/llava/llava-v1.6-7b/mmproj-model-f16.gguf", + "./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", + "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", + " Describe the image", 10); + auto imageToText = boost::shared_ptr( + new ImageToTextXForm(imageToTextProps)); + fileReader->setNext(imageToText); + + auto outputFile = boost::shared_ptr(new FileWriterModule( + FileWriterModuleProps(imageToTextOutText[0], false))); + imageToText->setNext(outputFile); + fileReader->play(true); + BOOST_TEST(fileReader->init()); + BOOST_TEST(imageToText->init()); + BOOST_TEST(outputFile->init()); + + fileReader->step(); + imageToText->step(); + outputFile->step(); + + std::ifstream in_file_text(imageToTextOutText[0]); + std::ostringstream buffer; + buffer << in_file_text.rdbuf(); + in_file_text.close(); +} + +BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file diff --git a/base/test/sceneDescriptorXForm_tests.cpp b/base/test/sceneDescriptorXForm_tests.cpp deleted file mode 100644 index ea0db6519..000000000 --- a/base/test/sceneDescriptorXForm_tests.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include -#include "stdafx.h" -#include -#include - -#include "FrameMetadata.h" -#include "FrameMetadataFactory.h" -#include "Frame.h" -#include 
"Logger.h" -#include "test_utils.h" -#include "PipeLine.h" -#include "FileWriterModule.h" -#include "FileReaderModule.h" -#include "SceneDescriptorXForm.h" -#include "ModelStrategy.h" -#include "Module.h" -#include "ExternalSinkModule.h" - -BOOST_AUTO_TEST_SUITE(sceneDescriptorXForm_tests) - -BOOST_AUTO_TEST_CASE(testing) -{ - std::vector sceneDescriptorOutText = { - "./data/sceneDescriptor_out.txt"}; - Test_Utils::FileCleaner f(sceneDescriptorOutText); - - Logger::setLogLevel(boost::log::trivial::severity_level::info); - - auto fileReaderProps = FileReaderModuleProps("./data/theft/Image_???.jpeg"); - fileReaderProps.readLoop = false; - auto fileReader = boost::shared_ptr( - new FileReaderModule(fileReaderProps)); - auto metadata = - framemetadata_sp(new FrameMetadata(FrameMetadata::ENCODED_IMAGE)); - auto pinId = fileReader->addOutputPin(metadata); - - auto sceneDescriptorProps = SceneDescriptorXFormProps( - SceneDescriptorXFormProps::ModelStrategyType::LLAVA, - "./data/llm/llava/llava-v1.6-7b/mmproj-model-f16.gguf", - "./data/llm/llava/llava-v1.6-7b/llava-v1.6-mistral-7b.Q8_0.gguf", - "You are a Spy AI. Your task is to analyze the provided images and " - "identify any potential suspicious activity. Look for anomalies, unusual " - "behavior, or anything that raises concerns. Describe what you find and " - "explain why it might be considered suspicious. Your analysis should " - "consider various factors such as context, environmental cues, and human " - "behavior patterns. 
Be detailed and provide insights into your thought " - "process as you assess the image and don't hallucinate.\nUSER:", - "Tell me the percentage of suspicious activity based on the context of the images", 10); - auto sceneDescriptor = boost::shared_ptr( - new SceneDescriptorXForm(sceneDescriptorProps)); - fileReader->setNext(sceneDescriptor); - - auto outputFile = boost::shared_ptr(new FileWriterModule( - FileWriterModuleProps(sceneDescriptorOutText[0], false))); - sceneDescriptor->setNext(outputFile); - - BOOST_TEST(fileReader->init()); - BOOST_TEST(sceneDescriptor->init()); - BOOST_TEST(outputFile->init()); - - for(int i = 0; i < 4; i++){ - fileReader->step(); - sceneDescriptor->step(); - outputFile->step(); - } - - std::ifstream in_file_text(sceneDescriptorOutText[0]); - std::ostringstream buffer; - buffer << in_file_text.rdbuf(); - in_file_text.close(); -} - -BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file