Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate llama.cpp in Aprapipes and a module ImageToTextXForm which can describe an image #345

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
27 changes: 24 additions & 3 deletions base/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ find_package(ZXing CONFIG REQUIRED)
find_package(bigint CONFIG REQUIRED)
find_package(SFML COMPONENTS system window audio graphics CONFIG REQUIRED)
find_package(whisper CONFIG REQUIRED)
find_package(Llama CONFIG REQUIRED)

IF(ENABLE_CUDA)
if((NOT DEFINED CMAKE_CUDA_ARCHITECTURES) OR (CMAKE_CUDA_ARCHITECTURES STREQUAL ""))
Expand Down Expand Up @@ -282,7 +283,13 @@ SET(IP_FILES
src/OverlayFactory.h
src/OverlayFactory.cpp
src/TestSignalGeneratorSrc.cpp
src/AudioToTextXForm.cpp
src/AudioToTextXForm.cpp
src/ModelStrategy.cpp
src/LlmModelAbstract.cpp
src/EncoderModelAbstract.cpp
src/Llava.cpp
src/ClipEncoder.cpp
src/ImageToTextXForm.cpp
)

SET(IP_FILES_H
Expand All @@ -308,6 +315,13 @@ SET(IP_FILES_H
include/ColorConversionXForm.h
include/Overlay.h
include/AudioToTextXForm.h
include/ModelEnums.h
include/ModelStrategy.h
include/LlmModelAbstract.h
include/EncoderModelAbstract.h
include/Llava.h
include/ClipEncoder.h
include/ImageToTextXForm.h
)

SET(CUDA_CORE_FILES
Expand Down Expand Up @@ -562,6 +576,8 @@ SET(UT_FILES
test/overlaymodule_tests.cpp
test/testSignalGeneratorSrc_tests.cpp
test/audioToTextXform_tests.cpp
test/llavamodel_tests.cpp
test/imageToTextXForm_tests.cpp
${ARM64_UT_FILES}
${CUDA_UT_FILES}
)
Expand All @@ -587,8 +603,10 @@ ENDIF (ENABLE_CUDA)

find_library(OPENH264_LIB NAMES openh264.lib libopenh264.a REQUIRED)
find_library(LIBMP4_LIB NAMES mp4lib.lib libmp4lib.a REQUIRED)
find_library(COMMON_LIB NAMES common_llama.lib libcommon_llama.a REQUIRED)
find_library(LLAVA_LIB NAMES llavalib.lib libllavalib.a REQUIRED)
Comment on lines +606 to +607
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need a separate find lib if we are using find package for Llama?


target_link_libraries(aprapipesut
target_link_libraries(aprapipesut
aprapipes
${JPEG_LIBRARIES}
${LIBMP4_LIB}
Expand All @@ -608,6 +626,9 @@ target_link_libraries(aprapipesut
liblzma::liblzma
bigint::bigint
sfml-audio
${COMMON_LIB}
llama
${LLAVA_LIB}
whisper::whisper
)

Expand All @@ -617,4 +638,4 @@ IF(ENABLE_WINDOWS)
IF(GHA)
file(COPY ${RUNTIME_DLLS} DESTINATION RelWithDebInfo/)
ENDIF(GHA)
ENDIF(ENABLE_WINDOWS)
ENDIF(ENABLE_WINDOWS)
4 changes: 4 additions & 0 deletions base/fix-vcpkg-json.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ if ($removeCUDA.IsPresent)
$v.dependencies |
Where-Object { $_.name -eq 'whisper' } |
ForEach-Object { $_.features = $_.features -ne 'cuda' }

$v.dependencies |
Where-Object { $_.name -eq 'llama' } |
ForEach-Object { $_.features = $_.features -ne 'cuda' }
}

if($removeOpenCV.IsPresent)
Expand Down
4 changes: 4 additions & 0 deletions base/fix-vcpkg-json.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ if $removeCUDA; then
# Remove "cuda" features for this "whisper" instance
v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\"))")
fi
# Strip the "cuda" feature from the "llama" dependency entry.
# `[` needs its closing `]` as a separate word, so the space before it is required;
# without it the test fails with a "missing `]`" error and this branch never runs.
if [ "$name" == "llama" ]; then
# Remove "cuda" features for this "llama" instance
v=$(echo "$v" | jq ".dependencies[$index].features |= map(select(. != \"cuda\"))")
fi
done
fi

Expand Down
44 changes: 44 additions & 0 deletions base/include/ClipEncoder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#pragma once

#include "EncoderModelAbstract.h"

class ClipEncoderProps : public EncoderModelAbstractProps
{
public:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move all the definition to cpp

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

ClipEncoderProps(std::string _modelPath);

std::string modelPath;

/// Size estimate, in bytes, for serializing these props: the base-class
/// estimate plus the modelPath member.
///
/// sizeof(modelPath) is a compile-time constant for the std::string object and
/// does not grow with the stored text, so the character count is added on top
/// to keep the estimate from falling short for long paths.
/// NOTE(review): presumably this value sizes the serialization buffer - confirm
/// against the caller.
size_t getSerializeSize()
{
    return EncoderModelAbstractProps::getSerializeSize() + sizeof(modelPath) + modelPath.length();
}

private:
friend class boost::serialization::access;

// Boost.Serialization hook: archives the EncoderModelAbstractProps part of
// this object first, followed by modelPath. The same routine serves both
// saving and loading because the archive's operator& is used.
template <class Archive>
void serialize(Archive &ar, const unsigned int version)
{
    ar & boost::serialization::base_object<EncoderModelAbstractProps>(*this) & modelPath;
}
};

// Concrete encoder model derived from EncoderModelAbstract. It overrides the
// base class's model hooks (modelInit, modelTerm, modelInference,
// validateUseCase, getFrameSize) and adds storeFrames. Only declarations live
// here; the behaviour of every member is defined in the implementation file.
// NOTE(review): the name suggests a CLIP image encoder - confirm in ClipEncoder.cpp.
class ClipEncoder : public EncoderModelAbstract
{
public:
// Constructs the encoder from its props (taken by value).
ClipEncoder(ClipEncoderProps _props);
virtual ~ClipEncoder();
// Overrides of the pure-virtual lifecycle hooks declared by EncoderModelAbstract.
bool modelInit() override;
bool modelTerm() override;
// Override of the base inference hook. makeFrame is a callable that takes a
// size_t and returns a frame_sp.
// NOTE(review): presumably makeFrame allocates an output frame of the given
// byte size - confirm against the caller.
bool modelInference(frame_container &inputFrameContainer,
frame_container &outputFrameContainer, std::function<frame_sp(size_t)> makeFrame) override;
bool validateUseCase(UseCase useCase) override;
size_t getFrameSize() override;
// Not part of the EncoderModelAbstract interface; specific to this class.
void storeFrames(frame_sp &frame);

private:
// Internals are kept in a nested class that is only forward-declared here and
// is held through a shared pointer.
class Detail;
boost::shared_ptr<Detail> mDetail;
};
89 changes: 89 additions & 0 deletions base/include/EncoderModelAbstract.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#pragma once
#include "stdafx.h"
#include <boost/shared_ptr.hpp>
#include <boost/serialization/base_object.hpp>
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/iostreams/device/array.hpp>
#include <boost/iostreams/stream.hpp>
#include "Frame.h"
#include <boost/function.hpp>
#include "BoundBuffer.h"
#include "FrameFactory.h"
#include "CommonDefs.h"
#include "FrameMetadata.h"
#include "FrameMetadataFactory.h"
#include "Command.h"
#include "BufferMaker.h"
#include "ModelEnums.h"
#include "FrameContainerQueue.h"

class EncoderModelAbstractProps
{
public:
EncoderModelAbstractProps();

EncoderModelAbstractProps(ModelArchitectureType _modelArchitecture,
std::vector<FrameMetadata::FrameType> _inputTypes,
std::vector<FrameMetadata::FrameType> _outputTypes,
std::vector<UseCase> _useCases);

size_t getSerializeSize()
{
return sizeof(modelArchitecture) + sizeof(inputTypes) +
sizeof(outputTypes) + sizeof(useCases) + sizeof(qlen);
}

ModelArchitectureType modelArchitecture;
std::vector<FrameMetadata::FrameType> inputTypes;
std::vector<FrameMetadata::FrameType> outputTypes;
std::vector<UseCase> useCases;
size_t qlen;

private:
friend class boost::serialization::access;

template <class Archive>
void serialize(Archive &ar, const unsigned int version)
{
ar &boost::serialization::base_object<EncoderModelAbstractProps>(*this);
ar & modelArchitecture;
ar & inputTypes;
ar & outputTypes;
ar & useCases;
ar & qlen;
}
};

/// Abstract base class for encoder models. Subclasses supply the model hooks
/// (modelInit, modelTerm, getFrameSize, validateUseCase and optionally
/// modelInference); the non-virtual init/term/step/push members are defined in
/// the implementation file.
class EncoderModelAbstract
{
public:
    EncoderModelAbstract(std::string _modelName, EncoderModelAbstractProps props);

    /// Virtual because this class is used as a polymorphic base: destroying a
    /// derived model through an EncoderModelAbstract pointer is undefined
    /// behaviour unless the base destructor is virtual.
    virtual ~EncoderModelAbstract();

    /// Returns the name given at construction.
    std::string getMyName() { return modelName; }

    /// Returns the frame-container queue owned by this model.
    boost::shared_ptr<FrameContainerQueue> getQue() { return mQue; }

    virtual bool modelInit() = 0;
    virtual bool modelTerm() = 0;

    /// Inference hook. The default implementation does nothing and returns
    /// false; subclasses override it to perform inference.
    virtual bool modelInference(frame_container &inputFrameContainer,
                                frame_container &outputFrameContainer, std::function<frame_sp(size_t)> makeFrame)
    {
        return false;
    }
    virtual size_t getFrameSize() = 0;

    virtual bool validateUseCase(UseCase useCase) = 0;

    bool init();
    bool term();
    bool step(frame_container &outputFrameContaine, std::function<frame_sp(size_t)> makeFrame);
    bool push(frame_container &inputFrameContainer,
              frame_container &outputFrameContainer, std::function<frame_sp(size_t)> _makeFrame);

private:
    std::string modelName;
    boost::shared_ptr<FrameContainerQueue> mQue;
    boost::shared_ptr<EncoderModelAbstractProps> mProps;
};
Loading
Loading