Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate Whisper CPP and write a wrapper module in Aprapipes #324

Merged
merged 43 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
8e23a6e
Add custom port vcpkg for whisper
joiskash Dec 22, 2023
fb29351
Add whisper stream
joiskash Dec 22, 2023
4ed21e2
Add whisper stream header
joiskash Dec 22, 2023
deab2d0
Add whisper cpp to Cmake list
joiskash Dec 22, 2023
9462f59
Add test frame type and minor changes
joiskash Dec 22, 2023
c55c40b
Add whisper to vcpkg
joiskash Dec 27, 2023
870862c
Add vcpkg custom overlay ports to thirdparty
joiskash Dec 27, 2023
ca4a6e4
Modify with whisper option
joiskash Dec 27, 2023
482f02c
Send whisper output as text frames
joiskash Dec 27, 2023
d12edf5
revert changes to sound record test
joiskash Dec 27, 2023
3275afd
Add whisper UT
joiskash Dec 27, 2023
acd8a1f
Fix PS to remove whisper from vcpkg json
joiskash Dec 27, 2023
9b18eb3
Revert changes to OPTIONS section, remove WHISPER option, rename Whis…
joiskash Dec 31, 2023
cf5d8a4
Move pcm to git lfs
joiskash Dec 31, 2023
5ad9157
Add pcm and model bin file to lfs
joiskash Dec 31, 2023
ded9a03
Fix UT name
joiskash Dec 31, 2023
ec0ca73
Throw AIP exception for unknown strategy
joiskash Dec 31, 2023
6d4528e
Revert sound_record_tests.cpp changes
joiskash Dec 31, 2023
91fe148
Revert changes to vcpkg indentation and remove Whisper option
joiskash Dec 31, 2023
2021355
Linux -> OFF to ON Windows ON -> OFF
joiskash Jan 3, 2024
80500ce
Add reserve statement for vector
joiskash Jan 9, 2024
42ca754
update submodule for pipeline to run
joiskash Jan 9, 2024
66cd4d8
Update whisper port with install fix
joiskash Jan 13, 2024
e817f98
update submodule
joiskash Jan 13, 2024
ce3d6e2
Update vcpkg version
joiskash Jan 13, 2024
f33644f
Add changes to handle props change
joiskash Jan 13, 2024
b6e20df
Improve UT and refactor for changing sample strategy during run time.
joiskash Jan 13, 2024
925e508
Add apt-get install libx11-dev libgles2-mesa-dev for libepoxy error
joiskash Jan 13, 2024
1d7bc11
Add memory type check in validate input pins and throw exception if m…
joiskash Feb 15, 2024
bc04e47
update submodule
joiskash Feb 15, 2024
25090aa
Merge branch 'main' into kj/whisper-asr
joiskash Feb 15, 2024
0c56895
update vcpkg mysys2
joiskash Feb 15, 2024
969e844
update submodule
joiskash Feb 15, 2024
9f58b90
Address nits
joiskash Feb 16, 2024
1e738f6
Export env variable overlay port for building in arm64
joiskash Feb 16, 2024
d478555
added fix-for-arm64.patch for whisper
kushaljain-apra Feb 23, 2024
67cbe9a
update fix-vcpkg-json.ps1
kushaljain-apra Feb 23, 2024
6ddd487
update CMakeLists.txt
Feb 23, 2024
dba812f
update vcpkg url for build
joiskash Feb 23, 2024
4716f25
update whisper tests threshold
kushaljain-apra Feb 26, 2024
f494d88
update code formatting
kushaljain-apra Feb 26, 2024
ad0977b
update whisper test
kushaljain-apra Feb 26, 2024
42df5de
added EOS for small buffer size
kushaljain-apra Feb 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions base/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
cmake_minimum_required(VERSION 3.22)

OPTION(ENABLE_LINUX "Use this switch to enable LINUX" ON)
OPTION(ENABLE_CUDA "Use this switch to enable CUDA" ON)
OPTION(ENABLE_LINUX "Use this switch to enable LINUX" OFF)
joiskash marked this conversation as resolved.
Show resolved Hide resolved
OPTION(ENABLE_CUDA "Use this switch to enable CUDA" OFF)
OPTION(ENABLE_ARM64 "Use this switch to enable ARM64" OFF)
OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" OFF)
OPTION(ENABLE_WINDOWS "Use this switch to enable WINDOWS" ON)
OPTION(ENABLE_WHISPER "Use Whisper ASR" ON)

set(VCPKG_INSTALL_OPTIONS "--clean-after-build")
set(VCPKG_OVERLAY_PORTS "${CMAKE_CURRENT_SOURCE_DIR}/../thirdparty/custom-overlay")
kumaakh marked this conversation as resolved.
Show resolved Hide resolved

IF(ENABLE_CUDA)
add_compile_definitions(APRA_CUDA_ENABLED)
ENDIF(ENABLE_CUDA)
Expand Down Expand Up @@ -38,7 +41,10 @@ project(APRAPIPES)
message(STATUS $ENV{PKG_CONFIG_PATH}">>>>>> PKG_CONFIG_PATH")

find_package(PkgConfig REQUIRED)


# When Whisper support is enabled, locate the whisper package via its CMake
# config files. REQUIRED makes configuration fail if the package is not found.
IF(ENABLE_WHISPER)
find_package(whisper CONFIG REQUIRED)
ENDIF(ENABLE_WHISPER)

find_package(Boost COMPONENTS system thread filesystem serialization log chrono unit_test_framework REQUIRED)
find_package(JPEG REQUIRED)
Expand Down Expand Up @@ -281,9 +287,7 @@ SET(IP_FILES
src/OverlayFactory.cpp
src/TestSignalGeneratorSrc.cpp
)




SET(IP_FILES_H
include/HistogramOverlay.h
include/CalcHistogramCV.h
Expand All @@ -308,6 +312,10 @@ SET(IP_FILES_H
include/Overlay.h
)

IF(ENABLE_WHISPER)
set(IP_FILES ${IP_FILES} src/WhisperStreamTransform.cpp )
joiskash marked this conversation as resolved.
Show resolved Hide resolved
set(IP_FILES_H ${IP_FILES_H} include/WhisperStreamTransform.h )
ENDIF(ENABLE_WHISPER)


SET(CUDA_CORE_FILES
Expand Down Expand Up @@ -461,6 +469,10 @@ ${NVCODEC_INCLUDE_DIR}

# aprapipes Unit Tests

# Whisper unit tests are only listed when Whisper support is enabled; with the
# option off WHISPER_UT_FILES stays unset and expands to nothing in UT_FILES.
IF(ENABLE_WHISPER)
SET(WHISPER_UT_FILES test/whisper_asr_tests.cpp)
ENDIF(ENABLE_WHISPER)

IF (ENABLE_ARM64)
SET(ARM64_UT_FILES
test/jpegencoderl4tm_tests.cpp
Expand Down Expand Up @@ -563,6 +575,7 @@ SET(UT_FILES
test/testSignalGeneratorSrc_tests.cpp
${ARM64_UT_FILES}
${CUDA_UT_FILES}
${WHISPER_UT_FILES}
)

IF(ENABLE_LINUX)
Expand Down Expand Up @@ -609,6 +622,10 @@ target_link_libraries(aprapipesut
sfml-audio
)

# Link the unit-test executable against the imported whisper::whisper target
# (made available by find_package(whisper ...)) when Whisper support is enabled.
IF(ENABLE_WHISPER)
target_link_libraries(aprapipesut whisper::whisper)
ENDIF(ENABLE_WHISPER)

IF(ENABLE_WINDOWS)
file(COPY ${RUNTIME_DLLS} DESTINATION Debug/)
file(COPY ${RUNTIME_DLLS} DESTINATION Release/)
Expand Down
12 changes: 11 additions & 1 deletion base/fix-vcpkg-json.ps1
joiskash marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
#inplace fixing of a vcpkg file
param([String]$fileName='vcpkg.json', [switch]$removeOpenCV, [switch]$removeCUDA, [switch]$onlyOpenCV)
param([String]$fileName='vcpkg.json', [switch]$removeOpenCV, [switch]$removeCUDA, [switch]$onlyOpenCV, [switch]$removeWhisper)

$v = Get-Content $fileName -raw | ConvertFrom-Json

# -removeWhisper: strip every reference to the whisper port from the manifest.
if ($removeWhisper.IsPresent) {
    # @(...) keeps each filtered list an array even when zero or one entries
    # survive the filter. A bare pipeline result collapses to $null or to a
    # single scalar, which would no longer round-trip as a JSON list.
    # The $null guard drops the placeholder item PowerShell pipes when the
    # source list itself is empty or missing.
    $v.overrides = @($v.overrides | Where-Object { $null -ne $_ -and $_.name -ne 'whisper' })
    $v.dependencies = @($v.dependencies | Where-Object { $null -ne $_ -and $_ -ne 'whisper' })
}

if ($removeCUDA.IsPresent)
{
$v.dependencies |
Expand Down
3 changes: 2 additions & 1 deletion base/include/FrameMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ class FrameMetadata {
HEVC_DATA, //H265
MOTION_VECTOR_DATA,
OVERLAY_INFO_IMAGE,
FACE_LANDMARKS_INFO
FACE_LANDMARKS_INFO,
TEXT
};

enum MemType
Expand Down
3 changes: 3 additions & 0 deletions base/include/Mp4WriterSinkUtils.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include <ctime>
#include <chrono>
#include <string>
#include <boost/filesystem.hpp>
kumaakh marked this conversation as resolved.
Show resolved Hide resolved

class Mp4WriterSinkUtils
{
Expand Down
50 changes: 50 additions & 0 deletions base/include/WhisperStreamTransform.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#pragma once

#include "Module.h"

// size of audio to process should be a parameter.
// Cache variable to collect frames for processing

/// Configuration bundle for WhisperStreamTransform.
///
/// NOTE(review): no serialize() is declared for the fields below; confirm that
/// Module::handlePropsChange can actually round-trip them at runtime.
class WhisperStreamTransformProps : public ModuleProps
{
public:
    /// Decoder sampling strategy; the whisper.cpp constant each value stands
    /// for is named beside it.
    enum DecoderSamplingStrategy {
        GREEDY,     // WHISPER_SAMPLING_GREEDY
        BEAM_SEARCH // WHISPER_SAMPLING_BEAM_SEARCH
    };

    /// Strategy used to build the whisper full-params at init().
    DecoderSamplingStrategy samplingStrategy;
    /// File path handed to whisper_init_from_file_with_params().
    std::string modelPath;
    /// Number of accumulated float samples required before a transcription
    /// pass is run.
    int bufferSize;

    /// Stores the three settings as given; no validation is performed.
    WhisperStreamTransformProps(
        DecoderSamplingStrategy _samplingStrategy,
        std::string _modelPath,
        int _bufferSize)
        : samplingStrategy(_samplingStrategy),
          modelPath(_modelPath),
          bufferSize(_bufferSize)
    {
    }
};

// Transform module that accumulates incoming audio samples and, once enough
// have been collected, runs whisper.cpp on them and emits the transcription
// as a TEXT frame on its single output pin.
class WhisperStreamTransform : public Module
{

public:
// Constructs the module and its private Detail state from the given props.
WhisperStreamTransform(WhisperStreamTransformProps _props);
virtual ~WhisperStreamTransform();
// Builds the whisper parameters, loads the model named by props.modelPath,
// then delegates to Module::init().
bool init();
// Releases the whisper resources, then delegates to Module::term().
bool term();
// Queues a props change through Module::addPropsToQueue().
void setProps(WhisperStreamTransformProps& props);
// Returns a copy of the current props after Module::fillProps() has run on them.
WhisperStreamTransformProps getProps();

protected:
// Appends the frame's samples to the internal buffer; when the buffer reaches
// props.bufferSize samples, transcribes it and sends a text frame.
bool process(frame_container& frames);
// Start-of-stream hook; forwards the frame's metadata to setMetadata().
bool processSOS(frame_sp& frame);
// Requires exactly one input pin whose frame type is AUDIO.
bool validateInputPins();
// Requires exactly one output pin whose frame type is TEXT.
bool validateOutputPins();
// Registers the input pin and creates the matching TEXT output pin.
void addInputPin(framemetadata_sp& metadata, string& pinId);
// Applies a queued props change to the private Detail state.
bool handlePropsChange(frame_sp& frame);

private:
// Currently only checks metadata->isSet(); no state is derived from it.
void setMetadata(framemetadata_sp& metadata);
// Private implementation state (defined in the .cpp file).
class Detail;
boost::shared_ptr<Detail> mDetail;
};
180 changes: 180 additions & 0 deletions base/src/WhisperStreamTransform.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#include "WhisperStreamTransform.h"
#include "FrameMetadata.h"
#include "FrameMetadataFactory.h"
#include "Frame.h"
#include "Logger.h"
#include "Utils.h"
#include "whisper.h"
#include "SFML/Config.hpp"

class WhisperStreamTransform::Detail
joiskash marked this conversation as resolved.
Show resolved Hide resolved
{
public:
Detail(WhisperStreamTransformProps& _props) : mProps(_props)
{
}
~Detail() {}

void setProps(WhisperStreamTransformProps& props)
{
mProps = props;
}

public:
framemetadata_sp mOutputMetadata;
std::string mOutputPinId;
std::vector<float> inputAudioBuffer;
WhisperStreamTransformProps mProps;
int mFrameType;
whisper_context *mWhisperContext = NULL;
whisper_full_params mWhisperFullParams;
whisper_context_params mWhisperContextParams;
};

// Registers the module as a TRANSFORM named "WhisperStreamTransform" and
// allocates the private Detail state from the same props.
WhisperStreamTransform::WhisperStreamTransform(WhisperStreamTransformProps _props)
    : Module(TRANSFORM, "WhisperStreamTransform", _props),
      mDetail(new Detail(_props))
{
}

// Nothing to release here: whisper resources are freed in term(), and mDetail
// is a shared_ptr that cleans itself up.
WhisperStreamTransform::~WhisperStreamTransform() = default;

bool WhisperStreamTransform::validateInputPins()
{
if (getNumberOfInputPins() != 1)
{
LOG_ERROR << "<" << getId() << ">::validateInputPins size is expected to be 1. Actual<" << getNumberOfInputPins() << ">";
return false;
}

framemetadata_sp metadata = getFirstInputMetadata();

FrameMetadata::FrameType frameType = metadata->getFrameType();
if (frameType != FrameMetadata::AUDIO)
{
LOG_ERROR << "<" << getId() << ">::validateInputPins input frameType is expected to be Audio. Actual<" << frameType << ">";
return false;
}

return true;
}

// Accepts the output configuration only when there is exactly one output pin
// and that pin carries TEXT frames; logs the reason otherwise.
bool WhisperStreamTransform::validateOutputPins()
{
    if (getNumberOfOutputPins() != 1)
    {
        LOG_ERROR << "<" << getId() << ">::validateOutputPins size is expected to be 1. Actual<" << getNumberOfOutputPins() << ">";
        return false;
    }

    framemetadata_sp metadata = getFirstOutputMetadata();
    FrameMetadata::FrameType frameType = metadata->getFrameType();
    if (frameType != FrameMetadata::TEXT)
    {
        // The message names the *output* frame type: this check is about the
        // output pin, not the input pin.
        LOG_ERROR << "<" << getId() << ">::validateOutputPins output frameType is expected to be TEXT. Actual<" << frameType << ">";
        return false;
    }

    return true;
}

// Registers the input pin with the base class, then creates the single TEXT
// output pin, copying the hint from the input metadata onto it.
void WhisperStreamTransform::addInputPin(framemetadata_sp& metadata, string& pinId)
{
    Module::addInputPin(metadata, pinId);

    framemetadata_sp& textMetadata = mDetail->mOutputMetadata;
    textMetadata = framemetadata_sp(new FrameMetadata(FrameMetadata::FrameType::TEXT));
    textMetadata->copyHint(*metadata.get());

    mDetail->mOutputPinId = addOutputPin(mDetail->mOutputMetadata);
}

bool WhisperStreamTransform::init()
{
//intialize model
auto samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
switch (mDetail->mProps.samplingStrategy)
{
case WhisperStreamTransformProps::DecoderSamplingStrategy::GREEDY:
samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
break;
case WhisperStreamTransformProps::DecoderSamplingStrategy::BEAM_SEARCH:
samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH;
break;
default:
samplingStrategy = whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY;
mraduldubey marked this conversation as resolved.
Show resolved Hide resolved
}
mDetail->mWhisperFullParams = whisper_full_default_params(samplingStrategy);
mDetail->mWhisperContextParams = whisper_context_default_params();
mDetail->mWhisperContext = whisper_init_from_file_with_params(mDetail->mProps.modelPath.c_str(), mDetail->mWhisperContextParams);
return Module::init();
}

// Releases the whisper context and then delegates to Module::term().
bool WhisperStreamTransform::term()
{
    // mWhisperFullParams / mWhisperContextParams are plain by-value members.
    // whisper_free_params() and whisper_free_context_params() `delete` the
    // pointer they receive and are meant only for the heap objects returned by
    // the *_default_params_by_ref() helpers, so they must not be called on the
    // addresses of these members.
    whisper_free(mDetail->mWhisperContext); // tolerates a null context
    // Clear the handle so a repeated term() cannot free it twice.
    mDetail->mWhisperContext = nullptr;
    return Module::term();
}

// Consumes one audio frame. The frame payload is read as signed 16-bit
// samples, scaled by 1/32768 and appended to the internal float buffer.
// Once the buffer holds at least props.bufferSize samples it is transcribed
// with whisper_full(), the segment texts are concatenated into one TEXT frame
// that is sent downstream together with the input frames, and the buffer is
// cleared. A non-positive bufferSize means "transcribe on every frame".
bool WhisperStreamTransform::process(frame_container& frames)
{
    if (mDetail->mWhisperContext == nullptr)
    {
        // init() did not succeed (or term() already ran); nothing can be decoded.
        LOG_ERROR << "<" << getId() << ">::process called without a loaded whisper model";
        return false;
    }

    auto frame = frames.begin()->second;
    const sf::Int16* pcmSamples = static_cast<const sf::Int16*>(frame->data());
    const size_t numberOfSamples = frame->size() / sizeof(sf::Int16);

    std::vector<float>& audioBuffer = mDetail->inputAudioBuffer;
    // Grow once per frame instead of reallocating repeatedly inside the loop.
    audioBuffer.reserve(audioBuffer.size() + numberOfSamples);
    for (size_t index = 0; index < numberOfSamples; ++index)
    {
        audioBuffer.push_back(static_cast<float>(pcmSamples[index]) / 32768.0f);
    }

    // Compare in size_t without letting a negative bufferSize wrap around to a
    // huge unsigned threshold.
    const int configuredSize = mDetail->mProps.bufferSize;
    const size_t requiredSamples = configuredSize > 0 ? static_cast<size_t>(configuredSize) : 0;
    if (audioBuffer.empty() || audioBuffer.size() < requiredSamples)
    {
        return true; // keep accumulating
    }

    const int status = whisper_full(
        mDetail->mWhisperContext,
        mDetail->mWhisperFullParams,
        audioBuffer.data(),
        static_cast<int>(audioBuffer.size()));
    if (status != 0)
    {
        // Drop this chunk rather than emitting stale segments or letting the
        // buffer grow without bound on repeated failures.
        LOG_ERROR << "<" << getId() << ">::process whisper_full failed with status <" << status << ">";
        audioBuffer.clear();
        return true;
    }

    std::string output;
    const int n_segments = whisper_full_n_segments(mDetail->mWhisperContext);
    for (int i = 0; i < n_segments; ++i)
    {
        output += whisper_full_get_segment_text(mDetail->mWhisperContext, i);
    }
    audioBuffer.clear(); // keeps capacity for the next chunk

    // The text frame carries the raw characters without a terminating NUL.
    auto outFrame = makeFrame(output.length());
    memcpy(outFrame->data(), output.c_str(), output.length());
    frames.insert(make_pair(mDetail->mOutputPinId, outFrame));
    send(frames);
    return true;
}

// Placeholder hook: queries whether the metadata is set but derives no state
// from it in either case.
void WhisperStreamTransform::setMetadata(framemetadata_sp& metadata)
{
    const bool metadataReady = metadata->isSet();
    if (metadataReady)
    {
        // Nothing to configure from the input metadata yet.
    }
}

// Start-of-stream handler: passes the first frame's metadata to setMetadata()
// and always reports success.
bool WhisperStreamTransform::processSOS(frame_sp& frame)
{
    auto streamMetadata = frame->getMetadata();
    setMetadata(streamMetadata);

    return true;
}

// Lets Module::fillProps() update the stored props in place, then returns a
// copy of them.
WhisperStreamTransformProps WhisperStreamTransform::getProps()
{
    WhisperStreamTransformProps& currentProps = mDetail->mProps;
    fillProps(currentProps);
    return currentProps;
}

// Applies a queued props change: seeds a props object with the current
// settings, lets Module::handlePropsChange() update it from the frame, and
// stores the result in the private Detail state.
bool WhisperStreamTransform::handlePropsChange(frame_sp& frame)
{
    // Seed with the *current* bufferSize rather than a fixed placeholder, so a
    // field the base handler leaves untouched keeps its configured value.
    WhisperStreamTransformProps props(
        mDetail->mProps.samplingStrategy,
        mDetail->mProps.modelPath,
        mDetail->mProps.bufferSize);
    auto ret = Module::handlePropsChange(frame, props);
    // NOTE(review): the whisper full-params are built in init(); a changed
    // samplingStrategy stored here does not appear to be re-applied - confirm.
    mDetail->setProps(props);
    return ret;
}

// Public entry point for changing props: hands `props` to
// Module::addPropsToQueue() rather than modifying the module state directly.
void WhisperStreamTransform::setProps(WhisperStreamTransformProps& props)
{
Module::addPropsToQueue(props);
}
Loading
Loading