diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1f1fcd4..074f795 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -410,14 +410,16 @@ jobs: strategy: matrix: - build: [Release] - arch: [x64] - cublas: [ON] - sdl2: [ON] - cuda-toolkit: [12.2.0, 11.8.0] + build: [ Release ] + arch: [ x64 ] + cublas: [ ON ] + sdl2: [ ON ] + cuda-toolkit: [ 12.2.0, 11.8.0 ] include: - arch: x64 s2arc: x64 + - sdl2: ON + s2ver: 2.28.5 steps: - name: Clone @@ -450,9 +452,6 @@ jobs: -Include cudart64_*,cublas64_*,cublasLt64_* -Destination build/bin/${{ matrix.build }} - - name: Copy SDL2.dll - if: matrix.sdl2 == 'ON' - run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }} - name: Upload binaries if: matrix.sdl2 == 'ON' diff --git a/README-EN.md b/README-EN.md index 55429d8..4e8c723 100644 --- a/README-EN.md +++ b/README-EN.md @@ -15,7 +15,7 @@ This project is based on the [ggml](https://github.com/ggerganov/ggml) framework 1. Based on ggml, it does not rely on other third-party libraries and is committed to edge deployment. 2. Feature extraction references the [kaldi-native-fbank](https://github.com/csukuangfj/kaldi-native-fbank) library, supporting multi-threaded feature extraction. -3. Flash attention decoding can be used (The speed has not improved 🤔 weird,need help). +3. Support Flash attention decoding 4. Support Q3, Q4, Q5, Q6, Q8 quantization. ### 1.1 Future Plans diff --git a/README.md b/README.md index 7d74b59..b5e4484 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ 1. 基于ggml,不依赖其他第三方库, 致力于端侧部署 2. 特征提取参考[kaldi-native-fbank](https://github.com/csukuangfj/kaldi-native-fbank)库,支持多线程特征提取。 -3. 可以使用flash attention解码(速度没有明显提升🤔不知道为啥) +3. 支持flash attention解码 4. 支持Q3, Q4, Q5, Q6, Q8量化 ### 1.1 未来计划 diff --git a/sense-voice/csrc/common.h b/sense-voice/csrc/common.h index f05239e..a6a152b 100644 --- a/sense-voice/csrc/common.h +++ b/sense-voice/csrc/common.h @@ -9,12 +9,7 @@ #include #include #include -#include -#include - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif +#include #include "sense-voice-frontend.h" diff --git a/sense-voice/csrc/main.cc b/sense-voice/csrc/main.cc index 27e5ae2..be89103 100644 --- a/sense-voice/csrc/main.cc +++ b/sense-voice/csrc/main.cc @@ -98,15 +98,12 @@ const char * sense_voice_print_system_info(void) { s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "METAL = " + std::to_string(ggml_cpu_has_metal()) + " | "; s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | "; s += "COREML = " + std::to_string(sense_voice_has_coreml()) + " | "; s += "OPENVINO = " + std::to_string(sense_voice_has_openvino()); diff --git a/sense-voice/csrc/sense-voice-decoder.cc b/sense-voice/csrc/sense-voice-decoder.cc index d7d27d8..ca317ce 100644 --- a/sense-voice/csrc/sense-voice-decoder.cc +++ b/sense-voice/csrc/sense-voice-decoder.cc @@ -3,32 +3,6 @@ // #include "sense-voice-decoder.h" -#include -#include "ggml-alloc.h" -#include "ggml-backend.h" -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - -#ifdef GGML_USE_BLAS -#include "ggml-blas.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif #define SENSEVOICE_DECODER_MAX_NODES 8 @@ -111,16 +85,16 @@ static bool ggml_graph_compute_helper( for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) { ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i); - if (ggml_backend_is_cpu(backend)) { - ggml_backend_cpu_set_n_threads(backend, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + + auto * fn_set_n_threads = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (fn_set_n_threads) { + fn_set_n_threads(backend, n_threads); } - #ifdef GGML_USE_BLAS - if (ggml_backend_is_blas(backend)) { - ggml_backend_blas_set_n_threads(backend, n_threads); - } - #endif } + bool t = ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS; ggml_backend_sched_reset(sched); return t; diff --git a/sense-voice/csrc/sense-voice-encoder.cc b/sense-voice/csrc/sense-voice-encoder.cc index d8758af..8146159 100644 --- a/sense-voice/csrc/sense-voice-encoder.cc +++ b/sense-voice/csrc/sense-voice-encoder.cc @@ -3,30 +3,7 @@ // #include "sense-voice-encoder.h" - #include - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - -#ifdef GGML_USE_BLAS -#include "ggml-blas.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - - #include #include #include @@ -93,53 +70,6 @@ struct sense_voice_context_params sense_voice_context_default_params() { return result; } -static ggml_backend_t sense_voice_backend_init( - const sense_voice_context_params ¶ms) { - ggml_backend_t backend_gpu = nullptr; - - // initialize the backends -#ifdef GGML_USE_CUDA - if (params.use_gpu) { - SENSE_VOICE_LOG_INFO("%s: using CUDA backend\n", __func__); - backend_gpu = ggml_backend_cuda_init(params.gpu_device); - if (!backend_gpu) { - SENSE_VOICE_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__); - } - } -#endif - -#ifdef GGML_USE_METAL - if (params.use_gpu) { - SENSEVOICE_LOG_INFO("%s: using Metal backend\n", __func__); - backend_gpu = ggml_backend_metal_init(); - if (!backend_gpu) { - SENSEVOICE_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__); - } else if (!ggml_backend_metal_supports_family(backend_gpu, 7)) { - SENSEVOICE_LOG_ERROR( - "%s: Metal GPU does not support family 7 - falling back to CPU\n", - __func__); - ggml_backend_free(backend_gpu); - backend_gpu = nullptr; - } - } -#endif - -#ifdef GGML_USE_SYCL - if (params.use_gpu) { - SENSE_VOICE_LOG_INFO("%s: using SYCL backend\n", __func__); - backend_gpu = ggml_backend_sycl_init(params.gpu_device); - if (!backend_gpu) { - SENSE_VOICE_LOG_ERROR("%s: ggml_backend_sycl_init() failed\n", __func__); - } - } -#endif - - if (backend_gpu) { - return backend_gpu; - } - return ggml_backend_cpu_init(); -} - static bool ggml_graph_compute_helper( ggml_backend_sched_t sched, @@ -148,16 +78,16 @@ static bool ggml_graph_compute_helper( for (int i = 0; i < ggml_backend_sched_get_n_backends(sched); ++i) { ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i); - if (ggml_backend_is_cpu(backend)) { - ggml_backend_cpu_set_n_threads(backend, n_threads); - } -#ifdef GGML_USE_BLAS - if (ggml_backend_is_blas(backend)) { - ggml_backend_blas_set_n_threads(backend, n_threads); + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + + auto * fn_set_n_threads = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (fn_set_n_threads) { + fn_set_n_threads(backend, n_threads); } -#endif } + bool t = ggml_backend_sched_graph_compute(sched, graph) == GGML_STATUS_SUCCESS; ggml_backend_sched_reset(sched); return t; @@ -448,8 +378,6 @@ bool sense_voice_encode_internal(sense_voice_context &ctx, const int n_threads) { const int64_t t_start_us = ggml_time_us(); - const auto &model = ctx.model; - // encoder { diff --git a/sense-voice/csrc/sense-voice.cc b/sense-voice/csrc/sense-voice.cc index 74d0763..9a33483 100644 --- a/sense-voice/csrc/sense-voice.cc +++ b/sense-voice/csrc/sense-voice.cc @@ -6,32 +6,9 @@ #include "sense-voice-decoder.h" #include "sense-voice-cmvn.h" #include "common.h" - #include -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif #include #include -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef GGML_USE_BLAS -#include "ggml-blas.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif #define SENSE_VOICE_MAX_NODES 8192 #define SENSE_VOICE_MAX_DECODERS 8 @@ -64,91 +41,34 @@ const char * sense_voice_lang_str(int id) { } static ggml_backend_buffer_type_t sense_voice_default_buffer_type(const sense_voice_context_params & params) { - ggml_backend_buffer_type_t result = nullptr; - - params.use_gpu || (result = ggml_backend_cpu_buffer_type()); - -#ifdef GGML_USE_CUDA - result || (result = ggml_backend_cuda_buffer_type(params.gpu_device)); -#endif - -#ifdef GGML_USE_METAL - result || (result = ggml_backend_metal_buffer_type()); -#endif - -#ifdef GGML_USE_SYCL - result || (result = ggml_backend_sycl_buffer_type(params.gpu_device)); -#endif - -#ifdef GGML_USE_VULKAN - result || (result = ggml_backend_vk_buffer_type(params.gpu_device)); -#endif - -#ifdef GGML_USE_CANN - result || (result == ggml_backend_cann_buffer_type(params.gpu_device)); -#endif - - result || (result = ggml_backend_cpu_buffer_type()); + if (!params.use_gpu) { + return ggml_backend_cpu_buffer_type(); + } + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + SENSE_VOICE_LOG_INFO("%s: using device %s (%s)\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev)); + return ggml_backend_dev_buffer_type(dev); + } + } - return result; + return ggml_backend_cpu_buffer_type(); } static ggml_backend_t sense_voice_backend_init_gpu(const sense_voice_context_params & params) { ggml_backend_t result = nullptr; -#ifdef GGML_USE_CUDA - if (params.use_gpu) { - SENSE_VOICE_LOG_INFO("%s: using CUDA backend\n", __func__); - result = ggml_backend_cuda_init(params.gpu_device); - if (!result) { - SENSE_VOICE_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__); - } - } -#endif - -#ifdef GGML_USE_METAL - if (params.use_gpu) { - SENSE_VOICE_LOG_INFO("%s: using Metal backend\n", __func__); - result = ggml_backend_metal_init(); - if (!result) { - SENSE_VOICE_LOG_ERROR("%s: ggml_backend_metal_init() failed\n", __func__); - } else if (!ggml_backend_metal_supports_family(result, 7)) { - SENSE_VOICE_LOG_ERROR("%s: Metal GPU does not support family 7 - falling back to CPU\n", __func__); - ggml_backend_free(result); - result = nullptr; - } - } -#endif - -#ifdef GGML_USE_SYCL - if (params.use_gpu) { - SENSE_VOICE_LOG_INFO("%s: using SYCL backend\n", __func__); - result = ggml_backend_sycl_init(params.gpu_device); - if (!result) { - SENSE_VOICE_LOG_ERROR("%s: ggml_backend_sycl_init() failed\n", __func__); - } - } -#endif - -#ifdef GGML_USE_CANN - if (params.use_gpu) { - WHISPER_LOG_INFO("%s: using CANN backend\n", __func__); - result = ggml_backend_cann_init(params.gpu_device); - if (!result) { - WHISPER_LOG_ERROR("%s: ggml_backend_cann_init() failed\n", __func__); - } - } -#endif - -#ifdef GGML_USE_VULKAN - if (params.use_gpu) { - SENSE_VOICE_LOG_INFO("%s: using Vulkan backend\n", __func__); - result = ggml_backend_vk_init(params.gpu_device); - if (!result) { - SENSE_VOICE_LOG_ERROR("%s: ggml_backend_vk_init() failed\n", __func__); + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + SENSE_VOICE_LOG_INFO("%s: using %s backend\n", __func__, ggml_backend_dev_name(dev)); + ggml_backend_t result = ggml_backend_dev_init(dev, nullptr); + if (!result) { + SENSE_VOICE_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + } + return result; } } -#endif return result; } @@ -286,15 +206,8 @@ bool sense_voice_model_load(const char *path_model, sense_voice_context &sctx) { vocab.n_vocab = sense_voice.hparams.n_vocab; - size_t ctx_size = 0; - - const ggml_type wtype = sctx.wtype; - const ggml_type vtype = - sctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv - { - const auto &hparams = sense_voice.hparams; // initialize all memory buffers // always have at least one decoder @@ -444,6 +357,9 @@ struct sense_voice_context *sense_voice_init_with_params_no_state( SENSE_VOICE_LOG_INFO("%s: use gpu = %d\n", __func__, params.use_gpu); SENSE_VOICE_LOG_INFO("%s: flash attn = %d\n", __func__, params.flash_attn); SENSE_VOICE_LOG_INFO("%s: gpu_device = %d\n", __func__, params.gpu_device); + SENSE_VOICE_LOG_INFO("%s: devices = %zu\n", __func__, ggml_backend_dev_count()); + SENSE_VOICE_LOG_INFO("%s: backends = %zu\n", __func__, ggml_backend_reg_count()); + auto *ctx = new struct sense_voice_context; ctx->params = params; @@ -481,22 +397,21 @@ static std::vector sense_voice_backend_init( result.push_back(backend_gpu); } -#ifdef GGML_USE_BLAS - { - SENSE_VOICE_LOG_INFO("%s: using BLAS backend\n", __func__); - ggml_backend_t backend_blas = ggml_backend_blas_init(); - if (!backend_blas) { - SENSE_VOICE_LOG_ERROR("%s: ggml_backend_blas_init() failed\n", __func__); - } else { - result.push_back(backend_blas); + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + SENSE_VOICE_LOG_INFO("%s: using %s backend\n", __func__, ggml_backend_dev_name(dev)); + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (!backend) { + SENSE_VOICE_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + continue; + } + result.push_back(backend); } } -#endif GGML_UNUSED(params); - result.push_back(ggml_backend_cpu_init()); - return result; } diff --git a/sense-voice/csrc/third-party/ggml b/sense-voice/csrc/third-party/ggml index d51c6c0..6fcbd60 160000 --- a/sense-voice/csrc/third-party/ggml +++ b/sense-voice/csrc/third-party/ggml @@ -1 +1 @@ -Subproject commit d51c6c04d1e78775637b077fbcaf7df899ea12d7 +Subproject commit 6fcbd60bc72ac3f7ad43f78c87e535f2e6206f58