diff --git a/.github/workflows/build_and_deploy.yml b/.github/workflows/build_and_deploy.yml
index 793c71efa..ab54792e1 100644
--- a/.github/workflows/build_and_deploy.yml
+++ b/.github/workflows/build_and_deploy.yml
@@ -154,8 +154,9 @@ jobs:
        run: |
          bash build_util/codesign.bash "artifact/${{ env.ASSET_NAME }}/voicevox_core.dll"
        env:
-          CERT_BASE64: ${{ secrets.CERT_BASE64 }}
-          CERT_PASSWORD: ${{ secrets.CERT_PASSWORD }}
+          ESIGNERCKA_USERNAME: ${{ secrets.ESIGNERCKA_USERNAME }}
+          ESIGNERCKA_PASSWORD: ${{ secrets.ESIGNERCKA_PASSWORD }}
+          ESIGNERCKA_TOTP_SECRET: ${{ secrets.ESIGNERCKA_TOTP_SECRET }}
      - name: Archive artifact
        shell: bash
        run: |
@@ -241,8 +242,9 @@ jobs:
        run: |
          bash build_util/codesign.bash ./${{ matrix.name }}
        env:
-          CERT_BASE64: ${{ secrets.CERT_BASE64 }}
-          CERT_PASSWORD: ${{ secrets.CERT_PASSWORD }}
+          ESIGNERCKA_USERNAME: ${{ secrets.ESIGNERCKA_USERNAME }}
+          ESIGNERCKA_PASSWORD: ${{ secrets.ESIGNERCKA_PASSWORD }}
+          ESIGNERCKA_TOTP_SECRET: ${{ secrets.ESIGNERCKA_TOTP_SECRET }}
      - name: Upload to Release
        if: env.VERSION != 'DEBUG' && env.SKIP_UPLOADING_RELEASE_ASSET == '0'
        uses: softprops/action-gh-release@v1
diff --git a/_typos.toml b/_typos.toml
index 836b1d79a..fcf98cecd 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -8,4 +8,4 @@ NdArray="NdArray" # onnxruntime::session::NdArray
 [default.extend-words]
 
 [files]
-extend-exclude = ["*.svg"]
+extend-exclude = ["*.svg", "*.onnx"]
-d "$INSTALL_DIR" ]; then + curl -LO "https://github.com/SSLcom/eSignerCKA/releases/download/v1.0.6/SSL.COM-eSigner-CKA_1.0.6.zip" + unzip -o SSL.COM-eSigner-CKA_1.0.6.zip + mv ./*eSigner*CKA_*.exe eSigner_CKA_Installer.exe + powershell " + & ./eSigner_CKA_Installer.exe /CURRENTUSER /VERYSILENT /SUPPRESSMSGBOXES /DIR='$INSTALL_DIR' | Out-Null + & '$INSTALL_DIR\eSignerCKATool.exe' config -mode product -user '$ESIGNERCKA_USERNAME' -pass '$ESIGNERCKA_PASSWORD' -totp '$ESIGNERCKA_TOTP_SECRET' -key '$INSTALL_DIR\master.key' -r + & '$INSTALL_DIR\eSignerCKATool.exe' unload + " + rm SSL.COM-eSigner-CKA_1.0.6.zip eSigner_CKA_Installer.exe +fi + +# 証明書を読み込む +powershell "& '$INSTALL_DIR\eSignerCKATool.exe' load" + +# shellcheck disable=SC2016 +THUMBPRINT=$( + powershell ' + $CodeSigningCert = Get-ChildItem Cert:\CurrentUser\My -CodeSigningCert | Select-Object -First 1 + echo "$($CodeSigningCert.Thumbprint)" + ' +) # 指定ファイルに署名する function codesign() { TARGET="$1" - SIGNTOOL=$(find "C:/Program Files (x86)/Windows Kits/10/App Certification Kit" -name "signtool.exe" | sort -V | tail -n 1) - powershell "& '$SIGNTOOL' sign /fd SHA256 /td SHA256 /tr http://timestamp.digicert.com /f $CERT_PATH /p $CERT_PASSWORD '$TARGET'" + # shellcheck disable=SC2012 + SIGNTOOL=$(ls "C:/Program Files (x86)/Windows Kits/"10/bin/*/x86/signtool.exe | sort -V | tail -n 1) # なぜかこれじゃないと動かない + powershell "& '$SIGNTOOL' sign /fd SHA256 /td SHA256 /tr http://timestamp.digicert.com /sha1 '$THUMBPRINT' '$TARGET'" } # 指定ファイルが署名されているか function is_signed() { TARGET="$1" SIGNTOOL=$(find "C:/Program Files (x86)/Windows Kits/10/App Certification Kit" -name "signtool.exe" | sort -V | tail -n 1) - powershell "& '$SIGNTOOL' verify /pa '$TARGET'" || return 1 + powershell "& '$SIGNTOOL' verify /pa '$TARGET'" >/dev/null 2>&1 || return 1 } # 署名されていなければ署名 @@ -42,10 +70,10 @@ ls $target_file_glob | while read -r target_file; do if is_signed "$target_file"; then echo "署名済み: $target_file" else - echo "署名: $target_file" + echo "署名開始: $target_file" codesign "$target_file" fi done -# 証明書を消去 -rm $CERT_PATH +# 証明書を破棄 +powershell "& '$INSTALL_DIR\eSignerCKATool.exe' unload" diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs index 3835b678d..14b53fb6a 100644 --- a/crates/voicevox_core/src/error.rs +++ b/crates/voicevox_core/src/error.rs @@ -55,6 +55,9 @@ pub enum Error { )] InvalidModelIndex { model_index: usize }, + #[error("{}", base_error_message(VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR))] + UnsupportedModel, + #[error("{}", base_error_message(VOICEVOX_RESULT_INFERENCE_ERROR))] InferenceFailed, diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs index 36d43169c..545e97695 100644 --- a/crates/voicevox_core/src/publish.rs +++ b/crates/voicevox_core/src/publish.rs @@ -157,6 +157,53 @@ impl VoicevoxCore { ) } + pub fn predict_sing_consonant_length( + &mut self, + consonant: &[i64], + vowel: &[i64], + note_duration: &[i64], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_consonant_length(consonant, vowel, note_duration, speaker_id) + } + + pub fn predict_sing_f0( + &mut self, + phoneme: &[i64], + note: &[i64], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + .predict_sing_f0(phoneme, note, speaker_id) + } + + pub fn predict_sing_volume( + &mut self, + phoneme: &[i64], + note: &[i64], + f0: &[f32], + speaker_id: u32, + ) -> Result> { + self.synthesis_engine + .inference_core_mut() + 
diff --git a/crates/voicevox_core/src/publish.rs b/crates/voicevox_core/src/publish.rs
index 36d43169c..545e97695 100644
--- a/crates/voicevox_core/src/publish.rs
+++ b/crates/voicevox_core/src/publish.rs
@@ -157,6 +157,53 @@ impl VoicevoxCore {
         )
     }
 
+    pub fn predict_sing_consonant_length(
+        &mut self,
+        consonant: &[i64],
+        vowel: &[i64],
+        note_duration: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<i64>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .predict_sing_consonant_length(consonant, vowel, note_duration, speaker_id)
+    }
+
+    pub fn predict_sing_f0(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .predict_sing_f0(phoneme, note, speaker_id)
+    }
+
+    pub fn predict_sing_volume(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        f0: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .predict_sing_volume(phoneme, note, f0, speaker_id)
+    }
+
+    pub fn sf_decode(
+        &mut self,
+        phoneme: &[i64],
+        f0: &[f32],
+        volume: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        self.synthesis_engine
+            .inference_core_mut()
+            .sf_decode(phoneme, f0, volume, speaker_id)
+    }
+
     pub fn audio_query(
         &mut self,
         text: &str,
@@ -282,8 +329,14 @@ impl InferenceCore {
             status.load_metas()?;
 
             if load_all_models {
-                for model_index in 0..MODEL_FILE_SET.models_count() {
-                    status.load_model(model_index)?;
+                for model_index in 0..MODEL_FILE_SET.talk_models_count() {
+                    status.load_talk_model(model_index)?;
+                }
+                for model_index in 0..MODEL_FILE_SET.sing_teacher_models_count() {
+                    status.load_sing_teacher_model(model_index)?;
+                }
+                for model_index in 0..MODEL_FILE_SET.sf_decode_models_count() {
+                    status.load_sf_decode_model(model_index)?;
                 }
             }
 
@@ -311,10 +364,28 @@ impl InferenceCore {
                 .status_option
                 .as_mut()
                 .ok_or(Error::UninitializedStatus)?;
-            if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) {
-                status.load_model(model_index)
+            if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) {
+                status.load_talk_model(model_index)
             } else {
-                Err(Error::InvalidSpeakerId { speaker_id })
+                // The humming (sf_decode) and singing (sing_teacher) models may each be
+                // missing, so ignore whichever of the two does not exist
+                let mut loaded = false;
+                if let Some((model_index, _)) =
+                    get_sing_teacher_model_index_and_speaker_id(speaker_id)
+                {
+                    status.load_sing_teacher_model(model_index)?;
+                    loaded = true;
+                }
+                if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id)
+                {
+                    status.load_sf_decode_model(model_index)?;
+                    loaded = true;
+                }
+
+                if loaded {
+                    Ok(())
+                } else {
+                    Err(Error::InvalidSpeakerId { speaker_id })
+                }
             }
         } else {
             Err(Error::UninitializedStatus)
@@ -322,10 +393,29 @@ impl InferenceCore {
         }
     }
 
     pub fn is_model_loaded(&self, speaker_id: u32) -> bool {
         if let Some(status) = self.status_option.as_ref() {
-            if let Some((model_index, _)) = get_model_index_and_speaker_id(speaker_id) {
-                status.is_model_loaded(model_index)
+            if let Some((model_index, _)) = get_talk_model_index_and_speaker_id(speaker_id) {
+                status.is_talk_model_loaded(model_index)
             } else {
-                false
+                // The humming (sf_decode) and singing (sing_teacher) models may each be
+                // missing, so ignore whichever of the two does not exist
+                let mut loaded = true;
+                let mut model_found = false;
+                if let Some((model_index, _)) =
+                    get_sing_teacher_model_index_and_speaker_id(speaker_id)
+                {
+                    loaded &= status.is_sing_teacher_model_loaded(model_index);
+                    model_found = true;
+                }
+                if let Some((model_index, _)) = get_sf_decode_model_index_and_speaker_id(speaker_id)
+                {
+                    loaded &= status.is_sf_decode_model_loaded(model_index);
+                    model_found = true;
+                }
+                // When neither the humming nor the singing model exists, set loaded = false
+                if !model_found {
+                    // FIXME: emit a warning, or turn this into an error
+                    loaded = false
+                }
+                loaded
             }
         } else {
             false
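
Note: the two hunks above are the core of the speaker-id dispatch: a talk id
resolves through one map, while a sing id may resolve through the
singing-teacher map, the sf_decode map, or both, and only a miss in every map
is an error. A condensed, self-contained sketch of the load path (plain
BTreeMaps stand in for MODEL_FILE_SET, and loading is reduced to a print):

    use std::collections::BTreeMap;

    struct SpeakerMaps {
        talk: BTreeMap<u32, (usize, u32)>,
        sing_teacher: BTreeMap<u32, (usize, u32)>,
        sf_decode: BTreeMap<u32, (usize, u32)>,
    }

    fn load_model_for(maps: &SpeakerMaps, speaker_id: u32) -> Result<(), String> {
        if let Some(&(model_index, _)) = maps.talk.get(&speaker_id) {
            println!("loading talk model {model_index}");
            return Ok(());
        }
        // Either sing-related model kind may be absent for a given id;
        // loading succeeds as long as at least one is found.
        let mut loaded = false;
        if let Some(&(model_index, _)) = maps.sing_teacher.get(&speaker_id) {
            println!("loading sing teacher model {model_index}");
            loaded = true;
        }
        if let Some(&(model_index, _)) = maps.sf_decode.get(&speaker_id) {
            println!("loading sf_decode model {model_index}");
            loaded = true;
        }
        if loaded {
            Ok(())
        } else {
            Err(format!("invalid speaker_id: {speaker_id}"))
        }
    }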
@@ -354,14 +444,15 @@ impl InferenceCore {
             return Err(Error::InvalidSpeakerId { speaker_id });
         }
 
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_talk_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
 
-        if model_index >= MODEL_FILE_SET.models_count() {
+        if model_index >= MODEL_FILE_SET.talk_models_count() {
             return Err(Error::InvalidModelIndex { model_index });
         }
 
         let input_tensors: Vec<&mut dyn AnyArray> =
             vec![&mut phoneme_vector_array, &mut speaker_id_array];
 
-        let mut output = status.predict_duration_session_run(model_index, input_tensors)?;
+        let mut duration = status.predict_duration_session_run(model_index, input_tensors)?;
 
-        for output_item in output.iter_mut() {
-            if *output_item < PHONEME_LENGTH_MINIMAL {
-                *output_item = PHONEME_LENGTH_MINIMAL;
+        for duration_item in duration.iter_mut() {
+            if *duration_item < PHONEME_LENGTH_MINIMAL {
+                *duration_item = PHONEME_LENGTH_MINIMAL;
             }
         }
 
-        Ok(output)
+        Ok(duration)
     }
 
     #[allow(clippy::too_many_arguments)]
@@ -407,14 +498,15 @@ impl InferenceCore {
             return Err(Error::InvalidSpeakerId { speaker_id });
         }
 
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_talk_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
 
-        if model_index >= MODEL_FILE_SET.models_count() {
+        if model_index >= MODEL_FILE_SET.talk_models_count() {
             return Err(Error::InvalidModelIndex { model_index });
         }
 
@@ -465,14 +557,15 @@ impl InferenceCore {
             return Err(Error::InvalidSpeakerId { speaker_id });
         }
 
-        let (model_index, speaker_id) =
-            if let Some((model_index, speaker_id)) = get_model_index_and_speaker_id(speaker_id) {
-                (model_index, speaker_id)
-            } else {
-                return Err(Error::InvalidSpeakerId { speaker_id });
-            };
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_talk_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
 
-        if model_index >= MODEL_FILE_SET.models_count() {
+        if model_index >= MODEL_FILE_SET.talk_models_count() {
             return Err(Error::InvalidModelIndex { model_index });
         }
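
Note: every predict_sing_* function added below shapes its inputs the same
way: a 1-D slice of length N becomes a [1, N] tensor, i.e. a batch of size
one, before being handed to the session. A small sketch of that reshape using
the same ndarray calls (batch_of_one is a hypothetical helper; the diff
inlines the pattern at each call site):

    fn batch_of_one(values: &[i64]) -> ndarray::Array2<i64> {
        ndarray::arr1(values)
            .into_shape([1, values.len()])
            .unwrap() // infallible here: 1 * N always matches the element count
    }

    fn main() {
        let phoneme = [0i64, 37, 14, 35, 6, 37, 30, 0];
        assert_eq!(batch_of_one(&phoneme).shape(), &[1, 8]);
    }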
@@ -512,6 +605,210 @@ impl InferenceCore {
             .map(|output| Self::trim_padding_from_output(output, padding_size))
     }
 
+    pub fn predict_sing_consonant_length(
+        &mut self,
+        consonant: &[i64],
+        vowel: &[i64],
+        note_duration: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<i64>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sing_teacher_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sing_teacher_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        let mut consonant_array = NdArray::new(
+            ndarray::arr1(consonant)
+                .into_shape([1, consonant.len()])
+                .unwrap(),
+        );
+        let mut vowel_array =
+            NdArray::new(ndarray::arr1(vowel).into_shape([1, vowel.len()]).unwrap());
+        let mut note_duration_array = NdArray::new(
+            ndarray::arr1(note_duration)
+                .into_shape([1, note_duration.len()])
+                .unwrap(),
+        );
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> = vec![
+            &mut consonant_array,
+            &mut vowel_array,
+            &mut note_duration_array,
+            &mut speaker_id_array,
+        ];
+
+        status.predict_sing_consonant_length_session_run(model_index, input_tensors)
+    }
+
+    pub fn predict_sing_f0(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sing_teacher_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sing_teacher_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        let mut phoneme_array = NdArray::new(
+            ndarray::arr1(phoneme)
+                .into_shape([1, phoneme.len()])
+                .unwrap(),
+        );
+        let mut note_array = NdArray::new(ndarray::arr1(note).into_shape([1, note.len()]).unwrap());
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> =
+            vec![&mut phoneme_array, &mut note_array, &mut speaker_id_array];
+
+        status.predict_sing_f0_session_run(model_index, input_tensors)
+    }
+
+    pub fn predict_sing_volume(
+        &mut self,
+        phoneme: &[i64],
+        note: &[i64],
+        f0: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sing_teacher_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sing_teacher_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        let mut phoneme_array = NdArray::new(
+            ndarray::arr1(phoneme)
+                .into_shape([1, phoneme.len()])
+                .unwrap(),
+        );
+        let mut note_array = NdArray::new(ndarray::arr1(note).into_shape([1, note.len()]).unwrap());
+        let mut f0_array = NdArray::new(ndarray::arr1(f0).into_shape([1, f0.len()]).unwrap());
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> = vec![
+            &mut phoneme_array,
+            &mut note_array,
+            &mut f0_array,
+            &mut speaker_id_array,
+        ];
+
+        status.predict_sing_volume_session_run(model_index, input_tensors)
+    }
+
+    pub fn sf_decode(
+        &mut self,
+        phoneme: &[i64],
+        f0: &[f32],
+        volume: &[f32],
+        speaker_id: u32,
+    ) -> Result<Vec<f32>> {
+        if !self.initialized {
+            return Err(Error::UninitializedStatus);
+        }
+
+        let status = self
+            .status_option
+            .as_mut()
+            .ok_or(Error::UninitializedStatus)?;
+
+        if !status.validate_speaker_id(speaker_id) {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        }
+
+        let (model_index, speaker_id) = if let Some((model_index, speaker_id)) =
+            get_sf_decode_model_index_and_speaker_id(speaker_id)
+        {
+            (model_index, speaker_id)
+        } else {
+            return Err(Error::InvalidSpeakerId { speaker_id });
+        };
+
+        if model_index >= MODEL_FILE_SET.sf_decode_models_count() {
+            return Err(Error::InvalidModelIndex { model_index });
+        }
+
+        let mut phoneme_array = NdArray::new(
+            ndarray::arr1(phoneme)
+                .into_shape([1, phoneme.len()])
+                .unwrap(),
+        );
+        let mut f0_array = NdArray::new(ndarray::arr1(f0).into_shape([1, f0.len()]).unwrap());
+        let mut volume_array =
+            NdArray::new(ndarray::arr1(volume).into_shape([1, volume.len()]).unwrap());
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[speaker_id as i64]));
+
+        let input_tensors: Vec<&mut dyn AnyArray> = vec![
+            &mut phoneme_array,
+            &mut f0_array,
+            &mut volume_array,
+            &mut speaker_id_array,
+        ];
+
+        status.sf_decode_session_run(model_index, input_tensors)
+    }
+
     fn make_f0_with_padding(
         f0_slice: &[f32],
         length_with_padding: usize,
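
Note: together these entry points form a two-stage singing pipeline: the
teacher models emit per-phoneme f0 and volume, and sf_decode renders
frame-level inputs into waveform samples (256 per frame, as the tests below
assert). A hedged usage sketch of the teacher stage only -- the expansion from
per-phoneme to frame-level data before sf_decode is elided, and the speaker
ids 6000/3000 are the dummy ones from metas.json, not real voice ids:

    fn sing_teacher_stage(core: &mut VoicevoxCore) -> Result<(Vec<f32>, Vec<f32>)> {
        // Inputs as in the tests below.
        let phoneme = [0i64, 37, 14, 35, 6, 37, 30, 0];
        let note = [0i64, 30, 30, 40, 40, 50, 50, 0];
        let f0 = core.predict_sing_f0(&phoneme, &note, 6000)?;
        let volume = core.predict_sing_volume(&phoneme, &note, &f0, 6000)?;
        // Frame-level phoneme/f0/volume would then go to core.sf_decode(.., 3000).
        Ok((f0, volume))
    }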
@@ -575,8 +872,22 @@ pub static SUPPORTED_DEVICES: Lazy<SupportedDevices> =
 pub static SUPPORTED_DEVICES_CSTRING: Lazy<CString> =
     Lazy::new(|| CString::new(SUPPORTED_DEVICES.to_json().to_string()).unwrap());
 
-fn get_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
-    MODEL_FILE_SET.speaker_id_map.get(&speaker_id).copied()
+fn get_talk_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
+    MODEL_FILE_SET.talk_speaker_id_map.get(&speaker_id).copied()
+}
+
+fn get_sing_teacher_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
+    MODEL_FILE_SET
+        .sing_teacher_speaker_id_map
+        .get(&speaker_id)
+        .copied()
+}
+
+fn get_sf_decode_model_index_and_speaker_id(speaker_id: u32) -> Option<(usize, u32)> {
+    MODEL_FILE_SET
+        .sf_decode_speaker_id_map
+        .get(&speaker_id)
+        .copied()
 }
 
 pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str {
@@ -598,6 +909,7 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'static str {
         VOICEVOX_RESULT_UNINITIALIZED_STATUS_ERROR => "Status is not initialized\0",
         VOICEVOX_RESULT_INVALID_SPEAKER_ID_ERROR => "Invalid speaker_id\0",
         VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "Invalid model_index\0",
+        VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR => "Unsupported model\0",
         VOICEVOX_RESULT_INFERENCE_ERROR => "Inference failed\0",
         VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
             "Failed to extract full-context labels from the input text\0"
@@ -784,7 +1096,7 @@ mod tests {
         #[case] speaker_id: u32,
         #[case] expected: Option<(usize, u32)>,
     ) {
-        let actual = get_model_index_and_speaker_id(speaker_id);
+        let actual = get_talk_model_index_and_speaker_id(speaker_id);
         assert_eq!(expected, actual);
     }
 
@@ -852,6 +1164,8 @@ mod tests {
         assert_eq!(result.unwrap().len(), vowel_phoneme_vector.len());
     }
 
+    // TODO: add tests for the sing-related APIs
+
     #[rstest]
     fn decode_works() {
         let internal = VoicevoxCore::new_with_mutex();
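
Note: the three get_*_model_index_and_speaker_id helpers above all wrap the
same lookup: a global style id maps to a (model_index, model-local speaker id)
pair, and a miss in one map falls through to the next model kind. A
self-contained sketch with data mirroring model_file.rs further down:

    use std::collections::BTreeMap;

    fn main() {
        let talk: BTreeMap<u32, (usize, u32)> = [(0, (0, 0)), (1, (0, 1))].into();
        let sing_teacher: BTreeMap<u32, (usize, u32)> = [(6000, (0, 0))].into();

        assert_eq!(talk.get(&1).copied(), Some((0, 1)));
        assert_eq!(talk.get(&6000).copied(), None); // not a talk id...
        assert_eq!(sing_teacher.get(&6000).copied(), Some((0, 0))); // ...but a sing one
    }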
@@ -896,6 +1210,114 @@
         assert_eq!(result.unwrap().len(), F0_LENGTH * 256);
     }
 
+    #[rstest]
+    fn predict_sing_f0_works() {
+        let internal = VoicevoxCore::new_with_mutex();
+        internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions {
+                load_all_models: true,
+                acceleration_mode: AccelerationMode::Cpu,
+                ..Default::default()
+            })
+            .unwrap();
+
+        // Inputs corresponding to the text "テスト"
+        let phoneme_vector = [0, 37, 14, 35, 6, 37, 30, 0];
+        let note_vector = [0, 30, 30, 40, 40, 50, 50, 0];
+
+        let sing_teacher_speaker_id = 6000;
+        let result = internal.lock().unwrap().predict_sing_f0(
+            &phoneme_vector,
+            &note_vector,
+            sing_teacher_speaker_id,
+        );
+
+        assert!(result.is_ok(), "{result:?}");
+        assert_eq!(result.unwrap().len(), phoneme_vector.len());
+    }
+
+    #[rstest]
+    fn predict_sing_volume_works() {
+        let internal = VoicevoxCore::new_with_mutex();
+        internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions {
+                load_all_models: true,
+                acceleration_mode: AccelerationMode::Cpu,
+                ..Default::default()
+            })
+            .unwrap();
+
+        // Inputs corresponding to the text "テスト"
+        let phoneme_vector = [0, 37, 14, 35, 6, 37, 30, 0];
+        let note_vector = [0, 30, 30, 40, 40, 50, 50, 0];
+        let f0_vector = [0., 5.905218, 5.905218, 0., 0., 5.565851, 5.565851, 0.];
+
+        let sing_teacher_speaker_id = 6000;
+        let result = internal.lock().unwrap().predict_sing_volume(
+            &phoneme_vector,
+            &note_vector,
+            &f0_vector,
+            sing_teacher_speaker_id,
+        );
+
+        assert!(result.is_ok(), "{result:?}");
+        assert_eq!(result.unwrap().len(), phoneme_vector.len());
+    }
+
+    #[rstest]
+    fn sf_decode_works() {
+        let internal = VoicevoxCore::new_with_mutex();
+        internal
+            .lock()
+            .unwrap()
+            .initialize(InitializeOptions {
+                acceleration_mode: AccelerationMode::Cpu,
+                load_all_models: true,
+                ..Default::default()
+            })
+            .unwrap();
+
+        // Inputs corresponding to the text "テスト"
+        const F0_LENGTH: usize = 69;
+        let mut f0 = [0.; F0_LENGTH];
+        f0[9..24].fill(5.905218);
+        f0[37..60].fill(5.565851);
+
+        let mut volume = [0.; F0_LENGTH];
+        volume[9..24].fill(0.5);
+        volume[24..37].fill(0.2);
+        volume[37..60].fill(1.0);
+
+        let mut phoneme = [0; F0_LENGTH];
+        let mut set_one = |index, range| {
+            for i in range {
+                phoneme[i] = index;
+            }
+        };
+        set_one(0, 0..9);
+        set_one(37, 9..13);
+        set_one(14, 13..24);
+        set_one(35, 24..30);
+        set_one(6, 30..37);
+        set_one(37, 37..45);
+        set_one(30, 45..60);
+        set_one(0, 60..69);
+
+        let sf_decode_speaker_id = 3000;
+        let result = internal
+            .lock()
+            .unwrap()
+            .sf_decode(&phoneme, &f0, &volume, sf_decode_speaker_id);
+
+        assert!(result.is_ok(), "{result:?}");
+        assert_eq!(result.unwrap().len(), F0_LENGTH * 256);
+    }
+
     type TextConsonantVowelData =
         [(&'static [(&'static str, &'static str, &'static str)], usize)];
diff --git a/crates/voicevox_core/src/result_code.rs b/crates/voicevox_core/src/result_code.rs
index bac43f2c9..1eefa6321 100644
--- a/crates/voicevox_core/src/result_code.rs
+++ b/crates/voicevox_core/src/result_code.rs
@@ -23,6 +23,8 @@ pub enum VoicevoxResultCode {
     VOICEVOX_RESULT_INVALID_SPEAKER_ID_ERROR = 7,
     /// An invalid model_index was specified
     VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 8,
+    /// An unsupported model was specified
+    VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR = 15,
     /// Inference failed
     VOICEVOX_RESULT_INFERENCE_ERROR = 9,
     /// Failed to output context labels
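
Note: the new result code is appended with the next free value, 15, instead of
renumbering, so existing values stay stable for C-API callers -- which is why
it reads out of numeric order in the enum. A sketch of the idea (abbreviated
enum; values 9 through 14 are presumably taken further down the real one):

    #[allow(non_camel_case_types)]
    #[repr(i32)]
    pub enum VoicevoxResultCode {
        VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 8,
        VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR = 15, // new; next unused value
        VOICEVOX_RESULT_INFERENCE_ERROR = 9,
    }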
diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs
index 309d060b8..b76dad06d 100644
--- a/crates/voicevox_core/src/status.rs
+++ b/crates/voicevox_core/src/status.rs
@@ -31,18 +31,30 @@ pub(crate) static MODEL_FILE_SET: Lazy<ModelFileSet> = Lazy::new(|| {
 });
 
 pub struct Status {
-    models: StatusModels,
+    talk_models: StatusTalkModels,
+    sing_teacher_models: StatusSingTeacherModels,
+    sf_decode_models: StatusSfModels,
     light_session_options: SessionOptions, // the light models use these
     heavy_session_options: SessionOptions, // the heavy models use these
     supported_styles: BTreeSet<u32>,
 }
 
-struct StatusModels {
+struct StatusTalkModels {
     predict_duration: BTreeMap<usize, Session<'static>>,
     predict_intonation: BTreeMap<usize, Session<'static>>,
     decode: BTreeMap<usize, Session<'static>>,
 }
 
+struct StatusSingTeacherModels {
+    predict_sing_consonant_length: BTreeMap<usize, Session<'static>>,
+    predict_sing_f0: BTreeMap<usize, Session<'static>>,
+    predict_sing_volume: BTreeMap<usize, Session<'static>>,
+}
+
+struct StatusSfModels {
+    sf_decode: BTreeMap<usize, Session<'static>>,
+}
+
 #[derive(new, Getters)]
 struct SessionOptions {
     cpu_num_threads: u16,
@@ -50,9 +62,13 @@ struct SessionOptions {
 }
 
 pub(crate) struct ModelFileSet {
-    pub(crate) speaker_id_map: BTreeMap<u32, (usize, u32)>,
+    pub(crate) talk_speaker_id_map: BTreeMap<u32, (usize, u32)>,
+    pub(crate) sing_teacher_speaker_id_map: BTreeMap<u32, (usize, u32)>,
+    pub(crate) sf_decode_speaker_id_map: BTreeMap<u32, (usize, u32)>,
     pub(crate) metas_str: String,
-    models: Vec<Model>,
+    talk_models: Vec<TalkModel>,
+    sing_teacher_models: Vec<SingTeacherModel>,
+    sf_decode_models: Vec<SfDecodeModel>,
 }
 
 impl ModelFileSet {
@@ -76,18 +92,18 @@ impl ModelFileSet {
         let metas_str = fs_err::read_to_string(path("metas.json"))?;
 
-        let models = model_file::MODEL_FILE_NAMES
+        let talk_models = model_file::TALK_MODEL_FILE_NAMES
             .iter()
             .map(
-                |&ModelFileNames {
+                |&TalkModelFileNames {
                      predict_duration_model,
                      predict_intonation_model,
                      decode_model,
                  }| {
-                    let predict_duration_model = ModelFile::new(&path(predict_duration_model))?;
-                    let predict_intonation_model = ModelFile::new(&path(predict_intonation_model))?;
-                    let decode_model = ModelFile::new(&path(decode_model))?;
-                    Ok(Model {
+                    let predict_duration_model = path(predict_duration_model);
+                    let predict_intonation_model = path(predict_intonation_model);
+                    let decode_model = path(decode_model);
+                    Ok(TalkModel {
                         predict_duration_model,
                         predict_intonation_model,
                         decode_model,
@@ -96,49 +112,101 @@ impl ModelFileSet {
             )
             .collect::<anyhow::Result<_>>()?;
 
+        let sing_teacher_models = model_file::SING_TEACHER_MODEL_FILE_NAMES
+            .iter()
+            .map(
+                |&SingTeacherModelFileNames {
+                     predict_sing_consonant_length_model,
+                     predict_sing_f0_model,
+                     predict_sing_volume_model,
+                 }| {
+                    let predict_sing_consonant_length_model =
+                        path(predict_sing_consonant_length_model);
+                    let predict_sing_f0_model = path(predict_sing_f0_model);
+                    let predict_sing_volume_model = path(predict_sing_volume_model);
+                    Ok(SingTeacherModel {
+                        predict_sing_consonant_length_model,
+                        predict_sing_f0_model,
+                        predict_sing_volume_model,
+                    })
+                },
+            )
+            .collect::<anyhow::Result<_>>()?;
+
+        let sf_decode_models = model_file::SF_DECODE_MODEL_FILE_NAMES
+            .iter()
+            .map(|&SfDecodeModelFileNames { sf_decode_model }| {
+                let sf_decode_model = path(sf_decode_model);
+                Ok(SfDecodeModel { sf_decode_model })
+            })
+            .collect::<anyhow::Result<_>>()?;
+
         return Ok(Self {
-            speaker_id_map: model_file::SPEAKER_ID_MAP.iter().copied().collect(),
+            talk_speaker_id_map: model_file::TALK_SPEAKER_ID_MAP.iter().copied().collect(),
+            sing_teacher_speaker_id_map: model_file::SING_TEACHER_SPEAKER_ID_MAP
+                .iter()
+                .copied()
+                .collect(),
+            sf_decode_speaker_id_map: model_file::SF_DECODE_SPEAKER_ID_MAP
+                .iter()
+                .copied()
+                .collect(),
             metas_str,
-            models,
+            talk_models,
+            sing_teacher_models,
+            sf_decode_models,
         });
 
         const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR";
     }
 
-    pub(crate) fn models_count(&self) -> usize {
-        self.models.len()
+    pub(crate) fn talk_models_count(&self) -> usize {
+        self.talk_models.len()
+    }
+
+    pub(crate) fn sing_teacher_models_count(&self) -> usize {
+        self.sing_teacher_models.len()
+    }
+
+    pub(crate) fn sf_decode_models_count(&self) -> usize {
+        self.sf_decode_models.len()
     }
 }
 
-struct ModelFileNames {
+struct TalkModelFileNames {
     predict_duration_model: &'static str,
     predict_intonation_model: &'static str,
     decode_model: &'static str,
 }
 
+struct SingTeacherModelFileNames {
+    predict_sing_consonant_length_model: &'static str,
+    predict_sing_f0_model: &'static str,
+    predict_sing_volume_model: &'static str,
+}
+
+struct SfDecodeModelFileNames {
+    sf_decode_model: &'static str,
+}
+
 #[derive(thiserror::Error, Debug)]
 #[error("invalid model file")]
 struct DecryptModelError;
 
-struct Model {
-    predict_duration_model: ModelFile,
-    predict_intonation_model: ModelFile,
-    decode_model: ModelFile,
+struct TalkModel {
+    predict_duration_model: PathBuf,
+    predict_intonation_model: PathBuf,
+    decode_model: PathBuf,
 }
 
-struct ModelFile {
-    path: PathBuf,
-    content: Vec<u8>,
+struct SingTeacherModel {
+    predict_sing_consonant_length_model: PathBuf,
+    predict_sing_f0_model: PathBuf,
+    predict_sing_volume_model: PathBuf,
 }
 
-impl ModelFile {
-    fn new(path: &Path) -> anyhow::Result<Self> {
-        let content = fs_err::read(path)?;
-        Ok(Self {
-            path: path.to_owned(),
-            content,
-        })
-    }
+struct SfDecodeModel {
+    sf_decode_model: PathBuf,
 }
 
 #[derive(Deserialize, Getters)]
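
Note: this hunk removes the eager ModelFile (path plus bytes read at startup)
in favor of storing plain PathBufs; the bytes are now read only when a session
is actually created, so model kinds that are never loaded never touch the
disk. A simplified before/after sketch (uses the fs_err crate, as the diff
does):

    use std::path::PathBuf;

    // Before: every model file was read up front.
    struct EagerModelFile {
        path: PathBuf,
        content: Vec<u8>,
    }

    // After: just the path; reading happens later, in new_session.
    struct LazyModel {
        predict_duration_model: PathBuf,
    }

    fn bytes_for(model: &LazyModel) -> std::io::Result<Vec<u8>> {
        fs_err::read(&model.predict_duration_model)
    }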
@@ -205,11 +273,19 @@ unsafe impl Send for Status {}
 impl Status {
     pub fn new(use_gpu: bool, cpu_num_threads: u16) -> Self {
         Self {
-            models: StatusModels {
+            talk_models: StatusTalkModels {
                 predict_duration: BTreeMap::new(),
                 predict_intonation: BTreeMap::new(),
                 decode: BTreeMap::new(),
             },
+            sing_teacher_models: StatusSingTeacherModels {
+                predict_sing_consonant_length: BTreeMap::new(),
+                predict_sing_f0: BTreeMap::new(),
+                predict_sing_volume: BTreeMap::new(),
+            },
+            sf_decode_models: StatusSfModels {
+                sf_decode: BTreeMap::new(),
+            },
             light_session_options: SessionOptions::new(cpu_num_threads, false),
             heavy_session_options: SessionOptions::new(cpu_num_threads, use_gpu),
             supported_styles: BTreeSet::default(),
@@ -229,9 +305,9 @@ impl Status {
         Ok(())
     }
 
-    pub fn load_model(&mut self, model_index: usize) -> Result<()> {
-        if model_index < MODEL_FILE_SET.models.len() {
-            let model = &MODEL_FILE_SET.models[model_index];
+    pub fn load_talk_model(&mut self, model_index: usize) -> Result<()> {
+        if model_index < MODEL_FILE_SET.talk_models.len() {
+            let model = &MODEL_FILE_SET.talk_models[model_index];
             let predict_duration_session =
                 self.new_session(&model.predict_duration_model, &self.light_session_options)?;
             let predict_intonation_session =
@@ -239,14 +315,83 @@ impl Status {
             let decode_model =
                 self.new_session(&model.decode_model, &self.heavy_session_options)?;
 
-            self.models
+            self.talk_models
                 .predict_duration
                 .insert(model_index, predict_duration_session);
-            self.models
+            self.talk_models
                 .predict_intonation
                 .insert(model_index, predict_intonation_session);
-            self.models.decode.insert(model_index, decode_model);
+            self.talk_models.decode.insert(model_index, decode_model);
+
+            Ok(())
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn is_talk_model_loaded(&self, model_index: usize) -> bool {
+        self.talk_models.predict_duration.contains_key(&model_index)
+            && self
+                .talk_models
+                .predict_intonation
+                .contains_key(&model_index)
+            && self.talk_models.decode.contains_key(&model_index)
+    }
+
+    pub fn load_sing_teacher_model(&mut self, model_index: usize) -> Result<()> {
+        if model_index < MODEL_FILE_SET.sing_teacher_models.len() {
+            let model = &MODEL_FILE_SET.sing_teacher_models[model_index];
+            let predict_sing_consonant_length_session = self.new_session(
+                &model.predict_sing_consonant_length_model,
+                &self.light_session_options,
+            )?;
+            let predict_sing_f0_session =
+                self.new_session(&model.predict_sing_f0_model, &self.light_session_options)?;
+            let predict_sing_volume_session = self.new_session(
+                &model.predict_sing_volume_model,
+                &self.light_session_options,
+            )?;
+
+            self.sing_teacher_models
+                .predict_sing_consonant_length
+                .insert(model_index, predict_sing_consonant_length_session);
+            self.sing_teacher_models
+                .predict_sing_f0
+                .insert(model_index, predict_sing_f0_session);
+            self.sing_teacher_models
+                .predict_sing_volume
+                .insert(model_index, predict_sing_volume_session);
+
+            Ok(())
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn is_sing_teacher_model_loaded(&self, model_index: usize) -> bool {
+        self.sing_teacher_models
+            .predict_sing_consonant_length
+            .contains_key(&model_index)
+            && self
+                .sing_teacher_models
+                .predict_sing_f0
+                .contains_key(&model_index)
+            && self
+                .sing_teacher_models
+                .predict_sing_volume
+                .contains_key(&model_index)
+    }
+
+    pub fn load_sf_decode_model(&mut self, model_index: usize) -> Result<()> {
+        if model_index < MODEL_FILE_SET.sf_decode_models.len() {
+            let model = &MODEL_FILE_SET.sf_decode_models[model_index];
+            let sf_decode_session =
+                self.new_session(&model.sf_decode_model, &self.heavy_session_options)?;
+
+            self.sf_decode_models
+                .sf_decode
+                .insert(model_index, sf_decode_session);
 
             Ok(())
         } else {
@@ -254,20 +399,24 @@ impl Status {
         }
     }
 
-    pub fn is_model_loaded(&self, model_index: usize) -> bool {
-        self.models.predict_intonation.contains_key(&model_index)
-            && self.models.predict_duration.contains_key(&model_index)
-            && self.models.decode.contains_key(&model_index)
+    pub fn is_sf_decode_model_loaded(&self, model_index: usize) -> bool {
+        self.sf_decode_models.sf_decode.contains_key(&model_index)
     }
 
     fn new_session(
         &self,
-        model_file: &ModelFile,
+        model_file: &Path,
         session_options: &SessionOptions,
     ) -> Result<Session<'static>> {
-        self.new_session_from_bytes(|| model_file::decrypt(&model_file.content), session_options)
+        let model_bytes = &match fs_err::read(model_file) {
+            Ok(model_bytes) => model_bytes,
+            Err(err) => {
+                panic!("crashing because the file could not be read: {err}");
+            }
+        };
+        self.new_session_from_bytes(|| model_file::decrypt(model_bytes), session_options)
             .map_err(|source| Error::LoadModel {
-                path: model_file.path.clone(),
+                path: model_file.to_owned(),
                 source,
             })
     }
@@ -311,7 +460,7 @@ impl Status {
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.predict_duration.get_mut(&model_index) {
+        if let Some(model) = self.talk_models.predict_duration.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
@@ -327,7 +476,7 @@ impl Status {
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.predict_intonation.get_mut(&model_index) {
+        if let Some(model) = self.talk_models.predict_intonation.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
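
Note: new_session above now does the deferred file read, and it deliberately
panics when the read fails -- a missing model file is treated as unrecoverable
-- while decryption and session-construction failures still surface as
Error::LoadModel. A sketch of that split (fs_err assumed, error handling
simplified):

    use std::path::Path;

    fn read_model_bytes(model_file: &Path) -> Vec<u8> {
        match fs_err::read(model_file) {
            Ok(bytes) => bytes,
            // Intentional crash: the caller has no sensible recovery.
            Err(err) => panic!("crashing because the file could not be read: {err}"),
        }
    }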
@@ -343,7 +492,83 @@ impl Status {
         model_index: usize,
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model) = self.models.decode.get_mut(&model_index) {
+        if let Some(model) = self.talk_models.decode.get_mut(&model_index) {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
+            } else {
+                Err(Error::InferenceFailed)
+            }
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn predict_sing_consonant_length_session_run(
+        &mut self,
+        model_index: usize,
+        inputs: Vec<&mut dyn AnyArray>,
+    ) -> Result<Vec<i64>> {
+        if let Some(model) = self
+            .sing_teacher_models
+            .predict_sing_consonant_length
+            .get_mut(&model_index)
+        {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
+            } else {
+                Err(Error::InferenceFailed)
+            }
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn predict_sing_f0_session_run(
+        &mut self,
+        model_index: usize,
+        inputs: Vec<&mut dyn AnyArray>,
+    ) -> Result<Vec<f32>> {
+        if let Some(model) = self
+            .sing_teacher_models
+            .predict_sing_f0
+            .get_mut(&model_index)
+        {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
+            } else {
+                Err(Error::InferenceFailed)
+            }
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn predict_sing_volume_session_run(
+        &mut self,
+        model_index: usize,
+        inputs: Vec<&mut dyn AnyArray>,
+    ) -> Result<Vec<f32>> {
+        if let Some(model) = self
+            .sing_teacher_models
+            .predict_sing_volume
+            .get_mut(&model_index)
+        {
+            if let Ok(output_tensors) = model.run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned())
+            } else {
+                Err(Error::InferenceFailed)
+            }
+        } else {
+            Err(Error::InvalidModelIndex { model_index })
+        }
+    }
+
+    pub fn sf_decode_session_run(
+        &mut self,
+        model_index: usize,
+        inputs: Vec<&mut dyn AnyArray>,
+    ) -> Result<Vec<f32>> {
+        if let Some(model) = self.sf_decode_models.sf_decode.get_mut(&model_index) {
             if let Ok(output_tensors) = model.run(inputs) {
                 Ok(output_tensors[0].as_slice().unwrap().to_owned())
             } else {
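
Note: the five *_session_run methods above are structurally identical: look up
the session by model index, run it, and copy out the first output tensor. A
generic helper along these lines could factor them out (sketch only; the
onnxruntime session is reduced to a closure so the snippet stands alone):

    use std::collections::BTreeMap;

    enum RunError {
        InvalidModelIndex(usize),
        InferenceFailed,
    }

    fn session_run<T>(
        sessions: &mut BTreeMap<usize, Box<dyn FnMut() -> Option<Vec<T>>>>,
        model_index: usize,
    ) -> Result<Vec<T>, RunError> {
        let run = sessions
            .get_mut(&model_index)
            .ok_or(RunError::InvalidModelIndex(model_index))?;
        run().ok_or(RunError::InferenceFailed)
    }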
@@ -381,9 +606,16 @@ mod tests {
             cpu_num_threads,
             status.heavy_session_options.cpu_num_threads
         );
-        assert!(status.models.predict_duration.is_empty());
-        assert!(status.models.predict_intonation.is_empty());
-        assert!(status.models.decode.is_empty());
+        assert!(status.talk_models.predict_duration.is_empty());
+        assert!(status.talk_models.predict_intonation.is_empty());
+        assert!(status.talk_models.decode.is_empty());
+        assert!(status
+            .sing_teacher_models
+            .predict_sing_consonant_length
+            .is_empty());
+        assert!(status.sing_teacher_models.predict_sing_f0.is_empty());
+        assert!(status.sing_teacher_models.predict_sing_volume.is_empty());
+        assert!(status.sf_decode_models.sf_decode.is_empty());
         assert!(status.supported_styles.is_empty());
     }
 
@@ -392,7 +624,7 @@ mod tests {
         let mut status = Status::new(true, 0);
         let result = status.load_metas();
         assert_eq!(Ok(()), result);
-        let expected = BTreeSet::from([0, 1, 2, 3]);
+        let expected = BTreeSet::from([0, 1, 2, 3, 3000, 6000]);
         assert_eq!(expected, status.supported_styles);
     }
 
@@ -404,27 +636,83 @@ mod tests {
     }
 
     #[rstest]
-    fn status_load_model_works() {
+    fn status_load_talk_model_works() {
+        let mut status = Status::new(false, 0);
+        let result = status.load_talk_model(0);
+        assert_eq!(Ok(()), result);
+        assert_eq!(1, status.talk_models.predict_duration.len());
+        assert_eq!(1, status.talk_models.predict_intonation.len());
+        assert_eq!(1, status.talk_models.decode.len());
+    }
+
+    #[rstest]
+    fn status_is_talk_model_loaded_works() {
+        let mut status = Status::new(false, 0);
+        let model_index = 0;
+        assert!(
+            !status.is_talk_model_loaded(model_index),
+            "model should not be loaded"
+        );
+        let result = status.load_talk_model(model_index);
+        assert_eq!(Ok(()), result);
+        assert!(
+            status.is_talk_model_loaded(model_index),
+            "model should be loaded"
+        );
+    }
+
+    #[rstest]
+    fn status_load_sing_teacher_model_works() {
+        let mut status = Status::new(false, 0);
+        let result = status.load_sing_teacher_model(0);
+        assert_eq!(Ok(()), result);
+        assert_eq!(
+            1,
+            status
+                .sing_teacher_models
+                .predict_sing_consonant_length
+                .len()
+        );
+        assert_eq!(1, status.sing_teacher_models.predict_sing_f0.len());
+        assert_eq!(1, status.sing_teacher_models.predict_sing_volume.len());
+    }
+
+    #[rstest]
+    fn status_is_sing_teacher_model_loaded_works() {
+        let mut status = Status::new(false, 0);
+        let model_index = 0;
+        assert!(
+            !status.is_sing_teacher_model_loaded(model_index),
+            "model should not be loaded"
+        );
+        let result = status.load_sing_teacher_model(model_index);
+        assert_eq!(Ok(()), result);
+        assert!(
+            status.is_sing_teacher_model_loaded(model_index),
+            "model should be loaded"
+        );
+    }
+
+    #[rstest]
+    fn status_load_sf_decode_model_works() {
         let mut status = Status::new(false, 0);
-        let result = status.load_model(0);
+        let result = status.load_sf_decode_model(0);
         assert_eq!(Ok(()), result);
-        assert_eq!(1, status.models.predict_duration.len());
-        assert_eq!(1, status.models.predict_intonation.len());
-        assert_eq!(1, status.models.decode.len());
+        assert_eq!(1, status.sf_decode_models.sf_decode.len());
     }
 
     #[rstest]
-    fn status_is_model_loaded_works() {
+    fn status_is_sf_decode_model_loaded_works() {
         let mut status = Status::new(false, 0);
         let model_index = 0;
         assert!(
-            !status.is_model_loaded(model_index),
+            !status.is_sf_decode_model_loaded(model_index),
            "model should not be loaded"
         );
-        let result = status.load_model(model_index);
+        let result = status.load_sf_decode_model(model_index);
         assert_eq!(Ok(()), result);
         assert!(
-            status.is_model_loaded(model_index),
+            status.is_sf_decode_model_loaded(model_index),
             "model should be loaded"
         );
     }
diff --git a/crates/voicevox_core/src/status/model_file.rs b/crates/voicevox_core/src/status/model_file.rs
index f5dce926a..82fdd5873 100644
--- a/crates/voicevox_core/src/status/model_file.rs
+++ b/crates/voicevox_core/src/status/model_file.rs
@@ -1,21 +1,31 @@
-use super::{DecryptModelError, ModelFileNames};
+use super::{
+    DecryptModelError, SfDecodeModelFileNames, SingTeacherModelFileNames, TalkModelFileNames,
+};
 
 pub(super) fn decrypt(content: &[u8]) -> std::result::Result<Vec<u8>, DecryptModelError> {
     Ok(content.to_owned())
 }
 
-pub(super) const SPEAKER_ID_MAP: &[(u32, (usize, u32))] =
-    &[(0, (0, 0)), (1, (0, 1)), (2, (1, 0)), (3, (1, 1))];
+pub(super) const TALK_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(0, (0, 0)), (1, (0, 1))];
 
-pub(super) const MODEL_FILE_NAMES: &[ModelFileNames] = &[
-    ModelFileNames {
-        predict_duration_model: "predict_duration-0.onnx",
-        predict_intonation_model: "predict_intonation-0.onnx",
-        decode_model: "decode-0.onnx",
-    },
-    ModelFileNames {
-        predict_duration_model: "predict_duration-1.onnx",
-        predict_intonation_model: "predict_intonation-1.onnx",
-        decode_model: "decode-1.onnx",
-    },
-];
+pub(super) const TALK_MODEL_FILE_NAMES: &[TalkModelFileNames] = &[TalkModelFileNames {
+    predict_duration_model: "predict_duration-0.onnx",
+    predict_intonation_model: "predict_intonation-0.onnx",
+    decode_model: "decode-0.onnx",
+}];
+
+pub(super) const SING_TEACHER_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(6000, (0, 0))];
+
+pub(super) const SING_TEACHER_MODEL_FILE_NAMES: &[SingTeacherModelFileNames] =
+    &[SingTeacherModelFileNames {
+        predict_sing_consonant_length_model: "predict_sing_consonant_length-0.onnx",
+        predict_sing_f0_model: "predict_sing_f0-0.onnx",
+        predict_sing_volume_model: "predict_sing_volume-0.onnx",
+    }];
+
+pub(super) const SF_DECODE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(3000, (0, 0))];
+
+pub(super) const SF_DECODE_MODEL_FILE_NAMES: &[SfDecodeModelFileNames] =
+    &[SfDecodeModelFileNames {
+        sf_decode_model: "sf_decode-0.onnx",
+    }];
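
Note: each *_SPEAKER_ID_MAP entry indexes into the matching
*_MODEL_FILE_NAMES table by model index, so the two consts must stay in
lockstep. A sketch of that relationship (simplified to a single file name per
model):

    const SF_DECODE_SPEAKER_ID_MAP: &[(u32, (usize, u32))] = &[(3000, (0, 0))];
    const SF_DECODE_MODEL_FILES: &[&str] = &["sf_decode-0.onnx"];

    fn sf_decode_file_for(speaker_id: u32) -> Option<&'static str> {
        let &(_, (model_index, _)) = SF_DECODE_SPEAKER_ID_MAP
            .iter()
            .find(|&&(id, _)| id == speaker_id)?;
        SF_DECODE_MODEL_FILES.get(model_index).copied()
    }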
diff --git a/crates/voicevox_core_c_api/src/compatible_engine.rs b/crates/voicevox_core_c_api/src/compatible_engine.rs
index 23b446072..d919f72f0 100644
--- a/crates/voicevox_core_c_api/src/compatible_engine.rs
+++ b/crates/voicevox_core_c_api/src/compatible_engine.rs
@@ -157,3 +157,117 @@ pub extern "C" fn decode_forward(
         }
     }
 }
+
+#[no_mangle]
+pub extern "C" fn predict_sing_consonant_length_forward(
+    length: i64,
+    consonant: *mut i64,
+    vowel: *mut i64,
+    note_duration: *mut i64,
+    speaker_id: *mut i64,
+    output: *mut i64,
+) -> bool {
+    let length = length as usize;
+    let result = lock_internal().predict_sing_consonant_length(
+        unsafe { std::slice::from_raw_parts(consonant, length) },
+        unsafe { std::slice::from_raw_parts(vowel, length) },
+        unsafe { std::slice::from_raw_parts(note_duration, length) },
+        unsafe { *speaker_id as u32 },
+    );
+    match result {
+        Ok(output_vec) => {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
+            output_slice.clone_from_slice(&output_vec);
+            true
+        }
+        Err(err) => {
+            set_message(&format!("{err}"));
+            false
+        }
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn predict_sing_f0_forward(
+    length: i64,
+    phoneme: *mut i64,
+    note: *mut i64,
+    speaker_id: *mut i64,
+    output: *mut f32,
+) -> bool {
+    let length = length as usize;
+    let result = lock_internal().predict_sing_f0(
+        unsafe { std::slice::from_raw_parts(phoneme, length) },
+        unsafe { std::slice::from_raw_parts(note, length) },
+        unsafe { *speaker_id as u32 },
+    );
+    match result {
+        Ok(output_vec) => {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
+            output_slice.clone_from_slice(&output_vec);
+            true
+        }
+        Err(err) => {
+            set_message(&format!("{err}"));
+            false
+        }
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn predict_sing_volume_forward(
+    length: i64,
+    phoneme: *mut i64,
+    note: *mut i64,
+    f0: *mut f32,
+    speaker_id: *mut i64,
+    output: *mut f32,
+) -> bool {
+    let length = length as usize;
+    let result = lock_internal().predict_sing_volume(
+        unsafe { std::slice::from_raw_parts(phoneme, length) },
+        unsafe { std::slice::from_raw_parts(note, length) },
+        unsafe { std::slice::from_raw_parts(f0, length) },
+        unsafe { *speaker_id as u32 },
+    );
+    match result {
+        Ok(output_vec) => {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length) };
+            output_slice.clone_from_slice(&output_vec);
+            true
+        }
+        Err(err) => {
+            set_message(&format!("{err}"));
+            false
+        }
+    }
+}
+
+#[no_mangle]
+pub extern "C" fn sf_decode_forward(
+    length: i64,
+    phoneme: *mut i64,
+    f0: *mut f32,
+    volume: *mut f32,
+    speaker_id: *mut i64,
+    output: *mut f32,
+) -> bool {
+    let length = length as usize;
+    let result = lock_internal().sf_decode(
+        unsafe { std::slice::from_raw_parts(phoneme, length) },
+        unsafe { std::slice::from_raw_parts(f0, length) },
+        unsafe { std::slice::from_raw_parts(volume, length) },
+        unsafe { *speaker_id as u32 },
+    );
+    match result {
+        Ok(output_vec) => {
+            let output_slice = unsafe { std::slice::from_raw_parts_mut(output, length * 256) };
+            output_slice.clone_from_slice(&output_vec);
+            true
+        }
+        Err(err) => {
+            set_message(&format!("{err}"));
+            false
+        }
+    }
+}
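
Note: the *_forward entry points trust the caller for pointer validity and
output-buffer sizing (sf_decode_forward writes length * 256 floats). A hedged
sketch of a safe Rust-side wrapper over that ABI -- the wrapper itself is
hypothetical, but the extern signature mirrors sf_decode_forward above:

    extern "C" {
        fn sf_decode_forward(
            length: i64,
            phoneme: *mut i64,
            f0: *mut f32,
            volume: *mut f32,
            speaker_id: *mut i64,
            output: *mut f32,
        ) -> bool;
    }

    fn sf_decode_safe(
        phoneme: &mut [i64],
        f0: &mut [f32],
        volume: &mut [f32],
        mut speaker_id: i64,
    ) -> Option<Vec<f32>> {
        assert_eq!(phoneme.len(), f0.len());
        assert_eq!(f0.len(), volume.len());
        // 256 output samples per input frame, as the tests assert.
        let mut output = vec![0.0f32; phoneme.len() * 256];
        let ok = unsafe {
            sf_decode_forward(
                phoneme.len() as i64,
                phoneme.as_mut_ptr(),
                f0.as_mut_ptr(),
                volume.as_mut_ptr(),
                &mut speaker_id,
                output.as_mut_ptr(),
            )
        };
        ok.then_some(output)
    }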
diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs
index 558ee8d98..9c0181088 100644
--- a/crates/voicevox_core_c_api/src/helpers.rs
+++ b/crates/voicevox_core_c_api/src/helpers.rs
@@ -29,6 +29,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxResultCode {
         Err(RustApi(UninitializedStatus)) => VOICEVOX_RESULT_UNINITIALIZED_STATUS_ERROR,
         Err(RustApi(InvalidSpeakerId { .. })) => VOICEVOX_RESULT_INVALID_SPEAKER_ID_ERROR,
         Err(RustApi(InvalidModelIndex { .. })) => VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR,
+        Err(RustApi(UnsupportedModel { .. })) => VOICEVOX_RESULT_UNSUPPORTED_MODEL_ERROR,
         Err(RustApi(InferenceFailed)) => VOICEVOX_RESULT_INFERENCE_ERROR,
         Err(RustApi(ExtractFullContextLabel(_))) => {
             VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR
diff --git a/model/metas.json b/model/metas.json
index dc8873469..a29df1ee4 100644
--- a/model/metas.json
+++ b/model/metas.json
@@ -23,5 +23,14 @@
     ],
     "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3",
     "version": "0.0.1"
+  },
+  {
+    "name": "dummy4",
+    "styles": [
+      { "name": "style4-1", "id": 3000, "type": "frame_decode" },
+      { "name": "style4-2", "id": 6000, "type": "singing_teacher" }
+    ],
+    "speaker_uuid": "32478dc2-4c8b-44f7-b041-c836e0df6d56",
+    "version": "0.0.1"
   }
 ]
diff --git a/model/predict_contour-1.onnx b/model/predict_contour-1.onnx
new file mode 100644
index 000000000..89fe628c8
Binary files /dev/null and b/model/predict_contour-1.onnx differ
diff --git a/model/predict_sing_consonant_length-0.onnx b/model/predict_sing_consonant_length-0.onnx
new file mode 100644
index 000000000..88a85df7a
Binary files /dev/null and b/model/predict_sing_consonant_length-0.onnx differ
diff --git a/model/predict_sing_f0-0.onnx b/model/predict_sing_f0-0.onnx
new file mode 100644
index 000000000..026c3fb1f
Binary files /dev/null and b/model/predict_sing_f0-0.onnx differ
diff --git a/model/predict_sing_volume-0.onnx b/model/predict_sing_volume-0.onnx
new file mode 100644
index 000000000..d80f97cba
Binary files /dev/null and b/model/predict_sing_volume-0.onnx differ
diff --git a/model/sf_decode-0.onnx b/model/sf_decode-0.onnx
new file mode 100644
index 000000000..169285cb4
Binary files /dev/null and b/model/sf_decode-0.onnx differ
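
Note: the new metas entry is what puts style ids 3000 and 6000 into
supported_styles (the status_load_metas_works test above now expects them).
Its styles carry a new "type" tag that plain talk styles omit. A hedged serde
sketch of reading that shape (struct names are hypothetical, not necessarily
the crate's real metas types):

    use serde::Deserialize;

    #[derive(Deserialize)]
    struct Meta {
        name: String,
        styles: Vec<Style>,
        speaker_uuid: String,
        version: String,
    }

    #[derive(Deserialize)]
    struct Style {
        name: String,
        id: u32,
        #[serde(rename = "type", default)]
        style_type: Option<String>, // "frame_decode" | "singing_teacher" | absent
    }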