locaal-ai · royshil · Jul 22, 2024 · Jul 24, 2024 · Jul 24, 2024
diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini
@@ -1,13 +1,19 @@
 none_no_input="No input"
 Phonetic_Transcription="Phonetic Transcription"
+phonetic_transcription_help="If enabled, the output will be transformed phonetically, e.g. 'ABC' will become 'aei bee see'."
 File="File"
 Text="Text"
 Generate_Audio="Generate Audio"
 Speaker_ID="Speaker ID"
 Model="Model"
-Delete_Cached_Models="Delete Cached Models"
+Delete_Cached_Models="⚠️ Delete Cached Models ⚠️"
 Speed="Speed"
 Line_By_Line="Read Line By Line"
 line_by_line_help="If enabled, the input text or file will be read line by line, otherwise, the entire input text or file will be read at once."
 input_debounce_help="Enable waiting for input changes to end before the input text is processed. This is useful when typing or rapid changes appear in the input text or file."
 input_debounce="Input Debounce"
+Latency="Latency"
+latency_help="The time in milliseconds to wait before emitting another audio batch."
+Advanced="Advanced Settings"
+Interrupt_Mode="Interrupt Mode"
+interrupt_mode_help="If enabled, the audio generation will be interrupted when new generated audio comes in."
diff --git a/src/audio-thread.cpp b/src/audio-thread.cpp
@@ -24,13 +24,15 @@ void AudioThread::run()
 			emitFromBuffer();
 		}
 
-		// Perform the operation to be timed
-		auto end = std::chrono::high_resolution_clock::now();
-		auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+		// Calculate the time taken to process the audio samples
+		const auto end = std::chrono::high_resolution_clock::now();
+		const auto duration =
+			std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+		const auto sleep_duration_ms =
+			std::chrono::milliseconds(TARGET_BATCH_SIZE_MS) - duration;
 
 		// Sleep for [TARGET_BATCH_SIZE_MS] minus the time taken to process the audio samples
-		std::this_thread::sleep_for(std::chrono::milliseconds(TARGET_BATCH_SIZE_MS) -
-					    duration);
+		std::this_thread::sleep_for(sleep_duration_ms);
 	}
 }
 
@@ -39,16 +41,23 @@ void AudioThread::emitFromBuffer()
 	// Lock the mutex
 	std::lock_guard<std::mutex> lock(mutex);
 
-	// Get 20ms audio samples from the buffer
+	const int target_number_of_samples = TARGET_BATCH_SIZE_MS * sample_rate / 1000;
+
+	// Get audio samples from the buffer
 	std::vector<float> samples;
-	for (int i = 0; i < TARGET_BATCH_SIZE_MS * sample_rate / 1000; i++) {
+	for (int i = 0; i < target_number_of_samples; i++) {
 		if (this->buffer.empty()) {
 			break;
 		}
 		samples.push_back(this->buffer.front());
 		this->buffer.pop_front();
 	}
 
+	// if needed - pad the samples with silence to reach the target batch size
+	while ((int)samples.size() < target_number_of_samples) {
+		samples.push_back(0.0f);
+	}
+
 	// Emit audio samples
 	emitAudioSamples(samples, sample_rate);
 }

diff --git a/src/audio-thread.h b/src/audio-thread.h
@@ -43,21 +43,34 @@ class AudioThread {
 		// Lock the mutex
 		std::lock_guard<std::mutex> lock(mutex);
 
+		if (interrupt_mode) {
+			// Clear the buffer if in interrupt mode
+			buffer.clear();
+		}
+
 		// Push audio samples to the buffer
 		for (auto sample : samples) {
 			buffer.push_back(sample);
 		}
 	}
 
+	void setSampleRate(int sample_rate_) { sample_rate = sample_rate_; } // no mutex held
+	void setTargetBatchSizeMs(int target_batch_size_ms) // batch length in milliseconds
+	{
+		TARGET_BATCH_SIZE_MS = target_batch_size_ms; // NOTE(review): no lock held
+	}
+	void setInterruptMode(bool interrupt_mode_) { interrupt_mode = interrupt_mode_; } // no lock
+
 private:
-	const int TARGET_BATCH_SIZE_MS = 50;
+	int TARGET_BATCH_SIZE_MS = 50;
 
 	std::deque<float> buffer;
 	std::mutex mutex;
 	std::thread thread;
 	obs_source_t *context;
 	int sample_rate = 22050;
 	std::atomic<bool> running = false;
+	bool interrupt_mode = false;
 
 	void run();
 	void emitFromBuffer();

diff --git a/src/model-utils/model-downloader-ui.cpp b/src/model-utils/model-downloader-ui.cpp
@@ -74,6 +74,9 @@ void ModelDownloader::close()
 {
 	this->mPrepareToClose = true;
 
+	// Stop the thread
+	this->download_thread->quit();
+
 	QDialog::close();
 }
 
@@ -231,7 +234,9 @@ ModelDownloader::~ModelDownloader()
 		}
 		delete this->download_thread;
 	}
-	delete this->download_worker;
+	if (this->download_worker != nullptr) {
+		delete this->download_worker;
+	}
 }
 
 ModelDownloadWorker::~ModelDownloadWorker()

diff --git a/src/squawk-source.cpp b/src/squawk-source.cpp
@@ -78,6 +78,8 @@ void squawk_source_defaults(obs_data_t *settings)
 	obs_data_set_default_bool(settings, "line_by_line", false);
 	obs_data_set_default_bool(settings, "phonetic_transcription", true);
 	obs_data_set_default_bool(settings, "input_debounce", true);
+	obs_data_set_default_bool(settings, "interrupt_mode", false);
+	obs_data_set_default_int(settings, "latency", 50);
 }
 
 bool add_sources_to_list(void *list_property, obs_source_t *source)
@@ -200,15 +202,42 @@ obs_properties_t *squawk_source_properties(void *data)
 					original_text.c_str(), text.c_str());
 			}
 
-			generate_audio_from_text(squawk_data_->tts_context, text, speaker_id,
-						 squawk_data_->speed);
+			std::thread audio_gen_thread([squawk_data_, text, speaker_id]() {
+				generate_audio_from_text(squawk_data_->tts_context, text,
+							 speaker_id, squawk_data_->speed);
+			});
+			audio_gen_thread.detach();
 
 			return true;
 		});
 
+	// add advanced settings group
+	obs_properties_t *advanced_group = obs_properties_create();
+	obs_properties_add_group(ppts, "advanced", MT_("Advanced"), OBS_GROUP_NORMAL,
+				 advanced_group);
+
+	// add boolean property for enabling phonetic transcription
+	obs_properties_add_bool(advanced_group, "phonetic_transcription",
+				MT_("Phonetic_Transcription"));
+	// add info description for phonetic transcription
+	obs_property_set_long_description(obs_properties_get(advanced_group,
+							     "phonetic_transcription"),
+					  MT_("phonetic_transcription_help"));
+
+	// add boolean property for enabling interrupt mode
+	obs_properties_add_bool(advanced_group, "interrupt_mode", MT_("Interrupt_Mode"));
+	// add info description for interrupt mode
+	obs_property_set_long_description(obs_properties_get(advanced_group, "interrupt_mode"),
+					  MT_("interrupt_mode_help"));
+
+	// add int slider for setting the latency
+	obs_properties_add_int_slider(advanced_group, "latency", MT_("Latency"), 10, 250, 10);
+	obs_property_set_long_description(obs_properties_get(advanced_group, "latency"),
+					  MT_("latency_help"));
+
 	// add button for deleting all cached models
 	obs_properties_add_button(
-		ppts, "delete_models", MT_("Delete_Cached_Models"),
+		advanced_group, "delete_models", MT_("Delete_Cached_Models"),
 		[](obs_properties_t *props, obs_property_t *property, void *data_) {
 			UNUSED_PARAMETER(props);
 			UNUSED_PARAMETER(property);
@@ -225,9 +254,6 @@ obs_properties_t *squawk_source_properties(void *data)
 			return true;
 		});
 
-	// add boolean propery for enabling phonetic transcription
-	obs_properties_add_bool(ppts, "phonetic_transcription", MT_("Phonetic_Transcription"));
-
 	// add plugin info
 	char small_info[256];
 	snprintf(small_info, sizeof(small_info), PLUGIN_INFO_TEMPLATE, PLUGIN_VERSION);
@@ -268,6 +294,9 @@ void squawk_source_update(void *data, obs_data_t *settings)
 		init_sherpa_tts_context(squawk_data->tts_context, audio_samples_callback,
 					squawk_data);
 	}
+
+	squawk_data->audioThread->setTargetBatchSizeMs((int)obs_data_get_int(settings, "latency"));
+	squawk_data->audioThread->setInterruptMode(obs_data_get_bool(settings, "interrupt_mode"));
 }
 
 void squawk_source_activate(void *data)