diff --git a/src/airunner/aihandler/engine.py b/src/airunner/aihandler/engine.py index 24e4da700..b7f77c8e1 100644 --- a/src/airunner/aihandler/engine.py +++ b/src/airunner/aihandler/engine.py @@ -6,6 +6,8 @@ from airunner.aihandler.enums import EngineRequestCode, EngineResponseCode from airunner.aihandler.logger import Logger from airunner.mediator_mixin import MediatorMixin +from airunner.workers.tts_generator_worker import TTSGeneratorWorker +from airunner.workers.tts_vocalizer_worker import TTSVocalizerWorker from airunner.workers.worker import Worker from airunner.aihandler.llm import LLMController from airunner.aihandler.logger import Logger @@ -64,7 +66,6 @@ class Engine(QObject, MediatorMixin, SettingsMixin): # Model controllers llm_controller = None - tts_controller = None stt_controller = None ocr_controller = None @@ -143,6 +144,14 @@ def __init__(self, **kwargs): self.request_worker = self.create_worker(EngineRequestWorker) self.response_worker = self.create_worker(EngineResponseWorker) + + self.generator_worker = self.create_worker(TTSGeneratorWorker) + self.vocalizer_worker = self.create_worker(TTSVocalizerWorker) + self.register("tts_request", self) + + @pyqtSlot(dict) + def on_tts_request(self, data: dict): + self.generator_worker.add_to_queue(data) def on_llm_controller_response_signal(self, message): self.do_response(message) diff --git a/src/airunner/aihandler/tts.py b/src/airunner/aihandler/tts.py index bfe970f36..ecfbf1089 100644 --- a/src/airunner/aihandler/tts.py +++ b/src/airunner/aihandler/tts.py @@ -12,172 +12,10 @@ from airunner.aihandler.logger import Logger from airunner.mediator_mixin import MediatorMixin -from airunner.workers.worker import Worker +from airunner.windows.main.settings_mixin import SettingsMixin -class VocalizerWorker(Worker): - """ - Speech (in the form of numpy arrays generated with the TTS class) is added to the - vocalizer's queue. The vocalizer plays the speech using sounddevice. - """ - reader_mode_active = False - logger = Logger(prefix="VocalizerWorker") - - def __init__(self, *args, **kwargs): - super().__init__() - self.queue = Queue() - self.stream = sd.OutputStream(samplerate=24000, channels=1) - self.stream.start() - self.data = [] - self.started = False - self.register("TTSGeneratorWorker_add_to_stream_signal", self) - - def on_TTSGeneratorWorker_add_to_stream_signal(self, response): - self.queue.put(response) - - def handle_message(self, item): - item = self.queue.get(timeout=1) - if self.started or not self.reader_mode_active: - self.stream.write(item) - else: - self.data.append(item) - - if not self.started and len(self.data) >= 6 and self.reader_mode_active: - for item in self.data: - self.stream.write(item) - self.started = True - self.data = [] - - def handle_speech(self, generated_speech): - self.logger.info("Adding speech to stream...") - try: - self.queue.put(generated_speech) - except Exception as e: - self.logger.error(f"Error while adding speech to stream: {e}") - - -class TTSGeneratorWorker(Worker): - """ - Takes input text from any source and generates speech from it using the TTS class. - """ - def __init__(self, prefix="TTSGeneratorWorker"): - super().__init__(prefix) - self.tts = TTS() - self.play_queue = [] - self.play_queue_started = False - self.tts_settings = None - - def handle_message(self, data): - tts_settings = data["tts_settings"] - self.tts_settings = tts_settings - message = data["message"] - is_end_of_message = data["is_end_of_message"] - play_queue_buffer_length = tts_settings["play_queue_buffer_length"] - play_queue.append(data) - if is_end_of_message or len(play_queue) == play_queue_buffer_length or play_queue_started: - for item in play_queue: - self.generate(item) - play_queue_started = True - play_queue = [] - if is_end_of_message or len(self.play_queue) == play_queue_buffer_length or self.play_queue_started: - self.play_queue_started = True - self.generate_message() - for item in play_queue: - self.generate(message) - self.play_queue_started = True - self.play_queue = [] - - - def generate(self, text): - self.logger.info("Generating TTS...") - text = text.replace("\n", " ").strip() - - if self.tts_settings["use_bark"]: - response = self.generate_with_bark(text) - else: - response = self.generate_with_t5(text) - - self.emit("TTSGeneratorWorker_add_to_stream_signal", response) - - def move_inputs_to_device(self, inputs): - use_cuda = self.memory_settings["use_cuda"] - if use_cuda: - self.logger.info("Moving inputs to CUDA") - try: - inputs = {k: v.cuda() for k, v in inputs.items()} - except AttributeError: - pass - return inputs - - def generate_with_bark(self, text): - self.logger.info("Generating TTS...") - text = text.replace("\n", " ").strip() - - self.logger.info("Processing inputs...") - inputs = self.parent.processor(text, voice_preset=self.tts_settings["voice"]).to(self.parent.device) - inputs = self.move_inputs_to_device(inputs) - - self.logger.info("Generating speech...") - start = time.time() - params = dict( - **inputs, - fine_temperature=self.tts_settings["fine_temperature"], - coarse_temperature=self.tts_settings["coarse_temperature"], - semantic_temperature=self.tts_settings["semantic_temperature"], - ) - speech = self.parent.model.generate(**params) - self.logger.info("Generated speech in " + str(time.time() - start) + " seconds") - - response = speech[0].cpu().float().numpy() - return response - - def generate_with_t5(self, text): - self.logger.info("Generating TTS...") - text = text.replace("\n", " ").strip() - - self.logger.info("Processing inputs...") - - inputs = self.parent.processor(text=text, return_tensors="pt") - inputs = self.move_inputs_to_device(inputs) - - self.logger.info("Generating speech...") - start = time.time() - params = dict( - **inputs, - speaker_embeddings=self.parent.speaker_embeddings, - vocoder=self.parent.vocoder, - max_length=100, - ) - speech = self.parent.model.generate(**params) - self.logger.info("Generated speech in " + str(time.time() - start) + " seconds") - response = speech.cpu().float().numpy() - return response - - -class TTSController(QObject, MediatorMixin): - """ - Handles TTS requests from the main thread and passes them to the generator worker. - Also handles speech from the generator worker and passes it to the vocalizer worker. - Responses from the vocalizer worker are passed back to the main thread. - """ - def __init__(self, *args, **kwargs): - self.engine = kwargs.pop("engine") - super().__init__(*args, **kwargs) - MediatorMixin.__init__(self) - - self.generator_worker = self.create_worker(GeneratorWorker) - self.vocalizer_worker = self.create_worker(VocalizerWorker) - - self.register("GeneratorWorker") - - self.register("tts_request", self) - - @pyqtSlot(dict) - def on_tts_request(self, data: dict): - self.generator_worker.add_to_queue(data) - - -class TTS(QObject, MediatorMixin): +class TTS(QObject, MediatorMixin, SettingsMixin): """ Generates speech from given text. Responsible for managing the model, processor, vocoder, and speaker embeddings. @@ -241,68 +79,61 @@ def device(self): def torch_dtype(self): return torch.float16 if self.use_cuda else torch.float32 - @property - def settings(self): - return self._settings - - @settings.setter - def settings(self, value): - self._settings = value - @property def word_chunks(self): - return self.settings["word_chunks"] + return self.tts_settings["word_chunks"] @property def use_bark(self): - return self.settings["use_bark"] + return self.tts_settings["use_bark"] @property def cuda_index(self): - return self.settings["cuda_index"] + return self.tts_settings["cuda_index"] @property def voice_preset(self): - return self.settings["voice"] + return self.tts_settings["voice"] @property def use_cuda(self): - return self.settings["use_cuda"] and torch.cuda.is_available() + return self.tts_settings["use_cuda"] and torch.cuda.is_available() @property def fine_temperature(self): - return self.settings["fine_temperature"] / 100 + return self.tts_settings["fine_temperature"] / 100 @property def coarse_temperature(self): - return self.settings["coarse_temperature"] / 100 + return self.tts_settings["coarse_temperature"] / 100 @property def semantic_temperature(self): - return self.settings["semantic_temperature"] / 100 + return self.tts_settings["semantic_temperature"] / 100 @property def enable_cpu_offload(self): - return self.settings["enable_cpu_offload"] + return self.tts_settings["enable_cpu_offload"] @property def play_queue_buffer_length(self): - return self.settings["play_queue_buffer_length"] + return self.tts_settings["play_queue_buffer_length"] @property def use_word_chunks(self): - return self.settings["use_word_chunks"] + return self.tts_settings["use_word_chunks"] @property def use_sentence_chunks(self): - return self.settings["use_sentence_chunks"] + return self.tts_settings["use_sentence_chunks"] @property def sentence_chunks(self): - return self.settings["sentence_chunks"] + return self.tts_settings["sentence_chunks"] def __init__(self, *args, **kwargs): super().__init__() + SettingsMixin.__init__(self) MediatorMixin.__init__(self) self.logger.info("Loading") self.corpus = [] @@ -313,7 +144,7 @@ def __init__(self, *args, **kwargs): self.sentences = [] @pyqtSlot(np.ndarray) - def on_add_to_stream(self, generated_speech: np.ndarray): + def on_add_to_stream_signal(self, generated_speech: np.ndarray): """ This function is called from the generator worker when speech has been generated. It adds the generated speech to the vocalizer's queue. @@ -352,7 +183,7 @@ def move_to_device(self): self.speaker_embeddings = self.speaker_embeddings.to(self.device) def initialize(self): - target_model = "bark" if self.use_bark else "t5" + target_model = "bark" if self.tts_settings["use_bark"] else "t5" if target_model != self.current_model: self.unload() @@ -476,7 +307,6 @@ def process_sentences(self): self.sentences.append(sentence) def add_text(self, data: dict, is_end_of_message: bool): - self.settings = data["tts_settings"] self.initialize() self.message += data["message"] #if is_end_of_message: diff --git a/src/airunner/windows/main/settings_mixin.py b/src/airunner/windows/main/settings_mixin.py index 4721703f9..23d89602b 100644 --- a/src/airunner/windows/main/settings_mixin.py +++ b/src/airunner/windows/main/settings_mixin.py @@ -12,6 +12,26 @@ from airunner.data.bootstrap.imagefilter_bootstrap_data import imagefilter_bootstrap_data +tts_settings_default = dict( + language="English", + voice="v2/en_speaker_6", + gender="Male", + fine_temperature=80, + coarse_temperature=40, + semantic_temperature=80, + use_bark=False, + enable_tts=True, + use_cuda=True, + use_sentence_chunks=True, + use_word_chunks=False, + cuda_index=0, + word_chunks=1, + sentence_chunks=1, + play_queue_buffer_length=1, + enable_cpu_offload=True, +) + + class SettingsMixin: def __init__(self): self.application_settings = QSettings("Capsize Games", "AI Runner") @@ -274,24 +294,7 @@ def __init__(self): prompt_template="Mistral 7B Instruct: Default Chatbot", override_parameters=False ), - tts_settings=dict( - language="English", - voice="v2/en_speaker_6", - gender="Male", - fine_temperature=80, - coarse_temperature=40, - semantic_temperature=80, - use_bark=False, - enable_tts=True, - use_cuda=True, - use_sentence_chunks=True, - use_word_chunks=False, - cuda_index=0, - word_chunks=1, - sentence_chunks=1, - play_queue_buffer_length=1, - enable_cpu_offload=True, - ), + tts_settings=tts_settings_default, stt_settings=dict( duration=10, fs=16000, @@ -394,175 +397,180 @@ def on_reset_settings_signal(self): self.logger.info("Resetting settings") self.application_settings.clear() self.application_settings.sync() - self.set_settings(self.get_settings()) + self.set_settings(self.settings) @property def ai_models(self): - return self.get_settings()["ai_models"] + return self.settings["ai_models"] @ai_models.setter def ai_models(self, val): - settings = self.get_settings() + settings = self.settings settings["ai_models"] = val self.set_settings(settings) @property def generator_settings(self): - return self.get_settings()["generator_settings"] + return self.settings["generator_settings"] @generator_settings.setter def generator_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["generator_settings"] = val self.set_settings(settings) @property def stt_settings(self): - return self.get_settings()["stt_settings"] + return self.settings["stt_settings"] @stt_settings.setter def stt_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["stt_settings"] = val self.set_settings(settings) @property def controlnet_settings(self): - return self.get_settings()["controlnet_settings"] + return self.settings["controlnet_settings"] @controlnet_settings.setter def controlnet_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["controlnet_settings"] = val self.set_settings(settings) @property def metadata_settings(self): - return self.get_settings()["metadata_settings"] + return self.settings["metadata_settings"] @metadata_settings.setter def metadata_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["metadata_settings"] = val self.set_settings(settings) @property def canvas_settings(self): - return self.get_settings()["canvas_settings"] + return self.settings["canvas_settings"] @canvas_settings.setter def canvas_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["canvas_settings"] = val self.set_settings(settings) @property def active_grid_settings(self): - return self.get_settings()["active_grid_settings"] + return self.settings["active_grid_settings"] @active_grid_settings.setter def active_grid_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["active_grid_settings"] = val self.set_settings(settings) @property def standard_image_settings(self): - return self.get_settings()["standard_image_settings"] + return self.settings["standard_image_settings"] @standard_image_settings.setter def standard_image_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["standard_image_settings"] = val self.set_settings(settings) @property def path_settings(self): - return self.get_settings()["path_settings"] + return self.settings["path_settings"] @path_settings.setter def path_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["path_settings"] = val self.set_settings(settings) @property def brush_settings(self): - return self.get_settings()["brush_settings"] + return self.settings["brush_settings"] @brush_settings.setter def brush_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["brush_settings"] = val self.set_settings(settings) @property def grid_settings(self): - return self.get_settings()["grid_settings"] + return self.settings["grid_settings"] @grid_settings.setter def grid_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["grid_settings"] = val self.set_settings(settings) @property def window_settings(self): - return self.get_settings()["window_settings"] + return self.settings["window_settings"] @window_settings.setter def window_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["window_settings"] = val self.set_settings(settings) @property def shortcut_key_settings(self): - return self.get_settings()["shortcut_key_settings"] + return self.settings["shortcut_key_settings"] @shortcut_key_settings.setter def shortcut_key_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["shortcut_key_settings"] = val self.set_settings(settings) @property def memory_settings(self): - return self.get_settings()["memory_settings"] + return self.settings["memory_settings"] @memory_settings.setter def memory_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["memory_settings"] = val self.set_settings(settings) @property def llm_generator_settings(self): - return self.get_settings()["llm_generator_settings"] + return self.settings["llm_generator_settings"] @llm_generator_settings.setter def llm_generator_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["llm_generator_settings"] = val self.set_settings(settings) @property def tts_settings(self): - return self.get_settings()["tts_settings"] + tts_settings = self.settings.get("tts_settings") + if tts_settings is None: + self.tts_settings = tts_settings_default + tts_settings = self.settings.get("tts_settings") + print("GETTING TTS_SETTINGS", tts_settings) + return tts_settings @tts_settings.setter def tts_settings(self, val): - settings = self.get_settings() + settings = self.settings settings["tts_settings"] = val self.set_settings(settings) @property def llm_templates(self): - return self.get_settings()["llm_templates"] + return self.settings["llm_templates"] @llm_templates.setter def llm_templates(self, val): - settings = self.get_settings() + settings = self.settings settings["llm_templates"] = val self.set_settings(settings) diff --git a/src/airunner/workers/tts_generator_worker.py b/src/airunner/workers/tts_generator_worker.py new file mode 100644 index 000000000..ec6a077f2 --- /dev/null +++ b/src/airunner/workers/tts_generator_worker.py @@ -0,0 +1,107 @@ +import time + +from airunner.workers.worker import Worker +from airunner.aihandler.tts import TTS + + +class TTSGeneratorWorker(Worker): + """ + Takes input text from any source and generates speech from it using the TTS class. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tts = TTS() + self.tts.run() + self.play_queue = [] + self.play_queue_started = False + + def handle_message(self, data): + message = data["message"] + #play_queue_buffer_length = self.tts_settings["play_queue_buffer_length"] + self.play_queue.append(message) + if data["is_end_of_message"] or len(self.play_queue) >= 15: + # for item in self.play_queue: + # self.generate(item) + # self.play_queue = [] + # if is_end_of_message or len(self.play_queue) == play_queue_buffer_length or self.play_queue_started: + # self.play_queue_started = True + # self.generate(message) + sentence = " ".join(self.play_queue).strip() + self.logger.debug(f"Generating TTS for sentence {sentence}") + self.generate(sentence) + self.play_queue_started = True + self.play_queue = [] + + def generate(self, message): + self.logger.info("Generating TTS...") + self.logger.info(message) + if type(message) == dict: + message = message.get("message", "") + text = message.replace("\n", " ").strip() + + if text == "": + return + + self.logger.info(f"Generating TTS with {text}") + + if self.tts_settings["use_bark"]: + response = self.generate_with_bark(text) + else: + response = self.generate_with_t5(text) + + print("adding to stream", response) + self.emit("TTSGeneratorWorker_add_to_stream_signal", response) + + def move_inputs_to_device(self, inputs): + use_cuda = self.tts_settings["use_cuda"] + if use_cuda: + self.logger.info("Moving inputs to CUDA") + try: + inputs = {k: v.cuda() for k, v in inputs.items()} + except AttributeError: + pass + return inputs + + def generate_with_bark(self, text): + self.logger.info("Generating TTS...") + text = text.replace("\n", " ").strip() + + self.logger.info("Processing inputs...") + inputs = self.tts.processor(text, voice_preset=self.tts_settings["voice"]).to(self.tts.device) + inputs = self.move_inputs_to_device(inputs) + + self.logger.info("Generating speech...") + start = time.time() + params = dict( + **inputs, + fine_temperature=self.tts_settings["fine_temperature"], + coarse_temperature=self.tts_settings["coarse_temperature"], + semantic_temperature=self.tts_settings["semantic_temperature"], + ) + speech = self.tts.model.generate(**params) + self.logger.info("Generated speech in " + str(time.time() - start) + " seconds") + + response = speech[0].cpu().float().numpy() + return response + + def generate_with_t5(self, text): + self.logger.info("Generating TTS...") + text = text.replace("\n", " ").strip() + + self.logger.info("Processing inputs...") + + inputs = self.tts.processor(text=text, return_tensors="pt") + inputs = self.move_inputs_to_device(inputs) + + self.logger.info("Generating speech...") + start = time.time() + params = dict( + **inputs, + speaker_embeddings=self.tts.speaker_embeddings, + vocoder=self.tts.vocoder, + max_length=100, + ) + speech = self.tts.model.generate(**params) + self.logger.info("Generated speech in " + str(time.time() - start) + " seconds") + response = speech.cpu().float().numpy() + return response diff --git a/src/airunner/workers/tts_vocalizer_worker.py b/src/airunner/workers/tts_vocalizer_worker.py new file mode 100644 index 000000000..d1086db9f --- /dev/null +++ b/src/airunner/workers/tts_vocalizer_worker.py @@ -0,0 +1,46 @@ +import sounddevice as sd + +from queue import Queue + +from PyQt6.QtCore import pyqtSlot + +from airunner.aihandler.logger import Logger +from airunner.workers.worker import Worker + + +class TTSVocalizerWorker(Worker): + """ + Speech (in the form of numpy arrays generated with the TTS class) is added to the + vocalizer's queue. The vocalizer plays the speech using sounddevice. + """ + reader_mode_active = False + logger = Logger(prefix="VocalizerWorker") + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.queue = Queue() + self.stream = sd.OutputStream(samplerate=24000, channels=1) + self.stream.start() + self.data = [] + self.started = False + self.register("TTSGeneratorWorker_add_to_stream_signal", self) + + def on_TTSGeneratorWorker_add_to_stream_signal(self, response): + print("on_TTSGeneratorWorker_add_to_stream_signal", response) + self.logger.debug("Adding speech to stream...") + self.add_to_queue(response) + + def handle_message(self, item): + if item is None: + self.logger.warning("item is none") + return + self.stream.write(item) + self.started = True + self.data = [] + + def handle_speech(self, generated_speech): + self.logger.info("Adding speech to stream...") + try: + self.queue.put(generated_speech) + except Exception as e: + self.logger.error(f"Error while adding speech to stream: {e}") diff --git a/src/airunner/workers/worker.py b/src/airunner/workers/worker.py index ccf534d2b..a94ef2da4 100644 --- a/src/airunner/workers/worker.py +++ b/src/airunner/workers/worker.py @@ -14,9 +14,9 @@ class Worker(QObject, MediatorMixin, SettingsMixin): def __init__(self, prefix=None): self.prefix = prefix or self.__class__.__name__ - super().__init__() MediatorMixin.__init__(self) SettingsMixin.__init__(self) + super().__init__() self.logger = Logger(prefix=prefix) self.running = False self.queue = queue.Queue()