[Text to audio generator] Add speech engine (#838)
* [text to audio generator] Replaced bark with openai tts models

* [text to audio generator] Fix base url env var

* fix version

* Add speech engine

* after review
yonishelach authored Jan 2, 2025
1 parent 7d29949 commit b3eb31a
Showing 6 changed files with 230 additions and 84 deletions.
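
At the API level, the change adds an `engine` parameter that switches between the restored bark backend and the OpenAI TTS backend. A minimal usage sketch, distilled from the updated tests below (it assumes a local MLRun environment and the repository's test data file):

import mlrun

fn = mlrun.import_function("function.yaml")
run = fn.run(
    handler="generate_multi_speakers_audio",
    inputs={"data_path": "data/test_data.txt"},
    params={
        "speakers": {"Agent": 0, "Client": 1},
        # bark voices; for engine="openai" use voices such as "alloy" / "echo":
        "available_voices": ["v2/en_speaker_0", "v2/en_speaker_1"],
        "engine": "bark",
        "file_format": "wav",
    },
    local=True,
    returns=[
        "audio_files: path",
        "audio_files_dataframe: dataset",
        "text_to_speech_errors: file",
    ],
)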
86 changes: 53 additions & 33 deletions text_to_audio_generator/function.yaml

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion text_to_audio_generator/item.yaml
@@ -22,7 +22,6 @@ spec:
   image: mlrun/mlrun
   kind: job
   requirements:
-  - openai
   - torchaudio
   - pydub
   url: ''
3 changes: 2 additions & 1 deletion text_to_audio_generator/requirements.txt
@@ -1,3 +1,4 @@
-openai>=1.58.0
+bark
 torchaudio>=2.1.0
+openai>=1.58.0
 pydub
41 changes: 38 additions & 3 deletions text_to_audio_generator/test_text_to_audio_generator.py
@@ -19,13 +19,47 @@
 import pytest
 
 
+@pytest.mark.parametrize("file_format,bits_per_sample", [("wav", 8), ("mp3", None)])
+def test_generate_multi_speakers_audio(file_format, bits_per_sample):
+    text_to_audio_generator_function = mlrun.import_function("function.yaml")
+    with tempfile.TemporaryDirectory() as test_directory:
+        function_run = text_to_audio_generator_function.run(
+            handler="generate_multi_speakers_audio",
+            inputs={"data_path": "data/test_data.txt"},
+            params={
+                "output_directory": test_directory,
+                "speakers": {"Agent": 0, "Client": 1},
+                "available_voices": [
+                    "v2/en_speaker_0",
+                    "v2/en_speaker_1",
+                ],
+                "engine": "bark",
+                "use_small_models": True,
+                "use_gpu": False,
+                "offload_cpu": True,
+                "file_format": file_format,
+                "bits_per_sample": bits_per_sample,
+            },
+            local=True,
+            returns=[
+                "audio_files: path",
+                "audio_files_dataframe: dataset",
+                "text_to_speech_errors: file",
+            ],
+            artifact_path=test_directory,
+        )
+    assert function_run.error == ""
+    for key in ["audio_files", "audio_files_dataframe", "text_to_speech_errors"]:
+        assert key in function_run.outputs and function_run.outputs[key] is not None
+
+
 @pytest.mark.skipif(
-    condition=os.getenv("OPENAI_BASE_URL") is None
+    condition=os.getenv("OPENAI_API_BASE") is None
     and os.getenv("OPENAI_API_KEY") is None,
     reason="OpenAI API key and base URL are required to run this test",
 )
 @pytest.mark.parametrize("file_format,bits_per_sample", [("wav", 8), ("mp3", None)])
-def test_generate_multi_speakers_audio(file_format, bits_per_sample):
+def test_generate_multi_speakers_audio_openai(file_format, bits_per_sample):
     text_to_audio_generator_function = mlrun.import_function("function.yaml")
     with tempfile.TemporaryDirectory() as test_directory:
         function_run = text_to_audio_generator_function.run(
@@ -38,6 +72,7 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample):
                 "alloy",
                 "echo",
             ],
+            "engine": "openai",
             "file_format": file_format,
             "bits_per_sample": bits_per_sample,
         },
@@ -51,4 +86,4 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample):
         )
     assert function_run.error == ""
     for key in ["audio_files", "audio_files_dataframe", "text_to_speech_errors"]:
-        assert key in function_run.outputs and function_run.outputs[key] is not None
+        assert key in function_run.outputs and function_run.outputs[key] is not None
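
To run the OpenAI-backed test locally, the credentials checked by the skipif marker above have to be present. A sketch, assuming a valid key (the base URL shown is the public endpoint and is only an example):

import os
import pytest

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key, assumption
os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1"  # example endpoint

# Select only the OpenAI test by its name suffix:
pytest.main(["test_text_to_audio_generator.py", "-k", "openai"])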
1 change: 1 addition & 0 deletions text_to_audio_generator/text_to_audio_generator.ipynb
@@ -322,6 +322,7 @@
     "        \"alloy\",\n",
     "        \"echo\",\n",
     "    ],\n",
+    "    \"engine\": \"bark\",\n",
     "    \"file_format\": \"mp3\",\n",
     "    # \"bits_per_sample\": 8,\n",
     "},\n",
182 changes: 136 additions & 46 deletions text_to_audio_generator/text_to_audio_generator.py
@@ -11,41 +11,45 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import io
 import logging
 import os
 import pathlib
 import random
 import tempfile
+from abc import ABC, abstractmethod
 from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import openai
 import pandas as pd
 import torch
 import torchaudio
 import tqdm
-from pydub import AudioSegment
 
 # Get the global logger:
 _LOGGER = logging.getLogger()
 
 OPENAI_API_KEY = "OPENAI_API_KEY"
-OPENAI_BASE_URL = "OPENAI_BASE_URL"
+OPENAI_BASE_URL = "OPENAI_API_BASE"
 SAMPLE_RATE = 24000


 def generate_multi_speakers_audio(
     data_path: str,
     speakers: Union[List[str], Dict[str, int]],
     available_voices: List[str],
+    engine: str = "openai",
     output_directory: str = None,
-    model: str = "tts-1",
+    use_gpu: Optional[bool] = None,
+    use_small_models: Optional[bool] = None,
+    offload_cpu: Optional[bool] = None,
+    model: Optional[str] = None,
+    speed: Optional[float] = None,
     sample_rate: int = 16000,
     file_format: str = "wav",
     verbose: bool = True,
     bits_per_sample: Optional[int] = None,
-    speed: float = 1.0,
 ) -> Tuple[str, pd.DataFrame, dict]:
     """
     Generate audio files from text files.
@@ -55,16 +59,24 @@ def generate_multi_speakers_audio(
         If a list is given, the speakers will be assigned to channels in the order given.
         If dictionary, the keys will be the speakers and the values will be the channels.
     :param available_voices: List of available voices to use for the generation.
-                             See here for the available voices:
-                             https://platform.openai.com/docs/guides/text-to-speech#voice-options
+                             See here for the available voices for bark engine:
+                             https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
+                             See here for the available voices for openai engine:
+                             https://beta.openai.com/docs/api-reference/speech
+    :param engine: The engine to use for the generation. Select either "bark" or "openai". Default is "openai".
     :param output_directory: Path to the directory to save the generated audio files to.
-    :param model: Which model to use for the generation.
+    :param use_gpu: Whether to use the GPU for the generation. Supported only in "bark" engine.
+    :param use_small_models: Whether to use the small models for the generation. Supported only in "bark" engine.
+    :param offload_cpu: To reduce the memory footprint, the models can be offloaded to the CPU after loading.
+                        Supported only in "bark" engine.
+    :param model: Which model to use for the generation. Supported only in "openai" engine.
+                  Default is "tts-1".
-    :param speed: The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default.
     :param sample_rate: The sampling rate of the generated audio.
     :param file_format: The format of the generated audio files.
     :param verbose: Whether to print the progress of the generation.
     :param bits_per_sample: Changes the bit depth for the supported formats.
                             Supported only in "wav" or "flac" formats.
+    :param speed: The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default.
     :returns: A tuple of:
         - The output directory path.
@@ -78,8 +90,17 @@ def generate_multi_speakers_audio(
     data_path = pathlib.Path(data_path).absolute()
     text_files = _get_text_files(data_path=data_path)
 
-    # connect to openai client:
-    client = _get_openai_client()
+    # Prepare the speech engine:
+    engine = _get_engine(
+        engine=engine,
+        use_gpu=use_gpu,
+        use_small_models=use_small_models,
+        offload_cpu=offload_cpu,
+        model=model,
+        file_format=file_format,
+        speed=speed
+    )
 
     # Check for per channel generation:
     if isinstance(speakers, dict):
@@ -152,15 +173,10 @@ def generate_multi_speakers_audio(
                 )
                 for sentence in _split_line(line=sentences):
                     # Generate words audio:
-                    audio = client.audio.speech.create(
-                        model=model,
-                        input=sentence,
+                    audio = engine._generate_audio(
+                        text=sentence,
                         voice=chosen_voices[current_speaker],
-                        response_format=file_format,
-                        speed=speed,
                     )
-                    audio = audio.content
-                    audio = _bytes_to_np_array(audio=audio, file_format=file_format)
 
                     if speaker_per_channel:
                         silence = np.zeros_like(audio)
@@ -215,43 +231,117 @@ def generate_multi_speakers_audio(
     return str(output_directory), successes, errors
 
 
-def _get_openai_client():
-    api_key = os.getenv(OPENAI_API_KEY)
-    base_url = os.getenv(OPENAI_BASE_URL)
-    # Check if the key is already in the environment variables:
-    if not api_key or not base_url:
+class SpeechEngine(ABC):
+    @abstractmethod
+    def _generate_audio(self, text: str, voice: str) -> np.ndarray:
+        pass
+
+
+class BarkEngine(SpeechEngine):
+    def __init__(self, use_gpu: bool = True, use_small_models: bool = False, offload_cpu: bool = False):
         try:
-            import mlrun
-
-            context = mlrun.get_or_create_ctx(name="context")
-            # Check if the key is in the secrets:
-            api_key = context.get_secret(OPENAI_API_KEY)
-            base_url = context.get_secret(OPENAI_BASE_URL)
-        except ModuleNotFoundError:
-            raise EnvironmentError(
-                f"One or more of the OpenAI required environment variables ('{OPENAI_API_KEY}', '{OPENAI_BASE_URL}') are missing."
-                f"Please set them as environment variables or install mlrun (`pip install mlrun`)"
-                f"and set them as project secrets using `project.set_secrets`."
+            self.bark = importlib.import_module("bark")
+        except ImportError:
+            raise ImportError(
+                "The 'bark' library is required for the BarkEngine. Please install it using 'pip install bark-ai'."
             )
-    return openai.OpenAI(api_key=api_key, base_url=base_url)
+        self.bark.preload_models(
+            text_use_gpu=use_gpu,
+            text_use_small=use_small_models,
+            coarse_use_gpu=use_gpu,
+            coarse_use_small=use_small_models,
+            fine_use_gpu=use_gpu,
+            fine_use_small=use_small_models,
+            codec_use_gpu=use_gpu,
+            force_reload=offload_cpu,
+        )
 
-def _bytes_to_np_array(audio: bytes, file_format: str):
-    if file_format == "mp3":
-        audio_segment = AudioSegment.from_mp3(io.BytesIO(audio))
+    def _generate_audio(self, text: str, voice: str) -> np.ndarray:
+        # Generate words audio:
+        audio = self.bark.generate_audio(
+            text,
+            history_prompt=voice,
+            silent=True,
+        )
+        return audio
 
-        # Convert to raw PCM audio data
-        samples = audio_segment.get_array_of_samples()
-
-        # Convert to numpy array
-        audio_array = np.array(samples)
+class OpenAIEngine(SpeechEngine):
+    def __init__(self, model: str = "tts-1", file_format: str = "wav", speed: float = 1.0):
+        try:
+            self.openai = importlib.import_module("openai")
+            self.pydub = importlib.import_module("pydub")
+        except ImportError:
+            raise ImportError(
+                "The 'openai' and 'pydub' libraries are required for the OpenAIEngine. Please install them using 'pip install openai pydub'."
+            )
 
-        # Normalize to float between -1 and 1
-        return audio_array.astype(np.float32) / np.iinfo(samples.typecode).max
-    else:
-        return np.frombuffer(audio, dtype=np.int16) / 32768.0
+        api_key = os.getenv(OPENAI_API_KEY)
+        base_url = os.getenv(OPENAI_BASE_URL)
+        # Check if the key is already in the environment variables:
+        if not api_key or not base_url:
+            try:
+                import mlrun
+
+                context = mlrun.get_or_create_ctx(name="context")
+                # Check if the key is in the secrets:
+                api_key = context.get_secret(OPENAI_API_KEY)
+                base_url = context.get_secret(OPENAI_BASE_URL)
+            except ModuleNotFoundError:
+                raise EnvironmentError(
+                    f"One or more of the OpenAI required environment variables ('{OPENAI_API_KEY}', '{OPENAI_BASE_URL}') are missing."
+                    f"Please set them as environment variables or install mlrun (`pip install mlrun`)"
+                    f"and set them as project secrets using `project.set_secrets`."
+                )
+
+        self.client = self.openai.OpenAI(api_key=api_key, base_url=base_url)
+        self.model = model
+        self.file_format = file_format
+        self.speed = speed
+
+    def _generate_audio(self, text: str, voice: str) -> np.ndarray:
+        # Generate words audio:
+        audio = self.client.audio.speech.create(
+            model=self.model,
+            input=text,
+            voice=voice,
+            response_format=self.file_format,
+            speed=self.speed,
+        )
+        audio = audio.content
+        audio = self._bytes_to_np_array(audio=audio)
+        return audio
+
+    def _bytes_to_np_array(self, audio: bytes):
+        if self.file_format == "mp3":
+            audio_segment = self.pydub.AudioSegment.from_mp3(io.BytesIO(audio))
+
+            # Convert to raw PCM audio data
+            samples = audio_segment.get_array_of_samples()
+
+            # Convert to numpy array
+            audio_array = np.array(samples)
+
+            # Normalize to float between -1 and 1
+            return audio_array.astype(np.float32) / np.iinfo(samples.typecode).max
+        else:
+            return np.frombuffer(audio, dtype=np.int16) / 32768.0
+
+
+def _get_engine(engine: str, file_format: str, **kwargs) -> SpeechEngine:
+    # eliminate the None values:
+    kwargs = {key: value for key, value in kwargs.items() if value is not None}
+
+    if engine == "bark":
+        return BarkEngine(**kwargs)
+    elif engine == "openai":
+        return OpenAIEngine(file_format=file_format, **kwargs)
+    else:
+        raise ValueError(
+            f"Unrecognized engine. The parameter `engine` must be either 'bark' or 'openai'. Given: {engine}"
+        )
 
 
 def _get_text_files(
     data_path: pathlib.Path,
 ) -> List[pathlib.Path]:
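The `SpeechEngine` base class keeps the backend pluggable: an engine only has to return a 1-D float waveform from `_generate_audio`. A sketch of a third engine (the `SilentEngine` name and its behavior are illustrative only, not part of this commit):

import numpy as np

class SilentEngine(SpeechEngine):
    """Hypothetical engine for illustration: emits silence instead of speech."""

    def _generate_audio(self, text: str, voice: str) -> np.ndarray:
        # One second of silence at the module's 24 kHz sample rate.
        return np.zeros(SAMPLE_RATE, dtype=np.float32)

# Wiring it in would require one more branch in _get_engine, e.g.:
#     elif engine == "silent":
#         return SilentEngine(**kwargs)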
