Skip to content

Commit

Permalink
Merge pull request #77 from vcalv/piper
Browse files Browse the repository at this point in the history
Piper TTS  support
  • Loading branch information
Bryksin authored Aug 24, 2024
2 parents 4ea2e25 + 32a750d commit 25d9e52
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 3 deletions.
29 changes: 27 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ If you're interested in hearing a sample of the audiobook generated by this tool
- [Azure TTS Sample](https://audio.com/paudi/audio/0008-chapter-vii-agricultural-experience)
- [OpenAI TTS Sample](https://audio.com/paudi/audio/openai-0008-chapter-vii-agricultural-experience-i-had-now-been-in)
- Edge TTS Sample: the voice is almost the same as Azure TTS
- [Piper TTS](https://rhasspy.github.io/piper-samples/)

## Requirements

- Python 3.6+ Or ***Docker***
- For using *Azure TTS*, A Microsoft Azure account with access to the [Microsoft Cognitive Services Speech Services](https://portal.azure.com/#create/Microsoft.CognitiveServicesSpeechServices) is required.
- For using *OpenAI TTS*, OpenAI [API Key](https://platform.openai.com/api-keys) is required.
- For using *Edge TTS*, no API Key is required.
- For using *Piper TTS*, the Piper TTS executable and voice models are required.

## Audiobookshelf Integration

Expand Down Expand Up @@ -80,7 +82,7 @@ python3 main.py -h
```
```bash
usage: main.py [-h] [--tts {azure,openai,edge}]
usage: main.py [-h] [--tts {azure,openai,edge,piper}]
[--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--preview]
[--no_prompt] [--language LANGUAGE]
[--newline_mode {single,double,none}]
Expand All @@ -101,7 +103,7 @@ positional arguments:
options:
-h, --help show this help message and exit
--tts {azure,openai,edge}
--tts {azure,openai,edge,piper}
Choose TTS provider (default: azure). azure: Azure
Cognitive Services, openai: OpenAI TTS API. When using
azure, environment variables MS_TTS_KEY and
Expand Down Expand Up @@ -351,6 +353,29 @@ Here are some examples that demonstrate various option combinations:
python3 main.py "path/to/book.epub" "path/to/output/folder" --tts edge --chapter_start 5 --chapter_end 10 --break_duration "1500"
```
### Examples Using Piper TTS
This command converts an EPUB file to an audiobook with Piper TTS, using the bare minimum parameters.
You always need to specify an onnx model file, and the `piper-tts` executable must be available on your `PATH`.
```sh
python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx
```
Some models support multiple voices; a specific voice can be selected with the `--voice_name` parameter.
```sh
python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256
```
You can also specify the speed and the pause duration:
```sh
python3 main.py "path/to/book.epub" "path/to/output/folder" --tts piper --model_name <path_to>/en_US-libritts_r-medium.onnx --voice_name 256 --voice_rate 1.5 --break_duration 0.75
```
Piper TTS outputs `wav` format files (or raw) by default, but you should be able to specify any reasonable format via the `--output_format` parameter.
`opus` and `mp3` are good choices for size and compatibility.
## Troubleshooting
### ModuleNotFoundError: No module named 'importlib_metadata'
Expand Down
6 changes: 5 additions & 1 deletion audiobook_generator/tts_providers/base_tts_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
TTS_AZURE = "azure"
TTS_OPENAI = "openai"
TTS_EDGE = "edge"
TTS_PIPER = 'piper'


class BaseTTSProvider: # Base interface for TTS providers
Expand Down Expand Up @@ -34,7 +35,7 @@ def get_output_file_extension(self):

# Common support methods for all TTS providers
def get_supported_tts_providers() -> List[str]:
return [TTS_AZURE, TTS_OPENAI, TTS_EDGE]
return [TTS_AZURE, TTS_OPENAI, TTS_EDGE, TTS_PIPER]


def get_tts_provider(config) -> BaseTTSProvider:
Expand All @@ -47,5 +48,8 @@ def get_tts_provider(config) -> BaseTTSProvider:
elif config.tts == TTS_EDGE:
from audiobook_generator.tts_providers.edge_tts_provider import EdgeTTSProvider
return EdgeTTSProvider(config)
elif config.tts == TTS_PIPER:
from audiobook_generator.tts_providers.piper_tts_provider import PiperTTSProvider
return PiperTTSProvider(config)
else:
raise ValueError(f"Invalid TTS provider: {config.tts}")
91 changes: 91 additions & 0 deletions audiobook_generator/tts_providers/piper_tts_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import tempfile
from subprocess import run
from pathlib import Path
import logging


from pydub import AudioSegment

from audiobook_generator.config.general_config import GeneralConfig
from audiobook_generator.core.audio_tags import AudioTags
from audiobook_generator.core.utils import set_audio_tags
from audiobook_generator.tts_providers.base_tts_provider import BaseTTSProvider

logger = logging.getLogger(__name__)

__all__ = ["PiperTTSProvider"]


class PiperTTSProvider(BaseTTSProvider):
    """TTS provider that synthesizes speech by running the ``piper-tts`` executable.

    Each call to :meth:`text_to_speech` pipes the text to ``piper-tts`` on
    stdin, lets it write a temporary wav file, converts that file to the
    configured output format with pydub, and finally writes the audio tags.
    """

    def __init__(self, config: GeneralConfig):
        """Fill in Piper-specific defaults on *config* and initialise the base.

        Defaults applied when the corresponding option is unset/falsy:
        ``output_format="opus"``, ``voice_rate=1.0``, ``voice_name="0"``,
        ``break_duration=0.2``.

        Args:
            config: The general configuration; it is modified in place.
        """
        logger.setLevel(config.log)

        # TTS provider specific config
        config.output_format = config.output_format or "opus"
        config.voice_rate = self._parse_voice_rate(config.voice_rate)
        config.voice_name = config.voice_name or "0"
        config.break_duration = config.break_duration or 0.2

        # 0.000$ per 1 million characters
        # or 0.000$ per 1000 characters
        self.price = 0.000
        super().__init__(config)

    @staticmethod
    def _parse_voice_rate(raw_rate) -> float:
        """Convert a user supplied voice rate into a usable positive float.

        ``None`` selects the default of ``1.0``. Values that cannot be parsed,
        or that are not strictly positive and finite, are logged and replaced
        by ``1.0`` because the rate is later inverted to obtain piper's
        ``--length_scale`` (a rate of 0 would raise ``ZeroDivisionError``).
        """
        if raw_rate is None:
            return 1.0
        try:
            rate = float(raw_rate)
        except (TypeError, ValueError):
            logger.error("Invalid voice_rate %r", raw_rate)
            return 1.0
        # The chained comparison is False for 0, negatives, inf and NaN.
        if not 0 < rate < float("inf"):
            logger.error("Invalid voice_rate %r", raw_rate)
            return 1.0
        return rate

    def __str__(self) -> str:
        """Return the string form of the underlying configuration."""
        return f"{self.config}"

    def validate_config(self):
        """No configuration validation is performed for this provider."""
        pass

    def text_to_speech(
        self,
        text: str,
        output_file: str,
        audio_tags: AudioTags,
    ):
        """Synthesize *text* with piper and write the tagged result to *output_file*.

        Args:
            text: The text to speak; sent to ``piper-tts`` as UTF-8 on stdin.
            output_file: Destination path of the converted audio file.
            audio_tags: Tags written to *output_file* after conversion.

        Raises:
            ValueError: If no model file is configured (``model_name``).
            subprocess.CalledProcessError: If ``piper-tts`` exits with a
                non-zero status.
        """
        model_name = self.config.model_name
        if not model_name:
            # Without this check ``None`` would end up in the argv list and
            # subprocess would fail with an opaque TypeError.
            raise ValueError(
                "Piper TTS requires an onnx model file; pass it via --model_name"
            )

        with tempfile.TemporaryDirectory() as tmpdirname:
            logger.debug("created temporary directory %r", tmpdirname)

            tmpfilename = Path(tmpdirname) / "piper.wav"

            # Every argument is passed as ``str`` so the argv list is valid
            # regardless of the configured value types or the platform.
            command = [
                "piper-tts",
                "--model",
                str(model_name),
                "--speaker",
                str(self.config.voice_name),
                "--sentence_silence",
                str(self.config.break_duration),
                # piper scales phoneme length, i.e. the inverse of speed.
                "--length_scale",
                str(1.0 / self.config.voice_rate),
                "-f",
                str(tmpfilename),
            ]
            # check=True: stop here if piper failed instead of tripping over
            # a missing or truncated wav file during conversion.
            run(command, input=text.encode("utf-8"), check=True)

            # Convert the wav file to the desired format
            AudioSegment.from_wav(str(tmpfilename)).export(
                output_file, format=self.config.output_format
            )

        set_audio_tags(output_file, audio_tags)

    def estimate_cost(self, total_chars):
        """Return the estimated cost for *total_chars* characters (always 0)."""
        return 0

    def get_break_string(self):
        """Return the string used to represent a break in the text."""
        return " "

    def get_output_file_extension(self):
        """Return the file extension of generated audio (the output format)."""
        return self.config.output_format

0 comments on commit 25d9e52

Please sign in to comment.