Skip to content

Commit

Permalink
We Should Be Able to Search And Replace Text
Browse files Browse the repository at this point in the history
There are many times when it would be beneficial to search and replace
text in a book, before generating the audio narration. The biggest
reasons would be to:

* expand abbreviations
* fix pronunciation
* replace foreign characters not supported by the engine with phonetic
  equivalents. Right now these are just skipped.
  • Loading branch information
reverendj1 committed Jul 29, 2024
1 parent ecdbf1f commit 79fe01d
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 0 deletions.
36 changes: 36 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,12 @@ options:
--remove_endnotes This will remove endnote numbers from the end or
middle of sentences. This is useful for academic
books.
--search_and_replace_file SEARCH_AND_REPLACE_FILE
Path to a file that contains 1 regex replace per line,
to help with fixing pronunciations, etc. The format is:
<search>==<replace>
Note that you may have to specify word boundaries, to
avoid replacing parts of words.
--voice_name VOICE_NAME
Various TTS providers has different voice names, look
up for your provider settings.
Expand Down Expand Up @@ -204,6 +210,36 @@ Before converting your epub file to an audiobook, you can use the `--preview` op
python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview
```
## Search & Replace
You may want to search and replace text, either to expand abbreviations or to help with pronunciation. To do this, specify a search and replace file that contains one regex search-and-replace rule per line, with the search pattern and its replacement separated by '==':
**Example**:
**search.conf**:
```text
# this is the general structure
<search>==<replace>
# this is a comment
# fix cardinal direction abbreviations
N\.E\.==north east
# be careful with your regexes, as this would also match Sally N. Smith
N\.==north
# pronounce Barbadoes like the locals
Barbadoes==Barbayduss
```
```bash
python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --search_and_replace_file search.conf
```
**Example**:
```bash
python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview
```
## Using with Docker
This tool is available as a Docker image, making it easy to run without needing to manage Python dependencies.
Expand Down
16 changes: 16 additions & 0 deletions audiobook_generator/book_parsers/epub_book_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def get_book_author(self) -> str:

def get_chapters(self, break_string) -> List[Tuple[str, str]]:
chapters = []
search_and_replaces = self.get_search_and_replaces()
for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
content = item.get_content()
soup = BeautifulSoup(content, "lxml-xml")
Expand All @@ -67,6 +68,11 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;”")])\d+', "", cleaned_text)
logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")

# Does user defined search and replaces
for search_and_replace in search_and_replaces:
cleaned_text = re.sub(search_and_replace['search'], search_and_replace['replace'], cleaned_text)
logger.debug(f"Cleaned text step 5: <{cleaned_text[:100]}>")

# Get proper chapter title
if self.config.title_mode == "auto":
title = ""
Expand Down Expand Up @@ -98,6 +104,16 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
soup.decompose()
return chapters

def get_search_and_replaces(self):
    """Load the user-defined search/replace rules from the configured file.

    Reads the file named by ``self.config.search_and_replace_file`` and
    parses one rule per line in the form ``<search>==<replace>``. A line is
    ignored when it starts with ``#`` (comment), contains no ``==``, or
    starts or ends with ``==`` (empty search or empty replacement).

    The line is split on the FIRST ``==`` only, so a replacement may itself
    contain ``==``.

    Returns:
        A list of ``{'search': <regex>, 'replace': <replacement>}`` dicts in
        file order. The list is empty when no file is configured.

    Raises:
        OSError: if the configured file cannot be opened.
    """
    path = self.config.search_and_replace_file
    if not path:
        return []

    search_and_replaces = []
    # Rules are expected to contain non-ASCII characters (e.g. foreign
    # letters mapped to phonetic spellings), so do not rely on the
    # platform's default encoding.
    with open(path, encoding="utf-8") as fp:
        for raw_line in fp:
            # Drop only the line terminator. The last line of the file may
            # not have one, so blindly slicing off the final character
            # would truncate its replacement text. Other whitespace is kept
            # because it can be a meaningful part of a rule.
            line = raw_line.rstrip("\n")
            if line.startswith("#"):
                continue
            if "==" not in line or line.startswith("==") or line.endswith("=="):
                continue
            search, _, replace = line.partition("==")
            search_and_replaces.append({'search': search, 'replace': replace})
    return search_and_replaces

@staticmethod
def _sanitize_title(title, break_string) -> str:
# replace MAGIC_BREAK_STRING with a blank space
Expand Down
1 change: 1 addition & 0 deletions audiobook_generator/config/general_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def __init__(self, args):
self.chapter_start = args.chapter_start
self.chapter_end = args.chapter_end
self.remove_endnotes = args.remove_endnotes
self.search_and_replace_file = args.search_and_replace_file

# TTS provider: common arguments
self.tts = args.tts
Expand Down
9 changes: 9 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ def handle_args():
help="This will remove endnote numbers from the end or middle of sentences. This is useful for academic books.",
)

parser.add_argument(
"--search_and_replace_file",
default="",
help="""Path to a file that contains 1 regex replace per line, to help with fixing pronunciations, etc. The format is:
<search>==<replace>
Note that you may have to specify word boundaries, to avoid replacing parts of words.
""",
)

parser.add_argument(
"--voice_name",
help="Various TTS providers has different voice names, look up for your provider settings.",
Expand Down

0 comments on commit 79fe01d

Please sign in to comment.