From 79fe01d885cf5a9f92cc96afadd997dbc9f98ff9 Mon Sep 17 00:00:00 2001 From: John Wesorick Date: Mon, 29 Jul 2024 17:49:21 -0400 Subject: [PATCH] We Should Be Able to Search And Replace Text There are many times when it would be beneficial to search and replace text in a book, before generating the audio narration. The biggest reasons would be to: * expand abbreviations * fix pronunciation * replace foreign characters not supported by the engine with phonetic equivalents. right now these are just skipped. --- README.md | 36 +++++++++++++++++++ .../book_parsers/epub_book_parser.py | 16 +++++++++ audiobook_generator/config/general_config.py | 1 + main.py | 9 +++++ 4 files changed, 62 insertions(+) diff --git a/README.md b/README.md index 87e27b9..ccbf39d 100644 --- a/README.md +++ b/README.md @@ -152,6 +152,12 @@ options: --remove_endnotes This will remove endnote numbers from the end or middle of sentences. This is useful for academic books. + --search_and_replace_file SEARCH_AND_REPLACE_FILE + Path to a file that contains 1 regex replace per line, + to help with fixing pronunciations, etc. The format is: + <search>==<replace> + Note that you may have to specify word boundaries, to + avoid replacing parts of words. --voice_name VOICE_NAME Various TTS providers has different voice names, look up for your provider settings. @@ -204,6 +210,36 @@ Before converting your epub file to an audiobook, you can use the `--preview` op python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview ``` +## Search & Replace + +You may want to search and replace text, either to expand abbreviations, or to help with pronunciation. 
You can do this by specifying a search and replace file, which contains a single regex search and replace per line, separated by '==': + +**Example**: + +**search.conf**: + +```text +# this is the general structure +<search>==<replace> +# this is a comment +# fix cardinal direction abbreviations +N\.E\.==north east +# be careful with your regexes, as this would also match Sally N. Smith +N\.==north +# pronounce Barbadoes like the locals +Barbadoes==Barbayduss +``` + +```bash +python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --search_and_replace_file search.conf +``` + +**Example**: + +```bash +python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview +``` + ## Using with Docker This tool is available as a Docker image, making it easy to run without needing to manage Python dependencies. diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py index abc6712..95e98bb 100644 --- a/audiobook_generator/book_parsers/epub_book_parser.py +++ b/audiobook_generator/book_parsers/epub_book_parser.py @@ -42,6 +42,7 @@ def get_book_author(self) -> str: def get_chapters(self, break_string) -> List[Tuple[str, str]]: chapters = [] + search_and_replaces = self.get_search_and_replaces() for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT): content = item.get_content() soup = BeautifulSoup(content, "lxml-xml") @@ -67,6 +68,11 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]: cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;”")])\d+', "", cleaned_text) logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>") + # Does user defined search and replaces + for search_and_replace in search_and_replaces: + cleaned_text = re.sub(search_and_replace['search'], search_and_replace['replace'], cleaned_text) + logger.debug(f"Cleaned text step 5: <{cleaned_text[:100]}>") + # Get proper chapter title if self.config.title_mode == "auto": title = "" @@ -98,6 
+104,33 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
             soup.decompose()
         return chapters
 
+    def get_search_and_replaces(self):
+        """Load the user-defined search-and-replace rules.
+
+        Reads ``self.config.search_and_replace_file`` when it is set and
+        returns a list of ``{'search': ..., 'replace': ...}`` dicts, one per
+        ``<search>==<replace>`` line, in file order. Blank lines, lines
+        starting with ``#`` and lines whose search or replacement part is
+        empty are skipped. Returns an empty list when no file is configured.
+        """
+        search_and_replaces = []
+        if not self.config.search_and_replace_file:
+            return search_and_replaces
+        with open(self.config.search_and_replace_file, encoding="utf-8") as fp:
+            for line in fp:
+                # Strip only the line terminator: the final line may not have
+                # one, and blindly dropping the last character would truncate
+                # its replacement text.
+                line = line.rstrip("\n")
+                if line.startswith("#"):
+                    continue
+                # Split on the first '==' only, so the replacement itself may
+                # contain '=='.
+                search, separator, replace = line.partition("==")
+                if separator and search and replace:
+                    search_and_replaces.append({"search": search, "replace": replace})
+        return search_and_replaces
+
     @staticmethod
     def _sanitize_title(title, break_string) -> str:
         # replace MAGIC_BREAK_STRING with a blank space
diff --git a/audiobook_generator/config/general_config.py b/audiobook_generator/config/general_config.py
index 513acb9..d577393 100644
--- a/audiobook_generator/config/general_config.py
+++ b/audiobook_generator/config/general_config.py
@@ -14,6 +14,7 @@ def __init__(self, args):
         self.chapter_start = args.chapter_start
         self.chapter_end = args.chapter_end
         self.remove_endnotes = args.remove_endnotes
+        self.search_and_replace_file = args.search_and_replace_file
 
         # TTS provider: common arguments
         self.tts = args.tts
diff --git a/main.py b/main.py
index d4677f7..368d8e1 100644
--- a/main.py
+++ b/main.py
@@ -81,6 +81,15 @@ def handle_args():
         help="This will remove endnote numbers from the end or middle of sentences. This is useful for academic books.",
     )
 
+    parser.add_argument(
+        "--search_and_replace_file",
+        default="",
+        help="""Path to a file that contains 1 regex replace per line, to help with fixing pronunciations, etc. The format is:
+        <search>==<replace>
+        Note that you may have to specify word boundaries, to avoid replacing parts of words.
+ """, + ) + parser.add_argument( "--voice_name", help="Various TTS providers has different voice names, look up for your provider settings.",