We Should Be Able to Search And Replace Text

There are many times when it would be beneficial to search and replace text in a book before generating the audio narration. The biggest reasons would be to:

* expand abbreviations
* fix pronunciation
* replace foreign characters not supported by the engine with phonetic equivalents (right now, such characters are simply skipped)
p0n1 · Jul 29, 2024 · 79fe01d · 79fe01d
1 parent ecdbf1f
commit 79fe01d
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -152,6 +152,12 @@ options:
   --remove_endnotes     This will remove endnote numbers from the end or
                         middle of sentences. This is useful for academic
                         books.
+  --search_and_replace_file SEARCH_AND_REPLACE_FILE
+                        Path to a file that contains 1 regex replace per line,
+                        to help with fixing pronunciations, etc. The format is:
+                        <search>==<replace>
+                        Note that you may have to specify word boundaries, to
+                        avoid replacing parts of words.
   --voice_name VOICE_NAME
                         Various TTS providers has different voice names, look
                         up for your provider settings.
@@ -204,6 +210,36 @@ Before converting your epub file to an audiobook, you can use the `--preview` op
 python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview
 ```
 
+## Search & Replace
+
+You may want to search and replace text, either to expand abbreviations, or to help with pronunciation. You can do this by specifying a search and replace file, which contains a single regex search and replace per line, separated by '==':
+
+**Example**:
+
+**search.conf**:
+
+```text
+# this is the general structure
+<search>==<replace>
+# this is a comment
+# fix cardinal direction abbreviations
+N\.E\.==north east
+# be careful with your regexes, as this would also match Sally N. Smith
+N\.==north
+# pronounce Barbadoes like the locals
+Barbadoes==Barbayduss
+```
+
+```bash
+python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --search_and_replace_file search.conf
+```
+
+**Example**:
+
+```bash
+python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview
+```
+
 ## Using with Docker
 
 This tool is available as a Docker image, making it easy to run without needing to manage Python dependencies.

diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py
@@ -42,6 +42,7 @@ def get_book_author(self) -> str:
 
     def get_chapters(self, break_string) -> List[Tuple[str, str]]:
         chapters = []
+        search_and_replaces = self.get_search_and_replaces()
         for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
             content = item.get_content()
             soup = BeautifulSoup(content, "lxml-xml")
@@ -67,6 +68,11 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
                 cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;”")])\d+', "", cleaned_text)
                 logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")
 
+            # Does user defined search and replaces
+            for search_and_replace in search_and_replaces:
+                cleaned_text = re.sub(search_and_replace['search'], search_and_replace['replace'], cleaned_text)
+            logger.debug(f"Cleaned text step 5: <{cleaned_text[:100]}>")
+
             # Get proper chapter title
             if self.config.title_mode == "auto":
                 title = ""
@@ -98,6 +104,16 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
             soup.decompose()
         return chapters
 
+    def get_search_and_replaces(self):
+        search_and_replaces = []
+        if self.config.search_and_replace_file:
+            with open(self.config.search_and_replace_file) as fp:
+                search_and_replace_content = fp.readlines()
+                for search_and_replace in search_and_replace_content:
+                    if '==' in search_and_replace and not search_and_replace.startswith('==') and not search_and_replace.endswith('==') and not search_and_replace.startswith('#'):
+                        search_and_replaces = search_and_replaces + [ {'search': r"{}".format(search_and_replace.split('==')[0]), 'replace': r"{}".format(search_and_replace.split('==')[1][:-1])} ]
+        return search_and_replaces
+
     @staticmethod
     def _sanitize_title(title, break_string) -> str:
         # replace MAGIC_BREAK_STRING with a blank space

diff --git a/audiobook_generator/config/general_config.py b/audiobook_generator/config/general_config.py
@@ -14,6 +14,7 @@ def __init__(self, args):
         self.chapter_start = args.chapter_start
         self.chapter_end = args.chapter_end
         self.remove_endnotes = args.remove_endnotes
+        self.search_and_replace_file = args.search_and_replace_file
 
         # TTS provider: common arguments
         self.tts = args.tts

diff --git a/main.py b/main.py
@@ -81,6 +81,15 @@ def handle_args():
         help="This will remove endnote numbers from the end or middle of sentences. This is useful for academic books.",
     )
 
+    parser.add_argument(
+        "--search_and_replace_file",
+        default="",
+        help="""Path to a file that contains 1 regex replace per line, to help with fixing pronunciations, etc. The format is:
+        <search>==<replace>
+        Note that you may have to specify word boundaries, to avoid replacing parts of words.
+        """,
+    )
+
     parser.add_argument(
         "--voice_name",
         help="Various TTS providers has different voice names, look up for your provider settings.",