Skip to content

Commit

Permalink
Merge pull request #80 from reverendj1/main
Browse files Browse the repository at this point in the history
We Should Be Able to Search And Replace Text
  • Loading branch information
p0n1 authored Sep 4, 2024
2 parents 25d9e52 + 79fe01d commit 082c752
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 0 deletions.
36 changes: 36 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ options:
--remove_endnotes This will remove endnote numbers from the end or
middle of sentences. This is useful for academic
books.
--search_and_replace_file SEARCH_AND_REPLACE_FILE
Path to a file that contains 1 regex replace per line,
to help with fixing pronunciations, etc. The format is:
<search>==<replace>
Note that you may have to specify word boundaries, to
avoid replacing parts of words.
--voice_name VOICE_NAME
Various TTS providers has different voice names, look
up for your provider settings.
Expand Down Expand Up @@ -206,6 +212,36 @@ Before converting your epub file to an audiobook, you can use the `--preview` op
python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview
```
## Search & Replace
You may want to search and replace text, either to expand abbreviations or to help with pronunciation. You can do this by specifying a search-and-replace file, which contains one regex search and its replacement per line, separated by `==`. Lines starting with `#` are treated as comments:
**Example**:
**search.conf**:
```text
# this is the general structure
<search>==<replace>
# this is a comment
# fix cardinal direction abbreviations
N\.E\.==north east
# be careful with your regexes, as this would also match Sally N. Smith
N\.==north
# pronounce Barbadoes like the locals
Barbadoes==Barbayduss
```
```bash
python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --search_and_replace_file search.conf
```
**Example**:
```bash
python3 main.py examples/The_Life_and_Adventures_of_Robinson_Crusoe.epub output_folder --preview
```
## Using with Docker
This tool is available as a Docker image, making it easy to run without needing to manage Python dependencies.
Expand Down
16 changes: 16 additions & 0 deletions audiobook_generator/book_parsers/epub_book_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def get_book_author(self) -> str:

def get_chapters(self, break_string) -> List[Tuple[str, str]]:
chapters = []
search_and_replaces = self.get_search_and_replaces()
for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
content = item.get_content()
soup = BeautifulSoup(content, "lxml-xml")
Expand All @@ -67,6 +68,11 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;”")])\d+', "", cleaned_text)
logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")

# Apply the user-defined search and replaces
for search_and_replace in search_and_replaces:
cleaned_text = re.sub(search_and_replace['search'], search_and_replace['replace'], cleaned_text)
logger.debug(f"Cleaned text step 5: <{cleaned_text[:100]}>")

# Get proper chapter title
if self.config.title_mode == "auto":
title = ""
Expand Down Expand Up @@ -98,6 +104,16 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
soup.decompose()
return chapters

def get_search_and_replaces(self):
    """Load the user-defined regex replacements from the configured file.

    The file named by ``self.config.search_and_replace_file`` holds one
    rule per line in the form ``<search>==<replace>``. A line is skipped
    when it starts with ``#`` (comment), contains no ``==`` separator, or
    has an empty search pattern (i.e. starts with ``==``).

    Only the first ``==`` on a line separates the pattern from the
    replacement, so the replacement text may itself contain ``==``. An
    empty replacement is allowed and deletes whatever the pattern matches.

    Returns:
        A list of ``{'search': str, 'replace': str}`` dicts in file order;
        an empty list when no file is configured.

    Raises:
        OSError: if the configured file cannot be opened.
    """
    path = self.config.search_and_replace_file
    if not path:
        return []

    search_and_replaces = []
    # Explicit encoding so non-ASCII patterns/pronunciations load the same
    # way on every platform instead of depending on the locale default.
    with open(path, encoding="utf-8") as fp:
        for raw_line in fp:
            # Strip only the line terminator; the last line of a file may
            # not have one, so blindly dropping the final character would
            # truncate its replacement text.
            line = raw_line.rstrip("\n")
            if line.startswith("#"):
                continue
            search, separator, replace = line.partition("==")
            if not separator or not search:
                continue
            search_and_replaces.append({"search": search, "replace": replace})
    return search_and_replaces

@staticmethod
def _sanitize_title(title, break_string) -> str:
# replace MAGIC_BREAK_STRING with a blank space
Expand Down
1 change: 1 addition & 0 deletions audiobook_generator/config/general_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def __init__(self, args):
self.chapter_start = args.chapter_start
self.chapter_end = args.chapter_end
self.remove_endnotes = args.remove_endnotes
self.search_and_replace_file = args.search_and_replace_file

# TTS provider: common arguments
self.tts = args.tts
Expand Down
9 changes: 9 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ def handle_args():
help="This will remove endnote numbers from the end or middle of sentences. This is useful for academic books.",
)

parser.add_argument(
"--search_and_replace_file",
default="",
help="""Path to a file that contains 1 regex replace per line, to help with fixing pronunciations, etc. The format is:
<search>==<replace>
Note that you may have to specify word boundaries, to avoid replacing parts of words.
""",
)

parser.add_argument(
"--voice_name",
help="Various TTS providers has different voice names, look up for your provider settings.",
Expand Down

0 comments on commit 082c752

Please sign in to comment.