From 1be794c3a16054377c4547c62e756c50b925c9f6 Mon Sep 17 00:00:00 2001
From: xtmu <90405292+xtmu@users.noreply.github.com>
Date: Mon, 8 Jan 2024 20:08:48 +0800
Subject: [PATCH] fix: retrieve title.
Retrieve the chapter title from fallback heading tags when the
title tag is missing: h1, h2, h3 (tried in that order).
---
.../book_parsers/epub_book_parser.py | 12 +-
.../core/audiobook_generator.py | 126 +++++++++---------
2 files changed, 74 insertions(+), 64 deletions(-)
diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py
index 4d3f5f8..d551576 100644
--- a/audiobook_generator/book_parsers/epub_book_parser.py
+++ b/audiobook_generator/book_parsers/epub_book_parser.py
@@ -45,7 +45,12 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
content = item.get_content()
soup = BeautifulSoup(content, "lxml")
- title = soup.title.string if soup.title else ""
+ title = ""
+ title_levels = ['title', 'h1', 'h2', 'h3']
+ for level in title_levels:
+ if soup.find(level):
+ title = soup.find(level).text
+ break
raw = soup.get_text(strip=False)
logger.debug(f"Raw text: <{raw[:]}>")
@@ -67,7 +72,7 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")
# fill in the title if it's missing
- if not title:
+ if title == "":
title = cleaned_text[:60]
logger.debug(f"Raw title: <{title}>")
title = self._sanitize_title(title, break_string)
@@ -77,7 +82,8 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
soup.decompose()
return chapters
- def _sanitize_title(self, title, break_string) -> str:
+ @staticmethod
+ def _sanitize_title(title, break_string) -> str:
# replace MAGIC_BREAK_STRING with a blank space
# strip incase leading bank is missing
title = title.replace(break_string, " ")
diff --git a/audiobook_generator/core/audiobook_generator.py b/audiobook_generator/core/audiobook_generator.py
index ecaa17e..c21ef3f 100644
--- a/audiobook_generator/core/audiobook_generator.py
+++ b/audiobook_generator/core/audiobook_generator.py
@@ -34,64 +34,68 @@ def __str__(self) -> str:
return f"{self.config}"
def run(self):
- book_parser = get_book_parser(self.config)
- tts_provider = get_tts_provider(self.config)
-
- os.makedirs(self.config.output_folder, exist_ok=True)
- chapters = book_parser.get_chapters(tts_provider.get_break_string())
- # Filter out empty or very short chapters
- chapters = [(title, text) for title, text in chapters if text.strip()]
-
- logger.info(f"Chapters count: {len(chapters)}.")
-
- # Check chapter start and end args
- if self.config.chapter_start < 1 or self.config.chapter_start > len(chapters):
- raise ValueError(
- f"Chapter start index {self.config.chapter_start} is out of range. Check your input."
- )
- if self.config.chapter_end < -1 or self.config.chapter_end > len(chapters):
- raise ValueError(
- f"Chapter end index {self.config.chapter_end} is out of range. Check your input."
- )
- if self.config.chapter_end == -1:
- self.config.chapter_end = len(chapters)
- if self.config.chapter_start > self.config.chapter_end:
- raise ValueError(
- f"Chapter start index {self.config.chapter_start} is larger than chapter end index {self.config.chapter_end}. Check your input."
- )
-
- logger.info(f"Converting chapters from {self.config.chapter_start} to {self.config.chapter_end}.")
-
- # Initialize total_characters to 0
- total_characters = get_total_chars(chapters)
- logger.info(f"✨ Total characters in selected book: {total_characters} ✨")
- rough_price = tts_provider.estimate_cost(total_characters)
- confirm_conversion(rough_price)
-
- # Loop through each chapter and convert it to speech using the provided TTS provider
- for idx, (title, text) in enumerate(chapters, start=1):
- if idx < self.config.chapter_start:
- continue
- if idx > self.config.chapter_end:
- break
- logger.info(
- f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
- )
-
- if self.config.output_text:
- text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
- with open(text_file, "w", encoding='utf-8') as file:
- file.write(text)
-
- if self.config.preview:
- continue
-
- output_file = os.path.join(self.config.output_folder,
- f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")
-
- audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
- tts_provider.text_to_speech(
- text,
- output_file,
- audio_tags,
- )
+ try:
+ book_parser = get_book_parser(self.config)
+ tts_provider = get_tts_provider(self.config)
+
+ os.makedirs(self.config.output_folder, exist_ok=True)
+ chapters = book_parser.get_chapters(tts_provider.get_break_string())
+ # Filter out empty or very short chapters
+ chapters = [(title, text) for title, text in chapters if text.strip()]
+
+ logger.info(f"Chapters count: {len(chapters)}.")
+
+ # Check chapter start and end args
+ if self.config.chapter_start < 1 or self.config.chapter_start > len(chapters):
+ raise ValueError(
+ f"Chapter start index {self.config.chapter_start} is out of range. Check your input."
+ )
+ if self.config.chapter_end < -1 or self.config.chapter_end > len(chapters):
+ raise ValueError(
+ f"Chapter end index {self.config.chapter_end} is out of range. Check your input."
+ )
+ if self.config.chapter_end == -1:
+ self.config.chapter_end = len(chapters)
+ if self.config.chapter_start > self.config.chapter_end:
+ raise ValueError(
+ f"Chapter start index {self.config.chapter_start} is larger than chapter end index {self.config.chapter_end}. Check your input."
+ )
+
+ logger.info(f"Converting chapters from {self.config.chapter_start} to {self.config.chapter_end}.")
+
+ # Initialize total_characters to 0
+ total_characters = get_total_chars(chapters)
+ logger.info(f"✨ Total characters in selected book: {total_characters} ✨")
+ rough_price = tts_provider.estimate_cost(total_characters)
+ confirm_conversion(rough_price)
+
+ # Loop through each chapter and convert it to speech using the provided TTS provider
+ for idx, (title, text) in enumerate(chapters, start=1):
+ if idx < self.config.chapter_start:
+ continue
+ if idx > self.config.chapter_end:
+ break
+ logger.info(
+ f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
+ )
+
+ if self.config.output_text:
+ text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
+ with open(text_file, "w", encoding='utf-8') as file:
+ file.write(text)
+
+ if self.config.preview:
+ continue
+
+ output_file = os.path.join(self.config.output_folder,
+ f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")
+
+ audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
+ tts_provider.text_to_speech(
+ text,
+ output_file,
+ audio_tags,
+ )
+ except KeyboardInterrupt:
+ logger.info("Job stopped by user.")
+ exit()