Skip to content

Commit

Permalink
fix: retrieve title.
Browse files Browse the repository at this point in the history
Retrieve the title from fallback tags: <h1>, <h2>, <h3>.
  • Loading branch information
xtmu committed Jan 8, 2024
1 parent 825e3d9 commit 1be794c
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 64 deletions.
12 changes: 9 additions & 3 deletions audiobook_generator/book_parsers/epub_book_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,12 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
content = item.get_content()
soup = BeautifulSoup(content, "lxml")
title = soup.title.string if soup.title else ""
title = ""
title_levels = ['title', 'h1', 'h2', 'h3']
for level in title_levels:
if soup.find(level):
title = soup.find(level).text
break
raw = soup.get_text(strip=False)
logger.debug(f"Raw text: <{raw[:]}>")

Expand All @@ -67,7 +72,7 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")

# fill in the title if it's missing
if not title:
if title == "":
title = cleaned_text[:60]
logger.debug(f"Raw title: <{title}>")
title = self._sanitize_title(title, break_string)
Expand All @@ -77,7 +82,8 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
soup.decompose()
return chapters

def _sanitize_title(self, title, break_string) -> str:
@staticmethod
def _sanitize_title(title, break_string) -> str:
# replace MAGIC_BREAK_STRING with a blank space
# strip in case the leading blank is missing
title = title.replace(break_string, " ")
Expand Down
126 changes: 65 additions & 61 deletions audiobook_generator/core/audiobook_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,64 +34,68 @@ def __str__(self) -> str:
return f"{self.config}"

def run(self):
book_parser = get_book_parser(self.config)
tts_provider = get_tts_provider(self.config)

os.makedirs(self.config.output_folder, exist_ok=True)
chapters = book_parser.get_chapters(tts_provider.get_break_string())
# Filter out empty or very short chapters
chapters = [(title, text) for title, text in chapters if text.strip()]

logger.info(f"Chapters count: {len(chapters)}.")

# Check chapter start and end args
if self.config.chapter_start < 1 or self.config.chapter_start > len(chapters):
raise ValueError(
f"Chapter start index {self.config.chapter_start} is out of range. Check your input."
)
if self.config.chapter_end < -1 or self.config.chapter_end > len(chapters):
raise ValueError(
f"Chapter end index {self.config.chapter_end} is out of range. Check your input."
)
if self.config.chapter_end == -1:
self.config.chapter_end = len(chapters)
if self.config.chapter_start > self.config.chapter_end:
raise ValueError(
f"Chapter start index {self.config.chapter_start} is larger than chapter end index {self.config.chapter_end}. Check your input."
)

logger.info(f"Converting chapters from {self.config.chapter_start} to {self.config.chapter_end}.")

# Initialize total_characters to 0
total_characters = get_total_chars(chapters)
logger.info(f"✨ Total characters in selected book: {total_characters} ✨")
rough_price = tts_provider.estimate_cost(total_characters)
confirm_conversion(rough_price)

# Loop through each chapter and convert it to speech using the provided TTS provider
for idx, (title, text) in enumerate(chapters, start=1):
if idx < self.config.chapter_start:
continue
if idx > self.config.chapter_end:
break
logger.info(
f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
)

if self.config.output_text:
text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
with open(text_file, "w", encoding='utf-8') as file:
file.write(text)

if self.config.preview:
continue

output_file = os.path.join(self.config.output_folder,
f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")

audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
tts_provider.text_to_speech(
text,
output_file,
audio_tags,
)
try:
book_parser = get_book_parser(self.config)
tts_provider = get_tts_provider(self.config)

os.makedirs(self.config.output_folder, exist_ok=True)
chapters = book_parser.get_chapters(tts_provider.get_break_string())
# Filter out chapters whose text is empty or whitespace-only
chapters = [(title, text) for title, text in chapters if text.strip()]

logger.info(f"Chapters count: {len(chapters)}.")

# Check chapter start and end args
if self.config.chapter_start < 1 or self.config.chapter_start > len(chapters):
raise ValueError(
f"Chapter start index {self.config.chapter_start} is out of range. Check your input."
)
if self.config.chapter_end < -1 or self.config.chapter_end > len(chapters):
raise ValueError(
f"Chapter end index {self.config.chapter_end} is out of range. Check your input."
)
if self.config.chapter_end == -1:
self.config.chapter_end = len(chapters)
if self.config.chapter_start > self.config.chapter_end:
raise ValueError(
f"Chapter start index {self.config.chapter_start} is larger than chapter end index {self.config.chapter_end}. Check your input."
)

logger.info(f"Converting chapters from {self.config.chapter_start} to {self.config.chapter_end}.")

# Compute the total character count across the chapters
total_characters = get_total_chars(chapters)
logger.info(f"✨ Total characters in selected book: {total_characters} ✨")
rough_price = tts_provider.estimate_cost(total_characters)
confirm_conversion(rough_price)

# Loop through each chapter and convert it to speech using the provided TTS provider
for idx, (title, text) in enumerate(chapters, start=1):
if idx < self.config.chapter_start:
continue
if idx > self.config.chapter_end:
break
logger.info(
f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
)

if self.config.output_text:
text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
with open(text_file, "w", encoding='utf-8') as file:
file.write(text)

if self.config.preview:
continue

output_file = os.path.join(self.config.output_folder,
f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")

audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
tts_provider.text_to_speech(
text,
output_file,
audio_tags,
)
except KeyboardInterrupt:
logger.info("Job stopped by user.")
exit()

0 comments on commit 1be794c

Please sign in to comment.