Patch summary
=============
* epub_book_parser.py: take the chapter title from <title>, falling back to
  the first <h1>, <h2>, then <h3>. Each tag is looked up once, and tags whose
  text is blank are skipped so a usable heading is not masked by an empty
  <title>. If nothing usable is found, the first 60 characters of the cleaned
  text are used, as before. _sanitize_title becomes a @staticmethod.
* audiobook_generator.py: wrap run() so that Ctrl-C logs a message and ends
  the process without a traceback. SystemExit is raised directly because the
  exit() helper is installed by the site module and may be unavailable.

diff --git a/audiobook_generator/book_parsers/epub_book_parser.py b/audiobook_generator/book_parsers/epub_book_parser.py
index 4d3f5f8..d551576 100644
--- a/audiobook_generator/book_parsers/epub_book_parser.py
+++ b/audiobook_generator/book_parsers/epub_book_parser.py
@@ -45,7 +45,13 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
         for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
             content = item.get_content()
             soup = BeautifulSoup(content, "lxml")
-            title = soup.title.string if soup.title else ""
+            title = ""
+            # Prefer <title>, then fall back to h1, h2, h3; skip blank tags.
+            for level in ('title', 'h1', 'h2', 'h3'):
+                heading = soup.find(level)
+                if heading is not None and heading.text.strip():
+                    title = heading.text
+                    break
             raw = soup.get_text(strip=False)
             logger.debug(f"Raw text: <{raw[:]}>")
 
@@ -67,7 +73,7 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
             logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")
 
             # fill in the title if it's missing
-            if not title:
+            if not title.strip():
                 title = cleaned_text[:60]
             logger.debug(f"Raw title: <{title}>")
             title = self._sanitize_title(title, break_string)
@@ -77,7 +83,8 @@ def get_chapters(self, break_string) -> List[Tuple[str, str]]:
             soup.decompose()
         return chapters
 
-    def _sanitize_title(self, title, break_string) -> str:
+    @staticmethod
+    def _sanitize_title(title, break_string) -> str:
         # replace MAGIC_BREAK_STRING with a blank space
         # strip incase leading bank is missing
         title = title.replace(break_string, " ")
diff --git a/audiobook_generator/core/audiobook_generator.py b/audiobook_generator/core/audiobook_generator.py
index ecaa17e..c21ef3f 100644
--- a/audiobook_generator/core/audiobook_generator.py
+++ b/audiobook_generator/core/audiobook_generator.py
@@ -34,64 +34,78 @@ def __str__(self) -> str:
         return f"{self.config}"
 
     def run(self):
-        book_parser = get_book_parser(self.config)
-        tts_provider = get_tts_provider(self.config)
-
-        os.makedirs(self.config.output_folder, exist_ok=True)
-        chapters = book_parser.get_chapters(tts_provider.get_break_string())
-        # Filter out empty or very short chapters
-        chapters = [(title, text) for title, text in chapters if text.strip()]
-
-        logger.info(f"Chapters count: {len(chapters)}.")
-
-        # Check chapter start and end args
-        if self.config.chapter_start < 1 or self.config.chapter_start > len(chapters):
-            raise ValueError(
-                f"Chapter start index {self.config.chapter_start} is out of range. Check your input."
-            )
-        if self.config.chapter_end < -1 or self.config.chapter_end > len(chapters):
-            raise ValueError(
-                f"Chapter end index {self.config.chapter_end} is out of range. Check your input."
-            )
-        if self.config.chapter_end == -1:
-            self.config.chapter_end = len(chapters)
-        if self.config.chapter_start > self.config.chapter_end:
-            raise ValueError(
-                f"Chapter start index {self.config.chapter_start} is larger than chapter end index {self.config.chapter_end}. Check your input."
-            )
-
-        logger.info(f"Converting chapters from {self.config.chapter_start} to {self.config.chapter_end}.")
-
-        # Initialize total_characters to 0
-        total_characters = get_total_chars(chapters)
-        logger.info(f"✨ Total characters in selected book: {total_characters} ✨")
-        rough_price = tts_provider.estimate_cost(total_characters)
-        confirm_conversion(rough_price)
-
-        # Loop through each chapter and convert it to speech using the provided TTS provider
-        for idx, (title, text) in enumerate(chapters, start=1):
-            if idx < self.config.chapter_start:
-                continue
-            if idx > self.config.chapter_end:
-                break
-            logger.info(
-                f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
-            )
-
-            if self.config.output_text:
-                text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
-                with open(text_file, "w", encoding='utf-8') as file:
-                    file.write(text)
-
-            if self.config.preview:
-                continue
-
-            output_file = os.path.join(self.config.output_folder,
-                                       f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")
-
-            audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
-            tts_provider.text_to_speech(
-                text,
-                output_file,
-                audio_tags,
-            )
+        """Convert the selected chapters of the configured book to audio.
+
+        Parses the book, validates the chapter range from the config, passes
+        the estimated cost to confirm_conversion(), then calls the TTS provider
+        for each selected chapter. Ctrl-C is logged and ends the process.
+
+        Raises:
+            ValueError: If chapter_start/chapter_end are out of range.
+        """
+        try:
+            book_parser = get_book_parser(self.config)
+            tts_provider = get_tts_provider(self.config)
+
+            os.makedirs(self.config.output_folder, exist_ok=True)
+            chapters = book_parser.get_chapters(tts_provider.get_break_string())
+            # Filter out chapters whose text is empty or whitespace-only
+            chapters = [(title, text) for title, text in chapters if text.strip()]
+
+            logger.info(f"Chapters count: {len(chapters)}.")
+
+            # Check chapter start and end args
+            if self.config.chapter_start < 1 or self.config.chapter_start > len(chapters):
+                raise ValueError(
+                    f"Chapter start index {self.config.chapter_start} is out of range. Check your input."
+                )
+            if self.config.chapter_end < -1 or self.config.chapter_end > len(chapters):
+                raise ValueError(
+                    f"Chapter end index {self.config.chapter_end} is out of range. Check your input."
+                )
+            if self.config.chapter_end == -1:
+                self.config.chapter_end = len(chapters)
+            if self.config.chapter_start > self.config.chapter_end:
+                raise ValueError(
+                    f"Chapter start index {self.config.chapter_start} is larger than chapter end index {self.config.chapter_end}. Check your input."
+                )
+
+            logger.info(f"Converting chapters from {self.config.chapter_start} to {self.config.chapter_end}.")
+
+            # The total character count drives the cost estimate below
+            total_characters = get_total_chars(chapters)
+            logger.info(f"✨ Total characters in selected book: {total_characters} ✨")
+            rough_price = tts_provider.estimate_cost(total_characters)
+            confirm_conversion(rough_price)
+
+            # Loop through each chapter and convert it to speech using the provided TTS provider
+            for idx, (title, text) in enumerate(chapters, start=1):
+                if idx < self.config.chapter_start:
+                    continue
+                if idx > self.config.chapter_end:
+                    break
+                logger.info(
+                    f"Converting chapter {idx}/{len(chapters)}: {title}, characters: {len(text)}"
+                )
+
+                if self.config.output_text:
+                    text_file = os.path.join(self.config.output_folder, f"{idx:04d}_{title}.txt")
+                    with open(text_file, "w", encoding='utf-8') as file:
+                        file.write(text)
+
+                if self.config.preview:
+                    continue
+
+                output_file = os.path.join(self.config.output_folder,
+                                           f"{idx:04d}_{title}.{tts_provider.get_output_file_extension()}")
+
+                audio_tags = AudioTags(title, book_parser.get_book_author(), book_parser.get_book_title(), idx)
+                tts_provider.text_to_speech(
+                    text,
+                    output_file,
+                    audio_tags,
+                )
+        except KeyboardInterrupt:
+            logger.info("Job stopped by user.")
+            # exit() is injected by the site module and may be absent; raise SystemExit directly
+            raise SystemExit(0)