1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129import logging
import re
from typing import List, Tuple
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from audiobook_generator.book_parsers.base_book_parser import BaseBookParser
from audiobook_generator.config.general_config import GeneralConfig
logger = logging.getLogger(__name__)
class EpubBookParser(BaseBookParser):
def __init__(self, config: GeneralConfig):
super().__init__(config)
self.book = epub.read_epub(self.config.input_file, {"ignore_ncx": True})
def __str__(self) -> str:
return super().__str__()
def validate_config(self):
if self.config.input_file is None:
raise ValueError("Epub Parser: Input file cannot be empty")
if not self.config.input_file.endswith(".epub"):
raise ValueError(f"Epub Parser: Unsupported file format: {self.config.input_file}")
def get_book(self):
return self.book
def get_book_title(self) -> str:
if self.book.get_metadata('DC', 'title'):
return self.book.get_metadata("DC", "title")[0][0]
return "Untitled"
def get_book_author(self) -> str:
if self.book.get_metadata('DC', 'creator'):
return self.book.get_metadata("DC", "creator")[0][0]
return "Unknown"
def get_chapters(self, break_string) -> List[Tuple[str, str]]:
chapters = []
search_and_replaces = self.get_search_and_replaces()
for item in self.book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
content = item.get_content()
soup = BeautifulSoup(content, "lxml-xml")
raw = soup.get_text(strip=False)
logger.debug(f"Raw text: <{raw[:]}>")
# Replace excessive whitespaces and newline characters based on the mode
if self.config.newline_mode == "single":
cleaned_text = re.sub(r"[\n]+", break_string, raw.strip())
elif self.config.newline_mode == "double":
cleaned_text = re.sub(r"[\n]{2,}", break_string, raw.strip())
elif self.config.newline_mode == "none":
cleaned_text = re.sub(r"[\n]+", " ", raw.strip())
else:
raise ValueError(f"Invalid newline mode: {self.config.newline_mode}")
logger.debug(f"Cleaned text step 1: <{cleaned_text[:]}>")
cleaned_text = re.sub(r"\s+", " ", cleaned_text)
logger.debug(f"Cleaned text step 2: <{cleaned_text[:100]}>")
# Removes end-note numbers
if self.config.remove_endnotes:
cleaned_text = re.sub(r'(?<=[a-zA-Z.,!?;โ")])\d+', "", cleaned_text)
logger.debug(f"Cleaned text step 4: <{cleaned_text[:100]}>")
# Removes references numbers like [1] or [2.3]
if self.config.remove_reference_numbers:
cleaned_text = re.sub(r'\[\d+(\.\d+)?\]', '', cleaned_text)
logger.debug(f"Cleaned text step 4.1 (removed brackets): <{cleaned_text[:100]}>")
# Does user defined search and replaces
for search_and_replace in search_and_replaces:
cleaned_text = re.sub(search_and_replace['search'], search_and_replace['replace'], cleaned_text)
logger.debug(f"Cleaned text step 5: <{cleaned_text[:100]}>")
# Get proper chapter title
if self.config.title_mode == "auto":
title = ""
title_levels = ['title', 'h1', 'h2', 'h3']
for level in title_levels:
if soup.find(level):
title = soup.find(level).text
break
if title.strip() == "" or re.match(r'^\d{1,3}$',title) is not None:
title = cleaned_text[:60]
elif self.config.title_mode == "tag_text":
title = ""
title_levels = ['title', 'h1', 'h2', 'h3']
for level in title_levels:
if soup.find(level):
title = soup.find(level).text
break
if title.strip() == "":
title = "<blank>"
elif self.config.title_mode == "first_few":
title = cleaned_text[:60]
else:
raise ValueError("Unsupported title_mode")
logger.debug(f"Raw title: <{title}>")
title = self._sanitize_title(title, break_string)
logger.debug(f"Sanitized title: <{title}>")
chapters.append((title, cleaned_text))
soup.decompose()
return chapters
def get_search_and_replaces(self):
search_and_replaces = []
if self.config.search_and_replace_file:
with open(self.config.search_and_replace_file) as fp:
search_and_replace_content = fp.readlines()
for search_and_replace in search_and_replace_content:
if '==' in search_and_replace and not search_and_replace.startswith('==') and not search_and_replace.endswith('==') and not search_and_replace.startswith('#'):
search_and_replaces = search_and_replaces + [ {'search': r"{}".format(search_and_replace.split('==')[0]), 'replace': r"{}".format(search_and_replace.split('==')[1][:-1])} ]
return search_and_replaces
@staticmethod
def _sanitize_title(title, break_string) -> str:
# replace MAGIC_BREAK_STRING with a blank space
# strip incase leading bank is missing
title = title.replace(break_string, " ")
sanitized_title = re.sub(r"[^\w\s]", "", title, flags=re.UNICODE)
sanitized_title = re.sub(r"\s+", "_", sanitized_title.strip())
return sanitized_title