1use crate::content::Page;
2use crate::cover::Cover;
3use crate::image::Image;
4use crate::metadata::Metadata;
5use crate::toc::TocEntry;
6use crate::zip_handler::ZipHandler;
7use quick_xml::events::Event;
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10
/// In-memory result of parsing an EPUB archive with [`Epub::parse`].
#[derive(Debug)]
pub struct Epub {
    /// Metadata fields read from the OPF package document.
    pub metadata: Metadata,
    /// Table of contents parsed from the NCX file; empty when the OPF
    /// manifest does not reference an NCX document.
    pub toc: Vec<TocEntry>,
    /// One page per spine `itemref` that resolves to a manifest item, holding
    /// the text extracted from that content document.
    pub pages: Vec<Page>,
    /// Cover image; left at `Cover::default()` when no cover id is declared,
    /// the id is not in the manifest, or the file cannot be read.
    pub cover: Cover,
    /// Every manifest item whose media type starts with `image/`.
    pub images: Vec<Image>,
}
19
/// Errors reported while opening and parsing an EPUB.
#[derive(Debug)]
pub enum Error {
    /// Not constructed in this file; presumably signals a structurally
    /// invalid archive, with a human-readable reason (confirm against callers).
    InvalidEpub(String),
    /// An underlying I/O operation failed.
    IoError(std::io::Error),
    /// The zip layer reported a failure.
    ZipError(zip::result::ZipError),
    /// XML parsing failed; carries the display text of the underlying error.
    XmlError(String),
    /// Not constructed in this file; presumably the container descriptor is
    /// absent from the archive (confirm against `ZipHandler`).
    MissingContainer,
    /// Not constructed in this file; presumably the OPF package document
    /// could not be located (confirm against `ZipHandler`).
    MissingOpf,
    /// Not constructed in this file; presumably reserved for a missing NCX
    /// document (confirm against callers).
    MissingNcx,
}
30
31impl From<std::io::Error> for Error {
32 fn from(err: std::io::Error) -> Self {
33 Error::IoError(err)
34 }
35}
36
37impl From<zip::result::ZipError> for Error {
38 fn from(err: zip::result::ZipError) -> Self {
39 Error::ZipError(err)
40 }
41}
42
43impl From<quick_xml::Error> for Error {
44 fn from(err: quick_xml::Error) -> Self {
45 Error::XmlError(err.to_string())
46 }
47}
48
49impl Epub {
50 pub fn parse(path: &Path) -> Result<Self, Error> {
51 let mut zip_handler = ZipHandler::new(path)?;
52
53 let opf_path = zip_handler.get_opf_path()?;
54 let opf_content = zip_handler.read_file(&opf_path)?;
55
56 let (metadata, manifest, spine, ncx_path, cover_id) = Self::parse_opf(&opf_content)?;
57
58 let toc = if let Some(ncx_ref) = ncx_path {
59 let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
60 let ncx_content = zip_handler.read_file(&ncx_path_full)?;
61 Self::parse_ncx(&ncx_content)?
62 } else {
63 Vec::new()
64 };
65
66 let mut pages = Vec::new();
67 for itemref in spine {
68 if let Some(manifest_item) = manifest.get(&itemref) {
69 let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
70 let content = zip_handler.read_file(&content_path)?;
71 let text = Self::extract_text_from_html(&content)?;
72 pages.push(Page {
73 index: pages.len(),
74 content: text,
75 });
76 }
77 }
78
79 let mut cover = Cover::default();
80 if let Some(cover_id) = cover_id {
81 if let Some(cover_item) = manifest.get(&cover_id) {
82 let cover_path = Self::resolve_path(&opf_path, &cover_item.href);
83 match zip_handler.read_file_as_bytes(&cover_path) {
84 Ok(bytes) => {
85 cover.href = Some(cover_item.href.clone());
86 cover.content = Some(bytes);
87 }
88 Err(_) => {}
89 }
90 }
91 }
92
93 let mut images = Vec::new();
94 for (id, item) in &manifest {
95 if item._media_type.starts_with("image/") {
96 let image_path = Self::resolve_path(&opf_path, &item.href);
97 match zip_handler.read_file_as_bytes(&image_path) {
98 Ok(bytes) => {
99 images.push(Image {
100 id: id.clone(),
101 href: item.href.clone(),
102 media_type: item._media_type.clone(),
103 content: Some(bytes),
104 });
105 }
106 Err(_) => {
107 images.push(Image {
108 id: id.clone(),
109 href: item.href.clone(),
110 media_type: item._media_type.clone(),
111 content: None,
112 });
113 }
114 }
115 }
116 }
117
118 Ok(Epub {
119 metadata,
120 toc,
121 pages,
122 cover,
123 images,
124 })
125 }
126
127 fn parse_opf(
128 content: &str,
129 ) -> Result<
130 (
131 Metadata,
132 HashMap<String, ManifestItem>,
133 Vec<String>,
134 Option<String>,
135 Option<String>,
136 ),
137 Error,
138 > {
139 let mut reader = quick_xml::Reader::from_str(content);
140 let mut metadata = Metadata::new();
141 let mut manifest: HashMap<String, ManifestItem> = HashMap::new();
142 let mut spine: Vec<String> = Vec::new();
143 let mut ncx_path: Option<String> = None;
144 let mut cover_id: Option<String> = None;
145
146 let mut current_text_tag: Option<String> = None;
147
148 let mut buf = Vec::new();
149
150 loop {
151 match reader.read_event_into(&mut buf) {
152 Ok(Event::Start(ref e)) => {
153 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
154 if name.contains("title") {
155 current_text_tag = Some("title".to_string());
156 } else if name.contains("creator") {
157 current_text_tag = Some("author".to_string());
158 } else if name.contains("publisher") {
159 current_text_tag = Some("publisher".to_string());
160 } else if name.contains("language") {
161 current_text_tag = Some("language".to_string());
162 } else if name.contains("identifier") {
163 current_text_tag = Some("identifier".to_string());
164 } else if name.contains("date") {
165 current_text_tag = Some("date".to_string());
166 } else if name.contains("rights") {
167 current_text_tag = Some("rights".to_string());
168 }
169 }
170 Ok(Event::Empty(ref e)) => {
171 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
172 if name.contains("meta") {
173 let mut is_cover = false;
174 for attr_result in e.attributes() {
175 if let Ok(attr) = attr_result {
176 let attr_name =
177 String::from_utf8_lossy(attr.key.as_ref()).to_string();
178 if attr_name.contains("name") {
179 let value = attr
180 .decode_and_unescape_value(reader.decoder())?
181 .to_string();
182 if value == "cover" {
183 is_cover = true;
184 }
185 } else if attr_name.contains("content") {
186 if is_cover {
187 if let Some(val) =
188 attr.decode_and_unescape_value(reader.decoder()).ok()
189 {
190 cover_id = Some(val.to_string());
191 }
192 }
193 }
194 }
195 }
196 } else if name.contains("item") && !name.contains("itemref") {
197 let mut id = String::new();
198 let mut href = String::new();
199 let mut media_type = String::new();
200
201 for attr_result in e.attributes() {
202 if let Ok(attr) = attr_result {
203 let attr_name =
204 String::from_utf8_lossy(attr.key.as_ref()).to_string();
205 if attr_name == "id" || attr_name.ends_with(":id") {
206 if let Some(val) =
207 attr.decode_and_unescape_value(reader.decoder()).ok()
208 {
209 id = val.to_string();
210 }
211 } else if attr_name == "href" || attr_name.ends_with(":href") {
212 href = attr
213 .decode_and_unescape_value(reader.decoder())?
214 .to_string();
215 } else if attr_name == "media-type"
216 || attr_name.ends_with(":media-type")
217 {
218 media_type = attr
219 .decode_and_unescape_value(reader.decoder())?
220 .to_string();
221 }
222 }
223 }
224
225 if !id.is_empty() && !href.is_empty() {
226 if media_type == "application/x-dtbncx+xml" {
227 ncx_path = Some(href.clone());
228 }
229 manifest.insert(
230 id.clone(),
231 ManifestItem {
232 _id: id.clone(),
233 href,
234 _media_type: media_type,
235 },
236 );
237 }
238 } else if name.contains("itemref") {
239 let mut idref = String::new();
240
241 for attr_result in e.attributes() {
242 if let Ok(attr) = attr_result {
243 let attr_name =
244 String::from_utf8_lossy(attr.key.as_ref()).to_string();
245 if attr_name == "idref" || attr_name.ends_with(":idref") {
246 if let Some(val) =
247 attr.decode_and_unescape_value(reader.decoder()).ok()
248 {
249 idref = val.to_string();
250 }
251 break;
252 }
253 }
254 }
255
256 if !idref.is_empty() {
257 spine.push(idref);
258 }
259 }
260 }
261 Ok(Event::Text(e)) => {
262 if let Some(tag) = ¤t_text_tag {
263 let text = e.unescape()?.into_owned().trim().to_string();
264 if !text.is_empty() {
265 match tag.as_str() {
266 "title" => metadata.title = Some(text),
267 "author" => metadata.author = Some(text),
268 "publisher" => metadata.publisher = Some(text),
269 "language" => metadata.language = Some(text),
270 "identifier" => metadata.identifier = Some(text),
271 "date" => metadata.date = Some(text),
272 "rights" => metadata.rights = Some(text),
273 _ => {}
274 }
275 }
276 current_text_tag = None;
277 }
278 }
279 Ok(Event::End(_)) => {
280 current_text_tag = None;
281 }
282 Ok(Event::Eof) => break,
283 Err(e) => return Err(Error::XmlError(e.to_string())),
284 _ => {}
285 }
286 buf.clear();
287 }
288
289 Ok((metadata, manifest, spine, ncx_path, cover_id))
290 }
291
292 fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
293 let mut reader = quick_xml::Reader::from_str(content);
294 let mut toc = Vec::new();
295 let mut stack: Vec<TocEntry> = Vec::new();
296
297 let mut buf = Vec::new();
298 let mut in_nav_label = false;
299 let mut in_text = false;
300
301 loop {
302 match reader.read_event_into(&mut buf) {
303 Ok(Event::Start(ref e)) => {
304 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
305 if name == "navPoint" {
306 let entry = TocEntry {
307 label: String::new(),
308 href: String::new(),
309 children: Vec::new(),
310 };
311 stack.push(entry);
312 } else if name == "navLabel" {
313 in_nav_label = true;
314 } else if name == "text" && in_nav_label {
315 in_text = true;
316 }
317 }
318 Ok(Event::End(ref e)) => {
319 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
320 if name == "navPoint" {
321 if let Some(entry) = stack.pop() {
322 if let Some(parent) = stack.last_mut() {
323 parent.children.push(entry);
324 } else {
325 toc.push(entry);
326 }
327 }
328 } else if name == "navLabel" {
329 in_nav_label = false;
330 } else if name == "text" && in_nav_label {
331 in_text = false;
332 }
333 }
334 Ok(Event::Text(e)) => {
335 if in_text {
336 if let Some(entry) = stack.last_mut() {
337 entry.label = e.unescape()?.into_owned();
338 }
339 }
340 }
341 Ok(Event::Empty(ref e)) => {
342 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
343 if name == "content" {
344 if let Some(src) = e.try_get_attribute("src")? {
345 if let Some(entry) = stack.last_mut() {
346 entry.href = src.decode_and_unescape_value(reader.decoder())?.to_string();
347 }
348 }
349 }
350 }
351 Ok(Event::Eof) => break,
352 Err(e) => return Err(Error::XmlError(e.to_string())),
353 _ => {}
354 }
355 buf.clear();
356 }
357
358 Ok(toc)
359 }
360
361 fn extract_text_from_html(content: &str) -> Result<String, Error> {
362 let mut reader = quick_xml::Reader::from_str(content);
363 let mut text = String::new();
364 let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
365 let mut in_skip_tag = false;
366
367 let mut buf = Vec::new();
368
369 loop {
370 match reader.read_event_into(&mut buf) {
371 Ok(Event::Start(ref e)) => {
372 let tag = e.name().as_ref().to_vec();
373 if skip_tags.contains(&tag) {
374 in_skip_tag = true;
375 } else if tag.as_slice() == b"p"
376 || tag.as_slice() == b"div"
377 || tag.as_slice() == b"br"
378 || tag.as_slice() == b"li"
379 {
380 text.push('\n');
381 }
382 }
383 Ok(Event::End(ref e)) => {
384 let tag = e.name().as_ref().to_vec();
385 if skip_tags.contains(&tag) {
386 in_skip_tag = false;
387 }
388 }
389 Ok(Event::Text(e)) => {
390 if !in_skip_tag {
391 let t = e.unescape()?.into_owned();
392 let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
393 text.push_str(&trimmed);
394 text.push(' ');
395 }
396 }
397 Ok(Event::Eof) => break,
398 Err(e) => return Err(Error::XmlError(e.to_string())),
399 _ => {}
400 }
401 buf.clear();
402 }
403
404 Ok(text
405 .lines()
406 .map(|l| l.trim())
407 .filter(|l| !l.is_empty())
408 .collect::<Vec<_>>()
409 .join("\n"))
410 }
411
412 fn resolve_path(base_path: &str, href: &str) -> String {
413 let base = PathBuf::from(base_path);
414 let parent = base.parent().unwrap_or(base.as_path());
415 let resolved = parent.join(href);
416 resolved.to_string_lossy().to_string().replace('\\', "/")
417 }
418}
419
/// One `item` entry of the OPF manifest, as collected by `Epub::parse_opf`.
#[derive(Debug, Clone)]
struct ManifestItem {
    /// Manifest id of the item; the same value is used as the key of the
    /// manifest map.
    _id: String,
    /// Location of the resource as written in the OPF; `Epub::parse` resolves
    /// it against the OPF's directory before reading.
    href: String,
    /// Declared media type. Despite the underscore prefix this field is read:
    /// `Epub::parse` uses it to select `image/*` items.
    _media_type: String,
}