Skip to main content

epub_parser/
epub.rs

1use crate::types::{Image, Metadata, Page, TocEntry};
2use crate::utils::{preprocess_html_entities, ZipHandler};
3use ordered_hash_map::OrderedHashMap;
4use quick_xml::events::Event;
5use std::io::Cursor;
6use std::path::{Path, PathBuf};
7
8/// A parsed EPUB e-book representation.
9///
10/// This struct contains all the extracted data from an EPUB file including:
11/// - Metadata (title, author, publisher, etc.)
12/// - Table of contents (hierarchical navigation)
13/// - Text content (pages in reading order)
14/// - Images (including cover image)
15///
16/// # Example
17///
18/// ```
19/// use epub_parser::Epub;
20/// use std::path::Path;
21///
22/// let epub = Epub::parse(Path::new("book.epub"))?;
23///
24/// // Access metadata
25/// if let Some(title) = &epub.metadata.title {
26///     println!("Title: {}", title);
27/// }
28///
29/// // Access all pages
30/// for page in &epub.pages {
31///     println!("Page {}: {} chars", page.index, page.content.len());
32/// }
33///
34/// // Access images
35/// for image in &epub.images {
36///     println!("Image: {} ({})", image.href, image.media_type);
37/// }
38/// ```
39#[derive(Debug)]
40pub struct Epub {
41    /// The Dublin Core metadata extracted from the EPUB.
42    pub metadata: Metadata,
43    /// The hierarchical table of contents from the NCX file.
44    pub toc: Vec<TocEntry>,
45    /// The text content of each page in reading order.
46    pub pages: Vec<Page>,
47    /// All images found in the EPUB (including cover).
48    pub images: Vec<Image>,
49}
50
51/// Errors that can occur while parsing an EPUB file.
52#[derive(Debug)]
53pub enum Error {
54    /// The EPUB file is invalid or corrupted.
55    InvalidEpub(String),
56    /// An I/O error occurred while reading the file.
57    IoError(std::io::Error),
58    /// An error occurred while reading the ZIP archive.
59    ZipError(zip::result::ZipError),
60    /// An error occurred while parsing XML.
61    XmlError(String),
62    /// The META-INF/container.xml file is missing.
63    MissingContainer,
64    /// The OPF package file is missing or not found.
65    MissingOpf,
66    /// The NCX table of contents file is missing.
67    MissingNcx,
68}
69
70impl std::fmt::Display for Error {
71    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72        match self {
73            Error::InvalidEpub(msg) => write!(f, "Invalid EPUB: {}", msg),
74            Error::IoError(e) => write!(f, "I/O error: {}", e),
75            Error::ZipError(e) => write!(f, "ZIP error: {}", e),
76            Error::XmlError(e) => write!(f, "XML error: {}", e),
77            Error::MissingContainer => write!(f, "Missing container.xml"),
78            Error::MissingOpf => write!(f, "Missing OPF file"),
79            Error::MissingNcx => write!(f, "Missing NCX file"),
80        }
81    }
82}
83
84impl std::error::Error for Error {
85    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
86        match self {
87            Error::IoError(e) => Some(e),
88            Error::ZipError(e) => Some(e),
89            _ => None,
90        }
91    }
92}
93
94impl From<std::io::Error> for Error {
95    fn from(err: std::io::Error) -> Self {
96        Error::IoError(err)
97    }
98}
99
100impl From<zip::result::ZipError> for Error {
101    fn from(err: zip::result::ZipError) -> Self {
102        Error::ZipError(err)
103    }
104}
105
106impl From<quick_xml::Error> for Error {
107    fn from(err: quick_xml::Error) -> Self {
108        Error::XmlError(err.to_string())
109    }
110}
111
112impl Epub {
113    /// Parse an EPUB file from a file path.
114    ///
115    /// # Arguments
116    ///
117    /// * `path` - The path to the EPUB file.
118    ///
119    /// # Returns
120    ///
121    /// Returns `Ok(Epub)` on success, or an `Error` if parsing fails.
122    ///
123    /// # Errors
124    ///
125    /// This function will return an error if:
126    /// - The file does not exist
127    /// - The file is not a valid ZIP archive
128    /// - The EPUB structure is invalid
129    ///
130    /// # Example
131    ///
132    /// ```
133    /// use epub_parser::Epub;
134    /// use std::path::Path;
135    ///
136    /// let epub = Epub::parse(Path::new("book.epub"))?;
137    /// println!("Parsed: {}", epub.metadata.title.unwrap_or_default());
138    /// # Ok::<(), Box<dyn std::error::Error>>(())
139    /// ```
140    pub fn parse(path: &Path) -> Result<Self, Error> {
141        let mut zip_handler = ZipHandler::new(path)?;
142        Self::parse_from_handler(&mut zip_handler)
143    }
144
145    /// Parse an EPUB file from a byte buffer.
146    ///
147    /// This is useful when you have the EPUB data in memory, for example
148    /// when downloading from a network or reading from a database.
149    ///
150    /// # Arguments
151    ///
152    /// * `buffer` - The raw bytes of the EPUB file.
153    ///
154    /// # Returns
155    ///
156    /// Returns `Ok(Epub)` on success, or an `Error` if parsing fails.
157    ///
158    /// # Example
159    ///
160    /// ```
161    /// use epub_parser::Epub;
162    ///
163    /// let bytes = std::fs::read("book.epub")?;
164    /// let epub = Epub::parse_from_buffer(&bytes)?;
165    /// println!("Parsed: {}", epub.metadata.title.unwrap_or_default());
166    /// # Ok::<(), Box<dyn std::error::Error>>(())
167    /// ```
168    pub fn parse_from_buffer(buffer: &[u8]) -> Result<Self, Error> {
169        let cursor = Cursor::new(buffer.to_vec());
170        let mut zip_handler = ZipHandler::new_from_reader(cursor)?;
171        Self::parse_from_handler(&mut zip_handler)
172    }
173
174    fn parse_from_handler<R: std::io::Read + std::io::Seek>(
175        zip_handler: &mut ZipHandler<R>,
176    ) -> Result<Self, Error> {
177        let opf_path = zip_handler.get_opf_path()?;
178        let opf_content = zip_handler.read_file(&opf_path)?;
179
180        let (metadata, manifest, spine, ncx_path) = Self::parse_opf(&opf_content)?;
181
182        let toc = if let Some(ncx_ref) = ncx_path {
183            let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
184            let ncx_content = zip_handler.read_file(&ncx_path_full)?;
185            Self::parse_ncx(&ncx_content)?
186        } else {
187            Vec::new()
188        };
189
190        let mut pages = Vec::new();
191        for itemref in spine {
192            if let Some(manifest_item) = manifest.get(&itemref) {
193                let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
194                match zip_handler.read_file(&content_path) {
195                    Ok(content) => {
196                        if let Ok(text) = Self::extract_text_from_html(&content) {
197                            pages.push(Page {
198                                index: pages.len(),
199                                content: text,
200                            });
201                        }
202                    }
203                    Err(e) => {
204                        eprintln!(
205                            "Warning: Could not read content file '{}': {}",
206                            content_path, e
207                        );
208                    }
209                }
210            }
211        }
212
213        let mut images = Vec::new();
214        for (id, item) in &manifest {
215            if item._media_type.to_lowercase().starts_with("image/") {
216                let image_path = Self::resolve_path(&opf_path, &item.href);
217                if let Ok(bytes) = zip_handler.read_file_as_bytes(&image_path) {
218                    if id.to_lowercase().contains("cover") {
219                        images.insert(
220                            0,
221                            Image {
222                                id: id.clone(),
223                                href: item.href.clone(),
224                                media_type: item._media_type.clone(),
225                                content: bytes,
226                            },
227                        );
228                    } else {
229                        images.push(Image {
230                            id: id.clone(),
231                            href: item.href.clone(),
232                            media_type: item._media_type.clone(),
233                            content: bytes,
234                        });
235                    }
236                }
237            }
238        }
239
240        Ok(Epub {
241            metadata,
242            toc,
243            pages,
244            images,
245        })
246    }
247
248    fn parse_opf(
249        content: &str,
250    ) -> Result<
251        (
252            Metadata,
253            OrderedHashMap<String, ManifestItem>,
254            Vec<String>,
255            Option<String>,
256        ),
257        Error,
258    > {
259        let content = preprocess_html_entities(content);
260        let mut reader = quick_xml::Reader::from_str(&content);
261        let mut metadata = Metadata::new();
262        let mut manifest: OrderedHashMap<String, ManifestItem> = OrderedHashMap::new();
263        let mut spine: Vec<String> = Vec::new();
264        let mut ncx_path: Option<String> = None;
265
266        let mut current_text_tag: Option<String> = None;
267
268        let mut buf = Vec::new();
269
270        loop {
271            match reader.read_event_into(&mut buf) {
272                Ok(Event::Start(ref e)) => {
273                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
274                    if name.contains("title") {
275                        current_text_tag = Some("title".to_string());
276                    } else if name.contains("creator") {
277                        current_text_tag = Some("author".to_string());
278                    } else if name.contains("publisher") {
279                        current_text_tag = Some("publisher".to_string());
280                    } else if name.contains("language") {
281                        current_text_tag = Some("language".to_string());
282                    } else if name.contains("identifier") {
283                        current_text_tag = Some("identifier".to_string());
284                    } else if name.contains("date") {
285                        current_text_tag = Some("date".to_string());
286                    } else if name.contains("rights") {
287                        current_text_tag = Some("rights".to_string());
288                    }
289                }
290                Ok(Event::Empty(ref e)) => {
291                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
292                    if name.contains("item") && !name.contains("itemref") {
293                        let mut id = String::new();
294                        let mut href = String::new();
295                        let mut media_type = String::new();
296
297                        for attr_result in e.attributes() {
298                            if let Ok(attr) = attr_result {
299                                let attr_name =
300                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
301                                if attr_name == "id" || attr_name.ends_with(":id") {
302                                    if let Some(val) =
303                                        attr.decode_and_unescape_value(reader.decoder()).ok()
304                                    {
305                                        id = val.to_string();
306                                    }
307                                } else if attr_name == "href" || attr_name.ends_with(":href") {
308                                    href = attr
309                                        .decode_and_unescape_value(reader.decoder())?
310                                        .to_string();
311                                } else if attr_name == "media-type"
312                                    || attr_name.ends_with(":media-type")
313                                {
314                                    media_type = attr
315                                        .decode_and_unescape_value(reader.decoder())?
316                                        .to_string();
317                                }
318                            }
319                        }
320
321                        if !id.is_empty() && !href.is_empty() {
322                            if media_type == "application/x-dtbncx+xml" {
323                                ncx_path = Some(href.clone());
324                            }
325                            manifest.insert(
326                                id.clone(),
327                                ManifestItem {
328                                    _id: id.clone(),
329                                    href,
330                                    _media_type: media_type,
331                                },
332                            );
333                        }
334                    } else if name.contains("itemref") {
335                        let mut idref = String::new();
336
337                        for attr_result in e.attributes() {
338                            if let Ok(attr) = attr_result {
339                                let attr_name =
340                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
341                                if attr_name == "idref" || attr_name.ends_with(":idref") {
342                                    if let Some(val) =
343                                        attr.decode_and_unescape_value(reader.decoder()).ok()
344                                    {
345                                        idref = val.to_string();
346                                    }
347                                    break;
348                                }
349                            }
350                        }
351
352                        if !idref.is_empty() {
353                            spine.push(idref);
354                        }
355                    }
356                }
357                Ok(Event::Text(e)) => {
358                    if let Some(tag) = &current_text_tag {
359                        let text = e.unescape()?.into_owned().trim().to_string();
360                        if !text.is_empty() {
361                            match tag.as_str() {
362                                "title" => metadata.title = Some(text),
363                                "author" => metadata.author = Some(text),
364                                "publisher" => metadata.publisher = Some(text),
365                                "language" => metadata.language = Some(text),
366                                "identifier" => metadata.identifier = Some(text),
367                                "date" => metadata.date = Some(text),
368                                "rights" => metadata.rights = Some(text),
369                                _ => {}
370                            }
371                        }
372                        current_text_tag = None;
373                    }
374                }
375                Ok(Event::End(_)) => {
376                    current_text_tag = None;
377                }
378                Ok(Event::Eof) => break,
379                Err(e) => return Err(Error::XmlError(e.to_string())),
380                _ => {}
381            }
382            buf.clear();
383        }
384
385        Ok((metadata, manifest, spine, ncx_path))
386    }
387
388    fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
389        let content = preprocess_html_entities(content);
390        let mut reader = quick_xml::Reader::from_str(&content);
391        let mut toc = Vec::new();
392        let mut stack: Vec<TocEntry> = Vec::new();
393
394        let mut buf = Vec::new();
395        let mut in_nav_label = false;
396        let mut in_text = false;
397
398        loop {
399            match reader.read_event_into(&mut buf) {
400                Ok(Event::Start(ref e)) => {
401                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
402                    if name == "navPoint" {
403                        let entry = TocEntry {
404                            label: String::new(),
405                            href: String::new(),
406                            children: Vec::new(),
407                        };
408                        stack.push(entry);
409                    } else if name == "navLabel" {
410                        in_nav_label = true;
411                    } else if name == "text" && in_nav_label {
412                        in_text = true;
413                    }
414                }
415                Ok(Event::End(ref e)) => {
416                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
417                    if name == "navPoint" {
418                        if let Some(entry) = stack.pop() {
419                            if let Some(parent) = stack.last_mut() {
420                                parent.children.push(entry);
421                            } else {
422                                toc.push(entry);
423                            }
424                        }
425                    } else if name == "navLabel" {
426                        in_nav_label = false;
427                    } else if name == "text" && in_nav_label {
428                        in_text = false;
429                    }
430                }
431                Ok(Event::Text(e)) => {
432                    if in_text {
433                        if let Some(entry) = stack.last_mut() {
434                            entry.label = e.unescape()?.into_owned();
435                        }
436                    }
437                }
438                Ok(Event::Empty(ref e)) => {
439                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
440                    if name == "content" {
441                        if let Some(src) = e.try_get_attribute("src")? {
442                            if let Some(entry) = stack.last_mut() {
443                                entry.href =
444                                    src.decode_and_unescape_value(reader.decoder())?.to_string();
445                            }
446                        }
447                    }
448                }
449                Ok(Event::Eof) => break,
450                Err(e) => return Err(Error::XmlError(e.to_string())),
451                _ => {}
452            }
453            buf.clear();
454        }
455
456        Ok(toc)
457    }
458
459    fn extract_text_from_html(content: &str) -> Result<String, Error> {
460        let content = preprocess_html_entities(content);
461        let mut reader = quick_xml::Reader::from_str(&content);
462        let mut text = String::new();
463        let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
464        let mut in_skip_tag = false;
465
466        let mut buf = Vec::new();
467
468        loop {
469            match reader.read_event_into(&mut buf) {
470                Ok(Event::Start(ref e)) => {
471                    let tag = e.name().as_ref().to_vec();
472                    if skip_tags.contains(&tag) {
473                        in_skip_tag = true;
474                    } else if tag.as_slice() == b"p"
475                        || tag.as_slice() == b"div"
476                        || tag.as_slice() == b"br"
477                        || tag.as_slice() == b"li"
478                    {
479                        text.push('\n');
480                    }
481                }
482                Ok(Event::End(ref e)) => {
483                    let tag = e.name().as_ref().to_vec();
484                    if skip_tags.contains(&tag) {
485                        in_skip_tag = false;
486                    }
487                }
488                Ok(Event::Text(e)) => {
489                    if !in_skip_tag {
490                        if let Ok(unescaped) = e.unescape() {
491                            let t = unescaped.into_owned();
492                            let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
493                            text.push_str(&trimmed);
494                            text.push(' ');
495                        }
496                    }
497                }
498                Ok(Event::Eof) => break,
499                Err(e) => {
500                    eprintln!(
501                        "Warning: XML parse error in HTML content, continuing: {}",
502                        e
503                    );
504                    break;
505                }
506                _ => {}
507            }
508            buf.clear();
509        }
510
511        Ok(text
512            .lines()
513            .map(|l| l.trim())
514            .filter(|l| !l.is_empty())
515            .collect::<Vec<_>>()
516            .join("\n"))
517    }
518
519    fn resolve_path(base_path: &str, href: &str) -> String {
520        let base = PathBuf::from(base_path);
521        let parent = base.parent().unwrap_or(base.as_path());
522        let resolved = parent.join(href);
523        resolved.to_string_lossy().to_string().replace('\\', "/")
524    }
525}
526
/// One `<item>` entry from the OPF manifest.
#[derive(Debug, Clone)]
struct ManifestItem {
    /// The item's `id` attribute; the manifest map is keyed by the same value.
    _id: String,
    /// The item's `href` attribute, relative to the OPF document.
    href: String,
    /// The item's `media-type` attribute (empty when the attribute is absent).
    _media_type: String,
}