epub_parser/
epub.rs

1use crate::content::Page;
2use crate::cover::Cover;
3use crate::image::Image;
4use crate::metadata::Metadata;
5use crate::toc::TocEntry;
6use crate::zip_handler::ZipHandler;
7use quick_xml::events::Event;
8use std::collections::HashMap;
9use std::io::Cursor;
10use std::path::{Path, PathBuf};
11
/// A fully parsed EPUB publication.
///
/// Built by [`Epub::parse`] or [`Epub::parse_from_buffer`].
#[derive(Debug)]
pub struct Epub {
    /// Metadata collected from the OPF package document.
    pub metadata: Metadata,
    /// Table of contents parsed from the NCX file; empty when the manifest
    /// lists no NCX item.
    pub toc: Vec<TocEntry>,
    /// One page per spine entry that resolves to a manifest item, in spine
    /// order, holding the text extracted from that document.
    pub pages: Vec<Page>,
    /// Cover image; left at `Cover::default()` when no cover is declared or
    /// the declared file cannot be read.
    pub cover: Cover,
    /// Every manifest item whose media type starts with `image/`.
    pub images: Vec<Image>,
}
20
21#[derive(Debug)]
22pub enum Error {
23    InvalidEpub(String),
24    IoError(std::io::Error),
25    ZipError(zip::result::ZipError),
26    XmlError(String),
27    MissingContainer,
28    MissingOpf,
29    MissingNcx,
30}
31
32impl From<std::io::Error> for Error {
33    fn from(err: std::io::Error) -> Self {
34        Error::IoError(err)
35    }
36}
37
38impl From<zip::result::ZipError> for Error {
39    fn from(err: zip::result::ZipError) -> Self {
40        Error::ZipError(err)
41    }
42}
43
44impl From<quick_xml::Error> for Error {
45    fn from(err: quick_xml::Error) -> Self {
46        Error::XmlError(err.to_string())
47    }
48}
49
50impl Epub {
51    pub fn parse(path: &Path) -> Result<Self, Error> {
52        let mut zip_handler = ZipHandler::new(path)?;
53        Self::parse_from_handler(&mut zip_handler)
54    }
55
56    pub fn parse_from_buffer(buffer: &[u8]) -> Result<Self, Error> {
57        let cursor = Cursor::new(buffer.to_vec());
58        let mut zip_handler = ZipHandler::new_from_reader(cursor)?;
59        Self::parse_from_handler(&mut zip_handler)
60    }
61
62    fn parse_from_handler<R: std::io::Read + std::io::Seek>(
63        zip_handler: &mut ZipHandler<R>,
64    ) -> Result<Self, Error> {
65        let opf_path = zip_handler.get_opf_path()?;
66        let opf_content = zip_handler.read_file(&opf_path)?;
67
68        let (metadata, manifest, spine, ncx_path, cover_id) = Self::parse_opf(&opf_content)?;
69
70        let toc = if let Some(ncx_ref) = ncx_path {
71            let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
72            let ncx_content = zip_handler.read_file(&ncx_path_full)?;
73            Self::parse_ncx(&ncx_content)?
74        } else {
75            Vec::new()
76        };
77
78        let mut pages = Vec::new();
79        for itemref in spine {
80            if let Some(manifest_item) = manifest.get(&itemref) {
81                let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
82                let content = zip_handler.read_file(&content_path)?;
83                let text = Self::extract_text_from_html(&content)?;
84                pages.push(Page {
85                    index: pages.len(),
86                    content: text,
87                });
88            }
89        }
90
91        let mut cover = Cover::default();
92        if let Some(cover_id) = cover_id {
93            if let Some(cover_item) = manifest.get(&cover_id) {
94                let cover_path = Self::resolve_path(&opf_path, &cover_item.href);
95                match zip_handler.read_file_as_bytes(&cover_path) {
96                    Ok(bytes) => {
97                        cover.href = Some(cover_item.href.clone());
98                        cover.content = Some(bytes);
99                    }
100                    Err(_) => {}
101                }
102            }
103        }
104
105        let mut images = Vec::new();
106        for (id, item) in &manifest {
107            if item._media_type.starts_with("image/") {
108                let image_path = Self::resolve_path(&opf_path, &item.href);
109                match zip_handler.read_file_as_bytes(&image_path) {
110                    Ok(bytes) => {
111                        images.push(Image {
112                            id: id.clone(),
113                            href: item.href.clone(),
114                            media_type: item._media_type.clone(),
115                            content: Some(bytes),
116                        });
117                    }
118                    Err(_) => {
119                        images.push(Image {
120                            id: id.clone(),
121                            href: item.href.clone(),
122                            media_type: item._media_type.clone(),
123                            content: None,
124                        });
125                    }
126                }
127            }
128        }
129
130        Ok(Epub {
131            metadata,
132            toc,
133            pages,
134            cover,
135            images,
136        })
137    }
138
139    fn parse_opf(
140        content: &str,
141    ) -> Result<
142        (
143            Metadata,
144            HashMap<String, ManifestItem>,
145            Vec<String>,
146            Option<String>,
147            Option<String>,
148        ),
149        Error,
150    > {
151        let mut reader = quick_xml::Reader::from_str(content);
152        let mut metadata = Metadata::new();
153        let mut manifest: HashMap<String, ManifestItem> = HashMap::new();
154        let mut spine: Vec<String> = Vec::new();
155        let mut ncx_path: Option<String> = None;
156        let mut cover_id: Option<String> = None;
157
158        let mut current_text_tag: Option<String> = None;
159
160        let mut buf = Vec::new();
161
162        loop {
163            match reader.read_event_into(&mut buf) {
164                Ok(Event::Start(ref e)) => {
165                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
166                    if name.contains("title") {
167                        current_text_tag = Some("title".to_string());
168                    } else if name.contains("creator") {
169                        current_text_tag = Some("author".to_string());
170                    } else if name.contains("publisher") {
171                        current_text_tag = Some("publisher".to_string());
172                    } else if name.contains("language") {
173                        current_text_tag = Some("language".to_string());
174                    } else if name.contains("identifier") {
175                        current_text_tag = Some("identifier".to_string());
176                    } else if name.contains("date") {
177                        current_text_tag = Some("date".to_string());
178                    } else if name.contains("rights") {
179                        current_text_tag = Some("rights".to_string());
180                    }
181                }
182                Ok(Event::Empty(ref e)) => {
183                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
184                    if name.contains("meta") {
185                        let mut is_cover = false;
186                        for attr_result in e.attributes() {
187                            if let Ok(attr) = attr_result {
188                                let attr_name =
189                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
190                                if attr_name.contains("name") {
191                                    let value = attr
192                                        .decode_and_unescape_value(reader.decoder())?
193                                        .to_string();
194                                    if value == "cover" {
195                                        is_cover = true;
196                                    }
197                                } else if attr_name.contains("content") {
198                                    if is_cover {
199                                        if let Some(val) =
200                                            attr.decode_and_unescape_value(reader.decoder()).ok()
201                                        {
202                                            cover_id = Some(val.to_string());
203                                        }
204                                    }
205                                }
206                            }
207                        }
208                    } else if name.contains("item") && !name.contains("itemref") {
209                        let mut id = String::new();
210                        let mut href = String::new();
211                        let mut media_type = String::new();
212
213                        for attr_result in e.attributes() {
214                            if let Ok(attr) = attr_result {
215                                let attr_name =
216                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
217                                if attr_name == "id" || attr_name.ends_with(":id") {
218                                    if let Some(val) =
219                                        attr.decode_and_unescape_value(reader.decoder()).ok()
220                                    {
221                                        id = val.to_string();
222                                    }
223                                } else if attr_name == "href" || attr_name.ends_with(":href") {
224                                    href = attr
225                                        .decode_and_unescape_value(reader.decoder())?
226                                        .to_string();
227                                } else if attr_name == "media-type"
228                                    || attr_name.ends_with(":media-type")
229                                {
230                                    media_type = attr
231                                        .decode_and_unescape_value(reader.decoder())?
232                                        .to_string();
233                                }
234                            }
235                        }
236
237                        if !id.is_empty() && !href.is_empty() {
238                            if media_type == "application/x-dtbncx+xml" {
239                                ncx_path = Some(href.clone());
240                            }
241                            manifest.insert(
242                                id.clone(),
243                                ManifestItem {
244                                    _id: id.clone(),
245                                    href,
246                                    _media_type: media_type,
247                                },
248                            );
249                        }
250                    } else if name.contains("itemref") {
251                        let mut idref = String::new();
252
253                        for attr_result in e.attributes() {
254                            if let Ok(attr) = attr_result {
255                                let attr_name =
256                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
257                                if attr_name == "idref" || attr_name.ends_with(":idref") {
258                                    if let Some(val) =
259                                        attr.decode_and_unescape_value(reader.decoder()).ok()
260                                    {
261                                        idref = val.to_string();
262                                    }
263                                    break;
264                                }
265                            }
266                        }
267
268                        if !idref.is_empty() {
269                            spine.push(idref);
270                        }
271                    }
272                }
273                Ok(Event::Text(e)) => {
274                    if let Some(tag) = &current_text_tag {
275                        let text = e.unescape()?.into_owned().trim().to_string();
276                        if !text.is_empty() {
277                            match tag.as_str() {
278                                "title" => metadata.title = Some(text),
279                                "author" => metadata.author = Some(text),
280                                "publisher" => metadata.publisher = Some(text),
281                                "language" => metadata.language = Some(text),
282                                "identifier" => metadata.identifier = Some(text),
283                                "date" => metadata.date = Some(text),
284                                "rights" => metadata.rights = Some(text),
285                                _ => {}
286                            }
287                        }
288                        current_text_tag = None;
289                    }
290                }
291                Ok(Event::End(_)) => {
292                    current_text_tag = None;
293                }
294                Ok(Event::Eof) => break,
295                Err(e) => return Err(Error::XmlError(e.to_string())),
296                _ => {}
297            }
298            buf.clear();
299        }
300
301        Ok((metadata, manifest, spine, ncx_path, cover_id))
302    }
303
304    fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
305        let mut reader = quick_xml::Reader::from_str(content);
306        let mut toc = Vec::new();
307        let mut stack: Vec<TocEntry> = Vec::new();
308
309        let mut buf = Vec::new();
310        let mut in_nav_label = false;
311        let mut in_text = false;
312
313        loop {
314            match reader.read_event_into(&mut buf) {
315                Ok(Event::Start(ref e)) => {
316                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
317                    if name == "navPoint" {
318                        let entry = TocEntry {
319                            label: String::new(),
320                            href: String::new(),
321                            children: Vec::new(),
322                        };
323                        stack.push(entry);
324                    } else if name == "navLabel" {
325                        in_nav_label = true;
326                    } else if name == "text" && in_nav_label {
327                        in_text = true;
328                    }
329                }
330                Ok(Event::End(ref e)) => {
331                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
332                    if name == "navPoint" {
333                        if let Some(entry) = stack.pop() {
334                            if let Some(parent) = stack.last_mut() {
335                                parent.children.push(entry);
336                            } else {
337                                toc.push(entry);
338                            }
339                        }
340                    } else if name == "navLabel" {
341                        in_nav_label = false;
342                    } else if name == "text" && in_nav_label {
343                        in_text = false;
344                    }
345                }
346                Ok(Event::Text(e)) => {
347                    if in_text {
348                        if let Some(entry) = stack.last_mut() {
349                            entry.label = e.unescape()?.into_owned();
350                        }
351                    }
352                }
353                Ok(Event::Empty(ref e)) => {
354                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
355                    if name == "content" {
356                        if let Some(src) = e.try_get_attribute("src")? {
357                            if let Some(entry) = stack.last_mut() {
358                                entry.href =
359                                    src.decode_and_unescape_value(reader.decoder())?.to_string();
360                            }
361                        }
362                    }
363                }
364                Ok(Event::Eof) => break,
365                Err(e) => return Err(Error::XmlError(e.to_string())),
366                _ => {}
367            }
368            buf.clear();
369        }
370
371        Ok(toc)
372    }
373
374    fn extract_text_from_html(content: &str) -> Result<String, Error> {
375        let mut reader = quick_xml::Reader::from_str(content);
376        let mut text = String::new();
377        let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
378        let mut in_skip_tag = false;
379
380        let mut buf = Vec::new();
381
382        loop {
383            match reader.read_event_into(&mut buf) {
384                Ok(Event::Start(ref e)) => {
385                    let tag = e.name().as_ref().to_vec();
386                    if skip_tags.contains(&tag) {
387                        in_skip_tag = true;
388                    } else if tag.as_slice() == b"p"
389                        || tag.as_slice() == b"div"
390                        || tag.as_slice() == b"br"
391                        || tag.as_slice() == b"li"
392                    {
393                        text.push('\n');
394                    }
395                }
396                Ok(Event::End(ref e)) => {
397                    let tag = e.name().as_ref().to_vec();
398                    if skip_tags.contains(&tag) {
399                        in_skip_tag = false;
400                    }
401                }
402                Ok(Event::Text(e)) => {
403                    if !in_skip_tag {
404                        let t = e.unescape()?.into_owned();
405                        let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
406                        text.push_str(&trimmed);
407                        text.push(' ');
408                    }
409                }
410                Ok(Event::Eof) => break,
411                Err(e) => return Err(Error::XmlError(e.to_string())),
412                _ => {}
413            }
414            buf.clear();
415        }
416
417        Ok(text
418            .lines()
419            .map(|l| l.trim())
420            .filter(|l| !l.is_empty())
421            .collect::<Vec<_>>()
422            .join("\n"))
423    }
424
425    fn resolve_path(base_path: &str, href: &str) -> String {
426        let base = PathBuf::from(base_path);
427        let parent = base.parent().unwrap_or(base.as_path());
428        let resolved = parent.join(href);
429        resolved.to_string_lossy().to_string().replace('\\', "/")
430    }
431}
432
/// One `<item>` entry of the OPF manifest, stored in the manifest map under
/// its id.
#[derive(Debug, Clone)]
struct ManifestItem {
    /// The item's `id` attribute (the same value is used as the map key).
    _id: String,
    /// The item's `href`, resolved against the OPF location before the file
    /// is read from the archive.
    href: String,
    /// The item's `media-type` attribute; used to pick out `image/*` entries.
    _media_type: String,
}