// epub_parser/epub.rs

1use crate::content::Page;
2use crate::cover::Cover;
3use crate::image::Image;
4use crate::metadata::Metadata;
5use crate::toc::TocEntry;
6use crate::zip_handler::ZipHandler;
7use quick_xml::events::Event;
8use std::collections::HashMap;
9use std::io::Cursor;
10use std::path::{Path, PathBuf};
11
12#[derive(Debug)]
13pub struct Epub {
14    pub metadata: Metadata,
15    pub toc: Vec<TocEntry>,
16    pub pages: Vec<Page>,
17    pub cover: Cover,
18    pub images: Vec<Image>,
19}
20
21#[derive(Debug)]
22pub enum Error {
23    InvalidEpub(String),
24    IoError(std::io::Error),
25    ZipError(zip::result::ZipError),
26    XmlError(String),
27    MissingContainer,
28    MissingOpf,
29    MissingNcx,
30}
31
32impl From<std::io::Error> for Error {
33    fn from(err: std::io::Error) -> Self {
34        Error::IoError(err)
35    }
36}
37
38impl From<zip::result::ZipError> for Error {
39    fn from(err: zip::result::ZipError) -> Self {
40        Error::ZipError(err)
41    }
42}
43
44impl From<quick_xml::Error> for Error {
45    fn from(err: quick_xml::Error) -> Self {
46        Error::XmlError(err.to_string())
47    }
48}
49
50impl Epub {
51    pub fn parse(path: &Path) -> Result<Self, Error> {
52        let mut zip_handler = ZipHandler::new(path)?;
53        Self::parse_from_handler(&mut zip_handler)
54    }
55
56    pub fn parse_from_buffer(buffer: &[u8]) -> Result<Self, Error> {
57        let cursor = Cursor::new(buffer.to_vec());
58        let mut zip_handler = ZipHandler::new_from_reader(cursor)?;
59        Self::parse_from_handler(&mut zip_handler)
60    }
61
62    fn parse_from_handler<R: std::io::Read + std::io::Seek>(
63        zip_handler: &mut ZipHandler<R>,
64    ) -> Result<Self, Error> {
65        let opf_path = zip_handler.get_opf_path()?;
66        let opf_content = zip_handler.read_file(&opf_path)?;
67
68        let (metadata, manifest, spine, ncx_path, cover_id) = Self::parse_opf(&opf_content)?;
69
70        let toc = if let Some(ncx_ref) = ncx_path {
71            let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
72            let ncx_content = zip_handler.read_file(&ncx_path_full)?;
73            Self::parse_ncx(&ncx_content)?
74        } else {
75            Vec::new()
76        };
77
78        let mut pages = Vec::new();
79        for itemref in spine {
80            if let Some(manifest_item) = manifest.get(&itemref) {
81                let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
82                let content = zip_handler.read_file(&content_path)?;
83                let text = Self::extract_text_from_html(&content)?;
84                pages.push(Page {
85                    index: pages.len(),
86                    content: text,
87                });
88            }
89        }
90
91        let mut cover = Cover::default();
92        if let Some(cover_id) = cover_id {
93            if let Some(cover_item) = manifest.get(&cover_id) {
94                let cover_path = Self::resolve_path(&opf_path, &cover_item.href);
95                match zip_handler.read_file_as_bytes(&cover_path) {
96                    Ok(bytes) => {
97                        cover.href = Some(cover_item.href.clone());
98                        cover.content = Some(bytes);
99                    }
100                    Err(_) => {}
101                }
102            }
103        }
104
105        let mut images = Vec::new();
106        for (id, item) in &manifest {
107            if item._media_type.starts_with("image/") {
108                let image_path = Self::resolve_path(&opf_path, &item.href);
109                match zip_handler.read_file_as_bytes(&image_path) {
110                    Ok(bytes) => {
111                        images.push(Image {
112                            id: id.clone(),
113                            href: item.href.clone(),
114                            media_type: item._media_type.clone(),
115                            content: Some(bytes),
116                        });
117                    }
118                    Err(_) => {
119                        images.push(Image {
120                            id: id.clone(),
121                            href: item.href.clone(),
122                            media_type: item._media_type.clone(),
123                            content: None,
124                        });
125                    }
126                }
127            }
128        }
129
130        Ok(Epub {
131            metadata,
132            toc,
133            pages,
134            cover,
135            images,
136        })
137    }
138
139    fn parse_opf(
140        content: &str,
141    ) -> Result<
142        (
143            Metadata,
144            HashMap<String, ManifestItem>,
145            Vec<String>,
146            Option<String>,
147            Option<String>,
148        ),
149        Error,
150    > {
151        let mut reader = quick_xml::Reader::from_str(content);
152        let mut metadata = Metadata::new();
153        let mut manifest: HashMap<String, ManifestItem> = HashMap::new();
154        let mut spine: Vec<String> = Vec::new();
155        let mut ncx_path: Option<String> = None;
156        let mut cover_id: Option<String> = None;
157
158        let mut current_text_tag: Option<String> = None;
159
160        let mut buf = Vec::new();
161
162        loop {
163            match reader.read_event_into(&mut buf) {
164                Ok(Event::Start(ref e)) => {
165                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
166                    if name.contains("title") {
167                        current_text_tag = Some("title".to_string());
168                    } else if name.contains("creator") {
169                        current_text_tag = Some("author".to_string());
170                    } else if name.contains("publisher") {
171                        current_text_tag = Some("publisher".to_string());
172                    } else if name.contains("language") {
173                        current_text_tag = Some("language".to_string());
174                    } else if name.contains("identifier") {
175                        current_text_tag = Some("identifier".to_string());
176                    } else if name.contains("date") {
177                        current_text_tag = Some("date".to_string());
178                    } else if name.contains("rights") {
179                        current_text_tag = Some("rights".to_string());
180                    }
181                }
182                Ok(Event::Empty(ref e)) => {
183                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
184                    if name.contains("meta") {
185                        for attr_result in e.attributes() {
186                            if let Ok(attr) = attr_result {
187                                let attr_name =
188                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
189                                if attr_name.contains("content") {
190                                    if let Some(val) =
191                                        attr.decode_and_unescape_value(reader.decoder()).ok()
192                                    {
193                                        cover_id = Some(val.to_string());
194                                    }
195                                }
196                            }
197                        }
198                    } else if name.contains("item") && !name.contains("itemref") {
199                        let mut id = String::new();
200                        let mut href = String::new();
201                        let mut media_type = String::new();
202
203                        for attr_result in e.attributes() {
204                            if let Ok(attr) = attr_result {
205                                let attr_name =
206                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
207                                if attr_name == "id" || attr_name.ends_with(":id") {
208                                    if let Some(val) =
209                                        attr.decode_and_unescape_value(reader.decoder()).ok()
210                                    {
211                                        id = val.to_string();
212                                    }
213                                } else if attr_name == "href" || attr_name.ends_with(":href") {
214                                    href = attr
215                                        .decode_and_unescape_value(reader.decoder())?
216                                        .to_string();
217                                } else if attr_name == "media-type"
218                                    || attr_name.ends_with(":media-type")
219                                {
220                                    media_type = attr
221                                        .decode_and_unescape_value(reader.decoder())?
222                                        .to_string();
223                                }
224                            }
225                        }
226
227                        if !id.is_empty() && !href.is_empty() {
228                            if media_type == "application/x-dtbncx+xml" {
229                                ncx_path = Some(href.clone());
230                            }
231                            manifest.insert(
232                                id.clone(),
233                                ManifestItem {
234                                    _id: id.clone(),
235                                    href,
236                                    _media_type: media_type,
237                                },
238                            );
239                        }
240                    } else if name.contains("itemref") {
241                        let mut idref = String::new();
242
243                        for attr_result in e.attributes() {
244                            if let Ok(attr) = attr_result {
245                                let attr_name =
246                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
247                                if attr_name == "idref" || attr_name.ends_with(":idref") {
248                                    if let Some(val) =
249                                        attr.decode_and_unescape_value(reader.decoder()).ok()
250                                    {
251                                        idref = val.to_string();
252                                    }
253                                    break;
254                                }
255                            }
256                        }
257
258                        if !idref.is_empty() {
259                            spine.push(idref);
260                        }
261                    }
262                }
263                Ok(Event::Text(e)) => {
264                    if let Some(tag) = &current_text_tag {
265                        let text = e.unescape()?.into_owned().trim().to_string();
266                        if !text.is_empty() {
267                            match tag.as_str() {
268                                "title" => metadata.title = Some(text),
269                                "author" => metadata.author = Some(text),
270                                "publisher" => metadata.publisher = Some(text),
271                                "language" => metadata.language = Some(text),
272                                "identifier" => metadata.identifier = Some(text),
273                                "date" => metadata.date = Some(text),
274                                "rights" => metadata.rights = Some(text),
275                                _ => {}
276                            }
277                        }
278                        current_text_tag = None;
279                    }
280                }
281                Ok(Event::End(_)) => {
282                    current_text_tag = None;
283                }
284                Ok(Event::Eof) => break,
285                Err(e) => return Err(Error::XmlError(e.to_string())),
286                _ => {}
287            }
288            buf.clear();
289        }
290
291        Ok((metadata, manifest, spine, ncx_path, cover_id))
292    }
293
294    fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
295        let mut reader = quick_xml::Reader::from_str(content);
296        let mut toc = Vec::new();
297        let mut stack: Vec<TocEntry> = Vec::new();
298
299        let mut buf = Vec::new();
300        let mut in_nav_label = false;
301        let mut in_text = false;
302
303        loop {
304            match reader.read_event_into(&mut buf) {
305                Ok(Event::Start(ref e)) => {
306                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
307                    if name == "navPoint" {
308                        let entry = TocEntry {
309                            label: String::new(),
310                            href: String::new(),
311                            children: Vec::new(),
312                        };
313                        stack.push(entry);
314                    } else if name == "navLabel" {
315                        in_nav_label = true;
316                    } else if name == "text" && in_nav_label {
317                        in_text = true;
318                    }
319                }
320                Ok(Event::End(ref e)) => {
321                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
322                    if name == "navPoint" {
323                        if let Some(entry) = stack.pop() {
324                            if let Some(parent) = stack.last_mut() {
325                                parent.children.push(entry);
326                            } else {
327                                toc.push(entry);
328                            }
329                        }
330                    } else if name == "navLabel" {
331                        in_nav_label = false;
332                    } else if name == "text" && in_nav_label {
333                        in_text = false;
334                    }
335                }
336                Ok(Event::Text(e)) => {
337                    if in_text {
338                        if let Some(entry) = stack.last_mut() {
339                            entry.label = e.unescape()?.into_owned();
340                        }
341                    }
342                }
343                Ok(Event::Empty(ref e)) => {
344                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
345                    if name == "content" {
346                        if let Some(src) = e.try_get_attribute("src")? {
347                            if let Some(entry) = stack.last_mut() {
348                                entry.href =
349                                    src.decode_and_unescape_value(reader.decoder())?.to_string();
350                            }
351                        }
352                    }
353                }
354                Ok(Event::Eof) => break,
355                Err(e) => return Err(Error::XmlError(e.to_string())),
356                _ => {}
357            }
358            buf.clear();
359        }
360
361        Ok(toc)
362    }
363
364    fn extract_text_from_html(content: &str) -> Result<String, Error> {
365        let mut reader = quick_xml::Reader::from_str(content);
366        let mut text = String::new();
367        let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
368        let mut in_skip_tag = false;
369
370        let mut buf = Vec::new();
371
372        loop {
373            match reader.read_event_into(&mut buf) {
374                Ok(Event::Start(ref e)) => {
375                    let tag = e.name().as_ref().to_vec();
376                    if skip_tags.contains(&tag) {
377                        in_skip_tag = true;
378                    } else if tag.as_slice() == b"p"
379                        || tag.as_slice() == b"div"
380                        || tag.as_slice() == b"br"
381                        || tag.as_slice() == b"li"
382                    {
383                        text.push('\n');
384                    }
385                }
386                Ok(Event::End(ref e)) => {
387                    let tag = e.name().as_ref().to_vec();
388                    if skip_tags.contains(&tag) {
389                        in_skip_tag = false;
390                    }
391                }
392                Ok(Event::Text(e)) => {
393                    if !in_skip_tag {
394                        let t = e.unescape()?.into_owned();
395                        let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
396                        text.push_str(&trimmed);
397                        text.push(' ');
398                    }
399                }
400                Ok(Event::Eof) => break,
401                Err(e) => return Err(Error::XmlError(e.to_string())),
402                _ => {}
403            }
404            buf.clear();
405        }
406
407        Ok(text
408            .lines()
409            .map(|l| l.trim())
410            .filter(|l| !l.is_empty())
411            .collect::<Vec<_>>()
412            .join("\n"))
413    }
414
415    fn resolve_path(base_path: &str, href: &str) -> String {
416        let base = PathBuf::from(base_path);
417        let parent = base.parent().unwrap_or(base.as_path());
418        let resolved = parent.join(href);
419        resolved.to_string_lossy().to_string().replace('\\', "/")
420    }
421}
422
/// One `<item>` entry of the OPF manifest.
#[derive(Clone, Debug)]
struct ManifestItem {
    /// Value of the item's `id` attribute; also the key under which the item
    /// is stored in the manifest map.
    _id: String,
    /// Value of the item's `href` attribute, relative to the OPF document.
    href: String,
    /// Value of the item's `media-type` attribute; empty when absent.
    _media_type: String,
}