epub_parser/
epub.rs

1use crate::content::Page;
2use crate::cover::Cover;
3use crate::image::Image;
4use crate::metadata::Metadata;
5use crate::toc::TocEntry;
6use crate::zip_handler::ZipHandler;
7use quick_xml::events::Event;
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10
11#[derive(Debug)]
12pub struct Epub {
13    pub metadata: Metadata,
14    pub toc: Vec<TocEntry>,
15    pub pages: Vec<Page>,
16    pub cover: Cover,
17    pub images: Vec<Image>,
18}
19
20#[derive(Debug)]
21pub enum Error {
22    InvalidEpub(String),
23    IoError(std::io::Error),
24    ZipError(zip::result::ZipError),
25    XmlError(String),
26    MissingContainer,
27    MissingOpf,
28    MissingNcx,
29}
30
31impl From<std::io::Error> for Error {
32    fn from(err: std::io::Error) -> Self {
33        Error::IoError(err)
34    }
35}
36
37impl From<zip::result::ZipError> for Error {
38    fn from(err: zip::result::ZipError) -> Self {
39        Error::ZipError(err)
40    }
41}
42
43impl From<quick_xml::Error> for Error {
44    fn from(err: quick_xml::Error) -> Self {
45        Error::XmlError(err.to_string())
46    }
47}
48
49impl Epub {
50    pub fn parse(path: &Path) -> Result<Self, Error> {
51        let mut zip_handler = ZipHandler::new(path)?;
52
53        let opf_path = zip_handler.get_opf_path()?;
54        let opf_content = zip_handler.read_file(&opf_path)?;
55
56        let (metadata, manifest, spine, ncx_path, cover_id) = Self::parse_opf(&opf_content)?;
57
58        let toc = if let Some(ncx_ref) = ncx_path {
59            let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
60            let ncx_content = zip_handler.read_file(&ncx_path_full)?;
61            Self::parse_ncx(&ncx_content)?
62        } else {
63            Vec::new()
64        };
65
66        let mut pages = Vec::new();
67        for itemref in spine {
68            if let Some(manifest_item) = manifest.get(&itemref) {
69                let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
70                let content = zip_handler.read_file(&content_path)?;
71                let text = Self::extract_text_from_html(&content)?;
72                pages.push(Page {
73                    index: pages.len(),
74                    content: text,
75                });
76            }
77        }
78
79        let mut cover = Cover::default();
80        if let Some(cover_id) = cover_id {
81            if let Some(cover_item) = manifest.get(&cover_id) {
82                let cover_path = Self::resolve_path(&opf_path, &cover_item.href);
83                match zip_handler.read_file_as_bytes(&cover_path) {
84                    Ok(bytes) => {
85                        cover.href = Some(cover_item.href.clone());
86                        cover.content = Some(bytes);
87                    }
88                    Err(_) => {}
89                }
90            }
91        }
92
93        let mut images = Vec::new();
94        for (id, item) in &manifest {
95            if item._media_type.starts_with("image/") {
96                let image_path = Self::resolve_path(&opf_path, &item.href);
97                match zip_handler.read_file_as_bytes(&image_path) {
98                    Ok(bytes) => {
99                        images.push(Image {
100                            id: id.clone(),
101                            href: item.href.clone(),
102                            media_type: item._media_type.clone(),
103                            content: Some(bytes),
104                        });
105                    }
106                    Err(_) => {
107                        images.push(Image {
108                            id: id.clone(),
109                            href: item.href.clone(),
110                            media_type: item._media_type.clone(),
111                            content: None,
112                        });
113                    }
114                }
115            }
116        }
117
118        Ok(Epub {
119            metadata,
120            toc,
121            pages,
122            cover,
123            images,
124        })
125    }
126
127    fn parse_opf(
128        content: &str,
129    ) -> Result<
130        (
131            Metadata,
132            HashMap<String, ManifestItem>,
133            Vec<String>,
134            Option<String>,
135            Option<String>,
136        ),
137        Error,
138    > {
139        let mut reader = quick_xml::Reader::from_str(content);
140        let mut metadata = Metadata::new();
141        let mut manifest: HashMap<String, ManifestItem> = HashMap::new();
142        let mut spine: Vec<String> = Vec::new();
143        let mut ncx_path: Option<String> = None;
144        let mut cover_id: Option<String> = None;
145
146        let mut current_text_tag: Option<String> = None;
147
148        let mut buf = Vec::new();
149
150        loop {
151            match reader.read_event_into(&mut buf) {
152                Ok(Event::Start(ref e)) => {
153                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
154                    if name.contains("title") {
155                        current_text_tag = Some("title".to_string());
156                    } else if name.contains("creator") {
157                        current_text_tag = Some("author".to_string());
158                    } else if name.contains("publisher") {
159                        current_text_tag = Some("publisher".to_string());
160                    } else if name.contains("language") {
161                        current_text_tag = Some("language".to_string());
162                    } else if name.contains("identifier") {
163                        current_text_tag = Some("identifier".to_string());
164                    } else if name.contains("date") {
165                        current_text_tag = Some("date".to_string());
166                    } else if name.contains("rights") {
167                        current_text_tag = Some("rights".to_string());
168                    }
169                }
170                Ok(Event::Empty(ref e)) => {
171                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
172                    if name.contains("meta") {
173                        let mut is_cover = false;
174                        for attr_result in e.attributes() {
175                            if let Ok(attr) = attr_result {
176                                let attr_name =
177                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
178                                if attr_name.contains("name") {
179                                    let value = attr
180                                        .decode_and_unescape_value(reader.decoder())?
181                                        .to_string();
182                                    if value == "cover" {
183                                        is_cover = true;
184                                    }
185                                } else if attr_name.contains("content") {
186                                    if is_cover {
187                                        if let Some(val) =
188                                            attr.decode_and_unescape_value(reader.decoder()).ok()
189                                        {
190                                            cover_id = Some(val.to_string());
191                                        }
192                                    }
193                                }
194                            }
195                        }
196                    } else if name.contains("item") && !name.contains("itemref") {
197                        let mut id = String::new();
198                        let mut href = String::new();
199                        let mut media_type = String::new();
200
201                        for attr_result in e.attributes() {
202                            if let Ok(attr) = attr_result {
203                                let attr_name =
204                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
205                                if attr_name == "id" || attr_name.ends_with(":id") {
206                                    if let Some(val) =
207                                        attr.decode_and_unescape_value(reader.decoder()).ok()
208                                    {
209                                        id = val.to_string();
210                                    }
211                                } else if attr_name == "href" || attr_name.ends_with(":href") {
212                                    href = attr
213                                        .decode_and_unescape_value(reader.decoder())?
214                                        .to_string();
215                                } else if attr_name == "media-type"
216                                    || attr_name.ends_with(":media-type")
217                                {
218                                    media_type = attr
219                                        .decode_and_unescape_value(reader.decoder())?
220                                        .to_string();
221                                }
222                            }
223                        }
224
225                        if !id.is_empty() && !href.is_empty() {
226                            if media_type == "application/x-dtbncx+xml" {
227                                ncx_path = Some(href.clone());
228                            }
229                            manifest.insert(
230                                id.clone(),
231                                ManifestItem {
232                                    _id: id.clone(),
233                                    href,
234                                    _media_type: media_type,
235                                },
236                            );
237                        }
238                    } else if name.contains("itemref") {
239                        let mut idref = String::new();
240
241                        for attr_result in e.attributes() {
242                            if let Ok(attr) = attr_result {
243                                let attr_name =
244                                    String::from_utf8_lossy(attr.key.as_ref()).to_string();
245                                if attr_name == "idref" || attr_name.ends_with(":idref") {
246                                    if let Some(val) =
247                                        attr.decode_and_unescape_value(reader.decoder()).ok()
248                                    {
249                                        idref = val.to_string();
250                                    }
251                                    break;
252                                }
253                            }
254                        }
255
256                        if !idref.is_empty() {
257                            spine.push(idref);
258                        }
259                    }
260                }
261                Ok(Event::Text(e)) => {
262                    if let Some(tag) = &current_text_tag {
263                        let text = e.unescape()?.into_owned().trim().to_string();
264                        if !text.is_empty() {
265                            match tag.as_str() {
266                                "title" => metadata.title = Some(text),
267                                "author" => metadata.author = Some(text),
268                                "publisher" => metadata.publisher = Some(text),
269                                "language" => metadata.language = Some(text),
270                                "identifier" => metadata.identifier = Some(text),
271                                "date" => metadata.date = Some(text),
272                                "rights" => metadata.rights = Some(text),
273                                _ => {}
274                            }
275                        }
276                        current_text_tag = None;
277                    }
278                }
279                Ok(Event::End(_)) => {
280                    current_text_tag = None;
281                }
282                Ok(Event::Eof) => break,
283                Err(e) => return Err(Error::XmlError(e.to_string())),
284                _ => {}
285            }
286            buf.clear();
287        }
288
289        Ok((metadata, manifest, spine, ncx_path, cover_id))
290    }
291
292    fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
293        let mut reader = quick_xml::Reader::from_str(content);
294        let mut toc = Vec::new();
295        let mut stack: Vec<TocEntry> = Vec::new();
296
297        let mut buf = Vec::new();
298        let mut in_nav_label = false;
299        let mut in_text = false;
300
301        loop {
302            match reader.read_event_into(&mut buf) {
303                Ok(Event::Start(ref e)) => {
304                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
305                    if name == "navPoint" {
306                        let entry = TocEntry {
307                            label: String::new(),
308                            href: String::new(),
309                            children: Vec::new(),
310                        };
311                        stack.push(entry);
312                    } else if name == "navLabel" {
313                        in_nav_label = true;
314                    } else if name == "text" && in_nav_label {
315                        in_text = true;
316                    }
317                }
318                Ok(Event::End(ref e)) => {
319                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
320                    if name == "navPoint" {
321                        if let Some(entry) = stack.pop() {
322                            if let Some(parent) = stack.last_mut() {
323                                parent.children.push(entry);
324                            } else {
325                                toc.push(entry);
326                            }
327                        }
328                    } else if name == "navLabel" {
329                        in_nav_label = false;
330                    } else if name == "text" && in_nav_label {
331                        in_text = false;
332                    }
333                }
334                Ok(Event::Text(e)) => {
335                    if in_text {
336                        if let Some(entry) = stack.last_mut() {
337                            entry.label = e.unescape()?.into_owned();
338                        }
339                    }
340                }
341                Ok(Event::Empty(ref e)) => {
342                    let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
343                    if name == "content" {
344                        if let Some(src) = e.try_get_attribute("src")? {
345                            if let Some(entry) = stack.last_mut() {
346                                entry.href = src.decode_and_unescape_value(reader.decoder())?.to_string();
347                            }
348                        }
349                    }
350                }
351                Ok(Event::Eof) => break,
352                Err(e) => return Err(Error::XmlError(e.to_string())),
353                _ => {}
354            }
355            buf.clear();
356        }
357
358        Ok(toc)
359    }
360
361    fn extract_text_from_html(content: &str) -> Result<String, Error> {
362        let mut reader = quick_xml::Reader::from_str(content);
363        let mut text = String::new();
364        let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
365        let mut in_skip_tag = false;
366
367        let mut buf = Vec::new();
368
369        loop {
370            match reader.read_event_into(&mut buf) {
371                Ok(Event::Start(ref e)) => {
372                    let tag = e.name().as_ref().to_vec();
373                    if skip_tags.contains(&tag) {
374                        in_skip_tag = true;
375                    } else if tag.as_slice() == b"p"
376                        || tag.as_slice() == b"div"
377                        || tag.as_slice() == b"br"
378                        || tag.as_slice() == b"li"
379                    {
380                        text.push('\n');
381                    }
382                }
383                Ok(Event::End(ref e)) => {
384                    let tag = e.name().as_ref().to_vec();
385                    if skip_tags.contains(&tag) {
386                        in_skip_tag = false;
387                    }
388                }
389                Ok(Event::Text(e)) => {
390                    if !in_skip_tag {
391                        let t = e.unescape()?.into_owned();
392                        let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
393                        text.push_str(&trimmed);
394                        text.push(' ');
395                    }
396                }
397                Ok(Event::Eof) => break,
398                Err(e) => return Err(Error::XmlError(e.to_string())),
399                _ => {}
400            }
401            buf.clear();
402        }
403
404        Ok(text
405            .lines()
406            .map(|l| l.trim())
407            .filter(|l| !l.is_empty())
408            .collect::<Vec<_>>()
409            .join("\n"))
410    }
411
412    fn resolve_path(base_path: &str, href: &str) -> String {
413        let base = PathBuf::from(base_path);
414        let parent = base.parent().unwrap_or(base.as_path());
415        let resolved = parent.join(href);
416        resolved.to_string_lossy().to_string().replace('\\', "/")
417    }
418}
419
/// One `<item>` entry of the OPF manifest, as collected by `Epub::parse_opf`.
#[derive(Debug, Clone)]
struct ManifestItem {
    /// Value of the item's `id` attribute; also the key it is stored under.
    _id: String,
    /// Value of the item's `href` attribute, resolved against the OPF path
    /// before the resource is read.
    href: String,
    /// Value of the item's `media-type` attribute; empty when the attribute
    /// was absent.
    _media_type: String,
}