1use crate::types::{Image, Metadata, Page, TocEntry};
2use crate::utils::ZipHandler;
3use ordered_hash_map::OrderedHashMap;
4use quick_xml::events::Event;
5use std::io::Cursor;
6use std::path::{Path, PathBuf};
7
8#[derive(Debug)]
40pub struct Epub {
41 pub metadata: Metadata,
43 pub toc: Vec<TocEntry>,
45 pub pages: Vec<Page>,
47 pub images: Vec<Image>,
49}
50
51#[derive(Debug)]
53pub enum Error {
54 InvalidEpub(String),
56 IoError(std::io::Error),
58 ZipError(zip::result::ZipError),
60 XmlError(String),
62 MissingContainer,
64 MissingOpf,
66 MissingNcx,
68}
69
70impl std::fmt::Display for Error {
71 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72 match self {
73 Error::InvalidEpub(msg) => write!(f, "Invalid EPUB: {}", msg),
74 Error::IoError(e) => write!(f, "I/O error: {}", e),
75 Error::ZipError(e) => write!(f, "ZIP error: {}", e),
76 Error::XmlError(e) => write!(f, "XML error: {}", e),
77 Error::MissingContainer => write!(f, "Missing container.xml"),
78 Error::MissingOpf => write!(f, "Missing OPF file"),
79 Error::MissingNcx => write!(f, "Missing NCX file"),
80 }
81 }
82}
83
84impl std::error::Error for Error {
85 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
86 match self {
87 Error::IoError(e) => Some(e),
88 Error::ZipError(e) => Some(e),
89 _ => None,
90 }
91 }
92}
93
94impl From<std::io::Error> for Error {
95 fn from(err: std::io::Error) -> Self {
96 Error::IoError(err)
97 }
98}
99
100impl From<zip::result::ZipError> for Error {
101 fn from(err: zip::result::ZipError) -> Self {
102 Error::ZipError(err)
103 }
104}
105
106impl From<quick_xml::Error> for Error {
107 fn from(err: quick_xml::Error) -> Self {
108 Error::XmlError(err.to_string())
109 }
110}
111
112impl Epub {
113 pub fn parse(path: &Path) -> Result<Self, Error> {
141 let mut zip_handler = ZipHandler::new(path)?;
142 Self::parse_from_handler(&mut zip_handler)
143 }
144
145 pub fn parse_from_buffer(buffer: &[u8]) -> Result<Self, Error> {
169 let cursor = Cursor::new(buffer.to_vec());
170 let mut zip_handler = ZipHandler::new_from_reader(cursor)?;
171 Self::parse_from_handler(&mut zip_handler)
172 }
173
174 fn parse_from_handler<R: std::io::Read + std::io::Seek>(
175 zip_handler: &mut ZipHandler<R>,
176 ) -> Result<Self, Error> {
177 let opf_path = zip_handler.get_opf_path()?;
178 let opf_content = zip_handler.read_file(&opf_path)?;
179
180 let (metadata, manifest, spine, ncx_path) = Self::parse_opf(&opf_content)?;
181
182 let toc = if let Some(ncx_ref) = ncx_path {
183 let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
184 let ncx_content = zip_handler.read_file(&ncx_path_full)?;
185 Self::parse_ncx(&ncx_content)?
186 } else {
187 Vec::new()
188 };
189
190 let mut pages = Vec::new();
191 for itemref in spine {
192 if let Some(manifest_item) = manifest.get(&itemref) {
193 let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
194 let content = zip_handler.read_file(&content_path)?;
195 let text = Self::extract_text_from_html(&content)?;
196 pages.push(Page {
197 index: pages.len(),
198 content: text,
199 });
200 }
201 }
202
203 let mut images = Vec::new();
204 for (id, item) in &manifest {
205 if item._media_type.starts_with("image/") {
206 let image_path = Self::resolve_path(&opf_path, &item.href);
207 match zip_handler.read_file_as_bytes(&image_path) {
208 Ok(bytes) => {
209 images.push(Image {
210 id: id.clone(),
211 href: item.href.clone(),
212 media_type: item._media_type.clone(),
213 content: Some(bytes),
214 });
215 }
216 Err(_) => {
217 images.push(Image {
218 id: id.clone(),
219 href: item.href.clone(),
220 media_type: item._media_type.clone(),
221 content: None,
222 });
223 }
224 }
225 }
226 }
227
228 Ok(Epub {
229 metadata,
230 toc,
231 pages,
232 images,
233 })
234 }
235
236 fn parse_opf(
237 content: &str,
238 ) -> Result<
239 (
240 Metadata,
241 OrderedHashMap<String, ManifestItem>,
242 Vec<String>,
243 Option<String>,
244 ),
245 Error,
246 > {
247 let mut reader = quick_xml::Reader::from_str(content);
248 let mut metadata = Metadata::new();
249 let mut manifest: OrderedHashMap<String, ManifestItem> = OrderedHashMap::new();
250 let mut spine: Vec<String> = Vec::new();
251 let mut ncx_path: Option<String> = None;
252
253 let mut current_text_tag: Option<String> = None;
254
255 let mut buf = Vec::new();
256
257 loop {
258 match reader.read_event_into(&mut buf) {
259 Ok(Event::Start(ref e)) => {
260 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
261 if name.contains("title") {
262 current_text_tag = Some("title".to_string());
263 } else if name.contains("creator") {
264 current_text_tag = Some("author".to_string());
265 } else if name.contains("publisher") {
266 current_text_tag = Some("publisher".to_string());
267 } else if name.contains("language") {
268 current_text_tag = Some("language".to_string());
269 } else if name.contains("identifier") {
270 current_text_tag = Some("identifier".to_string());
271 } else if name.contains("date") {
272 current_text_tag = Some("date".to_string());
273 } else if name.contains("rights") {
274 current_text_tag = Some("rights".to_string());
275 }
276 }
277 Ok(Event::Empty(ref e)) => {
278 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
279 if name.contains("item") && !name.contains("itemref") {
280 let mut id = String::new();
281 let mut href = String::new();
282 let mut media_type = String::new();
283
284 for attr_result in e.attributes() {
285 if let Ok(attr) = attr_result {
286 let attr_name =
287 String::from_utf8_lossy(attr.key.as_ref()).to_string();
288 if attr_name == "id" || attr_name.ends_with(":id") {
289 if let Some(val) =
290 attr.decode_and_unescape_value(reader.decoder()).ok()
291 {
292 id = val.to_string();
293 }
294 } else if attr_name == "href" || attr_name.ends_with(":href") {
295 href = attr
296 .decode_and_unescape_value(reader.decoder())?
297 .to_string();
298 } else if attr_name == "media-type"
299 || attr_name.ends_with(":media-type")
300 {
301 media_type = attr
302 .decode_and_unescape_value(reader.decoder())?
303 .to_string();
304 }
305 }
306 }
307
308 if !id.is_empty() && !href.is_empty() {
309 if media_type == "application/x-dtbncx+xml" {
310 ncx_path = Some(href.clone());
311 }
312 manifest.insert(
313 id.clone(),
314 ManifestItem {
315 _id: id.clone(),
316 href,
317 _media_type: media_type,
318 },
319 );
320 }
321 } else if name.contains("itemref") {
322 let mut idref = String::new();
323
324 for attr_result in e.attributes() {
325 if let Ok(attr) = attr_result {
326 let attr_name =
327 String::from_utf8_lossy(attr.key.as_ref()).to_string();
328 if attr_name == "idref" || attr_name.ends_with(":idref") {
329 if let Some(val) =
330 attr.decode_and_unescape_value(reader.decoder()).ok()
331 {
332 idref = val.to_string();
333 }
334 break;
335 }
336 }
337 }
338
339 if !idref.is_empty() {
340 spine.push(idref);
341 }
342 }
343 }
344 Ok(Event::Text(e)) => {
345 if let Some(tag) = ¤t_text_tag {
346 let text = e.unescape()?.into_owned().trim().to_string();
347 if !text.is_empty() {
348 match tag.as_str() {
349 "title" => metadata.title = Some(text),
350 "author" => metadata.author = Some(text),
351 "publisher" => metadata.publisher = Some(text),
352 "language" => metadata.language = Some(text),
353 "identifier" => metadata.identifier = Some(text),
354 "date" => metadata.date = Some(text),
355 "rights" => metadata.rights = Some(text),
356 _ => {}
357 }
358 }
359 current_text_tag = None;
360 }
361 }
362 Ok(Event::End(_)) => {
363 current_text_tag = None;
364 }
365 Ok(Event::Eof) => break,
366 Err(e) => return Err(Error::XmlError(e.to_string())),
367 _ => {}
368 }
369 buf.clear();
370 }
371
372 Ok((metadata, manifest, spine, ncx_path))
373 }
374
375 fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
376 let mut reader = quick_xml::Reader::from_str(content);
377 let mut toc = Vec::new();
378 let mut stack: Vec<TocEntry> = Vec::new();
379
380 let mut buf = Vec::new();
381 let mut in_nav_label = false;
382 let mut in_text = false;
383
384 loop {
385 match reader.read_event_into(&mut buf) {
386 Ok(Event::Start(ref e)) => {
387 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
388 if name == "navPoint" {
389 let entry = TocEntry {
390 label: String::new(),
391 href: String::new(),
392 children: Vec::new(),
393 };
394 stack.push(entry);
395 } else if name == "navLabel" {
396 in_nav_label = true;
397 } else if name == "text" && in_nav_label {
398 in_text = true;
399 }
400 }
401 Ok(Event::End(ref e)) => {
402 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
403 if name == "navPoint" {
404 if let Some(entry) = stack.pop() {
405 if let Some(parent) = stack.last_mut() {
406 parent.children.push(entry);
407 } else {
408 toc.push(entry);
409 }
410 }
411 } else if name == "navLabel" {
412 in_nav_label = false;
413 } else if name == "text" && in_nav_label {
414 in_text = false;
415 }
416 }
417 Ok(Event::Text(e)) => {
418 if in_text {
419 if let Some(entry) = stack.last_mut() {
420 entry.label = e.unescape()?.into_owned();
421 }
422 }
423 }
424 Ok(Event::Empty(ref e)) => {
425 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
426 if name == "content" {
427 if let Some(src) = e.try_get_attribute("src")? {
428 if let Some(entry) = stack.last_mut() {
429 entry.href =
430 src.decode_and_unescape_value(reader.decoder())?.to_string();
431 }
432 }
433 }
434 }
435 Ok(Event::Eof) => break,
436 Err(e) => return Err(Error::XmlError(e.to_string())),
437 _ => {}
438 }
439 buf.clear();
440 }
441
442 Ok(toc)
443 }
444
445 fn extract_text_from_html(content: &str) -> Result<String, Error> {
446 let mut reader = quick_xml::Reader::from_str(content);
447 let mut text = String::new();
448 let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
449 let mut in_skip_tag = false;
450
451 let mut buf = Vec::new();
452
453 loop {
454 match reader.read_event_into(&mut buf) {
455 Ok(Event::Start(ref e)) => {
456 let tag = e.name().as_ref().to_vec();
457 if skip_tags.contains(&tag) {
458 in_skip_tag = true;
459 } else if tag.as_slice() == b"p"
460 || tag.as_slice() == b"div"
461 || tag.as_slice() == b"br"
462 || tag.as_slice() == b"li"
463 {
464 text.push('\n');
465 }
466 }
467 Ok(Event::End(ref e)) => {
468 let tag = e.name().as_ref().to_vec();
469 if skip_tags.contains(&tag) {
470 in_skip_tag = false;
471 }
472 }
473 Ok(Event::Text(e)) => {
474 if !in_skip_tag {
475 let t = e.unescape()?.into_owned();
476 let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
477 text.push_str(&trimmed);
478 text.push(' ');
479 }
480 }
481 Ok(Event::Eof) => break,
482 Err(e) => return Err(Error::XmlError(e.to_string())),
483 _ => {}
484 }
485 buf.clear();
486 }
487
488 Ok(text
489 .lines()
490 .map(|l| l.trim())
491 .filter(|l| !l.is_empty())
492 .collect::<Vec<_>>()
493 .join("\n"))
494 }
495
/// Resolves `href` against the directory that contains `base_path`.
///
/// The result always uses `/` as separator. `.` segments and empty segments
/// are removed and `..` segments are collapsed, so that an href such as
/// `../images/cover.png` yields a plain path with no dot segments.
/// A `..` that would climb above the top level is ignored. A leading `/` is
/// preserved.
fn resolve_path(base_path: &str, href: &str) -> String {
    let base = PathBuf::from(base_path);
    let parent = base.parent().unwrap_or(base.as_path());
    let joined = parent.join(href).to_string_lossy().replace('\\', "/");

    let is_absolute = joined.starts_with('/');
    let mut segments: Vec<&str> = Vec::new();
    for segment in joined.split('/') {
        match segment {
            "" | "." => {}
            ".." => {
                // Popping an empty stack is a no-op: never climb above the root.
                segments.pop();
            }
            other => segments.push(other),
        }
    }

    let normalized = segments.join("/");
    if is_absolute {
        format!("/{}", normalized)
    } else {
        normalized
    }
}
502}
503
/// One `<item>` entry of the OPF manifest.
#[derive(Debug, Clone)]
struct ManifestItem {
    /// Value of the item's `id` attribute (also used as the manifest key).
    _id: String,
    /// Value of the item's `href` attribute, as written in the OPF.
    href: String,
    /// Value of the item's `media-type` attribute; empty when the attribute
    /// is absent.
    _media_type: String,
}