1use crate::types::{Image, Metadata, Page, TocEntry};
2use crate::utils::{ZipHandler, preprocess_html_entities};
3use ordered_hash_map::OrderedHashMap;
4use quick_xml::events::Event;
5use std::io::Cursor;
6use std::path::{Path, PathBuf};
7
8#[derive(Debug)]
40pub struct Epub {
41 pub metadata: Metadata,
43 pub toc: Vec<TocEntry>,
45 pub pages: Vec<Page>,
47 pub images: Vec<Image>,
49}
50
51#[derive(Debug)]
53pub enum Error {
54 InvalidEpub(String),
56 IoError(std::io::Error),
58 ZipError(zip::result::ZipError),
60 XmlError(String),
62 MissingContainer,
64 MissingOpf,
66 MissingNcx,
68}
69
70impl std::fmt::Display for Error {
71 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72 match self {
73 Error::InvalidEpub(msg) => write!(f, "Invalid EPUB: {}", msg),
74 Error::IoError(e) => write!(f, "I/O error: {}", e),
75 Error::ZipError(e) => write!(f, "ZIP error: {}", e),
76 Error::XmlError(e) => write!(f, "XML error: {}", e),
77 Error::MissingContainer => write!(f, "Missing container.xml"),
78 Error::MissingOpf => write!(f, "Missing OPF file"),
79 Error::MissingNcx => write!(f, "Missing NCX file"),
80 }
81 }
82}
83
84impl std::error::Error for Error {
85 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
86 match self {
87 Error::IoError(e) => Some(e),
88 Error::ZipError(e) => Some(e),
89 _ => None,
90 }
91 }
92}
93
94impl From<std::io::Error> for Error {
95 fn from(err: std::io::Error) -> Self {
96 Error::IoError(err)
97 }
98}
99
100impl From<zip::result::ZipError> for Error {
101 fn from(err: zip::result::ZipError) -> Self {
102 Error::ZipError(err)
103 }
104}
105
106impl From<quick_xml::Error> for Error {
107 fn from(err: quick_xml::Error) -> Self {
108 Error::XmlError(err.to_string())
109 }
110}
111
112impl Epub {
113 pub fn parse(path: &Path) -> Result<Self, Error> {
141 let mut zip_handler = ZipHandler::new(path)?;
142 Self::parse_from_handler(&mut zip_handler)
143 }
144
145 pub fn parse_from_buffer(buffer: &[u8]) -> Result<Self, Error> {
169 let cursor = Cursor::new(buffer.to_vec());
170 let mut zip_handler = ZipHandler::new_from_reader(cursor)?;
171 Self::parse_from_handler(&mut zip_handler)
172 }
173
174 fn parse_from_handler<R: std::io::Read + std::io::Seek>(
175 zip_handler: &mut ZipHandler<R>,
176 ) -> Result<Self, Error> {
177 let opf_path = zip_handler.get_opf_path()?;
178 let opf_content = zip_handler.read_file(&opf_path)?;
179
180 let (metadata, manifest, spine, ncx_path) = Self::parse_opf(&opf_content)?;
181
182 let toc = if let Some(ncx_ref) = ncx_path {
183 let ncx_path_full = Self::resolve_path(&opf_path, &ncx_ref);
184 let ncx_content = zip_handler.read_file(&ncx_path_full)?;
185 Self::parse_ncx(&ncx_content)?
186 } else {
187 Vec::new()
188 };
189
190 let mut pages = Vec::new();
191 for itemref in spine {
192 if let Some(manifest_item) = manifest.get(&itemref) {
193 let content_path = Self::resolve_path(&opf_path, &manifest_item.href);
194 match zip_handler.read_file(&content_path) {
195 Ok(content) => {
196 if let Ok(text) = Self::extract_text_from_html(&content) {
197 pages.push(Page {
198 index: pages.len(),
199 content: text,
200 });
201 }
202 }
203 Err(e) => {
204 eprintln!(
205 "Warning: Could not read content file '{}': {}",
206 content_path, e
207 );
208 }
209 }
210 }
211 }
212
213 let mut images = Vec::new();
214 for (id, item) in &manifest {
215 if item._media_type.to_lowercase().starts_with("image/") {
216 let image_path = Self::resolve_path(&opf_path, &item.href);
217 if let Ok(bytes) = zip_handler.read_file_as_bytes(&image_path) {
218 if id.to_lowercase().contains("cover") {
219 images.insert(
220 0,
221 Image {
222 id: id.clone(),
223 href: item.href.clone(),
224 media_type: item._media_type.clone(),
225 content: bytes,
226 },
227 );
228 } else {
229 images.push(Image {
230 id: id.clone(),
231 href: item.href.clone(),
232 media_type: item._media_type.clone(),
233 content: bytes,
234 });
235 }
236 }
237 }
238 }
239
240 Ok(Epub {
241 metadata,
242 toc,
243 pages,
244 images,
245 })
246 }
247
248 fn parse_opf(
249 content: &str,
250 ) -> Result<
251 (
252 Metadata,
253 OrderedHashMap<String, ManifestItem>,
254 Vec<String>,
255 Option<String>,
256 ),
257 Error,
258 > {
259 let content = preprocess_html_entities(content);
260 let mut reader = quick_xml::Reader::from_str(&content);
261 let mut metadata = Metadata::new();
262 let mut manifest: OrderedHashMap<String, ManifestItem> = OrderedHashMap::new();
263 let mut spine: Vec<String> = Vec::new();
264 let mut ncx_path: Option<String> = None;
265
266 let mut current_text_tag: Option<String> = None;
267
268 let mut buf = Vec::new();
269
270 loop {
271 match reader.read_event_into(&mut buf) {
272 Ok(Event::Start(ref e)) => {
273 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
274 if name.contains("title") {
275 current_text_tag = Some("title".to_string());
276 } else if name.contains("creator") {
277 current_text_tag = Some("author".to_string());
278 } else if name.contains("publisher") {
279 current_text_tag = Some("publisher".to_string());
280 } else if name.contains("language") {
281 current_text_tag = Some("language".to_string());
282 } else if name.contains("identifier") {
283 current_text_tag = Some("identifier".to_string());
284 } else if name.contains("date") {
285 current_text_tag = Some("date".to_string());
286 } else if name.contains("rights") {
287 current_text_tag = Some("rights".to_string());
288 }
289 }
290 Ok(Event::Empty(ref e)) => {
291 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
292 if name.contains("item") && !name.contains("itemref") {
293 let mut id = String::new();
294 let mut href = String::new();
295 let mut media_type = String::new();
296
297 for attr_result in e.attributes() {
298 if let Ok(attr) = attr_result {
299 let attr_name =
300 String::from_utf8_lossy(attr.key.as_ref()).to_string();
301 if attr_name == "id" || attr_name.ends_with(":id") {
302 if let Some(val) =
303 attr.decode_and_unescape_value(reader.decoder()).ok()
304 {
305 id = val.to_string();
306 }
307 } else if attr_name == "href" || attr_name.ends_with(":href") {
308 href = attr
309 .decode_and_unescape_value(reader.decoder())?
310 .to_string();
311 } else if attr_name == "media-type"
312 || attr_name.ends_with(":media-type")
313 {
314 media_type = attr
315 .decode_and_unescape_value(reader.decoder())?
316 .to_string();
317 }
318 }
319 }
320
321 if !id.is_empty() && !href.is_empty() {
322 if media_type == "application/x-dtbncx+xml" {
323 ncx_path = Some(href.clone());
324 }
325 manifest.insert(
326 id.clone(),
327 ManifestItem {
328 _id: id.clone(),
329 href,
330 _media_type: media_type,
331 },
332 );
333 }
334 } else if name.contains("itemref") {
335 let mut idref = String::new();
336
337 for attr_result in e.attributes() {
338 if let Ok(attr) = attr_result {
339 let attr_name =
340 String::from_utf8_lossy(attr.key.as_ref()).to_string();
341 if attr_name == "idref" || attr_name.ends_with(":idref") {
342 if let Some(val) =
343 attr.decode_and_unescape_value(reader.decoder()).ok()
344 {
345 idref = val.to_string();
346 }
347 break;
348 }
349 }
350 }
351
352 if !idref.is_empty() {
353 spine.push(idref);
354 }
355 }
356 }
357 Ok(Event::Text(e)) => {
358 if let Some(tag) = ¤t_text_tag {
359 let text = e
360 .unescape()
361 .unwrap_or_else(|_| {
362 std::str::from_utf8(e.as_ref()).unwrap_or_default().into()
363 })
364 .into_owned()
365 .trim()
366 .to_string();
367 if !text.is_empty() {
368 match tag.as_str() {
369 "title" => metadata.title = Some(text),
370 "author" => metadata.author = Some(text),
371 "publisher" => metadata.publisher = Some(text),
372 "language" => metadata.language = Some(text),
373 "identifier" => metadata.identifier = Some(text),
374 "date" => metadata.date = Some(text),
375 "rights" => metadata.rights = Some(text),
376 _ => {}
377 }
378 }
379 current_text_tag = None;
380 }
381 }
382 Ok(Event::End(_)) => {
383 current_text_tag = None;
384 }
385 Ok(Event::Eof) => break,
386 Err(e) => return Err(Error::XmlError(e.to_string())),
387 _ => {}
388 }
389 buf.clear();
390 }
391
392 Ok((metadata, manifest, spine, ncx_path))
393 }
394
395 fn parse_ncx(content: &str) -> Result<Vec<TocEntry>, Error> {
396 let content = preprocess_html_entities(content);
397 let mut reader = quick_xml::Reader::from_str(&content);
398 let mut toc = Vec::new();
399 let mut stack: Vec<TocEntry> = Vec::new();
400
401 let mut buf = Vec::new();
402 let mut in_nav_label = false;
403 let mut in_text = false;
404
405 loop {
406 match reader.read_event_into(&mut buf) {
407 Ok(Event::Start(ref e)) => {
408 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
409 if name == "navPoint" {
410 let entry = TocEntry {
411 label: String::new(),
412 href: String::new(),
413 children: Vec::new(),
414 };
415 stack.push(entry);
416 } else if name == "navLabel" {
417 in_nav_label = true;
418 } else if name == "text" && in_nav_label {
419 in_text = true;
420 }
421 }
422 Ok(Event::End(ref e)) => {
423 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
424 if name == "navPoint" {
425 if let Some(entry) = stack.pop() {
426 if let Some(parent) = stack.last_mut() {
427 parent.children.push(entry);
428 } else {
429 toc.push(entry);
430 }
431 }
432 } else if name == "navLabel" {
433 in_nav_label = false;
434 } else if name == "text" && in_nav_label {
435 in_text = false;
436 }
437 }
438 Ok(Event::Text(e)) => {
439 if in_text {
440 if let Some(entry) = stack.last_mut() {
441 entry.label = e
442 .unescape()
443 .unwrap_or_else(|_| {
444 std::str::from_utf8(e.as_ref()).unwrap_or_default().into()
445 })
446 .into_owned();
447 }
448 }
449 }
450 Ok(Event::Empty(ref e)) => {
451 let name = String::from_utf8_lossy(e.name().as_ref()).to_string();
452 if name == "content" {
453 if let Some(src) = e.try_get_attribute("src")? {
454 if let Some(entry) = stack.last_mut() {
455 entry.href =
456 src.decode_and_unescape_value(reader.decoder())?.to_string();
457 }
458 }
459 }
460 }
461 Ok(Event::Eof) => break,
462 Err(e) => return Err(Error::XmlError(e.to_string())),
463 _ => {}
464 }
465 buf.clear();
466 }
467
468 Ok(toc)
469 }
470
471 fn extract_text_from_html(content: &str) -> Result<String, Error> {
472 let content = preprocess_html_entities(content);
473 let mut reader = quick_xml::Reader::from_str(&content);
474 let mut text = String::new();
475 let skip_tags: Vec<Vec<u8>> = vec![b"script".to_vec(), b"style".to_vec(), b"head".to_vec()];
476 let mut in_skip_tag = false;
477
478 let mut buf = Vec::new();
479
480 loop {
481 match reader.read_event_into(&mut buf) {
482 Ok(Event::Start(ref e)) => {
483 let tag = e.name().as_ref().to_vec();
484 if skip_tags.contains(&tag) {
485 in_skip_tag = true;
486 } else if tag.as_slice() == b"p"
487 || tag.as_slice() == b"div"
488 || tag.as_slice() == b"br"
489 || tag.as_slice() == b"li"
490 {
491 text.push('\n');
492 }
493 }
494 Ok(Event::End(ref e)) => {
495 let tag = e.name().as_ref().to_vec();
496 if skip_tags.contains(&tag) {
497 in_skip_tag = false;
498 }
499 }
500 Ok(Event::Text(e)) => {
501 if !in_skip_tag {
502 let unescaped = e.unescape().unwrap_or_else(|_| {
503 std::str::from_utf8(e.as_ref()).unwrap_or_default().into()
504 });
505 let t = unescaped.into_owned();
506 let trimmed: String = t.chars().filter(|c| !c.is_control()).collect();
507 text.push_str(&trimmed);
508 text.push(' ');
509 }
510 }
511 Ok(Event::Eof) => break,
512 Err(e) => {
513 eprintln!(
514 "Warning: XML parse error in HTML content, continuing: {}",
515 e
516 );
517 break;
518 }
519 _ => {}
520 }
521 buf.clear();
522 }
523
524 Ok(text
525 .lines()
526 .map(|l| l.trim())
527 .filter(|l| !l.is_empty())
528 .collect::<Vec<_>>()
529 .join("\n"))
530 }
531
532 fn resolve_path(base_path: &str, href: &str) -> String {
533 let base = PathBuf::from(base_path);
534 let parent = base.parent().unwrap_or(base.as_path());
535 let resolved = parent.join(href);
536 resolved.to_string_lossy().to_string().replace('\\', "/")
537 }
538}
539
/// One `item` entry of the OPF manifest.
#[derive(Debug, Clone)]
struct ManifestItem {
    /// Value of the item's `id` attribute (also the manifest map key).
    _id: String,
    /// Value of the `href` attribute, relative to the OPF document.
    href: String,
    /// Value of the `media-type` attribute; empty when the attribute is absent.
    _media_type: String,
}