lib_epub/epub.rs
1//! The core module of the EPUB parsing library
2//!
3//! This module provides complete parsing functionality for EPUB ebook files
4//! and is the core component of the entire library. The `EpubDoc` structure
5//! encapsulates all the parsing logic and data access interfaces for EPUB files.
6//!
7//! ## Main references to EPUB specs:
8//! - <https://www.w3.org/TR/epub-33>
9//! - <https://idpf.org/epub/201>
10//!
11//! ## Potential Issues
12//! - The generic parameter `R: Read + Seek` increases complexity, particularly
13//! in asynchronous environments. The current design is not conducive to multi-threaded
14//! concurrent access and requires an external synchronization mechanism.
15//! - Some error handling may not be sufficiently nuanced, and certain edge cases
16//! may not be adequately considered.
17//! - Loading the entire EPUB document at once may result in significant memory consumption,
18//! especially for large publications.
19//!
20//! ## Future Work
//! - Add support for asynchronous I/O to improve the user experience in asynchronous
//!   environments, and consider adding support for multi-threaded access.
//! - Support more EPUB specification features, such as media overlays and scripts.
24
25use std::{
26 collections::HashMap,
27 fs::{File, canonicalize},
28 io::{BufReader, Read, Seek},
29 path::{Path, PathBuf},
30};
31
32use log::warn;
33use zip::{ZipArchive, result::ZipError};
34
35use crate::{
36 error::EpubError,
37 types::{
38 EncryptionData, EpubVersion, ManifestItem, MetadataItem, MetadataLinkItem,
39 MetadataRefinement, NavPoint, SpineItem,
40 },
41 utils::{
42 DecodeBytes, NormalizeWhitespace, XmlElement, XmlReader, adobe_font_dencryption,
43 check_realtive_link_leakage, compression_method_check, get_file_in_zip_archive,
44 idpf_font_dencryption,
45 },
46};
47
/// EPUB document parser, representing a loaded and parsed EPUB publication
///
/// `EpubDoc` owns the ZIP archive backing the publication together with the
/// data extracted from it while loading: metadata, manifest, spine (reading
/// order), optional encryption information and the table of contents.
/// The accessor methods on this type read from these fields and, when resource
/// bytes are requested, from the underlying archive.
///
/// # Usage
///
/// ```rust
/// use lib_epub::epub::EpubDoc;
///
/// let doc = EpubDoc::new("./test_case/epub-33.epub");
/// assert!(doc.is_ok());
/// ```
pub struct EpubDoc<R: Read + Seek> {
    /// The ZIP archive that actually holds the contents of the EPUB file
    pub(crate) archive: ZipArchive<R>,

    /// The path to the target EPUB file
    ///
    /// `from_reader` stores the canonicalized form of the path it was given.
    pub(crate) epub_path: PathBuf,

    /// The path to the OPF package document
    ///
    /// Taken from the `full-path` attribute of the first `rootfile` element
    /// in `META-INF/container.xml`.
    pub package_path: PathBuf,

    /// The directory in which the OPF file is located (parent of `package_path`)
    pub base_path: PathBuf,

    /// The EPUB version
    pub version: EpubVersion,

    /// The unique identifier of the EPUB file
    ///
    /// This is the value of the `identifier` metadata item referenced by the
    /// `unique-identifier` attribute of the package element. When the package
    /// has no such attribute, the first `identifier` metadata item is used.
    pub unique_identifier: String,

    /// EPUB metadata extracted from the OPF
    pub metadata: Vec<MetadataItem>,

    /// Metadata entries that point to external files
    pub metadata_link: Vec<MetadataLinkItem>,

    /// The resources declared in the OPF manifest, keyed by manifest item id
    ///
    /// All resources in the EPUB file are declared here; undeclared resources
    /// should not be stored in the EPUB file and cannot be obtained from it.
    pub manifest: HashMap<String, ManifestItem>,

    /// Physical reading order of the publication extracted from the OPF
    ///
    /// Declares the order in which the content documents should be displayed.
    pub spine: Vec<SpineItem>,

    /// Entries extracted from `META-INF/encryption.xml`
    ///
    /// `None` when the archive has no `encryption.xml` or when that file
    /// contains no `EncryptedData` entries.
    pub encryption: Option<Vec<EncryptionData>>,

    /// The navigation data (table of contents) of the EPUB file
    pub catalog: Vec<NavPoint>,

    /// The title of the catalog
    pub catalog_title: String,

    /// The index of the spine item currently being read
    ///
    /// Starts at `0` and is updated by the spine navigation methods.
    pub current_spine_index: usize,
}
119
120impl<R: Read + Seek> EpubDoc<R> {
121 /// Creates a new EPUB document instance from a reader
122 ///
123 /// This function is responsible for the core logic of parsing EPUB files,
124 /// including verifying the file format, parsing container information,
125 /// loading the OPF package document, and extracting metadata, manifest,
126 /// reading order, and other core information.
127 ///
128 /// # Parameters
129 /// - `reader`: The data source that implements the `Read` and `Seek` traits,
130 /// usually a file or memory buffer
131 /// - `epub_path`: The path to the EPUB file, used for path resolution and validation
132 ///
133 /// # Return
134 /// - `Ok(EpubDoc<R>)`: The successfully parsed EPUB document object
135 /// - `Err(EpubError)`: Errors encountered during parsing
136 ///
137 /// # Notes
138 /// - This function assumes the EPUB file structure is valid
139 pub fn from_reader(reader: R, epub_path: PathBuf) -> Result<Self, EpubError> {
140 // Parsing process
141 // 1. Verify that the ZIP compression method conforms to the EPUB specification
142 // 2. Parse `META-INF/container.xml` retrieves the location of the OPF file
143 // 3. Parses the OPF file to obtain package documentation information
144 // 4. Extracts version information
145 // 5. Parses metadata, manifest, and spine
146 // 6. Parses encrypted information and directory navigation
147 // 7. Verifies and extracts the unique identifier
148
149 let mut archive = ZipArchive::new(reader).map_err(EpubError::from)?;
150 let epub_path = canonicalize(epub_path)?;
151
152 compression_method_check(&mut archive)?;
153
154 let container =
155 get_file_in_zip_archive(&mut archive, "META-INF/container.xml")?.decode()?;
156 let package_path = Self::parse_container(container)?;
157 let base_path = package_path
158 .parent()
159 .expect("所有文件的父目录不能为空")
160 .to_path_buf();
161
162 let opf_file =
163 get_file_in_zip_archive(&mut archive, package_path.to_str().unwrap())?.decode()?;
164 let package = XmlReader::parse(&opf_file)?;
165 // let document = kiss_xml::parse_str(opf_file).unwrap();
166
167 // let package = document.root_element();
168 let version = Self::determine_epub_version(&package)?;
169
170 let mut doc = Self {
171 archive,
172 epub_path,
173 package_path,
174 base_path,
175 version,
176 unique_identifier: String::new(),
177 metadata: vec![],
178 metadata_link: vec![],
179 manifest: HashMap::new(),
180 spine: vec![],
181 encryption: None,
182 catalog: vec![],
183 catalog_title: String::new(),
184 current_spine_index: 0,
185 };
186
187 let metadata_element = package.find_elements_by_name("metadata").next().unwrap();
188 let manifest_element = package.find_elements_by_name("manifest").next().unwrap();
189 let spine_element = package.find_elements_by_name("spine").next().unwrap();
190
191 doc.parse_metadata(metadata_element)?;
192 doc.parse_manifest(manifest_element)?;
193 doc.parse_spine(spine_element)?;
194 doc.parse_encryption()?;
195 doc.parse_catalog()?;
196
197 // 断言必有唯一标识符
198 doc.unique_identifier = if let Some(uid) = package.get_attr("unique-identifier") {
199 doc.metadata.iter().find(|item| {
200 item.property == "identifier" && item.id.as_ref().is_some_and(|id| id == &uid)
201 })
202 } else {
203 doc.metadata
204 .iter()
205 .find(|item| item.property == "identifier")
206 }
207 .map(|item| item.value.clone())
208 .ok_or_else(|| EpubError::NonCanonicalFile {
209 tag: "dc:identifier".to_string(),
210 })?;
211
212 Ok(doc)
213 }
214
215 /// Parse the EPUB container file (META-INF/container.xml)
216 ///
217 /// This function parses the container information in the EPUB file 、
218 /// to extract the path to the OPF package file. According to the EPUB
219 /// specification, the `container.xml` file must exist in the `META-INF`
220 /// directory and contain at least one `rootfile` element pointing to
221 /// the main OPF file. When multiple `rootfile` elements exist, the first
222 /// element pointing to the OPF file is used as the default.
223 ///
224 /// # Parameters
225 /// - `data`: The content string of the container.xml
226 ///
227 /// # Return
228 /// - `Ok(PathBuf)`: The path to the successfully parsed OPF file
229 /// - `Err(EpubError)`: Errors encountered during parsing
230 fn parse_container(data: String) -> Result<PathBuf, EpubError> {
231 let root = XmlReader::parse(&data)?;
232 let rootfile = root
233 .find_elements_by_name("rootfile")
234 .next()
235 .ok_or_else(|| EpubError::NonCanonicalFile {
236 tag: "rootfile".to_string(),
237 })?;
238
239 let attr =
240 rootfile
241 .get_attr("full-path")
242 .ok_or_else(|| EpubError::MissingRequiredAttribute {
243 tag: "rootfile".to_string(),
244 attribute: "full-path".to_string(),
245 })?;
246
247 Ok(PathBuf::from(attr))
248 }
249
250 /// Parse the EPUB metadata section
251 ///
252 /// This function is responsible for parsing the `<metadata>` elements
253 /// in the OPF file to extract basic information about the publication.
254 /// It handles metadata elements from different namespaces:
255 /// - Elements in the Dublin Core namespace (`http://purl.org/dc/elements/1.1/`)
256 /// - Elements in the OPF namespace (`http://www.idpf.org/2007/opf`)
257 ///
258 /// # Parameters
259 /// - `metadata_element`: A reference to the `<metadata>` element in the OPF file
260 fn parse_metadata(&mut self, metadata_element: &XmlElement) -> Result<(), EpubError> {
261 const DC_NAMESPACE: &str = "http://purl.org/dc/elements/1.1/";
262 const OPF_NAMESPACE: &str = "http://www.idpf.org/2007/opf";
263
264 let mut metadata = Vec::new();
265 let mut metadata_link = Vec::new();
266 let mut refinements = HashMap::<String, Vec<MetadataRefinement>>::new();
267
268 for element in metadata_element.children() {
269 match &element.namespace {
270 Some(namespace) if namespace == DC_NAMESPACE => {
271 self.parse_dc_metadata(element, &mut metadata)?
272 }
273
274 Some(namespace) if namespace == OPF_NAMESPACE => self.parse_opf_metadata(
275 element,
276 &mut metadata,
277 &mut metadata_link,
278 &mut refinements,
279 )?,
280
281 _ => {}
282 };
283 }
284
285 for item in metadata.iter_mut() {
286 if let Some(id) = &item.id {
287 if let Some(refinements) = refinements.remove(id) {
288 item.refined = refinements;
289 }
290 }
291 }
292
293 self.metadata = metadata;
294 self.metadata_link = metadata_link;
295 Ok(())
296 }
297
298 /// Parse the EPUB manifest section
299 ///
300 /// This function parses the `<manifest>` element in the OPF file, extracting
301 /// information about all resource files in the publication. Each resource contains
302 /// basic information such as id, file path, MIME type, as well as optional
303 /// attributes and fallback resource information.
304 ///
305 /// # Parameters
306 /// - `manifest_element`: A reference to the `<manifest>` element in the OPF file
307 fn parse_manifest(&mut self, manifest_element: &XmlElement) -> Result<(), EpubError> {
308 let estimated_items = manifest_element.children().count();
309 let mut resources = HashMap::with_capacity(estimated_items);
310
311 for element in manifest_element.children() {
312 let id = element
313 .get_attr("id")
314 .ok_or_else(|| EpubError::MissingRequiredAttribute {
315 tag: element.tag_name(),
316 attribute: "id".to_string(),
317 })?
318 .to_string();
319 let path = element
320 .get_attr("href")
321 .ok_or_else(|| EpubError::MissingRequiredAttribute {
322 tag: element.tag_name(),
323 attribute: "href".to_string(),
324 })?
325 .to_string();
326 let mime = element
327 .get_attr("media-type")
328 .ok_or_else(|| EpubError::MissingRequiredAttribute {
329 tag: element.tag_name(),
330 attribute: "media-type".to_string(),
331 })?
332 .to_string();
333 let properties = element.get_attr("properties");
334 let fallback = element.get_attr("fallback");
335
336 resources.insert(
337 id.clone(),
338 ManifestItem {
339 id,
340 path: self.normalize_manifest_path(&path)?,
341 mime,
342 properties,
343 fallback,
344 },
345 );
346 }
347
348 self.manifest = resources;
349 self.validate_fallback_chains();
350 Ok(())
351 }
352
353 /// Parse the EPUB spine section
354 ///
355 /// This function parses the `<spine>` elements in the OPF file to extract
356 /// the reading order information of the publication. The spine defines the
357 /// linear reading order of the publication's content documents, and each
358 /// spine item references resources in the manifest.
359 ///
360 /// # Parameters
361 /// - `spine_element`: A reference to the `<spine>` element in the OPF file
362 fn parse_spine(&mut self, spine_element: &XmlElement) -> Result<(), EpubError> {
363 let mut spine = Vec::new();
364 for element in spine_element.children() {
365 let idref = element
366 .get_attr("idref")
367 .ok_or_else(|| EpubError::MissingRequiredAttribute {
368 tag: element.tag_name(),
369 attribute: "idref".to_string(),
370 })?
371 .to_string();
372 let id = element.get_attr("id");
373 let linear = element
374 .get_attr("linear")
375 .map(|linear| linear == "yes")
376 .unwrap_or(true);
377 let properties = element.get_attr("properties");
378
379 spine.push(SpineItem {
380 idref,
381 id,
382 linear,
383 properties,
384 });
385 }
386
387 self.spine = spine;
388 Ok(())
389 }
390
391 /// Parse the EPUB encryption file (META-INF/encryption.xml)
392 ///
393 /// This function is responsible for parsing the `encryption.xml` file
394 /// in the `META-INF` directory to extract information about encrypted
395 /// resources in the publication. According to the EPUB specification,
396 /// the encryption information describes which resources are encrypted
397 /// and the encryption methods used.
398 ///
399 /// TODO: 需要对使用非对称加密数据的加密项进行额外处理,以获取非对称加密密钥
400 fn parse_encryption(&mut self) -> Result<(), EpubError> {
401 if !self.has_encryption() {
402 return Ok(());
403 }
404
405 let encryption_file =
406 get_file_in_zip_archive(&mut self.archive, "META-INF/encryption.xml")?.decode()?;
407
408 let root = XmlReader::parse(&encryption_file)?;
409
410 let mut encryption_data = Vec::new();
411 for data in root.children() {
412 if data.name != "EncryptedData" {
413 continue;
414 }
415
416 let method = data
417 .find_elements_by_name("EncryptionMethod")
418 .next()
419 .ok_or_else(|| EpubError::NonCanonicalFile {
420 tag: "EncryptionMethod".to_string(),
421 })?;
422 let reference = data
423 .find_elements_by_name("CipherReference")
424 .next()
425 .ok_or_else(|| EpubError::NonCanonicalFile {
426 tag: "CipherReference".to_string(),
427 })?;
428
429 encryption_data.push(EncryptionData {
430 method: method
431 .get_attr("Algorithm")
432 .ok_or_else(|| EpubError::MissingRequiredAttribute {
433 tag: "EncryptionMethod".to_string(),
434 attribute: "Algorithm".to_string(),
435 })?
436 .to_string(),
437 data: reference
438 .get_attr("URI")
439 .ok_or_else(|| EpubError::MissingRequiredAttribute {
440 tag: "CipherReference".to_string(),
441 attribute: "URI".to_string(),
442 })?
443 .to_string(),
444 });
445 }
446
447 if !encryption_data.is_empty() {
448 self.encryption = Some(encryption_data);
449 }
450
451 Ok(())
452 }
453
454 /// Parse the EPUB navigation information
455 ///
456 /// This function is responsible for parsing the navigation information of EPUB
457 /// publications. Different parsing strategies are used depending on the EPUB version:
458 /// - EPUB 2.0: Parses the NCX file to obtain directory information
459 /// - EPUB 3.0: Parses the Navigation Document (NAV) file to obtain directory information
460 fn parse_catalog(&mut self) -> Result<(), EpubError> {
461 const HEAD_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];
462
463 match self.version {
464 EpubVersion::Version2_0 => {
465 let opf_file = get_file_in_zip_archive(
466 &mut self.archive,
467 self.package_path.to_str().unwrap(),
468 )?
469 .decode()?;
470 let opf_element = XmlReader::parse(&opf_file)?;
471
472 let toc_id = opf_element
473 .find_children_by_name("spine")
474 .next()
475 .ok_or_else(|| EpubError::NonCanonicalFile {
476 tag: "spine".to_string(),
477 })?
478 .get_attr("toc")
479 .ok_or_else(|| EpubError::MissingRequiredAttribute {
480 tag: "spine".to_string(),
481 attribute: "toc".to_string(),
482 })?
483 .to_owned();
484 let toc_path = self
485 .manifest
486 .get(&toc_id)
487 .ok_or(EpubError::ResourceIdNotExist { id: toc_id })?
488 .path
489 .to_str()
490 .unwrap();
491
492 let ncx_file = get_file_in_zip_archive(&mut self.archive, toc_path)?.decode()?;
493 let ncx = XmlReader::parse(&ncx_file)?;
494
495 match ncx.find_elements_by_name("docTitle").next() {
496 Some(element) => self.catalog_title = element.text(),
497 None => warn!(
498 "Expecting to get docTitle information from the ncx file, but it's missing."
499 ),
500 };
501
502 let nav_map = ncx.find_elements_by_name("navMap").next().ok_or_else(|| {
503 EpubError::NonCanonicalFile {
504 tag: "navMap".to_string(),
505 }
506 })?;
507
508 self.catalog = self.parse_nav_points(nav_map)?;
509
510 Ok(())
511 }
512
513 EpubVersion::Version3_0 => {
514 let nav_path = self
515 .manifest
516 .values()
517 .find(|item| {
518 if let Some(property) = &item.properties {
519 return property.contains("nav");
520 }
521 false
522 })
523 .map(|item| item.path.clone())
524 .ok_or_else(|| EpubError::NonCanonicalEpub {
525 expected_file: "Navigation Document".to_string(),
526 })?;
527
528 let nav_file =
529 get_file_in_zip_archive(&mut self.archive, nav_path.to_str().unwrap())?
530 .decode()?;
531
532 let nav_element = XmlReader::parse(&nav_file)?;
533 let nav = nav_element
534 .find_elements_by_name("nav")
535 .find(|&element| element.get_attr("epub:type") == Some(String::from("toc")))
536 .ok_or_else(|| EpubError::NonCanonicalFile {
537 tag: "nav".to_string(),
538 })?;
539 let nav_title = nav.find_children_by_names(&HEAD_TAGS).next();
540 let nav_list = nav.find_children_by_name("ol").next().ok_or_else(|| {
541 EpubError::NonCanonicalFile {
542 tag: "ol".to_string(),
543 }
544 })?;
545
546 self.catalog = self.parse_catalog_list(nav_list)?;
547 if let Some(nav_title) = nav_title {
548 self.catalog_title = nav_title.text();
549 };
550 Ok(())
551 }
552 }
553 }
554
555 /// Check if the EPUB file contains `encryption.xml`
556 ///
557 /// This function determines whether a publication contains encrypted resources
558 /// by checking if a `META-INF/encryption.xml` file exists in the EPUB package.
559 /// According to the EPUB specification, when resources in a publication are
560 /// encrypted, the corresponding encryption information must be declared in
561 /// the `META-INF/encryption.xml` file.
562 ///
563 /// # Return
564 /// - `true` if the publication contains encrypted resources
565 /// - `false` if the publication does not contain encrypted resources
566 ///
567 /// # Notes
568 /// - This function only checks the existence of the encrypted file;
569 /// it does not verify the validity of the encrypted information.
570 pub fn has_encryption(&mut self) -> bool {
571 self.archive
572 .by_path(Path::new("META-INF/encryption.xml"))
573 .is_ok()
574 }
575
576 /// Retrieves a list of metadata items
577 ///
578 /// This function retrieves all matching metadata items from the EPUB metadata
579 /// based on the specified attribute name (key). Metadata items may come from
580 /// the DC (Dublin Core) namespace or the OPF namespace and contain basic
581 /// information about the publication, such as title, author, identifier, etc.
582 ///
583 /// # Parameters
584 /// - `key`: The name of the metadata attribute to retrieve
585 ///
586 /// # Return
587 /// - `Some(Vec<MetadataItem>)`: A vector containing all matching metadata items
588 /// - `None`: If no matching metadata items are found
589 pub fn get_metadata(&self, key: &str) -> Option<Vec<MetadataItem>> {
590 let metadatas = self
591 .metadata
592 .iter()
593 .filter(|item| item.property == key)
594 .cloned()
595 .collect::<Vec<MetadataItem>>();
596
597 (!metadatas.is_empty()).then_some(metadatas)
598 }
599
600 /// Retrieves a list of values for specific metadata items
601 ///
602 /// This function retrieves the values of all matching metadata items from
603 /// the EPUB metadata based on the given property name (key).
604 ///
605 /// # Parameters
606 /// - `key`: The name of the metadata attribute to retrieve
607 ///
608 /// # Return
609 /// - `Some(Vec<String>)`: A vector containing all matching metadata item values
610 /// - `None`: If no matching metadata items are found
611 pub fn get_metadata_value(&self, key: &str) -> Option<Vec<String>> {
612 let values = self
613 .metadata
614 .iter()
615 .filter(|item| item.property == key)
616 .map(|item| item.value.clone())
617 .collect::<Vec<String>>();
618
619 (!values.is_empty()).then_some(values)
620 }
621
622 /// Retrieves the title of the publication
623 ///
624 /// This function retrieves all title information from the EPUB metadata.
625 /// According to the EPUB specification, a publication can have multiple titles,
626 /// which are returned in the order they appear in the metadata.
627 ///
628 /// # Return
629 /// - `Result<Vec<String>, EpubError>`: A vector containing all title information
630 /// - `EpubError`: If and only if the OPF file does not contain `<dc:title>`
631 ///
632 /// # Notes
633 /// - The EPUB specification requires each publication to have at least one title.
634 pub fn get_title(&self) -> Result<Vec<String>, EpubError> {
635 self.get_metadata_value("title")
636 .ok_or_else(|| EpubError::NonCanonicalFile {
637 tag: "title".to_string(),
638 })
639 }
640
641 /// Retrieves the language used in the publication
642 ///
643 /// This function retrieves the language information of a publication from the EPUB
644 /// metadata. According to the EPUB specification, language information identifies
645 /// the primary language of the publication and can have multiple language identifiers.
646 ///
647 /// # Return
648 /// - `Ok(Vec<String>)`: A vector containing all language identifiers
649 /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:language>`
650 ///
651 /// # Notes
652 /// - The EPUB specification requires that each publication specify at least one primary language.
653 /// - Language identifiers should conform to RFC 3066 or later standards.
654 pub fn get_language(&self) -> Result<Vec<String>, EpubError> {
655 self.get_metadata_value("language")
656 .ok_or_else(|| EpubError::NonCanonicalFile {
657 tag: "language".to_string(),
658 })
659 }
660
661 /// Retrieves the identifier of a publication
662 ///
663 /// This function retrieves the identifier information of a publication from
664 /// the EPUB metadata. According to the EPUB specification, each publication
665 /// must have a identifier, typically an ISBN, UUID, or other unique identifier.
666 ///
667 /// # Return
668 /// - `Ok(Vec<String>)`: A vector containing all identifier information
669 /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:identifier>`
670 ///
671 /// # Notes
672 /// - The EPUB specification requires each publication to have at least one identifier.
673 /// - In the OPF file, the `unique-identifier` attribute of the `<package>` element
674 /// should point to a `<dc:identifier>` element used to uniquely identify the publication.
675 /// This means that `unique-identifier` is not exactly equal to `<dc:identifier>`.
676 pub fn get_identifier(&self) -> Result<Vec<String>, EpubError> {
677 self.get_metadata_value("identifier")
678 .ok_or_else(|| EpubError::NonCanonicalFile {
679 tag: "identifier".to_string(),
680 })
681 }
682
683 /// Retrieve resource data by resource ID
684 ///
685 /// This function will find the resource with the specified ID in the manifest.
686 /// If the resource is encrypted, it will be automatically decrypted.
687 ///
688 /// # Parameters
689 /// - `id`: The ID of the resource to retrieve
690 ///
691 /// # Return
692 /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
693 /// the MIME type
694 /// - `Err(EpubError)`: Errors that occurred during the retrieval process
695 ///
696 /// # Notes
697 /// - This function will automatically decrypt the resource if it is encrypted.
698 /// - For unsupported encryption methods, the corresponding error will be returned.
699 pub fn get_manifest_item(&mut self, id: &str) -> Result<(Vec<u8>, String), EpubError> {
700 let resource_item = self
701 .manifest
702 .get(id)
703 .cloned()
704 .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
705
706 let path = resource_item.path.to_str().unwrap();
707
708 let mut data = match self.archive.by_name(path) {
709 Ok(mut file) => {
710 let mut entry = Vec::<u8>::new();
711 file.read_to_end(&mut entry)?;
712
713 Ok(entry)
714 }
715 Err(ZipError::FileNotFound) => Err(EpubError::ResourceNotFound {
716 resource: path.to_string(),
717 }),
718 Err(err) => Err(EpubError::from(err)),
719 }?;
720
721 if let Some(method) = self.is_encryption_file(path) {
722 data = self.auto_dencrypt(&method, &mut data)?;
723 }
724
725 Ok((data, resource_item.mime))
726 }
727
728 /// Retrieves resource item data by resource path
729 ///
730 /// This function retrieves resources from the manifest based on the input path.
731 /// The input path must be a relative path to the root directory of the EPUB container;
732 /// using an absolute path or a relative path to another location will result in an error.
733 ///
734 /// # Parameters
735 /// - `path`: The path of the resource to retrieve
736 ///
737 /// # Return
738 /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
739 /// the MIME type
740 /// - `Err(EpubError)`: Errors that occurred during the retrieval process
741 ///
742 /// # Notes
743 /// - This function will automatically decrypt the resource if it is encrypted.
744 /// - For unsupported encryption methods, the corresponding error will be returned.
745 /// - Relative paths other than the root directory of the Epub container are not supported.
746 pub fn get_manifest_item_by_path(
747 &mut self,
748 path: &str,
749 ) -> Result<(Vec<u8>, String), EpubError> {
750 let id = self
751 .manifest
752 .iter()
753 .find(|(_, item)| item.path.to_str().unwrap() == path)
754 .map(|(id, _)| id.to_string())
755 .ok_or_else(|| EpubError::ResourceNotFound {
756 resource: path.to_string(),
757 })?;
758
759 self.get_manifest_item(&id)
760 }
761
762 /// Retrieves supported resource items by resource ID, with fallback mechanism supported
763 ///
764 /// This function attempts to retrieve the resource item with the specified ID and
765 /// checks if its MIME type is in the list of supported formats. If the current resource
766 /// format is not supported, it searches for a supported resource format along the
767 /// fallback chain according to the fallback mechanism defined in the EPUB specification.
768 ///
769 /// # Parameters
770 /// - `id`: The ID of the resource to retrieve
771 /// - `supported_format`: A vector of supported MIME types
772 ///
773 /// # Return
774 /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
775 /// the MIME type
776 /// - `Err(EpubError)`: Errors that occurred during the retrieval process
777 pub fn get_manifest_item_with_fallback(
778 &mut self,
779 id: &str,
780 supported_format: Vec<&str>,
781 ) -> Result<(Vec<u8>, String), EpubError> {
782 let mut manifest_item = self
783 .manifest
784 .get(id)
785 .cloned()
786 .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
787
788 let mut current_manifest_id = id.to_string();
789 let mut fallback_chain = Vec::<String>::new();
790 'fallback: loop {
791 if supported_format.contains(&manifest_item.mime.as_str()) {
792 return self.get_manifest_item(¤t_manifest_id);
793 }
794
795 let fallback_id = manifest_item.fallback.clone();
796
797 match fallback_id {
798 // The loop ends when no fallback resource exists
799 None => break 'fallback,
800
801 // End the loop when the loop continues to fallback if a fallback resource exists
802 Some(id) if fallback_chain.contains(&id) => break 'fallback,
803
804 Some(id) => {
805 fallback_chain.push(id.clone());
806
807 // Since only warnings are issued for fallback resource checks
808 // during initialization, the issue of fallback resources possibly
809 // not existing needs to be handled here.
810 manifest_item = self
811 .manifest
812 .get(&manifest_item.fallback.unwrap())
813 .cloned()
814 .ok_or(EpubError::ResourceIdNotExist { id: id.clone() })?;
815 current_manifest_id = id;
816 }
817 };
818 }
819
820 Err(EpubError::NoSupportedFileFormat)
821 }
822
823 /// Retrieves the cover of the EPUB document
824 ///
825 /// This function searches for the cover of the EPUB document by examining manifest
826 /// items in the manifest. It looks for manifest items whose ID or attribute contains
827 /// "cover" (case-insensitive) and attempts to retrieve the content of the first match.
828 ///
829 /// # Return
830 /// - `Some((Vec<u8>, String))`: Successfully retrieved and decrypted cover data and
831 /// the MIME type
832 /// - `None`: No cover resource was found
833 ///
834 /// # Notes
835 /// - This function only returns the first successfully retrieved cover resource,
836 /// even if multiple matches exist
837 /// - The retrieved cover may not be an image resource; users need to pay attention
838 /// to the resource's MIME type.
839 pub fn get_cover(&mut self) -> Option<(Vec<u8>, String)> {
840 self.manifest
841 .values()
842 .filter_map(|manifest| {
843 if manifest.id.to_ascii_lowercase().contains("cover") {
844 return Some(manifest.id.clone());
845 }
846
847 if let Some(properties) = &manifest.properties {
848 if properties.to_ascii_lowercase().contains("cover") {
849 return Some(manifest.id.clone());
850 }
851 }
852
853 None
854 })
855 .collect::<Vec<String>>()
856 .iter()
857 .find_map(|id| self.get_manifest_item(id).ok())
858 }
859
860 /// Navigate to a specified chapter using the spine index
861 ///
862 /// This function retrieves the content data of the corresponding chapter based
863 /// on the index position in the EPUB spine. The spine defines the linear reading
864 /// order of the publication's content documents, and each spine item references
865 /// resources in the manifest.
866 ///
867 /// # Parameters
868 /// - `index`: The index position in the spine, starting from 0
869 ///
870 /// # Return
871 /// - `Some((Vec<u8>, String))`: Successfully retrieved chapter content data and the MIME type
872 /// - `None`: Index out of range or data retrieval error
873 ///
874 /// # Notes
875 /// - The index must be less than the total number of spine projects.
876 /// - If the resource is encrypted, it will be automatically decrypted before returning.(TODO)
877 /// - It does not check whether the Spine project follows a linear reading order.
878 pub fn navigate_by_spine_index(&mut self, index: usize) -> Option<(Vec<u8>, String)> {
879 if index >= self.spine.len() {
880 return None;
881 }
882
883 let manifest_id = self.spine[index].idref.clone();
884 self.current_spine_index = index;
885 self.get_manifest_item(&manifest_id).ok()
886 }
887
888 /// Navigate to the previous linear reading chapter
889 ///
890 /// This function searches backwards in the EPUB spine for the previous linear
891 /// reading chapter and returns the content data of that chapter. It only navigates
892 /// to chapters marked as linear reading.
893 ///
894 /// # Return
895 /// - `Some((Vec<u8>, String))`: Successfully retrieved previous chapter content data and
896 /// the MIME type
897 /// - `None`: Already in the first chapter, the current chapter is not linear,
898 /// or data retrieval failed
899 pub fn spine_prev(&mut self) -> Option<(Vec<u8>, String)> {
900 if self.current_spine_index == 0 || !self.spine[self.current_spine_index].linear {
901 return None;
902 }
903
904 let prev_index = (0..self.current_spine_index)
905 .rev()
906 .find(|&index| self.spine[index].linear)?;
907
908 self.current_spine_index = prev_index;
909 let manifest_id = self.spine[prev_index].idref.clone();
910 self.get_manifest_item(&manifest_id).ok()
911 }
912
913 /// Navigate to the next linear reading chapter
914 ///
915 /// This function searches forwards in the EPUB spine for the next linear reading
916 /// chapter and returns the content data of that chapter. It only navigates to
917 /// chapters marked as linear reading.
918 ///
919 /// # Return
920 /// - `Some((Vec<u8>, String))`: Successfully retrieved next chapter content data and
921 /// the MIME type
922 /// - `None`: Already in the last chapter, the current chapter is not linear,
923 /// or data retrieval failed
924 pub fn spine_next(&mut self) -> Option<(Vec<u8>, String)> {
925 if self.current_spine_index >= self.spine.len() - 1
926 || !self.spine[self.current_spine_index].linear
927 {
928 return None;
929 }
930
931 let next_index = (self.current_spine_index + 1..self.spine.len())
932 .find(|&index| self.spine[index].linear)?;
933
934 self.current_spine_index = next_index;
935 let manifest_id = self.spine[next_index].idref.clone();
936 self.get_manifest_item(&manifest_id).ok()
937 }
938
939 /// Retrieves the content data of the current chapter
940 ///
941 /// This function returns the content data of the chapter at the current
942 /// index position in the EPUB spine.
943 ///
944 /// # Return
945 /// - `Some((Vec<u8>, String))`: Successfully retrieved current chapter content data and
946 /// the MIME type
947 /// - `None`: Data retrieval failed
948 pub fn spine_current(&mut self) -> Option<(Vec<u8>, String)> {
949 let manifest_id = self.spine[self.current_spine_index].idref.clone();
950 self.get_manifest_item(&manifest_id).ok()
951 }
952
953 /// Determine the EPUB version from the OPF file
954 ///
955 /// This function is used to detect the version of an epub file from an OPF file.
956 /// When the version attribute in the package is abnormal, version information will
957 /// be identified through some version characteristics of the epub file. An error is
958 /// returned when neither direct nor indirect methods can identify the version.
959 ///
960 /// # Parameters
961 /// - `opf_element`: A reference to the OPF file element
962 fn determine_epub_version(opf_element: &XmlElement) -> Result<EpubVersion, EpubError> {
963 // Check the explicit version attribute
964 if let Some(version) = opf_element.get_attr("version") {
965 match version.as_str() {
966 "2.0" => return Ok(EpubVersion::Version2_0),
967 "3.0" => return Ok(EpubVersion::Version3_0),
968 _ => {}
969 }
970 }
971
972 let spine_element = opf_element
973 .find_elements_by_name("spine")
974 .next()
975 .ok_or_else(|| EpubError::NonCanonicalFile {
976 tag: "spine".to_string(),
977 })?;
978
979 // Look for EPUB 2.x specific features
980 if spine_element.get_attr("toc").is_some() {
981 return Ok(EpubVersion::Version2_0);
982 }
983
984 let manifest_element = opf_element
985 .find_elements_by_name("manifest")
986 .next()
987 .ok_or_else(|| EpubError::NonCanonicalFile {
988 tag: "manifest".to_string(),
989 })?;
990
991 // Look for EPUB 3.x specific features
992 manifest_element
993 .children()
994 .find_map(|element| {
995 if let Some(id) = element.get_attr("id") {
996 if id.eq("nav") {
997 return Some(EpubVersion::Version3_0);
998 }
999 }
1000
1001 None
1002 })
1003 .ok_or(EpubError::UnrecognizedEpubVersion)
1004 }
1005
1006 /// Parse metadata elements under the Dublin Core namespace
1007 ///
1008 /// This function handles the `<metadata>` Dublin Core element in the OPF file (namespace
1009 /// is "http://purl.org/dc/elements/1.1/"). These elements usually contain the basic
1010 /// information of the publication, such as title, author, publication date, etc.
1011 ///
1012 /// # Notes
1013 /// - In EPUB 3.0, granular information is handled by separate '<meta>' elements and 'refines' attributes
1014 /// - All text content is normalized by whitespace
1015 #[inline]
1016 fn parse_dc_metadata(
1017 &mut self,
1018 element: &XmlElement,
1019 metadata: &mut Vec<MetadataItem>,
1020 // refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1021 ) -> Result<(), EpubError> {
1022 let id = element.get_attr("id");
1023 let lang = element.get_attr("lang");
1024 let property = element.name.clone();
1025 let value = element.text().normalize_whitespace();
1026
1027 let refined = match self.version {
1028 // In EPUB 2.0, supplementary metadata (refinements) are represented
1029 // through other attribute data pairs of the tag.
1030 EpubVersion::Version2_0 => element
1031 .attributes
1032 .iter()
1033 .map(|(name, value)| {
1034 let property = name.to_string();
1035 let value = value.to_string().normalize_whitespace();
1036
1037 MetadataRefinement {
1038 refines: id.clone().unwrap(),
1039 property,
1040 value,
1041 lang: None,
1042 scheme: None,
1043 }
1044 })
1045 .collect(),
1046 EpubVersion::Version3_0 => vec![],
1047 };
1048
1049 metadata.push(MetadataItem {
1050 id,
1051 property,
1052 value,
1053 lang,
1054 refined,
1055 });
1056
1057 Ok(())
1058 }
1059
1060 /// Parse metadata elements under the OPF namespace
1061 ///
1062 /// This function handles the `<metadata>` OPF element in the OPF file (namespace
1063 /// is "http://www.idpf.org/2007/opf"). These elements include '<meta>' and '<link>',
1064 /// which are used to provide extended metadata and links to external resources for EPUB publications.
1065 ///
1066 /// # Notes
1067 /// - The function is only responsible for distribution processing, and the
1068 /// specific parsing logic is implemented in the dedicated function
1069 /// - All parsing results are added directly to the incoming collection and no new collection is returned
1070 #[inline]
1071 fn parse_opf_metadata(
1072 &mut self,
1073 element: &XmlElement,
1074 metadata: &mut Vec<MetadataItem>,
1075 metadata_link: &mut Vec<MetadataLinkItem>,
1076 refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1077 ) -> Result<(), EpubError> {
1078 match element.name.as_str() {
1079 "meta" => self.parse_meta_element(element, metadata, refinements),
1080 "link" => self.parse_link_element(element, metadata_link),
1081 _ => Ok(()),
1082 }
1083 }
1084
1085 #[inline]
1086 fn parse_meta_element(
1087 &mut self,
1088 element: &XmlElement,
1089 metadata: &mut Vec<MetadataItem>,
1090 refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1091 ) -> Result<(), EpubError> {
1092 match self.version {
1093 EpubVersion::Version2_0 => {
1094 let property =
1095 element
1096 .get_attr("name")
1097 .ok_or_else(|| EpubError::NonCanonicalFile {
1098 tag: element.tag_name(),
1099 })?;
1100 let value = element
1101 .get_attr("content")
1102 .ok_or_else(|| EpubError::MissingRequiredAttribute {
1103 tag: element.tag_name(),
1104 attribute: "content".to_string(),
1105 })?
1106 .normalize_whitespace();
1107
1108 metadata.push(MetadataItem {
1109 id: None,
1110 property,
1111 value,
1112 lang: None,
1113 refined: vec![],
1114 });
1115 }
1116
1117 EpubVersion::Version3_0 => {
1118 let property = element.get_attr("property").ok_or_else(|| {
1119 EpubError::MissingRequiredAttribute {
1120 tag: element.tag_name(),
1121 attribute: "property".to_string(),
1122 }
1123 })?;
1124 let value = element.text().normalize_whitespace();
1125 let lang = element.get_attr("lang");
1126
1127 if let Some(refines) = element.get_attr("refines") {
1128 let id = refines.strip_prefix("#").unwrap_or(&refines).to_string();
1129 let scheme = element.get_attr("scheme");
1130 let refinement = MetadataRefinement {
1131 refines: id.clone(),
1132 property,
1133 value,
1134 lang,
1135 scheme,
1136 };
1137
1138 if let Some(refinements) = refinements.get_mut(&id) {
1139 refinements.push(refinement);
1140 } else {
1141 refinements.insert(id, vec![refinement]);
1142 }
1143 } else {
1144 let id = element.get_attr("id");
1145 let item = MetadataItem {
1146 id,
1147 property,
1148 value,
1149 lang,
1150 refined: vec![],
1151 };
1152
1153 metadata.push(item);
1154 };
1155 }
1156 }
1157 Ok(())
1158 }
1159
1160 #[inline]
1161 fn parse_link_element(
1162 &mut self,
1163 element: &XmlElement,
1164 metadata_link: &mut Vec<MetadataLinkItem>,
1165 ) -> Result<(), EpubError> {
1166 let href = element
1167 .get_attr("href")
1168 .ok_or_else(|| EpubError::MissingRequiredAttribute {
1169 tag: element.tag_name(),
1170 attribute: "href".to_string(),
1171 })?;
1172 let rel = element
1173 .get_attr("rel")
1174 .ok_or_else(|| EpubError::MissingRequiredAttribute {
1175 tag: element.tag_name(),
1176 attribute: "rel".to_string(),
1177 })?;
1178 let hreflang = element.get_attr("hreflang");
1179 let id = element.get_attr("id");
1180 let mime = element.get_attr("media-type");
1181 let properties = element.get_attr("properties");
1182
1183 metadata_link.push(MetadataLinkItem {
1184 href,
1185 rel,
1186 hreflang,
1187 id,
1188 mime,
1189 properties,
1190 refines: None,
1191 });
1192 Ok(())
1193 }
1194
1195 /// Recursively parse NCX navigation points from navMap or nested navPoint elements
1196 ///
1197 /// This function parses the hierarchical navigation structure defined in NCX files
1198 /// for EPUB 2.x documents. It handles nested navPoint elements to build a complete
1199 /// tree representation of the publication's table of contents.
1200 fn parse_nav_points(&self, parent_element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1201 let mut nav_points = Vec::new();
1202 for nav_point in parent_element.find_children_by_name("navPoint") {
1203 let label = match nav_point.find_children_by_name("navLabel").next() {
1204 Some(element) => element.text(),
1205 None => String::new(),
1206 };
1207
1208 let content = nav_point
1209 .find_children_by_name("content")
1210 .next()
1211 .map(|element| PathBuf::from(element.text()));
1212
1213 let play_order = nav_point
1214 .get_attr("playOrder")
1215 .and_then(|order| order.parse::<usize>().ok());
1216
1217 let children = self.parse_nav_points(nav_point)?;
1218
1219 nav_points.push(NavPoint {
1220 label,
1221 content,
1222 play_order,
1223 children,
1224 });
1225 }
1226
1227 nav_points.sort();
1228 Ok(nav_points)
1229 }
1230
1231 /// Recursively parses directory list structures
1232 ///
1233 /// This function recursively parses HTML navigation list structures,
1234 /// converting `<ol>` and `<li>` elements into NavPoint structures.
1235 /// Multi-level nested directory structures are supported.
1236 fn parse_catalog_list(&self, element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1237 let mut catalog = Vec::new();
1238 for item in element.children() {
1239 if item.tag_name() != "li" {
1240 return Err(EpubError::NonCanonicalFile {
1241 tag: "li".to_string(),
1242 });
1243 }
1244
1245 let title_element = item
1246 .find_children_by_names(&["span", "a"])
1247 .next()
1248 .ok_or_else(|| EpubError::NonCanonicalFile {
1249 tag: "span/a".to_string(),
1250 })?;
1251 let content_href = title_element.get_attr("href").map(PathBuf::from);
1252 let sub_list = if let Some(list) = item.find_children_by_name("ol").next() {
1253 self.parse_catalog_list(list)?
1254 } else {
1255 vec![]
1256 };
1257
1258 catalog.push(NavPoint {
1259 label: title_element.text(),
1260 content: content_href,
1261 children: sub_list,
1262 play_order: None,
1263 });
1264 }
1265
1266 Ok(catalog)
1267 }
1268
1269 /// Converts relative paths in the manifest to normalized paths
1270 /// relative to the EPUB root directory
1271 ///
1272 /// This function processes the href attribute of resources in the EPUB
1273 /// manifest and converts it to a normalized path representation.
1274 /// It handles three types of paths:
1275 /// - Relative paths starting with `../` (checks if they exceed the EPUB package scope)
1276 /// - Absolute paths starting with `/` (relative to the EPUB root directory)
1277 /// - Other relative paths (relative to the directory containing the OPF file)
1278 ///
1279 /// # Parameters
1280 /// - `path`: The href attribute value of the resource in the manifest
1281 ///
1282 /// # Return
1283 /// - `Ok(PathBuf)`: The parsed normalized path
1284 /// - `Err(EpubError)`: Relative link leakage
1285 #[inline]
1286 fn normalize_manifest_path(&self, path: &str) -> Result<PathBuf, EpubError> {
1287 let mut path = if path.starts_with("../") {
1288 let mut current_dir = self.epub_path.join(&self.package_path);
1289 current_dir.pop();
1290
1291 check_realtive_link_leakage(self.epub_path.clone(), current_dir, path)
1292 .map(PathBuf::from)
1293 .ok_or_else(|| EpubError::RealtiveLinkLeakage {
1294 path: path.to_string(),
1295 })?
1296 } else if let Some(path) = path.strip_prefix("/") {
1297 PathBuf::from(path.to_string())
1298 } else {
1299 self.base_path.join(path)
1300 };
1301
1302 #[cfg(windows)]
1303 {
1304 path = PathBuf::from(path.to_string_lossy().replace('\\', "/"));
1305 }
1306
1307 Ok(path)
1308 }
1309
1310 /// Verify the fallback chain of all manifest items
1311 ///
1312 /// This function iterates through all manifest items with the fallback
1313 /// attribute and verifies the validity of their fallback chains, including checking:
1314 /// - Whether circular references exist
1315 /// - Whether the fallback resource exists in the manifest
1316 ///
1317 /// # Notes
1318 /// If an invalid fallback chain is found, a warning log will be logged
1319 /// but the processing flow will not be interrupted.
1320 fn validate_fallback_chains(&self) {
1321 for (id, item) in &self.manifest {
1322 if item.fallback.is_none() {
1323 continue;
1324 }
1325
1326 let mut fallback_chain = Vec::new();
1327 if let Err(msg) = self.validate_fallback_chain(id, &mut fallback_chain) {
1328 warn!("Invalid fallback chain for item {}: {}", id, msg);
1329 }
1330 }
1331 }
1332
1333 /// Recursively verify the validity of a single fallback chain
1334 ///
1335 /// This function recursively traces the fallback chain to check for the following issues:
1336 /// - Circular reference
1337 /// - The referenced fallback resource does not exist
1338 ///
1339 /// # Parameters
1340 /// - `manifest_id`: The id of the manifest item currently being verified
1341 /// - `fallback_chain`: The visited fallback chain paths used to detect circular references
1342 ///
1343 /// # Return
1344 /// - `Ok(())`: The fallback chain is valid
1345 /// - `Err(String)`: A string containing error information
1346 fn validate_fallback_chain(
1347 &self,
1348 manifest_id: &str,
1349 fallback_chain: &mut Vec<String>,
1350 ) -> Result<(), String> {
1351 if fallback_chain.contains(&manifest_id.to_string()) {
1352 fallback_chain.push(manifest_id.to_string());
1353
1354 return Err(format!(
1355 "Circular reference detected in fallback chain for {}",
1356 fallback_chain.join("->")
1357 ));
1358 }
1359
1360 // Get the current item; its existence can be ensured based on the calling context.
1361 let item = self.manifest.get(manifest_id).unwrap();
1362
1363 if let Some(fallback_id) = &item.fallback {
1364 if !self.manifest.contains_key(fallback_id) {
1365 return Err(format!(
1366 "Fallback resource {} does not exist in manifest",
1367 fallback_id
1368 ));
1369 }
1370
1371 fallback_chain.push(manifest_id.to_string());
1372 self.validate_fallback_chain(fallback_id, fallback_chain)
1373 } else {
1374 // The end of the fallback chain
1375 Ok(())
1376 }
1377 }
1378
1379 /// Checks if a resource at the specified path is an encrypted file
1380 ///
1381 /// This function queries whether a specific resource path is marked as an encrypted
1382 /// file in the EPUB encryption information. It checks the encrypted data stored in
1383 /// `self.encryption`, looking for an entry that matches the given path.
1384 ///
1385 /// # Parameters
1386 /// - `path`: The path of the resource to check
1387 ///
1388 /// # Return
1389 /// - `Some(String)`: The encryption method used for the resource
1390 /// - `None`: The resource is not encrypted
1391 fn is_encryption_file(&self, path: &str) -> Option<String> {
1392 self.encryption.as_ref().and_then(|encryptions| {
1393 encryptions
1394 .iter()
1395 .find(|encryption| encryption.data == path)
1396 .map(|encryption| encryption.method.clone())
1397 })
1398 }
1399
1400 /// Automatically decrypts encrypted resource data
1401 ///
1402 /// Automatically decrypts data based on the provided encryption method.
1403 /// This function supports various encryption methods defined by the EPUB
1404 /// specification, including font obfuscation and the XML encryption standard.
1405 ///
1406 /// # Parameters
1407 /// - `method`: The encryption method used for the resource
1408 /// - `data`: The encrypted resource data
1409 ///
1410 /// # Return
1411 /// - `Ok(Vec<u8>)`: The decrypted resource data
1412 /// - `Err(EpubError)`: Unsupported encryption method
1413 ///
1414 /// # Supported Encryption Methods
1415 /// - IDPF font obfuscation: `http://www.idpf.org/2008/embedding`
1416 /// - Adobe font obfuscation: `http://ns.adobe.com/pdf/enc#RC`
1417 #[inline]
1418 fn auto_dencrypt(&self, method: &str, data: &mut [u8]) -> Result<Vec<u8>, EpubError> {
1419 match method {
1420 "http://www.idpf.org/2008/embedding" => {
1421 Ok(idpf_font_dencryption(data, &self.unique_identifier))
1422 }
1423 "http://ns.adobe.com/pdf/enc#RC" => {
1424 Ok(adobe_font_dencryption(data, &self.unique_identifier))
1425 }
1426 _ => Err(EpubError::UnsupportedEncryptedMethod {
1427 method: method.to_string(),
1428 }),
1429 }
1430 }
1431}
1432
1433impl EpubDoc<BufReader<File>> {
1434 /// Creates a new EPUB document instance
1435 ///
1436 /// This function is a convenience constructor for `EpubDoc`,
1437 /// used to create an EPUB parser instance directly from a file path.
1438 ///
1439 /// # Parameters
1440 /// - `path`: The path to the EPUB file
1441 ///
1442 /// # Return
1443 /// - `Ok(EpubDoc)`: The created EPUB document instance
1444 /// - `Err(EpubError)`: An error occurred during initialization
1445 pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, EpubError> {
1446 let file = File::open(&path).map_err(EpubError::from)?;
1447 let path = canonicalize(path)?;
1448
1449 Self::from_reader(BufReader::new(file), path)
1450 }
1451}
1452
1453#[cfg(test)]
1454mod tests {
1455 use std::path::Path;
1456
1457 use crate::epub::EpubDoc;
1458
1459 /// Section 3.3 package documents
1460 mod package_documents_tests {
1461 use std::path::Path;
1462
1463 use crate::epub::{EpubDoc, EpubVersion};
1464
1465 /// ID: pkg-collections-unknown
1466 ///
1467 /// The package document contains a collection with an unknown role. The reading system must open the EPUB successfully.
1468 #[test]
1469 fn test_pkg_collections_unknown() {
1470 let epub_file = Path::new("./test_case/pkg-collections-unknown.epub");
1471 let doc = EpubDoc::new(epub_file);
1472 assert!(doc.is_ok());
1473 }
1474
1475 /// ID: pkg-creator-order
1476 ///
1477 /// Several creators are listed in the package document. The reading system must not display them out of order (but it may display only the first).
1478 #[test]
1479 fn test_pkg_creator_order() {
1480 let epub_file = Path::new("./test_case/pkg-creator-order.epub");
1481 let doc = EpubDoc::new(epub_file);
1482 assert!(doc.is_ok());
1483
1484 let doc = doc.unwrap();
1485 let creators = doc.get_metadata_value("creator");
1486 assert!(creators.is_some());
1487
1488 let creators = creators.unwrap();
1489 assert_eq!(creators.len(), 5);
1490 assert_eq!(
1491 creators,
1492 vec![
1493 "Dave Cramer",
1494 "Wendy Reid",
1495 "Dan Lazin",
1496 "Ivan Herman",
1497 "Brady Duga",
1498 ]
1499 );
1500 }
1501
1502 /// ID: pkg-manifest-unknown
1503 ///
1504 /// The package document contains a manifest item with unknown properties. The reading system must open the EPUB successfully.
1505 #[test]
1506 fn test_pkg_manifest_order() {
1507 let epub_file = Path::new("./test_case/pkg-manifest-unknown.epub");
1508 let doc = EpubDoc::new(epub_file);
1509 assert!(doc.is_ok());
1510
1511 let mut doc = doc.unwrap();
1512 assert_eq!(doc.manifest.len(), 2);
1513 assert!(doc.get_manifest_item("nav").is_ok());
1514 assert!(doc.get_manifest_item("content_001").is_ok());
1515 assert!(doc.get_manifest_item("content_002").is_err());
1516 }
1517
1518 /// ID: pkg-meta-unknown
1519 ///
1520 /// The package document contains a meta tag with an unknown property. The reading system must open the EPUB successfully.
1521 #[test]
1522 fn test_pkg_meta_unknown() {
1523 let epub_file = Path::new("./test_case/pkg-meta-unknown.epub");
1524 let doc = EpubDoc::new(epub_file);
1525 assert!(doc.is_ok());
1526
1527 let doc = doc.unwrap();
1528 let value = doc.get_metadata_value("dcterms:isReferencedBy");
1529 assert!(value.is_some());
1530 let value = value.unwrap();
1531 assert_eq!(value.len(), 1);
1532 assert_eq!(
1533 value,
1534 vec!["https://www.w3.org/TR/epub-rs/#confreq-rs-pkg-meta-unknown"]
1535 );
1536
1537 let value = doc.get_metadata_value("dcterms:modified");
1538 assert!(value.is_some());
1539 let value = value.unwrap();
1540 assert_eq!(value.len(), 1);
1541 assert_eq!(value, vec!["2021-01-11T00:00:00Z"]);
1542
1543 let value = doc.get_metadata_value("dcterms:title");
1544 assert!(value.is_none());
1545 }
1546
1547 /// ID: pkg-meta-whitespace
1548 ///
1549 /// The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases.
1550 #[test]
1551 fn test_pkg_meta_white_space() {
1552 let epub_file = Path::new("./test_case/pkg-meta-whitespace.epub");
1553 let doc = EpubDoc::new(epub_file);
1554 assert!(doc.is_ok());
1555
1556 let doc = doc.unwrap();
1557 let value = doc.get_metadata_value("creator");
1558 assert!(value.is_some());
1559 let value = value.unwrap();
1560 assert_eq!(value.len(), 1);
1561 assert_eq!(value, vec!["Dave Cramer"]);
1562
1563 let value = doc.get_metadata_value("description");
1564 assert!(value.is_some());
1565 let value = value.unwrap();
1566 assert_eq!(value.len(), 1);
1567 assert_eq!(
1568 value,
1569 vec![
1570 "The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases."
1571 ]
1572 );
1573 }
1574
1575 /// ID: pkg-spine-duplicate-item-hyperlink
1576 ///
1577 /// The spine contains several references to the same content document. The reading system must move to the position of the first duplicate in the reading order when following a hyperlink.
1578 #[test]
1579 fn test_pkg_spine_duplicate_item_hyperlink() {
1580 let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-hyperlink.epub");
1581 let doc = EpubDoc::new(epub_file);
1582 assert!(doc.is_ok());
1583
1584 let mut doc = doc.unwrap();
1585 assert_eq!(doc.spine.len(), 4);
1586 assert_eq!(
1587 doc.navigate_by_spine_index(0).unwrap(),
1588 doc.get_manifest_item("content_001").unwrap()
1589 );
1590 assert_eq!(
1591 doc.navigate_by_spine_index(1).unwrap(),
1592 doc.get_manifest_item("content_002").unwrap()
1593 );
1594 assert_eq!(
1595 doc.navigate_by_spine_index(2).unwrap(),
1596 doc.get_manifest_item("content_002").unwrap()
1597 );
1598 assert_eq!(
1599 doc.navigate_by_spine_index(3).unwrap(),
1600 doc.get_manifest_item("content_002").unwrap()
1601 );
1602 }
1603
1604 /// ID: pkg-spine-duplicate-item-rendering
1605 ///
1606 /// The spine contains several references to the same content document. The reading system must not skip the duplicates when rendering the reading order.
1607 #[test]
1608 fn test_pkg_spine_duplicate_item_rendering() {
1609 let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-rendering.epub");
1610 let doc = EpubDoc::new(epub_file);
1611 assert!(doc.is_ok());
1612
1613 let mut doc = doc.unwrap();
1614 assert_eq!(doc.spine.len(), 4);
1615
1616 let result = doc.spine_prev();
1617 assert!(result.is_none());
1618
1619 let result = doc.spine_next();
1620 assert!(result.is_some());
1621
1622 doc.spine_next();
1623 doc.spine_next();
1624 let result = doc.spine_next();
1625 assert!(result.is_none());
1626 }
1627
1628 /// ID: pkg-spine-nonlinear-activation
1629 ///
1630 /// An itemref in the spine is marked as non-linear. Although it (possibly) cannot be accessed through the table of contents, it can be reached from a link in the XHTML content.
1631 #[test]
1632 fn test_pkg_spine_nonlinear_activation() {
1633 let epub_file = Path::new("./test_case/pkg-spine-nonlinear-activation.epub");
1634 let doc = EpubDoc::new(epub_file);
1635 assert!(doc.is_ok());
1636
1637 let mut doc = doc.unwrap();
1638 assert!(doc.spine_prev().is_none());
1639 assert!(doc.spine_next().is_none());
1640
1641 assert!(doc.navigate_by_spine_index(1).is_some());
1642 assert!(doc.spine_prev().is_none());
1643 assert!(doc.spine_next().is_none());
1644 }
1645
1646 /// ID: pkg-spine-order
1647 ///
1648 /// Basic test of whether a reading system can display spine items in the correct order. The test fails if the reading system presents content in the order in which the file names sort, or if it presents files in manifest order rather than spine order.
1649 #[test]
1650 fn test_pkg_spine_order() {
1651 let epub_file = Path::new("./test_case/pkg-spine-order.epub");
1652 let doc = EpubDoc::new(epub_file);
1653 assert!(doc.is_ok());
1654
1655 let doc = doc.unwrap();
1656 assert_eq!(doc.spine.len(), 4);
1657 assert_eq!(
1658 doc.spine
1659 .iter()
1660 .map(|item| item.idref.clone())
1661 .collect::<Vec<String>>(),
1662 vec![
1663 "d-content_001",
1664 "c-content_002",
1665 "b-content_003",
1666 "a-content_004",
1667 ]
1668 );
1669 }
1670
1671 /// ID: pkg-spine-order-svg
1672 ///
1673 /// Basic test of whether a reading system can display SVG spine items in the correct order.
1674 #[test]
1675 fn test_spine_order_svg() {
1676 let epub_file = Path::new("./test_case/pkg-spine-order-svg.epub");
1677 let doc = EpubDoc::new(epub_file);
1678 assert!(doc.is_ok());
1679
1680 let mut doc = doc.unwrap();
1681 assert_eq!(doc.spine.len(), 4);
1682
1683 loop {
1684 if let Some(spine) = doc.spine_next() {
1685 let idref = doc.spine[doc.current_spine_index].idref.clone();
1686 let resource = doc.get_manifest_item(&idref);
1687 assert!(resource.is_ok());
1688
1689 let resource = resource.unwrap();
1690 assert_eq!(spine, resource);
1691 } else {
1692 break;
1693 }
1694 }
1695
1696 assert_eq!(doc.current_spine_index, 3);
1697 }
1698
1699 /// ID: pkg-spine-unknown
1700 ///
1701 /// The package document contains a spine item with unknown properties. The reading system must open the EPUB successfully.
1702 #[test]
1703 fn test_pkg_spine_unknown() {
1704 let epub_file = Path::new("./test_case/pkg-spine-unknown.epub");
1705 let doc = EpubDoc::new(epub_file);
1706 assert!(doc.is_ok());
1707
1708 let doc = doc.unwrap();
1709 assert_eq!(doc.spine.len(), 1);
1710 assert_eq!(doc.spine[0].idref, "content_001");
1711 assert_eq!(doc.spine[0].id, None);
1712 assert_eq!(doc.spine[0].linear, true);
1713 assert_eq!(doc.spine[0].properties, Some("untrustworthy".to_string()));
1714 }
1715
1716 /// ID: pkg-title-order
1717 ///
1718 /// Several titles are listed in the package document. The reading system must use the first title (and whether to use other titles is not defined).
1719 #[test]
1720 fn test_pkg_title_order() {
1721 let epub_file = Path::new("./test_case/pkg-title-order.epub");
1722 let doc = EpubDoc::new(epub_file);
1723 assert!(doc.is_ok());
1724
1725 let doc = doc.unwrap();
1726 let title_list = doc.get_title();
1727 assert!(title_list.is_ok());
1728
1729 let title_list = title_list.unwrap();
1730 assert_eq!(title_list.len(), 6);
1731 assert_eq!(
1732 title_list,
1733 vec![
1734 "pkg-title-order",
1735 "This title must not display first",
1736 "Also, this title must not display first",
1737 "This title also must not display first",
1738 "This title must also not display first",
1739 "This title must not display first, also",
1740 ]
1741 );
1742 }
1743
1744 /// ID: pkg-unique-id
1745 ///
1746 /// The package document's dc:identifier is identical across two publications. The reading system should display both publications independently.
1747 #[test]
1748 fn test_pkg_unique_id() {
1749 let epub_file = Path::new("./test_case/pkg-unique-id.epub");
1750 let doc_1 = EpubDoc::new(epub_file);
1751 assert!(doc_1.is_ok());
1752
1753 let epub_file = Path::new("./test_case/pkg-unique-id_duplicate.epub");
1754 let doc_2 = EpubDoc::new(epub_file);
1755 assert!(doc_2.is_ok());
1756
1757 let doc_1 = doc_1.unwrap();
1758 let doc_2 = doc_2.unwrap();
1759
1760 assert_eq!(
1761 doc_1.get_identifier().unwrap(),
1762 doc_2.get_identifier().unwrap()
1763 );
1764 assert_eq!(doc_1.unique_identifier, "pkg-unique-id");
1765 assert_eq!(doc_2.unique_identifier, "pkg-unique-id");
1766 }
1767
1768 /// ID: pkg-version-backward
1769 ///
1770 /// “Reading Systems MUST attempt to process an EPUB Publication whose Package Document version attribute is less than "3.0"”. This is an EPUB with package version attribute set to "0", to see if a reading system will open it.
1771 #[test]
1772 fn test_pkg_version_backward() {
1773 let epub_file = Path::new("./test_case/pkg-version-backward.epub");
1774 let doc = EpubDoc::new(epub_file);
1775 assert!(doc.is_ok());
1776
1777 let doc = doc.unwrap();
1778 assert_eq!(doc.version, EpubVersion::Version3_0);
1779 }
1780
1781 /// ID: pkg-linked-records
1782 ///
1783 /// Reading System must process and display the title and creator metadata from the package document. An ONIX 3.0 format linked metadata record exists, but contains neither title nor creator metadata.
1784 #[test]
1785 fn test_pkg_linked_records() {
1786 let epub_file = Path::new("./test_case/pkg-linked-records.epub");
1787 let doc = EpubDoc::new(epub_file);
1788 assert!(doc.is_ok());
1789
1790 let doc = doc.unwrap();
1791 assert_eq!(doc.metadata_link.len(), 3);
1792
1793 let item = doc.metadata_link.iter().find(|&item| {
1794 if let Some(properties) = &item.properties {
1795 properties.eq("onix")
1796 } else {
1797 false
1798 }
1799 });
1800 assert!(item.is_some());
1801 }
1802
1803 /// ID: pkg-manifest-unlisted-resource
1804 ///
1805 /// The XHTML content references an image that does not appear in the manifest. The image should not be shown.
1806 #[test]
1807 fn test_pkg_manifest_unlisted_resource() {
1808 let epub_file = Path::new("./test_case/pkg-manifest-unlisted-resource.epub");
1809 let doc = EpubDoc::new(epub_file);
1810 assert!(doc.is_ok());
1811
1812 let mut doc = doc.unwrap();
1813 assert!(
1814 doc.get_manifest_item_by_path("EPUB/content_001.xhtml")
1815 .is_ok()
1816 );
1817
1818 assert!(doc.get_manifest_item_by_path("EPUB/red.png").is_err());
1819 let err = doc.get_manifest_item_by_path("EPUB/red.png").unwrap_err();
1820 assert_eq!(
1821 err.to_string(),
1822 "Resource not found: Unable to find resource from \"EPUB/red.png\"."
1823 );
1824 }
1825 }
1826
1827 /// Section 3.4 manifest fallbacks
1828 ///
1829 /// The tests under this module seem to favor the reading system rather than the EPUB format itself
1830 mod manifest_fallbacks_tests {
1831 use std::path::Path;
1832
1833 use crate::epub::EpubDoc;
1834
/// ID: pub-foreign_bad-fallback
///
/// This is a test of manifest fallbacks where both the spine item and the fallback are
/// likely to be unsupported. The spine item is a DMG, with a fallback to a PSD file.
/// Reading systems may raise an error on the ingestion workflow.
#[test]
fn test_pub_foreign_bad_fallback() {
    let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open pub-foreign_bad-fallback.epub");

    // Both manifest items can be fetched directly by id.
    assert!(doc.get_manifest_item("content_001").is_ok());
    assert!(doc.get_manifest_item("bar").is_ok());

    // When only XHTML is accepted, the lookup must end in an error.
    let err = doc
        .get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
        .unwrap_err();
    assert_eq!(
        err.to_string(),
        "No supported file format: The fallback resource does not contain the file format you support."
    );
}
1855
/// ID: pub-foreign_image
///
/// An HTML content file contains a PSD image, with a manifest fallback to a PNG image.
/// This tests fallbacks for resources that are not in the spine.
#[test]
fn test_pub_foreign_image() {
    let epub_file = Path::new("./test_case/pub-foreign_image.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open pub-foreign_image.epub");

    // PNG is among the accepted formats, so the lookup must resolve to the PNG resource.
    let (_, mime) = doc
        .get_manifest_item_with_fallback(
            "image-tiff",
            vec!["image/png", "application/xhtml+xml"],
        )
        .expect("a PNG resource should be returned for `image-tiff`");
    assert_eq!(mime, "image/png");
}
1875
/// ID: pub-foreign_json-spine
///
/// This EPUB uses a JSON content file in the spine, with a manifest fallback to an HTML
/// document. If the reading system does not support JSON, it should display the HTML.
#[test]
fn test_pub_foreign_json_spine() {
    let epub_file = Path::new("./test_case/pub-foreign_json-spine.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open pub-foreign_json-spine.epub");

    // JSON is accepted: the JSON resource is expected.
    let (_, mime) = doc
        .get_manifest_item_with_fallback(
            "content_primary",
            vec!["application/xhtml+xml", "application/json"],
        )
        .expect("the JSON resource should be returned when JSON is accepted");
    assert_eq!(mime, "application/json");

    // Only XHTML is accepted: the lookup must resolve to the XHTML resource instead.
    let (_, mime) = doc
        .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"])
        .expect("the XHTML resource should be returned when JSON is not accepted");
    assert_eq!(mime, "application/xhtml+xml");
}
1900
/// ID: pub-foreign_xml-spine
///
/// This EPUB uses an ordinary XML content file with mimetype application/xml in the spine,
/// with a manifest fallback to an HTML document. If the reading system does not support
/// XML, it should display the HTML.
#[test]
fn test_pub_foreign_xml_spine() {
    let opened = EpubDoc::new(Path::new("./test_case/pub-foreign_xml-spine.epub"));
    assert!(opened.is_ok());
    let mut book = opened.unwrap();

    // Each case pairs the accepted media types with the media type that must be returned.
    let cases = [
        (
            vec!["application/xhtml+xml", "application/xml"],
            "application/xml",
        ),
        (vec!["application/xhtml+xml"], "application/xhtml+xml"),
    ];

    for (accepted, expected_mime) in cases {
        let outcome = book.get_manifest_item_with_fallback("content_primary", accepted);
        assert!(outcome.is_ok());
        let (_, mime) = outcome.unwrap();
        assert_eq!(mime, expected_mime);
    }
}
1925
/// ID: pub-foreign_xml-suffix-spine
///
/// This EPUB uses a custom XML content file with mimetype application/dtc+xml in the spine,
/// with a manifest fallback to an HTML document. If the reading system does not support
/// XML, it should display the HTML.
#[test]
fn test_pub_foreign_xml_suffix_spine() {
    let epub_file = Path::new("./test_case/pub-foreign_xml-suffix-spine.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc =
        EpubDoc::new(epub_file).expect("failed to open pub-foreign_xml-suffix-spine.epub");

    // The custom XML type is accepted: the `application/dtc+xml` resource is expected.
    let (_, mime) = doc
        .get_manifest_item_with_fallback(
            "content_primary",
            vec!["application/xhtml+xml", "application/dtc+xml"],
        )
        .expect("the dtc+xml resource should be returned when that type is accepted");
    assert_eq!(mime, "application/dtc+xml");

    // Only XHTML is accepted: the lookup must resolve to the XHTML resource instead.
    let (_, mime) = doc
        .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"])
        .expect("the XHTML resource should be returned when dtc+xml is not accepted");
    assert_eq!(mime, "application/xhtml+xml");
}
1950 }
1951
1952 /// Section 3.9 open container format
1953 mod open_container_format_tests {
1954 use std::{cmp::min, io::Read, path::Path};
1955
1956 use sha1::{Digest, Sha1};
1957
1958 use crate::epub::EpubDoc;
1959
/// ID: ocf-metainf-inc
///
/// An extra configuration file, not in the reserved files' list, is added to the META-INF
/// folder; this file must be ignored.
///
/// The test only checks that the publication still opens successfully.
#[test]
fn test_ocf_metainf_inc() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-metainf-inc.epub"));
    assert!(opened.is_ok());
}
1969
/// ID: ocf-metainf-manifest
///
/// An ancillary manifest file, containing an extra spine item, is present in the META-INF
/// directory; this extra item must be ignored by the reading system.
///
/// The test only checks that the publication still opens successfully.
#[test]
fn test_ocf_metainf_manifest() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-metainf-manifest.epub"));
    assert!(opened.is_ok());
}
1979
/// ID: ocf-package_arbitrary
///
/// The EPUB contains three valid package files and three corresponding sets of content
/// documents, but only one of the packages, in an unusual subdirectory, is referenced by
/// the container.xml file. The reading system must use this package.
#[test]
fn test_ocf_package_arbitrary() {
    let epub_file = Path::new("./test_case/ocf-package_arbitrary.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let doc = EpubDoc::new(epub_file).expect("failed to open ocf-package_arbitrary.epub");

    // The package selected must be the one referenced by container.xml.
    assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
}
1992
/// ID: ocf-package_multiple
///
/// The EPUB contains three valid package files and three corresponding sets of content
/// documents, all referenced by the container.xml file. The reading system must use the
/// first package.
#[test]
fn test_ocf_package_multiple() {
    let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let doc = EpubDoc::new(epub_file).expect("failed to open ocf-package_multiple.epub");

    // Both the package document path and its base directory must match the expected package.
    assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
    assert_eq!(doc.base_path, Path::new("FOO/BAR"));
}
2006
/// ID: ocf-url_link-leaking-relative
///
/// Use a relative link with several double-dot path segments from the content to a
/// photograph. The folder hierarchy containing the photograph starts at the root level;
/// the relative image reference exceeds depth of hierarchy.
#[test]
fn test_ocf_url_link_leaking_relative() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-url_link-leaking-relative.epub"));

    // `None` here means the document opened successfully, which fails the comparison below.
    let message = opened.err().map(|err| err.to_string());
    assert_eq!(
        message.as_deref(),
        Some(
            "Relative link leakage: Path \"../../../../media/imgs/monastery.jpg\" is out of container range."
        )
    );
}
2022
/// ID: ocf-url_link-path-absolute
///
/// Use a path-absolute link, i.e., beginning with a leading slash, from the content to a
/// photograph. The folder hierarchy containing the photograph starts at the root level.
#[test]
fn test_ocf_url_link_path_absolute() {
    let epub_file = Path::new("./test_case/ocf-url_link-path-absolute.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let doc = EpubDoc::new(epub_file).expect("failed to open ocf-url_link-path-absolute.epub");

    let resource = doc
        .manifest
        .get("photo")
        .expect("manifest should contain an item with id `photo`");
    // The stored path is expected without a leading slash.
    assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
}
2036
/// ID: ocf-url_link-relative
///
/// A simple relative link from the content to a photograph. The folder hierarchy
/// containing the photograph starts at the root level.
#[test]
fn test_ocf_url_link_relative() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-url_link-relative.epub"));
    assert!(opened.is_ok());
    let book = opened.unwrap();

    // The `photo` manifest item must carry the expected path.
    let photo = book.manifest.get("photo").unwrap();
    let expected = Path::new("media/imgs/monastery.jpg");
    assert_eq!(photo.path, expected);
}
2050
/// ID: ocf-url_manifest
///
/// The manifest refers to an XHTML file in an arbitrary subfolder. The reading system must
/// be able to find the content.
#[test]
fn test_ocf_url_manifest() {
    let epub_file = Path::new("./test_case/ocf-url_manifest.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open ocf-url_manifest.epub");

    // These two ids are expected to resolve.
    assert!(doc.get_manifest_item("nav").is_ok());
    assert!(doc.get_manifest_item("content_001").is_ok());
    // This id is expected to fail.
    assert!(doc.get_manifest_item("content_002").is_err());
}
2065
/// ID: ocf-url_relative
///
/// The manifest refers to an XHTML file in an arbitrary subfolder that is relative to the
/// package's own arbitrary folder. The reading system must be able to find the content.
#[test]
fn test_ocf_url_relative() {
    let epub_file = Path::new("./test_case/ocf-url_relative.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open ocf-url_relative.epub");

    assert_eq!(doc.package_path, Path::new("foo/BAR/baz.opf"));
    assert_eq!(doc.base_path, Path::new("foo/BAR"));

    // Both manifest paths are expected to start with the package folder `foo/BAR`.
    let nav = doc
        .manifest
        .get("nav")
        .expect("manifest should contain an item with id `nav`");
    assert_eq!(nav.path, Path::new("foo/BAR/nav.xhtml"));

    let content = doc
        .manifest
        .get("content_001")
        .expect("manifest should contain an item with id `content_001`");
    assert_eq!(content.path, Path::new("foo/BAR/qux/content_001.xhtml"));

    // Both items must also be retrievable by id.
    assert!(doc.get_manifest_item("nav").is_ok());
    assert!(doc.get_manifest_item("content_001").is_ok());
}
2089
/// ID: ocf-zip-comp
///
/// MUST treat any OCF ZIP container that uses compression techniques other than Deflate
/// as in error.
/// This test case does not use compression methods other than Deflate, so it cannot show
/// whether that check is effective; it only confirms the fixture opens.
#[test]
fn test_ocf_zip_comp() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-zip-comp.epub"));
    assert!(opened.is_ok());
}
2100
/// ID: ocf-zip-mult
///
/// MUST treat any OCF ZIP container that splits the content into segments as in error.
/// This test case is not a segmented OCF ZIP container, so it cannot show whether that
/// check is effective; it only confirms the fixture opens.
#[test]
fn test_ocf_zip_mult() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-zip-mult.epub"));
    assert!(opened.is_ok());
}
2111
/// ID: ocf-font_obfuscation
///
/// An obfuscated (TrueType) font should be displayed after de-obfuscation.
///
/// The test rebuilds the key from the publication's unique identifier, XORs it over the
/// start of the font file and checks that the result begins with a known font signature.
#[test]
fn test_ocf_font_obfuscation() {
    let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open ocf-font_obfuscation.epub");

    // The key is the SHA-1 digest of the unique identifier, repeated to fill 1040 bytes.
    let mut hasher = Sha1::new();
    hasher.update(doc.unique_identifier.as_bytes());
    let hash = hasher.finalize();
    let key: Vec<u8> = hash.iter().copied().cycle().take(1040).collect();

    let encryption = doc
        .encryption
        .expect("encryption data should have been parsed");
    assert_eq!(encryption.len(), 1);

    let data = &encryption[0];
    assert_eq!(data.method, "http://www.idpf.org/2008/embedding");

    // Read the whole entry in one call instead of collecting it byte by byte.
    let mut deobfuscated = Vec::new();
    doc.archive
        .by_name(&data.data)
        .expect("obfuscated font should be present in the archive")
        .read_to_end(&mut deobfuscated)
        .expect("obfuscated font should be readable");

    // Per the EPUB specification the obfuscation is applied to the font file itself, so no
    // extra decompression step is performed here; the bytes are de-obfuscated directly.
    let obfuscated_len = min(1040, deobfuscated.len());
    for (byte, mask) in deobfuscated[..obfuscated_len].iter_mut().zip(&key) {
        *byte ^= mask;
    }

    assert!(is_valid_font(&deobfuscated));
}
2155
/// ID: ocf-font_obfuscation-bis
///
/// An obfuscated (TrueType) font should not be displayed after de-obfuscation, because the
/// obfuscation used a different publication id.
///
/// The test derives a key from a deliberately wrong identifier and checks that the
/// de-obfuscated bytes do not start with a known font signature.
#[test]
fn test_ocf_font_obfuscation_bis() {
    let epub_file = Path::new("./test_case/ocf-font_obfuscation_bis.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open ocf-font_obfuscation_bis.epub");

    // The key is the SHA-1 digest of the (wrong) identifier, repeated to fill 1040 bytes.
    let wrong_unique_id = "wrong-publication-id";
    let mut hasher = Sha1::new();
    hasher.update(wrong_unique_id.as_bytes());
    let hash = hasher.finalize();
    let wrong_key: Vec<u8> = hash.iter().copied().cycle().take(1040).collect();

    let encryption = doc
        .encryption
        .expect("encryption data should have been parsed");
    assert_eq!(encryption.len(), 1);

    let data = &encryption[0];
    assert_eq!(data.method, "http://www.idpf.org/2008/embedding");

    // Read the whole entry in one call instead of collecting it byte by byte.
    let mut deobfuscated_with_wrong_key = Vec::new();
    doc.archive
        .by_name(&data.data)
        .expect("obfuscated font should be present in the archive")
        .read_to_end(&mut deobfuscated_with_wrong_key)
        .expect("obfuscated font should be readable");

    // De-obfuscate with the wrong key. `zip` stops at the shorter input, so at most the
    // first 1040 bytes (or the whole file, if it is shorter) are touched.
    for (byte, mask) in deobfuscated_with_wrong_key.iter_mut().zip(&wrong_key) {
        *byte ^= mask;
    }

    assert!(!is_valid_font(&deobfuscated_with_wrong_key));
}
2199
/// Returns `true` when `data` begins with one of the accepted 4-byte font signatures.
///
/// Input shorter than four bytes never matches.
fn is_valid_font(data: &[u8]) -> bool {
    // OTF: "OTTO"
    // TTF: 0x00010000, 0x00020000, "true", "typ1"
    const FONT_SIGNATURES: [&[u8]; 5] = [
        b"OTTO",
        b"\x00\x01\x00\x00",
        b"\x00\x02\x00\x00",
        b"true",
        b"typ1",
    ];

    FONT_SIGNATURES
        .iter()
        .any(|signature| data.starts_with(signature))
}
2213 }
2214
/// Test for function `has_encryption`
///
/// The font-obfuscation fixture must be reported as having encryption.
#[test]
fn test_fn_has_encryption() {
    let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open ocf-font_obfuscation.epub");

    assert!(doc.has_encryption());
}
2225
/// This test is used to detect whether the "META-INF/encryption.xml" file is parsed correctly
///
/// The fixture is expected to yield exactly one entry, describing the obfuscated font.
#[test]
fn test_fn_parse_encryption() {
    let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let doc = EpubDoc::new(epub_file).expect("failed to open ocf-font_obfuscation.epub");

    let encryption = doc
        .encryption
        .expect("encryption data should have been parsed");
    assert_eq!(encryption.len(), 1);
    assert_eq!(encryption[0].method, "http://www.idpf.org/2008/embedding");
    assert_eq!(encryption[0].data, "EPUB/fonts/Lobster.ttf");
}
2241
/// `get_metadata` returns exactly one item for the `title` and `language` keys of the
/// EPUB 3.3 fixture, with the expected property name and value.
#[test]
fn test_get_metadata_existing_key() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());
    let book = opened.unwrap();

    // Each case pairs a metadata key with the single value expected for it.
    for (key, expected_value) in [("title", "EPUB 3.3"), ("language", "en-us")] {
        let found = book.get_metadata(key);
        assert!(found.is_some());

        let items = found.unwrap();
        assert_eq!(items.len(), 1);
        assert_eq!(items[0].property, key);
        assert_eq!(items[0].value, expected_value);
    }
}
2266
/// `get_metadata` returns `None` for a key that the fixture does not define.
#[test]
fn test_get_metadata_nonexistent_key() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    assert!(book.get_metadata("nonexistent").is_none());
}
2277
/// `get_metadata("creator")` returns all three creators of the EPUB 3.3 fixture, in order,
/// each with its id, property name and value.
#[test]
fn test_get_metadata_multiple_items_same_type() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());
    let book = opened.unwrap();

    let found = book.get_metadata("creator");
    assert!(found.is_some());
    let creators = found.unwrap();

    // Expected `(id, value)` pairs, in the order the items must be returned.
    let expected = [
        ("creator_id_0", "Matt Garrish, DAISY Consortium"),
        ("creator_id_1", "Ivan Herman, W3C"),
        ("creator_id_2", "Dave Cramer, Invited Expert"),
    ];
    assert_eq!(creators.len(), 3);

    for (index, (id, value)) in expected.into_iter().enumerate() {
        assert_eq!(creators[index].id, Some(id.to_string()));
        assert_eq!(creators[index].property, "creator");
        assert_eq!(creators[index].value, value);
    }
}
2304
/// The `title` metadata item of the EPUB 3.3 fixture carries exactly one refinement,
/// `title-type` with the value `main`.
#[test]
fn test_get_metadata_with_refinement() {
    let epub_file = Path::new("./test_case/epub-33.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let doc = EpubDoc::new(epub_file).expect("failed to open epub-33.epub");

    let title = doc
        .get_metadata("title")
        .expect("fixture should define `title` metadata");
    assert_eq!(title.len(), 1);

    let refinements = &title[0].refined;
    assert_eq!(refinements.len(), 1);
    assert_eq!(refinements[0].property, "title-type");
    assert_eq!(refinements[0].value, "main");
}
2322
/// Exercises `get_manifest_item_with_fallback` on a fixture with a fallback chain:
/// an accepted format on the chain succeeds, an unavailable format yields an error.
#[test]
fn test_get_manifest_item_with_fallback() {
    let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
    // `expect` reports the underlying error when the fixture cannot be opened.
    let mut doc = EpubDoc::new(epub_file).expect("failed to open pub-foreign_bad-fallback.epub");

    assert!(doc.get_manifest_item("content_001").is_ok());
    assert!(doc.get_manifest_item("bar").is_ok());

    // The resource can be obtained when the fallback chain contains an accepted format.
    let (_, mime) = doc
        .get_manifest_item_with_fallback("content_001", vec!["image/psd"])
        .expect("get_manifest_item_with_fallback failed");
    assert_eq!(mime, "image/psd");

    // The lookup fails when the fallback chain contains no accepted format.
    let err = doc
        .get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
        .unwrap_err();
    assert_eq!(
        err.to_string(),
        "No supported file format: The fallback resource does not contain the file format you support."
    );
}
2349
/// `get_cover` returns the cover image bytes together with its media type.
#[test]
fn test_get_cover() {
    let epub_file = Path::new("./test_case/pkg-cover-image.epub");
    // Panic with the error's `Display` text so a failure to open the fixture explains itself.
    let mut doc = EpubDoc::new(epub_file)
        .unwrap_or_else(|err| panic!("failed to open {}: {}", epub_file.display(), err));

    let (data, mime) = doc
        .get_cover()
        .expect("publication should expose a cover image");
    // Expected size and media type of the fixture's cover image.
    assert_eq!(data.len(), 5785);
    assert_eq!(mime, "image/jpeg");
}
2367}