lib_epub/
epub.rs

1//! The core module of the EPUB parsing library
2//!
3//! This module provides complete parsing functionality for EPUB ebook files
4//! and is the core component of the entire library. The `EpubDoc` structure
5//! encapsulates all the parsing logic and data access interfaces for EPUB files.
6//!
7//! ## Main references to EPUB specs:
8//! - <https://www.w3.org/TR/epub-33>
9//! - <https://idpf.org/epub/201>
10//!
11//! ## Potential Issues
12//! - The generic parameter `R: Read + Seek` increases complexity, particularly
13//!   in asynchronous environments. The current design is not conducive to multi-threaded
14//!   concurrent access and requires an external synchronization mechanism.
15//! - Some error handling may not be sufficiently nuanced, and certain edge cases
16//!   may not be adequately considered.
17//! - Loading the entire EPUB document at once may result in significant memory consumption,
18//!   especially for large publications.
19//!
20//! ## Future Work
//! - Support more EPUB specification features, such as media overlays and scripting.
22
23use std::{
24    collections::HashMap,
25    fs::{File, canonicalize},
26    io::{BufReader, Read, Seek},
27    path::{Path, PathBuf},
28    sync::{
29        Arc, Mutex,
30        atomic::{AtomicUsize, Ordering},
31    },
32};
33
34use log::warn;
35use zip::{ZipArchive, result::ZipError};
36
37use crate::{
38    error::EpubError,
39    types::{
40        EncryptionData, EpubVersion, ManifestItem, MetadataItem, MetadataLinkItem,
41        MetadataRefinement, NavPoint, SpineItem,
42    },
43    utils::{
44        DecodeBytes, NormalizeWhitespace, XmlElement, XmlReader, adobe_font_dencryption,
45        check_realtive_link_leakage, compression_method_check, get_file_in_zip_archive,
46        idpf_font_dencryption,
47    },
48};
49
/// EPUB document parser, representing a loaded and parsed EPUB publication
///
/// The `EpubDoc` structure is the core of the entire EPUB parsing library.
/// It owns the underlying ZIP archive and holds the data parsed from it:
/// metadata, manifest, reading order (spine), table of contents navigation,
/// and encryption information. It also provides the methods for accessing
/// this data.
///
/// It offers a unified data access interface for EPUB files, hiding the
/// underlying file structure and parsing details.
///
/// ## Usage
///
/// ```rust
/// use lib_epub::epub::EpubDoc;
///
/// let doc = EpubDoc::new("./test_case/epub-33.epub");
/// assert!(doc.is_ok());
/// ```
///
/// ## Notes
/// - The `EpubDoc` structure is thread-safe **if and only if** the structure is immutable.
/// - The fact that `EpubDoc` is mutable has no practical meaning; modifications
///   to the structure data are not stored in the epub file.
pub struct EpubDoc<R: Read + Seek> {
    /// The ZIP archive backing the epub file, shared behind a mutex
    pub(crate) archive: Arc<Mutex<ZipArchive<R>>>,

    /// The path to the target epub file (canonicalized by `from_reader`)
    pub(crate) epub_path: PathBuf,

    /// The path to the OPF file, as declared by `META-INF/container.xml`
    pub package_path: PathBuf,

    /// The path to the directory where the OPF file is located
    /// (the parent of `package_path`)
    pub base_path: PathBuf,

    /// The epub version
    pub version: EpubVersion,

    /// The unique identifier of the epub file
    ///
    /// This is the value of the `identifier` metadata item whose id matches the
    /// `unique-identifier` attribute of the package. If the package has no such
    /// attribute, the first `identifier` metadata item is used instead.
    pub unique_identifier: String,

    /// Epub metadata extracted from OPF
    pub metadata: Vec<MetadataItem>,

    /// Data in metadata that points to external files
    pub metadata_link: Vec<MetadataLinkItem>,

    /// A list of resources contained inside an epub extracted from OPF,
    /// keyed by the manifest item id
    ///
    /// All resources in the epub file are declared here,
    /// and undeclared resources should not be stored in the epub file and cannot be obtained from it.
    pub manifest: HashMap<String, ManifestItem>,

    /// Physical reading order of publications extracted from OPF
    ///
    /// This attribute declares the order in which multiple files
    /// containing published content should be displayed.
    pub spine: Vec<SpineItem>,

    /// The entries of `META-INF/encryption.xml`
    ///
    /// `None` when the file is absent or declares no `EncryptedData` entries.
    pub encryption: Option<Vec<EncryptionData>>,

    /// The navigation data of the epub file
    pub catalog: Vec<NavPoint>,

    /// The title of the catalog (empty when the navigation source has no title)
    pub catalog_title: String,

    /// The index of the current reading spine (starts at 0)
    current_spine_index: AtomicUsize,

    /// Whether the archive contains a `META-INF/encryption.xml` file
    has_encryption: bool,
}
129
130impl<R: Read + Seek> EpubDoc<R> {
131    /// Creates a new EPUB document instance from a reader
132    ///
133    /// This function is responsible for the core logic of parsing EPUB files,
134    /// including verifying the file format, parsing container information,
135    /// loading the OPF package document, and extracting metadata, manifest,
136    /// reading order, and other core information.
137    ///
138    /// ## Parameters
139    /// - `reader`: The data source that implements the `Read` and `Seek` traits,
140    ///   usually a file or memory buffer
141    /// - `epub_path`: The path to the EPUB file, used for path resolution and validation
142    ///
143    /// ## Return
144    /// - `Ok(EpubDoc<R>)`: The successfully parsed EPUB document object
145    /// - `Err(EpubError)`: Errors encountered during parsing
146    ///
147    /// ## Notes
148    /// - This function assumes the EPUB file structure is valid
149    pub fn from_reader(reader: R, epub_path: PathBuf) -> Result<Self, EpubError> {
150        // Parsing process
151        // 1. Verify that the ZIP compression method conforms to the EPUB specification
152        // 2. Parse `META-INF/container.xml` retrieves the location of the OPF file
153        // 3. Parses the OPF file to obtain package documentation information
154        // 4. Extracts version information
155        // 5. Parses metadata, manifest, and spine
156        // 6. Parses encrypted information and directory navigation
157        // 7. Verifies and extracts the unique identifier
158
159        let mut archive = ZipArchive::new(reader).map_err(EpubError::from)?;
160        let epub_path = canonicalize(epub_path)?;
161
162        compression_method_check(&mut archive)?;
163
164        let container =
165            get_file_in_zip_archive(&mut archive, "META-INF/container.xml")?.decode()?;
166        let package_path = Self::parse_container(container)?;
167        let base_path = package_path
168            .parent()
169            .expect("所有文件的父目录不能为空")
170            .to_path_buf();
171
172        let opf_file =
173            get_file_in_zip_archive(&mut archive, package_path.to_str().unwrap())?.decode()?;
174        let package = XmlReader::parse(&opf_file)?;
175
176        let version = Self::determine_epub_version(&package)?;
177        let has_encryption = archive
178            .by_path(Path::new("META-INF/encryption.xml"))
179            .is_ok();
180
181        let mut doc = Self {
182            archive: Arc::new(Mutex::new(archive)),
183            epub_path,
184            package_path,
185            base_path,
186            version,
187            unique_identifier: String::new(),
188            metadata: vec![],
189            metadata_link: vec![],
190            manifest: HashMap::new(),
191            spine: vec![],
192            encryption: None,
193            catalog: vec![],
194            catalog_title: String::new(),
195            current_spine_index: AtomicUsize::new(0),
196            has_encryption,
197        };
198
199        let metadata_element = package.find_elements_by_name("metadata").next().unwrap();
200        let manifest_element = package.find_elements_by_name("manifest").next().unwrap();
201        let spine_element = package.find_elements_by_name("spine").next().unwrap();
202
203        doc.parse_metadata(metadata_element)?;
204        doc.parse_manifest(manifest_element)?;
205        doc.parse_spine(spine_element)?;
206        doc.parse_encryption()?;
207        doc.parse_catalog()?;
208
209        // 断言必有唯一标识符
210        doc.unique_identifier = if let Some(uid) = package.get_attr("unique-identifier") {
211            doc.metadata.iter().find(|item| {
212                item.property == "identifier" && item.id.as_ref().is_some_and(|id| id == &uid)
213            })
214        } else {
215            doc.metadata
216                .iter()
217                .find(|item| item.property == "identifier")
218        }
219        .map(|item| item.value.clone())
220        .ok_or_else(|| EpubError::NonCanonicalFile {
221            tag: "dc:identifier".to_string(),
222        })?;
223
224        Ok(doc)
225    }
226
227    /// Parse the EPUB container file (META-INF/container.xml)
228    ///
229    /// This function parses the container information in the EPUB file 、
230    /// to extract the path to the OPF package file. According to the EPUB
231    /// specification, the `container.xml` file must exist in the `META-INF`
232    /// directory and contain at least one `rootfile` element pointing to
233    /// the main OPF file. When multiple `rootfile` elements exist, the first
234    /// element pointing to the OPF file is used as the default.
235    ///
236    /// ## Parameters
237    /// - `data`: The content string of the container.xml
238    ///
239    /// ## Return
240    /// - `Ok(PathBuf)`: The path to the successfully parsed OPF file
241    /// - `Err(EpubError)`: Errors encountered during parsing
242    fn parse_container(data: String) -> Result<PathBuf, EpubError> {
243        let root = XmlReader::parse(&data)?;
244        let rootfile = root
245            .find_elements_by_name("rootfile")
246            .next()
247            .ok_or_else(|| EpubError::NonCanonicalFile {
248                tag: "rootfile".to_string(),
249            })?;
250
251        let attr =
252            rootfile
253                .get_attr("full-path")
254                .ok_or_else(|| EpubError::MissingRequiredAttribute {
255                    tag: "rootfile".to_string(),
256                    attribute: "full-path".to_string(),
257                })?;
258
259        Ok(PathBuf::from(attr))
260    }
261
262    /// Parse the EPUB metadata section
263    ///
264    /// This function is responsible for parsing the `<metadata>` elements
265    /// in the OPF file to extract basic information about the publication.
266    /// It handles metadata elements from different namespaces:
267    /// - Elements in the Dublin Core namespace (`http://purl.org/dc/elements/1.1/`)
268    /// - Elements in the OPF namespace (`http://www.idpf.org/2007/opf`)
269    ///
270    /// ## Parameters
271    /// - `metadata_element`: A reference to the `<metadata>` element in the OPF file
272    fn parse_metadata(&mut self, metadata_element: &XmlElement) -> Result<(), EpubError> {
273        const DC_NAMESPACE: &str = "http://purl.org/dc/elements/1.1/";
274        const OPF_NAMESPACE: &str = "http://www.idpf.org/2007/opf";
275
276        let mut metadata = Vec::new();
277        let mut metadata_link = Vec::new();
278        let mut refinements = HashMap::<String, Vec<MetadataRefinement>>::new();
279
280        for element in metadata_element.children() {
281            match &element.namespace {
282                Some(namespace) if namespace == DC_NAMESPACE => {
283                    self.parse_dc_metadata(element, &mut metadata)?
284                }
285
286                Some(namespace) if namespace == OPF_NAMESPACE => self.parse_opf_metadata(
287                    element,
288                    &mut metadata,
289                    &mut metadata_link,
290                    &mut refinements,
291                )?,
292
293                _ => {}
294            };
295        }
296
297        for item in metadata.iter_mut() {
298            if let Some(id) = &item.id {
299                if let Some(refinements) = refinements.remove(id) {
300                    item.refined = refinements;
301                }
302            }
303        }
304
305        self.metadata = metadata;
306        self.metadata_link = metadata_link;
307        Ok(())
308    }
309
310    /// Parse the EPUB manifest section
311    ///
312    /// This function parses the `<manifest>` element in the OPF file, extracting
313    /// information about all resource files in the publication. Each resource contains
314    /// basic information such as id, file path, MIME type, as well as optional
315    /// attributes and fallback resource information.
316    ///
317    /// ## Parameters
318    /// - `manifest_element`: A reference to the `<manifest>` element in the OPF file
319    fn parse_manifest(&mut self, manifest_element: &XmlElement) -> Result<(), EpubError> {
320        let estimated_items = manifest_element.children().count();
321        let mut resources = HashMap::with_capacity(estimated_items);
322
323        for element in manifest_element.children() {
324            let id = element
325                .get_attr("id")
326                .ok_or_else(|| EpubError::MissingRequiredAttribute {
327                    tag: element.tag_name(),
328                    attribute: "id".to_string(),
329                })?
330                .to_string();
331            let path = element
332                .get_attr("href")
333                .ok_or_else(|| EpubError::MissingRequiredAttribute {
334                    tag: element.tag_name(),
335                    attribute: "href".to_string(),
336                })?
337                .to_string();
338            let mime = element
339                .get_attr("media-type")
340                .ok_or_else(|| EpubError::MissingRequiredAttribute {
341                    tag: element.tag_name(),
342                    attribute: "media-type".to_string(),
343                })?
344                .to_string();
345            let properties = element.get_attr("properties");
346            let fallback = element.get_attr("fallback");
347
348            resources.insert(
349                id.clone(),
350                ManifestItem {
351                    id,
352                    path: self.normalize_manifest_path(&path)?,
353                    mime,
354                    properties,
355                    fallback,
356                },
357            );
358        }
359
360        self.manifest = resources;
361        self.validate_fallback_chains();
362        Ok(())
363    }
364
365    /// Parse the EPUB spine section
366    ///
367    /// This function parses the `<spine>` elements in the OPF file to extract
368    /// the reading order information of the publication. The spine defines the
369    /// linear reading order of the publication's content documents, and each
370    /// spine item references resources in the manifest.
371    ///
372    /// ## Parameters
373    /// - `spine_element`: A reference to the `<spine>` element in the OPF file
374    fn parse_spine(&mut self, spine_element: &XmlElement) -> Result<(), EpubError> {
375        let mut spine = Vec::new();
376        for element in spine_element.children() {
377            let idref = element
378                .get_attr("idref")
379                .ok_or_else(|| EpubError::MissingRequiredAttribute {
380                    tag: element.tag_name(),
381                    attribute: "idref".to_string(),
382                })?
383                .to_string();
384            let id = element.get_attr("id");
385            let linear = element
386                .get_attr("linear")
387                .map(|linear| linear == "yes")
388                .unwrap_or(true);
389            let properties = element.get_attr("properties");
390
391            spine.push(SpineItem {
392                idref,
393                id,
394                linear,
395                properties,
396            });
397        }
398
399        self.spine = spine;
400        Ok(())
401    }
402
403    /// Parse the EPUB encryption file (META-INF/encryption.xml)
404    ///
405    /// This function is responsible for parsing the `encryption.xml` file
406    /// in the `META-INF` directory to extract information about encrypted
407    /// resources in the publication. According to the EPUB specification,
408    /// the encryption information describes which resources are encrypted
409    /// and the encryption methods used.
410    ///
411    /// TODO: 需要对使用非对称加密数据的加密项进行额外处理，以获取非对称加密密钥
412    fn parse_encryption(&mut self) -> Result<(), EpubError> {
413        if !self.has_encryption() {
414            return Ok(());
415        }
416
417        let mut archive = self.archive.lock()?;
418        let encryption_file =
419            get_file_in_zip_archive(&mut archive, "META-INF/encryption.xml")?.decode()?;
420
421        let root = XmlReader::parse(&encryption_file)?;
422
423        let mut encryption_data = Vec::new();
424        for data in root.children() {
425            if data.name != "EncryptedData" {
426                continue;
427            }
428
429            let method = data
430                .find_elements_by_name("EncryptionMethod")
431                .next()
432                .ok_or_else(|| EpubError::NonCanonicalFile {
433                    tag: "EncryptionMethod".to_string(),
434                })?;
435            let reference = data
436                .find_elements_by_name("CipherReference")
437                .next()
438                .ok_or_else(|| EpubError::NonCanonicalFile {
439                    tag: "CipherReference".to_string(),
440                })?;
441
442            encryption_data.push(EncryptionData {
443                method: method
444                    .get_attr("Algorithm")
445                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
446                        tag: "EncryptionMethod".to_string(),
447                        attribute: "Algorithm".to_string(),
448                    })?
449                    .to_string(),
450                data: reference
451                    .get_attr("URI")
452                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
453                        tag: "CipherReference".to_string(),
454                        attribute: "URI".to_string(),
455                    })?
456                    .to_string(),
457            });
458        }
459
460        if !encryption_data.is_empty() {
461            self.encryption = Some(encryption_data);
462        }
463
464        Ok(())
465    }
466
467    /// Parse the EPUB navigation information
468    ///
469    /// This function is responsible for parsing the navigation information of EPUB
470    /// publications. Different parsing strategies are used depending on the EPUB version:
471    /// - EPUB 2.0: Parses the NCX file to obtain directory information
472    /// - EPUB 3.0: Parses the Navigation Document (NAV) file to obtain directory information
473    fn parse_catalog(&mut self) -> Result<(), EpubError> {
474        const HEAD_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];
475
476        let mut archive = self.archive.lock()?;
477        match self.version {
478            EpubVersion::Version2_0 => {
479                let opf_file =
480                    get_file_in_zip_archive(&mut archive, self.package_path.to_str().unwrap())?
481                        .decode()?;
482                let opf_element = XmlReader::parse(&opf_file)?;
483
484                let toc_id = opf_element
485                    .find_children_by_name("spine")
486                    .next()
487                    .ok_or_else(|| EpubError::NonCanonicalFile {
488                        tag: "spine".to_string(),
489                    })?
490                    .get_attr("toc")
491                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
492                        tag: "spine".to_string(),
493                        attribute: "toc".to_string(),
494                    })?
495                    .to_owned();
496                let toc_path = self
497                    .manifest
498                    .get(&toc_id)
499                    .ok_or(EpubError::ResourceIdNotExist { id: toc_id })?
500                    .path
501                    .to_str()
502                    .unwrap();
503
504                let ncx_file = get_file_in_zip_archive(&mut archive, toc_path)?.decode()?;
505                let ncx = XmlReader::parse(&ncx_file)?;
506
507                match ncx.find_elements_by_name("docTitle").next() {
508                    Some(element) => self.catalog_title = element.text(),
509                    None => warn!(
510                        "Expecting to get docTitle information from the ncx file, but it's missing."
511                    ),
512                };
513
514                let nav_map = ncx.find_elements_by_name("navMap").next().ok_or_else(|| {
515                    EpubError::NonCanonicalFile {
516                        tag: "navMap".to_string(),
517                    }
518                })?;
519
520                self.catalog = self.parse_nav_points(nav_map)?;
521
522                Ok(())
523            }
524
525            EpubVersion::Version3_0 => {
526                let nav_path = self
527                    .manifest
528                    .values()
529                    .find(|item| {
530                        if let Some(property) = &item.properties {
531                            return property.contains("nav");
532                        }
533                        false
534                    })
535                    .map(|item| item.path.clone())
536                    .ok_or_else(|| EpubError::NonCanonicalEpub {
537                        expected_file: "Navigation Document".to_string(),
538                    })?;
539
540                let nav_file =
541                    get_file_in_zip_archive(&mut archive, nav_path.to_str().unwrap())?.decode()?;
542
543                let nav_element = XmlReader::parse(&nav_file)?;
544                let nav = nav_element
545                    .find_elements_by_name("nav")
546                    .find(|&element| element.get_attr("epub:type") == Some(String::from("toc")))
547                    .ok_or_else(|| EpubError::NonCanonicalFile {
548                        tag: "nav".to_string(),
549                    })?;
550                let nav_title = nav.find_children_by_names(&HEAD_TAGS).next();
551                let nav_list = nav.find_children_by_name("ol").next().ok_or_else(|| {
552                    EpubError::NonCanonicalFile {
553                        tag: "ol".to_string(),
554                    }
555                })?;
556
557                self.catalog = self.parse_catalog_list(nav_list)?;
558                if let Some(nav_title) = nav_title {
559                    self.catalog_title = nav_title.text();
560                };
561                Ok(())
562            }
563        }
564    }
565
566    /// Check if the EPUB file contains `encryption.xml`
567    ///
568    /// This function determines whether a publication contains encrypted resources
569    /// by checking if a `META-INF/encryption.xml` file exists in the EPUB package.
570    /// According to the EPUB specification, when resources in a publication are
571    /// encrypted, the corresponding encryption information must be declared in
572    /// the `META-INF/encryption.xml` file.
573    ///
574    /// ## Return
575    /// - `true` if the publication contains encrypted resources
576    /// - `false` if the publication does not contain encrypted resources
577    ///
578    /// ## Notes
579    /// - This function only checks the existence of the encrypted file;
580    ///   it does not verify the validity of the encrypted information.
581    pub fn has_encryption(&self) -> bool {
582        self.has_encryption
583    }
584
585    /// Retrieves a list of metadata items
586    ///
587    /// This function retrieves all matching metadata items from the EPUB metadata
588    /// based on the specified attribute name (key). Metadata items may come from
589    /// the DC (Dublin Core) namespace or the OPF namespace and contain basic
590    /// information about the publication, such as title, author, identifier, etc.
591    ///
592    /// ## Parameters
593    /// - `key`: The name of the metadata attribute to retrieve
594    ///
595    /// ## Return
596    /// - `Some(Vec<MetadataItem>)`: A vector containing all matching metadata items
597    /// - `None`: If no matching metadata items are found
598    pub fn get_metadata(&self, key: &str) -> Option<Vec<MetadataItem>> {
599        let metadatas = self
600            .metadata
601            .iter()
602            .filter(|item| item.property == key)
603            .cloned()
604            .collect::<Vec<MetadataItem>>();
605
606        (!metadatas.is_empty()).then_some(metadatas)
607    }
608
609    /// Retrieves a list of values for specific metadata items
610    ///
611    /// This function retrieves the values of all matching metadata items from
612    /// the EPUB metadata based on the given property name (key).
613    ///
614    /// ## Parameters
615    /// - `key`: The name of the metadata attribute to retrieve
616    ///
617    /// ## Return
618    /// - `Some(Vec<String>)`: A vector containing all matching metadata item values
619    /// - `None`: If no matching metadata items are found
620    pub fn get_metadata_value(&self, key: &str) -> Option<Vec<String>> {
621        let values = self
622            .metadata
623            .iter()
624            .filter(|item| item.property == key)
625            .map(|item| item.value.clone())
626            .collect::<Vec<String>>();
627
628        (!values.is_empty()).then_some(values)
629    }
630
631    /// Retrieves the title of the publication
632    ///
633    /// This function retrieves all title information from the EPUB metadata.
634    /// According to the EPUB specification, a publication can have multiple titles,
635    /// which are returned in the order they appear in the metadata.
636    ///
637    /// ## Return
638    /// - `Result<Vec<String>, EpubError>`: A vector containing all title information
639    /// - `EpubError`: If and only if the OPF file does not contain `<dc:title>`
640    ///
641    /// ## Notes
642    /// - The EPUB specification requires each publication to have at least one title.
643    pub fn get_title(&self) -> Result<Vec<String>, EpubError> {
644        self.get_metadata_value("title")
645            .ok_or_else(|| EpubError::NonCanonicalFile {
646                tag: "title".to_string(),
647            })
648    }
649
650    /// Retrieves the language used in the publication
651    ///
652    /// This function retrieves the language information of a publication from the EPUB
653    /// metadata. According to the EPUB specification, language information identifies
654    /// the primary language of the publication and can have multiple language identifiers.
655    ///
656    /// ## Return
657    /// - `Ok(Vec<String>)`: A vector containing all language identifiers
658    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:language>`
659    ///
660    /// ## Notes
661    /// - The EPUB specification requires that each publication specify at least one primary language.
662    /// - Language identifiers should conform to RFC 3066 or later standards.
663    pub fn get_language(&self) -> Result<Vec<String>, EpubError> {
664        self.get_metadata_value("language")
665            .ok_or_else(|| EpubError::NonCanonicalFile {
666                tag: "language".to_string(),
667            })
668    }
669
670    /// Retrieves the identifier of a publication
671    ///
672    /// This function retrieves the identifier information of a publication from
673    /// the EPUB metadata. According to the EPUB specification, each publication
674    /// must have a identifier, typically an ISBN, UUID, or other unique identifier.
675    ///
676    /// ## Return
677    /// - `Ok(Vec<String>)`: A vector containing all identifier information
678    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:identifier>`
679    ///
680    /// ## Notes
681    /// - The EPUB specification requires each publication to have at least one identifier.
682    /// - In the OPF file, the `unique-identifier` attribute of the `<package>` element
683    ///   should point to a `<dc:identifier>` element used to uniquely identify the publication.
684    ///   This means that `unique-identifier` is not exactly equal to `<dc:identifier>`.
685    pub fn get_identifier(&self) -> Result<Vec<String>, EpubError> {
686        self.get_metadata_value("identifier")
687            .ok_or_else(|| EpubError::NonCanonicalFile {
688                tag: "identifier".to_string(),
689            })
690    }
691
692    /// Retrieve resource data by resource ID
693    ///
694    /// This function will find the resource with the specified ID in the manifest.
695    /// If the resource is encrypted, it will be automatically decrypted.
696    ///
697    /// ## Parameters
698    /// - `id`: The ID of the resource to retrieve
699    ///
700    /// ## Return
701    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
702    ///   the MIME type
703    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
704    ///
705    /// ## Notes
706    /// - This function will automatically decrypt the resource if it is encrypted.
707    /// - For unsupported encryption methods, the corresponding error will be returned.
708    pub fn get_manifest_item(&self, id: &str) -> Result<(Vec<u8>, String), EpubError> {
709        let resource_item = self
710            .manifest
711            .get(id)
712            .cloned()
713            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
714
715        let path = resource_item.path.to_str().unwrap();
716
717        let mut archive = self.archive.lock()?;
718        let mut data = match archive.by_name(path) {
719            Ok(mut file) => {
720                let mut entry = Vec::<u8>::new();
721                file.read_to_end(&mut entry)?;
722
723                Ok(entry)
724            }
725            Err(ZipError::FileNotFound) => Err(EpubError::ResourceNotFound {
726                resource: path.to_string(),
727            }),
728            Err(err) => Err(EpubError::from(err)),
729        }?;
730
731        if let Some(method) = self.is_encryption_file(path) {
732            data = self.auto_dencrypt(&method, &mut data)?;
733        }
734
735        Ok((data, resource_item.mime))
736    }
737
738    /// Retrieves resource item data by resource path
739    ///
740    /// This function retrieves resources from the manifest based on the input path.
741    /// The input path must be a relative path to the root directory of the EPUB container;
742    /// using an absolute path or a relative path to another location will result in an error.
743    ///
744    /// ## Parameters
745    /// - `path`: The path of the resource to retrieve
746    ///
747    /// ## Return
748    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
749    ///   the MIME type
750    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
751    ///
752    /// ## Notes
753    /// - This function will automatically decrypt the resource if it is encrypted.
754    /// - For unsupported encryption methods, the corresponding error will be returned.
755    /// - Relative paths other than the root directory of the Epub container are not supported.
756    pub fn get_manifest_item_by_path(&self, path: &str) -> Result<(Vec<u8>, String), EpubError> {
757        let id = self
758            .manifest
759            .iter()
760            .find(|(_, item)| item.path.to_str().unwrap() == path)
761            .map(|(id, _)| id.to_string())
762            .ok_or_else(|| EpubError::ResourceNotFound {
763                resource: path.to_string(),
764            })?;
765
766        self.get_manifest_item(&id)
767    }
768
769    /// Retrieves supported resource items by resource ID, with fallback mechanism supported
770    ///
771    /// This function attempts to retrieve the resource item with the specified ID and
772    /// checks if its MIME type is in the list of supported formats. If the current resource
773    /// format is not supported, it searches for a supported resource format along the
774    /// fallback chain according to the fallback mechanism defined in the EPUB specification.
775    ///
776    /// ## Parameters
777    /// - `id`: The ID of the resource to retrieve
778    /// - `supported_format`: A vector of supported MIME types
779    ///
780    /// ## Return
781    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
782    ///   the MIME type
783    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
784    pub fn get_manifest_item_with_fallback(
785        &self,
786        id: &str,
787        supported_format: Vec<&str>,
788    ) -> Result<(Vec<u8>, String), EpubError> {
789        let mut manifest_item = self
790            .manifest
791            .get(id)
792            .cloned()
793            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
794
795        let mut current_manifest_id = id.to_string();
796        let mut fallback_chain = Vec::<String>::new();
797        'fallback: loop {
798            if supported_format.contains(&manifest_item.mime.as_str()) {
799                return self.get_manifest_item(&current_manifest_id);
800            }
801
802            let fallback_id = manifest_item.fallback.clone();
803
804            match fallback_id {
805                // The loop ends when no fallback resource exists
806                None => break 'fallback,
807
808                // End the loop when the loop continues to fallback if a fallback resource exists
809                Some(id) if fallback_chain.contains(&id) => break 'fallback,
810
811                Some(id) => {
812                    fallback_chain.push(id.clone());
813
814                    // Since only warnings are issued for fallback resource checks
815                    // during initialization, the issue of fallback resources possibly
816                    // not existing needs to be handled here.
817                    manifest_item = self
818                        .manifest
819                        .get(&manifest_item.fallback.unwrap())
820                        .cloned()
821                        .ok_or(EpubError::ResourceIdNotExist { id: id.clone() })?;
822                    current_manifest_id = id;
823                }
824            };
825        }
826
827        Err(EpubError::NoSupportedFileFormat)
828    }
829
830    /// Retrieves the cover of the EPUB document
831    ///
832    /// This function searches for the cover of the EPUB document by examining manifest
833    /// items in the manifest. It looks for manifest items whose ID or attribute contains
834    /// "cover" (case-insensitive) and attempts to retrieve the content of the first match.
835    ///
836    /// ## Return
837    /// - `Some((Vec<u8>, String))`: Successfully retrieved and decrypted cover data and
838    ///   the MIME type
839    /// - `None`: No cover resource was found
840    ///
841    /// ## Notes
842    /// - This function only returns the first successfully retrieved cover resource,
843    ///   even if multiple matches exist
844    /// - The retrieved cover may not be an image resource; users need to pay attention
845    ///   to the resource's MIME type.
846    pub fn get_cover(&self) -> Option<(Vec<u8>, String)> {
847        self.manifest
848            .values()
849            .filter_map(|manifest| {
850                if manifest.id.to_ascii_lowercase().contains("cover") {
851                    return Some(manifest.id.clone());
852                }
853
854                if let Some(properties) = &manifest.properties {
855                    if properties.to_ascii_lowercase().contains("cover") {
856                        return Some(manifest.id.clone());
857                    }
858                }
859
860                None
861            })
862            .collect::<Vec<String>>()
863            .iter()
864            .find_map(|id| self.get_manifest_item(id).ok())
865    }
866
867    /// Navigate to a specified chapter using the spine index
868    ///
869    /// This function retrieves the content data of the corresponding chapter based
870    /// on the index position in the EPUB spine. The spine defines the linear reading
871    /// order of the publication's content documents, and each spine item references
872    /// resources in the manifest.
873    ///
874    /// ## Parameters
875    /// - `index`: The index position in the spine, starting from 0
876    ///
877    /// ## Return
878    /// - `Some((Vec<u8>, String))`: Successfully retrieved chapter content data and the MIME type
879    /// - `None`: Index out of range or data retrieval error
880    ///
881    /// ## Notes
882    /// - The index must be less than the total number of spine projects.
883    /// - If the resource is encrypted, it will be automatically decrypted before returning.(TODO)
884    /// - It does not check whether the Spine project follows a linear reading order.
885    pub fn navigate_by_spine_index(&mut self, index: usize) -> Option<(Vec<u8>, String)> {
886        if index >= self.spine.len() {
887            return None;
888        }
889
890        let manifest_id = self.spine[index].idref.clone();
891        self.current_spine_index.store(index, Ordering::SeqCst);
892        self.get_manifest_item(&manifest_id).ok()
893    }
894
895    /// Navigate to the previous linear reading chapter
896    ///
897    /// This function searches backwards in the EPUB spine for the previous linear
898    /// reading chapter and returns the content data of that chapter. It only navigates
899    /// to chapters marked as linear reading.
900    ///
901    /// ## Return
902    /// - `Some((Vec<u8>, String))`: Successfully retrieved previous chapter content data and
903    ///   the MIME type
904    /// - `None`: Already in the first chapter, the current chapter is not linear,
905    ///   or data retrieval failed
906    pub fn spine_prev(&self) -> Option<(Vec<u8>, String)> {
907        let current_index = self.current_spine_index.load(Ordering::SeqCst);
908        if current_index == 0 || !self.spine[current_index].linear {
909            return None;
910        }
911
912        let prev_index = (0..current_index)
913            .rev()
914            .find(|&index| self.spine[index].linear)?;
915
916        self.current_spine_index.store(prev_index, Ordering::SeqCst);
917        let manifest_id = self.spine[prev_index].idref.clone();
918        self.get_manifest_item(&manifest_id).ok()
919    }
920
921    /// Navigate to the next linear reading chapter
922    ///
923    /// This function searches forwards in the EPUB spine for the next linear reading
924    /// chapter and returns the content data of that chapter. It only navigates to
925    /// chapters marked as linear reading.
926    ///
927    /// ## Return
928    /// - `Some((Vec<u8>, String))`: Successfully retrieved next chapter content data and
929    ///   the MIME type
930    /// - `None`: Already in the last chapter, the current chapter is not linear,
931    ///   or data retrieval failed
932    pub fn spine_next(&mut self) -> Option<(Vec<u8>, String)> {
933        let current_index = self.current_spine_index.load(Ordering::SeqCst);
934        if current_index >= self.spine.len() - 1 || !self.spine[current_index].linear {
935            return None;
936        }
937
938        let next_index =
939            (current_index + 1..self.spine.len()).find(|&index| self.spine[index].linear)?;
940
941        self.current_spine_index.store(next_index, Ordering::SeqCst);
942        let manifest_id = self.spine[next_index].idref.clone();
943        self.get_manifest_item(&manifest_id).ok()
944    }
945
946    /// Retrieves the content data of the current chapter
947    ///
948    /// This function returns the content data of the chapter at the current
949    /// index position in the EPUB spine.
950    ///
951    /// ## Return
952    /// - `Some((Vec<u8>, String))`: Successfully retrieved current chapter content data and
953    ///   the MIME type
954    /// - `None`: Data retrieval failed
955    pub fn spine_current(&self) -> Option<(Vec<u8>, String)> {
956        let manifest_id = self.spine[self.current_spine_index.load(Ordering::SeqCst)]
957            .idref
958            .clone();
959        self.get_manifest_item(&manifest_id).ok()
960    }
961
962    /// Determine the EPUB version from the OPF file
963    ///
964    /// This function is used to detect the version of an epub file from an OPF file.
965    /// When the version attribute in the package is abnormal, version information will
966    /// be identified through some version characteristics of the epub file. An error is
967    /// returned when neither direct nor indirect methods can identify the version.
968    ///
969    /// ## Parameters
970    /// - `opf_element`: A reference to the OPF file element
971    fn determine_epub_version(opf_element: &XmlElement) -> Result<EpubVersion, EpubError> {
972        // Check the explicit version attribute
973        if let Some(version) = opf_element.get_attr("version") {
974            match version.as_str() {
975                "2.0" => return Ok(EpubVersion::Version2_0),
976                "3.0" => return Ok(EpubVersion::Version3_0),
977                _ => {}
978            }
979        }
980
981        let spine_element = opf_element
982            .find_elements_by_name("spine")
983            .next()
984            .ok_or_else(|| EpubError::NonCanonicalFile {
985                tag: "spine".to_string(),
986            })?;
987
988        // Look for EPUB 2.x specific features
989        if spine_element.get_attr("toc").is_some() {
990            return Ok(EpubVersion::Version2_0);
991        }
992
993        let manifest_element = opf_element
994            .find_elements_by_name("manifest")
995            .next()
996            .ok_or_else(|| EpubError::NonCanonicalFile {
997                tag: "manifest".to_string(),
998            })?;
999
1000        // Look for EPUB 3.x specific features
1001        manifest_element
1002            .children()
1003            .find_map(|element| {
1004                if let Some(id) = element.get_attr("id") {
1005                    if id.eq("nav") {
1006                        return Some(EpubVersion::Version3_0);
1007                    }
1008                }
1009
1010                None
1011            })
1012            .ok_or(EpubError::UnrecognizedEpubVersion)
1013    }
1014
1015    /// Parse metadata elements under the Dublin Core namespace
1016    ///
1017    /// This function handles the `<metadata>` Dublin Core element in the OPF file (namespace
1018    /// is "http://purl.org/dc/elements/1.1/"). These elements usually contain the basic
1019    /// information of the publication, such as title, author, publication date, etc.
1020    ///
1021    /// ## Notes
1022    /// - In EPUB 3.0, granular information is handled by separate '<meta>' elements and 'refines' attributes
1023    /// - All text content is normalized by whitespace
1024    #[inline]
1025    fn parse_dc_metadata(
1026        &self,
1027        element: &XmlElement,
1028        metadata: &mut Vec<MetadataItem>,
1029        // refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1030    ) -> Result<(), EpubError> {
1031        let id = element.get_attr("id");
1032        let lang = element.get_attr("lang");
1033        let property = element.name.clone();
1034        let value = element.text().normalize_whitespace();
1035
1036        let refined = match self.version {
1037            // In EPUB 2.0, supplementary metadata (refinements) are represented
1038            // through other attribute data pairs of the tag.
1039            EpubVersion::Version2_0 => element
1040                .attributes
1041                .iter()
1042                .map(|(name, value)| {
1043                    let property = name.to_string();
1044                    let value = value.to_string().normalize_whitespace();
1045
1046                    MetadataRefinement {
1047                        refines: id.clone().unwrap(),
1048                        property,
1049                        value,
1050                        lang: None,
1051                        scheme: None,
1052                    }
1053                })
1054                .collect(),
1055            EpubVersion::Version3_0 => vec![],
1056        };
1057
1058        metadata.push(MetadataItem {
1059            id,
1060            property,
1061            value,
1062            lang,
1063            refined,
1064        });
1065
1066        Ok(())
1067    }
1068
1069    /// Parse metadata elements under the OPF namespace
1070    ///
1071    /// This function handles the `<metadata>` OPF element in the OPF file (namespace
1072    /// is "http://www.idpf.org/2007/opf"). These elements include '<meta>' and '<link>',
1073    /// which are used to provide extended metadata and links to external resources for EPUB publications.
1074    ///
1075    /// ## Notes
1076    /// - The function is only responsible for distribution processing, and the
1077    ///   specific parsing logic is implemented in the dedicated function
1078    /// - All parsing results are added directly to the incoming collection and no new collection is returned
1079    #[inline]
1080    fn parse_opf_metadata(
1081        &self,
1082        element: &XmlElement,
1083        metadata: &mut Vec<MetadataItem>,
1084        metadata_link: &mut Vec<MetadataLinkItem>,
1085        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1086    ) -> Result<(), EpubError> {
1087        match element.name.as_str() {
1088            "meta" => self.parse_meta_element(element, metadata, refinements),
1089            "link" => self.parse_link_element(element, metadata_link),
1090            _ => Ok(()),
1091        }
1092    }
1093
1094    #[inline]
1095    fn parse_meta_element(
1096        &self,
1097        element: &XmlElement,
1098        metadata: &mut Vec<MetadataItem>,
1099        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1100    ) -> Result<(), EpubError> {
1101        match self.version {
1102            EpubVersion::Version2_0 => {
1103                let property =
1104                    element
1105                        .get_attr("name")
1106                        .ok_or_else(|| EpubError::NonCanonicalFile {
1107                            tag: element.tag_name(),
1108                        })?;
1109                let value = element
1110                    .get_attr("content")
1111                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
1112                        tag: element.tag_name(),
1113                        attribute: "content".to_string(),
1114                    })?
1115                    .normalize_whitespace();
1116
1117                metadata.push(MetadataItem {
1118                    id: None,
1119                    property,
1120                    value,
1121                    lang: None,
1122                    refined: vec![],
1123                });
1124            }
1125
1126            EpubVersion::Version3_0 => {
1127                let property = element.get_attr("property").ok_or_else(|| {
1128                    EpubError::MissingRequiredAttribute {
1129                        tag: element.tag_name(),
1130                        attribute: "property".to_string(),
1131                    }
1132                })?;
1133                let value = element.text().normalize_whitespace();
1134                let lang = element.get_attr("lang");
1135
1136                if let Some(refines) = element.get_attr("refines") {
1137                    let id = refines.strip_prefix("#").unwrap_or(&refines).to_string();
1138                    let scheme = element.get_attr("scheme");
1139                    let refinement = MetadataRefinement {
1140                        refines: id.clone(),
1141                        property,
1142                        value,
1143                        lang,
1144                        scheme,
1145                    };
1146
1147                    if let Some(refinements) = refinements.get_mut(&id) {
1148                        refinements.push(refinement);
1149                    } else {
1150                        refinements.insert(id, vec![refinement]);
1151                    }
1152                } else {
1153                    let id = element.get_attr("id");
1154                    let item = MetadataItem {
1155                        id,
1156                        property,
1157                        value,
1158                        lang,
1159                        refined: vec![],
1160                    };
1161
1162                    metadata.push(item);
1163                };
1164            }
1165        }
1166        Ok(())
1167    }
1168
1169    #[inline]
1170    fn parse_link_element(
1171        &self,
1172        element: &XmlElement,
1173        metadata_link: &mut Vec<MetadataLinkItem>,
1174    ) -> Result<(), EpubError> {
1175        let href = element
1176            .get_attr("href")
1177            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1178                tag: element.tag_name(),
1179                attribute: "href".to_string(),
1180            })?;
1181        let rel = element
1182            .get_attr("rel")
1183            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1184                tag: element.tag_name(),
1185                attribute: "rel".to_string(),
1186            })?;
1187        let hreflang = element.get_attr("hreflang");
1188        let id = element.get_attr("id");
1189        let mime = element.get_attr("media-type");
1190        let properties = element.get_attr("properties");
1191
1192        metadata_link.push(MetadataLinkItem {
1193            href,
1194            rel,
1195            hreflang,
1196            id,
1197            mime,
1198            properties,
1199            refines: None,
1200        });
1201        Ok(())
1202    }
1203
1204    /// Recursively parse NCX navigation points from navMap or nested navPoint elements
1205    ///
1206    /// This function parses the hierarchical navigation structure defined in NCX files
1207    /// for EPUB 2.x documents. It handles nested navPoint elements to build a complete
1208    /// tree representation of the publication's table of contents.
1209    fn parse_nav_points(&self, parent_element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1210        let mut nav_points = Vec::new();
1211        for nav_point in parent_element.find_children_by_name("navPoint") {
1212            let label = match nav_point.find_children_by_name("navLabel").next() {
1213                Some(element) => element.text(),
1214                None => String::new(),
1215            };
1216
1217            let content = nav_point
1218                .find_children_by_name("content")
1219                .next()
1220                .map(|element| PathBuf::from(element.text()));
1221
1222            let play_order = nav_point
1223                .get_attr("playOrder")
1224                .and_then(|order| order.parse::<usize>().ok());
1225
1226            let children = self.parse_nav_points(nav_point)?;
1227
1228            nav_points.push(NavPoint {
1229                label,
1230                content,
1231                play_order,
1232                children,
1233            });
1234        }
1235
1236        nav_points.sort();
1237        Ok(nav_points)
1238    }
1239
1240    /// Recursively parses directory list structures
1241    ///
1242    /// This function recursively parses HTML navigation list structures,
1243    /// converting `<ol>` and `<li>` elements into NavPoint structures.
1244    /// Multi-level nested directory structures are supported.
1245    fn parse_catalog_list(&self, element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1246        let mut catalog = Vec::new();
1247        for item in element.children() {
1248            if item.tag_name() != "li" {
1249                return Err(EpubError::NonCanonicalFile {
1250                    tag: "li".to_string(),
1251                });
1252            }
1253
1254            let title_element = item
1255                .find_children_by_names(&["span", "a"])
1256                .next()
1257                .ok_or_else(|| EpubError::NonCanonicalFile {
1258                    tag: "span/a".to_string(),
1259                })?;
1260            let content_href = title_element.get_attr("href").map(PathBuf::from);
1261            let sub_list = if let Some(list) = item.find_children_by_name("ol").next() {
1262                self.parse_catalog_list(list)?
1263            } else {
1264                vec![]
1265            };
1266
1267            catalog.push(NavPoint {
1268                label: title_element.text(),
1269                content: content_href,
1270                children: sub_list,
1271                play_order: None,
1272            });
1273        }
1274
1275        Ok(catalog)
1276    }
1277
1278    /// Converts relative paths in the manifest to normalized paths
1279    /// relative to the EPUB root directory
1280    ///
1281    /// This function processes the href attribute of resources in the EPUB
1282    /// manifest and converts it to a normalized path representation.
1283    /// It handles three types of paths:
1284    /// - Relative paths starting with `../` (checks if they exceed the EPUB package scope)
1285    /// - Absolute paths starting with `/` (relative to the EPUB root directory)
1286    /// - Other relative paths (relative to the directory containing the OPF file)
1287    ///
1288    /// ## Parameters
1289    /// - `path`: The href attribute value of the resource in the manifest
1290    ///
1291    /// ## Return
1292    /// - `Ok(PathBuf)`: The parsed normalized path
1293    /// - `Err(EpubError)`: Relative link leakage
1294    #[inline]
1295    fn normalize_manifest_path(&self, path: &str) -> Result<PathBuf, EpubError> {
1296        let mut path = if path.starts_with("../") {
1297            let mut current_dir = self.epub_path.join(&self.package_path);
1298            current_dir.pop();
1299
1300            check_realtive_link_leakage(self.epub_path.clone(), current_dir, path)
1301                .map(PathBuf::from)
1302                .ok_or_else(|| EpubError::RealtiveLinkLeakage {
1303                    path: path.to_string(),
1304                })?
1305        } else if let Some(path) = path.strip_prefix("/") {
1306            PathBuf::from(path.to_string())
1307        } else {
1308            self.base_path.join(path)
1309        };
1310
1311        #[cfg(windows)]
1312        {
1313            path = PathBuf::from(path.to_string_lossy().replace('\\', "/"));
1314        }
1315
1316        Ok(path)
1317    }
1318
1319    /// Verify the fallback chain of all manifest items
1320    ///
1321    /// This function iterates through all manifest items with the fallback
1322    /// attribute and verifies the validity of their fallback chains, including checking:
1323    /// - Whether circular references exist
1324    /// - Whether the fallback resource exists in the manifest
1325    ///
1326    /// ## Notes
1327    /// If an invalid fallback chain is found, a warning log will be logged
1328    /// but the processing flow will not be interrupted.
1329    fn validate_fallback_chains(&self) {
1330        for (id, item) in &self.manifest {
1331            if item.fallback.is_none() {
1332                continue;
1333            }
1334
1335            let mut fallback_chain = Vec::new();
1336            if let Err(msg) = self.validate_fallback_chain(id, &mut fallback_chain) {
1337                warn!("Invalid fallback chain for item {}: {}", id, msg);
1338            }
1339        }
1340    }
1341
1342    /// Recursively verify the validity of a single fallback chain
1343    ///
1344    /// This function recursively traces the fallback chain to check for the following issues:
1345    /// - Circular reference
1346    /// - The referenced fallback resource does not exist
1347    ///
1348    /// ## Parameters
1349    /// - `manifest_id`: The id of the manifest item currently being verified
1350    /// - `fallback_chain`: The visited fallback chain paths used to detect circular references
1351    ///
1352    /// ## Return
1353    /// - `Ok(())`: The fallback chain is valid
1354    /// - `Err(String)`: A string containing error information
1355    fn validate_fallback_chain(
1356        &self,
1357        manifest_id: &str,
1358        fallback_chain: &mut Vec<String>,
1359    ) -> Result<(), String> {
1360        if fallback_chain.contains(&manifest_id.to_string()) {
1361            fallback_chain.push(manifest_id.to_string());
1362
1363            return Err(format!(
1364                "Circular reference detected in fallback chain for {}",
1365                fallback_chain.join("->")
1366            ));
1367        }
1368
1369        // Get the current item; its existence can be ensured based on the calling context.
1370        let item = self.manifest.get(manifest_id).unwrap();
1371
1372        if let Some(fallback_id) = &item.fallback {
1373            if !self.manifest.contains_key(fallback_id) {
1374                return Err(format!(
1375                    "Fallback resource {} does not exist in manifest",
1376                    fallback_id
1377                ));
1378            }
1379
1380            fallback_chain.push(manifest_id.to_string());
1381            self.validate_fallback_chain(fallback_id, fallback_chain)
1382        } else {
1383            // The end of the fallback chain
1384            Ok(())
1385        }
1386    }
1387
1388    /// Checks if a resource at the specified path is an encrypted file
1389    ///
1390    /// This function queries whether a specific resource path is marked as an encrypted
1391    /// file in the EPUB encryption information. It checks the encrypted data stored in
1392    /// `self.encryption`, looking for an entry that matches the given path.
1393    ///
1394    /// ## Parameters
1395    /// - `path`: The path of the resource to check
1396    ///
1397    /// ## Return
1398    /// - `Some(String)`: The encryption method used for the resource
1399    /// - `None`: The resource is not encrypted
1400    fn is_encryption_file(&self, path: &str) -> Option<String> {
1401        self.encryption.as_ref().and_then(|encryptions| {
1402            encryptions
1403                .iter()
1404                .find(|encryption| encryption.data == path)
1405                .map(|encryption| encryption.method.clone())
1406        })
1407    }
1408
1409    /// Automatically decrypts encrypted resource data
1410    ///
1411    /// Automatically decrypts data based on the provided encryption method.
1412    /// This function supports various encryption methods defined by the EPUB
1413    /// specification, including font obfuscation and the XML encryption standard.
1414    ///
1415    /// ## Parameters
1416    /// - `method`: The encryption method used for the resource
1417    /// - `data`: The encrypted resource data
1418    ///
1419    /// ## Return
1420    /// - `Ok(Vec<u8>)`: The decrypted resource data
1421    /// - `Err(EpubError)`: Unsupported encryption method
1422    ///
1423    /// ## Supported Encryption Methods
1424    /// - IDPF font obfuscation: `http://www.idpf.org/2008/embedding`
1425    /// - Adobe font obfuscation: `http://ns.adobe.com/pdf/enc#RC`
1426    #[inline]
1427    fn auto_dencrypt(&self, method: &str, data: &mut [u8]) -> Result<Vec<u8>, EpubError> {
1428        match method {
1429            "http://www.idpf.org/2008/embedding" => {
1430                Ok(idpf_font_dencryption(data, &self.unique_identifier))
1431            }
1432            "http://ns.adobe.com/pdf/enc#RC" => {
1433                Ok(adobe_font_dencryption(data, &self.unique_identifier))
1434            }
1435            _ => Err(EpubError::UnsupportedEncryptedMethod {
1436                method: method.to_string(),
1437            }),
1438        }
1439    }
1440}
1441
1442impl EpubDoc<BufReader<File>> {
1443    /// Creates a new EPUB document instance
1444    ///
1445    /// This function is a convenience constructor for `EpubDoc`,
1446    /// used to create an EPUB parser instance directly from a file path.
1447    ///
1448    /// ## Parameters
1449    /// - `path`: The path to the EPUB file
1450    ///
1451    /// ## Return
1452    /// - `Ok(EpubDoc)`: The created EPUB document instance
1453    /// - `Err(EpubError)`: An error occurred during initialization
1454    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, EpubError> {
1455        let file = File::open(&path).map_err(EpubError::from)?;
1456        let path = canonicalize(path)?;
1457
1458        Self::from_reader(BufReader::new(file), path)
1459    }
1460}
1461
1462#[cfg(test)]
1463mod tests {
1464    use std::{
1465        fs::File,
1466        io::BufReader,
1467        path::{Path, PathBuf},
1468    };
1469
1470    use crate::{epub::EpubDoc, error::EpubError, utils::XmlReader};
1471
1472    /// Section 3.3 package documents
1473    mod package_documents_tests {
1474        use std::{path::Path, sync::atomic::Ordering};
1475
1476        use crate::epub::{EpubDoc, EpubVersion};
1477
1478        /// ID: pkg-collections-unknown
1479        ///
1480        /// The package document contains a collection with an unknown role. The reading system must open the EPUB successfully.
1481        #[test]
1482        fn test_pkg_collections_unknown() {
1483            let epub_file = Path::new("./test_case/pkg-collections-unknown.epub");
1484            let doc = EpubDoc::new(epub_file);
1485            assert!(doc.is_ok());
1486        }
1487
1488        /// ID: pkg-creator-order
1489        ///
1490        /// Several creators are listed in the package document. The reading system must not display them out of order (but it may display only the first).
1491        #[test]
1492        fn test_pkg_creator_order() {
1493            let epub_file = Path::new("./test_case/pkg-creator-order.epub");
1494            let doc = EpubDoc::new(epub_file);
1495            assert!(doc.is_ok());
1496
1497            let doc = doc.unwrap();
1498            let creators = doc.get_metadata_value("creator");
1499            assert!(creators.is_some());
1500
1501            let creators = creators.unwrap();
1502            assert_eq!(creators.len(), 5);
1503            assert_eq!(
1504                creators,
1505                vec![
1506                    "Dave Cramer",
1507                    "Wendy Reid",
1508                    "Dan Lazin",
1509                    "Ivan Herman",
1510                    "Brady Duga",
1511                ]
1512            );
1513        }
1514
1515        /// ID: pkg-manifest-unknown
1516        ///
1517        /// The package document contains a manifest item with unknown properties. The reading system must open the EPUB successfully.
1518        #[test]
1519        fn test_pkg_manifest_order() {
1520            let epub_file = Path::new("./test_case/pkg-manifest-unknown.epub");
1521            let doc = EpubDoc::new(epub_file);
1522            assert!(doc.is_ok());
1523
1524            let doc = doc.unwrap();
1525            assert_eq!(doc.manifest.len(), 2);
1526            assert!(doc.get_manifest_item("nav").is_ok());
1527            assert!(doc.get_manifest_item("content_001").is_ok());
1528            assert!(doc.get_manifest_item("content_002").is_err());
1529        }
1530
1531        /// ID: pkg-meta-unknown
1532        ///
1533        /// The package document contains a meta tag with an unknown property. The reading system must open the EPUB successfully.
1534        #[test]
1535        fn test_pkg_meta_unknown() {
1536            let epub_file = Path::new("./test_case/pkg-meta-unknown.epub");
1537            let doc = EpubDoc::new(epub_file);
1538            assert!(doc.is_ok());
1539
1540            let doc = doc.unwrap();
1541            let value = doc.get_metadata_value("dcterms:isReferencedBy");
1542            assert!(value.is_some());
1543            let value = value.unwrap();
1544            assert_eq!(value.len(), 1);
1545            assert_eq!(
1546                value,
1547                vec!["https://www.w3.org/TR/epub-rs/#confreq-rs-pkg-meta-unknown"]
1548            );
1549
1550            let value = doc.get_metadata_value("dcterms:modified");
1551            assert!(value.is_some());
1552            let value = value.unwrap();
1553            assert_eq!(value.len(), 1);
1554            assert_eq!(value, vec!["2021-01-11T00:00:00Z"]);
1555
1556            let value = doc.get_metadata_value("dcterms:title");
1557            assert!(value.is_none());
1558        }
1559
1560        /// ID: pkg-meta-whitespace
1561        ///
1562        /// The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases.
1563        #[test]
1564        fn test_pkg_meta_white_space() {
1565            let epub_file = Path::new("./test_case/pkg-meta-whitespace.epub");
1566            let doc = EpubDoc::new(epub_file);
1567            assert!(doc.is_ok());
1568
1569            let doc = doc.unwrap();
1570            let value = doc.get_metadata_value("creator");
1571            assert!(value.is_some());
1572            let value = value.unwrap();
1573            assert_eq!(value.len(), 1);
1574            assert_eq!(value, vec!["Dave Cramer"]);
1575
1576            let value = doc.get_metadata_value("description");
1577            assert!(value.is_some());
1578            let value = value.unwrap();
1579            assert_eq!(value.len(), 1);
1580            assert_eq!(
1581                value,
1582                vec![
1583                    "The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases."
1584                ]
1585            );
1586        }
1587
1588        /// ID: pkg-spine-duplicate-item-hyperlink
1589        ///
1590        /// The spine contains several references to the same content document. The reading system must move to the position of the first duplicate in the reading order when following a hyperlink.
1591        #[test]
1592        fn test_pkg_spine_duplicate_item_hyperlink() {
1593            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-hyperlink.epub");
1594            let doc = EpubDoc::new(epub_file);
1595            assert!(doc.is_ok());
1596
1597            let mut doc = doc.unwrap();
1598            assert_eq!(doc.spine.len(), 4);
1599            assert_eq!(
1600                doc.navigate_by_spine_index(0).unwrap(),
1601                doc.get_manifest_item("content_001").unwrap()
1602            );
1603            assert_eq!(
1604                doc.navigate_by_spine_index(1).unwrap(),
1605                doc.get_manifest_item("content_002").unwrap()
1606            );
1607            assert_eq!(
1608                doc.navigate_by_spine_index(2).unwrap(),
1609                doc.get_manifest_item("content_002").unwrap()
1610            );
1611            assert_eq!(
1612                doc.navigate_by_spine_index(3).unwrap(),
1613                doc.get_manifest_item("content_002").unwrap()
1614            );
1615        }
1616
1617        /// ID: pkg-spine-duplicate-item-rendering
1618        ///
1619        /// The spine contains several references to the same content document. The reading system must not skip the duplicates when rendering the reading order.
1620        #[test]
1621        fn test_pkg_spine_duplicate_item_rendering() {
1622            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-rendering.epub");
1623            let doc = EpubDoc::new(epub_file);
1624            assert!(doc.is_ok());
1625
1626            let mut doc = doc.unwrap();
1627            assert_eq!(doc.spine.len(), 4);
1628
1629            let result = doc.spine_prev();
1630            assert!(result.is_none());
1631
1632            let result = doc.spine_next();
1633            assert!(result.is_some());
1634
1635            doc.spine_next();
1636            doc.spine_next();
1637            let result = doc.spine_next();
1638            assert!(result.is_none());
1639        }
1640
1641        /// ID: pkg-spine-nonlinear-activation
1642        ///
1643        /// An itemref in the spine is marked as non-linear. Although it (possibly) cannot be accessed through the table of contents, it can be reached from a link in the XHTML content.
1644        #[test]
1645        fn test_pkg_spine_nonlinear_activation() {
1646            let epub_file = Path::new("./test_case/pkg-spine-nonlinear-activation.epub");
1647            let doc = EpubDoc::new(epub_file);
1648            assert!(doc.is_ok());
1649
1650            let mut doc = doc.unwrap();
1651            assert!(doc.spine_prev().is_none());
1652            assert!(doc.spine_next().is_none());
1653
1654            assert!(doc.navigate_by_spine_index(1).is_some());
1655            assert!(doc.spine_prev().is_none());
1656            assert!(doc.spine_next().is_none());
1657        }
1658
1659        /// ID: pkg-spine-order
1660        ///
1661        /// Basic test of whether a reading system can display spine items in the correct order. The test fails if the reading system presents content in the order in which the file names sort, or if it presents files in manifest order rather than spine order.
1662        #[test]
1663        fn test_pkg_spine_order() {
1664            let epub_file = Path::new("./test_case/pkg-spine-order.epub");
1665            let doc = EpubDoc::new(epub_file);
1666            assert!(doc.is_ok());
1667
1668            let doc = doc.unwrap();
1669            assert_eq!(doc.spine.len(), 4);
1670            assert_eq!(
1671                doc.spine
1672                    .iter()
1673                    .map(|item| item.idref.clone())
1674                    .collect::<Vec<String>>(),
1675                vec![
1676                    "d-content_001",
1677                    "c-content_002",
1678                    "b-content_003",
1679                    "a-content_004",
1680                ]
1681            );
1682        }
1683
1684        /// ID: pkg-spine-order-svg
1685        ///
1686        /// Basic test of whether a reading system can display SVG spine items in the correct order.
1687        #[test]
1688        fn test_spine_order_svg() {
1689            let epub_file = Path::new("./test_case/pkg-spine-order-svg.epub");
1690            let doc = EpubDoc::new(epub_file);
1691            assert!(doc.is_ok());
1692
1693            let mut doc = doc.unwrap();
1694            assert_eq!(doc.spine.len(), 4);
1695
1696            loop {
1697                if let Some(spine) = doc.spine_next() {
1698                    let idref = doc.spine[doc.current_spine_index.load(Ordering::Relaxed)]
1699                        .idref
1700                        .clone();
1701                    let resource = doc.get_manifest_item(&idref);
1702                    assert!(resource.is_ok());
1703
1704                    let resource = resource.unwrap();
1705                    assert_eq!(spine, resource);
1706                } else {
1707                    break;
1708                }
1709            }
1710
1711            assert_eq!(doc.current_spine_index.load(Ordering::Relaxed), 3);
1712        }
1713
1714        /// ID: pkg-spine-unknown
1715        ///
1716        /// The package document contains a spine item with unknown properties. The reading system must open the EPUB successfully.
1717        #[test]
1718        fn test_pkg_spine_unknown() {
1719            let epub_file = Path::new("./test_case/pkg-spine-unknown.epub");
1720            let doc = EpubDoc::new(epub_file);
1721            assert!(doc.is_ok());
1722
1723            let doc = doc.unwrap();
1724            assert_eq!(doc.spine.len(), 1);
1725            assert_eq!(doc.spine[0].idref, "content_001");
1726            assert_eq!(doc.spine[0].id, None);
1727            assert_eq!(doc.spine[0].linear, true);
1728            assert_eq!(doc.spine[0].properties, Some("untrustworthy".to_string()));
1729        }
1730
1731        /// ID: pkg-title-order
1732        ///
1733        /// Several titles are listed in the package document. The reading system must use the first title (and whether to use other titles is not defined).
1734        #[test]
1735        fn test_pkg_title_order() {
1736            let epub_file = Path::new("./test_case/pkg-title-order.epub");
1737            let doc = EpubDoc::new(epub_file);
1738            assert!(doc.is_ok());
1739
1740            let doc = doc.unwrap();
1741            let title_list = doc.get_title();
1742            assert!(title_list.is_ok());
1743
1744            let title_list = title_list.unwrap();
1745            assert_eq!(title_list.len(), 6);
1746            assert_eq!(
1747                title_list,
1748                vec![
1749                    "pkg-title-order",
1750                    "This title must not display first",
1751                    "Also, this title must not display first",
1752                    "This title also must not display first",
1753                    "This title must also not display first",
1754                    "This title must not display first, also",
1755                ]
1756            );
1757        }
1758
1759        /// ID: pkg-unique-id
1760        ///
1761        /// The package document's dc:identifier is identical across two publications. The reading system should display both publications independently.
1762        #[test]
1763        fn test_pkg_unique_id() {
1764            let epub_file = Path::new("./test_case/pkg-unique-id.epub");
1765            let doc_1 = EpubDoc::new(epub_file);
1766            assert!(doc_1.is_ok());
1767
1768            let epub_file = Path::new("./test_case/pkg-unique-id_duplicate.epub");
1769            let doc_2 = EpubDoc::new(epub_file);
1770            assert!(doc_2.is_ok());
1771
1772            let doc_1 = doc_1.unwrap();
1773            let doc_2 = doc_2.unwrap();
1774
1775            assert_eq!(
1776                doc_1.get_identifier().unwrap(),
1777                doc_2.get_identifier().unwrap()
1778            );
1779            assert_eq!(doc_1.unique_identifier, "pkg-unique-id");
1780            assert_eq!(doc_2.unique_identifier, "pkg-unique-id");
1781        }
1782
1783        /// ID: pkg-version-backward
1784        ///
1785        /// “Reading Systems MUST attempt to process an EPUB Publication whose Package Document version attribute is less than "3.0"”. This is an EPUB with package version attribute set to "0", to see if a reading system will open it.
1786        #[test]
1787        fn test_pkg_version_backward() {
1788            let epub_file = Path::new("./test_case/pkg-version-backward.epub");
1789            let doc = EpubDoc::new(epub_file);
1790            assert!(doc.is_ok());
1791
1792            let doc = doc.unwrap();
1793            assert_eq!(doc.version, EpubVersion::Version3_0);
1794        }
1795
1796        /// ID: pkg-linked-records
1797        ///
1798        /// Reading System must process and display the title and creator metadata from the package document. An ONIX 3.0 format linked metadata record exists, but contains neither title nor creator metadata.
1799        #[test]
1800        fn test_pkg_linked_records() {
1801            let epub_file = Path::new("./test_case/pkg-linked-records.epub");
1802            let doc = EpubDoc::new(epub_file);
1803            assert!(doc.is_ok());
1804
1805            let doc = doc.unwrap();
1806            assert_eq!(doc.metadata_link.len(), 3);
1807
1808            let item = doc.metadata_link.iter().find(|&item| {
1809                if let Some(properties) = &item.properties {
1810                    properties.eq("onix")
1811                } else {
1812                    false
1813                }
1814            });
1815            assert!(item.is_some());
1816        }
1817
1818        /// ID: pkg-manifest-unlisted-resource
1819        ///
1820        /// The XHTML content references an image that does not appear in the manifest. The image should not be shown.
1821        #[test]
1822        fn test_pkg_manifest_unlisted_resource() {
1823            let epub_file = Path::new("./test_case/pkg-manifest-unlisted-resource.epub");
1824            let doc = EpubDoc::new(epub_file);
1825            assert!(doc.is_ok());
1826
1827            let doc = doc.unwrap();
1828            assert!(
1829                doc.get_manifest_item_by_path("EPUB/content_001.xhtml")
1830                    .is_ok()
1831            );
1832
1833            assert!(doc.get_manifest_item_by_path("EPUB/red.png").is_err());
1834            let err = doc.get_manifest_item_by_path("EPUB/red.png").unwrap_err();
1835            assert_eq!(
1836                err.to_string(),
1837                "Resource not found: Unable to find resource from \"EPUB/red.png\"."
1838            );
1839        }
1840    }
1841
1842    /// Section 3.4 manifest fallbacks
1843    ///
    /// The tests in this module appear to target reading-system behavior rather than the EPUB format itself.
1845    mod manifest_fallbacks_tests {
1846        use std::path::Path;
1847
1848        use crate::epub::EpubDoc;
1849
1850        /// ID: pub-foreign_bad-fallback
1851        ///
1852        /// This is a test of manifest fallbacks where both the spine item and the fallback are likely to be unsupported. The spine item is a DMG, with a fallback to a PSD file. Reading systems may raise an error on the ingenstion workflow.
1853        #[test]
1854        fn test_pub_foreign_bad_fallback() {
1855            let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
1856            let doc = EpubDoc::new(epub_file);
1857            assert!(doc.is_ok());
1858
1859            let doc = doc.unwrap();
1860            assert!(doc.get_manifest_item("content_001").is_ok());
1861            assert!(doc.get_manifest_item("bar").is_ok());
1862
1863            assert_eq!(
1864                doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
1865                    .unwrap_err()
1866                    .to_string(),
1867                "No supported file format: The fallback resource does not contain the file format you support."
1868            );
1869        }
1870
1871        /// ID: pub-foreign_image
1872        ///
1873        /// An HTML content file contains a PSD image, with a manifest fallback to a PNG image. This tests fallbacks for resources that are not in the spine.
1874        #[test]
1875        fn test_pub_foreign_image() {
1876            let epub_file = Path::new("./test_case/pub-foreign_image.epub");
1877            let doc = EpubDoc::new(epub_file);
1878            assert!(doc.is_ok());
1879
1880            let doc = doc.unwrap();
1881            let result = doc.get_manifest_item_with_fallback(
1882                "image-tiff",
1883                vec!["image/png", "application/xhtml+xml"],
1884            );
1885            assert!(result.is_ok());
1886
1887            let (_, mime) = result.unwrap();
1888            assert_eq!(mime, "image/png");
1889        }
1890
1891        /// ID: pub-foreign_json-spine
1892        ///
1893        /// This EPUB uses a JSON content file in the spine, with a manifest fallback to an HTML document. If the reading system does not support JSON, it should display the HTML.
1894        #[test]
1895        fn test_pub_foreign_json_spine() {
1896            let epub_file = Path::new("./test_case/pub-foreign_json-spine.epub");
1897            let doc = EpubDoc::new(epub_file);
1898            assert!(doc.is_ok());
1899
1900            let doc = doc.unwrap();
1901            let result = doc.get_manifest_item_with_fallback(
1902                "content_primary",
1903                vec!["application/xhtml+xml", "application/json"],
1904            );
1905            assert!(result.is_ok());
1906            let (_, mime) = result.unwrap();
1907            assert_eq!(mime, "application/json");
1908
1909            let result = doc
1910                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1911            assert!(result.is_ok());
1912            let (_, mime) = result.unwrap();
1913            assert_eq!(mime, "application/xhtml+xml");
1914        }
1915
1916        /// ID: pub-foreign_xml-spine
1917        ///
1918        /// This EPUB uses an ordinary XML content file with mimetype application/xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1919        #[test]
1920        fn test_pub_foreign_xml_spine() {
1921            let epub_file = Path::new("./test_case/pub-foreign_xml-spine.epub");
1922            let doc = EpubDoc::new(epub_file);
1923            assert!(doc.is_ok());
1924
1925            let doc = doc.unwrap();
1926            let result = doc.get_manifest_item_with_fallback(
1927                "content_primary",
1928                vec!["application/xhtml+xml", "application/xml"],
1929            );
1930            assert!(result.is_ok());
1931            let (_, mime) = result.unwrap();
1932            assert_eq!(mime, "application/xml");
1933
1934            let result = doc
1935                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1936            assert!(result.is_ok());
1937            let (_, mime) = result.unwrap();
1938            assert_eq!(mime, "application/xhtml+xml");
1939        }
1940
1941        /// ID: pub-foreign_xml-suffix-spine
1942        ///
1943        /// This EPUB uses an custom XML content file with mimetype application/dtc+xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1944        #[test]
1945        fn test_pub_foreign_xml_suffix_spine() {
1946            let epub_file = Path::new("./test_case/pub-foreign_xml-suffix-spine.epub");
1947            let doc = EpubDoc::new(epub_file);
1948            assert!(doc.is_ok());
1949
1950            let doc = doc.unwrap();
1951            let result = doc.get_manifest_item_with_fallback(
1952                "content_primary",
1953                vec!["application/xhtml+xml", "application/dtc+xml"],
1954            );
1955            assert!(result.is_ok());
1956            let (_, mime) = result.unwrap();
1957            assert_eq!(mime, "application/dtc+xml");
1958
1959            let result = doc
1960                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1961            assert!(result.is_ok());
1962            let (_, mime) = result.unwrap();
1963            assert_eq!(mime, "application/xhtml+xml");
1964        }
1965    }
1966
1967    /// Section 3.9 open container format
1968    mod open_container_format_tests {
1969        use std::{cmp::min, io::Read, path::Path};
1970
1971        use sha1::{Digest, Sha1};
1972
1973        use crate::epub::EpubDoc;
1974
1975        /// ID: ocf-metainf-inc
1976        ///
1977        /// An extra configuration file, not in the reserved files' list, is added to the META-INF folder; this file must be ignored.
1978        #[test]
1979        fn test_ocf_metainf_inc() {
1980            let epub_file = Path::new("./test_case/ocf-metainf-inc.epub");
1981            let doc = EpubDoc::new(epub_file);
1982            assert!(doc.is_ok());
1983        }
1984
1985        /// ID: ocf-metainf-manifest
1986        ///
1987        /// An ancillary manifest file, containing an extra spine item, is present in the META-INF directory; this extra item must be ignored by the reading system.
1988        #[test]
1989        fn test_ocf_metainf_manifest() {
1990            let epub_file = Path::new("./test_case/ocf-metainf-manifest.epub");
1991            let doc = EpubDoc::new(epub_file);
1992            assert!(doc.is_ok());
1993        }
1994
1995        /// ID: ocf-package_arbitrary
1996        ///
1997        /// The EPUB contains three valid package files and three corresponding sets of content documents, but only one of the packages, in an unusual subdirectory, is referenced by the container.xml file. The reading system must use this package.
1998        #[test]
1999        fn test_ocf_package_arbitrary() {
2000            let epub_file = Path::new("./test_case/ocf-package_arbitrary.epub");
2001            let doc = EpubDoc::new(epub_file);
2002            assert!(doc.is_ok());
2003
2004            let doc = doc.unwrap();
2005            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
2006        }
2007
2008        /// ID: ocf-package_multiple
2009        ///
2010        /// The EPUB contains three valid package files and three corresponding sets of content documents, all referenced by the container.xml file. The reading system must use the first package.
2011        #[test]
2012        fn test_ocf_package_multiple() {
2013            let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
2014            let doc = EpubDoc::new(epub_file);
2015            assert!(doc.is_ok());
2016
2017            let doc = doc.unwrap();
2018            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
2019            assert_eq!(doc.base_path, Path::new("FOO/BAR"));
2020        }
2021
2022        /// ID: ocf-url_link-leaking-relative
2023        ///
2024        /// Use a relative link with several double-dot path segments from the content to a photograph. The folder hierarchy containing the photograph starts at the root level; the relative image reference exceeds depth of hierarchy.
2025        #[test]
2026        fn test_ocf_url_link_leaking_relative() {
2027            let epub_file = Path::new("./test_case/ocf-url_link-leaking-relative.epub");
2028            let doc = EpubDoc::new(epub_file);
2029            assert!(doc.is_err());
2030            assert_eq!(
2031                doc.err().unwrap().to_string(),
2032                String::from(
2033                    "Relative link leakage: Path \"../../../../media/imgs/monastery.jpg\" is out of container range."
2034                )
2035            )
2036        }
2037
2038        /// ID: ocf-url_link-path-absolute
2039        ///
2040        /// Use a path-absolute link, i.e., beginning with a leading slash, from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2041        #[test]
2042        fn test_ocf_url_link_path_absolute() {
2043            let epub_file = Path::new("./test_case/ocf-url_link-path-absolute.epub");
2044            let doc = EpubDoc::new(epub_file);
2045            assert!(doc.is_ok());
2046
2047            let doc = doc.unwrap();
2048            let resource = doc.manifest.get("photo").unwrap();
2049            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2050        }
2051
2052        /// ID: ocf-url_link-relative
2053        ///
2054        /// A simple relative link from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2055        #[test]
2056        fn test_ocf_url_link_relative() {
2057            let epub_file = Path::new("./test_case/ocf-url_link-relative.epub");
2058            let doc = EpubDoc::new(epub_file);
2059            assert!(doc.is_ok());
2060
2061            let doc = doc.unwrap();
2062            let resource = doc.manifest.get("photo").unwrap();
2063            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2064        }
2065
2066        /// ID: ocf-url_manifest
2067        ///
2068        /// The manifest refers to an XHTML file in an arbitrary subfolder. The reading system must be able to find the content.
2069        #[test]
2070        fn test_ocf_url_manifest() {
2071            let epub_file = Path::new("./test_case/ocf-url_manifest.epub");
2072            let doc = EpubDoc::new(epub_file);
2073            assert!(doc.is_ok());
2074
2075            let doc = doc.unwrap();
2076            assert!(doc.get_manifest_item("nav").is_ok());
2077            assert!(doc.get_manifest_item("content_001").is_ok());
2078            assert!(doc.get_manifest_item("content_002").is_err());
2079        }
2080
2081        /// ID: ocf-url_relative
2082        ///
2083        /// The manifest refers to an XHTML file in an arbitrary subfolder that is relative to the package's own arbitrary folder. The reading system must be able to find the content.
2084        #[test]
2085        fn test_ocf_url_relative() {
2086            let epub_file = Path::new("./test_case/ocf-url_relative.epub");
2087            let doc = EpubDoc::new(epub_file);
2088            assert!(doc.is_ok());
2089
2090            let doc = doc.unwrap();
2091            assert_eq!(doc.package_path, Path::new("foo/BAR/baz.opf"));
2092            assert_eq!(doc.base_path, Path::new("foo/BAR"));
2093            assert_eq!(
2094                doc.manifest.get("nav").unwrap().path,
2095                Path::new("foo/BAR/nav.xhtml")
2096            );
2097            assert_eq!(
2098                doc.manifest.get("content_001").unwrap().path,
2099                Path::new("foo/BAR/qux/content_001.xhtml")
2100            );
2101            assert!(doc.get_manifest_item("nav").is_ok());
2102            assert!(doc.get_manifest_item("content_001").is_ok());
2103        }
2104
2105        /// ID: ocf-zip-comp
2106        ///
2107        /// MUST treat any OCF ZIP container that uses compression techniques other than Deflate as in error.
2108        /// This test case does not use compression methods other than Deflate and cannot detect whether it is effective.
2109        #[test]
2110        fn test_ocf_zip_comp() {
2111            let epub_file = Path::new("./test_case/ocf-zip-comp.epub");
2112            let doc = EpubDoc::new(epub_file);
2113            assert!(doc.is_ok());
2114        }
2115
2116        /// ID: ocf-zip-mult
2117        ///
2118        /// MUST treat any OCF ZIP container that splits the content into segments as in error.
2119        /// This test case is not a segmented OCF ZIP container and cannot be tested to see if it is valid.
2120        #[test]
2121        fn test_ocf_zip_mult() {
2122            let epub_file = Path::new("./test_case/ocf-zip-mult.epub");
2123            let doc = EpubDoc::new(epub_file);
2124            assert!(doc.is_ok());
2125        }
2126
2127        /// ID: ocf-font_obfuscation
2128        ///
2129        /// An obfuscated (TrueType) font should be displayed after de-obfuscation.
2130        #[test]
2131        fn test_ocf_font_obfuscation() {
2132            let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2133            let doc = EpubDoc::new(epub_file);
2134            assert!(doc.is_ok());
2135
2136            let doc = doc.unwrap();
2137            let unique_id = doc.unique_identifier.clone();
2138
2139            let mut hasher = Sha1::new();
2140            hasher.update(unique_id.as_bytes());
2141            let hash = hasher.finalize();
2142            let mut key = vec![0u8; 1040];
2143            for i in 0..1040 {
2144                key[i] = hash[i % hash.len()];
2145            }
2146
2147            assert!(doc.encryption.is_some());
2148            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2149
2150            let data = &doc.encryption.unwrap()[0];
2151            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2152
2153            let font_file = doc
2154                .archive
2155                .lock()
2156                .unwrap()
2157                .by_name(&data.data)
2158                .unwrap()
2159                .bytes()
2160                .collect::<Result<Vec<u8>, _>>();
2161            assert!(font_file.is_ok());
2162            let font_file = font_file.unwrap();
2163
2164            // 根据EPUB规范，字体混淆是直接对字体文件进行的，不需要解压步骤，直接进行去混淆处理
2165            let mut deobfuscated = font_file.clone();
2166            for i in 0..min(1040, deobfuscated.len()) {
2167                deobfuscated[i] ^= key[i];
2168            }
2169
2170            assert!(is_valid_font(&deobfuscated));
2171        }
2172
2173        /// ID: ocf-font_obfuscation-bis
2174        ///
2175        /// An obfuscated (TrueType) font should not be displayed after de-obfuscation, because the obfuscation used a different publication id.
2176        #[test]
2177        fn test_ocf_font_obfuscation_bis() {
2178            let epub_file = Path::new("./test_case/ocf-font_obfuscation_bis.epub");
2179            let doc = EpubDoc::new(epub_file);
2180            assert!(doc.is_ok());
2181
2182            let doc = doc.unwrap();
2183
2184            let wrong_unique_id = "wrong-publication-id";
2185            let mut hasher = Sha1::new();
2186            hasher.update(wrong_unique_id.as_bytes());
2187            let hash = hasher.finalize();
2188            let mut wrong_key = vec![0u8; 1040];
2189            for i in 0..1040 {
2190                wrong_key[i] = hash[i % hash.len()];
2191            }
2192
2193            assert!(doc.encryption.is_some());
2194            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2195
2196            let data = &doc.encryption.unwrap()[0];
2197            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2198
2199            let font_file = doc
2200                .archive
2201                .lock()
2202                .unwrap()
2203                .by_name(&data.data)
2204                .unwrap()
2205                .bytes()
2206                .collect::<Result<Vec<u8>, _>>();
2207            assert!(font_file.is_ok());
2208            let font_file = font_file.unwrap();
2209
2210            // 使用错误的密钥进行去混淆
2211            let mut deobfuscated_with_wrong_key = font_file.clone();
2212            for i in 0..std::cmp::min(1040, deobfuscated_with_wrong_key.len()) {
2213                deobfuscated_with_wrong_key[i] ^= wrong_key[i];
2214            }
2215
2216            assert!(!is_valid_font(&deobfuscated_with_wrong_key));
2217        }
2218
2219        fn is_valid_font(data: &[u8]) -> bool {
2220            if data.len() < 4 {
2221                return false;
2222            }
2223            let sig = &data[0..4];
2224            // OTF: "OTTO"
2225            // TTF: 0x00010000, 0x00020000, "true", "typ1"
2226            sig == b"OTTO"
2227                || sig == b"\x00\x01\x00\x00"
2228                || sig == b"\x00\x02\x00\x00"
2229                || sig == b"true"
2230                || sig == b"typ1"
2231        }
2232    }
2233
2234    #[test]
2235    fn test_parse_container() {
2236        let epub_file = Path::new("./test_case/ocf-zip-mult.epub");
2237        let doc = EpubDoc::new(epub_file);
2238        assert!(doc.is_ok());
2239
2240        // let doc = doc.unwrap();
2241        let container = r#"
2242        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
2243            <rootfiles></rootfiles>
2244        </container>
2245        "#
2246        .to_string();
2247
2248        let result = EpubDoc::<BufReader<File>>::parse_container(container);
2249        assert!(result.is_err());
2250        assert_eq!(
2251            result.unwrap_err(),
2252            EpubError::NonCanonicalFile {
2253                tag: "rootfile".to_string()
2254            }
2255        );
2256
2257        let container = r#"
2258        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
2259            <rootfiles>
2260                <rootfile media-type="application/oebps-package+xml"/>
2261            </rootfiles>
2262        </container>
2263        "#
2264        .to_string();
2265
2266        let result = EpubDoc::<BufReader<File>>::parse_container(container);
2267        assert!(result.is_err());
2268        assert_eq!(
2269            result.unwrap_err(),
2270            EpubError::MissingRequiredAttribute {
2271                tag: "rootfile".to_string(),
2272                attribute: "full-path".to_string(),
2273            }
2274        );
2275
2276        let container = r#"
2277        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
2278            <rootfiles>
2279                <rootfile media-type="application/oebps-package+xml" full-path="EPUB/content.opf"/>
2280            </rootfiles>
2281        </container>
2282        "#
2283        .to_string();
2284
2285        let result = EpubDoc::<BufReader<File>>::parse_container(container);
2286        assert!(result.is_ok());
2287        assert_eq!(result.unwrap(), PathBuf::from("EPUB/content.opf"))
2288    }
2289
2290    #[test]
2291    fn test_parse_manifest() {
2292        let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
2293        let doc = EpubDoc::new(epub_file);
2294        assert!(doc.is_ok());
2295
2296        let manifest = r#"
2297        <manifest>
2298            <item href="content_001.xhtml" media-type="application/xhtml+xml"/>
2299            <item properties="nav" href="nav.xhtml" media-type="application/xhtml+xml"/>
2300        </manifest>
2301        "#;
2302        let mut doc = doc.unwrap();
2303        let element = XmlReader::parse(manifest);
2304        assert!(element.is_ok());
2305
2306        let element = element.unwrap();
2307        let result = doc.parse_manifest(&element);
2308        assert!(result.is_err());
2309        assert_eq!(
2310            result.unwrap_err(),
2311            EpubError::MissingRequiredAttribute {
2312                tag: "item".to_string(),
2313                attribute: "id".to_string(),
2314            },
2315        );
2316
2317        let manifest = r#"
2318        <manifest>
2319            <item id="content_001" media-type="application/xhtml+xml"/>
2320            <item id="nav" properties="nav" media-type="application/xhtml+xml"/>
2321        </manifest>
2322        "#;
2323        let element = XmlReader::parse(manifest);
2324        assert!(element.is_ok());
2325
2326        let element = element.unwrap();
2327        let result = doc.parse_manifest(&element);
2328        assert!(result.is_err());
2329        assert_eq!(
2330            result.unwrap_err(),
2331            EpubError::MissingRequiredAttribute {
2332                tag: "item".to_string(),
2333                attribute: "href".to_string(),
2334            },
2335        );
2336
2337        let manifest = r#"
2338        <manifest>
2339            <item id="content_001" href="content_001.xhtml"/>
2340            <item id="nav" properties="nav" href="nav.xhtml"/>
2341        </manifest>
2342        "#;
2343        let element = XmlReader::parse(manifest);
2344        assert!(element.is_ok());
2345
2346        let element = element.unwrap();
2347        let result = doc.parse_manifest(&element);
2348        assert!(result.is_err());
2349        assert_eq!(
2350            result.unwrap_err(),
2351            EpubError::MissingRequiredAttribute {
2352                tag: "item".to_string(),
2353                attribute: "media-type".to_string(),
2354            },
2355        );
2356
2357        let manifest = r#"
2358        <manifest>
2359            <item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
2360            <item id="nav" properties="nav" href="nav.xhtml" media-type="application/xhtml+xml"/>
2361        </manifest>
2362        "#;
2363        let element = XmlReader::parse(manifest);
2364        assert!(element.is_ok());
2365
2366        let element = element.unwrap();
2367        let result = doc.parse_manifest(&element);
2368        assert!(result.is_ok());
2369    }
2370
2371    /// Test for function `has_encryption`
2372    #[test]
2373    fn test_fn_has_encryption() {
2374        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2375        let doc = EpubDoc::new(epub_file);
2376        assert!(doc.is_ok());
2377
2378        let doc = doc.unwrap();
2379        assert!(doc.has_encryption());
2380    }
2381
2382    /// This test is used to detect whether the "META-INF/encryption.xml" file is parsed correctly
2383    #[test]
2384    fn test_fn_parse_encryption() {
2385        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2386        let doc = EpubDoc::new(epub_file);
2387        assert!(doc.is_ok());
2388
2389        let doc = doc.unwrap();
2390        assert!(doc.encryption.is_some());
2391
2392        let encryption = doc.encryption.unwrap();
2393        assert_eq!(encryption.len(), 1);
2394        assert_eq!(encryption[0].method, "http://www.idpf.org/2008/embedding");
2395        assert_eq!(encryption[0].data, "EPUB/fonts/Lobster.ttf");
2396    }
2397
2398    #[test]
2399    fn test_get_metadata_existing_key() {
2400        let epub_file = Path::new("./test_case/epub-33.epub");
2401        let doc = EpubDoc::new(epub_file);
2402        assert!(doc.is_ok());
2403
2404        let doc = doc.unwrap();
2405
2406        let titles = doc.get_metadata("title");
2407        assert!(titles.is_some());
2408
2409        let titles = titles.unwrap();
2410        assert_eq!(titles.len(), 1);
2411        assert_eq!(titles[0].property, "title");
2412        assert_eq!(titles[0].value, "EPUB 3.3");
2413
2414        let languages = doc.get_metadata("language");
2415        assert!(languages.is_some());
2416
2417        let languages = languages.unwrap();
2418        assert_eq!(languages.len(), 1);
2419        assert_eq!(languages[0].property, "language");
2420        assert_eq!(languages[0].value, "en-us");
2421
2422        let language = doc.get_language();
2423        assert!(language.is_ok());
2424        assert_eq!(language.unwrap(), vec!["en-us"]);
2425    }
2426
2427    #[test]
2428    fn test_get_metadata_nonexistent_key() {
2429        let epub_file = Path::new("./test_case/epub-33.epub");
2430        let doc = EpubDoc::new(epub_file);
2431        assert!(doc.is_ok());
2432
2433        let doc = doc.unwrap();
2434        let metadata = doc.get_metadata("nonexistent");
2435        assert!(metadata.is_none());
2436    }
2437
2438    #[test]
2439    fn test_get_metadata_multiple_items_same_type() {
2440        let epub_file = Path::new("./test_case/epub-33.epub");
2441        let doc = EpubDoc::new(epub_file);
2442        assert!(doc.is_ok());
2443
2444        let doc = doc.unwrap();
2445
2446        let creators = doc.get_metadata("creator");
2447        assert!(creators.is_some());
2448
2449        let creators = creators.unwrap();
2450        assert_eq!(creators.len(), 3);
2451
2452        assert_eq!(creators[0].id, Some("creator_id_0".to_string()));
2453        assert_eq!(creators[0].property, "creator");
2454        assert_eq!(creators[0].value, "Matt Garrish, DAISY Consortium");
2455
2456        assert_eq!(creators[1].id, Some("creator_id_1".to_string()));
2457        assert_eq!(creators[1].property, "creator");
2458        assert_eq!(creators[1].value, "Ivan Herman, W3C");
2459
2460        assert_eq!(creators[2].id, Some("creator_id_2".to_string()));
2461        assert_eq!(creators[2].property, "creator");
2462        assert_eq!(creators[2].value, "Dave Cramer, Invited Expert");
2463    }
2464
2465    #[test]
2466    fn test_get_metadata_with_refinement() {
2467        let epub_file = Path::new("./test_case/epub-33.epub");
2468        let doc = EpubDoc::new(epub_file);
2469        assert!(doc.is_ok());
2470
2471        let doc = doc.unwrap();
2472
2473        let title = doc.get_metadata("title");
2474        assert!(title.is_some());
2475
2476        let title = title.unwrap();
2477        assert_eq!(title.len(), 1);
2478        assert_eq!(title[0].refined.len(), 1);
2479        assert_eq!(title[0].refined[0].property, "title-type");
2480        assert_eq!(title[0].refined[0].value, "main");
2481    }
2482
2483    #[test]
2484    fn test_get_manifest_item_with_fallback() {
2485        let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
2486        let doc = EpubDoc::new(epub_file);
2487        assert!(doc.is_ok());
2488
2489        let doc = doc.unwrap();
2490        assert!(doc.get_manifest_item("content_001").is_ok());
2491        assert!(doc.get_manifest_item("bar").is_ok());
2492
2493        // 当回退链上存在可回退资源时能获取资源
2494        if let Ok((_, mime)) = doc.get_manifest_item_with_fallback("content_001", vec!["image/psd"])
2495        {
2496            assert_eq!(mime, "image/psd");
2497        } else {
2498            assert!(false, "get_manifest_item_with_fallback failed");
2499        }
2500
2501        // 当回退链上不存在可回退资源时无法获取资源
2502        assert_eq!(
2503            doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
2504                .unwrap_err()
2505                .to_string(),
2506            "No supported file format: The fallback resource does not contain the file format you support."
2507        );
2508    }
2509
2510    #[test]
2511    fn test_get_cover() {
2512        let epub_file = Path::new("./test_case/pkg-cover-image.epub");
2513        let doc = EpubDoc::new(epub_file);
2514        if let Err(err) = &doc {
2515            println!("{}", err);
2516        }
2517        assert!(doc.is_ok());
2518
2519        let doc = doc.unwrap();
2520        let result = doc.get_cover();
2521        assert!(result.is_some());
2522
2523        let (data, mime) = result.unwrap();
2524        assert_eq!(data.len(), 5785);
2525        assert_eq!(mime, "image/jpeg");
2526    }
2527
2528    #[test]
2529    fn test_epub_2() {
2530        let epub_file = Path::new("./test_case/epub-2.epub");
2531        let doc = EpubDoc::new(epub_file);
2532        assert!(doc.is_ok());
2533
2534        let doc = doc.unwrap();
2535
2536        let titles = doc.get_title();
2537        assert!(titles.is_ok());
2538        assert_eq!(titles.unwrap(), vec!["Minimal EPUB 2.0"]);
2539    }
2540}
lib_epub/epub.rs

lib_epub/
epub.rs