lib_epub/
epub.rs

1//! The core module of the EPUB parsing library
2//!
3//! This module provides complete parsing functionality for EPUB ebook files
4//! and is the core component of the entire library. The `EpubDoc` structure
5//! encapsulates all the parsing logic and data access interfaces for EPUB files.
6//!
7//! ## Main references to EPUB specs:
8//! - <https://www.w3.org/TR/epub-33>
9//! - <https://idpf.org/epub/201>
10//!
11//! ## Potential Issues
12//! - The generic parameter `R: Read + Seek` increases complexity, particularly
13//!   in asynchronous environments. The current design is not conducive to multi-threaded
14//!   concurrent access and requires an external synchronization mechanism.
15//! - Some error handling may not be sufficiently nuanced, and certain edge cases
16//!   may not be adequately considered.
17//! - Loading the entire EPUB document at once may result in significant memory consumption,
18//!   especially for large publications.
19//!
20//! ## Future Work
//! - Support more EPUB specification features, such as media overlays and scripting.
22
23use std::{
24    collections::HashMap,
25    fs::{File, canonicalize},
26    io::{BufReader, Read, Seek},
27    path::{Path, PathBuf},
28    sync::{
29        Arc, Mutex,
30        atomic::{AtomicUsize, Ordering},
31    },
32};
33
34use log::warn;
35use zip::{ZipArchive, result::ZipError};
36
37use crate::{
38    error::EpubError,
39    types::{
40        EncryptionData, EpubVersion, ManifestItem, MetadataItem, MetadataLinkItem,
41        MetadataRefinement, NavPoint, SpineItem,
42    },
43    utils::{
44        DecodeBytes, NormalizeWhitespace, XmlElement, XmlReader, adobe_font_dencryption,
45        check_realtive_link_leakage, compression_method_check, get_file_in_zip_archive,
46        idpf_font_dencryption,
47    },
48};
49
/// EPUB document parser, representing a loaded and parsed EPUB publication
///
/// `EpubDoc` is the central type of the library. It owns the underlying ZIP
/// archive and stores everything extracted from it while loading: metadata,
/// manifest, spine (reading order), table-of-contents navigation, and the
/// contents of `META-INF/encryption.xml` when that file is present.
///
/// Callers work with the parsed fields and accessor methods and do not need
/// to know the internal layout of the EPUB container.
///
/// ## Usage
///
/// ```rust
/// use lib_epub::epub::EpubDoc;
///
/// let doc = EpubDoc::new("./test_case/epub-33.epub");
/// assert!(doc.is_ok());
/// ```
///
/// ## Notes
/// - The `EpubDoc` structure is thread-safe **if and only if** the structure is immutable.
/// - Mutating an `EpubDoc` only changes the in-memory representation;
///   modifications are never written back to the EPUB file.
pub struct EpubDoc<R: Read + Seek> {
    /// Shared, mutex-guarded handle to the ZIP archive backing the publication
    pub(crate) archive: Arc<Mutex<ZipArchive<R>>>,

    /// The canonicalized path to the EPUB file
    pub(crate) epub_path: PathBuf,

    /// The path to the OPF package document, as declared by the `full-path`
    /// attribute of the `rootfile` element in `META-INF/container.xml`
    pub package_path: PathBuf,

    /// The directory that contains the OPF file (the parent of `package_path`)
    pub base_path: PathBuf,

    /// The EPUB version determined from the package document
    pub version: EpubVersion,

    /// The unique identifier of the publication
    ///
    /// This is the value of the `identifier` metadata item whose `id` matches the
    /// `unique-identifier` attribute of the package element. When the attribute
    /// is absent, the first `identifier` metadata item is used instead.
    pub unique_identifier: String,

    /// Metadata items extracted from the `<metadata>` element of the OPF file
    pub metadata: Vec<MetadataItem>,

    /// Metadata entries that point to external files
    pub metadata_link: Vec<MetadataLinkItem>,

    /// The resources declared in the `<manifest>` element, keyed by item `id`
    ///
    /// All resources in the epub file are declared here,
    /// and undeclared resources should not be stored in the epub file and cannot be obtained from it.
    pub manifest: HashMap<String, ManifestItem>,

    /// The reading order of the publication, extracted from the `<spine>` element
    ///
    /// Each entry references a manifest item through its `idref`; the order of
    /// the entries is the order in which the content documents should be displayed.
    pub spine: Vec<SpineItem>,

    /// Entries parsed from `META-INF/encryption.xml`
    ///
    /// `None` when the file does not exist or declares no `EncryptedData` entries.
    pub encryption: Option<Vec<EncryptionData>>,

    /// The navigation (table of contents) data of the publication
    pub catalog: Vec<NavPoint>,

    /// The title of the table of contents
    ///
    /// Taken from `docTitle` of the NCX file (EPUB 2) or from the heading of
    /// the `toc` navigation element (EPUB 3); empty when neither is present.
    pub catalog_title: String,

    /// The index of the spine item currently being read (starts at 0)
    current_spine_index: AtomicUsize,

    /// Whether `META-INF/encryption.xml` was found in the archive at load time
    has_encryption: bool,
}
129
130impl<R: Read + Seek> EpubDoc<R> {
131    /// Creates a new EPUB document instance from a reader
132    ///
133    /// This function is responsible for the core logic of parsing EPUB files,
134    /// including verifying the file format, parsing container information,
135    /// loading the OPF package document, and extracting metadata, manifest,
136    /// reading order, and other core information.
137    ///
138    /// ## Parameters
139    /// - `reader`: The data source that implements the `Read` and `Seek` traits,
140    ///   usually a file or memory buffer
141    /// - `epub_path`: The path to the EPUB file, used for path resolution and validation
142    ///
143    /// ## Return
144    /// - `Ok(EpubDoc<R>)`: The successfully parsed EPUB document object
145    /// - `Err(EpubError)`: Errors encountered during parsing
146    ///
147    /// ## Notes
148    /// - This function assumes the EPUB file structure is valid
149    pub fn from_reader(reader: R, epub_path: PathBuf) -> Result<Self, EpubError> {
150        // Parsing process
151        // 1. Verify that the ZIP compression method conforms to the EPUB specification
152        // 2. Parse `META-INF/container.xml` retrieves the location of the OPF file
153        // 3. Parses the OPF file to obtain package documentation information
154        // 4. Extracts version information
155        // 5. Parses metadata, manifest, and spine
156        // 6. Parses encrypted information and directory navigation
157        // 7. Verifies and extracts the unique identifier
158
159        let mut archive = ZipArchive::new(reader).map_err(EpubError::from)?;
160        let epub_path = canonicalize(epub_path)?;
161
162        compression_method_check(&mut archive)?;
163
164        let container =
165            get_file_in_zip_archive(&mut archive, "META-INF/container.xml")?.decode()?;
166        let package_path = Self::parse_container(container)?;
167        let base_path = package_path
168            .parent()
169            .expect("所有文件的父目录不能为空")
170            .to_path_buf();
171
172        let opf_file =
173            get_file_in_zip_archive(&mut archive, package_path.to_str().unwrap())?.decode()?;
174        let package = XmlReader::parse(&opf_file)?;
175
176        let version = Self::determine_epub_version(&package)?;
177        let has_encryption = archive
178            .by_path(Path::new("META-INF/encryption.xml"))
179            .is_ok();
180
181        let mut doc = Self {
182            archive: Arc::new(Mutex::new(archive)),
183            epub_path,
184            package_path,
185            base_path,
186            version,
187            unique_identifier: String::new(),
188            metadata: vec![],
189            metadata_link: vec![],
190            manifest: HashMap::new(),
191            spine: vec![],
192            encryption: None,
193            catalog: vec![],
194            catalog_title: String::new(),
195            current_spine_index: AtomicUsize::new(0),
196            has_encryption,
197        };
198
199        let metadata_element = package.find_elements_by_name("metadata").next().unwrap();
200        let manifest_element = package.find_elements_by_name("manifest").next().unwrap();
201        let spine_element = package.find_elements_by_name("spine").next().unwrap();
202
203        doc.parse_metadata(metadata_element)?;
204        doc.parse_manifest(manifest_element)?;
205        doc.parse_spine(spine_element)?;
206        doc.parse_encryption()?;
207        doc.parse_catalog()?;
208
209        // 断言必有唯一标识符
210        doc.unique_identifier = if let Some(uid) = package.get_attr("unique-identifier") {
211            doc.metadata.iter().find(|item| {
212                item.property == "identifier" && item.id.as_ref().is_some_and(|id| id == &uid)
213            })
214        } else {
215            doc.metadata
216                .iter()
217                .find(|item| item.property == "identifier")
218        }
219        .map(|item| item.value.clone())
220        .ok_or_else(|| EpubError::NonCanonicalFile { tag: "dc:identifier".to_string() })?;
221
222        Ok(doc)
223    }
224
225    /// Parse the EPUB container file (META-INF/container.xml)
226    ///
227    /// This function parses the container information in the EPUB file 、
228    /// to extract the path to the OPF package file. According to the EPUB
229    /// specification, the `container.xml` file must exist in the `META-INF`
230    /// directory and contain at least one `rootfile` element pointing to
231    /// the main OPF file. When multiple `rootfile` elements exist, the first
232    /// element pointing to the OPF file is used as the default.
233    ///
234    /// ## Parameters
235    /// - `data`: The content string of the container.xml
236    ///
237    /// ## Return
238    /// - `Ok(PathBuf)`: The path to the successfully parsed OPF file
239    /// - `Err(EpubError)`: Errors encountered during parsing
240    fn parse_container(data: String) -> Result<PathBuf, EpubError> {
241        let root = XmlReader::parse(&data)?;
242        let rootfile = root
243            .find_elements_by_name("rootfile")
244            .next()
245            .ok_or_else(|| EpubError::NonCanonicalFile { tag: "rootfile".to_string() })?;
246
247        let attr =
248            rootfile
249                .get_attr("full-path")
250                .ok_or_else(|| EpubError::MissingRequiredAttribute {
251                    tag: "rootfile".to_string(),
252                    attribute: "full-path".to_string(),
253                })?;
254
255        Ok(PathBuf::from(attr))
256    }
257
258    /// Parse the EPUB metadata section
259    ///
260    /// This function is responsible for parsing the `<metadata>` elements
261    /// in the OPF file to extract basic information about the publication.
262    /// It handles metadata elements from different namespaces:
263    /// - Elements in the Dublin Core namespace (`http://purl.org/dc/elements/1.1/`)
264    /// - Elements in the OPF namespace (`http://www.idpf.org/2007/opf`)
265    ///
266    /// ## Parameters
267    /// - `metadata_element`: A reference to the `<metadata>` element in the OPF file
268    fn parse_metadata(&mut self, metadata_element: &XmlElement) -> Result<(), EpubError> {
269        const DC_NAMESPACE: &str = "http://purl.org/dc/elements/1.1/";
270        const OPF_NAMESPACE: &str = "http://www.idpf.org/2007/opf";
271
272        let mut metadata = Vec::new();
273        let mut metadata_link = Vec::new();
274        let mut refinements = HashMap::<String, Vec<MetadataRefinement>>::new();
275
276        for element in metadata_element.children() {
277            match &element.namespace {
278                Some(namespace) if namespace == DC_NAMESPACE => {
279                    self.parse_dc_metadata(element, &mut metadata)?
280                }
281
282                Some(namespace) if namespace == OPF_NAMESPACE => self.parse_opf_metadata(
283                    element,
284                    &mut metadata,
285                    &mut metadata_link,
286                    &mut refinements,
287                )?,
288
289                _ => {}
290            };
291        }
292
293        for item in metadata.iter_mut() {
294            if let Some(id) = &item.id {
295                if let Some(refinements) = refinements.remove(id) {
296                    item.refined = refinements;
297                }
298            }
299        }
300
301        self.metadata = metadata;
302        self.metadata_link = metadata_link;
303        Ok(())
304    }
305
306    /// Parse the EPUB manifest section
307    ///
308    /// This function parses the `<manifest>` element in the OPF file, extracting
309    /// information about all resource files in the publication. Each resource contains
310    /// basic information such as id, file path, MIME type, as well as optional
311    /// attributes and fallback resource information.
312    ///
313    /// ## Parameters
314    /// - `manifest_element`: A reference to the `<manifest>` element in the OPF file
315    fn parse_manifest(&mut self, manifest_element: &XmlElement) -> Result<(), EpubError> {
316        let estimated_items = manifest_element.children().count();
317        let mut resources = HashMap::with_capacity(estimated_items);
318
319        for element in manifest_element.children() {
320            let id = element
321                .get_attr("id")
322                .ok_or_else(|| EpubError::MissingRequiredAttribute {
323                    tag: element.tag_name(),
324                    attribute: "id".to_string(),
325                })?
326                .to_string();
327            let path = element
328                .get_attr("href")
329                .ok_or_else(|| EpubError::MissingRequiredAttribute {
330                    tag: element.tag_name(),
331                    attribute: "href".to_string(),
332                })?
333                .to_string();
334            let mime = element
335                .get_attr("media-type")
336                .ok_or_else(|| EpubError::MissingRequiredAttribute {
337                    tag: element.tag_name(),
338                    attribute: "media-type".to_string(),
339                })?
340                .to_string();
341            let properties = element.get_attr("properties");
342            let fallback = element.get_attr("fallback");
343
344            resources.insert(
345                id.clone(),
346                ManifestItem {
347                    id,
348                    path: self.normalize_manifest_path(&path)?,
349                    mime,
350                    properties,
351                    fallback,
352                },
353            );
354        }
355
356        self.manifest = resources;
357        self.validate_fallback_chains();
358        Ok(())
359    }
360
361    /// Parse the EPUB spine section
362    ///
363    /// This function parses the `<spine>` elements in the OPF file to extract
364    /// the reading order information of the publication. The spine defines the
365    /// linear reading order of the publication's content documents, and each
366    /// spine item references resources in the manifest.
367    ///
368    /// ## Parameters
369    /// - `spine_element`: A reference to the `<spine>` element in the OPF file
370    fn parse_spine(&mut self, spine_element: &XmlElement) -> Result<(), EpubError> {
371        let mut spine = Vec::new();
372        for element in spine_element.children() {
373            let idref = element
374                .get_attr("idref")
375                .ok_or_else(|| EpubError::MissingRequiredAttribute {
376                    tag: element.tag_name(),
377                    attribute: "idref".to_string(),
378                })?
379                .to_string();
380            let id = element.get_attr("id");
381            let linear = element
382                .get_attr("linear")
383                .map(|linear| linear == "yes")
384                .unwrap_or(true);
385            let properties = element.get_attr("properties");
386
387            spine.push(SpineItem { idref, id, linear, properties });
388        }
389
390        self.spine = spine;
391        Ok(())
392    }
393
394    /// Parse the EPUB encryption file (META-INF/encryption.xml)
395    ///
396    /// This function is responsible for parsing the `encryption.xml` file
397    /// in the `META-INF` directory to extract information about encrypted
398    /// resources in the publication. According to the EPUB specification,
399    /// the encryption information describes which resources are encrypted
400    /// and the encryption methods used.
401    ///
402    /// TODO: 需要对使用非对称加密数据的加密项进行额外处理，以获取非对称加密密钥
403    fn parse_encryption(&mut self) -> Result<(), EpubError> {
404        if !self.has_encryption() {
405            return Ok(());
406        }
407
408        let mut archive = self.archive.lock()?;
409        let encryption_file =
410            get_file_in_zip_archive(&mut archive, "META-INF/encryption.xml")?.decode()?;
411
412        let root = XmlReader::parse(&encryption_file)?;
413
414        let mut encryption_data = Vec::new();
415        for data in root.children() {
416            if data.name != "EncryptedData" {
417                continue;
418            }
419
420            let method = data
421                .find_elements_by_name("EncryptionMethod")
422                .next()
423                .ok_or_else(|| EpubError::NonCanonicalFile {
424                    tag: "EncryptionMethod".to_string(),
425                })?;
426            let reference = data
427                .find_elements_by_name("CipherReference")
428                .next()
429                .ok_or_else(|| EpubError::NonCanonicalFile {
430                    tag: "CipherReference".to_string(),
431                })?;
432
433            encryption_data.push(EncryptionData {
434                method: method
435                    .get_attr("Algorithm")
436                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
437                        tag: "EncryptionMethod".to_string(),
438                        attribute: "Algorithm".to_string(),
439                    })?
440                    .to_string(),
441                data: reference
442                    .get_attr("URI")
443                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
444                        tag: "CipherReference".to_string(),
445                        attribute: "URI".to_string(),
446                    })?
447                    .to_string(),
448            });
449        }
450
451        if !encryption_data.is_empty() {
452            self.encryption = Some(encryption_data);
453        }
454
455        Ok(())
456    }
457
458    /// Parse the EPUB navigation information
459    ///
460    /// This function is responsible for parsing the navigation information of EPUB
461    /// publications. Different parsing strategies are used depending on the EPUB version:
462    /// - EPUB 2.0: Parses the NCX file to obtain directory information
463    /// - EPUB 3.0: Parses the Navigation Document (NAV) file to obtain directory information
464    fn parse_catalog(&mut self) -> Result<(), EpubError> {
465        const HEAD_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];
466
467        let mut archive = self.archive.lock()?;
468        match self.version {
469            EpubVersion::Version2_0 => {
470                let opf_file =
471                    get_file_in_zip_archive(&mut archive, self.package_path.to_str().unwrap())?
472                        .decode()?;
473                let opf_element = XmlReader::parse(&opf_file)?;
474
475                let toc_id = opf_element
476                    .find_children_by_name("spine")
477                    .next()
478                    .ok_or_else(|| EpubError::NonCanonicalFile { tag: "spine".to_string() })?
479                    .get_attr("toc")
480                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
481                        tag: "spine".to_string(),
482                        attribute: "toc".to_string(),
483                    })?
484                    .to_owned();
485                let toc_path = self
486                    .manifest
487                    .get(&toc_id)
488                    .ok_or(EpubError::ResourceIdNotExist { id: toc_id })?
489                    .path
490                    .to_str()
491                    .unwrap();
492
493                let ncx_file = get_file_in_zip_archive(&mut archive, toc_path)?.decode()?;
494                let ncx = XmlReader::parse(&ncx_file)?;
495
496                match ncx.find_elements_by_name("docTitle").next() {
497                    Some(element) => self.catalog_title = element.text(),
498                    None => warn!(
499                        "Expecting to get docTitle information from the ncx file, but it's missing."
500                    ),
501                };
502
503                let nav_map = ncx
504                    .find_elements_by_name("navMap")
505                    .next()
506                    .ok_or_else(|| EpubError::NonCanonicalFile { tag: "navMap".to_string() })?;
507
508                self.catalog = self.parse_nav_points(nav_map)?;
509
510                Ok(())
511            }
512
513            EpubVersion::Version3_0 => {
514                let nav_path = self
515                    .manifest
516                    .values()
517                    .find(|item| {
518                        if let Some(property) = &item.properties {
519                            return property.contains("nav");
520                        }
521                        false
522                    })
523                    .map(|item| item.path.clone())
524                    .ok_or_else(|| EpubError::NonCanonicalEpub {
525                        expected_file: "Navigation Document".to_string(),
526                    })?;
527
528                let nav_file =
529                    get_file_in_zip_archive(&mut archive, nav_path.to_str().unwrap())?.decode()?;
530
531                let nav_element = XmlReader::parse(&nav_file)?;
532                let nav = nav_element
533                    .find_elements_by_name("nav")
534                    .find(|&element| element.get_attr("epub:type") == Some(String::from("toc")))
535                    .ok_or_else(|| EpubError::NonCanonicalFile { tag: "nav".to_string() })?;
536                let nav_title = nav.find_children_by_names(&HEAD_TAGS).next();
537                let nav_list = nav
538                    .find_children_by_name("ol")
539                    .next()
540                    .ok_or_else(|| EpubError::NonCanonicalFile { tag: "ol".to_string() })?;
541
542                self.catalog = self.parse_catalog_list(nav_list)?;
543                if let Some(nav_title) = nav_title {
544                    self.catalog_title = nav_title.text();
545                };
546                Ok(())
547            }
548        }
549    }
550
    /// Check if the EPUB file contains `encryption.xml`
    ///
    /// Returns the flag that was recorded when the document was loaded: whether
    /// a `META-INF/encryption.xml` entry could be opened in the EPUB archive.
    /// According to the EPUB specification, encrypted resources must be declared
    /// in that file, so its presence indicates that the publication may contain
    /// encrypted resources.
    ///
    /// ## Return
    /// - `true` if `META-INF/encryption.xml` exists in the archive
    /// - `false` otherwise
    ///
    /// ## Notes
    /// - Only the existence of the file is reported; the validity of its contents
    ///   is not checked here, and the archive is not accessed again.
    pub fn has_encryption(&self) -> bool {
        self.has_encryption
    }
569
570    /// Retrieves a list of metadata items
571    ///
572    /// This function retrieves all matching metadata items from the EPUB metadata
573    /// based on the specified attribute name (key). Metadata items may come from
574    /// the DC (Dublin Core) namespace or the OPF namespace and contain basic
575    /// information about the publication, such as title, author, identifier, etc.
576    ///
577    /// ## Parameters
578    /// - `key`: The name of the metadata attribute to retrieve
579    ///
580    /// ## Return
581    /// - `Some(Vec<MetadataItem>)`: A vector containing all matching metadata items
582    /// - `None`: If no matching metadata items are found
583    pub fn get_metadata(&self, key: &str) -> Option<Vec<MetadataItem>> {
584        let metadatas = self
585            .metadata
586            .iter()
587            .filter(|item| item.property == key)
588            .cloned()
589            .collect::<Vec<MetadataItem>>();
590
591        (!metadatas.is_empty()).then_some(metadatas)
592    }
593
594    /// Retrieves a list of values for specific metadata items
595    ///
596    /// This function retrieves the values of all matching metadata items from
597    /// the EPUB metadata based on the given property name (key).
598    ///
599    /// ## Parameters
600    /// - `key`: The name of the metadata attribute to retrieve
601    ///
602    /// ## Return
603    /// - `Some(Vec<String>)`: A vector containing all matching metadata item values
604    /// - `None`: If no matching metadata items are found
605    pub fn get_metadata_value(&self, key: &str) -> Option<Vec<String>> {
606        let values = self
607            .metadata
608            .iter()
609            .filter(|item| item.property == key)
610            .map(|item| item.value.clone())
611            .collect::<Vec<String>>();
612
613        (!values.is_empty()).then_some(values)
614    }
615
616    /// Retrieves the title of the publication
617    ///
618    /// This function retrieves all title information from the EPUB metadata.
619    /// According to the EPUB specification, a publication can have multiple titles,
620    /// which are returned in the order they appear in the metadata.
621    ///
622    /// ## Return
623    /// - `Result<Vec<String>, EpubError>`: A vector containing all title information
624    /// - `EpubError`: If and only if the OPF file does not contain `<dc:title>`
625    ///
626    /// ## Notes
627    /// - The EPUB specification requires each publication to have at least one title.
628    pub fn get_title(&self) -> Result<Vec<String>, EpubError> {
629        self.get_metadata_value("title")
630            .ok_or_else(|| EpubError::NonCanonicalFile { tag: "title".to_string() })
631    }
632
633    /// Retrieves the language used in the publication
634    ///
635    /// This function retrieves the language information of a publication from the EPUB
636    /// metadata. According to the EPUB specification, language information identifies
637    /// the primary language of the publication and can have multiple language identifiers.
638    ///
639    /// ## Return
640    /// - `Ok(Vec<String>)`: A vector containing all language identifiers
641    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:language>`
642    ///
643    /// ## Notes
644    /// - The EPUB specification requires that each publication specify at least one primary language.
645    /// - Language identifiers should conform to RFC 3066 or later standards.
646    pub fn get_language(&self) -> Result<Vec<String>, EpubError> {
647        self.get_metadata_value("language")
648            .ok_or_else(|| EpubError::NonCanonicalFile { tag: "language".to_string() })
649    }
650
651    /// Retrieves the identifier of a publication
652    ///
653    /// This function retrieves the identifier information of a publication from
654    /// the EPUB metadata. According to the EPUB specification, each publication
655    /// must have a identifier, typically an ISBN, UUID, or other unique identifier.
656    ///
657    /// ## Return
658    /// - `Ok(Vec<String>)`: A vector containing all identifier information
659    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:identifier>`
660    ///
661    /// ## Notes
662    /// - The EPUB specification requires each publication to have at least one identifier.
663    /// - In the OPF file, the `unique-identifier` attribute of the `<package>` element
664    ///   should point to a `<dc:identifier>` element used to uniquely identify the publication.
665    ///   This means that `unique-identifier` is not exactly equal to `<dc:identifier>`.
666    pub fn get_identifier(&self) -> Result<Vec<String>, EpubError> {
667        self.get_metadata_value("identifier")
668            .ok_or_else(|| EpubError::NonCanonicalFile { tag: "identifier".to_string() })
669    }
670
671    /// Retrieve resource data by resource ID
672    ///
673    /// This function will find the resource with the specified ID in the manifest.
674    /// If the resource is encrypted, it will be automatically decrypted.
675    ///
676    /// ## Parameters
677    /// - `id`: The ID of the resource to retrieve
678    ///
679    /// ## Return
680    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
681    ///   the MIME type
682    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
683    ///
684    /// ## Notes
685    /// - This function will automatically decrypt the resource if it is encrypted.
686    /// - For unsupported encryption methods, the corresponding error will be returned.
687    pub fn get_manifest_item(&self, id: &str) -> Result<(Vec<u8>, String), EpubError> {
688        let resource_item = self
689            .manifest
690            .get(id)
691            .cloned()
692            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
693
694        let path = resource_item.path.to_str().unwrap();
695
696        let mut archive = self.archive.lock()?;
697        let mut data = match archive.by_name(path) {
698            Ok(mut file) => {
699                let mut entry = Vec::<u8>::new();
700                file.read_to_end(&mut entry)?;
701
702                Ok(entry)
703            }
704            Err(ZipError::FileNotFound) => {
705                Err(EpubError::ResourceNotFound { resource: path.to_string() })
706            }
707            Err(err) => Err(EpubError::from(err)),
708        }?;
709
710        if let Some(method) = self.is_encryption_file(path) {
711            data = self.auto_dencrypt(&method, &mut data)?;
712        }
713
714        Ok((data, resource_item.mime))
715    }
716
717    /// Retrieves resource item data by resource path
718    ///
719    /// This function retrieves resources from the manifest based on the input path.
720    /// The input path must be a relative path to the root directory of the EPUB container;
721    /// using an absolute path or a relative path to another location will result in an error.
722    ///
723    /// ## Parameters
724    /// - `path`: The path of the resource to retrieve
725    ///
726    /// ## Return
727    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
728    ///   the MIME type
729    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
730    ///
731    /// ## Notes
732    /// - This function will automatically decrypt the resource if it is encrypted.
733    /// - For unsupported encryption methods, the corresponding error will be returned.
734    /// - Relative paths other than the root directory of the Epub container are not supported.
735    pub fn get_manifest_item_by_path(&self, path: &str) -> Result<(Vec<u8>, String), EpubError> {
736        let id = self
737            .manifest
738            .iter()
739            .find(|(_, item)| item.path.to_str().unwrap() == path)
740            .map(|(id, _)| id.to_string())
741            .ok_or_else(|| EpubError::ResourceNotFound { resource: path.to_string() })?;
742
743        self.get_manifest_item(&id)
744    }
745
746    /// Retrieves supported resource items by resource ID, with fallback mechanism supported
747    ///
748    /// This function attempts to retrieve the resource item with the specified ID and
749    /// checks if its MIME type is in the list of supported formats. If the current resource
750    /// format is not supported, it searches for a supported resource format along the
751    /// fallback chain according to the fallback mechanism defined in the EPUB specification.
752    ///
753    /// ## Parameters
754    /// - `id`: The ID of the resource to retrieve
755    /// - `supported_format`: A vector of supported MIME types
756    ///
757    /// ## Return
758    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
759    ///   the MIME type
760    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
761    pub fn get_manifest_item_with_fallback(
762        &self,
763        id: &str,
764        supported_format: Vec<&str>,
765    ) -> Result<(Vec<u8>, String), EpubError> {
766        let mut manifest_item = self
767            .manifest
768            .get(id)
769            .cloned()
770            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
771
772        let mut current_manifest_id = id.to_string();
773        let mut fallback_chain = Vec::<String>::new();
774        'fallback: loop {
775            if supported_format.contains(&manifest_item.mime.as_str()) {
776                return self.get_manifest_item(&current_manifest_id);
777            }
778
779            let fallback_id = manifest_item.fallback.clone();
780
781            match fallback_id {
782                // The loop ends when no fallback resource exists
783                None => break 'fallback,
784
785                // End the loop when the loop continues to fallback if a fallback resource exists
786                Some(id) if fallback_chain.contains(&id) => break 'fallback,
787
788                Some(id) => {
789                    fallback_chain.push(id.clone());
790
791                    // Since only warnings are issued for fallback resource checks
792                    // during initialization, the issue of fallback resources possibly
793                    // not existing needs to be handled here.
794                    manifest_item = self
795                        .manifest
796                        .get(&manifest_item.fallback.unwrap())
797                        .cloned()
798                        .ok_or(EpubError::ResourceIdNotExist { id: id.clone() })?;
799                    current_manifest_id = id;
800                }
801            };
802        }
803
804        Err(EpubError::NoSupportedFileFormat)
805    }
806
807    /// Retrieves the cover of the EPUB document
808    ///
809    /// This function searches for the cover of the EPUB document by examining manifest
810    /// items in the manifest. It looks for manifest items whose ID or attribute contains
811    /// "cover" (case-insensitive) and attempts to retrieve the content of the first match.
812    ///
813    /// ## Return
814    /// - `Some((Vec<u8>, String))`: Successfully retrieved and decrypted cover data and
815    ///   the MIME type
816    /// - `None`: No cover resource was found
817    ///
818    /// ## Notes
819    /// - This function only returns the first successfully retrieved cover resource,
820    ///   even if multiple matches exist
821    /// - The retrieved cover may not be an image resource; users need to pay attention
822    ///   to the resource's MIME type.
823    pub fn get_cover(&self) -> Option<(Vec<u8>, String)> {
824        self.manifest
825            .values()
826            .filter_map(|manifest| {
827                if manifest.id.to_ascii_lowercase().contains("cover") {
828                    return Some(manifest.id.clone());
829                }
830
831                if let Some(properties) = &manifest.properties {
832                    if properties.to_ascii_lowercase().contains("cover") {
833                        return Some(manifest.id.clone());
834                    }
835                }
836
837                None
838            })
839            .collect::<Vec<String>>()
840            .iter()
841            .find_map(|id| self.get_manifest_item(id).ok())
842    }
843
844    /// Navigate to a specified chapter using the spine index
845    ///
846    /// This function retrieves the content data of the corresponding chapter based
847    /// on the index position in the EPUB spine. The spine defines the linear reading
848    /// order of the publication's content documents, and each spine item references
849    /// resources in the manifest.
850    ///
851    /// ## Parameters
852    /// - `index`: The index position in the spine, starting from 0
853    ///
854    /// ## Return
855    /// - `Some((Vec<u8>, String))`: Successfully retrieved chapter content data and the MIME type
856    /// - `None`: Index out of range or data retrieval error
857    ///
858    /// ## Notes
859    /// - The index must be less than the total number of spine projects.
860    /// - If the resource is encrypted, it will be automatically decrypted before returning.(TODO)
861    /// - It does not check whether the Spine project follows a linear reading order.
862    pub fn navigate_by_spine_index(&mut self, index: usize) -> Option<(Vec<u8>, String)> {
863        if index >= self.spine.len() {
864            return None;
865        }
866
867        let manifest_id = self.spine[index].idref.clone();
868        self.current_spine_index.store(index, Ordering::SeqCst);
869        self.get_manifest_item(&manifest_id).ok()
870    }
871
872    /// Navigate to the previous linear reading chapter
873    ///
874    /// This function searches backwards in the EPUB spine for the previous linear
875    /// reading chapter and returns the content data of that chapter. It only navigates
876    /// to chapters marked as linear reading.
877    ///
878    /// ## Return
879    /// - `Some((Vec<u8>, String))`: Successfully retrieved previous chapter content data and
880    ///   the MIME type
881    /// - `None`: Already in the first chapter, the current chapter is not linear,
882    ///   or data retrieval failed
883    pub fn spine_prev(&self) -> Option<(Vec<u8>, String)> {
884        let current_index = self.current_spine_index.load(Ordering::SeqCst);
885        if current_index == 0 || !self.spine[current_index].linear {
886            return None;
887        }
888
889        let prev_index = (0..current_index)
890            .rev()
891            .find(|&index| self.spine[index].linear)?;
892
893        self.current_spine_index.store(prev_index, Ordering::SeqCst);
894        let manifest_id = self.spine[prev_index].idref.clone();
895        self.get_manifest_item(&manifest_id).ok()
896    }
897
898    /// Navigate to the next linear reading chapter
899    ///
900    /// This function searches forwards in the EPUB spine for the next linear reading
901    /// chapter and returns the content data of that chapter. It only navigates to
902    /// chapters marked as linear reading.
903    ///
904    /// ## Return
905    /// - `Some((Vec<u8>, String))`: Successfully retrieved next chapter content data and
906    ///   the MIME type
907    /// - `None`: Already in the last chapter, the current chapter is not linear,
908    ///   or data retrieval failed
909    pub fn spine_next(&mut self) -> Option<(Vec<u8>, String)> {
910        let current_index = self.current_spine_index.load(Ordering::SeqCst);
911        if current_index >= self.spine.len() - 1 || !self.spine[current_index].linear {
912            return None;
913        }
914
915        let next_index =
916            (current_index + 1..self.spine.len()).find(|&index| self.spine[index].linear)?;
917
918        self.current_spine_index.store(next_index, Ordering::SeqCst);
919        let manifest_id = self.spine[next_index].idref.clone();
920        self.get_manifest_item(&manifest_id).ok()
921    }
922
923    /// Retrieves the content data of the current chapter
924    ///
925    /// This function returns the content data of the chapter at the current
926    /// index position in the EPUB spine.
927    ///
928    /// ## Return
929    /// - `Some((Vec<u8>, String))`: Successfully retrieved current chapter content data and
930    ///   the MIME type
931    /// - `None`: Data retrieval failed
932    pub fn spine_current(&self) -> Option<(Vec<u8>, String)> {
933        let manifest_id = self.spine[self.current_spine_index.load(Ordering::SeqCst)]
934            .idref
935            .clone();
936        self.get_manifest_item(&manifest_id).ok()
937    }
938
939    /// Determine the EPUB version from the OPF file
940    ///
941    /// This function is used to detect the version of an epub file from an OPF file.
942    /// When the version attribute in the package is abnormal, version information will
943    /// be identified through some version characteristics of the epub file. An error is
944    /// returned when neither direct nor indirect methods can identify the version.
945    ///
946    /// ## Parameters
947    /// - `opf_element`: A reference to the OPF file element
948    fn determine_epub_version(opf_element: &XmlElement) -> Result<EpubVersion, EpubError> {
949        // Check the explicit version attribute
950        if let Some(version) = opf_element.get_attr("version") {
951            match version.as_str() {
952                "2.0" => return Ok(EpubVersion::Version2_0),
953                "3.0" => return Ok(EpubVersion::Version3_0),
954                _ => {}
955            }
956        }
957
958        let spine_element = opf_element
959            .find_elements_by_name("spine")
960            .next()
961            .ok_or_else(|| EpubError::NonCanonicalFile { tag: "spine".to_string() })?;
962
963        // Look for EPUB 2.x specific features
964        if spine_element.get_attr("toc").is_some() {
965            return Ok(EpubVersion::Version2_0);
966        }
967
968        let manifest_element = opf_element
969            .find_elements_by_name("manifest")
970            .next()
971            .ok_or_else(|| EpubError::NonCanonicalFile { tag: "manifest".to_string() })?;
972
973        // Look for EPUB 3.x specific features
974        manifest_element
975            .children()
976            .find_map(|element| {
977                if let Some(id) = element.get_attr("id") {
978                    if id.eq("nav") {
979                        return Some(EpubVersion::Version3_0);
980                    }
981                }
982
983                None
984            })
985            .ok_or(EpubError::UnrecognizedEpubVersion)
986    }
987
988    /// Parse metadata elements under the Dublin Core namespace
989    ///
990    /// This function handles the `<metadata>` Dublin Core element in the OPF file (namespace
991    /// is "http://purl.org/dc/elements/1.1/"). These elements usually contain the basic
992    /// information of the publication, such as title, author, publication date, etc.
993    ///
994    /// ## Notes
995    /// - In EPUB 3.0, granular information is handled by separate '<meta>' elements and 'refines' attributes
996    /// - All text content is normalized by whitespace
997    #[inline]
998    fn parse_dc_metadata(
999        &self,
1000        element: &XmlElement,
1001        metadata: &mut Vec<MetadataItem>,
1002        // refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1003    ) -> Result<(), EpubError> {
1004        let id = element.get_attr("id");
1005        let lang = element.get_attr("lang");
1006        let property = element.name.clone();
1007        let value = element.text().normalize_whitespace();
1008
1009        let refined = match self.version {
1010            // In EPUB 2.0, supplementary metadata (refinements) are represented
1011            // through other attribute data pairs of the tag.
1012            EpubVersion::Version2_0 => element
1013                .attributes
1014                .iter()
1015                .map(|(name, value)| {
1016                    let property = name.to_string();
1017                    let value = value.to_string().normalize_whitespace();
1018
1019                    MetadataRefinement {
1020                        refines: id.clone().unwrap(),
1021                        property,
1022                        value,
1023                        lang: None,
1024                        scheme: None,
1025                    }
1026                })
1027                .collect(),
1028            EpubVersion::Version3_0 => vec![],
1029        };
1030
1031        metadata.push(MetadataItem { id, property, value, lang, refined });
1032
1033        Ok(())
1034    }
1035
1036    /// Parse metadata elements under the OPF namespace
1037    ///
1038    /// This function handles the `<metadata>` OPF element in the OPF file (namespace
1039    /// is "http://www.idpf.org/2007/opf"). These elements include '<meta>' and '<link>',
1040    /// which are used to provide extended metadata and links to external resources for EPUB publications.
1041    ///
1042    /// ## Notes
1043    /// - The function is only responsible for distribution processing, and the
1044    ///   specific parsing logic is implemented in the dedicated function
1045    /// - All parsing results are added directly to the incoming collection and no new collection is returned
1046    #[inline]
1047    fn parse_opf_metadata(
1048        &self,
1049        element: &XmlElement,
1050        metadata: &mut Vec<MetadataItem>,
1051        metadata_link: &mut Vec<MetadataLinkItem>,
1052        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1053    ) -> Result<(), EpubError> {
1054        match element.name.as_str() {
1055            "meta" => self.parse_meta_element(element, metadata, refinements),
1056            "link" => self.parse_link_element(element, metadata_link),
1057            _ => Ok(()),
1058        }
1059    }
1060
1061    #[inline]
1062    fn parse_meta_element(
1063        &self,
1064        element: &XmlElement,
1065        metadata: &mut Vec<MetadataItem>,
1066        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1067    ) -> Result<(), EpubError> {
1068        match self.version {
1069            EpubVersion::Version2_0 => {
1070                let property = element
1071                    .get_attr("name")
1072                    .ok_or_else(|| EpubError::NonCanonicalFile { tag: element.tag_name() })?;
1073                let value = element
1074                    .get_attr("content")
1075                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
1076                        tag: element.tag_name(),
1077                        attribute: "content".to_string(),
1078                    })?
1079                    .normalize_whitespace();
1080
1081                metadata.push(MetadataItem {
1082                    id: None,
1083                    property,
1084                    value,
1085                    lang: None,
1086                    refined: vec![],
1087                });
1088            }
1089
1090            EpubVersion::Version3_0 => {
1091                let property = element.get_attr("property").ok_or_else(|| {
1092                    EpubError::MissingRequiredAttribute {
1093                        tag: element.tag_name(),
1094                        attribute: "property".to_string(),
1095                    }
1096                })?;
1097                let value = element.text().normalize_whitespace();
1098                let lang = element.get_attr("lang");
1099
1100                if let Some(refines) = element.get_attr("refines") {
1101                    let id = refines.strip_prefix("#").unwrap_or(&refines).to_string();
1102                    let scheme = element.get_attr("scheme");
1103                    let refinement = MetadataRefinement {
1104                        refines: id.clone(),
1105                        property,
1106                        value,
1107                        lang,
1108                        scheme,
1109                    };
1110
1111                    if let Some(refinements) = refinements.get_mut(&id) {
1112                        refinements.push(refinement);
1113                    } else {
1114                        refinements.insert(id, vec![refinement]);
1115                    }
1116                } else {
1117                    let id = element.get_attr("id");
1118                    let item = MetadataItem {
1119                        id,
1120                        property,
1121                        value,
1122                        lang,
1123                        refined: vec![],
1124                    };
1125
1126                    metadata.push(item);
1127                };
1128            }
1129        }
1130        Ok(())
1131    }
1132
1133    #[inline]
1134    fn parse_link_element(
1135        &self,
1136        element: &XmlElement,
1137        metadata_link: &mut Vec<MetadataLinkItem>,
1138    ) -> Result<(), EpubError> {
1139        let href = element
1140            .get_attr("href")
1141            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1142                tag: element.tag_name(),
1143                attribute: "href".to_string(),
1144            })?;
1145        let rel = element
1146            .get_attr("rel")
1147            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1148                tag: element.tag_name(),
1149                attribute: "rel".to_string(),
1150            })?;
1151        let hreflang = element.get_attr("hreflang");
1152        let id = element.get_attr("id");
1153        let mime = element.get_attr("media-type");
1154        let properties = element.get_attr("properties");
1155
1156        metadata_link.push(MetadataLinkItem {
1157            href,
1158            rel,
1159            hreflang,
1160            id,
1161            mime,
1162            properties,
1163            refines: None,
1164        });
1165        Ok(())
1166    }
1167
1168    /// Recursively parse NCX navigation points from navMap or nested navPoint elements
1169    ///
1170    /// This function parses the hierarchical navigation structure defined in NCX files
1171    /// for EPUB 2.x documents. It handles nested navPoint elements to build a complete
1172    /// tree representation of the publication's table of contents.
1173    fn parse_nav_points(&self, parent_element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1174        let mut nav_points = Vec::new();
1175        for nav_point in parent_element.find_children_by_name("navPoint") {
1176            let label = match nav_point.find_children_by_name("navLabel").next() {
1177                Some(element) => element.text(),
1178                None => String::new(),
1179            };
1180
1181            let content = nav_point
1182                .find_children_by_name("content")
1183                .next()
1184                .map(|element| PathBuf::from(element.text()));
1185
1186            let play_order = nav_point
1187                .get_attr("playOrder")
1188                .and_then(|order| order.parse::<usize>().ok());
1189
1190            let children = self.parse_nav_points(nav_point)?;
1191
1192            nav_points.push(NavPoint { label, content, play_order, children });
1193        }
1194
1195        nav_points.sort();
1196        Ok(nav_points)
1197    }
1198
1199    /// Recursively parses directory list structures
1200    ///
1201    /// This function recursively parses HTML navigation list structures,
1202    /// converting `<ol>` and `<li>` elements into NavPoint structures.
1203    /// Multi-level nested directory structures are supported.
1204    fn parse_catalog_list(&self, element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1205        let mut catalog = Vec::new();
1206        for item in element.children() {
1207            if item.tag_name() != "li" {
1208                return Err(EpubError::NonCanonicalFile { tag: "li".to_string() });
1209            }
1210
1211            let title_element = item
1212                .find_children_by_names(&["span", "a"])
1213                .next()
1214                .ok_or_else(|| EpubError::NonCanonicalFile { tag: "span/a".to_string() })?;
1215            let content_href = title_element.get_attr("href").map(PathBuf::from);
1216            let sub_list = if let Some(list) = item.find_children_by_name("ol").next() {
1217                self.parse_catalog_list(list)?
1218            } else {
1219                vec![]
1220            };
1221
1222            catalog.push(NavPoint {
1223                label: title_element.text(),
1224                content: content_href,
1225                children: sub_list,
1226                play_order: None,
1227            });
1228        }
1229
1230        Ok(catalog)
1231    }
1232
1233    /// Converts relative paths in the manifest to normalized paths
1234    /// relative to the EPUB root directory
1235    ///
1236    /// This function processes the href attribute of resources in the EPUB
1237    /// manifest and converts it to a normalized path representation.
1238    /// It handles three types of paths:
1239    /// - Relative paths starting with `../` (checks if they exceed the EPUB package scope)
1240    /// - Absolute paths starting with `/` (relative to the EPUB root directory)
1241    /// - Other relative paths (relative to the directory containing the OPF file)
1242    ///
1243    /// ## Parameters
1244    /// - `path`: The href attribute value of the resource in the manifest
1245    ///
1246    /// ## Return
1247    /// - `Ok(PathBuf)`: The parsed normalized path
1248    /// - `Err(EpubError)`: Relative link leakage
1249    #[inline]
1250    fn normalize_manifest_path(&self, path: &str) -> Result<PathBuf, EpubError> {
1251        let mut path = if path.starts_with("../") {
1252            let mut current_dir = self.epub_path.join(&self.package_path);
1253            current_dir.pop();
1254
1255            check_realtive_link_leakage(self.epub_path.clone(), current_dir, path)
1256                .map(PathBuf::from)
1257                .ok_or_else(|| EpubError::RealtiveLinkLeakage { path: path.to_string() })?
1258        } else if let Some(path) = path.strip_prefix("/") {
1259            PathBuf::from(path.to_string())
1260        } else {
1261            self.base_path.join(path)
1262        };
1263
1264        #[cfg(windows)]
1265        {
1266            path = PathBuf::from(path.to_string_lossy().replace('\\', "/"));
1267        }
1268
1269        Ok(path)
1270    }
1271
1272    /// Verify the fallback chain of all manifest items
1273    ///
1274    /// This function iterates through all manifest items with the fallback
1275    /// attribute and verifies the validity of their fallback chains, including checking:
1276    /// - Whether circular references exist
1277    /// - Whether the fallback resource exists in the manifest
1278    ///
1279    /// ## Notes
1280    /// If an invalid fallback chain is found, a warning log will be logged
1281    /// but the processing flow will not be interrupted.
1282    fn validate_fallback_chains(&self) {
1283        for (id, item) in &self.manifest {
1284            if item.fallback.is_none() {
1285                continue;
1286            }
1287
1288            let mut fallback_chain = Vec::new();
1289            if let Err(msg) = self.validate_fallback_chain(id, &mut fallback_chain) {
1290                warn!("Invalid fallback chain for item {}: {}", id, msg);
1291            }
1292        }
1293    }
1294
1295    /// Recursively verify the validity of a single fallback chain
1296    ///
1297    /// This function recursively traces the fallback chain to check for the following issues:
1298    /// - Circular reference
1299    /// - The referenced fallback resource does not exist
1300    ///
1301    /// ## Parameters
1302    /// - `manifest_id`: The id of the manifest item currently being verified
1303    /// - `fallback_chain`: The visited fallback chain paths used to detect circular references
1304    ///
1305    /// ## Return
1306    /// - `Ok(())`: The fallback chain is valid
1307    /// - `Err(String)`: A string containing error information
1308    fn validate_fallback_chain(
1309        &self,
1310        manifest_id: &str,
1311        fallback_chain: &mut Vec<String>,
1312    ) -> Result<(), String> {
1313        if fallback_chain.contains(&manifest_id.to_string()) {
1314            fallback_chain.push(manifest_id.to_string());
1315
1316            return Err(format!(
1317                "Circular reference detected in fallback chain for {}",
1318                fallback_chain.join("->")
1319            ));
1320        }
1321
1322        // Get the current item; its existence can be ensured based on the calling context.
1323        let item = self.manifest.get(manifest_id).unwrap();
1324
1325        if let Some(fallback_id) = &item.fallback {
1326            if !self.manifest.contains_key(fallback_id) {
1327                return Err(format!(
1328                    "Fallback resource {} does not exist in manifest",
1329                    fallback_id
1330                ));
1331            }
1332
1333            fallback_chain.push(manifest_id.to_string());
1334            self.validate_fallback_chain(fallback_id, fallback_chain)
1335        } else {
1336            // The end of the fallback chain
1337            Ok(())
1338        }
1339    }
1340
1341    /// Checks if a resource at the specified path is an encrypted file
1342    ///
1343    /// This function queries whether a specific resource path is marked as an encrypted
1344    /// file in the EPUB encryption information. It checks the encrypted data stored in
1345    /// `self.encryption`, looking for an entry that matches the given path.
1346    ///
1347    /// ## Parameters
1348    /// - `path`: The path of the resource to check
1349    ///
1350    /// ## Return
1351    /// - `Some(String)`: The encryption method used for the resource
1352    /// - `None`: The resource is not encrypted
1353    fn is_encryption_file(&self, path: &str) -> Option<String> {
1354        self.encryption.as_ref().and_then(|encryptions| {
1355            encryptions
1356                .iter()
1357                .find(|encryption| encryption.data == path)
1358                .map(|encryption| encryption.method.clone())
1359        })
1360    }
1361
1362    /// Automatically decrypts encrypted resource data
1363    ///
1364    /// Automatically decrypts data based on the provided encryption method.
1365    /// This function supports various encryption methods defined by the EPUB
1366    /// specification, including font obfuscation and the XML encryption standard.
1367    ///
1368    /// ## Parameters
1369    /// - `method`: The encryption method used for the resource
1370    /// - `data`: The encrypted resource data
1371    ///
1372    /// ## Return
1373    /// - `Ok(Vec<u8>)`: The decrypted resource data
1374    /// - `Err(EpubError)`: Unsupported encryption method
1375    ///
1376    /// ## Supported Encryption Methods
1377    /// - IDPF font obfuscation: `http://www.idpf.org/2008/embedding`
1378    /// - Adobe font obfuscation: `http://ns.adobe.com/pdf/enc#RC`
1379    #[inline]
1380    fn auto_dencrypt(&self, method: &str, data: &mut [u8]) -> Result<Vec<u8>, EpubError> {
1381        match method {
1382            "http://www.idpf.org/2008/embedding" => {
1383                Ok(idpf_font_dencryption(data, &self.unique_identifier))
1384            }
1385            "http://ns.adobe.com/pdf/enc#RC" => {
1386                Ok(adobe_font_dencryption(data, &self.unique_identifier))
1387            }
1388            _ => Err(EpubError::UnsupportedEncryptedMethod { method: method.to_string() }),
1389        }
1390    }
1391}
1392
1393impl EpubDoc<BufReader<File>> {
1394    /// Creates a new EPUB document instance
1395    ///
1396    /// This function is a convenience constructor for `EpubDoc`,
1397    /// used to create an EPUB parser instance directly from a file path.
1398    ///
1399    /// ## Parameters
1400    /// - `path`: The path to the EPUB file
1401    ///
1402    /// ## Return
1403    /// - `Ok(EpubDoc)`: The created EPUB document instance
1404    /// - `Err(EpubError)`: An error occurred during initialization
1405    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, EpubError> {
1406        let file = File::open(&path).map_err(EpubError::from)?;
1407        let path = canonicalize(path)?;
1408
1409        Self::from_reader(BufReader::new(file), path)
1410    }
1411}
1412
1413#[cfg(test)]
1414mod tests {
1415    use std::{
1416        fs::File,
1417        io::BufReader,
1418        path::{Path, PathBuf},
1419    };
1420
1421    use crate::{epub::EpubDoc, error::EpubError, utils::XmlReader};
1422
1423    /// Section 3.3 package documents
1424    mod package_documents_tests {
1425        use std::{path::Path, sync::atomic::Ordering};
1426
1427        use crate::epub::{EpubDoc, EpubVersion};
1428
1429        /// ID: pkg-collections-unknown
1430        ///
1431        /// The package document contains a collection with an unknown role. The reading system must open the EPUB successfully.
1432        #[test]
1433        fn test_pkg_collections_unknown() {
1434            let epub_file = Path::new("./test_case/pkg-collections-unknown.epub");
1435            let doc = EpubDoc::new(epub_file);
1436            assert!(doc.is_ok());
1437        }
1438
1439        /// ID: pkg-creator-order
1440        ///
1441        /// Several creators are listed in the package document. The reading system must not display them out of order (but it may display only the first).
1442        #[test]
1443        fn test_pkg_creator_order() {
1444            let epub_file = Path::new("./test_case/pkg-creator-order.epub");
1445            let doc = EpubDoc::new(epub_file);
1446            assert!(doc.is_ok());
1447
1448            let doc = doc.unwrap();
1449            let creators = doc.get_metadata_value("creator");
1450            assert!(creators.is_some());
1451
1452            let creators = creators.unwrap();
1453            assert_eq!(creators.len(), 5);
1454            assert_eq!(
1455                creators,
1456                vec![
1457                    "Dave Cramer",
1458                    "Wendy Reid",
1459                    "Dan Lazin",
1460                    "Ivan Herman",
1461                    "Brady Duga",
1462                ]
1463            );
1464        }
1465
1466        /// ID: pkg-manifest-unknown
1467        ///
1468        /// The package document contains a manifest item with unknown properties. The reading system must open the EPUB successfully.
1469        #[test]
1470        fn test_pkg_manifest_order() {
1471            let epub_file = Path::new("./test_case/pkg-manifest-unknown.epub");
1472            let doc = EpubDoc::new(epub_file);
1473            assert!(doc.is_ok());
1474
1475            let doc = doc.unwrap();
1476            assert_eq!(doc.manifest.len(), 2);
1477            assert!(doc.get_manifest_item("nav").is_ok());
1478            assert!(doc.get_manifest_item("content_001").is_ok());
1479            assert!(doc.get_manifest_item("content_002").is_err());
1480        }
1481
1482        /// ID: pkg-meta-unknown
1483        ///
1484        /// The package document contains a meta tag with an unknown property. The reading system must open the EPUB successfully.
1485        #[test]
1486        fn test_pkg_meta_unknown() {
1487            let epub_file = Path::new("./test_case/pkg-meta-unknown.epub");
1488            let doc = EpubDoc::new(epub_file);
1489            assert!(doc.is_ok());
1490
1491            let doc = doc.unwrap();
1492            let value = doc.get_metadata_value("dcterms:isReferencedBy");
1493            assert!(value.is_some());
1494            let value = value.unwrap();
1495            assert_eq!(value.len(), 1);
1496            assert_eq!(
1497                value,
1498                vec!["https://www.w3.org/TR/epub-rs/#confreq-rs-pkg-meta-unknown"]
1499            );
1500
1501            let value = doc.get_metadata_value("dcterms:modified");
1502            assert!(value.is_some());
1503            let value = value.unwrap();
1504            assert_eq!(value.len(), 1);
1505            assert_eq!(value, vec!["2021-01-11T00:00:00Z"]);
1506
1507            let value = doc.get_metadata_value("dcterms:title");
1508            assert!(value.is_none());
1509        }
1510
1511        /// ID: pkg-meta-whitespace
1512        ///
1513        /// The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases.
1514        #[test]
1515        fn test_pkg_meta_white_space() {
1516            let epub_file = Path::new("./test_case/pkg-meta-whitespace.epub");
1517            let doc = EpubDoc::new(epub_file);
1518            assert!(doc.is_ok());
1519
1520            let doc = doc.unwrap();
1521            let value = doc.get_metadata_value("creator");
1522            assert!(value.is_some());
1523            let value = value.unwrap();
1524            assert_eq!(value.len(), 1);
1525            assert_eq!(value, vec!["Dave Cramer"]);
1526
1527            let value = doc.get_metadata_value("description");
1528            assert!(value.is_some());
1529            let value = value.unwrap();
1530            assert_eq!(value.len(), 1);
1531            assert_eq!(
1532                value,
1533                vec![
1534                    "The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases."
1535                ]
1536            );
1537        }
1538
1539        /// ID: pkg-spine-duplicate-item-hyperlink
1540        ///
1541        /// The spine contains several references to the same content document. The reading system must move to the position of the first duplicate in the reading order when following a hyperlink.
1542        #[test]
1543        fn test_pkg_spine_duplicate_item_hyperlink() {
1544            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-hyperlink.epub");
1545            let doc = EpubDoc::new(epub_file);
1546            assert!(doc.is_ok());
1547
1548            let mut doc = doc.unwrap();
1549            assert_eq!(doc.spine.len(), 4);
1550            assert_eq!(
1551                doc.navigate_by_spine_index(0).unwrap(),
1552                doc.get_manifest_item("content_001").unwrap()
1553            );
1554            assert_eq!(
1555                doc.navigate_by_spine_index(1).unwrap(),
1556                doc.get_manifest_item("content_002").unwrap()
1557            );
1558            assert_eq!(
1559                doc.navigate_by_spine_index(2).unwrap(),
1560                doc.get_manifest_item("content_002").unwrap()
1561            );
1562            assert_eq!(
1563                doc.navigate_by_spine_index(3).unwrap(),
1564                doc.get_manifest_item("content_002").unwrap()
1565            );
1566        }
1567
1568        /// ID: pkg-spine-duplicate-item-rendering
1569        ///
1570        /// The spine contains several references to the same content document. The reading system must not skip the duplicates when rendering the reading order.
1571        #[test]
1572        fn test_pkg_spine_duplicate_item_rendering() {
1573            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-rendering.epub");
1574            let doc = EpubDoc::new(epub_file);
1575            assert!(doc.is_ok());
1576
1577            let mut doc = doc.unwrap();
1578            assert_eq!(doc.spine.len(), 4);
1579
1580            let result = doc.spine_prev();
1581            assert!(result.is_none());
1582
1583            let result = doc.spine_next();
1584            assert!(result.is_some());
1585
1586            doc.spine_next();
1587            doc.spine_next();
1588            let result = doc.spine_next();
1589            assert!(result.is_none());
1590        }
1591
1592        /// ID: pkg-spine-nonlinear-activation
1593        ///
1594        /// An itemref in the spine is marked as non-linear. Although it (possibly) cannot be accessed through the table of contents, it can be reached from a link in the XHTML content.
1595        #[test]
1596        fn test_pkg_spine_nonlinear_activation() {
1597            let epub_file = Path::new("./test_case/pkg-spine-nonlinear-activation.epub");
1598            let doc = EpubDoc::new(epub_file);
1599            assert!(doc.is_ok());
1600
1601            let mut doc = doc.unwrap();
1602            assert!(doc.spine_prev().is_none());
1603            assert!(doc.spine_next().is_none());
1604
1605            assert!(doc.navigate_by_spine_index(1).is_some());
1606            assert!(doc.spine_prev().is_none());
1607            assert!(doc.spine_next().is_none());
1608        }
1609
1610        /// ID: pkg-spine-order
1611        ///
1612        /// Basic test of whether a reading system can display spine items in the correct order. The test fails if the reading system presents content in the order in which the file names sort, or if it presents files in manifest order rather than spine order.
1613        #[test]
1614        fn test_pkg_spine_order() {
1615            let epub_file = Path::new("./test_case/pkg-spine-order.epub");
1616            let doc = EpubDoc::new(epub_file);
1617            assert!(doc.is_ok());
1618
1619            let doc = doc.unwrap();
1620            assert_eq!(doc.spine.len(), 4);
1621            assert_eq!(
1622                doc.spine
1623                    .iter()
1624                    .map(|item| item.idref.clone())
1625                    .collect::<Vec<String>>(),
1626                vec![
1627                    "d-content_001",
1628                    "c-content_002",
1629                    "b-content_003",
1630                    "a-content_004",
1631                ]
1632            );
1633        }
1634
1635        /// ID: pkg-spine-order-svg
1636        ///
1637        /// Basic test of whether a reading system can display SVG spine items in the correct order.
1638        #[test]
1639        fn test_spine_order_svg() {
1640            let epub_file = Path::new("./test_case/pkg-spine-order-svg.epub");
1641            let doc = EpubDoc::new(epub_file);
1642            assert!(doc.is_ok());
1643
1644            let mut doc = doc.unwrap();
1645            assert_eq!(doc.spine.len(), 4);
1646
1647            loop {
1648                if let Some(spine) = doc.spine_next() {
1649                    let idref = doc.spine[doc.current_spine_index.load(Ordering::Relaxed)]
1650                        .idref
1651                        .clone();
1652                    let resource = doc.get_manifest_item(&idref);
1653                    assert!(resource.is_ok());
1654
1655                    let resource = resource.unwrap();
1656                    assert_eq!(spine, resource);
1657                } else {
1658                    break;
1659                }
1660            }
1661
1662            assert_eq!(doc.current_spine_index.load(Ordering::Relaxed), 3);
1663        }
1664
1665        /// ID: pkg-spine-unknown
1666        ///
1667        /// The package document contains a spine item with unknown properties. The reading system must open the EPUB successfully.
1668        #[test]
1669        fn test_pkg_spine_unknown() {
1670            let epub_file = Path::new("./test_case/pkg-spine-unknown.epub");
1671            let doc = EpubDoc::new(epub_file);
1672            assert!(doc.is_ok());
1673
1674            let doc = doc.unwrap();
1675            assert_eq!(doc.spine.len(), 1);
1676            assert_eq!(doc.spine[0].idref, "content_001");
1677            assert_eq!(doc.spine[0].id, None);
1678            assert_eq!(doc.spine[0].linear, true);
1679            assert_eq!(doc.spine[0].properties, Some("untrustworthy".to_string()));
1680        }
1681
1682        /// ID: pkg-title-order
1683        ///
1684        /// Several titles are listed in the package document. The reading system must use the first title (and whether to use other titles is not defined).
1685        #[test]
1686        fn test_pkg_title_order() {
1687            let epub_file = Path::new("./test_case/pkg-title-order.epub");
1688            let doc = EpubDoc::new(epub_file);
1689            assert!(doc.is_ok());
1690
1691            let doc = doc.unwrap();
1692            let title_list = doc.get_title();
1693            assert!(title_list.is_ok());
1694
1695            let title_list = title_list.unwrap();
1696            assert_eq!(title_list.len(), 6);
1697            assert_eq!(
1698                title_list,
1699                vec![
1700                    "pkg-title-order",
1701                    "This title must not display first",
1702                    "Also, this title must not display first",
1703                    "This title also must not display first",
1704                    "This title must also not display first",
1705                    "This title must not display first, also",
1706                ]
1707            );
1708        }
1709
1710        /// ID: pkg-unique-id
1711        ///
1712        /// The package document's dc:identifier is identical across two publications. The reading system should display both publications independently.
1713        #[test]
1714        fn test_pkg_unique_id() {
1715            let epub_file = Path::new("./test_case/pkg-unique-id.epub");
1716            let doc_1 = EpubDoc::new(epub_file);
1717            assert!(doc_1.is_ok());
1718
1719            let epub_file = Path::new("./test_case/pkg-unique-id_duplicate.epub");
1720            let doc_2 = EpubDoc::new(epub_file);
1721            assert!(doc_2.is_ok());
1722
1723            let doc_1 = doc_1.unwrap();
1724            let doc_2 = doc_2.unwrap();
1725
1726            assert_eq!(
1727                doc_1.get_identifier().unwrap(),
1728                doc_2.get_identifier().unwrap()
1729            );
1730            assert_eq!(doc_1.unique_identifier, "pkg-unique-id");
1731            assert_eq!(doc_2.unique_identifier, "pkg-unique-id");
1732        }
1733
1734        /// ID: pkg-version-backward
1735        ///
1736        /// “Reading Systems MUST attempt to process an EPUB Publication whose Package Document version attribute is less than "3.0"”. This is an EPUB with package version attribute set to "0", to see if a reading system will open it.
1737        #[test]
1738        fn test_pkg_version_backward() {
1739            let epub_file = Path::new("./test_case/pkg-version-backward.epub");
1740            let doc = EpubDoc::new(epub_file);
1741            assert!(doc.is_ok());
1742
1743            let doc = doc.unwrap();
1744            assert_eq!(doc.version, EpubVersion::Version3_0);
1745        }
1746
1747        /// ID: pkg-linked-records
1748        ///
1749        /// Reading System must process and display the title and creator metadata from the package document. An ONIX 3.0 format linked metadata record exists, but contains neither title nor creator metadata.
1750        #[test]
1751        fn test_pkg_linked_records() {
1752            let epub_file = Path::new("./test_case/pkg-linked-records.epub");
1753            let doc = EpubDoc::new(epub_file);
1754            assert!(doc.is_ok());
1755
1756            let doc = doc.unwrap();
1757            assert_eq!(doc.metadata_link.len(), 3);
1758
1759            let item = doc.metadata_link.iter().find(|&item| {
1760                if let Some(properties) = &item.properties {
1761                    properties.eq("onix")
1762                } else {
1763                    false
1764                }
1765            });
1766            assert!(item.is_some());
1767        }
1768
1769        /// ID: pkg-manifest-unlisted-resource
1770        ///
1771        /// The XHTML content references an image that does not appear in the manifest. The image should not be shown.
1772        #[test]
1773        fn test_pkg_manifest_unlisted_resource() {
1774            let epub_file = Path::new("./test_case/pkg-manifest-unlisted-resource.epub");
1775            let doc = EpubDoc::new(epub_file);
1776            assert!(doc.is_ok());
1777
1778            let doc = doc.unwrap();
1779            assert!(
1780                doc.get_manifest_item_by_path("EPUB/content_001.xhtml")
1781                    .is_ok()
1782            );
1783
1784            assert!(doc.get_manifest_item_by_path("EPUB/red.png").is_err());
1785            let err = doc.get_manifest_item_by_path("EPUB/red.png").unwrap_err();
1786            assert_eq!(
1787                err.to_string(),
1788                "Resource not found: Unable to find resource from \"EPUB/red.png\"."
1789            );
1790        }
1791    }
1792
1793    /// Section 3.4 manifest fallbacks
1794    ///
1795    /// The tests under this module seem to favor the reading system rather than the EPUB format itself
1796    mod manifest_fallbacks_tests {
1797        use std::path::Path;
1798
1799        use crate::epub::EpubDoc;
1800
1801        /// ID: pub-foreign_bad-fallback
1802        ///
1803        /// This is a test of manifest fallbacks where both the spine item and the fallback are likely to be unsupported. The spine item is a DMG, with a fallback to a PSD file. Reading systems may raise an error on the ingenstion workflow.
1804        #[test]
1805        fn test_pub_foreign_bad_fallback() {
1806            let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
1807            let doc = EpubDoc::new(epub_file);
1808            assert!(doc.is_ok());
1809
1810            let doc = doc.unwrap();
1811            assert!(doc.get_manifest_item("content_001").is_ok());
1812            assert!(doc.get_manifest_item("bar").is_ok());
1813
1814            assert_eq!(
1815                doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
1816                    .unwrap_err()
1817                    .to_string(),
1818                "No supported file format: The fallback resource does not contain the file format you support."
1819            );
1820        }
1821
1822        /// ID: pub-foreign_image
1823        ///
1824        /// An HTML content file contains a PSD image, with a manifest fallback to a PNG image. This tests fallbacks for resources that are not in the spine.
1825        #[test]
1826        fn test_pub_foreign_image() {
1827            let epub_file = Path::new("./test_case/pub-foreign_image.epub");
1828            let doc = EpubDoc::new(epub_file);
1829            assert!(doc.is_ok());
1830
1831            let doc = doc.unwrap();
1832            let result = doc.get_manifest_item_with_fallback(
1833                "image-tiff",
1834                vec!["image/png", "application/xhtml+xml"],
1835            );
1836            assert!(result.is_ok());
1837
1838            let (_, mime) = result.unwrap();
1839            assert_eq!(mime, "image/png");
1840        }
1841
1842        /// ID: pub-foreign_json-spine
1843        ///
1844        /// This EPUB uses a JSON content file in the spine, with a manifest fallback to an HTML document. If the reading system does not support JSON, it should display the HTML.
1845        #[test]
1846        fn test_pub_foreign_json_spine() {
1847            let epub_file = Path::new("./test_case/pub-foreign_json-spine.epub");
1848            let doc = EpubDoc::new(epub_file);
1849            assert!(doc.is_ok());
1850
1851            let doc = doc.unwrap();
1852            let result = doc.get_manifest_item_with_fallback(
1853                "content_primary",
1854                vec!["application/xhtml+xml", "application/json"],
1855            );
1856            assert!(result.is_ok());
1857            let (_, mime) = result.unwrap();
1858            assert_eq!(mime, "application/json");
1859
1860            let result = doc
1861                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1862            assert!(result.is_ok());
1863            let (_, mime) = result.unwrap();
1864            assert_eq!(mime, "application/xhtml+xml");
1865        }
1866
1867        /// ID: pub-foreign_xml-spine
1868        ///
1869        /// This EPUB uses an ordinary XML content file with mimetype application/xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1870        #[test]
1871        fn test_pub_foreign_xml_spine() {
1872            let epub_file = Path::new("./test_case/pub-foreign_xml-spine.epub");
1873            let doc = EpubDoc::new(epub_file);
1874            assert!(doc.is_ok());
1875
1876            let doc = doc.unwrap();
1877            let result = doc.get_manifest_item_with_fallback(
1878                "content_primary",
1879                vec!["application/xhtml+xml", "application/xml"],
1880            );
1881            assert!(result.is_ok());
1882            let (_, mime) = result.unwrap();
1883            assert_eq!(mime, "application/xml");
1884
1885            let result = doc
1886                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1887            assert!(result.is_ok());
1888            let (_, mime) = result.unwrap();
1889            assert_eq!(mime, "application/xhtml+xml");
1890        }
1891
1892        /// ID: pub-foreign_xml-suffix-spine
1893        ///
1894        /// This EPUB uses an custom XML content file with mimetype application/dtc+xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1895        #[test]
1896        fn test_pub_foreign_xml_suffix_spine() {
1897            let epub_file = Path::new("./test_case/pub-foreign_xml-suffix-spine.epub");
1898            let doc = EpubDoc::new(epub_file);
1899            assert!(doc.is_ok());
1900
1901            let doc = doc.unwrap();
1902            let result = doc.get_manifest_item_with_fallback(
1903                "content_primary",
1904                vec!["application/xhtml+xml", "application/dtc+xml"],
1905            );
1906            assert!(result.is_ok());
1907            let (_, mime) = result.unwrap();
1908            assert_eq!(mime, "application/dtc+xml");
1909
1910            let result = doc
1911                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1912            assert!(result.is_ok());
1913            let (_, mime) = result.unwrap();
1914            assert_eq!(mime, "application/xhtml+xml");
1915        }
1916    }
1917
1918    /// Section 3.9 open container format
1919    mod open_container_format_tests {
1920        use std::{cmp::min, io::Read, path::Path};
1921
1922        use sha1::{Digest, Sha1};
1923
1924        use crate::epub::EpubDoc;
1925
1926        /// ID: ocf-metainf-inc
1927        ///
1928        /// An extra configuration file, not in the reserved files' list, is added to the META-INF folder; this file must be ignored.
1929        #[test]
1930        fn test_ocf_metainf_inc() {
1931            let epub_file = Path::new("./test_case/ocf-metainf-inc.epub");
1932            let doc = EpubDoc::new(epub_file);
1933            assert!(doc.is_ok());
1934        }
1935
1936        /// ID: ocf-metainf-manifest
1937        ///
1938        /// An ancillary manifest file, containing an extra spine item, is present in the META-INF directory; this extra item must be ignored by the reading system.
1939        #[test]
1940        fn test_ocf_metainf_manifest() {
1941            let epub_file = Path::new("./test_case/ocf-metainf-manifest.epub");
1942            let doc = EpubDoc::new(epub_file);
1943            assert!(doc.is_ok());
1944        }
1945
1946        /// ID: ocf-package_arbitrary
1947        ///
1948        /// The EPUB contains three valid package files and three corresponding sets of content documents, but only one of the packages, in an unusual subdirectory, is referenced by the container.xml file. The reading system must use this package.
1949        #[test]
1950        fn test_ocf_package_arbitrary() {
1951            let epub_file = Path::new("./test_case/ocf-package_arbitrary.epub");
1952            let doc = EpubDoc::new(epub_file);
1953            assert!(doc.is_ok());
1954
1955            let doc = doc.unwrap();
1956            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
1957        }
1958
1959        /// ID: ocf-package_multiple
1960        ///
1961        /// The EPUB contains three valid package files and three corresponding sets of content documents, all referenced by the container.xml file. The reading system must use the first package.
1962        #[test]
1963        fn test_ocf_package_multiple() {
1964            let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
1965            let doc = EpubDoc::new(epub_file);
1966            assert!(doc.is_ok());
1967
1968            let doc = doc.unwrap();
1969            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
1970            assert_eq!(doc.base_path, Path::new("FOO/BAR"));
1971        }
1972
1973        /// ID: ocf-url_link-leaking-relative
1974        ///
1975        /// Use a relative link with several double-dot path segments from the content to a photograph. The folder hierarchy containing the photograph starts at the root level; the relative image reference exceeds depth of hierarchy.
1976        #[test]
1977        fn test_ocf_url_link_leaking_relative() {
1978            let epub_file = Path::new("./test_case/ocf-url_link-leaking-relative.epub");
1979            let doc = EpubDoc::new(epub_file);
1980            assert!(doc.is_err());
1981            assert_eq!(
1982                doc.err().unwrap().to_string(),
1983                String::from(
1984                    "Relative link leakage: Path \"../../../../media/imgs/monastery.jpg\" is out of container range."
1985                )
1986            )
1987        }
1988
1989        /// ID: ocf-url_link-path-absolute
1990        ///
1991        /// Use a path-absolute link, i.e., beginning with a leading slash, from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
1992        #[test]
1993        fn test_ocf_url_link_path_absolute() {
1994            let epub_file = Path::new("./test_case/ocf-url_link-path-absolute.epub");
1995            let doc = EpubDoc::new(epub_file);
1996            assert!(doc.is_ok());
1997
1998            let doc = doc.unwrap();
1999            let resource = doc.manifest.get("photo").unwrap();
2000            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2001        }
2002
2003        /// ID: ocf-url_link-relative
2004        ///
2005        /// A simple relative link from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2006        #[test]
2007        fn test_ocf_url_link_relative() {
2008            let epub_file = Path::new("./test_case/ocf-url_link-relative.epub");
2009            let doc = EpubDoc::new(epub_file);
2010            assert!(doc.is_ok());
2011
2012            let doc = doc.unwrap();
2013            let resource = doc.manifest.get("photo").unwrap();
2014            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2015        }
2016
2017        /// ID: ocf-url_manifest
2018        ///
2019        /// The manifest refers to an XHTML file in an arbitrary subfolder. The reading system must be able to find the content.
2020        #[test]
2021        fn test_ocf_url_manifest() {
2022            let epub_file = Path::new("./test_case/ocf-url_manifest.epub");
2023            let doc = EpubDoc::new(epub_file);
2024            assert!(doc.is_ok());
2025
2026            let doc = doc.unwrap();
2027            assert!(doc.get_manifest_item("nav").is_ok());
2028            assert!(doc.get_manifest_item("content_001").is_ok());
2029            assert!(doc.get_manifest_item("content_002").is_err());
2030        }
2031
2032        /// ID: ocf-url_relative
2033        ///
2034        /// The manifest refers to an XHTML file in an arbitrary subfolder that is relative to the package's own arbitrary folder. The reading system must be able to find the content.
2035        #[test]
2036        fn test_ocf_url_relative() {
2037            let epub_file = Path::new("./test_case/ocf-url_relative.epub");
2038            let doc = EpubDoc::new(epub_file);
2039            assert!(doc.is_ok());
2040
2041            let doc = doc.unwrap();
2042            assert_eq!(doc.package_path, Path::new("foo/BAR/baz.opf"));
2043            assert_eq!(doc.base_path, Path::new("foo/BAR"));
2044            assert_eq!(
2045                doc.manifest.get("nav").unwrap().path,
2046                Path::new("foo/BAR/nav.xhtml")
2047            );
2048            assert_eq!(
2049                doc.manifest.get("content_001").unwrap().path,
2050                Path::new("foo/BAR/qux/content_001.xhtml")
2051            );
2052            assert!(doc.get_manifest_item("nav").is_ok());
2053            assert!(doc.get_manifest_item("content_001").is_ok());
2054        }
2055
2056        /// ID: ocf-zip-comp
2057        ///
2058        /// MUST treat any OCF ZIP container that uses compression techniques other than Deflate as in error.
2059        /// This test case does not use compression methods other than Deflate and cannot detect whether it is effective.
2060        #[test]
2061        fn test_ocf_zip_comp() {
2062            let epub_file = Path::new("./test_case/ocf-zip-comp.epub");
2063            let doc = EpubDoc::new(epub_file);
2064            assert!(doc.is_ok());
2065        }
2066
2067        /// ID: ocf-zip-mult
2068        ///
2069        /// MUST treat any OCF ZIP container that splits the content into segments as in error.
2070        /// This test case is not a segmented OCF ZIP container and cannot be tested to see if it is valid.
2071        #[test]
2072        fn test_ocf_zip_mult() {
2073            let epub_file = Path::new("./test_case/ocf-zip-mult.epub");
2074            let doc = EpubDoc::new(epub_file);
2075            assert!(doc.is_ok());
2076        }
2077
2078        /// ID: ocf-font_obfuscation
2079        ///
2080        /// An obfuscated (TrueType) font should be displayed after de-obfuscation.
2081        #[test]
2082        fn test_ocf_font_obfuscation() {
2083            let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2084            let doc = EpubDoc::new(epub_file);
2085            assert!(doc.is_ok());
2086
2087            let doc = doc.unwrap();
2088            let unique_id = doc.unique_identifier.clone();
2089
2090            let mut hasher = Sha1::new();
2091            hasher.update(unique_id.as_bytes());
2092            let hash = hasher.finalize();
2093            let mut key = vec![0u8; 1040];
2094            for i in 0..1040 {
2095                key[i] = hash[i % hash.len()];
2096            }
2097
2098            assert!(doc.encryption.is_some());
2099            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2100
2101            let data = &doc.encryption.unwrap()[0];
2102            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2103
2104            let font_file = doc
2105                .archive
2106                .lock()
2107                .unwrap()
2108                .by_name(&data.data)
2109                .unwrap()
2110                .bytes()
2111                .collect::<Result<Vec<u8>, _>>();
2112            assert!(font_file.is_ok());
2113            let font_file = font_file.unwrap();
2114
2115            // 根据EPUB规范，字体混淆是直接对字体文件进行的，不需要解压步骤，直接进行去混淆处理
2116            let mut deobfuscated = font_file.clone();
2117            for i in 0..min(1040, deobfuscated.len()) {
2118                deobfuscated[i] ^= key[i];
2119            }
2120
2121            assert!(is_valid_font(&deobfuscated));
2122        }
2123
2124        /// ID: ocf-font_obfuscation-bis
2125        ///
2126        /// An obfuscated (TrueType) font should not be displayed after de-obfuscation, because the obfuscation used a different publication id.
2127        #[test]
2128        fn test_ocf_font_obfuscation_bis() {
2129            let epub_file = Path::new("./test_case/ocf-font_obfuscation_bis.epub");
2130            let doc = EpubDoc::new(epub_file);
2131            assert!(doc.is_ok());
2132
2133            let doc = doc.unwrap();
2134
2135            let wrong_unique_id = "wrong-publication-id";
2136            let mut hasher = Sha1::new();
2137            hasher.update(wrong_unique_id.as_bytes());
2138            let hash = hasher.finalize();
2139            let mut wrong_key = vec![0u8; 1040];
2140            for i in 0..1040 {
2141                wrong_key[i] = hash[i % hash.len()];
2142            }
2143
2144            assert!(doc.encryption.is_some());
2145            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2146
2147            let data = &doc.encryption.unwrap()[0];
2148            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2149
2150            let font_file = doc
2151                .archive
2152                .lock()
2153                .unwrap()
2154                .by_name(&data.data)
2155                .unwrap()
2156                .bytes()
2157                .collect::<Result<Vec<u8>, _>>();
2158            assert!(font_file.is_ok());
2159            let font_file = font_file.unwrap();
2160
2161            // 使用错误的密钥进行去混淆
2162            let mut deobfuscated_with_wrong_key = font_file.clone();
2163            for i in 0..std::cmp::min(1040, deobfuscated_with_wrong_key.len()) {
2164                deobfuscated_with_wrong_key[i] ^= wrong_key[i];
2165            }
2166
2167            assert!(!is_valid_font(&deobfuscated_with_wrong_key));
2168        }
2169
2170        fn is_valid_font(data: &[u8]) -> bool {
2171            if data.len() < 4 {
2172                return false;
2173            }
2174            let sig = &data[0..4];
2175            // OTF: "OTTO"
2176            // TTF: 0x00010000, 0x00020000, "true", "typ1"
2177            sig == b"OTTO"
2178                || sig == b"\x00\x01\x00\x00"
2179                || sig == b"\x00\x02\x00\x00"
2180                || sig == b"true"
2181                || sig == b"typ1"
2182        }
2183    }
2184
2185    #[test]
2186    fn test_parse_container() {
2187        let epub_file = Path::new("./test_case/ocf-zip-mult.epub");
2188        let doc = EpubDoc::new(epub_file);
2189        assert!(doc.is_ok());
2190
2191        // let doc = doc.unwrap();
2192        let container = r#"
2193        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
2194            <rootfiles></rootfiles>
2195        </container>
2196        "#
2197        .to_string();
2198
2199        let result = EpubDoc::<BufReader<File>>::parse_container(container);
2200        assert!(result.is_err());
2201        assert_eq!(
2202            result.unwrap_err(),
2203            EpubError::NonCanonicalFile { tag: "rootfile".to_string() }
2204        );
2205
2206        let container = r#"
2207        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
2208            <rootfiles>
2209                <rootfile media-type="application/oebps-package+xml"/>
2210            </rootfiles>
2211        </container>
2212        "#
2213        .to_string();
2214
2215        let result = EpubDoc::<BufReader<File>>::parse_container(container);
2216        assert!(result.is_err());
2217        assert_eq!(
2218            result.unwrap_err(),
2219            EpubError::MissingRequiredAttribute {
2220                tag: "rootfile".to_string(),
2221                attribute: "full-path".to_string(),
2222            }
2223        );
2224
2225        let container = r#"
2226        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
2227            <rootfiles>
2228                <rootfile media-type="application/oebps-package+xml" full-path="EPUB/content.opf"/>
2229            </rootfiles>
2230        </container>
2231        "#
2232        .to_string();
2233
2234        let result = EpubDoc::<BufReader<File>>::parse_container(container);
2235        assert!(result.is_ok());
2236        assert_eq!(result.unwrap(), PathBuf::from("EPUB/content.opf"))
2237    }
2238
2239    #[test]
2240    fn test_parse_manifest() {
2241        let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
2242        let doc = EpubDoc::new(epub_file);
2243        assert!(doc.is_ok());
2244
2245        let manifest = r#"
2246        <manifest>
2247            <item href="content_001.xhtml" media-type="application/xhtml+xml"/>
2248            <item properties="nav" href="nav.xhtml" media-type="application/xhtml+xml"/>
2249        </manifest>
2250        "#;
2251        let mut doc = doc.unwrap();
2252        let element = XmlReader::parse(manifest);
2253        assert!(element.is_ok());
2254
2255        let element = element.unwrap();
2256        let result = doc.parse_manifest(&element);
2257        assert!(result.is_err());
2258        assert_eq!(
2259            result.unwrap_err(),
2260            EpubError::MissingRequiredAttribute {
2261                tag: "item".to_string(),
2262                attribute: "id".to_string(),
2263            },
2264        );
2265
2266        let manifest = r#"
2267        <manifest>
2268            <item id="content_001" media-type="application/xhtml+xml"/>
2269            <item id="nav" properties="nav" media-type="application/xhtml+xml"/>
2270        </manifest>
2271        "#;
2272        let element = XmlReader::parse(manifest);
2273        assert!(element.is_ok());
2274
2275        let element = element.unwrap();
2276        let result = doc.parse_manifest(&element);
2277        assert!(result.is_err());
2278        assert_eq!(
2279            result.unwrap_err(),
2280            EpubError::MissingRequiredAttribute {
2281                tag: "item".to_string(),
2282                attribute: "href".to_string(),
2283            },
2284        );
2285
2286        let manifest = r#"
2287        <manifest>
2288            <item id="content_001" href="content_001.xhtml"/>
2289            <item id="nav" properties="nav" href="nav.xhtml"/>
2290        </manifest>
2291        "#;
2292        let element = XmlReader::parse(manifest);
2293        assert!(element.is_ok());
2294
2295        let element = element.unwrap();
2296        let result = doc.parse_manifest(&element);
2297        assert!(result.is_err());
2298        assert_eq!(
2299            result.unwrap_err(),
2300            EpubError::MissingRequiredAttribute {
2301                tag: "item".to_string(),
2302                attribute: "media-type".to_string(),
2303            },
2304        );
2305
2306        let manifest = r#"
2307        <manifest>
2308            <item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
2309            <item id="nav" properties="nav" href="nav.xhtml" media-type="application/xhtml+xml"/>
2310        </manifest>
2311        "#;
2312        let element = XmlReader::parse(manifest);
2313        assert!(element.is_ok());
2314
2315        let element = element.unwrap();
2316        let result = doc.parse_manifest(&element);
2317        assert!(result.is_ok());
2318    }
2319
2320    /// Test for function `has_encryption`
2321    #[test]
2322    fn test_fn_has_encryption() {
2323        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2324        let doc = EpubDoc::new(epub_file);
2325        assert!(doc.is_ok());
2326
2327        let doc = doc.unwrap();
2328        assert!(doc.has_encryption());
2329    }
2330
2331    /// This test is used to detect whether the "META-INF/encryption.xml" file is parsed correctly
2332    #[test]
2333    fn test_fn_parse_encryption() {
2334        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2335        let doc = EpubDoc::new(epub_file);
2336        assert!(doc.is_ok());
2337
2338        let doc = doc.unwrap();
2339        assert!(doc.encryption.is_some());
2340
2341        let encryption = doc.encryption.unwrap();
2342        assert_eq!(encryption.len(), 1);
2343        assert_eq!(encryption[0].method, "http://www.idpf.org/2008/embedding");
2344        assert_eq!(encryption[0].data, "EPUB/fonts/Lobster.ttf");
2345    }
2346
2347    #[test]
2348    fn test_get_metadata_existing_key() {
2349        let epub_file = Path::new("./test_case/epub-33.epub");
2350        let doc = EpubDoc::new(epub_file);
2351        assert!(doc.is_ok());
2352
2353        let doc = doc.unwrap();
2354
2355        let titles = doc.get_metadata("title");
2356        assert!(titles.is_some());
2357
2358        let titles = titles.unwrap();
2359        assert_eq!(titles.len(), 1);
2360        assert_eq!(titles[0].property, "title");
2361        assert_eq!(titles[0].value, "EPUB 3.3");
2362
2363        let languages = doc.get_metadata("language");
2364        assert!(languages.is_some());
2365
2366        let languages = languages.unwrap();
2367        assert_eq!(languages.len(), 1);
2368        assert_eq!(languages[0].property, "language");
2369        assert_eq!(languages[0].value, "en-us");
2370
2371        let language = doc.get_language();
2372        assert!(language.is_ok());
2373        assert_eq!(language.unwrap(), vec!["en-us"]);
2374    }
2375
2376    #[test]
2377    fn test_get_metadata_nonexistent_key() {
2378        let epub_file = Path::new("./test_case/epub-33.epub");
2379        let doc = EpubDoc::new(epub_file);
2380        assert!(doc.is_ok());
2381
2382        let doc = doc.unwrap();
2383        let metadata = doc.get_metadata("nonexistent");
2384        assert!(metadata.is_none());
2385    }
2386
2387    #[test]
2388    fn test_get_metadata_multiple_items_same_type() {
2389        let epub_file = Path::new("./test_case/epub-33.epub");
2390        let doc = EpubDoc::new(epub_file);
2391        assert!(doc.is_ok());
2392
2393        let doc = doc.unwrap();
2394
2395        let creators = doc.get_metadata("creator");
2396        assert!(creators.is_some());
2397
2398        let creators = creators.unwrap();
2399        assert_eq!(creators.len(), 3);
2400
2401        assert_eq!(creators[0].id, Some("creator_id_0".to_string()));
2402        assert_eq!(creators[0].property, "creator");
2403        assert_eq!(creators[0].value, "Matt Garrish, DAISY Consortium");
2404
2405        assert_eq!(creators[1].id, Some("creator_id_1".to_string()));
2406        assert_eq!(creators[1].property, "creator");
2407        assert_eq!(creators[1].value, "Ivan Herman, W3C");
2408
2409        assert_eq!(creators[2].id, Some("creator_id_2".to_string()));
2410        assert_eq!(creators[2].property, "creator");
2411        assert_eq!(creators[2].value, "Dave Cramer, Invited Expert");
2412    }
2413
2414    #[test]
2415    fn test_get_metadata_with_refinement() {
2416        let epub_file = Path::new("./test_case/epub-33.epub");
2417        let doc = EpubDoc::new(epub_file);
2418        assert!(doc.is_ok());
2419
2420        let doc = doc.unwrap();
2421
2422        let title = doc.get_metadata("title");
2423        assert!(title.is_some());
2424
2425        let title = title.unwrap();
2426        assert_eq!(title.len(), 1);
2427        assert_eq!(title[0].refined.len(), 1);
2428        assert_eq!(title[0].refined[0].property, "title-type");
2429        assert_eq!(title[0].refined[0].value, "main");
2430    }
2431
2432    #[test]
2433    fn test_get_manifest_item_with_fallback() {
2434        let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
2435        let doc = EpubDoc::new(epub_file);
2436        assert!(doc.is_ok());
2437
2438        let doc = doc.unwrap();
2439        assert!(doc.get_manifest_item("content_001").is_ok());
2440        assert!(doc.get_manifest_item("bar").is_ok());
2441
2442        // 当回退链上存在可回退资源时能获取资源
2443        if let Ok((_, mime)) = doc.get_manifest_item_with_fallback("content_001", vec!["image/psd"])
2444        {
2445            assert_eq!(mime, "image/psd");
2446        } else {
2447            assert!(false, "get_manifest_item_with_fallback failed");
2448        }
2449
2450        // 当回退链上不存在可回退资源时无法获取资源
2451        assert_eq!(
2452            doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
2453                .unwrap_err()
2454                .to_string(),
2455            "No supported file format: The fallback resource does not contain the file format you support."
2456        );
2457    }
2458
2459    #[test]
2460    fn test_get_cover() {
2461        let epub_file = Path::new("./test_case/pkg-cover-image.epub");
2462        let doc = EpubDoc::new(epub_file);
2463        if let Err(err) = &doc {
2464            println!("{}", err);
2465        }
2466        assert!(doc.is_ok());
2467
2468        let doc = doc.unwrap();
2469        let result = doc.get_cover();
2470        assert!(result.is_some());
2471
2472        let (data, mime) = result.unwrap();
2473        assert_eq!(data.len(), 5785);
2474        assert_eq!(mime, "image/jpeg");
2475    }
2476
2477    #[test]
2478    fn test_epub_2() {
2479        let epub_file = Path::new("./test_case/epub-2.epub");
2480        let doc = EpubDoc::new(epub_file);
2481        assert!(doc.is_ok());
2482
2483        let doc = doc.unwrap();
2484
2485        let titles = doc.get_title();
2486        assert!(titles.is_ok());
2487        assert_eq!(titles.unwrap(), vec!["Minimal EPUB 2.0"]);
2488    }
2489}
lib_epub/epub.rs

lib_epub/
epub.rs