lib_epub/
epub.rs

1//! The core module of the EPUB parsing library
2//!
3//! This module provides complete parsing functionality for EPUB ebook files
4//! and is the core component of the entire library. The `EpubDoc` structure
5//! encapsulates all the parsing logic and data access interfaces for EPUB files.
6//!
7//! ## Main references to EPUB specs:
8//! - <https://www.w3.org/TR/epub-33>
9//! - <https://idpf.org/epub/201>
10//!
11//! ## Potential Issues
12//! - The generic parameter `R: Read + Seek` increases complexity, particularly
13//!   in asynchronous environments. The current design is not conducive to multi-threaded
14//!   concurrent access and requires an external synchronization mechanism.
15//! - Some error handling may not be sufficiently nuanced, and certain edge cases
16//!   may not be adequately considered.
17//! - Loading the entire EPUB document at once may result in significant memory consumption,
18//!   especially for large publications.
19//!
20//! ## Future Work
//! - Add support for asynchronous I/O to improve the user experience in asynchronous
//!   environments. Consider adding support for multi-threaded access.
//! - Support more EPUB specification features, such as media overlays and scripts.
24
25use std::{
26    collections::HashMap,
27    fs::{File, canonicalize},
28    io::{BufReader, Read, Seek},
29    path::{Path, PathBuf},
30};
31
32use log::warn;
33use zip::{ZipArchive, result::ZipError};
34
35use crate::{
36    error::EpubError,
37    types::{
38        EncryptionData, EpubVersion, ManifestItem, MetadataItem, MetadataLinkItem,
39        MetadataRefinement, NavPoint, SpineItem,
40    },
41    utils::{
42        DecodeBytes, NormalizeWhitespace, XmlElement, XmlReader, adobe_font_dencryption,
43        check_realtive_link_leakage, compression_method_check, get_file_in_zip_archive,
44        idpf_font_dencryption,
45    },
46};
47
/// EPUB document parser, representing a loaded and parsed EPUB publication
///
/// The `EpubDoc` structure is the core of the entire EPUB parsing library.
/// It encapsulates all the parsing logic and data access interfaces for EPUB files.
/// It is responsible for parsing various components of an EPUB, including metadata,
/// manifests, reading order, table of contents navigation, and encrypted information,
/// and provides methods for accessing this data.
///
/// Provides a unified data access interface for EPUB files, hiding the underlying
/// file structure and parsing details. Strictly adheres to the EPUB specification
/// in implementing the parsing logic to ensure compatibility with the standard.
///
/// # Usage
///
/// ```rust
/// use lib_epub::epub::EpubDoc;
///
/// let doc = EpubDoc::new("./test_case/epub-33.epub");
/// assert!(doc.is_ok());
/// ```
pub struct EpubDoc<R: Read + Seek> {
    /// The ZIP archive backing the EPUB file; all container entries are read through it
    pub(crate) archive: ZipArchive<R>,

    /// The path to the target epub file (canonicalized when the document is created)
    pub(crate) epub_path: PathBuf,

    /// The path to the OPF file, as declared by `META-INF/container.xml`
    pub package_path: PathBuf,

    /// The path to the directory where the OPF file is located
    /// (the parent of `package_path`)
    pub base_path: PathBuf,

    /// The epub version
    pub version: EpubVersion,

    /// The unique identifier of the epub file
    ///
    /// This is the value of the `identifier` metadata entry whose id is referenced by
    /// the `unique-identifier` attribute of the package element. When the attribute
    /// is absent, the first `identifier` metadata entry is used instead.
    pub unique_identifier: String,

    /// Epub metadata extracted from OPF
    pub metadata: Vec<MetadataItem>,

    /// Data in metadata that points to external files
    pub metadata_link: Vec<MetadataLinkItem>,

    /// A list of resources contained inside an epub extracted from OPF,
    /// keyed by the manifest item id
    ///
    /// All resources in the epub file are declared here,
    /// and undeclared resources should not be stored in the epub file and cannot be obtained from it.
    pub manifest: HashMap<String, ManifestItem>,

    /// Physical reading order of publications extracted from OPF
    ///
    /// This attribute declares the order in which multiple files
    /// containing published content should be displayed.
    pub spine: Vec<SpineItem>,

    /// The entries of `encryption.xml` extracted from the META-INF directory
    ///
    /// `None` when the file is absent or declares no `EncryptedData` entries.
    pub encryption: Option<Vec<EncryptionData>>,

    /// The navigation data of the epub file
    pub catalog: Vec<NavPoint>,

    /// The title of the catalog (empty when the navigation source declares none)
    pub catalog_title: String,

    /// The index of the current reading spine (starts at `0`)
    pub current_spine_index: usize,
}
119
120impl<R: Read + Seek> EpubDoc<R> {
121    /// Creates a new EPUB document instance from a reader
122    ///
123    /// This function is responsible for the core logic of parsing EPUB files,
124    /// including verifying the file format, parsing container information,
125    /// loading the OPF package document, and extracting metadata, manifest,
126    /// reading order, and other core information.
127    ///
128    /// # Parameters
129    /// - `reader`: The data source that implements the `Read` and `Seek` traits,
130    ///   usually a file or memory buffer
131    /// - `epub_path`: The path to the EPUB file, used for path resolution and validation
132    ///
133    /// # Return
134    /// - `Ok(EpubDoc<R>)`: The successfully parsed EPUB document object
135    /// - `Err(EpubError)`: Errors encountered during parsing
136    ///
137    /// # Notes
138    /// - This function assumes the EPUB file structure is valid
139    pub fn from_reader(reader: R, epub_path: PathBuf) -> Result<Self, EpubError> {
140        // Parsing process
141        // 1. Verify that the ZIP compression method conforms to the EPUB specification
142        // 2. Parse `META-INF/container.xml` retrieves the location of the OPF file
143        // 3. Parses the OPF file to obtain package documentation information
144        // 4. Extracts version information
145        // 5. Parses metadata, manifest, and spine
146        // 6. Parses encrypted information and directory navigation
147        // 7. Verifies and extracts the unique identifier
148
149        let mut archive = ZipArchive::new(reader).map_err(EpubError::from)?;
150        let epub_path = canonicalize(epub_path)?;
151
152        compression_method_check(&mut archive)?;
153
154        let container =
155            get_file_in_zip_archive(&mut archive, "META-INF/container.xml")?.decode()?;
156        let package_path = Self::parse_container(container)?;
157        let base_path = package_path
158            .parent()
159            .expect("所有文件的父目录不能为空")
160            .to_path_buf();
161
162        let opf_file =
163            get_file_in_zip_archive(&mut archive, package_path.to_str().unwrap())?.decode()?;
164        let package = XmlReader::parse(&opf_file)?;
165        // let document = kiss_xml::parse_str(opf_file).unwrap();
166
167        // let package = document.root_element();
168        let version = Self::determine_epub_version(&package)?;
169
170        let mut doc = Self {
171            archive,
172            epub_path,
173            package_path,
174            base_path,
175            version,
176            unique_identifier: String::new(),
177            metadata: vec![],
178            metadata_link: vec![],
179            manifest: HashMap::new(),
180            spine: vec![],
181            encryption: None,
182            catalog: vec![],
183            catalog_title: String::new(),
184            current_spine_index: 0,
185        };
186
187        let metadata_element = package.find_elements_by_name("metadata").next().unwrap();
188        let manifest_element = package.find_elements_by_name("manifest").next().unwrap();
189        let spine_element = package.find_elements_by_name("spine").next().unwrap();
190
191        doc.parse_metadata(metadata_element)?;
192        doc.parse_manifest(manifest_element)?;
193        doc.parse_spine(spine_element)?;
194        doc.parse_encryption()?;
195        doc.parse_catalog()?;
196
197        // 断言必有唯一标识符
198        doc.unique_identifier = if let Some(uid) = package.get_attr("unique-identifier") {
199            doc.metadata.iter().find(|item| {
200                item.property == "identifier" && item.id.as_ref().is_some_and(|id| id == &uid)
201            })
202        } else {
203            doc.metadata
204                .iter()
205                .find(|item| item.property == "identifier")
206        }
207        .map(|item| item.value.clone())
208        .ok_or_else(|| EpubError::NonCanonicalFile {
209            tag: "dc:identifier".to_string(),
210        })?;
211
212        Ok(doc)
213    }
214
215    /// Parse the EPUB container file (META-INF/container.xml)
216    ///
217    /// This function parses the container information in the EPUB file 、
218    /// to extract the path to the OPF package file. According to the EPUB
219    /// specification, the `container.xml` file must exist in the `META-INF`
220    /// directory and contain at least one `rootfile` element pointing to
221    /// the main OPF file. When multiple `rootfile` elements exist, the first
222    /// element pointing to the OPF file is used as the default.
223    ///
224    /// # Parameters
225    /// - `data`: The content string of the container.xml
226    ///
227    /// # Return
228    /// - `Ok(PathBuf)`: The path to the successfully parsed OPF file
229    /// - `Err(EpubError)`: Errors encountered during parsing
230    fn parse_container(data: String) -> Result<PathBuf, EpubError> {
231        let root = XmlReader::parse(&data)?;
232        let rootfile = root
233            .find_elements_by_name("rootfile")
234            .next()
235            .ok_or_else(|| EpubError::NonCanonicalFile {
236                tag: "rootfile".to_string(),
237            })?;
238
239        let attr =
240            rootfile
241                .get_attr("full-path")
242                .ok_or_else(|| EpubError::MissingRequiredAttribute {
243                    tag: "rootfile".to_string(),
244                    attribute: "full-path".to_string(),
245                })?;
246
247        Ok(PathBuf::from(attr))
248    }
249
250    /// Parse the EPUB metadata section
251    ///
252    /// This function is responsible for parsing the `<metadata>` elements
253    /// in the OPF file to extract basic information about the publication.
254    /// It handles metadata elements from different namespaces:
255    /// - Elements in the Dublin Core namespace (`http://purl.org/dc/elements/1.1/`)
256    /// - Elements in the OPF namespace (`http://www.idpf.org/2007/opf`)
257    ///
258    /// # Parameters
259    /// - `metadata_element`: A reference to the `<metadata>` element in the OPF file
260    fn parse_metadata(&mut self, metadata_element: &XmlElement) -> Result<(), EpubError> {
261        const DC_NAMESPACE: &str = "http://purl.org/dc/elements/1.1/";
262        const OPF_NAMESPACE: &str = "http://www.idpf.org/2007/opf";
263
264        let mut metadata = Vec::new();
265        let mut metadata_link = Vec::new();
266        let mut refinements = HashMap::<String, Vec<MetadataRefinement>>::new();
267
268        for element in metadata_element.children() {
269            match &element.namespace {
270                Some(namespace) if namespace == DC_NAMESPACE => {
271                    self.parse_dc_metadata(element, &mut metadata)?
272                }
273
274                Some(namespace) if namespace == OPF_NAMESPACE => self.parse_opf_metadata(
275                    element,
276                    &mut metadata,
277                    &mut metadata_link,
278                    &mut refinements,
279                )?,
280
281                _ => {}
282            };
283        }
284
285        for item in metadata.iter_mut() {
286            if let Some(id) = &item.id {
287                if let Some(refinements) = refinements.remove(id) {
288                    item.refined = refinements;
289                }
290            }
291        }
292
293        self.metadata = metadata;
294        self.metadata_link = metadata_link;
295        Ok(())
296    }
297
298    /// Parse the EPUB manifest section
299    ///
300    /// This function parses the `<manifest>` element in the OPF file, extracting
301    /// information about all resource files in the publication. Each resource contains
302    /// basic information such as id, file path, MIME type, as well as optional
303    /// attributes and fallback resource information.
304    ///
305    /// # Parameters
306    /// - `manifest_element`: A reference to the `<manifest>` element in the OPF file
307    fn parse_manifest(&mut self, manifest_element: &XmlElement) -> Result<(), EpubError> {
308        let estimated_items = manifest_element.children().count();
309        let mut resources = HashMap::with_capacity(estimated_items);
310
311        for element in manifest_element.children() {
312            let id = element
313                .get_attr("id")
314                .ok_or_else(|| EpubError::MissingRequiredAttribute {
315                    tag: element.tag_name(),
316                    attribute: "id".to_string(),
317                })?
318                .to_string();
319            let path = element
320                .get_attr("href")
321                .ok_or_else(|| EpubError::MissingRequiredAttribute {
322                    tag: element.tag_name(),
323                    attribute: "href".to_string(),
324                })?
325                .to_string();
326            let mime = element
327                .get_attr("media-type")
328                .ok_or_else(|| EpubError::MissingRequiredAttribute {
329                    tag: element.tag_name(),
330                    attribute: "media-type".to_string(),
331                })?
332                .to_string();
333            let properties = element.get_attr("properties");
334            let fallback = element.get_attr("fallback");
335
336            resources.insert(
337                id.clone(),
338                ManifestItem {
339                    id,
340                    path: self.normalize_manifest_path(&path)?,
341                    mime,
342                    properties,
343                    fallback,
344                },
345            );
346        }
347
348        self.manifest = resources;
349        self.validate_fallback_chains();
350        Ok(())
351    }
352
353    /// Parse the EPUB spine section
354    ///
355    /// This function parses the `<spine>` elements in the OPF file to extract
356    /// the reading order information of the publication. The spine defines the
357    /// linear reading order of the publication's content documents, and each
358    /// spine item references resources in the manifest.
359    ///
360    /// # Parameters
361    /// - `spine_element`: A reference to the `<spine>` element in the OPF file
362    fn parse_spine(&mut self, spine_element: &XmlElement) -> Result<(), EpubError> {
363        let mut spine = Vec::new();
364        for element in spine_element.children() {
365            let idref = element
366                .get_attr("idref")
367                .ok_or_else(|| EpubError::MissingRequiredAttribute {
368                    tag: element.tag_name(),
369                    attribute: "idref".to_string(),
370                })?
371                .to_string();
372            let id = element.get_attr("id");
373            let linear = element
374                .get_attr("linear")
375                .map(|linear| linear == "yes")
376                .unwrap_or(true);
377            let properties = element.get_attr("properties");
378
379            spine.push(SpineItem {
380                idref,
381                id,
382                linear,
383                properties,
384            });
385        }
386
387        self.spine = spine;
388        Ok(())
389    }
390
391    /// Parse the EPUB encryption file (META-INF/encryption.xml)
392    ///
393    /// This function is responsible for parsing the `encryption.xml` file
394    /// in the `META-INF` directory to extract information about encrypted
395    /// resources in the publication. According to the EPUB specification,
396    /// the encryption information describes which resources are encrypted
397    /// and the encryption methods used.
398    ///
399    /// TODO: 需要对使用非对称加密数据的加密项进行额外处理，以获取非对称加密密钥
400    fn parse_encryption(&mut self) -> Result<(), EpubError> {
401        if !self.has_encryption() {
402            return Ok(());
403        }
404
405        let encryption_file =
406            get_file_in_zip_archive(&mut self.archive, "META-INF/encryption.xml")?.decode()?;
407
408        let root = XmlReader::parse(&encryption_file)?;
409
410        let mut encryption_data = Vec::new();
411        for data in root.children() {
412            if data.name != "EncryptedData" {
413                continue;
414            }
415
416            let method = data
417                .find_elements_by_name("EncryptionMethod")
418                .next()
419                .ok_or_else(|| EpubError::NonCanonicalFile {
420                    tag: "EncryptionMethod".to_string(),
421                })?;
422            let reference = data
423                .find_elements_by_name("CipherReference")
424                .next()
425                .ok_or_else(|| EpubError::NonCanonicalFile {
426                    tag: "CipherReference".to_string(),
427                })?;
428
429            encryption_data.push(EncryptionData {
430                method: method
431                    .get_attr("Algorithm")
432                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
433                        tag: "EncryptionMethod".to_string(),
434                        attribute: "Algorithm".to_string(),
435                    })?
436                    .to_string(),
437                data: reference
438                    .get_attr("URI")
439                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
440                        tag: "CipherReference".to_string(),
441                        attribute: "URI".to_string(),
442                    })?
443                    .to_string(),
444            });
445        }
446
447        if !encryption_data.is_empty() {
448            self.encryption = Some(encryption_data);
449        }
450
451        Ok(())
452    }
453
454    /// Parse the EPUB navigation information
455    ///
456    /// This function is responsible for parsing the navigation information of EPUB
457    /// publications. Different parsing strategies are used depending on the EPUB version:
458    /// - EPUB 2.0: Parses the NCX file to obtain directory information
459    /// - EPUB 3.0: Parses the Navigation Document (NAV) file to obtain directory information
460    fn parse_catalog(&mut self) -> Result<(), EpubError> {
461        const HEAD_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];
462
463        match self.version {
464            EpubVersion::Version2_0 => {
465                let opf_file = get_file_in_zip_archive(
466                    &mut self.archive,
467                    self.package_path.to_str().unwrap(),
468                )?
469                .decode()?;
470                let opf_element = XmlReader::parse(&opf_file)?;
471
472                let toc_id = opf_element
473                    .find_children_by_name("spine")
474                    .next()
475                    .ok_or_else(|| EpubError::NonCanonicalFile {
476                        tag: "spine".to_string(),
477                    })?
478                    .get_attr("toc")
479                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
480                        tag: "spine".to_string(),
481                        attribute: "toc".to_string(),
482                    })?
483                    .to_owned();
484                let toc_path = self
485                    .manifest
486                    .get(&toc_id)
487                    .ok_or(EpubError::ResourceIdNotExist { id: toc_id })?
488                    .path
489                    .to_str()
490                    .unwrap();
491
492                let ncx_file = get_file_in_zip_archive(&mut self.archive, toc_path)?.decode()?;
493                let ncx = XmlReader::parse(&ncx_file)?;
494
495                match ncx.find_elements_by_name("docTitle").next() {
496                    Some(element) => self.catalog_title = element.text(),
497                    None => warn!(
498                        "Expecting to get docTitle information from the ncx file, but it's missing."
499                    ),
500                };
501
502                let nav_map = ncx.find_elements_by_name("navMap").next().ok_or_else(|| {
503                    EpubError::NonCanonicalFile {
504                        tag: "navMap".to_string(),
505                    }
506                })?;
507
508                self.catalog = self.parse_nav_points(nav_map)?;
509
510                Ok(())
511            }
512
513            EpubVersion::Version3_0 => {
514                let nav_path = self
515                    .manifest
516                    .values()
517                    .find(|item| {
518                        if let Some(property) = &item.properties {
519                            return property.contains("nav");
520                        }
521                        false
522                    })
523                    .map(|item| item.path.clone())
524                    .ok_or_else(|| EpubError::NonCanonicalEpub {
525                        expected_file: "Navigation Document".to_string(),
526                    })?;
527
528                let nav_file =
529                    get_file_in_zip_archive(&mut self.archive, nav_path.to_str().unwrap())?
530                        .decode()?;
531
532                let nav_element = XmlReader::parse(&nav_file)?;
533                let nav = nav_element
534                    .find_elements_by_name("nav")
535                    .find(|&element| element.get_attr("epub:type") == Some(String::from("toc")))
536                    .ok_or_else(|| EpubError::NonCanonicalFile {
537                        tag: "nav".to_string(),
538                    })?;
539                let nav_title = nav.find_children_by_names(&HEAD_TAGS).next();
540                let nav_list = nav.find_children_by_name("ol").next().ok_or_else(|| {
541                    EpubError::NonCanonicalFile {
542                        tag: "ol".to_string(),
543                    }
544                })?;
545
546                self.catalog = self.parse_catalog_list(nav_list)?;
547                if let Some(nav_title) = nav_title {
548                    self.catalog_title = nav_title.text();
549                };
550                Ok(())
551            }
552        }
553    }
554
555    /// Check if the EPUB file contains `encryption.xml`
556    ///
557    /// This function determines whether a publication contains encrypted resources
558    /// by checking if a `META-INF/encryption.xml` file exists in the EPUB package.
559    /// According to the EPUB specification, when resources in a publication are
560    /// encrypted, the corresponding encryption information must be declared in
561    /// the `META-INF/encryption.xml` file.
562    ///
563    /// # Return
564    /// - `true` if the publication contains encrypted resources
565    /// - `false` if the publication does not contain encrypted resources
566    ///
567    /// # Notes
568    /// - This function only checks the existence of the encrypted file;
569    ///   it does not verify the validity of the encrypted information.
570    pub fn has_encryption(&mut self) -> bool {
571        self.archive
572            .by_path(Path::new("META-INF/encryption.xml"))
573            .is_ok()
574    }
575
576    /// Retrieves a list of metadata items
577    ///
578    /// This function retrieves all matching metadata items from the EPUB metadata
579    /// based on the specified attribute name (key). Metadata items may come from
580    /// the DC (Dublin Core) namespace or the OPF namespace and contain basic
581    /// information about the publication, such as title, author, identifier, etc.
582    ///
583    /// # Parameters
584    /// - `key`: The name of the metadata attribute to retrieve
585    ///
586    /// # Return
587    /// - `Some(Vec<MetadataItem>)`: A vector containing all matching metadata items
588    /// - `None`: If no matching metadata items are found
589    pub fn get_metadata(&self, key: &str) -> Option<Vec<MetadataItem>> {
590        let metadatas = self
591            .metadata
592            .iter()
593            .filter(|item| item.property == key)
594            .cloned()
595            .collect::<Vec<MetadataItem>>();
596
597        (!metadatas.is_empty()).then_some(metadatas)
598    }
599
600    /// Retrieves a list of values for specific metadata items
601    ///
602    /// This function retrieves the values of all matching metadata items from
603    /// the EPUB metadata based on the given property name (key).
604    ///
605    /// # Parameters
606    /// - `key`: The name of the metadata attribute to retrieve
607    ///
608    /// # Return
609    /// - `Some(Vec<String>)`: A vector containing all matching metadata item values
610    /// - `None`: If no matching metadata items are found
611    pub fn get_metadata_value(&self, key: &str) -> Option<Vec<String>> {
612        let values = self
613            .metadata
614            .iter()
615            .filter(|item| item.property == key)
616            .map(|item| item.value.clone())
617            .collect::<Vec<String>>();
618
619        (!values.is_empty()).then_some(values)
620    }
621
622    /// Retrieves the title of the publication
623    ///
624    /// This function retrieves all title information from the EPUB metadata.
625    /// According to the EPUB specification, a publication can have multiple titles,
626    /// which are returned in the order they appear in the metadata.
627    ///
628    /// # Return
629    /// - `Result<Vec<String>, EpubError>`: A vector containing all title information
630    /// - `EpubError`: If and only if the OPF file does not contain `<dc:title>`
631    ///
632    /// # Notes
633    /// - The EPUB specification requires each publication to have at least one title.
634    pub fn get_title(&self) -> Result<Vec<String>, EpubError> {
635        self.get_metadata_value("title")
636            .ok_or_else(|| EpubError::NonCanonicalFile {
637                tag: "title".to_string(),
638            })
639    }
640
641    /// Retrieves the language used in the publication
642    ///
643    /// This function retrieves the language information of a publication from the EPUB
644    /// metadata. According to the EPUB specification, language information identifies
645    /// the primary language of the publication and can have multiple language identifiers.
646    ///
647    /// # Return
648    /// - `Ok(Vec<String>)`: A vector containing all language identifiers
649    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:language>`
650    ///
651    /// # Notes
652    /// - The EPUB specification requires that each publication specify at least one primary language.
653    /// - Language identifiers should conform to RFC 3066 or later standards.
654    pub fn get_language(&self) -> Result<Vec<String>, EpubError> {
655        self.get_metadata_value("language")
656            .ok_or_else(|| EpubError::NonCanonicalFile {
657                tag: "language".to_string(),
658            })
659    }
660
661    /// Retrieves the identifier of a publication
662    ///
663    /// This function retrieves the identifier information of a publication from
664    /// the EPUB metadata. According to the EPUB specification, each publication
665    /// must have a identifier, typically an ISBN, UUID, or other unique identifier.
666    ///
667    /// # Return
668    /// - `Ok(Vec<String>)`: A vector containing all identifier information
669    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:identifier>`
670    ///
671    /// # Notes
672    /// - The EPUB specification requires each publication to have at least one identifier.
673    /// - In the OPF file, the `unique-identifier` attribute of the `<package>` element
674    ///   should point to a `<dc:identifier>` element used to uniquely identify the publication.
675    ///   This means that `unique-identifier` is not exactly equal to `<dc:identifier>`.
676    pub fn get_identifier(&self) -> Result<Vec<String>, EpubError> {
677        self.get_metadata_value("identifier")
678            .ok_or_else(|| EpubError::NonCanonicalFile {
679                tag: "identifier".to_string(),
680            })
681    }
682
683    /// Retrieve resource data by resource ID
684    ///
685    /// This function will find the resource with the specified ID in the manifest.
686    /// If the resource is encrypted, it will be automatically decrypted.
687    ///
688    /// # Parameters
689    /// - `id`: The ID of the resource to retrieve
690    ///
691    /// # Return
692    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
693    ///   the MIME type
694    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
695    ///
696    /// # Notes
697    /// - This function will automatically decrypt the resource if it is encrypted.
698    /// - For unsupported encryption methods, the corresponding error will be returned.
699    pub fn get_manifest_item(&mut self, id: &str) -> Result<(Vec<u8>, String), EpubError> {
700        let resource_item = self
701            .manifest
702            .get(id)
703            .cloned()
704            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
705
706        let path = resource_item.path.to_str().unwrap();
707
708        let mut data = match self.archive.by_name(path) {
709            Ok(mut file) => {
710                let mut entry = Vec::<u8>::new();
711                file.read_to_end(&mut entry)?;
712
713                Ok(entry)
714            }
715            Err(ZipError::FileNotFound) => Err(EpubError::ResourceNotFound {
716                resource: path.to_string(),
717            }),
718            Err(err) => Err(EpubError::from(err)),
719        }?;
720
721        if let Some(method) = self.is_encryption_file(path) {
722            data = self.auto_dencrypt(&method, &mut data)?;
723        }
724
725        Ok((data, resource_item.mime))
726    }
727
728    /// Retrieves resource item data by resource path
729    ///
730    /// This function retrieves resources from the manifest based on the input path.
731    /// The input path must be a relative path to the root directory of the EPUB container;
732    /// using an absolute path or a relative path to another location will result in an error.
733    ///
734    /// # Parameters
735    /// - `path`: The path of the resource to retrieve
736    ///
737    /// # Return
738    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
739    ///   the MIME type
740    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
741    ///
742    /// # Notes
743    /// - This function will automatically decrypt the resource if it is encrypted.
744    /// - For unsupported encryption methods, the corresponding error will be returned.
745    /// - Relative paths other than the root directory of the Epub container are not supported.
746    pub fn get_manifest_item_by_path(
747        &mut self,
748        path: &str,
749    ) -> Result<(Vec<u8>, String), EpubError> {
750        let id = self
751            .manifest
752            .iter()
753            .find(|(_, item)| item.path.to_str().unwrap() == path)
754            .map(|(id, _)| id.to_string())
755            .ok_or_else(|| EpubError::ResourceNotFound {
756                resource: path.to_string(),
757            })?;
758
759        self.get_manifest_item(&id)
760    }
761
762    /// Retrieves supported resource items by resource ID, with fallback mechanism supported
763    ///
764    /// This function attempts to retrieve the resource item with the specified ID and
765    /// checks if its MIME type is in the list of supported formats. If the current resource
766    /// format is not supported, it searches for a supported resource format along the
767    /// fallback chain according to the fallback mechanism defined in the EPUB specification.
768    ///
769    /// # Parameters
770    /// - `id`: The ID of the resource to retrieve
771    /// - `supported_format`: A vector of supported MIME types
772    ///
773    /// # Return
774    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
775    ///   the MIME type
776    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
777    pub fn get_manifest_item_with_fallback(
778        &mut self,
779        id: &str,
780        supported_format: Vec<&str>,
781    ) -> Result<(Vec<u8>, String), EpubError> {
782        let mut manifest_item = self
783            .manifest
784            .get(id)
785            .cloned()
786            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
787
788        let mut current_manifest_id = id.to_string();
789        let mut fallback_chain = Vec::<String>::new();
790        'fallback: loop {
791            if supported_format.contains(&manifest_item.mime.as_str()) {
792                return self.get_manifest_item(&current_manifest_id);
793            }
794
795            let fallback_id = manifest_item.fallback.clone();
796
797            match fallback_id {
798                // The loop ends when no fallback resource exists
799                None => break 'fallback,
800
801                // End the loop when the loop continues to fallback if a fallback resource exists
802                Some(id) if fallback_chain.contains(&id) => break 'fallback,
803
804                Some(id) => {
805                    fallback_chain.push(id.clone());
806
807                    // Since only warnings are issued for fallback resource checks
808                    // during initialization, the issue of fallback resources possibly
809                    // not existing needs to be handled here.
810                    manifest_item = self
811                        .manifest
812                        .get(&manifest_item.fallback.unwrap())
813                        .cloned()
814                        .ok_or(EpubError::ResourceIdNotExist { id: id.clone() })?;
815                    current_manifest_id = id;
816                }
817            };
818        }
819
820        Err(EpubError::NoSupportedFileFormat)
821    }
822
823    /// Retrieves the cover of the EPUB document
824    ///
825    /// This function searches for the cover of the EPUB document by examining manifest
826    /// items in the manifest. It looks for manifest items whose ID or attribute contains
827    /// "cover" (case-insensitive) and attempts to retrieve the content of the first match.
828    ///
829    /// # Return
830    /// - `Some((Vec<u8>, String))`: Successfully retrieved and decrypted cover data and
831    ///   the MIME type
832    /// - `None`: No cover resource was found
833    ///
834    /// # Notes
835    /// - This function only returns the first successfully retrieved cover resource,
836    ///   even if multiple matches exist
837    /// - The retrieved cover may not be an image resource; users need to pay attention
838    ///   to the resource's MIME type.
839    pub fn get_cover(&mut self) -> Option<(Vec<u8>, String)> {
840        self.manifest
841            .values()
842            .filter_map(|manifest| {
843                if manifest.id.to_ascii_lowercase().contains("cover") {
844                    return Some(manifest.id.clone());
845                }
846
847                if let Some(properties) = &manifest.properties {
848                    if properties.to_ascii_lowercase().contains("cover") {
849                        return Some(manifest.id.clone());
850                    }
851                }
852
853                None
854            })
855            .collect::<Vec<String>>()
856            .iter()
857            .find_map(|id| self.get_manifest_item(id).ok())
858    }
859
860    /// Navigate to a specified chapter using the spine index
861    ///
862    /// This function retrieves the content data of the corresponding chapter based
863    /// on the index position in the EPUB spine. The spine defines the linear reading
864    /// order of the publication's content documents, and each spine item references
865    /// resources in the manifest.
866    ///
867    /// # Parameters
868    /// - `index`: The index position in the spine, starting from 0
869    ///
870    /// # Return
871    /// - `Some((Vec<u8>, String))`: Successfully retrieved chapter content data and the MIME type
872    /// - `None`: Index out of range or data retrieval error
873    ///
874    /// # Notes
875    /// - The index must be less than the total number of spine projects.
876    /// - If the resource is encrypted, it will be automatically decrypted before returning.(TODO)
877    /// - It does not check whether the Spine project follows a linear reading order.
878    pub fn navigate_by_spine_index(&mut self, index: usize) -> Option<(Vec<u8>, String)> {
879        if index >= self.spine.len() {
880            return None;
881        }
882
883        let manifest_id = self.spine[index].idref.clone();
884        self.current_spine_index = index;
885        self.get_manifest_item(&manifest_id).ok()
886    }
887
888    /// Navigate to the previous linear reading chapter
889    ///
890    /// This function searches backwards in the EPUB spine for the previous linear
891    /// reading chapter and returns the content data of that chapter. It only navigates
892    /// to chapters marked as linear reading.
893    ///
894    /// # Return
895    /// - `Some((Vec<u8>, String))`: Successfully retrieved previous chapter content data and
896    ///   the MIME type
897    /// - `None`: Already in the first chapter, the current chapter is not linear,
898    ///   or data retrieval failed
899    pub fn spine_prev(&mut self) -> Option<(Vec<u8>, String)> {
900        if self.current_spine_index == 0 || !self.spine[self.current_spine_index].linear {
901            return None;
902        }
903
904        let prev_index = (0..self.current_spine_index)
905            .rev()
906            .find(|&index| self.spine[index].linear)?;
907
908        self.current_spine_index = prev_index;
909        let manifest_id = self.spine[prev_index].idref.clone();
910        self.get_manifest_item(&manifest_id).ok()
911    }
912
913    /// Navigate to the next linear reading chapter
914    ///
915    /// This function searches forwards in the EPUB spine for the next linear reading
916    /// chapter and returns the content data of that chapter. It only navigates to
917    /// chapters marked as linear reading.
918    ///
919    /// # Return
920    /// - `Some((Vec<u8>, String))`: Successfully retrieved next chapter content data and
921    ///   the MIME type
922    /// - `None`: Already in the last chapter, the current chapter is not linear,
923    ///   or data retrieval failed
924    pub fn spine_next(&mut self) -> Option<(Vec<u8>, String)> {
925        if self.current_spine_index >= self.spine.len() - 1
926            || !self.spine[self.current_spine_index].linear
927        {
928            return None;
929        }
930
931        let next_index = (self.current_spine_index + 1..self.spine.len())
932            .find(|&index| self.spine[index].linear)?;
933
934        self.current_spine_index = next_index;
935        let manifest_id = self.spine[next_index].idref.clone();
936        self.get_manifest_item(&manifest_id).ok()
937    }
938
939    /// Retrieves the content data of the current chapter
940    ///
941    /// This function returns the content data of the chapter at the current
942    /// index position in the EPUB spine.
943    ///
944    /// # Return
945    /// - `Some((Vec<u8>, String))`: Successfully retrieved current chapter content data and
946    ///   the MIME type
947    /// - `None`: Data retrieval failed
948    pub fn spine_current(&mut self) -> Option<(Vec<u8>, String)> {
949        let manifest_id = self.spine[self.current_spine_index].idref.clone();
950        self.get_manifest_item(&manifest_id).ok()
951    }
952
953    /// Determine the EPUB version from the OPF file
954    ///
955    /// This function is used to detect the version of an epub file from an OPF file.
956    /// When the version attribute in the package is abnormal, version information will
957    /// be identified through some version characteristics of the epub file. An error is
958    /// returned when neither direct nor indirect methods can identify the version.
959    ///
960    /// # Parameters
961    /// - `opf_element`: A reference to the OPF file element
962    fn determine_epub_version(opf_element: &XmlElement) -> Result<EpubVersion, EpubError> {
963        // Check the explicit version attribute
964        if let Some(version) = opf_element.get_attr("version") {
965            match version.as_str() {
966                "2.0" => return Ok(EpubVersion::Version2_0),
967                "3.0" => return Ok(EpubVersion::Version3_0),
968                _ => {}
969            }
970        }
971
972        let spine_element = opf_element
973            .find_elements_by_name("spine")
974            .next()
975            .ok_or_else(|| EpubError::NonCanonicalFile {
976                tag: "spine".to_string(),
977            })?;
978
979        // Look for EPUB 2.x specific features
980        if spine_element.get_attr("toc").is_some() {
981            return Ok(EpubVersion::Version2_0);
982        }
983
984        let manifest_element = opf_element
985            .find_elements_by_name("manifest")
986            .next()
987            .ok_or_else(|| EpubError::NonCanonicalFile {
988                tag: "manifest".to_string(),
989            })?;
990
991        // Look for EPUB 3.x specific features
992        manifest_element
993            .children()
994            .find_map(|element| {
995                if let Some(id) = element.get_attr("id") {
996                    if id.eq("nav") {
997                        return Some(EpubVersion::Version3_0);
998                    }
999                }
1000
1001                None
1002            })
1003            .ok_or(EpubError::UnrecognizedEpubVersion)
1004    }
1005
1006    /// Parse metadata elements under the Dublin Core namespace
1007    ///
1008    /// This function handles the `<metadata>` Dublin Core element in the OPF file (namespace
1009    /// is "http://purl.org/dc/elements/1.1/"). These elements usually contain the basic
1010    /// information of the publication, such as title, author, publication date, etc.
1011    ///
1012    /// # Notes
1013    /// - In EPUB 3.0, granular information is handled by separate '<meta>' elements and 'refines' attributes
1014    /// - All text content is normalized by whitespace
1015    #[inline]
1016    fn parse_dc_metadata(
1017        &mut self,
1018        element: &XmlElement,
1019        metadata: &mut Vec<MetadataItem>,
1020        // refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1021    ) -> Result<(), EpubError> {
1022        let id = element.get_attr("id");
1023        let lang = element.get_attr("lang");
1024        let property = element.name.clone();
1025        let value = element.text().normalize_whitespace();
1026
1027        let refined = match self.version {
1028            // In EPUB 2.0, supplementary metadata (refinements) are represented
1029            // through other attribute data pairs of the tag.
1030            EpubVersion::Version2_0 => element
1031                .attributes
1032                .iter()
1033                .map(|(name, value)| {
1034                    let property = name.to_string();
1035                    let value = value.to_string().normalize_whitespace();
1036
1037                    MetadataRefinement {
1038                        refines: id.clone().unwrap(),
1039                        property,
1040                        value,
1041                        lang: None,
1042                        scheme: None,
1043                    }
1044                })
1045                .collect(),
1046            EpubVersion::Version3_0 => vec![],
1047        };
1048
1049        metadata.push(MetadataItem {
1050            id,
1051            property,
1052            value,
1053            lang,
1054            refined,
1055        });
1056
1057        Ok(())
1058    }
1059
1060    /// Parse metadata elements under the OPF namespace
1061    ///
1062    /// This function handles the `<metadata>` OPF element in the OPF file (namespace
1063    /// is "http://www.idpf.org/2007/opf"). These elements include '<meta>' and '<link>',
1064    /// which are used to provide extended metadata and links to external resources for EPUB publications.
1065    ///
1066    /// # Notes
1067    /// - The function is only responsible for distribution processing, and the
1068    ///   specific parsing logic is implemented in the dedicated function
1069    /// - All parsing results are added directly to the incoming collection and no new collection is returned
1070    #[inline]
1071    fn parse_opf_metadata(
1072        &mut self,
1073        element: &XmlElement,
1074        metadata: &mut Vec<MetadataItem>,
1075        metadata_link: &mut Vec<MetadataLinkItem>,
1076        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1077    ) -> Result<(), EpubError> {
1078        match element.name.as_str() {
1079            "meta" => self.parse_meta_element(element, metadata, refinements),
1080            "link" => self.parse_link_element(element, metadata_link),
1081            _ => Ok(()),
1082        }
1083    }
1084
1085    #[inline]
1086    fn parse_meta_element(
1087        &mut self,
1088        element: &XmlElement,
1089        metadata: &mut Vec<MetadataItem>,
1090        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1091    ) -> Result<(), EpubError> {
1092        match self.version {
1093            EpubVersion::Version2_0 => {
1094                let property =
1095                    element
1096                        .get_attr("name")
1097                        .ok_or_else(|| EpubError::NonCanonicalFile {
1098                            tag: element.tag_name(),
1099                        })?;
1100                let value = element
1101                    .get_attr("content")
1102                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
1103                        tag: element.tag_name(),
1104                        attribute: "content".to_string(),
1105                    })?
1106                    .normalize_whitespace();
1107
1108                metadata.push(MetadataItem {
1109                    id: None,
1110                    property,
1111                    value,
1112                    lang: None,
1113                    refined: vec![],
1114                });
1115            }
1116
1117            EpubVersion::Version3_0 => {
1118                let property = element.get_attr("property").ok_or_else(|| {
1119                    EpubError::MissingRequiredAttribute {
1120                        tag: element.tag_name(),
1121                        attribute: "property".to_string(),
1122                    }
1123                })?;
1124                let value = element.text().normalize_whitespace();
1125                let lang = element.get_attr("lang");
1126
1127                if let Some(refines) = element.get_attr("refines") {
1128                    let id = refines.strip_prefix("#").unwrap_or(&refines).to_string();
1129                    let scheme = element.get_attr("scheme");
1130                    let refinement = MetadataRefinement {
1131                        refines: id.clone(),
1132                        property,
1133                        value,
1134                        lang,
1135                        scheme,
1136                    };
1137
1138                    if let Some(refinements) = refinements.get_mut(&id) {
1139                        refinements.push(refinement);
1140                    } else {
1141                        refinements.insert(id, vec![refinement]);
1142                    }
1143                } else {
1144                    let id = element.get_attr("id");
1145                    let item = MetadataItem {
1146                        id,
1147                        property,
1148                        value,
1149                        lang,
1150                        refined: vec![],
1151                    };
1152
1153                    metadata.push(item);
1154                };
1155            }
1156        }
1157        Ok(())
1158    }
1159
1160    #[inline]
1161    fn parse_link_element(
1162        &mut self,
1163        element: &XmlElement,
1164        metadata_link: &mut Vec<MetadataLinkItem>,
1165    ) -> Result<(), EpubError> {
1166        let href = element
1167            .get_attr("href")
1168            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1169                tag: element.tag_name(),
1170                attribute: "href".to_string(),
1171            })?;
1172        let rel = element
1173            .get_attr("rel")
1174            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1175                tag: element.tag_name(),
1176                attribute: "rel".to_string(),
1177            })?;
1178        let hreflang = element.get_attr("hreflang");
1179        let id = element.get_attr("id");
1180        let mime = element.get_attr("media-type");
1181        let properties = element.get_attr("properties");
1182
1183        metadata_link.push(MetadataLinkItem {
1184            href,
1185            rel,
1186            hreflang,
1187            id,
1188            mime,
1189            properties,
1190            refines: None,
1191        });
1192        Ok(())
1193    }
1194
1195    /// Recursively parse NCX navigation points from navMap or nested navPoint elements
1196    ///
1197    /// This function parses the hierarchical navigation structure defined in NCX files
1198    /// for EPUB 2.x documents. It handles nested navPoint elements to build a complete
1199    /// tree representation of the publication's table of contents.
1200    fn parse_nav_points(&self, parent_element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1201        let mut nav_points = Vec::new();
1202        for nav_point in parent_element.find_children_by_name("navPoint") {
1203            let label = match nav_point.find_children_by_name("navLabel").next() {
1204                Some(element) => element.text(),
1205                None => String::new(),
1206            };
1207
1208            let content = nav_point
1209                .find_children_by_name("content")
1210                .next()
1211                .map(|element| PathBuf::from(element.text()));
1212
1213            let play_order = nav_point
1214                .get_attr("playOrder")
1215                .and_then(|order| order.parse::<usize>().ok());
1216
1217            let children = self.parse_nav_points(nav_point)?;
1218
1219            nav_points.push(NavPoint {
1220                label,
1221                content,
1222                play_order,
1223                children,
1224            });
1225        }
1226
1227        nav_points.sort();
1228        Ok(nav_points)
1229    }
1230
1231    /// Recursively parses directory list structures
1232    ///
1233    /// This function recursively parses HTML navigation list structures,
1234    /// converting `<ol>` and `<li>` elements into NavPoint structures.
1235    /// Multi-level nested directory structures are supported.
1236    fn parse_catalog_list(&self, element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1237        let mut catalog = Vec::new();
1238        for item in element.children() {
1239            if item.tag_name() != "li" {
1240                return Err(EpubError::NonCanonicalFile {
1241                    tag: "li".to_string(),
1242                });
1243            }
1244
1245            let title_element = item
1246                .find_children_by_names(&["span", "a"])
1247                .next()
1248                .ok_or_else(|| EpubError::NonCanonicalFile {
1249                    tag: "span/a".to_string(),
1250                })?;
1251            let content_href = title_element.get_attr("href").map(PathBuf::from);
1252            let sub_list = if let Some(list) = item.find_children_by_name("ol").next() {
1253                self.parse_catalog_list(list)?
1254            } else {
1255                vec![]
1256            };
1257
1258            catalog.push(NavPoint {
1259                label: title_element.text(),
1260                content: content_href,
1261                children: sub_list,
1262                play_order: None,
1263            });
1264        }
1265
1266        Ok(catalog)
1267    }
1268
1269    /// Converts relative paths in the manifest to normalized paths
1270    /// relative to the EPUB root directory
1271    ///
1272    /// This function processes the href attribute of resources in the EPUB
1273    /// manifest and converts it to a normalized path representation.
1274    /// It handles three types of paths:
1275    /// - Relative paths starting with `../` (checks if they exceed the EPUB package scope)
1276    /// - Absolute paths starting with `/` (relative to the EPUB root directory)
1277    /// - Other relative paths (relative to the directory containing the OPF file)
1278    ///
1279    /// # Parameters
1280    /// - `path`: The href attribute value of the resource in the manifest
1281    ///
1282    /// # Return
1283    /// - `Ok(PathBuf)`: The parsed normalized path
1284    /// - `Err(EpubError)`: Relative link leakage
1285    #[inline]
1286    fn normalize_manifest_path(&self, path: &str) -> Result<PathBuf, EpubError> {
1287        let mut path = if path.starts_with("../") {
1288            let mut current_dir = self.epub_path.join(&self.package_path);
1289            current_dir.pop();
1290
1291            check_realtive_link_leakage(self.epub_path.clone(), current_dir, path)
1292                .map(PathBuf::from)
1293                .ok_or_else(|| EpubError::RealtiveLinkLeakage {
1294                    path: path.to_string(),
1295                })?
1296        } else if let Some(path) = path.strip_prefix("/") {
1297            PathBuf::from(path.to_string())
1298        } else {
1299            self.base_path.join(path)
1300        };
1301
1302        #[cfg(windows)]
1303        {
1304            path = PathBuf::from(path.to_string_lossy().replace('\\', "/"));
1305        }
1306
1307        Ok(path)
1308    }
1309
1310    /// Verify the fallback chain of all manifest items
1311    ///
1312    /// This function iterates through all manifest items with the fallback
1313    /// attribute and verifies the validity of their fallback chains, including checking:
1314    /// - Whether circular references exist
1315    /// - Whether the fallback resource exists in the manifest
1316    ///
1317    /// # Notes
1318    /// If an invalid fallback chain is found, a warning log will be logged
1319    /// but the processing flow will not be interrupted.
1320    fn validate_fallback_chains(&self) {
1321        for (id, item) in &self.manifest {
1322            if item.fallback.is_none() {
1323                continue;
1324            }
1325
1326            let mut fallback_chain = Vec::new();
1327            if let Err(msg) = self.validate_fallback_chain(id, &mut fallback_chain) {
1328                warn!("Invalid fallback chain for item {}: {}", id, msg);
1329            }
1330        }
1331    }
1332
1333    /// Recursively verify the validity of a single fallback chain
1334    ///
1335    /// This function recursively traces the fallback chain to check for the following issues:
1336    /// - Circular reference
1337    /// - The referenced fallback resource does not exist
1338    ///
1339    /// # Parameters
1340    /// - `manifest_id`: The id of the manifest item currently being verified
1341    /// - `fallback_chain`: The visited fallback chain paths used to detect circular references
1342    ///
1343    /// # Return
1344    /// - `Ok(())`: The fallback chain is valid
1345    /// - `Err(String)`: A string containing error information
1346    fn validate_fallback_chain(
1347        &self,
1348        manifest_id: &str,
1349        fallback_chain: &mut Vec<String>,
1350    ) -> Result<(), String> {
1351        if fallback_chain.contains(&manifest_id.to_string()) {
1352            fallback_chain.push(manifest_id.to_string());
1353
1354            return Err(format!(
1355                "Circular reference detected in fallback chain for {}",
1356                fallback_chain.join("->")
1357            ));
1358        }
1359
1360        // Get the current item; its existence can be ensured based on the calling context.
1361        let item = self.manifest.get(manifest_id).unwrap();
1362
1363        if let Some(fallback_id) = &item.fallback {
1364            if !self.manifest.contains_key(fallback_id) {
1365                return Err(format!(
1366                    "Fallback resource {} does not exist in manifest",
1367                    fallback_id
1368                ));
1369            }
1370
1371            fallback_chain.push(manifest_id.to_string());
1372            self.validate_fallback_chain(fallback_id, fallback_chain)
1373        } else {
1374            // The end of the fallback chain
1375            Ok(())
1376        }
1377    }
1378
1379    /// Checks if a resource at the specified path is an encrypted file
1380    ///
1381    /// This function queries whether a specific resource path is marked as an encrypted
1382    /// file in the EPUB encryption information. It checks the encrypted data stored in
1383    /// `self.encryption`, looking for an entry that matches the given path.
1384    ///
1385    /// # Parameters
1386    /// - `path`: The path of the resource to check
1387    ///
1388    /// # Return
1389    /// - `Some(String)`: The encryption method used for the resource
1390    /// - `None`: The resource is not encrypted
1391    fn is_encryption_file(&self, path: &str) -> Option<String> {
1392        self.encryption.as_ref().and_then(|encryptions| {
1393            encryptions
1394                .iter()
1395                .find(|encryption| encryption.data == path)
1396                .map(|encryption| encryption.method.clone())
1397        })
1398    }
1399
1400    /// Automatically decrypts encrypted resource data
1401    ///
1402    /// Automatically decrypts data based on the provided encryption method.
1403    /// This function supports various encryption methods defined by the EPUB
1404    /// specification, including font obfuscation and the XML encryption standard.
1405    ///
1406    /// # Parameters
1407    /// - `method`: The encryption method used for the resource
1408    /// - `data`: The encrypted resource data
1409    ///
1410    /// # Return
1411    /// - `Ok(Vec<u8>)`: The decrypted resource data
1412    /// - `Err(EpubError)`: Unsupported encryption method
1413    ///
1414    /// # Supported Encryption Methods
1415    /// - IDPF font obfuscation: `http://www.idpf.org/2008/embedding`
1416    /// - Adobe font obfuscation: `http://ns.adobe.com/pdf/enc#RC`
1417    #[inline]
1418    fn auto_dencrypt(&self, method: &str, data: &mut [u8]) -> Result<Vec<u8>, EpubError> {
1419        match method {
1420            "http://www.idpf.org/2008/embedding" => {
1421                Ok(idpf_font_dencryption(data, &self.unique_identifier))
1422            }
1423            "http://ns.adobe.com/pdf/enc#RC" => {
1424                Ok(adobe_font_dencryption(data, &self.unique_identifier))
1425            }
1426            _ => Err(EpubError::UnsupportedEncryptedMethod {
1427                method: method.to_string(),
1428            }),
1429        }
1430    }
1431}
1432
1433impl EpubDoc<BufReader<File>> {
1434    /// Creates a new EPUB document instance
1435    ///
1436    /// This function is a convenience constructor for `EpubDoc`,
1437    /// used to create an EPUB parser instance directly from a file path.
1438    ///
1439    /// # Parameters
1440    /// - `path`: The path to the EPUB file
1441    ///
1442    /// # Return
1443    /// - `Ok(EpubDoc)`: The created EPUB document instance
1444    /// - `Err(EpubError)`: An error occurred during initialization
1445    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, EpubError> {
1446        let file = File::open(&path).map_err(EpubError::from)?;
1447        let path = canonicalize(path)?;
1448
1449        Self::from_reader(BufReader::new(file), path)
1450    }
1451}
1452
1453#[cfg(test)]
1454mod tests {
1455    use std::path::Path;
1456
1457    use crate::epub::EpubDoc;
1458
1459    /// Section 3.3 package documents
1460    mod package_documents_tests {
1461        use std::path::Path;
1462
1463        use crate::epub::{EpubDoc, EpubVersion};
1464
1465        /// ID: pkg-collections-unknown
1466        ///
1467        /// The package document contains a collection with an unknown role. The reading system must open the EPUB successfully.
1468        #[test]
1469        fn test_pkg_collections_unknown() {
1470            let epub_file = Path::new("./test_case/pkg-collections-unknown.epub");
1471            let doc = EpubDoc::new(epub_file);
1472            assert!(doc.is_ok());
1473        }
1474
1475        /// ID: pkg-creator-order
1476        ///
1477        /// Several creators are listed in the package document. The reading system must not display them out of order (but it may display only the first).
1478        #[test]
1479        fn test_pkg_creator_order() {
1480            let epub_file = Path::new("./test_case/pkg-creator-order.epub");
1481            let doc = EpubDoc::new(epub_file);
1482            assert!(doc.is_ok());
1483
1484            let doc = doc.unwrap();
1485            let creators = doc.get_metadata_value("creator");
1486            assert!(creators.is_some());
1487
1488            let creators = creators.unwrap();
1489            assert_eq!(creators.len(), 5);
1490            assert_eq!(
1491                creators,
1492                vec![
1493                    "Dave Cramer",
1494                    "Wendy Reid",
1495                    "Dan Lazin",
1496                    "Ivan Herman",
1497                    "Brady Duga",
1498                ]
1499            );
1500        }
1501
1502        /// ID: pkg-manifest-unknown
1503        ///
1504        /// The package document contains a manifest item with unknown properties. The reading system must open the EPUB successfully.
1505        #[test]
1506        fn test_pkg_manifest_order() {
1507            let epub_file = Path::new("./test_case/pkg-manifest-unknown.epub");
1508            let doc = EpubDoc::new(epub_file);
1509            assert!(doc.is_ok());
1510
1511            let mut doc = doc.unwrap();
1512            assert_eq!(doc.manifest.len(), 2);
1513            assert!(doc.get_manifest_item("nav").is_ok());
1514            assert!(doc.get_manifest_item("content_001").is_ok());
1515            assert!(doc.get_manifest_item("content_002").is_err());
1516        }
1517
1518        /// ID: pkg-meta-unknown
1519        ///
1520        /// The package document contains a meta tag with an unknown property. The reading system must open the EPUB successfully.
1521        #[test]
1522        fn test_pkg_meta_unknown() {
1523            let epub_file = Path::new("./test_case/pkg-meta-unknown.epub");
1524            let doc = EpubDoc::new(epub_file);
1525            assert!(doc.is_ok());
1526
1527            let doc = doc.unwrap();
1528            let value = doc.get_metadata_value("dcterms:isReferencedBy");
1529            assert!(value.is_some());
1530            let value = value.unwrap();
1531            assert_eq!(value.len(), 1);
1532            assert_eq!(
1533                value,
1534                vec!["https://www.w3.org/TR/epub-rs/#confreq-rs-pkg-meta-unknown"]
1535            );
1536
1537            let value = doc.get_metadata_value("dcterms:modified");
1538            assert!(value.is_some());
1539            let value = value.unwrap();
1540            assert_eq!(value.len(), 1);
1541            assert_eq!(value, vec!["2021-01-11T00:00:00Z"]);
1542
1543            let value = doc.get_metadata_value("dcterms:title");
1544            assert!(value.is_none());
1545        }
1546
1547        /// ID: pkg-meta-whitespace
1548        ///
1549        /// The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases.
1550        #[test]
1551        fn test_pkg_meta_white_space() {
1552            let epub_file = Path::new("./test_case/pkg-meta-whitespace.epub");
1553            let doc = EpubDoc::new(epub_file);
1554            assert!(doc.is_ok());
1555
1556            let doc = doc.unwrap();
1557            let value = doc.get_metadata_value("creator");
1558            assert!(value.is_some());
1559            let value = value.unwrap();
1560            assert_eq!(value.len(), 1);
1561            assert_eq!(value, vec!["Dave Cramer"]);
1562
1563            let value = doc.get_metadata_value("description");
1564            assert!(value.is_some());
1565            let value = value.unwrap();
1566            assert_eq!(value.len(), 1);
1567            assert_eq!(
1568                value,
1569                vec![
1570                    "The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases."
1571                ]
1572            );
1573        }
1574
1575        /// ID: pkg-spine-duplicate-item-hyperlink
1576        ///
1577        /// The spine contains several references to the same content document. The reading system must move to the position of the first duplicate in the reading order when following a hyperlink.
1578        #[test]
1579        fn test_pkg_spine_duplicate_item_hyperlink() {
1580            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-hyperlink.epub");
1581            let doc = EpubDoc::new(epub_file);
1582            assert!(doc.is_ok());
1583
1584            let mut doc = doc.unwrap();
1585            assert_eq!(doc.spine.len(), 4);
1586            assert_eq!(
1587                doc.navigate_by_spine_index(0).unwrap(),
1588                doc.get_manifest_item("content_001").unwrap()
1589            );
1590            assert_eq!(
1591                doc.navigate_by_spine_index(1).unwrap(),
1592                doc.get_manifest_item("content_002").unwrap()
1593            );
1594            assert_eq!(
1595                doc.navigate_by_spine_index(2).unwrap(),
1596                doc.get_manifest_item("content_002").unwrap()
1597            );
1598            assert_eq!(
1599                doc.navigate_by_spine_index(3).unwrap(),
1600                doc.get_manifest_item("content_002").unwrap()
1601            );
1602        }
1603
1604        /// ID: pkg-spine-duplicate-item-rendering
1605        ///
1606        /// The spine contains several references to the same content document. The reading system must not skip the duplicates when rendering the reading order.
1607        #[test]
1608        fn test_pkg_spine_duplicate_item_rendering() {
1609            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-rendering.epub");
1610            let doc = EpubDoc::new(epub_file);
1611            assert!(doc.is_ok());
1612
1613            let mut doc = doc.unwrap();
1614            assert_eq!(doc.spine.len(), 4);
1615
1616            let result = doc.spine_prev();
1617            assert!(result.is_none());
1618
1619            let result = doc.spine_next();
1620            assert!(result.is_some());
1621
1622            doc.spine_next();
1623            doc.spine_next();
1624            let result = doc.spine_next();
1625            assert!(result.is_none());
1626        }
1627
1628        /// ID: pkg-spine-nonlinear-activation
1629        ///
1630        /// An itemref in the spine is marked as non-linear. Although it (possibly) cannot be accessed through the table of contents, it can be reached from a link in the XHTML content.
1631        #[test]
1632        fn test_pkg_spine_nonlinear_activation() {
1633            let epub_file = Path::new("./test_case/pkg-spine-nonlinear-activation.epub");
1634            let doc = EpubDoc::new(epub_file);
1635            assert!(doc.is_ok());
1636
1637            let mut doc = doc.unwrap();
1638            assert!(doc.spine_prev().is_none());
1639            assert!(doc.spine_next().is_none());
1640
1641            assert!(doc.navigate_by_spine_index(1).is_some());
1642            assert!(doc.spine_prev().is_none());
1643            assert!(doc.spine_next().is_none());
1644        }
1645
1646        /// ID: pkg-spine-order
1647        ///
1648        /// Basic test of whether a reading system can display spine items in the correct order. The test fails if the reading system presents content in the order in which the file names sort, or if it presents files in manifest order rather than spine order.
1649        #[test]
1650        fn test_pkg_spine_order() {
1651            let epub_file = Path::new("./test_case/pkg-spine-order.epub");
1652            let doc = EpubDoc::new(epub_file);
1653            assert!(doc.is_ok());
1654
1655            let doc = doc.unwrap();
1656            assert_eq!(doc.spine.len(), 4);
1657            assert_eq!(
1658                doc.spine
1659                    .iter()
1660                    .map(|item| item.idref.clone())
1661                    .collect::<Vec<String>>(),
1662                vec![
1663                    "d-content_001",
1664                    "c-content_002",
1665                    "b-content_003",
1666                    "a-content_004",
1667                ]
1668            );
1669        }
1670
1671        /// ID: pkg-spine-order-svg
1672        ///
1673        /// Basic test of whether a reading system can display SVG spine items in the correct order.
1674        #[test]
1675        fn test_spine_order_svg() {
1676            let epub_file = Path::new("./test_case/pkg-spine-order-svg.epub");
1677            let doc = EpubDoc::new(epub_file);
1678            assert!(doc.is_ok());
1679
1680            let mut doc = doc.unwrap();
1681            assert_eq!(doc.spine.len(), 4);
1682
1683            loop {
1684                if let Some(spine) = doc.spine_next() {
1685                    let idref = doc.spine[doc.current_spine_index].idref.clone();
1686                    let resource = doc.get_manifest_item(&idref);
1687                    assert!(resource.is_ok());
1688
1689                    let resource = resource.unwrap();
1690                    assert_eq!(spine, resource);
1691                } else {
1692                    break;
1693                }
1694            }
1695
1696            assert_eq!(doc.current_spine_index, 3);
1697        }
1698
1699        /// ID: pkg-spine-unknown
1700        ///
1701        /// The package document contains a spine item with unknown properties. The reading system must open the EPUB successfully.
1702        #[test]
1703        fn test_pkg_spine_unknown() {
1704            let epub_file = Path::new("./test_case/pkg-spine-unknown.epub");
1705            let doc = EpubDoc::new(epub_file);
1706            assert!(doc.is_ok());
1707
1708            let doc = doc.unwrap();
1709            assert_eq!(doc.spine.len(), 1);
1710            assert_eq!(doc.spine[0].idref, "content_001");
1711            assert_eq!(doc.spine[0].id, None);
1712            assert_eq!(doc.spine[0].linear, true);
1713            assert_eq!(doc.spine[0].properties, Some("untrustworthy".to_string()));
1714        }
1715
1716        /// ID: pkg-title-order
1717        ///
1718        /// Several titles are listed in the package document. The reading system must use the first title (and whether to use other titles is not defined).
1719        #[test]
1720        fn test_pkg_title_order() {
1721            let epub_file = Path::new("./test_case/pkg-title-order.epub");
1722            let doc = EpubDoc::new(epub_file);
1723            assert!(doc.is_ok());
1724
1725            let doc = doc.unwrap();
1726            let title_list = doc.get_title();
1727            assert!(title_list.is_ok());
1728
1729            let title_list = title_list.unwrap();
1730            assert_eq!(title_list.len(), 6);
1731            assert_eq!(
1732                title_list,
1733                vec![
1734                    "pkg-title-order",
1735                    "This title must not display first",
1736                    "Also, this title must not display first",
1737                    "This title also must not display first",
1738                    "This title must also not display first",
1739                    "This title must not display first, also",
1740                ]
1741            );
1742        }
1743
1744        /// ID: pkg-unique-id
1745        ///
1746        /// The package document's dc:identifier is identical across two publications. The reading system should display both publications independently.
1747        #[test]
1748        fn test_pkg_unique_id() {
1749            let epub_file = Path::new("./test_case/pkg-unique-id.epub");
1750            let doc_1 = EpubDoc::new(epub_file);
1751            assert!(doc_1.is_ok());
1752
1753            let epub_file = Path::new("./test_case/pkg-unique-id_duplicate.epub");
1754            let doc_2 = EpubDoc::new(epub_file);
1755            assert!(doc_2.is_ok());
1756
1757            let doc_1 = doc_1.unwrap();
1758            let doc_2 = doc_2.unwrap();
1759
1760            assert_eq!(
1761                doc_1.get_identifier().unwrap(),
1762                doc_2.get_identifier().unwrap()
1763            );
1764            assert_eq!(doc_1.unique_identifier, "pkg-unique-id");
1765            assert_eq!(doc_2.unique_identifier, "pkg-unique-id");
1766        }
1767
1768        /// ID: pkg-version-backward
1769        ///
1770        /// “Reading Systems MUST attempt to process an EPUB Publication whose Package Document version attribute is less than "3.0"”. This is an EPUB with package version attribute set to "0", to see if a reading system will open it.
1771        #[test]
1772        fn test_pkg_version_backward() {
1773            let epub_file = Path::new("./test_case/pkg-version-backward.epub");
1774            let doc = EpubDoc::new(epub_file);
1775            assert!(doc.is_ok());
1776
1777            let doc = doc.unwrap();
1778            assert_eq!(doc.version, EpubVersion::Version3_0);
1779        }
1780
1781        /// ID: pkg-linked-records
1782        ///
1783        /// Reading System must process and display the title and creator metadata from the package document. An ONIX 3.0 format linked metadata record exists, but contains neither title nor creator metadata.
1784        #[test]
1785        fn test_pkg_linked_records() {
1786            let epub_file = Path::new("./test_case/pkg-linked-records.epub");
1787            let doc = EpubDoc::new(epub_file);
1788            assert!(doc.is_ok());
1789
1790            let doc = doc.unwrap();
1791            assert_eq!(doc.metadata_link.len(), 3);
1792
1793            let item = doc.metadata_link.iter().find(|&item| {
1794                if let Some(properties) = &item.properties {
1795                    properties.eq("onix")
1796                } else {
1797                    false
1798                }
1799            });
1800            assert!(item.is_some());
1801        }
1802
1803        /// ID: pkg-manifest-unlisted-resource
1804        ///
1805        /// The XHTML content references an image that does not appear in the manifest. The image should not be shown.
1806        #[test]
1807        fn test_pkg_manifest_unlisted_resource() {
1808            let epub_file = Path::new("./test_case/pkg-manifest-unlisted-resource.epub");
1809            let doc = EpubDoc::new(epub_file);
1810            assert!(doc.is_ok());
1811
1812            let mut doc = doc.unwrap();
1813            assert!(
1814                doc.get_manifest_item_by_path("EPUB/content_001.xhtml")
1815                    .is_ok()
1816            );
1817
1818            assert!(doc.get_manifest_item_by_path("EPUB/red.png").is_err());
1819            let err = doc.get_manifest_item_by_path("EPUB/red.png").unwrap_err();
1820            assert_eq!(
1821                err.to_string(),
1822                "Resource not found: Unable to find resource from \"EPUB/red.png\"."
1823            );
1824        }
1825    }
1826
1827    /// Section 3.4 manifest fallbacks
1828    ///
1829    /// The tests under this module seem to favor the reading system rather than the EPUB format itself
1830    mod manifest_fallbacks_tests {
1831        use std::path::Path;
1832
1833        use crate::epub::EpubDoc;
1834
1835        /// ID: pub-foreign_bad-fallback
1836        ///
1837        /// This is a test of manifest fallbacks where both the spine item and the fallback are likely to be unsupported. The spine item is a DMG, with a fallback to a PSD file. Reading systems may raise an error on the ingenstion workflow.
1838        #[test]
1839        fn test_pub_foreign_bad_fallback() {
1840            let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
1841            let doc = EpubDoc::new(epub_file);
1842            assert!(doc.is_ok());
1843
1844            let mut doc = doc.unwrap();
1845            assert!(doc.get_manifest_item("content_001").is_ok());
1846            assert!(doc.get_manifest_item("bar").is_ok());
1847
1848            assert_eq!(
1849                doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
1850                    .unwrap_err()
1851                    .to_string(),
1852                "No supported file format: The fallback resource does not contain the file format you support."
1853            );
1854        }
1855
1856        /// ID: pub-foreign_image
1857        ///
1858        /// An HTML content file contains a PSD image, with a manifest fallback to a PNG image. This tests fallbacks for resources that are not in the spine.
1859        #[test]
1860        fn test_pub_foreign_image() {
1861            let epub_file = Path::new("./test_case/pub-foreign_image.epub");
1862            let doc = EpubDoc::new(epub_file);
1863            assert!(doc.is_ok());
1864
1865            let mut doc = doc.unwrap();
1866            let result = doc.get_manifest_item_with_fallback(
1867                "image-tiff",
1868                vec!["image/png", "application/xhtml+xml"],
1869            );
1870            assert!(result.is_ok());
1871
1872            let (_, mime) = result.unwrap();
1873            assert_eq!(mime, "image/png");
1874        }
1875
1876        /// ID: pub-foreign_json-spine
1877        ///
1878        /// This EPUB uses a JSON content file in the spine, with a manifest fallback to an HTML document. If the reading system does not support JSON, it should display the HTML.
1879        #[test]
1880        fn test_pub_foreign_json_spine() {
1881            let epub_file = Path::new("./test_case/pub-foreign_json-spine.epub");
1882            let doc = EpubDoc::new(epub_file);
1883            assert!(doc.is_ok());
1884
1885            let mut doc = doc.unwrap();
1886            let result = doc.get_manifest_item_with_fallback(
1887                "content_primary",
1888                vec!["application/xhtml+xml", "application/json"],
1889            );
1890            assert!(result.is_ok());
1891            let (_, mime) = result.unwrap();
1892            assert_eq!(mime, "application/json");
1893
1894            let result = doc
1895                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1896            assert!(result.is_ok());
1897            let (_, mime) = result.unwrap();
1898            assert_eq!(mime, "application/xhtml+xml");
1899        }
1900
1901        /// ID: pub-foreign_xml-spine
1902        ///
1903        /// This EPUB uses an ordinary XML content file with mimetype application/xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1904        #[test]
1905        fn test_pub_foreign_xml_spine() {
1906            let epub_file = Path::new("./test_case/pub-foreign_xml-spine.epub");
1907            let doc = EpubDoc::new(epub_file);
1908            assert!(doc.is_ok());
1909
1910            let mut doc = doc.unwrap();
1911            let result = doc.get_manifest_item_with_fallback(
1912                "content_primary",
1913                vec!["application/xhtml+xml", "application/xml"],
1914            );
1915            assert!(result.is_ok());
1916            let (_, mime) = result.unwrap();
1917            assert_eq!(mime, "application/xml");
1918
1919            let result = doc
1920                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1921            assert!(result.is_ok());
1922            let (_, mime) = result.unwrap();
1923            assert_eq!(mime, "application/xhtml+xml");
1924        }
1925
1926        /// ID: pub-foreign_xml-suffix-spine
1927        ///
1928        /// This EPUB uses an custom XML content file with mimetype application/dtc+xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1929        #[test]
1930        fn test_pub_foreign_xml_suffix_spine() {
1931            let epub_file = Path::new("./test_case/pub-foreign_xml-suffix-spine.epub");
1932            let doc = EpubDoc::new(epub_file);
1933            assert!(doc.is_ok());
1934
1935            let mut doc = doc.unwrap();
1936            let result = doc.get_manifest_item_with_fallback(
1937                "content_primary",
1938                vec!["application/xhtml+xml", "application/dtc+xml"],
1939            );
1940            assert!(result.is_ok());
1941            let (_, mime) = result.unwrap();
1942            assert_eq!(mime, "application/dtc+xml");
1943
1944            let result = doc
1945                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1946            assert!(result.is_ok());
1947            let (_, mime) = result.unwrap();
1948            assert_eq!(mime, "application/xhtml+xml");
1949        }
1950    }
1951
1952    /// Section 3.9 open container format
1953    mod open_container_format_tests {
1954        use std::{cmp::min, io::Read, path::Path};
1955
1956        use sha1::{Digest, Sha1};
1957
1958        use crate::epub::EpubDoc;
1959
1960        /// ID: ocf-metainf-inc
1961        ///
1962        /// An extra configuration file, not in the reserved files' list, is added to the META-INF folder; this file must be ignored.
1963        #[test]
1964        fn test_ocf_metainf_inc() {
1965            let epub_file = Path::new("./test_case/ocf-metainf-inc.epub");
1966            let doc = EpubDoc::new(epub_file);
1967            assert!(doc.is_ok());
1968        }
1969
1970        /// ID: ocf-metainf-manifest
1971        ///
1972        /// An ancillary manifest file, containing an extra spine item, is present in the META-INF directory; this extra item must be ignored by the reading system.
1973        #[test]
1974        fn test_ocf_metainf_manifest() {
1975            let epub_file = Path::new("./test_case/ocf-metainf-manifest.epub");
1976            let doc = EpubDoc::new(epub_file);
1977            assert!(doc.is_ok());
1978        }
1979
1980        /// ID: ocf-package_arbitrary
1981        ///
1982        /// The EPUB contains three valid package files and three corresponding sets of content documents, but only one of the packages, in an unusual subdirectory, is referenced by the container.xml file. The reading system must use this package.
1983        #[test]
1984        fn test_ocf_package_arbitrary() {
1985            let epub_file = Path::new("./test_case/ocf-package_arbitrary.epub");
1986            let doc = EpubDoc::new(epub_file);
1987            assert!(doc.is_ok());
1988
1989            let doc = doc.unwrap();
1990            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
1991        }
1992
1993        /// ID: ocf-package_multiple
1994        ///
1995        /// The EPUB contains three valid package files and three corresponding sets of content documents, all referenced by the container.xml file. The reading system must use the first package.
1996        #[test]
1997        fn test_ocf_package_multiple() {
1998            let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
1999            let doc = EpubDoc::new(epub_file);
2000            assert!(doc.is_ok());
2001
2002            let doc = doc.unwrap();
2003            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
2004            assert_eq!(doc.base_path, Path::new("FOO/BAR"));
2005        }
2006
2007        /// ID: ocf-url_link-leaking-relative
2008        ///
2009        /// Use a relative link with several double-dot path segments from the content to a photograph. The folder hierarchy containing the photograph starts at the root level; the relative image reference exceeds depth of hierarchy.
2010        #[test]
2011        fn test_ocf_url_link_leaking_relative() {
2012            let epub_file = Path::new("./test_case/ocf-url_link-leaking-relative.epub");
2013            let doc = EpubDoc::new(epub_file);
2014            assert!(doc.is_err());
2015            assert_eq!(
2016                doc.err().unwrap().to_string(),
2017                String::from(
2018                    "Relative link leakage: Path \"../../../../media/imgs/monastery.jpg\" is out of container range."
2019                )
2020            )
2021        }
2022
2023        /// ID: ocf-url_link-path-absolute
2024        ///
2025        /// Use a path-absolute link, i.e., beginning with a leading slash, from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2026        #[test]
2027        fn test_ocf_url_link_path_absolute() {
2028            let epub_file = Path::new("./test_case/ocf-url_link-path-absolute.epub");
2029            let doc = EpubDoc::new(epub_file);
2030            assert!(doc.is_ok());
2031
2032            let doc = doc.unwrap();
2033            let resource = doc.manifest.get("photo").unwrap();
2034            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2035        }
2036
2037        /// ID: ocf-url_link-relative
2038        ///
2039        /// A simple relative link from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2040        #[test]
2041        fn test_ocf_url_link_relative() {
2042            let epub_file = Path::new("./test_case/ocf-url_link-relative.epub");
2043            let doc = EpubDoc::new(epub_file);
2044            assert!(doc.is_ok());
2045
2046            let doc = doc.unwrap();
2047            let resource = doc.manifest.get("photo").unwrap();
2048            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2049        }
2050
2051        /// ID: ocf-url_manifest
2052        ///
2053        /// The manifest refers to an XHTML file in an arbitrary subfolder. The reading system must be able to find the content.
2054        #[test]
2055        fn test_ocf_url_manifest() {
2056            let epub_file = Path::new("./test_case/ocf-url_manifest.epub");
2057            let doc = EpubDoc::new(epub_file);
2058            assert!(doc.is_ok());
2059
2060            let mut doc = doc.unwrap();
2061            assert!(doc.get_manifest_item("nav").is_ok());
2062            assert!(doc.get_manifest_item("content_001").is_ok());
2063            assert!(doc.get_manifest_item("content_002").is_err());
2064        }
2065
2066        /// ID: ocf-url_relative
2067        ///
2068        /// The manifest refers to an XHTML file in an arbitrary subfolder that is relative to the package's own arbitrary folder. The reading system must be able to find the content.
2069        #[test]
2070        fn test_ocf_url_relative() {
2071            let epub_file = Path::new("./test_case/ocf-url_relative.epub");
2072            let doc = EpubDoc::new(epub_file);
2073            assert!(doc.is_ok());
2074
2075            let mut doc = doc.unwrap();
2076            assert_eq!(doc.package_path, Path::new("foo/BAR/baz.opf"));
2077            assert_eq!(doc.base_path, Path::new("foo/BAR"));
2078            assert_eq!(
2079                doc.manifest.get("nav").unwrap().path,
2080                Path::new("foo/BAR/nav.xhtml")
2081            );
2082            assert_eq!(
2083                doc.manifest.get("content_001").unwrap().path,
2084                Path::new("foo/BAR/qux/content_001.xhtml")
2085            );
2086            assert!(doc.get_manifest_item("nav").is_ok());
2087            assert!(doc.get_manifest_item("content_001").is_ok());
2088        }
2089
2090        /// ID: ocf-zip-comp
2091        ///
2092        /// MUST treat any OCF ZIP container that uses compression techniques other than Deflate as in error.
2093        /// This test case does not use compression methods other than Deflate and cannot detect whether it is effective.
2094        #[test]
2095        fn test_ocf_zip_comp() {
2096            let epub_file = Path::new("./test_case/ocf-zip-comp.epub");
2097            let doc = EpubDoc::new(epub_file);
2098            assert!(doc.is_ok());
2099        }
2100
2101        /// ID: ocf-zip-mult
2102        ///
2103        /// MUST treat any OCF ZIP container that splits the content into segments as in error.
2104        /// This test case is not a segmented OCF ZIP container and cannot be tested to see if it is valid.
2105        #[test]
2106        fn test_ocf_zip_mult() {
2107            let epub_file = Path::new("./test_case/ocf-zip-mult.epub");
2108            let doc = EpubDoc::new(epub_file);
2109            assert!(doc.is_ok());
2110        }
2111
2112        /// ID: ocf-font_obfuscation
2113        ///
2114        /// An obfuscated (TrueType) font should be displayed after de-obfuscation.
2115        #[test]
2116        fn test_ocf_font_obfuscation() {
2117            let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2118            let doc = EpubDoc::new(epub_file);
2119            assert!(doc.is_ok());
2120
2121            let mut doc = doc.unwrap();
2122            let unique_id = doc.unique_identifier.clone();
2123
2124            let mut hasher = Sha1::new();
2125            hasher.update(unique_id.as_bytes());
2126            let hash = hasher.finalize();
2127            let mut key = vec![0u8; 1040];
2128            for i in 0..1040 {
2129                key[i] = hash[i % hash.len()];
2130            }
2131
2132            assert!(doc.encryption.is_some());
2133            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2134
2135            let data = &doc.encryption.unwrap()[0];
2136            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2137
2138            let font_file = doc
2139                .archive
2140                .by_name(&data.data)
2141                .unwrap()
2142                .bytes()
2143                .collect::<Result<Vec<u8>, _>>();
2144            assert!(font_file.is_ok());
2145            let font_file = font_file.unwrap();
2146
2147            // 根据EPUB规范，字体混淆是直接对字体文件进行的，不需要解压步骤，直接进行去混淆处理
2148            let mut deobfuscated = font_file.clone();
2149            for i in 0..min(1040, deobfuscated.len()) {
2150                deobfuscated[i] ^= key[i];
2151            }
2152
2153            assert!(is_valid_font(&deobfuscated));
2154        }
2155
2156        /// ID: ocf-font_obfuscation-bis
2157        ///
2158        /// An obfuscated (TrueType) font should not be displayed after de-obfuscation, because the obfuscation used a different publication id.
2159        #[test]
2160        fn test_ocf_font_obfuscation_bis() {
2161            let epub_file = Path::new("./test_case/ocf-font_obfuscation_bis.epub");
2162            let doc = EpubDoc::new(epub_file);
2163            assert!(doc.is_ok());
2164
2165            let mut doc = doc.unwrap();
2166
2167            let wrong_unique_id = "wrong-publication-id";
2168            let mut hasher = Sha1::new();
2169            hasher.update(wrong_unique_id.as_bytes());
2170            let hash = hasher.finalize();
2171            let mut wrong_key = vec![0u8; 1040];
2172            for i in 0..1040 {
2173                wrong_key[i] = hash[i % hash.len()];
2174            }
2175
2176            assert!(doc.encryption.is_some());
2177            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2178
2179            let data = &doc.encryption.unwrap()[0];
2180            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2181
2182            let font_file = doc
2183                .archive
2184                .by_name(&data.data)
2185                .unwrap()
2186                .bytes()
2187                .collect::<Result<Vec<u8>, _>>();
2188            assert!(font_file.is_ok());
2189            let font_file = font_file.unwrap();
2190
2191            // 使用错误的密钥进行去混淆
2192            let mut deobfuscated_with_wrong_key = font_file.clone();
2193            for i in 0..std::cmp::min(1040, deobfuscated_with_wrong_key.len()) {
2194                deobfuscated_with_wrong_key[i] ^= wrong_key[i];
2195            }
2196
2197            assert!(!is_valid_font(&deobfuscated_with_wrong_key));
2198        }
2199
2200        fn is_valid_font(data: &[u8]) -> bool {
2201            if data.len() < 4 {
2202                return false;
2203            }
2204            let sig = &data[0..4];
2205            // OTF: "OTTO"
2206            // TTF: 0x00010000, 0x00020000, "true", "typ1"
2207            sig == b"OTTO"
2208                || sig == b"\x00\x01\x00\x00"
2209                || sig == b"\x00\x02\x00\x00"
2210                || sig == b"true"
2211                || sig == b"typ1"
2212        }
2213    }
2214
2215    /// Test for function `has_encryption`
2216    #[test]
2217    fn test_fn_has_encryption() {
2218        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2219        let doc = EpubDoc::new(epub_file);
2220        assert!(doc.is_ok());
2221
2222        let mut doc = doc.unwrap();
2223        assert!(doc.has_encryption());
2224    }
2225
2226    /// This test is used to detect whether the "META-INF/encryption.xml" file is parsed correctly
2227    #[test]
2228    fn test_fn_parse_encryption() {
2229        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2230        let doc = EpubDoc::new(epub_file);
2231        assert!(doc.is_ok());
2232
2233        let doc = doc.unwrap();
2234        assert!(doc.encryption.is_some());
2235
2236        let encryption = doc.encryption.unwrap();
2237        assert_eq!(encryption.len(), 1);
2238        assert_eq!(encryption[0].method, "http://www.idpf.org/2008/embedding");
2239        assert_eq!(encryption[0].data, "EPUB/fonts/Lobster.ttf");
2240    }
2241
2242    #[test]
2243    fn test_get_metadata_existing_key() {
2244        let epub_file = Path::new("./test_case/epub-33.epub");
2245        let doc = EpubDoc::new(epub_file);
2246        assert!(doc.is_ok());
2247
2248        let doc = doc.unwrap();
2249
2250        let titles = doc.get_metadata("title");
2251        assert!(titles.is_some());
2252
2253        let titles = titles.unwrap();
2254        assert_eq!(titles.len(), 1);
2255        assert_eq!(titles[0].property, "title");
2256        assert_eq!(titles[0].value, "EPUB 3.3");
2257
2258        let languages = doc.get_metadata("language");
2259        assert!(languages.is_some());
2260
2261        let languages = languages.unwrap();
2262        assert_eq!(languages.len(), 1);
2263        assert_eq!(languages[0].property, "language");
2264        assert_eq!(languages[0].value, "en-us");
2265    }
2266
2267    #[test]
2268    fn test_get_metadata_nonexistent_key() {
2269        let epub_file = Path::new("./test_case/epub-33.epub");
2270        let doc = EpubDoc::new(epub_file);
2271        assert!(doc.is_ok());
2272
2273        let doc = doc.unwrap();
2274        let metadata = doc.get_metadata("nonexistent");
2275        assert!(metadata.is_none());
2276    }
2277
2278    #[test]
2279    fn test_get_metadata_multiple_items_same_type() {
2280        let epub_file = Path::new("./test_case/epub-33.epub");
2281        let doc = EpubDoc::new(epub_file);
2282        assert!(doc.is_ok());
2283
2284        let doc = doc.unwrap();
2285
2286        let creators = doc.get_metadata("creator");
2287        assert!(creators.is_some());
2288
2289        let creators = creators.unwrap();
2290        assert_eq!(creators.len(), 3);
2291
2292        assert_eq!(creators[0].id, Some("creator_id_0".to_string()));
2293        assert_eq!(creators[0].property, "creator");
2294        assert_eq!(creators[0].value, "Matt Garrish, DAISY Consortium");
2295
2296        assert_eq!(creators[1].id, Some("creator_id_1".to_string()));
2297        assert_eq!(creators[1].property, "creator");
2298        assert_eq!(creators[1].value, "Ivan Herman, W3C");
2299
2300        assert_eq!(creators[2].id, Some("creator_id_2".to_string()));
2301        assert_eq!(creators[2].property, "creator");
2302        assert_eq!(creators[2].value, "Dave Cramer, Invited Expert");
2303    }
2304
2305    #[test]
2306    fn test_get_metadata_with_refinement() {
2307        let epub_file = Path::new("./test_case/epub-33.epub");
2308        let doc = EpubDoc::new(epub_file);
2309        assert!(doc.is_ok());
2310
2311        let doc = doc.unwrap();
2312
2313        let title = doc.get_metadata("title");
2314        assert!(title.is_some());
2315
2316        let title = title.unwrap();
2317        assert_eq!(title.len(), 1);
2318        assert_eq!(title[0].refined.len(), 1);
2319        assert_eq!(title[0].refined[0].property, "title-type");
2320        assert_eq!(title[0].refined[0].value, "main");
2321    }
2322
2323    #[test]
2324    fn test_get_manifest_item_with_fallback() {
2325        let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
2326        let doc = EpubDoc::new(epub_file);
2327        assert!(doc.is_ok());
2328
2329        let mut doc = doc.unwrap();
2330        assert!(doc.get_manifest_item("content_001").is_ok());
2331        assert!(doc.get_manifest_item("bar").is_ok());
2332
2333        // 当回退链上存在可回退资源时能获取资源
2334        if let Ok((_, mime)) = doc.get_manifest_item_with_fallback("content_001", vec!["image/psd"])
2335        {
2336            assert_eq!(mime, "image/psd");
2337        } else {
2338            assert!(false, "get_manifest_item_with_fallback failed");
2339        }
2340
2341        // 当回退链上不存在可回退资源时无法获取资源
2342        assert_eq!(
2343            doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
2344                .unwrap_err()
2345                .to_string(),
2346            "No supported file format: The fallback resource does not contain the file format you support."
2347        );
2348    }
2349
2350    #[test]
2351    fn test_get_cover() {
2352        let epub_file = Path::new("./test_case/pkg-cover-image.epub");
2353        let doc = EpubDoc::new(epub_file);
2354        if let Err(err) = &doc {
2355            println!("{}", err);
2356        }
2357        assert!(doc.is_ok());
2358
2359        let mut doc = doc.unwrap();
2360        let result = doc.get_cover();
2361        assert!(result.is_some());
2362
2363        let (data, mime) = result.unwrap();
2364        assert_eq!(data.len(), 5785);
2365        assert_eq!(mime, "image/jpeg");
2366    }
2367}
lib_epub/epub.rs

lib_epub/
epub.rs