lib_epub/
epub.rs

1//! The core module of the EPUB parsing library
2//!
3//! This module provides complete parsing functionality for EPUB ebook files
4//! and is the core component of the entire library. The `EpubDoc` structure
5//! encapsulates all the parsing logic and data access interfaces for EPUB files.
6//!
7//! ## Main references to EPUB specs:
8//! - <https://www.w3.org/TR/epub-33>
9//! - <https://idpf.org/epub/201>
10//!
11//! ## Potential Issues
12//! - The generic parameter `R: Read + Seek` increases complexity, particularly
13//!   in asynchronous environments. The current design is not conducive to multi-threaded
14//!   concurrent access and requires an external synchronization mechanism.
15//! - Some error handling may not be sufficiently nuanced, and certain edge cases
16//!   may not be adequately considered.
17//! - Loading the entire EPUB document at once may result in significant memory consumption,
18//!   especially for large publications.
19//!
20//! ## Future Work
//! - Add support for asynchronous I/O to improve the user experience in asynchronous
//!   environments, and consider adding support for multi-threaded access.
//! - Support more EPUB specification features, such as media overlays and scripts.
24
25use std::{
26    collections::HashMap,
27    fs::{File, canonicalize},
28    io::{BufReader, Read, Seek},
29    path::{Path, PathBuf},
30};
31
32use log::warn;
33use zip::{ZipArchive, result::ZipError};
34
35use crate::{
36    error::EpubError,
37    types::{
38        EncryptionData, EpubVersion, ManifestItem, MetadataItem, MetadataLinkItem,
39        MetadataRefinement, NavPoint, SpineItem,
40    },
41    utils::{
42        DecodeBytes, NormalizeWhitespace, XmlElement, XmlReader, adobe_font_dencryption,
43        check_realtive_link_leakage, compression_method_check, get_file_in_zip_archive,
44        idpf_font_dencryption,
45    },
46};
47
/// EPUB document parser, representing a loaded and parsed EPUB publication
///
/// `EpubDoc` owns the underlying ZIP archive together with the data extracted
/// from it while loading: the location of the package (OPF) document, the EPUB
/// version, metadata, manifest, spine, optional encryption information and the
/// navigation catalog. Its methods expose that data and read resources from
/// the archive on demand.
///
/// The type parameter `R` is the reader backing the archive; it must implement
/// both `Read` and `Seek`.
///
/// # Usage
///
/// ```rust
/// use lib_epub::epub::EpubDoc;
///
/// let doc = EpubDoc::new("./test_case/epub-33.epub");
/// assert!(doc.is_ok());
/// ```
pub struct EpubDoc<R: Read + Seek> {
    /// The ZIP archive that holds the contents of the EPUB file
    pub(crate) archive: ZipArchive<R>,

    /// The path to the target EPUB file
    ///
    /// `from_reader` stores the canonicalized form of the path it was given.
    pub(crate) epub_path: PathBuf,

    /// The path to the OPF file
    ///
    /// Taken from the `full-path` attribute of the `rootfile` element in
    /// `META-INF/container.xml`.
    pub package_path: PathBuf,

    /// The path to the directory where the OPF file is located
    ///
    /// This is the parent directory of `package_path`.
    pub base_path: PathBuf,

    /// The EPUB version
    pub version: EpubVersion,

    /// The unique identifier of the EPUB file
    ///
    /// This is the value of the `identifier` metadata item whose id matches the
    /// `unique-identifier` attribute of the package element. If the package
    /// element has no such attribute, the first `identifier` metadata item is used.
    pub unique_identifier: String,

    /// EPUB metadata extracted from the OPF file
    pub metadata: Vec<MetadataItem>,

    /// Data in metadata that points to external files
    pub metadata_link: Vec<MetadataLinkItem>,

    /// A list of resources contained inside an EPUB, extracted from the OPF file
    ///
    /// The map is keyed by the `id` attribute of each manifest item.
    /// All resources in the EPUB file are declared here; undeclared resources
    /// should not be stored in the EPUB file and cannot be obtained from it.
    pub manifest: HashMap<String, ManifestItem>,

    /// Physical reading order of the publication, extracted from the OPF file
    ///
    /// This attribute declares the order in which multiple files
    /// containing published content should be displayed.
    pub spine: Vec<SpineItem>,

    /// The data extracted from `META-INF/encryption.xml`
    ///
    /// Stays `None` when the archive has no `META-INF/encryption.xml` or when
    /// that file yields no `EncryptedData` entries.
    pub encryption: Option<Vec<EncryptionData>>,

    /// The navigation data of the EPUB file
    pub catalog: Vec<NavPoint>,

    /// The title of the catalog
    ///
    /// Left empty when the navigation source does not provide a title.
    pub catalog_title: String,

    /// The index of the current reading spine
    ///
    /// Initialized to `0` by `from_reader`.
    pub current_spine_index: usize,
}
119
120impl<R: Read + Seek> EpubDoc<R> {
121    /// Creates a new EPUB document instance from a reader
122    ///
123    /// This function is responsible for the core logic of parsing EPUB files,
124    /// including verifying the file format, parsing container information,
125    /// loading the OPF package document, and extracting metadata, manifest,
126    /// reading order, and other core information.
127    ///
128    /// # Parameters
129    /// - `reader`: The data source that implements the `Read` and `Seek` traits,
130    ///   usually a file or memory buffer
131    /// - `epub_path`: The path to the EPUB file, used for path resolution and validation
132    ///
133    /// # Return
134    /// - `Ok(EpubDoc<R>)`: The successfully parsed EPUB document object
135    /// - `Err(EpubError)`: Errors encountered during parsing
136    ///
137    /// # Notes
138    /// - This function assumes the EPUB file structure is valid
139    pub fn from_reader(reader: R, epub_path: PathBuf) -> Result<Self, EpubError> {
140        // Parsing process
141        // 1. Verify that the ZIP compression method conforms to the EPUB specification
142        // 2. Parse `META-INF/container.xml` retrieves the location of the OPF file
143        // 3. Parses the OPF file to obtain package documentation information
144        // 4. Extracts version information
145        // 5. Parses metadata, manifest, and spine
146        // 6. Parses encrypted information and directory navigation
147        // 7. Verifies and extracts the unique identifier
148
149        let mut archive = ZipArchive::new(reader).map_err(EpubError::from)?;
150        let epub_path = canonicalize(epub_path)?;
151
152        compression_method_check(&mut archive)?;
153
154        let container =
155            get_file_in_zip_archive(&mut archive, "META-INF/container.xml")?.decode()?;
156        let package_path = Self::parse_container(container)?;
157        let base_path = package_path
158            .parent()
159            .expect("所有文件的父目录不能为空")
160            .to_path_buf();
161
162        let opf_file =
163            get_file_in_zip_archive(&mut archive, package_path.to_str().unwrap())?.decode()?;
164        let package = XmlReader::parse(&opf_file)?;
165        // let document = kiss_xml::parse_str(opf_file).unwrap();
166
167        // let package = document.root_element();
168        let version = Self::determine_epub_version(&package)?;
169
170        let mut doc = Self {
171            archive,
172            epub_path,
173            package_path,
174            base_path,
175            version,
176            unique_identifier: String::new(),
177            metadata: vec![],
178            metadata_link: vec![],
179            manifest: HashMap::new(),
180            spine: vec![],
181            encryption: None,
182            catalog: vec![],
183            catalog_title: String::new(),
184            current_spine_index: 0,
185        };
186
187        let metadata_element = package.find_elements_by_name("metadata").next().unwrap();
188        let manifest_element = package.find_elements_by_name("manifest").next().unwrap();
189        let spine_element = package.find_elements_by_name("spine").next().unwrap();
190
191        doc.parse_metadata(metadata_element)?;
192        doc.parse_manifest(manifest_element)?;
193        doc.parse_spine(spine_element)?;
194        doc.parse_encryption()?;
195        doc.parse_catalog()?;
196
197        // 断言必有唯一标识符
198        doc.unique_identifier = if let Some(uid) = package.get_attr("unique-identifier") {
199            doc.metadata.iter().find(|item| {
200                item.property == "identifier" && item.id.as_ref().is_some_and(|id| id == &uid)
201            })
202        } else {
203            doc.metadata
204                .iter()
205                .find(|item| item.property == "identifier")
206        }
207        .map(|item| item.value.clone())
208        .ok_or_else(|| EpubError::NonCanonicalFile {
209            tag: "dc:identifier".to_string(),
210        })?;
211
212        Ok(doc)
213    }
214
215    /// Parse the EPUB container file (META-INF/container.xml)
216    ///
217    /// This function parses the container information in the EPUB file 、
218    /// to extract the path to the OPF package file. According to the EPUB
219    /// specification, the `container.xml` file must exist in the `META-INF`
220    /// directory and contain at least one `rootfile` element pointing to
221    /// the main OPF file. When multiple `rootfile` elements exist, the first
222    /// element pointing to the OPF file is used as the default.
223    ///
224    /// # Parameters
225    /// - `data`: The content string of the container.xml
226    ///
227    /// # Return
228    /// - `Ok(PathBuf)`: The path to the successfully parsed OPF file
229    /// - `Err(EpubError)`: Errors encountered during parsing
230    fn parse_container(data: String) -> Result<PathBuf, EpubError> {
231        let root = XmlReader::parse(&data)?;
232        let rootfile = root
233            .find_elements_by_name("rootfile")
234            .next()
235            .ok_or_else(|| EpubError::NonCanonicalFile {
236                tag: "rootfile".to_string(),
237            })?;
238
239        let attr =
240            rootfile
241                .get_attr("full-path")
242                .ok_or_else(|| EpubError::MissingRequiredAttribute {
243                    tag: "rootfile".to_string(),
244                    attribute: "full-path".to_string(),
245                })?;
246
247        Ok(PathBuf::from(attr))
248    }
249
250    /// Parse the EPUB metadata section
251    ///
252    /// This function is responsible for parsing the `<metadata>` elements
253    /// in the OPF file to extract basic information about the publication.
254    /// It handles metadata elements from different namespaces:
255    /// - Elements in the Dublin Core namespace (`http://purl.org/dc/elements/1.1/`)
256    /// - Elements in the OPF namespace (`http://www.idpf.org/2007/opf`)
257    ///
258    /// # Parameters
259    /// - `metadata_element`: A reference to the `<metadata>` element in the OPF file
260    fn parse_metadata(&mut self, metadata_element: &XmlElement) -> Result<(), EpubError> {
261        const DC_NAMESPACE: &str = "http://purl.org/dc/elements/1.1/";
262        const OPF_NAMESPACE: &str = "http://www.idpf.org/2007/opf";
263
264        let mut metadata = Vec::new();
265        let mut metadata_link = Vec::new();
266        let mut refinements = HashMap::<String, Vec<MetadataRefinement>>::new();
267
268        for element in metadata_element.children() {
269            match &element.namespace {
270                Some(namespace) if namespace == DC_NAMESPACE => {
271                    self.parse_dc_metadata(element, &mut metadata)?
272                }
273
274                Some(namespace) if namespace == OPF_NAMESPACE => self.parse_opf_metadata(
275                    element,
276                    &mut metadata,
277                    &mut metadata_link,
278                    &mut refinements,
279                )?,
280
281                _ => {}
282            };
283        }
284
285        for item in metadata.iter_mut() {
286            if let Some(id) = &item.id {
287                if let Some(refinements) = refinements.remove(id) {
288                    item.refined = refinements;
289                }
290            }
291        }
292
293        self.metadata = metadata;
294        self.metadata_link = metadata_link;
295        Ok(())
296    }
297
298    /// Parse the EPUB manifest section
299    ///
300    /// This function parses the `<manifest>` element in the OPF file, extracting
301    /// information about all resource files in the publication. Each resource contains
302    /// basic information such as id, file path, MIME type, as well as optional
303    /// attributes and fallback resource information.
304    ///
305    /// # Parameters
306    /// - `manifest_element`: A reference to the `<manifest>` element in the OPF file
307    fn parse_manifest(&mut self, manifest_element: &XmlElement) -> Result<(), EpubError> {
308        let estimated_items = manifest_element.children().count();
309        let mut resources = HashMap::with_capacity(estimated_items);
310
311        for element in manifest_element.children() {
312            let id = element
313                .get_attr("id")
314                .ok_or_else(|| EpubError::MissingRequiredAttribute {
315                    tag: element.tag_name(),
316                    attribute: "id".to_string(),
317                })?
318                .to_string();
319            let path = element
320                .get_attr("href")
321                .ok_or_else(|| EpubError::MissingRequiredAttribute {
322                    tag: element.tag_name(),
323                    attribute: "href".to_string(),
324                })?
325                .to_string();
326            let mime = element
327                .get_attr("media-type")
328                .ok_or_else(|| EpubError::MissingRequiredAttribute {
329                    tag: element.tag_name(),
330                    attribute: "media-type".to_string(),
331                })?
332                .to_string();
333            let properties = element.get_attr("properties");
334            let fallback = element.get_attr("fallback");
335
336            resources.insert(
337                id.clone(),
338                ManifestItem {
339                    id,
340                    path: self.normalize_manifest_path(&path)?,
341                    mime,
342                    properties,
343                    fallback,
344                },
345            );
346        }
347
348        self.manifest = resources;
349        self.validate_fallback_chains();
350        Ok(())
351    }
352
353    /// Parse the EPUB spine section
354    ///
355    /// This function parses the `<spine>` elements in the OPF file to extract
356    /// the reading order information of the publication. The spine defines the
357    /// linear reading order of the publication's content documents, and each
358    /// spine item references resources in the manifest.
359    ///
360    /// # Parameters
361    /// - `spine_element`: A reference to the `<spine>` element in the OPF file
362    fn parse_spine(&mut self, spine_element: &XmlElement) -> Result<(), EpubError> {
363        let mut spine = Vec::new();
364        for element in spine_element.children() {
365            let idref = element
366                .get_attr("idref")
367                .ok_or_else(|| EpubError::MissingRequiredAttribute {
368                    tag: element.tag_name(),
369                    attribute: "idref".to_string(),
370                })?
371                .to_string();
372            let id = element.get_attr("id");
373            let linear = element
374                .get_attr("linear")
375                .map(|linear| linear == "yes")
376                .unwrap_or(true);
377            let properties = element.get_attr("properties");
378
379            spine.push(SpineItem {
380                idref,
381                id,
382                linear,
383                properties,
384            });
385        }
386
387        self.spine = spine;
388        Ok(())
389    }
390
391    /// Parse the EPUB encryption file (META-INF/encryption.xml)
392    ///
393    /// This function is responsible for parsing the `encryption.xml` file
394    /// in the `META-INF` directory to extract information about encrypted
395    /// resources in the publication. According to the EPUB specification,
396    /// the encryption information describes which resources are encrypted
397    /// and the encryption methods used.
398    ///
399    /// TODO: 需要对使用非对称加密数据的加密项进行额外处理，以获取非对称加密密钥
400    fn parse_encryption(&mut self) -> Result<(), EpubError> {
401        if !self.has_encryption() {
402            return Ok(());
403        }
404
405        let encryption_file =
406            get_file_in_zip_archive(&mut self.archive, "META-INF/encryption.xml")?.decode()?;
407
408        let root = XmlReader::parse(&encryption_file)?;
409
410        let mut encryption_data = Vec::new();
411        for data in root.children() {
412            if data.name != "EncryptedData" {
413                continue;
414            }
415
416            let method = data
417                .find_elements_by_name("EncryptionMethod")
418                .next()
419                .ok_or_else(|| EpubError::NonCanonicalFile {
420                    tag: "EncryptionMethod".to_string(),
421                })?;
422            let reference = data
423                .find_elements_by_name("CipherReference")
424                .next()
425                .ok_or_else(|| EpubError::NonCanonicalFile {
426                    tag: "CipherReference".to_string(),
427                })?;
428
429            encryption_data.push(EncryptionData {
430                method: method
431                    .get_attr("Algorithm")
432                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
433                        tag: "EncryptionMethod".to_string(),
434                        attribute: "Algorithm".to_string(),
435                    })?
436                    .to_string(),
437                data: reference
438                    .get_attr("URI")
439                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
440                        tag: "CipherReference".to_string(),
441                        attribute: "URI".to_string(),
442                    })?
443                    .to_string(),
444            });
445        }
446
447        if !encryption_data.is_empty() {
448            self.encryption = Some(encryption_data);
449        }
450
451        Ok(())
452    }
453
454    /// Parse the EPUB navigation information
455    ///
456    /// This function is responsible for parsing the navigation information of EPUB
457    /// publications. Different parsing strategies are used depending on the EPUB version:
458    /// - EPUB 2.0: Parses the NCX file to obtain directory information
459    /// - EPUB 3.0: Parses the Navigation Document (NAV) file to obtain directory information
460    fn parse_catalog(&mut self) -> Result<(), EpubError> {
461        const HEAD_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];
462
463        match self.version {
464            EpubVersion::Version2_0 => {
465                let opf_file = get_file_in_zip_archive(
466                    &mut self.archive,
467                    self.package_path.to_str().unwrap(),
468                )?
469                .decode()?;
470                let opf_element = XmlReader::parse(&opf_file)?;
471
472                let toc_id = opf_element
473                    .find_children_by_name("spine")
474                    .next()
475                    .ok_or_else(|| EpubError::NonCanonicalFile {
476                        tag: "spine".to_string(),
477                    })?
478                    .get_attr("toc")
479                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
480                        tag: "spine".to_string(),
481                        attribute: "toc".to_string(),
482                    })?
483                    .to_owned();
484                let toc_path = self
485                    .manifest
486                    .get(&toc_id)
487                    .ok_or(EpubError::ResourceIdNotExist { id: toc_id })?
488                    .path
489                    .to_str()
490                    .unwrap();
491
492                let ncx_file = get_file_in_zip_archive(&mut self.archive, toc_path)?.decode()?;
493                let ncx = XmlReader::parse(&ncx_file)?;
494
495                match ncx.find_elements_by_name("docTitle").next() {
496                    Some(element) => self.catalog_title = element.text(),
497                    None => warn!(
498                        "Expecting to get docTitle information from the ncx file, but it's missing."
499                    ),
500                };
501
502                let nav_map = ncx.find_elements_by_name("navMap").next().ok_or_else(|| {
503                    EpubError::NonCanonicalFile {
504                        tag: "navMap".to_string(),
505                    }
506                })?;
507
508                self.catalog = self.parse_nav_points(nav_map)?;
509
510                Ok(())
511            }
512
513            EpubVersion::Version3_0 => {
514                let nav_path = self
515                    .manifest
516                    .values()
517                    .find(|item| {
518                        if let Some(property) = &item.properties {
519                            return property.contains("nav");
520                        }
521                        false
522                    })
523                    .map(|item| item.path.clone())
524                    .ok_or_else(|| EpubError::NonCanonicalEpub {
525                        expected_file: "Navigation Document".to_string(),
526                    })?;
527
528                let nav_file =
529                    get_file_in_zip_archive(&mut self.archive, nav_path.to_str().unwrap())?
530                        .decode()?;
531
532                let nav_element = XmlReader::parse(&nav_file)?;
533                let nav = nav_element
534                    .find_elements_by_name("nav")
535                    .find(|&element| element.get_attr("epub:type") == Some(String::from("toc")))
536                    .ok_or_else(|| EpubError::NonCanonicalFile {
537                        tag: "nav".to_string(),
538                    })?;
539                let nav_title = nav.find_children_by_names(&HEAD_TAGS).next();
540                let nav_list = nav.find_children_by_name("ol").next().ok_or_else(|| {
541                    EpubError::NonCanonicalFile {
542                        tag: "ol".to_string(),
543                    }
544                })?;
545
546                self.catalog = self.parse_catalog_list(nav_list)?;
547                if let Some(nav_title) = nav_title {
548                    self.catalog_title = nav_title.text();
549                };
550                Ok(())
551            }
552        }
553    }
554
555    /// Check if the EPUB file contains `encryption.xml`
556    ///
557    /// This function determines whether a publication contains encrypted resources
558    /// by checking if a `META-INF/encryption.xml` file exists in the EPUB package.
559    /// According to the EPUB specification, when resources in a publication are
560    /// encrypted, the corresponding encryption information must be declared in
561    /// the `META-INF/encryption.xml` file.
562    ///
563    /// # Return
564    /// - `true` if the publication contains encrypted resources
565    /// - `false` if the publication does not contain encrypted resources
566    ///
567    /// # Notes
568    /// - This function only checks the existence of the encrypted file;
569    ///   it does not verify the validity of the encrypted information.
570    pub fn has_encryption(&mut self) -> bool {
571        self.archive
572            .by_path(Path::new("META-INF/encryption.xml"))
573            .is_ok()
574    }
575
576    /// Retrieves a list of metadata items
577    ///
578    /// This function retrieves all matching metadata items from the EPUB metadata
579    /// based on the specified attribute name (key). Metadata items may come from
580    /// the DC (Dublin Core) namespace or the OPF namespace and contain basic
581    /// information about the publication, such as title, author, identifier, etc.
582    ///
583    /// # Parameters
584    /// - `key`: The name of the metadata attribute to retrieve
585    ///
586    /// # Return
587    /// - `Some(Vec<MetadataItem>)`: A vector containing all matching metadata items
588    /// - `None`: If no matching metadata items are found
589    pub fn get_metadata(&self, key: &str) -> Option<Vec<MetadataItem>> {
590        let metadatas = self
591            .metadata
592            .iter()
593            .filter(|item| item.property == key)
594            .cloned()
595            .collect::<Vec<MetadataItem>>();
596
597        (!metadatas.is_empty()).then_some(metadatas)
598    }
599
600    /// Retrieves a list of values for specific metadata items
601    ///
602    /// This function retrieves the values of all matching metadata items from
603    /// the EPUB metadata based on the given property name (key).
604    ///
605    /// # Parameters
606    /// - `key`: The name of the metadata attribute to retrieve
607    ///
608    /// # Return
609    /// - `Some(Vec<String>)`: A vector containing all matching metadata item values
610    /// - `None`: If no matching metadata items are found
611    pub fn get_metadata_value(&self, key: &str) -> Option<Vec<String>> {
612        let values = self
613            .metadata
614            .iter()
615            .filter(|item| item.property == key)
616            .map(|item| item.value.clone())
617            .collect::<Vec<String>>();
618
619        (!values.is_empty()).then_some(values)
620    }
621
622    /// Retrieves the title of the publication
623    ///
624    /// This function retrieves all title information from the EPUB metadata.
625    /// According to the EPUB specification, a publication can have multiple titles,
626    /// which are returned in the order they appear in the metadata.
627    ///
628    /// # Return
629    /// - `Result<Vec<String>, EpubError>`: A vector containing all title information
630    /// - `EpubError`: If and only if the OPF file does not contain `<dc:title>`
631    ///
632    /// # Notes
633    /// - The EPUB specification requires each publication to have at least one title.
634    pub fn get_title(&self) -> Result<Vec<String>, EpubError> {
635        self.get_metadata_value("title")
636            .ok_or_else(|| EpubError::NonCanonicalFile {
637                tag: "title".to_string(),
638            })
639    }
640
641    /// Retrieves the language used in the publication
642    ///
643    /// This function retrieves the language information of a publication from the EPUB
644    /// metadata. According to the EPUB specification, language information identifies
645    /// the primary language of the publication and can have multiple language identifiers.
646    ///
647    /// # Return
648    /// - `Ok(Vec<String>)`: A vector containing all language identifiers
649    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:language>`
650    ///
651    /// # Notes
652    /// - The EPUB specification requires that each publication specify at least one primary language.
653    /// - Language identifiers should conform to RFC 3066 or later standards.
654    pub fn get_language(&self) -> Result<Vec<String>, EpubError> {
655        self.get_metadata_value("language")
656            .ok_or_else(|| EpubError::NonCanonicalFile {
657                tag: "language".to_string(),
658            })
659    }
660
661    /// Retrieves the identifier of a publication
662    ///
663    /// This function retrieves the identifier information of a publication from
664    /// the EPUB metadata. According to the EPUB specification, each publication
665    /// must have a identifier, typically an ISBN, UUID, or other unique identifier.
666    ///
667    /// # Return
668    /// - `Ok(Vec<String>)`: A vector containing all identifier information
669    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:identifier>`
670    ///
671    /// # Notes
672    /// - The EPUB specification requires each publication to have at least one identifier.
673    /// - In the OPF file, the `unique-identifier` attribute of the `<package>` element
674    ///   should point to a `<dc:identifier>` element used to uniquely identify the publication.
675    ///   This means that `unique-identifier` is not exactly equal to `<dc:identifier>`.
676    pub fn get_identifier(&self) -> Result<Vec<String>, EpubError> {
677        self.get_metadata_value("identifier")
678            .ok_or_else(|| EpubError::NonCanonicalFile {
679                tag: "identifier".to_string(),
680            })
681    }
682
683    /// Retrieve resource data by resource ID
684    ///
685    /// This function will find the resource with the specified ID in the manifest.
686    /// If the resource is encrypted, it will be automatically decrypted.
687    ///
688    /// # Parameters
689    /// - `id`: The ID of the resource to retrieve
690    ///
691    /// # Return
692    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
693    ///   the MIME type
694    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
695    ///
696    /// # Notes
697    /// - This function will automatically decrypt the resource if it is encrypted.
698    /// - For unsupported encryption methods, the corresponding error will be returned.
699    pub fn get_manifest_item(&mut self, id: &str) -> Result<(Vec<u8>, String), EpubError> {
700        let resource_item = self
701            .manifest
702            .get(id)
703            .cloned()
704            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
705
706        let path = resource_item.path.to_str().unwrap();
707
708        let mut data = match self.archive.by_name(path) {
709            Ok(mut file) => {
710                let mut entry = Vec::<u8>::new();
711                file.read_to_end(&mut entry)?;
712
713                Ok(entry)
714            }
715            Err(ZipError::FileNotFound) => Err(EpubError::ResourceNotFound {
716                resource: path.to_string(),
717            }),
718            Err(err) => Err(EpubError::from(err)),
719        }?;
720
721        if let Some(method) = self.is_encryption_file(path) {
722            data = self.auto_dencrypt(&method, &mut data)?;
723        }
724
725        Ok((data, resource_item.mime))
726    }
727
728    /// Retrieves resource item data by resource path
729    ///
730    /// This function retrieves resources from the manifest based on the input path.
731    /// The input path must be a relative path to the root directory of the EPUB container;
732    /// using an absolute path or a relative path to another location will result in an error.
733    ///
734    /// # Parameters
735    /// - `path`: The path of the resource to retrieve
736    ///
737    /// # Return
738    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
739    ///   the MIME type
740    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
741    ///
742    /// # Notes
743    /// - This function will automatically decrypt the resource if it is encrypted.
744    /// - For unsupported encryption methods, the corresponding error will be returned.
745    /// - Relative paths other than the root directory of the Epub container are not supported.
746    pub fn get_manifest_item_by_path(
747        &mut self,
748        path: &str,
749    ) -> Result<(Vec<u8>, String), EpubError> {
750        let id = self
751            .manifest
752            .iter()
753            .find(|(_, item)| item.path.to_str().unwrap() == path)
754            .map(|(id, _)| id.to_string())
755            .ok_or_else(|| EpubError::ResourceNotFound {
756                resource: path.to_string(),
757            })?;
758
759        self.get_manifest_item(&id)
760    }
761
762    /// Retrieves supported resource items by resource ID, with fallback mechanism supported
763    ///
764    /// This function attempts to retrieve the resource item with the specified ID and
765    /// checks if its MIME type is in the list of supported formats. If the current resource
766    /// format is not supported, it searches for a supported resource format along the
767    /// fallback chain according to the fallback mechanism defined in the EPUB specification.
768    ///
769    /// # Parameters
770    /// - `id`: The ID of the resource to retrieve
771    /// - `supported_format`: A vector of supported MIME types
772    ///
773    /// # Return
774    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
775    ///   the MIME type
776    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
777    pub fn get_manifest_item_with_fallback(
778        &mut self,
779        id: &str,
780        supported_format: Vec<&str>,
781    ) -> Result<(Vec<u8>, String), EpubError> {
782        let mut manifest_item = self
783            .manifest
784            .get(id)
785            .cloned()
786            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
787
788        let mut current_manifest_id = id.to_string();
789        let mut fallback_chain = Vec::<String>::new();
790        'fallback: loop {
791            if supported_format.contains(&manifest_item.mime.as_str()) {
792                return self.get_manifest_item(&current_manifest_id);
793            }
794
795            let fallback_id = manifest_item.fallback.clone();
796
797            match fallback_id {
798                // The loop ends when no fallback resource exists
799                None => break 'fallback,
800
801                // End the loop when the loop continues to fallback if a fallback resource exists
802                Some(id) if fallback_chain.contains(&id) => break 'fallback,
803
804                Some(id) => {
805                    fallback_chain.push(id.clone());
806
807                    // Since only warnings are issued for fallback resource checks
808                    // during initialization, the issue of fallback resources possibly
809                    // not existing needs to be handled here.
810                    manifest_item = self
811                        .manifest
812                        .get(&manifest_item.fallback.unwrap())
813                        .cloned()
814                        .ok_or(EpubError::ResourceIdNotExist { id: id.clone() })?;
815                    current_manifest_id = id;
816                }
817            };
818        }
819
820        Err(EpubError::NoSupportedFileFormat)
821    }
822
823    /// Navigate to a specified chapter using the spine index
824    ///
825    /// This function retrieves the content data of the corresponding chapter based
826    /// on the index position in the EPUB spine. The spine defines the linear reading
827    /// order of the publication's content documents, and each spine item references
828    /// resources in the manifest.
829    ///
830    /// # Parameters
831    /// - `index`: The index position in the spine, starting from 0
832    ///
833    /// # Return
834    /// - `Some((Vec<u8>, String))`: Successfully retrieved chapter content data and the MIME type
835    /// - `None`: Index out of range or data retrieval error
836    ///
837    /// # Notes
838    /// - The index must be less than the total number of spine projects.
839    /// - If the resource is encrypted, it will be automatically decrypted before returning.(TODO)
840    /// - It does not check whether the Spine project follows a linear reading order.
841    pub fn navigate_by_spine_index(&mut self, index: usize) -> Option<(Vec<u8>, String)> {
842        if index >= self.spine.len() {
843            return None;
844        }
845
846        let manifest_id = self.spine[index].idref.clone();
847        self.current_spine_index = index;
848        self.get_manifest_item(&manifest_id).ok()
849    }
850
851    /// Navigate to the previous linear reading chapter
852    ///
853    /// This function searches backwards in the EPUB spine for the previous linear
854    /// reading chapter and returns the content data of that chapter. It only navigates
855    /// to chapters marked as linear reading.
856    ///
857    /// # Return
858    /// - `Some((Vec<u8>, String))`: Successfully retrieved previous chapter content data and
859    ///   the MIME type
860    /// - `None`: Already in the first chapter, the current chapter is not linear,
861    ///   or data retrieval failed
862    pub fn spine_prev(&mut self) -> Option<(Vec<u8>, String)> {
863        if self.current_spine_index == 0 || !self.spine[self.current_spine_index].linear {
864            return None;
865        }
866
867        let prev_index = (0..self.current_spine_index)
868            .rev()
869            .find(|&index| self.spine[index].linear)?;
870
871        self.current_spine_index = prev_index;
872        let manifest_id = self.spine[prev_index].idref.clone();
873        self.get_manifest_item(&manifest_id).ok()
874    }
875
876    /// Navigate to the next linear reading chapter
877    ///
878    /// This function searches forwards in the EPUB spine for the next linear reading
879    /// chapter and returns the content data of that chapter. It only navigates to
880    /// chapters marked as linear reading.
881    ///
882    /// # Return
883    /// - `Some((Vec<u8>, String))`: Successfully retrieved next chapter content data and
884    ///   the MIME type
885    /// - `None`: Already in the last chapter, the current chapter is not linear,
886    ///   or data retrieval failed
887    pub fn spine_next(&mut self) -> Option<(Vec<u8>, String)> {
888        if self.current_spine_index >= self.spine.len() - 1
889            || !self.spine[self.current_spine_index].linear
890        {
891            return None;
892        }
893
894        let next_index = (self.current_spine_index + 1..self.spine.len())
895            .find(|&index| self.spine[index].linear)?;
896
897        self.current_spine_index = next_index;
898        let manifest_id = self.spine[next_index].idref.clone();
899        self.get_manifest_item(&manifest_id).ok()
900    }
901
902    /// Retrieves the content data of the current chapter
903    ///
904    /// This function returns the content data of the chapter at the current
905    /// index position in the EPUB spine.
906    ///
907    /// # Return
908    /// - `Some((Vec<u8>, String))`: Successfully retrieved current chapter content data and
909    ///   the MIME type
910    /// - `None`: Data retrieval failed
911    pub fn spine_current(&mut self) -> Option<(Vec<u8>, String)> {
912        let manifest_id = self.spine[self.current_spine_index].idref.clone();
913        self.get_manifest_item(&manifest_id).ok()
914    }
915
916    /// Determine the EPUB version from the OPF file
917    ///
918    /// This function is used to detect the version of an epub file from an OPF file.
919    /// When the version attribute in the package is abnormal, version information will
920    /// be identified through some version characteristics of the epub file. An error is
921    /// returned when neither direct nor indirect methods can identify the version.
922    ///
923    /// # Parameters
924    /// - `opf_element`: A reference to the OPF file element
925    fn determine_epub_version(opf_element: &XmlElement) -> Result<EpubVersion, EpubError> {
926        // Check the explicit version attribute
927        if let Some(version) = opf_element.get_attr("version") {
928            match version.as_str() {
929                "2.0" => return Ok(EpubVersion::Version2_0),
930                "3.0" => return Ok(EpubVersion::Version3_0),
931                _ => {}
932            }
933        }
934
935        let spine_element = opf_element
936            .find_elements_by_name("spine")
937            .next()
938            .ok_or_else(|| EpubError::NonCanonicalFile {
939                tag: "spine".to_string(),
940            })?;
941
942        // Look for EPUB 2.x specific features
943        if spine_element.get_attr("toc").is_some() {
944            return Ok(EpubVersion::Version2_0);
945        }
946
947        let manifest_element = opf_element
948            .find_elements_by_name("manifest")
949            .next()
950            .ok_or_else(|| EpubError::NonCanonicalFile {
951                tag: "manifest".to_string(),
952            })?;
953
954        // Look for EPUB 3.x specific features
955        manifest_element
956            .children()
957            .find_map(|element| {
958                if let Some(id) = element.get_attr("id") {
959                    if id.eq("nav") {
960                        return Some(EpubVersion::Version3_0);
961                    }
962                }
963
964                None
965            })
966            .ok_or(EpubError::UnrecognizedEpubVersion)
967    }
968
969    /// Parse metadata elements under the Dublin Core namespace
970    ///
971    /// This function handles the `<metadata>` Dublin Core element in the OPF file (namespace
972    /// is "http://purl.org/dc/elements/1.1/"). These elements usually contain the basic
973    /// information of the publication, such as title, author, publication date, etc.
974    ///
975    /// # Notes
976    /// - In EPUB 3.0, granular information is handled by separate '<meta>' elements and 'refines' attributes
977    /// - All text content is normalized by whitespace
978    #[inline]
979    fn parse_dc_metadata(
980        &mut self,
981        element: &XmlElement,
982        metadata: &mut Vec<MetadataItem>,
983        // refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
984    ) -> Result<(), EpubError> {
985        let id = element.get_attr("id");
986        let lang = element.get_attr("lang");
987        let property = element.name.clone();
988        let value = element.text().normalize_whitespace();
989
990        let refined = match self.version {
991            // In EPUB 2.0, supplementary metadata (refinements) are represented
992            // through other attribute data pairs of the tag.
993            EpubVersion::Version2_0 => element
994                .attributes
995                .iter()
996                .map(|(name, value)| {
997                    let property = name.to_string();
998                    let value = value.to_string().normalize_whitespace();
999
1000                    MetadataRefinement {
1001                        refines: id.clone().unwrap(),
1002                        property,
1003                        value,
1004                        lang: None,
1005                        scheme: None,
1006                    }
1007                })
1008                .collect(),
1009            EpubVersion::Version3_0 => vec![],
1010        };
1011
1012        metadata.push(MetadataItem {
1013            id,
1014            property,
1015            value,
1016            lang,
1017            refined,
1018        });
1019
1020        Ok(())
1021    }
1022
1023    /// Parse metadata elements under the OPF namespace
1024    ///
1025    /// This function handles the `<metadata>` OPF element in the OPF file (namespace
1026    /// is "http://www.idpf.org/2007/opf"). These elements include '<meta>' and '<link>',
1027    /// which are used to provide extended metadata and links to external resources for EPUB publications.
1028    ///
1029    /// # Notes
1030    /// - The function is only responsible for distribution processing, and the
1031    ///   specific parsing logic is implemented in the dedicated function
1032    /// - All parsing results are added directly to the incoming collection and no new collection is returned
1033    #[inline]
1034    fn parse_opf_metadata(
1035        &mut self,
1036        element: &XmlElement,
1037        metadata: &mut Vec<MetadataItem>,
1038        metadata_link: &mut Vec<MetadataLinkItem>,
1039        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1040    ) -> Result<(), EpubError> {
1041        match element.name.as_str() {
1042            "meta" => self.parse_meta_element(element, metadata, refinements),
1043            "link" => self.parse_link_element(element, metadata_link),
1044            _ => Ok(()),
1045        }
1046    }
1047
1048    #[inline]
1049    fn parse_meta_element(
1050        &mut self,
1051        element: &XmlElement,
1052        metadata: &mut Vec<MetadataItem>,
1053        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1054    ) -> Result<(), EpubError> {
1055        match self.version {
1056            EpubVersion::Version2_0 => {
1057                let property =
1058                    element
1059                        .get_attr("name")
1060                        .ok_or_else(|| EpubError::NonCanonicalFile {
1061                            tag: element.tag_name(),
1062                        })?;
1063                let value = element
1064                    .get_attr("content")
1065                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
1066                        tag: element.tag_name(),
1067                        attribute: "content".to_string(),
1068                    })?
1069                    .normalize_whitespace();
1070
1071                metadata.push(MetadataItem {
1072                    id: None,
1073                    property,
1074                    value,
1075                    lang: None,
1076                    refined: vec![],
1077                });
1078            }
1079
1080            EpubVersion::Version3_0 => {
1081                let property = element.get_attr("property").ok_or_else(|| {
1082                    EpubError::MissingRequiredAttribute {
1083                        tag: element.tag_name(),
1084                        attribute: "property".to_string(),
1085                    }
1086                })?;
1087                let value = element.text().normalize_whitespace();
1088                let lang = element.get_attr("lang");
1089
1090                if let Some(refines) = element.get_attr("refines") {
1091                    let id = refines.strip_prefix("#").unwrap_or(&refines).to_string();
1092                    let scheme = element.get_attr("scheme");
1093                    let refinement = MetadataRefinement {
1094                        refines: id.clone(),
1095                        property,
1096                        value,
1097                        lang,
1098                        scheme,
1099                    };
1100
1101                    if let Some(refinements) = refinements.get_mut(&id) {
1102                        refinements.push(refinement);
1103                    } else {
1104                        refinements.insert(id, vec![refinement]);
1105                    }
1106                } else {
1107                    let id = element.get_attr("id");
1108                    let item = MetadataItem {
1109                        id,
1110                        property,
1111                        value,
1112                        lang,
1113                        refined: vec![],
1114                    };
1115
1116                    metadata.push(item);
1117                };
1118            }
1119        }
1120        Ok(())
1121    }
1122
1123    #[inline]
1124    fn parse_link_element(
1125        &mut self,
1126        element: &XmlElement,
1127        metadata_link: &mut Vec<MetadataLinkItem>,
1128    ) -> Result<(), EpubError> {
1129        let href = element
1130            .get_attr("href")
1131            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1132                tag: element.tag_name(),
1133                attribute: "href".to_string(),
1134            })?;
1135        let rel = element
1136            .get_attr("rel")
1137            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1138                tag: element.tag_name(),
1139                attribute: "rel".to_string(),
1140            })?;
1141        let hreflang = element.get_attr("hreflang");
1142        let id = element.get_attr("id");
1143        let mime = element.get_attr("media-type");
1144        let properties = element.get_attr("properties");
1145
1146        metadata_link.push(MetadataLinkItem {
1147            href,
1148            rel,
1149            hreflang,
1150            id,
1151            mime,
1152            properties,
1153            refines: None,
1154        });
1155        Ok(())
1156    }
1157
1158    /// Recursively parse NCX navigation points from navMap or nested navPoint elements
1159    ///
1160    /// This function parses the hierarchical navigation structure defined in NCX files
1161    /// for EPUB 2.x documents. It handles nested navPoint elements to build a complete
1162    /// tree representation of the publication's table of contents.
1163    fn parse_nav_points(&self, parent_element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1164        let mut nav_points = Vec::new();
1165        for nav_point in parent_element.find_children_by_name("navPoint") {
1166            let label = match nav_point.find_children_by_name("navLabel").next() {
1167                Some(element) => element.text(),
1168                None => String::new(),
1169            };
1170
1171            let content = nav_point
1172                .find_children_by_name("content")
1173                .next()
1174                .map(|element| PathBuf::from(element.text()));
1175
1176            let play_order = nav_point
1177                .get_attr("playOrder")
1178                .and_then(|order| order.parse::<usize>().ok());
1179
1180            let children = self.parse_nav_points(nav_point)?;
1181
1182            nav_points.push(NavPoint {
1183                label,
1184                content,
1185                play_order,
1186                children,
1187            });
1188        }
1189
1190        nav_points.sort();
1191        Ok(nav_points)
1192    }
1193
1194    /// Recursively parses directory list structures
1195    ///
1196    /// This function recursively parses HTML navigation list structures,
1197    /// converting `<ol>` and `<li>` elements into NavPoint structures.
1198    /// Multi-level nested directory structures are supported.
1199    fn parse_catalog_list(&self, element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1200        let mut catalog = Vec::new();
1201        for item in element.children() {
1202            if item.tag_name() != "li" {
1203                return Err(EpubError::NonCanonicalFile {
1204                    tag: "li".to_string(),
1205                });
1206            }
1207
1208            let title_element = item
1209                .find_children_by_names(&["span", "a"])
1210                .next()
1211                .ok_or_else(|| EpubError::NonCanonicalFile {
1212                    tag: "span/a".to_string(),
1213                })?;
1214            let content_href = title_element.get_attr("href").map(PathBuf::from);
1215            let sub_list = if let Some(list) = item.find_children_by_name("ol").next() {
1216                self.parse_catalog_list(list)?
1217            } else {
1218                vec![]
1219            };
1220
1221            catalog.push(NavPoint {
1222                label: title_element.text(),
1223                content: content_href,
1224                children: sub_list,
1225                play_order: None,
1226            });
1227        }
1228
1229        Ok(catalog)
1230    }
1231
1232    /// Converts relative paths in the manifest to normalized paths
1233    /// relative to the EPUB root directory
1234    ///
1235    /// This function processes the href attribute of resources in the EPUB
1236    /// manifest and converts it to a normalized path representation.
1237    /// It handles three types of paths:
1238    /// - Relative paths starting with `../` (checks if they exceed the EPUB package scope)
1239    /// - Absolute paths starting with `/` (relative to the EPUB root directory)
1240    /// - Other relative paths (relative to the directory containing the OPF file)
1241    ///
1242    /// # Parameters
1243    /// - `path`: The href attribute value of the resource in the manifest
1244    ///
1245    /// # Return
1246    /// - `Ok(PathBuf)`: The parsed normalized path
1247    /// - `Err(EpubError)`: Relative link leakage
1248    #[inline]
1249    fn normalize_manifest_path(&self, path: &str) -> Result<PathBuf, EpubError> {
1250        let mut path = if path.starts_with("../") {
1251            let mut current_dir = self.epub_path.join(&self.package_path);
1252            current_dir.pop();
1253
1254            check_realtive_link_leakage(self.epub_path.clone(), current_dir, path)
1255                .map(PathBuf::from)
1256                .ok_or_else(|| EpubError::RealtiveLinkLeakage {
1257                    path: path.to_string(),
1258                })?
1259        } else if let Some(path) = path.strip_prefix("/") {
1260            PathBuf::from(path.to_string())
1261        } else {
1262            self.base_path.join(path)
1263        };
1264
1265        #[cfg(windows)]
1266        {
1267            path = PathBuf::from(path.to_string_lossy().replace('\\', "/"));
1268        }
1269
1270        Ok(path)
1271    }
1272
1273    /// Verify the fallback chain of all manifest items
1274    ///
1275    /// This function iterates through all manifest items with the fallback
1276    /// attribute and verifies the validity of their fallback chains, including checking:
1277    /// - Whether circular references exist
1278    /// - Whether the fallback resource exists in the manifest
1279    ///
1280    /// # Notes
1281    /// If an invalid fallback chain is found, a warning log will be logged
1282    /// but the processing flow will not be interrupted.
1283    fn validate_fallback_chains(&self) {
1284        for (id, item) in &self.manifest {
1285            if item.fallback.is_none() {
1286                continue;
1287            }
1288
1289            let mut fallback_chain = Vec::new();
1290            if let Err(msg) = self.validate_fallback_chain(id, &mut fallback_chain) {
1291                warn!("Invalid fallback chain for item {}: {}", id, msg);
1292            }
1293        }
1294    }
1295
1296    /// Recursively verify the validity of a single fallback chain
1297    ///
1298    /// This function recursively traces the fallback chain to check for the following issues:
1299    /// - Circular reference
1300    /// - The referenced fallback resource does not exist
1301    ///
1302    /// # Parameters
1303    /// - `manifest_id`: The id of the manifest item currently being verified
1304    /// - `fallback_chain`: The visited fallback chain paths used to detect circular references
1305    ///
1306    /// # Return
1307    /// - `Ok(())`: The fallback chain is valid
1308    /// - `Err(String)`: A string containing error information
1309    fn validate_fallback_chain(
1310        &self,
1311        manifest_id: &str,
1312        fallback_chain: &mut Vec<String>,
1313    ) -> Result<(), String> {
1314        if fallback_chain.contains(&manifest_id.to_string()) {
1315            fallback_chain.push(manifest_id.to_string());
1316
1317            return Err(format!(
1318                "Circular reference detected in fallback chain for {}",
1319                fallback_chain.join("->")
1320            ));
1321        }
1322
1323        // Get the current item; its existence can be ensured based on the calling context.
1324        let item = self.manifest.get(manifest_id).unwrap();
1325
1326        if let Some(fallback_id) = &item.fallback {
1327            if !self.manifest.contains_key(fallback_id) {
1328                return Err(format!(
1329                    "Fallback resource {} does not exist in manifest",
1330                    fallback_id
1331                ));
1332            }
1333
1334            fallback_chain.push(manifest_id.to_string());
1335            self.validate_fallback_chain(fallback_id, fallback_chain)
1336        } else {
1337            // The end of the fallback chain
1338            Ok(())
1339        }
1340    }
1341
1342    /// Checks if a resource at the specified path is an encrypted file
1343    ///
1344    /// This function queries whether a specific resource path is marked as an encrypted
1345    /// file in the EPUB encryption information. It checks the encrypted data stored in
1346    /// `self.encryption`, looking for an entry that matches the given path.
1347    ///
1348    /// # Parameters
1349    /// - `path`: The path of the resource to check
1350    ///
1351    /// # Return
1352    /// - `Some(String)`: The encryption method used for the resource
1353    /// - `None`: The resource is not encrypted
1354    fn is_encryption_file(&self, path: &str) -> Option<String> {
1355        self.encryption.as_ref().and_then(|encryptions| {
1356            encryptions
1357                .iter()
1358                .find(|encryption| encryption.data == path)
1359                .map(|encryption| encryption.method.clone())
1360        })
1361    }
1362
1363    /// Automatically decrypts encrypted resource data
1364    ///
1365    /// Automatically decrypts data based on the provided encryption method.
1366    /// This function supports various encryption methods defined by the EPUB
1367    /// specification, including font obfuscation and the XML encryption standard.
1368    ///
1369    /// # Parameters
1370    /// - `method`: The encryption method used for the resource
1371    /// - `data`: The encrypted resource data
1372    ///
1373    /// # Return
1374    /// - `Ok(Vec<u8>)`: The decrypted resource data
1375    /// - `Err(EpubError)`: Unsupported encryption method
1376    ///
1377    /// # Supported Encryption Methods
1378    /// - IDPF font obfuscation: `http://www.idpf.org/2008/embedding`
1379    /// - Adobe font obfuscation: `http://ns.adobe.com/pdf/enc#RC`
1380    #[inline]
1381    fn auto_dencrypt(&self, method: &str, data: &mut [u8]) -> Result<Vec<u8>, EpubError> {
1382        match method {
1383            "http://www.idpf.org/2008/embedding" => {
1384                Ok(idpf_font_dencryption(data, &self.unique_identifier))
1385            }
1386            "http://ns.adobe.com/pdf/enc#RC" => {
1387                Ok(adobe_font_dencryption(data, &self.unique_identifier))
1388            }
1389            _ => Err(EpubError::UnsupportedEncryptedMethod {
1390                method: method.to_string(),
1391            }),
1392        }
1393    }
1394}
1395
1396impl EpubDoc<BufReader<File>> {
1397    /// Creates a new EPUB document instance
1398    ///
1399    /// This function is a convenience constructor for `EpubDoc`,
1400    /// used to create an EPUB parser instance directly from a file path.
1401    ///
1402    /// # Parameters
1403    /// - `path`: The path to the EPUB file
1404    ///
1405    /// # Return
1406    /// - `Ok(EpubDoc)`: The created EPUB document instance
1407    /// - `Err(EpubError)`: An error occurred during initialization
1408    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, EpubError> {
1409        let file = File::open(&path).map_err(EpubError::from)?;
1410        let path = canonicalize(path)?;
1411
1412        Self::from_reader(BufReader::new(file), path)
1413    }
1414}
1415
1416#[cfg(test)]
1417mod tests {
1418    use std::path::Path;
1419
1420    use crate::epub::EpubDoc;
1421
1422    /// Section 3.3 package documents
1423    mod package_documents_tests {
1424        use std::path::Path;
1425
1426        use crate::epub::{EpubDoc, EpubVersion};
1427
/// ID: pkg-collections-unknown
///
/// The package document declares a collection whose role is unknown.
/// The reading system must still open the EPUB successfully.
#[test]
fn test_pkg_collections_unknown() {
    let opened = EpubDoc::new(Path::new("./test_case/pkg-collections-unknown.epub"));

    assert!(opened.is_ok());
}
1437
/// ID: pkg-creator-order
///
/// The package document lists several creators. The reading system must not
/// display them out of order (but it may display only the first).
#[test]
fn test_pkg_creator_order() {
    let opened = EpubDoc::new(Path::new("./test_case/pkg-creator-order.epub"));
    assert!(opened.is_ok());
    let doc = opened.unwrap();

    let lookup = doc.get_metadata_value("creator");
    assert!(lookup.is_some());
    let creators = lookup.unwrap();

    // Creators in the order the test above expects them to be reported
    let expected = vec![
        "Dave Cramer",
        "Wendy Reid",
        "Dan Lazin",
        "Ivan Herman",
        "Brady Duga",
    ];
    assert_eq!(creators.len(), 5);
    assert_eq!(creators, expected);
}
1464
/// ID: pkg-manifest-unknown
///
/// The package document contains a manifest item with unknown properties.
/// The reading system must still open the EPUB successfully.
#[test]
fn test_pkg_manifest_order() {
    let opened = EpubDoc::new(Path::new("./test_case/pkg-manifest-unknown.epub"));
    assert!(opened.is_ok());
    let mut doc = opened.unwrap();

    assert_eq!(doc.manifest.len(), 2);

    // The two declared items can be read; an undeclared ID cannot
    let nav = doc.get_manifest_item("nav");
    assert!(nav.is_ok());
    let first_content = doc.get_manifest_item("content_001");
    assert!(first_content.is_ok());
    let undeclared = doc.get_manifest_item("content_002");
    assert!(undeclared.is_err());
}
1480
/// ID: pkg-meta-unknown
///
/// The package document contains a meta tag with an unknown property.
/// The reading system must still open the EPUB successfully.
#[test]
fn test_pkg_meta_unknown() {
    let opened = EpubDoc::new(Path::new("./test_case/pkg-meta-unknown.epub"));
    assert!(opened.is_ok());
    let doc = opened.unwrap();

    let referenced_by = doc.get_metadata_value("dcterms:isReferencedBy");
    assert!(referenced_by.is_some());
    let referenced_by = referenced_by.unwrap();
    assert_eq!(referenced_by.len(), 1);
    assert_eq!(
        referenced_by,
        vec!["https://www.w3.org/TR/epub-rs/#confreq-rs-pkg-meta-unknown"]
    );

    let modified = doc.get_metadata_value("dcterms:modified");
    assert!(modified.is_some());
    let modified = modified.unwrap();
    assert_eq!(modified.len(), 1);
    assert_eq!(modified, vec!["2021-01-11T00:00:00Z"]);

    // A property the test expects to be absent yields no value
    let title = doc.get_metadata_value("dcterms:title");
    assert!(title.is_none());
}
1509
/// ID: pkg-meta-whitespace
///
/// The package document's title and creator contain leading and trailing spaces
/// along with excess internal whitespace. The reading system must render only a
/// single space in all cases.
#[test]
fn test_pkg_meta_white_space() {
    let opened = EpubDoc::new(Path::new("./test_case/pkg-meta-whitespace.epub"));
    assert!(opened.is_ok());
    let doc = opened.unwrap();

    let creator = doc.get_metadata_value("creator");
    assert!(creator.is_some());
    let creator = creator.unwrap();
    assert_eq!(creator.len(), 1);
    assert_eq!(creator, vec!["Dave Cramer"]);

    let description = doc.get_metadata_value("description");
    assert!(description.is_some());
    let description = description.unwrap();
    assert_eq!(description.len(), 1);
    assert_eq!(
        description,
        vec![
            "The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases."
        ]
    );
}
1537
1538        /// ID: pkg-spine-duplicate-item-hyperlink
1539        ///
1540        /// The spine contains several references to the same content document. The reading system must move to the position of the first duplicate in the reading order when following a hyperlink.
1541        #[test]
1542        fn test_pkg_spine_duplicate_item_hyperlink() {
1543            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-hyperlink.epub");
1544            let doc = EpubDoc::new(epub_file);
1545            assert!(doc.is_ok());
1546
1547            let mut doc = doc.unwrap();
1548            assert_eq!(doc.spine.len(), 4);
1549            assert_eq!(
1550                doc.navigate_by_spine_index(0).unwrap(),
1551                doc.get_manifest_item("content_001").unwrap()
1552            );
1553            assert_eq!(
1554                doc.navigate_by_spine_index(1).unwrap(),
1555                doc.get_manifest_item("content_002").unwrap()
1556            );
1557            assert_eq!(
1558                doc.navigate_by_spine_index(2).unwrap(),
1559                doc.get_manifest_item("content_002").unwrap()
1560            );
1561            assert_eq!(
1562                doc.navigate_by_spine_index(3).unwrap(),
1563                doc.get_manifest_item("content_002").unwrap()
1564            );
1565        }
1566
1567        /// ID: pkg-spine-duplicate-item-rendering
1568        ///
1569        /// The spine contains several references to the same content document. The reading system must not skip the duplicates when rendering the reading order.
1570        #[test]
1571        fn test_pkg_spine_duplicate_item_rendering() {
1572            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-rendering.epub");
1573            let doc = EpubDoc::new(epub_file);
1574            assert!(doc.is_ok());
1575
1576            let mut doc = doc.unwrap();
1577            assert_eq!(doc.spine.len(), 4);
1578
1579            let result = doc.spine_prev();
1580            assert!(result.is_none());
1581
1582            let result = doc.spine_next();
1583            assert!(result.is_some());
1584
1585            doc.spine_next();
1586            doc.spine_next();
1587            let result = doc.spine_next();
1588            assert!(result.is_none());
1589        }
1590
1591        /// ID: pkg-spine-nonlinear-activation
1592        ///
1593        /// An itemref in the spine is marked as non-linear. Although it (possibly) cannot be accessed through the table of contents, it can be reached from a link in the XHTML content.
1594        #[test]
1595        fn test_pkg_spine_nonlinear_activation() {
1596            let epub_file = Path::new("./test_case/pkg-spine-nonlinear-activation.epub");
1597            let doc = EpubDoc::new(epub_file);
1598            assert!(doc.is_ok());
1599
1600            let mut doc = doc.unwrap();
1601            assert!(doc.spine_prev().is_none());
1602            assert!(doc.spine_next().is_none());
1603
1604            assert!(doc.navigate_by_spine_index(1).is_some());
1605            assert!(doc.spine_prev().is_none());
1606            assert!(doc.spine_next().is_none());
1607        }
1608
1609        /// ID: pkg-spine-order
1610        ///
1611        /// Basic test of whether a reading system can display spine items in the correct order. The test fails if the reading system presents content in the order in which the file names sort, or if it presents files in manifest order rather than spine order.
1612        #[test]
1613        fn test_pkg_spine_order() {
1614            let epub_file = Path::new("./test_case/pkg-spine-order.epub");
1615            let doc = EpubDoc::new(epub_file);
1616            assert!(doc.is_ok());
1617
1618            let doc = doc.unwrap();
1619            assert_eq!(doc.spine.len(), 4);
1620            assert_eq!(
1621                doc.spine
1622                    .iter()
1623                    .map(|item| item.idref.clone())
1624                    .collect::<Vec<String>>(),
1625                vec![
1626                    "d-content_001",
1627                    "c-content_002",
1628                    "b-content_003",
1629                    "a-content_004",
1630                ]
1631            );
1632        }
1633
1634        /// ID: pkg-spine-order-svg
1635        ///
1636        /// Basic test of whether a reading system can display SVG spine items in the correct order.
1637        #[test]
1638        fn test_spine_order_svg() {
1639            let epub_file = Path::new("./test_case/pkg-spine-order-svg.epub");
1640            let doc = EpubDoc::new(epub_file);
1641            assert!(doc.is_ok());
1642
1643            let mut doc = doc.unwrap();
1644            assert_eq!(doc.spine.len(), 4);
1645
1646            loop {
1647                if let Some(spine) = doc.spine_next() {
1648                    let idref = doc.spine[doc.current_spine_index].idref.clone();
1649                    let resource = doc.get_manifest_item(&idref);
1650                    assert!(resource.is_ok());
1651
1652                    let resource = resource.unwrap();
1653                    assert_eq!(spine, resource);
1654                } else {
1655                    break;
1656                }
1657            }
1658
1659            assert_eq!(doc.current_spine_index, 3);
1660        }
1661
1662        /// ID: pkg-spine-unknown
1663        ///
1664        /// The package document contains a spine item with unknown properties. The reading system must open the EPUB successfully.
1665        #[test]
1666        fn test_pkg_spine_unknown() {
1667            let epub_file = Path::new("./test_case/pkg-spine-unknown.epub");
1668            let doc = EpubDoc::new(epub_file);
1669            assert!(doc.is_ok());
1670
1671            let doc = doc.unwrap();
1672            assert_eq!(doc.spine.len(), 1);
1673            assert_eq!(doc.spine[0].idref, "content_001");
1674            assert_eq!(doc.spine[0].id, None);
1675            assert_eq!(doc.spine[0].linear, true);
1676            assert_eq!(doc.spine[0].properties, Some("untrustworthy".to_string()));
1677        }
1678
1679        /// ID: pkg-title-order
1680        ///
1681        /// Several titles are listed in the package document. The reading system must use the first title (and whether to use other titles is not defined).
1682        #[test]
1683        fn test_pkg_title_order() {
1684            let epub_file = Path::new("./test_case/pkg-title-order.epub");
1685            let doc = EpubDoc::new(epub_file);
1686            assert!(doc.is_ok());
1687
1688            let doc = doc.unwrap();
1689            let title_list = doc.get_title();
1690            assert!(title_list.is_ok());
1691
1692            let title_list = title_list.unwrap();
1693            assert_eq!(title_list.len(), 6);
1694            assert_eq!(
1695                title_list,
1696                vec![
1697                    "pkg-title-order",
1698                    "This title must not display first",
1699                    "Also, this title must not display first",
1700                    "This title also must not display first",
1701                    "This title must also not display first",
1702                    "This title must not display first, also",
1703                ]
1704            );
1705        }
1706
1707        /// ID: pkg-unique-id
1708        ///
1709        /// The package document's dc:identifier is identical across two publications. The reading system should display both publications independently.
1710        #[test]
1711        fn test_pkg_unique_id() {
1712            let epub_file = Path::new("./test_case/pkg-unique-id.epub");
1713            let doc_1 = EpubDoc::new(epub_file);
1714            assert!(doc_1.is_ok());
1715
1716            let epub_file = Path::new("./test_case/pkg-unique-id_duplicate.epub");
1717            let doc_2 = EpubDoc::new(epub_file);
1718            assert!(doc_2.is_ok());
1719
1720            let doc_1 = doc_1.unwrap();
1721            let doc_2 = doc_2.unwrap();
1722
1723            assert_eq!(
1724                doc_1.get_identifier().unwrap(),
1725                doc_2.get_identifier().unwrap()
1726            );
1727            assert_eq!(doc_1.unique_identifier, "pkg-unique-id");
1728            assert_eq!(doc_2.unique_identifier, "pkg-unique-id");
1729        }
1730
1731        /// ID: pkg-version-backward
1732        ///
1733        /// “Reading Systems MUST attempt to process an EPUB Publication whose Package Document version attribute is less than "3.0"”. This is an EPUB with package version attribute set to "0", to see if a reading system will open it.
1734        #[test]
1735        fn test_pkg_version_backward() {
1736            let epub_file = Path::new("./test_case/pkg-version-backward.epub");
1737            let doc = EpubDoc::new(epub_file);
1738            assert!(doc.is_ok());
1739
1740            let doc = doc.unwrap();
1741            assert_eq!(doc.version, EpubVersion::Version3_0);
1742        }
1743
1744        /// ID: pkg-linked-records
1745        ///
1746        /// Reading System must process and display the title and creator metadata from the package document. An ONIX 3.0 format linked metadata record exists, but contains neither title nor creator metadata.
1747        #[test]
1748        fn test_pkg_linked_records() {
1749            let epub_file = Path::new("./test_case/pkg-linked-records.epub");
1750            let doc = EpubDoc::new(epub_file);
1751            assert!(doc.is_ok());
1752
1753            let doc = doc.unwrap();
1754            assert_eq!(doc.metadata_link.len(), 3);
1755
1756            let item = doc.metadata_link.iter().find(|&item| {
1757                if let Some(properties) = &item.properties {
1758                    properties.eq("onix")
1759                } else {
1760                    false
1761                }
1762            });
1763            assert!(item.is_some());
1764        }
1765
1766        /// ID: pkg-manifest-unlisted-resource
1767        ///
1768        /// The XHTML content references an image that does not appear in the manifest. The image should not be shown.
1769        #[test]
1770        fn test_pkg_manifest_unlisted_resource() {
1771            let epub_file = Path::new("./test_case/pkg-manifest-unlisted-resource.epub");
1772            let doc = EpubDoc::new(epub_file);
1773            assert!(doc.is_ok());
1774
1775            let mut doc = doc.unwrap();
1776            assert!(
1777                doc.get_manifest_item_by_path("EPUB/content_001.xhtml")
1778                    .is_ok()
1779            );
1780
1781            assert!(doc.get_manifest_item_by_path("EPUB/red.png").is_err());
1782            let err = doc.get_manifest_item_by_path("EPUB/red.png").unwrap_err();
1783            assert_eq!(
1784                err.to_string(),
1785                "Resource not found: Unable to find resource from \"EPUB/red.png\"."
1786            );
1787        }
1788    }
1789
1790    /// Section 3.4 manifest fallbacks
1791    ///
1792    /// The tests under this module seem to favor the reading system rather than the EPUB format itself
1793    mod manifest_fallbacks_tests {
1794        use std::path::Path;
1795
1796        use crate::epub::EpubDoc;
1797
1798        /// ID: pub-foreign_bad-fallback
1799        ///
1800        /// This is a test of manifest fallbacks where both the spine item and the fallback are likely to be unsupported. The spine item is a DMG, with a fallback to a PSD file. Reading systems may raise an error on the ingenstion workflow.
1801        #[test]
1802        fn test_pub_foreign_bad_fallback() {
1803            let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
1804            let doc = EpubDoc::new(epub_file);
1805            assert!(doc.is_ok());
1806
1807            let mut doc = doc.unwrap();
1808            assert!(doc.get_manifest_item("content_001").is_ok());
1809            assert!(doc.get_manifest_item("bar").is_ok());
1810
1811            assert_eq!(
1812                doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
1813                    .unwrap_err()
1814                    .to_string(),
1815                "No supported file format: The fallback resource does not contain the file format you support."
1816            );
1817        }
1818
1819        /// ID: pub-foreign_image
1820        ///
1821        /// An HTML content file contains a PSD image, with a manifest fallback to a PNG image. This tests fallbacks for resources that are not in the spine.
1822        #[test]
1823        fn test_pub_foreign_image() {
1824            let epub_file = Path::new("./test_case/pub-foreign_image.epub");
1825            let doc = EpubDoc::new(epub_file);
1826            assert!(doc.is_ok());
1827
1828            let mut doc = doc.unwrap();
1829            let result = doc.get_manifest_item_with_fallback(
1830                "image-tiff",
1831                vec!["image/png", "application/xhtml+xml"],
1832            );
1833            assert!(result.is_ok());
1834
1835            let (_, mime) = result.unwrap();
1836            assert_eq!(mime, "image/png");
1837        }
1838
1839        /// ID: pub-foreign_json-spine
1840        ///
1841        /// This EPUB uses a JSON content file in the spine, with a manifest fallback to an HTML document. If the reading system does not support JSON, it should display the HTML.
1842        #[test]
1843        fn test_pub_foreign_json_spine() {
1844            let epub_file = Path::new("./test_case/pub-foreign_json-spine.epub");
1845            let doc = EpubDoc::new(epub_file);
1846            assert!(doc.is_ok());
1847
1848            let mut doc = doc.unwrap();
1849            let result = doc.get_manifest_item_with_fallback(
1850                "content_primary",
1851                vec!["application/xhtml+xml", "application/json"],
1852            );
1853            assert!(result.is_ok());
1854            let (_, mime) = result.unwrap();
1855            assert_eq!(mime, "application/json");
1856
1857            let result = doc
1858                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1859            assert!(result.is_ok());
1860            let (_, mime) = result.unwrap();
1861            assert_eq!(mime, "application/xhtml+xml");
1862        }
1863
1864        /// ID: pub-foreign_xml-spine
1865        ///
1866        /// This EPUB uses an ordinary XML content file with mimetype application/xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1867        #[test]
1868        fn test_pub_foreign_xml_spine() {
1869            let epub_file = Path::new("./test_case/pub-foreign_xml-spine.epub");
1870            let doc = EpubDoc::new(epub_file);
1871            assert!(doc.is_ok());
1872
1873            let mut doc = doc.unwrap();
1874            let result = doc.get_manifest_item_with_fallback(
1875                "content_primary",
1876                vec!["application/xhtml+xml", "application/xml"],
1877            );
1878            assert!(result.is_ok());
1879            let (_, mime) = result.unwrap();
1880            assert_eq!(mime, "application/xml");
1881
1882            let result = doc
1883                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1884            assert!(result.is_ok());
1885            let (_, mime) = result.unwrap();
1886            assert_eq!(mime, "application/xhtml+xml");
1887        }
1888
1889        /// ID: pub-foreign_xml-suffix-spine
1890        ///
1891        /// This EPUB uses an custom XML content file with mimetype application/dtc+xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1892        #[test]
1893        fn test_pub_foreign_xml_suffix_spine() {
1894            let epub_file = Path::new("./test_case/pub-foreign_xml-suffix-spine.epub");
1895            let doc = EpubDoc::new(epub_file);
1896            assert!(doc.is_ok());
1897
1898            let mut doc = doc.unwrap();
1899            let result = doc.get_manifest_item_with_fallback(
1900                "content_primary",
1901                vec!["application/xhtml+xml", "application/dtc+xml"],
1902            );
1903            assert!(result.is_ok());
1904            let (_, mime) = result.unwrap();
1905            assert_eq!(mime, "application/dtc+xml");
1906
1907            let result = doc
1908                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1909            assert!(result.is_ok());
1910            let (_, mime) = result.unwrap();
1911            assert_eq!(mime, "application/xhtml+xml");
1912        }
1913    }
1914
1915    /// Section 3.9 open container format
1916    mod open_container_format_tests {
1917        use std::{cmp::min, io::Read, path::Path};
1918
1919        use sha1::{Digest, Sha1};
1920
1921        use crate::epub::EpubDoc;
1922
1923        /// ID: ocf-metainf-inc
1924        ///
1925        /// An extra configuration file, not in the reserved files' list, is added to the META-INF folder; this file must be ignored.
1926        #[test]
1927        fn test_ocf_metainf_inc() {
1928            let epub_file = Path::new("./test_case/ocf-metainf-inc.epub");
1929            let doc = EpubDoc::new(epub_file);
1930            assert!(doc.is_ok());
1931        }
1932
1933        /// ID: ocf-metainf-manifest
1934        ///
1935        /// An ancillary manifest file, containing an extra spine item, is present in the META-INF directory; this extra item must be ignored by the reading system.
1936        #[test]
1937        fn test_ocf_metainf_manifest() {
1938            let epub_file = Path::new("./test_case/ocf-metainf-manifest.epub");
1939            let doc = EpubDoc::new(epub_file);
1940            assert!(doc.is_ok());
1941        }
1942
1943        /// ID: ocf-package_arbitrary
1944        ///
1945        /// The EPUB contains three valid package files and three corresponding sets of content documents, but only one of the packages, in an unusual subdirectory, is referenced by the container.xml file. The reading system must use this package.
1946        #[test]
1947        fn test_ocf_package_arbitrary() {
1948            let epub_file = Path::new("./test_case/ocf-package_arbitrary.epub");
1949            let doc = EpubDoc::new(epub_file);
1950            assert!(doc.is_ok());
1951
1952            let doc = doc.unwrap();
1953            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
1954        }
1955
1956        /// ID: ocf-package_multiple
1957        ///
1958        /// The EPUB contains three valid package files and three corresponding sets of content documents, all referenced by the container.xml file. The reading system must use the first package.
1959        #[test]
1960        fn test_ocf_package_multiple() {
1961            let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
1962            let doc = EpubDoc::new(epub_file);
1963            assert!(doc.is_ok());
1964
1965            let doc = doc.unwrap();
1966            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
1967            assert_eq!(doc.base_path, Path::new("FOO/BAR"));
1968        }
1969
1970        /// ID: ocf-url_link-leaking-relative
1971        ///
1972        /// Use a relative link with several double-dot path segments from the content to a photograph. The folder hierarchy containing the photograph starts at the root level; the relative image reference exceeds depth of hierarchy.
1973        #[test]
1974        fn test_ocf_url_link_leaking_relative() {
1975            let epub_file = Path::new("./test_case/ocf-url_link-leaking-relative.epub");
1976            let doc = EpubDoc::new(epub_file);
1977            assert!(doc.is_err());
1978            assert_eq!(
1979                doc.err().unwrap().to_string(),
1980                String::from(
1981                    "Relative link leakage: Path \"../../../../media/imgs/monastery.jpg\" is out of container range."
1982                )
1983            )
1984        }
1985
1986        /// ID: ocf-url_link-path-absolute
1987        ///
1988        /// Use a path-absolute link, i.e., beginning with a leading slash, from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
1989        #[test]
1990        fn test_ocf_url_link_path_absolute() {
1991            let epub_file = Path::new("./test_case/ocf-url_link-path-absolute.epub");
1992            let doc = EpubDoc::new(epub_file);
1993            assert!(doc.is_ok());
1994
1995            let doc = doc.unwrap();
1996            let resource = doc.manifest.get("photo").unwrap();
1997            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
1998        }
1999
2000        /// ID: ocf-url_link-relative
2001        ///
2002        /// A simple relative link from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2003        #[test]
2004        fn test_ocf_url_link_relative() {
2005            let epub_file = Path::new("./test_case/ocf-url_link-relative.epub");
2006            let doc = EpubDoc::new(epub_file);
2007            assert!(doc.is_ok());
2008
2009            let doc = doc.unwrap();
2010            let resource = doc.manifest.get("photo").unwrap();
2011            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2012        }
2013
2014        /// ID: ocf-url_manifest
2015        ///
2016        /// The manifest refers to an XHTML file in an arbitrary subfolder. The reading system must be able to find the content.
2017        #[test]
2018        fn test_ocf_url_manifest() {
2019            let epub_file = Path::new("./test_case/ocf-url_manifest.epub");
2020            let doc = EpubDoc::new(epub_file);
2021            assert!(doc.is_ok());
2022
2023            let mut doc = doc.unwrap();
2024            assert!(doc.get_manifest_item("nav").is_ok());
2025            assert!(doc.get_manifest_item("content_001").is_ok());
2026            assert!(doc.get_manifest_item("content_002").is_err());
2027        }
2028
2029        /// ID: ocf-url_relative
2030        ///
2031        /// The manifest refers to an XHTML file in an arbitrary subfolder that is relative to the package's own arbitrary folder. The reading system must be able to find the content.
2032        #[test]
2033        fn test_ocf_url_relative() {
2034            let epub_file = Path::new("./test_case/ocf-url_relative.epub");
2035            let doc = EpubDoc::new(epub_file);
2036            assert!(doc.is_ok());
2037
2038            let mut doc = doc.unwrap();
2039            assert_eq!(doc.package_path, Path::new("foo/BAR/baz.opf"));
2040            assert_eq!(doc.base_path, Path::new("foo/BAR"));
2041            assert_eq!(
2042                doc.manifest.get("nav").unwrap().path,
2043                Path::new("foo/BAR/nav.xhtml")
2044            );
2045            assert_eq!(
2046                doc.manifest.get("content_001").unwrap().path,
2047                Path::new("foo/BAR/qux/content_001.xhtml")
2048            );
2049            assert!(doc.get_manifest_item("nav").is_ok());
2050            assert!(doc.get_manifest_item("content_001").is_ok());
2051        }
2052
2053        /// ID: ocf-zip-comp
2054        ///
2055        /// MUST treat any OCF ZIP container that uses compression techniques other than Deflate as in error.
2056        /// This test case does not use compression methods other than Deflate and cannot detect whether it is effective.
2057        #[test]
2058        fn test_ocf_zip_comp() {
2059            let epub_file = Path::new("./test_case/ocf-zip-comp.epub");
2060            let doc = EpubDoc::new(epub_file);
2061            assert!(doc.is_ok());
2062        }
2063
2064        /// ID: ocf-zip-mult
2065        ///
2066        /// MUST treat any OCF ZIP container that splits the content into segments as in error.
2067        /// This test case is not a segmented OCF ZIP container and cannot be tested to see if it is valid.
2068        #[test]
2069        fn test_ocf_zip_mult() {
2070            let epub_file = Path::new("./test_case/ocf-zip-mult.epub");
2071            let doc = EpubDoc::new(epub_file);
2072            assert!(doc.is_ok());
2073        }
2074
2075        /// ID: ocf-font_obfuscation
2076        ///
2077        /// An obfuscated (TrueType) font should be displayed after de-obfuscation.
2078        #[test]
2079        fn test_ocf_font_obfuscation() {
2080            let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2081            let doc = EpubDoc::new(epub_file);
2082            assert!(doc.is_ok());
2083
2084            let mut doc = doc.unwrap();
2085            let unique_id = doc.unique_identifier.clone();
2086
2087            let mut hasher = Sha1::new();
2088            hasher.update(unique_id.as_bytes());
2089            let hash = hasher.finalize();
2090            let mut key = vec![0u8; 1040];
2091            for i in 0..1040 {
2092                key[i] = hash[i % hash.len()];
2093            }
2094
2095            assert!(doc.encryption.is_some());
2096            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2097
2098            let data = &doc.encryption.unwrap()[0];
2099            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2100
2101            let font_file = doc
2102                .archive
2103                .by_name(&data.data)
2104                .unwrap()
2105                .bytes()
2106                .collect::<Result<Vec<u8>, _>>();
2107            assert!(font_file.is_ok());
2108            let font_file = font_file.unwrap();
2109
2110            // 根据EPUB规范，字体混淆是直接对字体文件进行的，不需要解压步骤，直接进行去混淆处理
2111            let mut deobfuscated = font_file.clone();
2112            for i in 0..min(1040, deobfuscated.len()) {
2113                deobfuscated[i] ^= key[i];
2114            }
2115
2116            assert!(is_valid_font(&deobfuscated));
2117        }
2118
2119        /// ID: ocf-font_obfuscation-bis
2120        ///
2121        /// An obfuscated (TrueType) font should not be displayed after de-obfuscation, because the obfuscation used a different publication id.
2122        #[test]
2123        fn test_ocf_font_obfuscation_bis() {
2124            let epub_file = Path::new("./test_case/ocf-font_obfuscation_bis.epub");
2125            let doc = EpubDoc::new(epub_file);
2126            assert!(doc.is_ok());
2127
2128            let mut doc = doc.unwrap();
2129
2130            let wrong_unique_id = "wrong-publication-id";
2131            let mut hasher = Sha1::new();
2132            hasher.update(wrong_unique_id.as_bytes());
2133            let hash = hasher.finalize();
2134            let mut wrong_key = vec![0u8; 1040];
2135            for i in 0..1040 {
2136                wrong_key[i] = hash[i % hash.len()];
2137            }
2138
2139            assert!(doc.encryption.is_some());
2140            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2141
2142            let data = &doc.encryption.unwrap()[0];
2143            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2144
2145            let font_file = doc
2146                .archive
2147                .by_name(&data.data)
2148                .unwrap()
2149                .bytes()
2150                .collect::<Result<Vec<u8>, _>>();
2151            assert!(font_file.is_ok());
2152            let font_file = font_file.unwrap();
2153
2154            // 使用错误的密钥进行去混淆
2155            let mut deobfuscated_with_wrong_key = font_file.clone();
2156            for i in 0..std::cmp::min(1040, deobfuscated_with_wrong_key.len()) {
2157                deobfuscated_with_wrong_key[i] ^= wrong_key[i];
2158            }
2159
2160            assert!(!is_valid_font(&deobfuscated_with_wrong_key));
2161        }
2162
2163        fn is_valid_font(data: &[u8]) -> bool {
2164            if data.len() < 4 {
2165                return false;
2166            }
2167            let sig = &data[0..4];
2168            // OTF: "OTTO"
2169            // TTF: 0x00010000, 0x00020000, "true", "typ1"
2170            sig == b"OTTO"
2171                || sig == b"\x00\x01\x00\x00"
2172                || sig == b"\x00\x02\x00\x00"
2173                || sig == b"true"
2174                || sig == b"typ1"
2175        }
2176    }
2177
2178    /// Test for function `has_encryption`
2179    #[test]
2180    fn test_fn_has_encryption() {
2181        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2182        let doc = EpubDoc::new(epub_file);
2183        assert!(doc.is_ok());
2184
2185        let mut doc = doc.unwrap();
2186        assert!(doc.has_encryption());
2187    }
2188
2189    /// This test is used to detect whether the "META-INF/encryption.xml" file is parsed correctly
2190    #[test]
2191    fn test_fn_parse_encryption() {
2192        let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2193        let doc = EpubDoc::new(epub_file);
2194        assert!(doc.is_ok());
2195
2196        let doc = doc.unwrap();
2197        assert!(doc.encryption.is_some());
2198
2199        let encryption = doc.encryption.unwrap();
2200        assert_eq!(encryption.len(), 1);
2201        assert_eq!(encryption[0].method, "http://www.idpf.org/2008/embedding");
2202        assert_eq!(encryption[0].data, "EPUB/fonts/Lobster.ttf");
2203    }
2204
2205    #[test]
2206    fn test_get_metadata_existing_key() {
2207        let epub_file = Path::new("./test_case/epub-33.epub");
2208        let doc = EpubDoc::new(epub_file);
2209        assert!(doc.is_ok());
2210
2211        let doc = doc.unwrap();
2212
2213        let titles = doc.get_metadata("title");
2214        assert!(titles.is_some());
2215
2216        let titles = titles.unwrap();
2217        assert_eq!(titles.len(), 1);
2218        assert_eq!(titles[0].property, "title");
2219        assert_eq!(titles[0].value, "EPUB 3.3");
2220
2221        let languages = doc.get_metadata("language");
2222        assert!(languages.is_some());
2223
2224        let languages = languages.unwrap();
2225        assert_eq!(languages.len(), 1);
2226        assert_eq!(languages[0].property, "language");
2227        assert_eq!(languages[0].value, "en-us");
2228    }
2229
2230    #[test]
2231    fn test_get_metadata_nonexistent_key() {
2232        let epub_file = Path::new("./test_case/epub-33.epub");
2233        let doc = EpubDoc::new(epub_file);
2234        assert!(doc.is_ok());
2235
2236        let doc = doc.unwrap();
2237        let metadata = doc.get_metadata("nonexistent");
2238        assert!(metadata.is_none());
2239    }
2240
2241    #[test]
2242    fn test_get_metadata_multiple_items_same_type() {
2243        let epub_file = Path::new("./test_case/epub-33.epub");
2244        let doc = EpubDoc::new(epub_file);
2245        assert!(doc.is_ok());
2246
2247        let doc = doc.unwrap();
2248
2249        let creators = doc.get_metadata("creator");
2250        assert!(creators.is_some());
2251
2252        let creators = creators.unwrap();
2253        assert_eq!(creators.len(), 3);
2254
2255        assert_eq!(creators[0].id, Some("creator_id_0".to_string()));
2256        assert_eq!(creators[0].property, "creator");
2257        assert_eq!(creators[0].value, "Matt Garrish, DAISY Consortium");
2258
2259        assert_eq!(creators[1].id, Some("creator_id_1".to_string()));
2260        assert_eq!(creators[1].property, "creator");
2261        assert_eq!(creators[1].value, "Ivan Herman, W3C");
2262
2263        assert_eq!(creators[2].id, Some("creator_id_2".to_string()));
2264        assert_eq!(creators[2].property, "creator");
2265        assert_eq!(creators[2].value, "Dave Cramer, Invited Expert");
2266    }
2267
2268    #[test]
2269    fn test_get_metadata_with_refinement() {
2270        let epub_file = Path::new("./test_case/epub-33.epub");
2271        let doc = EpubDoc::new(epub_file);
2272        assert!(doc.is_ok());
2273
2274        let doc = doc.unwrap();
2275
2276        let title = doc.get_metadata("title");
2277        assert!(title.is_some());
2278
2279        let title = title.unwrap();
2280        assert_eq!(title.len(), 1);
2281        assert_eq!(title[0].refined.len(), 1);
2282        assert_eq!(title[0].refined[0].property, "title-type");
2283        assert_eq!(title[0].refined[0].value, "main");
2284    }
2285
2286    #[test]
2287    fn test_get_manifest_item_with_fallback() {
2288        let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
2289        let doc = EpubDoc::new(epub_file);
2290        assert!(doc.is_ok());
2291
2292        let mut doc = doc.unwrap();
2293        assert!(doc.get_manifest_item("content_001").is_ok());
2294        assert!(doc.get_manifest_item("bar").is_ok());
2295
2296        // 当回退链上存在可回退资源时能获取资源
2297        if let Ok((_, mime)) = doc.get_manifest_item_with_fallback("content_001", vec!["image/psd"])
2298        {
2299            assert_eq!(mime, "image/psd");
2300        } else {
2301            assert!(false, "get_manifest_item_with_fallback failed");
2302        }
2303
2304        // 当回退链上不存在可回退资源时无法获取资源
2305        assert_eq!(
2306            doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
2307                .unwrap_err()
2308                .to_string(),
2309            "No supported file format: The fallback resource does not contain the file format you support."
2310        );
2311    }
2312}
lib_epub/epub.rs

lib_epub/
epub.rs