lib_epub/
epub.rs

1//! The core module of the EPUB parsing library
2//!
3//! This module provides complete parsing functionality for EPUB ebook files
4//! and is the core component of the entire library. The `EpubDoc` structure
5//! encapsulates all the parsing logic and data access interfaces for EPUB files.
6//!
7//! ## Main references to EPUB specs:
8//! - <https://www.w3.org/TR/epub-33>
9//! - <https://idpf.org/epub/201>
10//!
11//! ## Potential Issues
12//! - The generic parameter `R: Read + Seek` increases complexity, particularly
13//!   in asynchronous environments. The current design is not conducive to multi-threaded
14//!   concurrent access and requires an external synchronization mechanism.
15//! - Some error handling may not be sufficiently nuanced, and certain edge cases
16//!   may not be adequately considered.
17//! - Loading the entire EPUB document at once may result in significant memory consumption,
18//!   especially for large publications.
19//!
20//! ## Future Work
//! - Add support for asynchronous I/O to improve the user experience in asynchronous
//!   environments, and consider adding support for multi-threaded access.
//! - Support more EPUB specification features, such as media overlays and scripts.
24
25use std::{
26    collections::HashMap,
27    fs::{File, canonicalize},
28    io::{BufReader, Read, Seek},
29    path::{Path, PathBuf},
30};
31
32use log::warn;
33use zip::{ZipArchive, result::ZipError};
34
35use crate::{
36    error::EpubError,
37    types::{
38        EncryptionData, EpubVersion, ManifestItem, MetadataItem, MetadataLinkItem,
39        MetadataRefinement, NavPoint, SpineItem,
40    },
41    utils::{
42        DecodeBytes, NormalizeWhitespace, XmlElement, XmlReader, adobe_font_dencryption,
43        check_realtive_link_leakage, compression_method_check, get_file_in_zip_archive,
44        idpf_font_dencryption,
45    },
46};
47
/// EPUB document parser, representing a loaded and parsed EPUB publication
///
/// `EpubDoc` owns the ZIP archive of a publication together with everything that
/// was extracted from it while loading: the package (OPF) location, version,
/// metadata, manifest, spine, encryption information and navigation data.
/// All fields are populated by [`EpubDoc::from_reader`].
///
/// ## Usage
///
/// ```rust
/// use lib_epub::epub::EpubDoc;
///
/// let doc = EpubDoc::new("./test_case/epub-33.epub");
/// assert!(doc.is_ok());
/// ```
pub struct EpubDoc<R: Read + Seek> {
    /// The ZIP archive backing the publication; all resources are read from it
    pub(crate) archive: ZipArchive<R>,

    /// The canonicalized filesystem path of the EPUB file
    pub(crate) epub_path: PathBuf,

    /// The path of the OPF package document, as declared by the `full-path`
    /// attribute of the first `rootfile` element in `META-INF/container.xml`
    pub package_path: PathBuf,

    /// The parent directory of `package_path`
    pub base_path: PathBuf,

    /// The EPUB version determined from the package document
    pub version: EpubVersion,

    /// The unique identifier of the publication
    ///
    /// This is the value of the `identifier` metadata item whose `id` equals the
    /// package's `unique-identifier` attribute. When the package has no such
    /// attribute, the first `identifier` metadata item is used instead.
    pub unique_identifier: String,

    /// Metadata items extracted from the OPF `<metadata>` element
    pub metadata: Vec<MetadataItem>,

    /// Metadata entries that point to external files
    pub metadata_link: Vec<MetadataLinkItem>,

    /// Resources declared in the OPF `<manifest>` element, keyed by their `id`
    ///
    /// All resources in the epub file are declared here,
    /// and undeclared resources should not be stored in the epub file and cannot be obtained from it.
    pub manifest: HashMap<String, ManifestItem>,

    /// Reading order extracted from the OPF `<spine>` element
    ///
    /// The items are kept in document order, which is the order in which the
    /// referenced content documents should be displayed.
    pub spine: Vec<SpineItem>,

    /// Entries extracted from `META-INF/encryption.xml`
    ///
    /// `None` when the archive has no `encryption.xml` or the file declares no
    /// `EncryptedData` entries.
    pub encryption: Option<Vec<EncryptionData>>,

    /// The navigation tree, read from the NCX `navMap` (EPUB 2) or from the
    /// `toc` `<nav>` element of the Navigation Document (EPUB 3)
    pub catalog: Vec<NavPoint>,

    /// The title of the catalog; empty when the navigation source has no title
    pub catalog_title: String,

    /// The index of the current reading spine; starts at `0`
    pub current_spine_index: usize,
}
119
120impl<R: Read + Seek> EpubDoc<R> {
121    /// Creates a new EPUB document instance from a reader
122    ///
123    /// This function is responsible for the core logic of parsing EPUB files,
124    /// including verifying the file format, parsing container information,
125    /// loading the OPF package document, and extracting metadata, manifest,
126    /// reading order, and other core information.
127    ///
128    /// ## Parameters
129    /// - `reader`: The data source that implements the `Read` and `Seek` traits,
130    ///   usually a file or memory buffer
131    /// - `epub_path`: The path to the EPUB file, used for path resolution and validation
132    ///
133    /// ## Return
134    /// - `Ok(EpubDoc<R>)`: The successfully parsed EPUB document object
135    /// - `Err(EpubError)`: Errors encountered during parsing
136    ///
137    /// ## Notes
138    /// - This function assumes the EPUB file structure is valid
139    pub fn from_reader(reader: R, epub_path: PathBuf) -> Result<Self, EpubError> {
140        // Parsing process
141        // 1. Verify that the ZIP compression method conforms to the EPUB specification
142        // 2. Parse `META-INF/container.xml` retrieves the location of the OPF file
143        // 3. Parses the OPF file to obtain package documentation information
144        // 4. Extracts version information
145        // 5. Parses metadata, manifest, and spine
146        // 6. Parses encrypted information and directory navigation
147        // 7. Verifies and extracts the unique identifier
148
149        let mut archive = ZipArchive::new(reader).map_err(EpubError::from)?;
150        let epub_path = canonicalize(epub_path)?;
151
152        compression_method_check(&mut archive)?;
153
154        let container =
155            get_file_in_zip_archive(&mut archive, "META-INF/container.xml")?.decode()?;
156        let package_path = Self::parse_container(container)?;
157        let base_path = package_path
158            .parent()
159            .expect("所有文件的父目录不能为空")
160            .to_path_buf();
161
162        let opf_file =
163            get_file_in_zip_archive(&mut archive, package_path.to_str().unwrap())?.decode()?;
164        let package = XmlReader::parse(&opf_file)?;
165        // let document = kiss_xml::parse_str(opf_file).unwrap();
166
167        // let package = document.root_element();
168        let version = Self::determine_epub_version(&package)?;
169
170        let mut doc = Self {
171            archive,
172            epub_path,
173            package_path,
174            base_path,
175            version,
176            unique_identifier: String::new(),
177            metadata: vec![],
178            metadata_link: vec![],
179            manifest: HashMap::new(),
180            spine: vec![],
181            encryption: None,
182            catalog: vec![],
183            catalog_title: String::new(),
184            current_spine_index: 0,
185        };
186
187        let metadata_element = package.find_elements_by_name("metadata").next().unwrap();
188        let manifest_element = package.find_elements_by_name("manifest").next().unwrap();
189        let spine_element = package.find_elements_by_name("spine").next().unwrap();
190
191        doc.parse_metadata(metadata_element)?;
192        doc.parse_manifest(manifest_element)?;
193        doc.parse_spine(spine_element)?;
194        doc.parse_encryption()?;
195        doc.parse_catalog()?;
196
197        // 断言必有唯一标识符
198        doc.unique_identifier = if let Some(uid) = package.get_attr("unique-identifier") {
199            doc.metadata.iter().find(|item| {
200                item.property == "identifier" && item.id.as_ref().is_some_and(|id| id == &uid)
201            })
202        } else {
203            doc.metadata
204                .iter()
205                .find(|item| item.property == "identifier")
206        }
207        .map(|item| item.value.clone())
208        .ok_or_else(|| EpubError::NonCanonicalFile {
209            tag: "dc:identifier".to_string(),
210        })?;
211
212        Ok(doc)
213    }
214
215    /// Parse the EPUB container file (META-INF/container.xml)
216    ///
217    /// This function parses the container information in the EPUB file 、
218    /// to extract the path to the OPF package file. According to the EPUB
219    /// specification, the `container.xml` file must exist in the `META-INF`
220    /// directory and contain at least one `rootfile` element pointing to
221    /// the main OPF file. When multiple `rootfile` elements exist, the first
222    /// element pointing to the OPF file is used as the default.
223    ///
224    /// ## Parameters
225    /// - `data`: The content string of the container.xml
226    ///
227    /// ## Return
228    /// - `Ok(PathBuf)`: The path to the successfully parsed OPF file
229    /// - `Err(EpubError)`: Errors encountered during parsing
230    fn parse_container(data: String) -> Result<PathBuf, EpubError> {
231        let root = XmlReader::parse(&data)?;
232        let rootfile = root
233            .find_elements_by_name("rootfile")
234            .next()
235            .ok_or_else(|| EpubError::NonCanonicalFile {
236                tag: "rootfile".to_string(),
237            })?;
238
239        let attr =
240            rootfile
241                .get_attr("full-path")
242                .ok_or_else(|| EpubError::MissingRequiredAttribute {
243                    tag: "rootfile".to_string(),
244                    attribute: "full-path".to_string(),
245                })?;
246
247        Ok(PathBuf::from(attr))
248    }
249
250    /// Parse the EPUB metadata section
251    ///
252    /// This function is responsible for parsing the `<metadata>` elements
253    /// in the OPF file to extract basic information about the publication.
254    /// It handles metadata elements from different namespaces:
255    /// - Elements in the Dublin Core namespace (`http://purl.org/dc/elements/1.1/`)
256    /// - Elements in the OPF namespace (`http://www.idpf.org/2007/opf`)
257    ///
258    /// ## Parameters
259    /// - `metadata_element`: A reference to the `<metadata>` element in the OPF file
260    fn parse_metadata(&mut self, metadata_element: &XmlElement) -> Result<(), EpubError> {
261        const DC_NAMESPACE: &str = "http://purl.org/dc/elements/1.1/";
262        const OPF_NAMESPACE: &str = "http://www.idpf.org/2007/opf";
263
264        let mut metadata = Vec::new();
265        let mut metadata_link = Vec::new();
266        let mut refinements = HashMap::<String, Vec<MetadataRefinement>>::new();
267
268        for element in metadata_element.children() {
269            match &element.namespace {
270                Some(namespace) if namespace == DC_NAMESPACE => {
271                    self.parse_dc_metadata(element, &mut metadata)?
272                }
273
274                Some(namespace) if namespace == OPF_NAMESPACE => self.parse_opf_metadata(
275                    element,
276                    &mut metadata,
277                    &mut metadata_link,
278                    &mut refinements,
279                )?,
280
281                _ => {}
282            };
283        }
284
285        for item in metadata.iter_mut() {
286            if let Some(id) = &item.id {
287                if let Some(refinements) = refinements.remove(id) {
288                    item.refined = refinements;
289                }
290            }
291        }
292
293        self.metadata = metadata;
294        self.metadata_link = metadata_link;
295        Ok(())
296    }
297
298    /// Parse the EPUB manifest section
299    ///
300    /// This function parses the `<manifest>` element in the OPF file, extracting
301    /// information about all resource files in the publication. Each resource contains
302    /// basic information such as id, file path, MIME type, as well as optional
303    /// attributes and fallback resource information.
304    ///
305    /// ## Parameters
306    /// - `manifest_element`: A reference to the `<manifest>` element in the OPF file
307    fn parse_manifest(&mut self, manifest_element: &XmlElement) -> Result<(), EpubError> {
308        let estimated_items = manifest_element.children().count();
309        let mut resources = HashMap::with_capacity(estimated_items);
310
311        for element in manifest_element.children() {
312            let id = element
313                .get_attr("id")
314                .ok_or_else(|| EpubError::MissingRequiredAttribute {
315                    tag: element.tag_name(),
316                    attribute: "id".to_string(),
317                })?
318                .to_string();
319            let path = element
320                .get_attr("href")
321                .ok_or_else(|| EpubError::MissingRequiredAttribute {
322                    tag: element.tag_name(),
323                    attribute: "href".to_string(),
324                })?
325                .to_string();
326            let mime = element
327                .get_attr("media-type")
328                .ok_or_else(|| EpubError::MissingRequiredAttribute {
329                    tag: element.tag_name(),
330                    attribute: "media-type".to_string(),
331                })?
332                .to_string();
333            let properties = element.get_attr("properties");
334            let fallback = element.get_attr("fallback");
335
336            resources.insert(
337                id.clone(),
338                ManifestItem {
339                    id,
340                    path: self.normalize_manifest_path(&path)?,
341                    mime,
342                    properties,
343                    fallback,
344                },
345            );
346        }
347
348        self.manifest = resources;
349        self.validate_fallback_chains();
350        Ok(())
351    }
352
353    /// Parse the EPUB spine section
354    ///
355    /// This function parses the `<spine>` elements in the OPF file to extract
356    /// the reading order information of the publication. The spine defines the
357    /// linear reading order of the publication's content documents, and each
358    /// spine item references resources in the manifest.
359    ///
360    /// ## Parameters
361    /// - `spine_element`: A reference to the `<spine>` element in the OPF file
362    fn parse_spine(&mut self, spine_element: &XmlElement) -> Result<(), EpubError> {
363        let mut spine = Vec::new();
364        for element in spine_element.children() {
365            let idref = element
366                .get_attr("idref")
367                .ok_or_else(|| EpubError::MissingRequiredAttribute {
368                    tag: element.tag_name(),
369                    attribute: "idref".to_string(),
370                })?
371                .to_string();
372            let id = element.get_attr("id");
373            let linear = element
374                .get_attr("linear")
375                .map(|linear| linear == "yes")
376                .unwrap_or(true);
377            let properties = element.get_attr("properties");
378
379            spine.push(SpineItem {
380                idref,
381                id,
382                linear,
383                properties,
384            });
385        }
386
387        self.spine = spine;
388        Ok(())
389    }
390
391    /// Parse the EPUB encryption file (META-INF/encryption.xml)
392    ///
393    /// This function is responsible for parsing the `encryption.xml` file
394    /// in the `META-INF` directory to extract information about encrypted
395    /// resources in the publication. According to the EPUB specification,
396    /// the encryption information describes which resources are encrypted
397    /// and the encryption methods used.
398    ///
399    /// TODO: 需要对使用非对称加密数据的加密项进行额外处理，以获取非对称加密密钥
400    fn parse_encryption(&mut self) -> Result<(), EpubError> {
401        if !self.has_encryption() {
402            return Ok(());
403        }
404
405        let encryption_file =
406            get_file_in_zip_archive(&mut self.archive, "META-INF/encryption.xml")?.decode()?;
407
408        let root = XmlReader::parse(&encryption_file)?;
409
410        let mut encryption_data = Vec::new();
411        for data in root.children() {
412            if data.name != "EncryptedData" {
413                continue;
414            }
415
416            let method = data
417                .find_elements_by_name("EncryptionMethod")
418                .next()
419                .ok_or_else(|| EpubError::NonCanonicalFile {
420                    tag: "EncryptionMethod".to_string(),
421                })?;
422            let reference = data
423                .find_elements_by_name("CipherReference")
424                .next()
425                .ok_or_else(|| EpubError::NonCanonicalFile {
426                    tag: "CipherReference".to_string(),
427                })?;
428
429            encryption_data.push(EncryptionData {
430                method: method
431                    .get_attr("Algorithm")
432                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
433                        tag: "EncryptionMethod".to_string(),
434                        attribute: "Algorithm".to_string(),
435                    })?
436                    .to_string(),
437                data: reference
438                    .get_attr("URI")
439                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
440                        tag: "CipherReference".to_string(),
441                        attribute: "URI".to_string(),
442                    })?
443                    .to_string(),
444            });
445        }
446
447        if !encryption_data.is_empty() {
448            self.encryption = Some(encryption_data);
449        }
450
451        Ok(())
452    }
453
454    /// Parse the EPUB navigation information
455    ///
456    /// This function is responsible for parsing the navigation information of EPUB
457    /// publications. Different parsing strategies are used depending on the EPUB version:
458    /// - EPUB 2.0: Parses the NCX file to obtain directory information
459    /// - EPUB 3.0: Parses the Navigation Document (NAV) file to obtain directory information
460    fn parse_catalog(&mut self) -> Result<(), EpubError> {
461        const HEAD_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];
462
463        match self.version {
464            EpubVersion::Version2_0 => {
465                let opf_file = get_file_in_zip_archive(
466                    &mut self.archive,
467                    self.package_path.to_str().unwrap(),
468                )?
469                .decode()?;
470                let opf_element = XmlReader::parse(&opf_file)?;
471
472                let toc_id = opf_element
473                    .find_children_by_name("spine")
474                    .next()
475                    .ok_or_else(|| EpubError::NonCanonicalFile {
476                        tag: "spine".to_string(),
477                    })?
478                    .get_attr("toc")
479                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
480                        tag: "spine".to_string(),
481                        attribute: "toc".to_string(),
482                    })?
483                    .to_owned();
484                let toc_path = self
485                    .manifest
486                    .get(&toc_id)
487                    .ok_or(EpubError::ResourceIdNotExist { id: toc_id })?
488                    .path
489                    .to_str()
490                    .unwrap();
491
492                let ncx_file = get_file_in_zip_archive(&mut self.archive, toc_path)?.decode()?;
493                let ncx = XmlReader::parse(&ncx_file)?;
494
495                match ncx.find_elements_by_name("docTitle").next() {
496                    Some(element) => self.catalog_title = element.text(),
497                    None => warn!(
498                        "Expecting to get docTitle information from the ncx file, but it's missing."
499                    ),
500                };
501
502                let nav_map = ncx.find_elements_by_name("navMap").next().ok_or_else(|| {
503                    EpubError::NonCanonicalFile {
504                        tag: "navMap".to_string(),
505                    }
506                })?;
507
508                self.catalog = self.parse_nav_points(nav_map)?;
509
510                Ok(())
511            }
512
513            EpubVersion::Version3_0 => {
514                let nav_path = self
515                    .manifest
516                    .values()
517                    .find(|item| {
518                        if let Some(property) = &item.properties {
519                            return property.contains("nav");
520                        }
521                        false
522                    })
523                    .map(|item| item.path.clone())
524                    .ok_or_else(|| EpubError::NonCanonicalEpub {
525                        expected_file: "Navigation Document".to_string(),
526                    })?;
527
528                let nav_file =
529                    get_file_in_zip_archive(&mut self.archive, nav_path.to_str().unwrap())?
530                        .decode()?;
531
532                let nav_element = XmlReader::parse(&nav_file)?;
533                let nav = nav_element
534                    .find_elements_by_name("nav")
535                    .find(|&element| element.get_attr("epub:type") == Some(String::from("toc")))
536                    .ok_or_else(|| EpubError::NonCanonicalFile {
537                        tag: "nav".to_string(),
538                    })?;
539                let nav_title = nav.find_children_by_names(&HEAD_TAGS).next();
540                let nav_list = nav.find_children_by_name("ol").next().ok_or_else(|| {
541                    EpubError::NonCanonicalFile {
542                        tag: "ol".to_string(),
543                    }
544                })?;
545
546                self.catalog = self.parse_catalog_list(nav_list)?;
547                if let Some(nav_title) = nav_title {
548                    self.catalog_title = nav_title.text();
549                };
550                Ok(())
551            }
552        }
553    }
554
555    /// Check if the EPUB file contains `encryption.xml`
556    ///
557    /// This function determines whether a publication contains encrypted resources
558    /// by checking if a `META-INF/encryption.xml` file exists in the EPUB package.
559    /// According to the EPUB specification, when resources in a publication are
560    /// encrypted, the corresponding encryption information must be declared in
561    /// the `META-INF/encryption.xml` file.
562    ///
563    /// ## Return
564    /// - `true` if the publication contains encrypted resources
565    /// - `false` if the publication does not contain encrypted resources
566    ///
567    /// ## Notes
568    /// - This function only checks the existence of the encrypted file;
569    ///   it does not verify the validity of the encrypted information.
570    pub fn has_encryption(&mut self) -> bool {
571        self.archive
572            .by_path(Path::new("META-INF/encryption.xml"))
573            .is_ok()
574    }
575
576    /// Retrieves a list of metadata items
577    ///
578    /// This function retrieves all matching metadata items from the EPUB metadata
579    /// based on the specified attribute name (key). Metadata items may come from
580    /// the DC (Dublin Core) namespace or the OPF namespace and contain basic
581    /// information about the publication, such as title, author, identifier, etc.
582    ///
583    /// ## Parameters
584    /// - `key`: The name of the metadata attribute to retrieve
585    ///
586    /// ## Return
587    /// - `Some(Vec<MetadataItem>)`: A vector containing all matching metadata items
588    /// - `None`: If no matching metadata items are found
589    pub fn get_metadata(&self, key: &str) -> Option<Vec<MetadataItem>> {
590        let metadatas = self
591            .metadata
592            .iter()
593            .filter(|item| item.property == key)
594            .cloned()
595            .collect::<Vec<MetadataItem>>();
596
597        (!metadatas.is_empty()).then_some(metadatas)
598    }
599
600    /// Retrieves a list of values for specific metadata items
601    ///
602    /// This function retrieves the values of all matching metadata items from
603    /// the EPUB metadata based on the given property name (key).
604    ///
605    /// ## Parameters
606    /// - `key`: The name of the metadata attribute to retrieve
607    ///
608    /// ## Return
609    /// - `Some(Vec<String>)`: A vector containing all matching metadata item values
610    /// - `None`: If no matching metadata items are found
611    pub fn get_metadata_value(&self, key: &str) -> Option<Vec<String>> {
612        let values = self
613            .metadata
614            .iter()
615            .filter(|item| item.property == key)
616            .map(|item| item.value.clone())
617            .collect::<Vec<String>>();
618
619        (!values.is_empty()).then_some(values)
620    }
621
622    /// Retrieves the title of the publication
623    ///
624    /// This function retrieves all title information from the EPUB metadata.
625    /// According to the EPUB specification, a publication can have multiple titles,
626    /// which are returned in the order they appear in the metadata.
627    ///
628    /// ## Return
629    /// - `Result<Vec<String>, EpubError>`: A vector containing all title information
630    /// - `EpubError`: If and only if the OPF file does not contain `<dc:title>`
631    ///
632    /// ## Notes
633    /// - The EPUB specification requires each publication to have at least one title.
634    pub fn get_title(&self) -> Result<Vec<String>, EpubError> {
635        self.get_metadata_value("title")
636            .ok_or_else(|| EpubError::NonCanonicalFile {
637                tag: "title".to_string(),
638            })
639    }
640
641    /// Retrieves the language used in the publication
642    ///
643    /// This function retrieves the language information of a publication from the EPUB
644    /// metadata. According to the EPUB specification, language information identifies
645    /// the primary language of the publication and can have multiple language identifiers.
646    ///
647    /// ## Return
648    /// - `Ok(Vec<String>)`: A vector containing all language identifiers
649    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:language>`
650    ///
651    /// ## Notes
652    /// - The EPUB specification requires that each publication specify at least one primary language.
653    /// - Language identifiers should conform to RFC 3066 or later standards.
654    pub fn get_language(&self) -> Result<Vec<String>, EpubError> {
655        self.get_metadata_value("language")
656            .ok_or_else(|| EpubError::NonCanonicalFile {
657                tag: "language".to_string(),
658            })
659    }
660
661    /// Retrieves the identifier of a publication
662    ///
663    /// This function retrieves the identifier information of a publication from
664    /// the EPUB metadata. According to the EPUB specification, each publication
665    /// must have a identifier, typically an ISBN, UUID, or other unique identifier.
666    ///
667    /// ## Return
668    /// - `Ok(Vec<String>)`: A vector containing all identifier information
669    /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:identifier>`
670    ///
671    /// ## Notes
672    /// - The EPUB specification requires each publication to have at least one identifier.
673    /// - In the OPF file, the `unique-identifier` attribute of the `<package>` element
674    ///   should point to a `<dc:identifier>` element used to uniquely identify the publication.
675    ///   This means that `unique-identifier` is not exactly equal to `<dc:identifier>`.
676    pub fn get_identifier(&self) -> Result<Vec<String>, EpubError> {
677        self.get_metadata_value("identifier")
678            .ok_or_else(|| EpubError::NonCanonicalFile {
679                tag: "identifier".to_string(),
680            })
681    }
682
683    /// Retrieve resource data by resource ID
684    ///
685    /// This function will find the resource with the specified ID in the manifest.
686    /// If the resource is encrypted, it will be automatically decrypted.
687    ///
688    /// ## Parameters
689    /// - `id`: The ID of the resource to retrieve
690    ///
691    /// ## Return
692    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
693    ///   the MIME type
694    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
695    ///
696    /// ## Notes
697    /// - This function will automatically decrypt the resource if it is encrypted.
698    /// - For unsupported encryption methods, the corresponding error will be returned.
699    pub fn get_manifest_item(&mut self, id: &str) -> Result<(Vec<u8>, String), EpubError> {
700        let resource_item = self
701            .manifest
702            .get(id)
703            .cloned()
704            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
705
706        let path = resource_item.path.to_str().unwrap();
707
708        let mut data = match self.archive.by_name(path) {
709            Ok(mut file) => {
710                let mut entry = Vec::<u8>::new();
711                file.read_to_end(&mut entry)?;
712
713                Ok(entry)
714            }
715            Err(ZipError::FileNotFound) => Err(EpubError::ResourceNotFound {
716                resource: path.to_string(),
717            }),
718            Err(err) => Err(EpubError::from(err)),
719        }?;
720
721        if let Some(method) = self.is_encryption_file(path) {
722            data = self.auto_dencrypt(&method, &mut data)?;
723        }
724
725        Ok((data, resource_item.mime))
726    }
727
728    /// Retrieves resource item data by resource path
729    ///
730    /// This function retrieves resources from the manifest based on the input path.
731    /// The input path must be a relative path to the root directory of the EPUB container;
732    /// using an absolute path or a relative path to another location will result in an error.
733    ///
734    /// ## Parameters
735    /// - `path`: The path of the resource to retrieve
736    ///
737    /// ## Return
738    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
739    ///   the MIME type
740    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
741    ///
742    /// ## Notes
743    /// - This function will automatically decrypt the resource if it is encrypted.
744    /// - For unsupported encryption methods, the corresponding error will be returned.
745    /// - Relative paths other than the root directory of the Epub container are not supported.
746    pub fn get_manifest_item_by_path(
747        &mut self,
748        path: &str,
749    ) -> Result<(Vec<u8>, String), EpubError> {
750        let id = self
751            .manifest
752            .iter()
753            .find(|(_, item)| item.path.to_str().unwrap() == path)
754            .map(|(id, _)| id.to_string())
755            .ok_or_else(|| EpubError::ResourceNotFound {
756                resource: path.to_string(),
757            })?;
758
759        self.get_manifest_item(&id)
760    }
761
762    /// Retrieves supported resource items by resource ID, with fallback mechanism supported
763    ///
764    /// This function attempts to retrieve the resource item with the specified ID and
765    /// checks if its MIME type is in the list of supported formats. If the current resource
766    /// format is not supported, it searches for a supported resource format along the
767    /// fallback chain according to the fallback mechanism defined in the EPUB specification.
768    ///
769    /// ## Parameters
770    /// - `id`: The ID of the resource to retrieve
771    /// - `supported_format`: A vector of supported MIME types
772    ///
773    /// ## Return
774    /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
775    ///   the MIME type
776    /// - `Err(EpubError)`: Errors that occurred during the retrieval process
777    pub fn get_manifest_item_with_fallback(
778        &mut self,
779        id: &str,
780        supported_format: Vec<&str>,
781    ) -> Result<(Vec<u8>, String), EpubError> {
782        let mut manifest_item = self
783            .manifest
784            .get(id)
785            .cloned()
786            .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
787
788        let mut current_manifest_id = id.to_string();
789        let mut fallback_chain = Vec::<String>::new();
790        'fallback: loop {
791            if supported_format.contains(&manifest_item.mime.as_str()) {
792                return self.get_manifest_item(&current_manifest_id);
793            }
794
795            let fallback_id = manifest_item.fallback.clone();
796
797            match fallback_id {
798                // The loop ends when no fallback resource exists
799                None => break 'fallback,
800
801                // End the loop when the loop continues to fallback if a fallback resource exists
802                Some(id) if fallback_chain.contains(&id) => break 'fallback,
803
804                Some(id) => {
805                    fallback_chain.push(id.clone());
806
807                    // Since only warnings are issued for fallback resource checks
808                    // during initialization, the issue of fallback resources possibly
809                    // not existing needs to be handled here.
810                    manifest_item = self
811                        .manifest
812                        .get(&manifest_item.fallback.unwrap())
813                        .cloned()
814                        .ok_or(EpubError::ResourceIdNotExist { id: id.clone() })?;
815                    current_manifest_id = id;
816                }
817            };
818        }
819
820        Err(EpubError::NoSupportedFileFormat)
821    }
822
823    /// Retrieves the cover of the EPUB document
824    ///
825    /// This function searches for the cover of the EPUB document by examining manifest
826    /// items in the manifest. It looks for manifest items whose ID or attribute contains
827    /// "cover" (case-insensitive) and attempts to retrieve the content of the first match.
828    ///
829    /// ## Return
830    /// - `Some((Vec<u8>, String))`: Successfully retrieved and decrypted cover data and
831    ///   the MIME type
832    /// - `None`: No cover resource was found
833    ///
834    /// ## Notes
835    /// - This function only returns the first successfully retrieved cover resource,
836    ///   even if multiple matches exist
837    /// - The retrieved cover may not be an image resource; users need to pay attention
838    ///   to the resource's MIME type.
839    pub fn get_cover(&mut self) -> Option<(Vec<u8>, String)> {
840        self.manifest
841            .values()
842            .filter_map(|manifest| {
843                if manifest.id.to_ascii_lowercase().contains("cover") {
844                    return Some(manifest.id.clone());
845                }
846
847                if let Some(properties) = &manifest.properties {
848                    if properties.to_ascii_lowercase().contains("cover") {
849                        return Some(manifest.id.clone());
850                    }
851                }
852
853                None
854            })
855            .collect::<Vec<String>>()
856            .iter()
857            .find_map(|id| self.get_manifest_item(id).ok())
858    }
859
860    /// Navigate to a specified chapter using the spine index
861    ///
862    /// This function retrieves the content data of the corresponding chapter based
863    /// on the index position in the EPUB spine. The spine defines the linear reading
864    /// order of the publication's content documents, and each spine item references
865    /// resources in the manifest.
866    ///
867    /// ## Parameters
868    /// - `index`: The index position in the spine, starting from 0
869    ///
870    /// ## Return
871    /// - `Some((Vec<u8>, String))`: Successfully retrieved chapter content data and the MIME type
872    /// - `None`: Index out of range or data retrieval error
873    ///
874    /// ## Notes
875    /// - The index must be less than the total number of spine projects.
876    /// - If the resource is encrypted, it will be automatically decrypted before returning.(TODO)
877    /// - It does not check whether the Spine project follows a linear reading order.
878    pub fn navigate_by_spine_index(&mut self, index: usize) -> Option<(Vec<u8>, String)> {
879        if index >= self.spine.len() {
880            return None;
881        }
882
883        let manifest_id = self.spine[index].idref.clone();
884        self.current_spine_index = index;
885        self.get_manifest_item(&manifest_id).ok()
886    }
887
888    /// Navigate to the previous linear reading chapter
889    ///
890    /// This function searches backwards in the EPUB spine for the previous linear
891    /// reading chapter and returns the content data of that chapter. It only navigates
892    /// to chapters marked as linear reading.
893    ///
894    /// ## Return
895    /// - `Some((Vec<u8>, String))`: Successfully retrieved previous chapter content data and
896    ///   the MIME type
897    /// - `None`: Already in the first chapter, the current chapter is not linear,
898    ///   or data retrieval failed
899    pub fn spine_prev(&mut self) -> Option<(Vec<u8>, String)> {
900        if self.current_spine_index == 0 || !self.spine[self.current_spine_index].linear {
901            return None;
902        }
903
904        let prev_index = (0..self.current_spine_index)
905            .rev()
906            .find(|&index| self.spine[index].linear)?;
907
908        self.current_spine_index = prev_index;
909        let manifest_id = self.spine[prev_index].idref.clone();
910        self.get_manifest_item(&manifest_id).ok()
911    }
912
913    /// Navigate to the next linear reading chapter
914    ///
915    /// This function searches forwards in the EPUB spine for the next linear reading
916    /// chapter and returns the content data of that chapter. It only navigates to
917    /// chapters marked as linear reading.
918    ///
919    /// ## Return
920    /// - `Some((Vec<u8>, String))`: Successfully retrieved next chapter content data and
921    ///   the MIME type
922    /// - `None`: Already in the last chapter, the current chapter is not linear,
923    ///   or data retrieval failed
924    pub fn spine_next(&mut self) -> Option<(Vec<u8>, String)> {
925        if self.current_spine_index >= self.spine.len() - 1
926            || !self.spine[self.current_spine_index].linear
927        {
928            return None;
929        }
930
931        let next_index = (self.current_spine_index + 1..self.spine.len())
932            .find(|&index| self.spine[index].linear)?;
933
934        self.current_spine_index = next_index;
935        let manifest_id = self.spine[next_index].idref.clone();
936        self.get_manifest_item(&manifest_id).ok()
937    }
938
939    /// Retrieves the content data of the current chapter
940    ///
941    /// This function returns the content data of the chapter at the current
942    /// index position in the EPUB spine.
943    ///
944    /// ## Return
945    /// - `Some((Vec<u8>, String))`: Successfully retrieved current chapter content data and
946    ///   the MIME type
947    /// - `None`: Data retrieval failed
948    pub fn spine_current(&mut self) -> Option<(Vec<u8>, String)> {
949        let manifest_id = self.spine[self.current_spine_index].idref.clone();
950        self.get_manifest_item(&manifest_id).ok()
951    }
952
953    /// Determine the EPUB version from the OPF file
954    ///
955    /// This function is used to detect the version of an epub file from an OPF file.
956    /// When the version attribute in the package is abnormal, version information will
957    /// be identified through some version characteristics of the epub file. An error is
958    /// returned when neither direct nor indirect methods can identify the version.
959    ///
960    /// ## Parameters
961    /// - `opf_element`: A reference to the OPF file element
962    fn determine_epub_version(opf_element: &XmlElement) -> Result<EpubVersion, EpubError> {
963        // Check the explicit version attribute
964        if let Some(version) = opf_element.get_attr("version") {
965            match version.as_str() {
966                "2.0" => return Ok(EpubVersion::Version2_0),
967                "3.0" => return Ok(EpubVersion::Version3_0),
968                _ => {}
969            }
970        }
971
972        let spine_element = opf_element
973            .find_elements_by_name("spine")
974            .next()
975            .ok_or_else(|| EpubError::NonCanonicalFile {
976                tag: "spine".to_string(),
977            })?;
978
979        // Look for EPUB 2.x specific features
980        if spine_element.get_attr("toc").is_some() {
981            return Ok(EpubVersion::Version2_0);
982        }
983
984        let manifest_element = opf_element
985            .find_elements_by_name("manifest")
986            .next()
987            .ok_or_else(|| EpubError::NonCanonicalFile {
988                tag: "manifest".to_string(),
989            })?;
990
991        // Look for EPUB 3.x specific features
992        manifest_element
993            .children()
994            .find_map(|element| {
995                if let Some(id) = element.get_attr("id") {
996                    if id.eq("nav") {
997                        return Some(EpubVersion::Version3_0);
998                    }
999                }
1000
1001                None
1002            })
1003            .ok_or(EpubError::UnrecognizedEpubVersion)
1004    }
1005
1006    /// Parse metadata elements under the Dublin Core namespace
1007    ///
1008    /// This function handles the `<metadata>` Dublin Core element in the OPF file (namespace
1009    /// is "http://purl.org/dc/elements/1.1/"). These elements usually contain the basic
1010    /// information of the publication, such as title, author, publication date, etc.
1011    ///
1012    /// ## Notes
1013    /// - In EPUB 3.0, granular information is handled by separate '<meta>' elements and 'refines' attributes
1014    /// - All text content is normalized by whitespace
1015    #[inline]
1016    fn parse_dc_metadata(
1017        &mut self,
1018        element: &XmlElement,
1019        metadata: &mut Vec<MetadataItem>,
1020        // refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1021    ) -> Result<(), EpubError> {
1022        let id = element.get_attr("id");
1023        let lang = element.get_attr("lang");
1024        let property = element.name.clone();
1025        let value = element.text().normalize_whitespace();
1026
1027        let refined = match self.version {
1028            // In EPUB 2.0, supplementary metadata (refinements) are represented
1029            // through other attribute data pairs of the tag.
1030            EpubVersion::Version2_0 => element
1031                .attributes
1032                .iter()
1033                .map(|(name, value)| {
1034                    let property = name.to_string();
1035                    let value = value.to_string().normalize_whitespace();
1036
1037                    MetadataRefinement {
1038                        refines: id.clone().unwrap(),
1039                        property,
1040                        value,
1041                        lang: None,
1042                        scheme: None,
1043                    }
1044                })
1045                .collect(),
1046            EpubVersion::Version3_0 => vec![],
1047        };
1048
1049        metadata.push(MetadataItem {
1050            id,
1051            property,
1052            value,
1053            lang,
1054            refined,
1055        });
1056
1057        Ok(())
1058    }
1059
1060    /// Parse metadata elements under the OPF namespace
1061    ///
1062    /// This function handles the `<metadata>` OPF element in the OPF file (namespace
1063    /// is "http://www.idpf.org/2007/opf"). These elements include '<meta>' and '<link>',
1064    /// which are used to provide extended metadata and links to external resources for EPUB publications.
1065    ///
1066    /// ## Notes
1067    /// - The function is only responsible for distribution processing, and the
1068    ///   specific parsing logic is implemented in the dedicated function
1069    /// - All parsing results are added directly to the incoming collection and no new collection is returned
1070    #[inline]
1071    fn parse_opf_metadata(
1072        &mut self,
1073        element: &XmlElement,
1074        metadata: &mut Vec<MetadataItem>,
1075        metadata_link: &mut Vec<MetadataLinkItem>,
1076        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1077    ) -> Result<(), EpubError> {
1078        match element.name.as_str() {
1079            "meta" => self.parse_meta_element(element, metadata, refinements),
1080            "link" => self.parse_link_element(element, metadata_link),
1081            _ => Ok(()),
1082        }
1083    }
1084
1085    #[inline]
1086    fn parse_meta_element(
1087        &mut self,
1088        element: &XmlElement,
1089        metadata: &mut Vec<MetadataItem>,
1090        refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1091    ) -> Result<(), EpubError> {
1092        match self.version {
1093            EpubVersion::Version2_0 => {
1094                let property =
1095                    element
1096                        .get_attr("name")
1097                        .ok_or_else(|| EpubError::NonCanonicalFile {
1098                            tag: element.tag_name(),
1099                        })?;
1100                let value = element
1101                    .get_attr("content")
1102                    .ok_or_else(|| EpubError::MissingRequiredAttribute {
1103                        tag: element.tag_name(),
1104                        attribute: "content".to_string(),
1105                    })?
1106                    .normalize_whitespace();
1107
1108                metadata.push(MetadataItem {
1109                    id: None,
1110                    property,
1111                    value,
1112                    lang: None,
1113                    refined: vec![],
1114                });
1115            }
1116
1117            EpubVersion::Version3_0 => {
1118                let property = element.get_attr("property").ok_or_else(|| {
1119                    EpubError::MissingRequiredAttribute {
1120                        tag: element.tag_name(),
1121                        attribute: "property".to_string(),
1122                    }
1123                })?;
1124                let value = element.text().normalize_whitespace();
1125                let lang = element.get_attr("lang");
1126
1127                if let Some(refines) = element.get_attr("refines") {
1128                    let id = refines.strip_prefix("#").unwrap_or(&refines).to_string();
1129                    let scheme = element.get_attr("scheme");
1130                    let refinement = MetadataRefinement {
1131                        refines: id.clone(),
1132                        property,
1133                        value,
1134                        lang,
1135                        scheme,
1136                    };
1137
1138                    if let Some(refinements) = refinements.get_mut(&id) {
1139                        refinements.push(refinement);
1140                    } else {
1141                        refinements.insert(id, vec![refinement]);
1142                    }
1143                } else {
1144                    let id = element.get_attr("id");
1145                    let item = MetadataItem {
1146                        id,
1147                        property,
1148                        value,
1149                        lang,
1150                        refined: vec![],
1151                    };
1152
1153                    metadata.push(item);
1154                };
1155            }
1156        }
1157        Ok(())
1158    }
1159
1160    #[inline]
1161    fn parse_link_element(
1162        &mut self,
1163        element: &XmlElement,
1164        metadata_link: &mut Vec<MetadataLinkItem>,
1165    ) -> Result<(), EpubError> {
1166        let href = element
1167            .get_attr("href")
1168            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1169                tag: element.tag_name(),
1170                attribute: "href".to_string(),
1171            })?;
1172        let rel = element
1173            .get_attr("rel")
1174            .ok_or_else(|| EpubError::MissingRequiredAttribute {
1175                tag: element.tag_name(),
1176                attribute: "rel".to_string(),
1177            })?;
1178        let hreflang = element.get_attr("hreflang");
1179        let id = element.get_attr("id");
1180        let mime = element.get_attr("media-type");
1181        let properties = element.get_attr("properties");
1182
1183        metadata_link.push(MetadataLinkItem {
1184            href,
1185            rel,
1186            hreflang,
1187            id,
1188            mime,
1189            properties,
1190            refines: None,
1191        });
1192        Ok(())
1193    }
1194
1195    /// Recursively parse NCX navigation points from navMap or nested navPoint elements
1196    ///
1197    /// This function parses the hierarchical navigation structure defined in NCX files
1198    /// for EPUB 2.x documents. It handles nested navPoint elements to build a complete
1199    /// tree representation of the publication's table of contents.
1200    fn parse_nav_points(&self, parent_element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1201        let mut nav_points = Vec::new();
1202        for nav_point in parent_element.find_children_by_name("navPoint") {
1203            let label = match nav_point.find_children_by_name("navLabel").next() {
1204                Some(element) => element.text(),
1205                None => String::new(),
1206            };
1207
1208            let content = nav_point
1209                .find_children_by_name("content")
1210                .next()
1211                .map(|element| PathBuf::from(element.text()));
1212
1213            let play_order = nav_point
1214                .get_attr("playOrder")
1215                .and_then(|order| order.parse::<usize>().ok());
1216
1217            let children = self.parse_nav_points(nav_point)?;
1218
1219            nav_points.push(NavPoint {
1220                label,
1221                content,
1222                play_order,
1223                children,
1224            });
1225        }
1226
1227        nav_points.sort();
1228        Ok(nav_points)
1229    }
1230
1231    /// Recursively parses directory list structures
1232    ///
1233    /// This function recursively parses HTML navigation list structures,
1234    /// converting `<ol>` and `<li>` elements into NavPoint structures.
1235    /// Multi-level nested directory structures are supported.
1236    fn parse_catalog_list(&self, element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1237        let mut catalog = Vec::new();
1238        for item in element.children() {
1239            if item.tag_name() != "li" {
1240                return Err(EpubError::NonCanonicalFile {
1241                    tag: "li".to_string(),
1242                });
1243            }
1244
1245            let title_element = item
1246                .find_children_by_names(&["span", "a"])
1247                .next()
1248                .ok_or_else(|| EpubError::NonCanonicalFile {
1249                    tag: "span/a".to_string(),
1250                })?;
1251            let content_href = title_element.get_attr("href").map(PathBuf::from);
1252            let sub_list = if let Some(list) = item.find_children_by_name("ol").next() {
1253                self.parse_catalog_list(list)?
1254            } else {
1255                vec![]
1256            };
1257
1258            catalog.push(NavPoint {
1259                label: title_element.text(),
1260                content: content_href,
1261                children: sub_list,
1262                play_order: None,
1263            });
1264        }
1265
1266        Ok(catalog)
1267    }
1268
1269    /// Converts relative paths in the manifest to normalized paths
1270    /// relative to the EPUB root directory
1271    ///
1272    /// This function processes the href attribute of resources in the EPUB
1273    /// manifest and converts it to a normalized path representation.
1274    /// It handles three types of paths:
1275    /// - Relative paths starting with `../` (checks if they exceed the EPUB package scope)
1276    /// - Absolute paths starting with `/` (relative to the EPUB root directory)
1277    /// - Other relative paths (relative to the directory containing the OPF file)
1278    ///
1279    /// ## Parameters
1280    /// - `path`: The href attribute value of the resource in the manifest
1281    ///
1282    /// ## Return
1283    /// - `Ok(PathBuf)`: The parsed normalized path
1284    /// - `Err(EpubError)`: Relative link leakage
1285    #[inline]
1286    fn normalize_manifest_path(&self, path: &str) -> Result<PathBuf, EpubError> {
1287        let mut path = if path.starts_with("../") {
1288            let mut current_dir = self.epub_path.join(&self.package_path);
1289            current_dir.pop();
1290
1291            check_realtive_link_leakage(self.epub_path.clone(), current_dir, path)
1292                .map(PathBuf::from)
1293                .ok_or_else(|| EpubError::RealtiveLinkLeakage {
1294                    path: path.to_string(),
1295                })?
1296        } else if let Some(path) = path.strip_prefix("/") {
1297            PathBuf::from(path.to_string())
1298        } else {
1299            self.base_path.join(path)
1300        };
1301
1302        #[cfg(windows)]
1303        {
1304            path = PathBuf::from(path.to_string_lossy().replace('\\', "/"));
1305        }
1306
1307        Ok(path)
1308    }
1309
1310    /// Verify the fallback chain of all manifest items
1311    ///
1312    /// This function iterates through all manifest items with the fallback
1313    /// attribute and verifies the validity of their fallback chains, including checking:
1314    /// - Whether circular references exist
1315    /// - Whether the fallback resource exists in the manifest
1316    ///
1317    /// ## Notes
1318    /// If an invalid fallback chain is found, a warning log will be logged
1319    /// but the processing flow will not be interrupted.
1320    fn validate_fallback_chains(&self) {
1321        for (id, item) in &self.manifest {
1322            if item.fallback.is_none() {
1323                continue;
1324            }
1325
1326            let mut fallback_chain = Vec::new();
1327            if let Err(msg) = self.validate_fallback_chain(id, &mut fallback_chain) {
1328                warn!("Invalid fallback chain for item {}: {}", id, msg);
1329            }
1330        }
1331    }
1332
1333    /// Recursively verify the validity of a single fallback chain
1334    ///
1335    /// This function recursively traces the fallback chain to check for the following issues:
1336    /// - Circular reference
1337    /// - The referenced fallback resource does not exist
1338    ///
1339    /// ## Parameters
1340    /// - `manifest_id`: The id of the manifest item currently being verified
1341    /// - `fallback_chain`: The visited fallback chain paths used to detect circular references
1342    ///
1343    /// ## Return
1344    /// - `Ok(())`: The fallback chain is valid
1345    /// - `Err(String)`: A string containing error information
1346    fn validate_fallback_chain(
1347        &self,
1348        manifest_id: &str,
1349        fallback_chain: &mut Vec<String>,
1350    ) -> Result<(), String> {
1351        if fallback_chain.contains(&manifest_id.to_string()) {
1352            fallback_chain.push(manifest_id.to_string());
1353
1354            return Err(format!(
1355                "Circular reference detected in fallback chain for {}",
1356                fallback_chain.join("->")
1357            ));
1358        }
1359
1360        // Get the current item; its existence can be ensured based on the calling context.
1361        let item = self.manifest.get(manifest_id).unwrap();
1362
1363        if let Some(fallback_id) = &item.fallback {
1364            if !self.manifest.contains_key(fallback_id) {
1365                return Err(format!(
1366                    "Fallback resource {} does not exist in manifest",
1367                    fallback_id
1368                ));
1369            }
1370
1371            fallback_chain.push(manifest_id.to_string());
1372            self.validate_fallback_chain(fallback_id, fallback_chain)
1373        } else {
1374            // The end of the fallback chain
1375            Ok(())
1376        }
1377    }
1378
1379    /// Checks if a resource at the specified path is an encrypted file
1380    ///
1381    /// This function queries whether a specific resource path is marked as an encrypted
1382    /// file in the EPUB encryption information. It checks the encrypted data stored in
1383    /// `self.encryption`, looking for an entry that matches the given path.
1384    ///
1385    /// ## Parameters
1386    /// - `path`: The path of the resource to check
1387    ///
1388    /// ## Return
1389    /// - `Some(String)`: The encryption method used for the resource
1390    /// - `None`: The resource is not encrypted
1391    fn is_encryption_file(&self, path: &str) -> Option<String> {
1392        self.encryption.as_ref().and_then(|encryptions| {
1393            encryptions
1394                .iter()
1395                .find(|encryption| encryption.data == path)
1396                .map(|encryption| encryption.method.clone())
1397        })
1398    }
1399
1400    /// Automatically decrypts encrypted resource data
1401    ///
1402    /// Automatically decrypts data based on the provided encryption method.
1403    /// This function supports various encryption methods defined by the EPUB
1404    /// specification, including font obfuscation and the XML encryption standard.
1405    ///
1406    /// ## Parameters
1407    /// - `method`: The encryption method used for the resource
1408    /// - `data`: The encrypted resource data
1409    ///
1410    /// ## Return
1411    /// - `Ok(Vec<u8>)`: The decrypted resource data
1412    /// - `Err(EpubError)`: Unsupported encryption method
1413    ///
1414    /// ## Supported Encryption Methods
1415    /// - IDPF font obfuscation: `http://www.idpf.org/2008/embedding`
1416    /// - Adobe font obfuscation: `http://ns.adobe.com/pdf/enc#RC`
1417    #[inline]
1418    fn auto_dencrypt(&self, method: &str, data: &mut [u8]) -> Result<Vec<u8>, EpubError> {
1419        match method {
1420            "http://www.idpf.org/2008/embedding" => {
1421                Ok(idpf_font_dencryption(data, &self.unique_identifier))
1422            }
1423            "http://ns.adobe.com/pdf/enc#RC" => {
1424                Ok(adobe_font_dencryption(data, &self.unique_identifier))
1425            }
1426            _ => Err(EpubError::UnsupportedEncryptedMethod {
1427                method: method.to_string(),
1428            }),
1429        }
1430    }
1431}
1432
1433impl EpubDoc<BufReader<File>> {
1434    /// Creates a new EPUB document instance
1435    ///
1436    /// This function is a convenience constructor for `EpubDoc`,
1437    /// used to create an EPUB parser instance directly from a file path.
1438    ///
1439    /// ## Parameters
1440    /// - `path`: The path to the EPUB file
1441    ///
1442    /// ## Return
1443    /// - `Ok(EpubDoc)`: The created EPUB document instance
1444    /// - `Err(EpubError)`: An error occurred during initialization
1445    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, EpubError> {
1446        let file = File::open(&path).map_err(EpubError::from)?;
1447        let path = canonicalize(path)?;
1448
1449        Self::from_reader(BufReader::new(file), path)
1450    }
1451}
1452
1453#[cfg(test)]
1454mod tests {
1455    use std::{
1456        fs::File,
1457        io::BufReader,
1458        path::{Path, PathBuf},
1459    };
1460
1461    use crate::{epub::EpubDoc, error::EpubError, utils::XmlReader};
1462
1463    /// Section 3.3 package documents
1464    mod package_documents_tests {
1465        use std::path::Path;
1466
1467        use crate::epub::{EpubDoc, EpubVersion};
1468
1469        /// ID: pkg-collections-unknown
1470        ///
1471        /// The package document contains a collection with an unknown role. The reading system must open the EPUB successfully.
1472        #[test]
1473        fn test_pkg_collections_unknown() {
1474            let epub_file = Path::new("./test_case/pkg-collections-unknown.epub");
1475            let doc = EpubDoc::new(epub_file);
1476            assert!(doc.is_ok());
1477        }
1478
1479        /// ID: pkg-creator-order
1480        ///
1481        /// Several creators are listed in the package document. The reading system must not display them out of order (but it may display only the first).
1482        #[test]
1483        fn test_pkg_creator_order() {
1484            let epub_file = Path::new("./test_case/pkg-creator-order.epub");
1485            let doc = EpubDoc::new(epub_file);
1486            assert!(doc.is_ok());
1487
1488            let doc = doc.unwrap();
1489            let creators = doc.get_metadata_value("creator");
1490            assert!(creators.is_some());
1491
1492            let creators = creators.unwrap();
1493            assert_eq!(creators.len(), 5);
1494            assert_eq!(
1495                creators,
1496                vec![
1497                    "Dave Cramer",
1498                    "Wendy Reid",
1499                    "Dan Lazin",
1500                    "Ivan Herman",
1501                    "Brady Duga",
1502                ]
1503            );
1504        }
1505
1506        /// ID: pkg-manifest-unknown
1507        ///
1508        /// The package document contains a manifest item with unknown properties. The reading system must open the EPUB successfully.
1509        #[test]
1510        fn test_pkg_manifest_order() {
1511            let epub_file = Path::new("./test_case/pkg-manifest-unknown.epub");
1512            let doc = EpubDoc::new(epub_file);
1513            assert!(doc.is_ok());
1514
1515            let mut doc = doc.unwrap();
1516            assert_eq!(doc.manifest.len(), 2);
1517            assert!(doc.get_manifest_item("nav").is_ok());
1518            assert!(doc.get_manifest_item("content_001").is_ok());
1519            assert!(doc.get_manifest_item("content_002").is_err());
1520        }
1521
1522        /// ID: pkg-meta-unknown
1523        ///
1524        /// The package document contains a meta tag with an unknown property. The reading system must open the EPUB successfully.
1525        #[test]
1526        fn test_pkg_meta_unknown() {
1527            let epub_file = Path::new("./test_case/pkg-meta-unknown.epub");
1528            let doc = EpubDoc::new(epub_file);
1529            assert!(doc.is_ok());
1530
1531            let doc = doc.unwrap();
1532            let value = doc.get_metadata_value("dcterms:isReferencedBy");
1533            assert!(value.is_some());
1534            let value = value.unwrap();
1535            assert_eq!(value.len(), 1);
1536            assert_eq!(
1537                value,
1538                vec!["https://www.w3.org/TR/epub-rs/#confreq-rs-pkg-meta-unknown"]
1539            );
1540
1541            let value = doc.get_metadata_value("dcterms:modified");
1542            assert!(value.is_some());
1543            let value = value.unwrap();
1544            assert_eq!(value.len(), 1);
1545            assert_eq!(value, vec!["2021-01-11T00:00:00Z"]);
1546
1547            let value = doc.get_metadata_value("dcterms:title");
1548            assert!(value.is_none());
1549        }
1550
1551        /// ID: pkg-meta-whitespace
1552        ///
1553        /// The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases.
1554        #[test]
1555        fn test_pkg_meta_white_space() {
1556            let epub_file = Path::new("./test_case/pkg-meta-whitespace.epub");
1557            let doc = EpubDoc::new(epub_file);
1558            assert!(doc.is_ok());
1559
1560            let doc = doc.unwrap();
1561            let value = doc.get_metadata_value("creator");
1562            assert!(value.is_some());
1563            let value = value.unwrap();
1564            assert_eq!(value.len(), 1);
1565            assert_eq!(value, vec!["Dave Cramer"]);
1566
1567            let value = doc.get_metadata_value("description");
1568            assert!(value.is_some());
1569            let value = value.unwrap();
1570            assert_eq!(value.len(), 1);
1571            assert_eq!(
1572                value,
1573                vec![
1574                    "The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases."
1575                ]
1576            );
1577        }
1578
1579        /// ID: pkg-spine-duplicate-item-hyperlink
1580        ///
1581        /// The spine contains several references to the same content document. The reading system must move to the position of the first duplicate in the reading order when following a hyperlink.
1582        #[test]
1583        fn test_pkg_spine_duplicate_item_hyperlink() {
1584            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-hyperlink.epub");
1585            let doc = EpubDoc::new(epub_file);
1586            assert!(doc.is_ok());
1587
1588            let mut doc = doc.unwrap();
1589            assert_eq!(doc.spine.len(), 4);
1590            assert_eq!(
1591                doc.navigate_by_spine_index(0).unwrap(),
1592                doc.get_manifest_item("content_001").unwrap()
1593            );
1594            assert_eq!(
1595                doc.navigate_by_spine_index(1).unwrap(),
1596                doc.get_manifest_item("content_002").unwrap()
1597            );
1598            assert_eq!(
1599                doc.navigate_by_spine_index(2).unwrap(),
1600                doc.get_manifest_item("content_002").unwrap()
1601            );
1602            assert_eq!(
1603                doc.navigate_by_spine_index(3).unwrap(),
1604                doc.get_manifest_item("content_002").unwrap()
1605            );
1606        }
1607
1608        /// ID: pkg-spine-duplicate-item-rendering
1609        ///
1610        /// The spine contains several references to the same content document. The reading system must not skip the duplicates when rendering the reading order.
1611        #[test]
1612        fn test_pkg_spine_duplicate_item_rendering() {
1613            let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-rendering.epub");
1614            let doc = EpubDoc::new(epub_file);
1615            assert!(doc.is_ok());
1616
1617            let mut doc = doc.unwrap();
1618            assert_eq!(doc.spine.len(), 4);
1619
1620            let result = doc.spine_prev();
1621            assert!(result.is_none());
1622
1623            let result = doc.spine_next();
1624            assert!(result.is_some());
1625
1626            doc.spine_next();
1627            doc.spine_next();
1628            let result = doc.spine_next();
1629            assert!(result.is_none());
1630        }
1631
1632        /// ID: pkg-spine-nonlinear-activation
1633        ///
1634        /// An itemref in the spine is marked as non-linear. Although it (possibly) cannot be accessed through the table of contents, it can be reached from a link in the XHTML content.
1635        #[test]
1636        fn test_pkg_spine_nonlinear_activation() {
1637            let epub_file = Path::new("./test_case/pkg-spine-nonlinear-activation.epub");
1638            let doc = EpubDoc::new(epub_file);
1639            assert!(doc.is_ok());
1640
1641            let mut doc = doc.unwrap();
1642            assert!(doc.spine_prev().is_none());
1643            assert!(doc.spine_next().is_none());
1644
1645            assert!(doc.navigate_by_spine_index(1).is_some());
1646            assert!(doc.spine_prev().is_none());
1647            assert!(doc.spine_next().is_none());
1648        }
1649
1650        /// ID: pkg-spine-order
1651        ///
1652        /// Basic test of whether a reading system can display spine items in the correct order. The test fails if the reading system presents content in the order in which the file names sort, or if it presents files in manifest order rather than spine order.
1653        #[test]
1654        fn test_pkg_spine_order() {
1655            let epub_file = Path::new("./test_case/pkg-spine-order.epub");
1656            let doc = EpubDoc::new(epub_file);
1657            assert!(doc.is_ok());
1658
1659            let doc = doc.unwrap();
1660            assert_eq!(doc.spine.len(), 4);
1661            assert_eq!(
1662                doc.spine
1663                    .iter()
1664                    .map(|item| item.idref.clone())
1665                    .collect::<Vec<String>>(),
1666                vec![
1667                    "d-content_001",
1668                    "c-content_002",
1669                    "b-content_003",
1670                    "a-content_004",
1671                ]
1672            );
1673        }
1674
1675        /// ID: pkg-spine-order-svg
1676        ///
1677        /// Basic test of whether a reading system can display SVG spine items in the correct order.
1678        #[test]
1679        fn test_spine_order_svg() {
1680            let epub_file = Path::new("./test_case/pkg-spine-order-svg.epub");
1681            let doc = EpubDoc::new(epub_file);
1682            assert!(doc.is_ok());
1683
1684            let mut doc = doc.unwrap();
1685            assert_eq!(doc.spine.len(), 4);
1686
1687            loop {
1688                if let Some(spine) = doc.spine_next() {
1689                    let idref = doc.spine[doc.current_spine_index].idref.clone();
1690                    let resource = doc.get_manifest_item(&idref);
1691                    assert!(resource.is_ok());
1692
1693                    let resource = resource.unwrap();
1694                    assert_eq!(spine, resource);
1695                } else {
1696                    break;
1697                }
1698            }
1699
1700            assert_eq!(doc.current_spine_index, 3);
1701        }
1702
1703        /// ID: pkg-spine-unknown
1704        ///
1705        /// The package document contains a spine item with unknown properties. The reading system must open the EPUB successfully.
1706        #[test]
1707        fn test_pkg_spine_unknown() {
1708            let epub_file = Path::new("./test_case/pkg-spine-unknown.epub");
1709            let doc = EpubDoc::new(epub_file);
1710            assert!(doc.is_ok());
1711
1712            let doc = doc.unwrap();
1713            assert_eq!(doc.spine.len(), 1);
1714            assert_eq!(doc.spine[0].idref, "content_001");
1715            assert_eq!(doc.spine[0].id, None);
1716            assert_eq!(doc.spine[0].linear, true);
1717            assert_eq!(doc.spine[0].properties, Some("untrustworthy".to_string()));
1718        }
1719
1720        /// ID: pkg-title-order
1721        ///
1722        /// Several titles are listed in the package document. The reading system must use the first title (and whether to use other titles is not defined).
1723        #[test]
1724        fn test_pkg_title_order() {
1725            let epub_file = Path::new("./test_case/pkg-title-order.epub");
1726            let doc = EpubDoc::new(epub_file);
1727            assert!(doc.is_ok());
1728
1729            let doc = doc.unwrap();
1730            let title_list = doc.get_title();
1731            assert!(title_list.is_ok());
1732
1733            let title_list = title_list.unwrap();
1734            assert_eq!(title_list.len(), 6);
1735            assert_eq!(
1736                title_list,
1737                vec![
1738                    "pkg-title-order",
1739                    "This title must not display first",
1740                    "Also, this title must not display first",
1741                    "This title also must not display first",
1742                    "This title must also not display first",
1743                    "This title must not display first, also",
1744                ]
1745            );
1746        }
1747
1748        /// ID: pkg-unique-id
1749        ///
1750        /// The package document's dc:identifier is identical across two publications. The reading system should display both publications independently.
1751        #[test]
1752        fn test_pkg_unique_id() {
1753            let epub_file = Path::new("./test_case/pkg-unique-id.epub");
1754            let doc_1 = EpubDoc::new(epub_file);
1755            assert!(doc_1.is_ok());
1756
1757            let epub_file = Path::new("./test_case/pkg-unique-id_duplicate.epub");
1758            let doc_2 = EpubDoc::new(epub_file);
1759            assert!(doc_2.is_ok());
1760
1761            let doc_1 = doc_1.unwrap();
1762            let doc_2 = doc_2.unwrap();
1763
1764            assert_eq!(
1765                doc_1.get_identifier().unwrap(),
1766                doc_2.get_identifier().unwrap()
1767            );
1768            assert_eq!(doc_1.unique_identifier, "pkg-unique-id");
1769            assert_eq!(doc_2.unique_identifier, "pkg-unique-id");
1770        }
1771
1772        /// ID: pkg-version-backward
1773        ///
1774        /// “Reading Systems MUST attempt to process an EPUB Publication whose Package Document version attribute is less than "3.0"”. This is an EPUB with package version attribute set to "0", to see if a reading system will open it.
1775        #[test]
1776        fn test_pkg_version_backward() {
1777            let epub_file = Path::new("./test_case/pkg-version-backward.epub");
1778            let doc = EpubDoc::new(epub_file);
1779            assert!(doc.is_ok());
1780
1781            let doc = doc.unwrap();
1782            assert_eq!(doc.version, EpubVersion::Version3_0);
1783        }
1784
1785        /// ID: pkg-linked-records
1786        ///
1787        /// Reading System must process and display the title and creator metadata from the package document. An ONIX 3.0 format linked metadata record exists, but contains neither title nor creator metadata.
1788        #[test]
1789        fn test_pkg_linked_records() {
1790            let epub_file = Path::new("./test_case/pkg-linked-records.epub");
1791            let doc = EpubDoc::new(epub_file);
1792            assert!(doc.is_ok());
1793
1794            let doc = doc.unwrap();
1795            assert_eq!(doc.metadata_link.len(), 3);
1796
1797            let item = doc.metadata_link.iter().find(|&item| {
1798                if let Some(properties) = &item.properties {
1799                    properties.eq("onix")
1800                } else {
1801                    false
1802                }
1803            });
1804            assert!(item.is_some());
1805        }
1806
1807        /// ID: pkg-manifest-unlisted-resource
1808        ///
1809        /// The XHTML content references an image that does not appear in the manifest. The image should not be shown.
1810        #[test]
1811        fn test_pkg_manifest_unlisted_resource() {
1812            let epub_file = Path::new("./test_case/pkg-manifest-unlisted-resource.epub");
1813            let doc = EpubDoc::new(epub_file);
1814            assert!(doc.is_ok());
1815
1816            let mut doc = doc.unwrap();
1817            assert!(
1818                doc.get_manifest_item_by_path("EPUB/content_001.xhtml")
1819                    .is_ok()
1820            );
1821
1822            assert!(doc.get_manifest_item_by_path("EPUB/red.png").is_err());
1823            let err = doc.get_manifest_item_by_path("EPUB/red.png").unwrap_err();
1824            assert_eq!(
1825                err.to_string(),
1826                "Resource not found: Unable to find resource from \"EPUB/red.png\"."
1827            );
1828        }
1829    }
1830
1831    /// Section 3.4 manifest fallbacks
1832    ///
1833    /// The tests under this module seem to favor the reading system rather than the EPUB format itself
1834    mod manifest_fallbacks_tests {
1835        use std::path::Path;
1836
1837        use crate::epub::EpubDoc;
1838
1839        /// ID: pub-foreign_bad-fallback
1840        ///
1841        /// This is a test of manifest fallbacks where both the spine item and the fallback are likely to be unsupported. The spine item is a DMG, with a fallback to a PSD file. Reading systems may raise an error on the ingenstion workflow.
1842        #[test]
1843        fn test_pub_foreign_bad_fallback() {
1844            let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
1845            let doc = EpubDoc::new(epub_file);
1846            assert!(doc.is_ok());
1847
1848            let mut doc = doc.unwrap();
1849            assert!(doc.get_manifest_item("content_001").is_ok());
1850            assert!(doc.get_manifest_item("bar").is_ok());
1851
1852            assert_eq!(
1853                doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
1854                    .unwrap_err()
1855                    .to_string(),
1856                "No supported file format: The fallback resource does not contain the file format you support."
1857            );
1858        }
1859
1860        /// ID: pub-foreign_image
1861        ///
1862        /// An HTML content file contains a PSD image, with a manifest fallback to a PNG image. This tests fallbacks for resources that are not in the spine.
1863        #[test]
1864        fn test_pub_foreign_image() {
1865            let epub_file = Path::new("./test_case/pub-foreign_image.epub");
1866            let doc = EpubDoc::new(epub_file);
1867            assert!(doc.is_ok());
1868
1869            let mut doc = doc.unwrap();
1870            let result = doc.get_manifest_item_with_fallback(
1871                "image-tiff",
1872                vec!["image/png", "application/xhtml+xml"],
1873            );
1874            assert!(result.is_ok());
1875
1876            let (_, mime) = result.unwrap();
1877            assert_eq!(mime, "image/png");
1878        }
1879
1880        /// ID: pub-foreign_json-spine
1881        ///
1882        /// This EPUB uses a JSON content file in the spine, with a manifest fallback to an HTML document. If the reading system does not support JSON, it should display the HTML.
1883        #[test]
1884        fn test_pub_foreign_json_spine() {
1885            let epub_file = Path::new("./test_case/pub-foreign_json-spine.epub");
1886            let doc = EpubDoc::new(epub_file);
1887            assert!(doc.is_ok());
1888
1889            let mut doc = doc.unwrap();
1890            let result = doc.get_manifest_item_with_fallback(
1891                "content_primary",
1892                vec!["application/xhtml+xml", "application/json"],
1893            );
1894            assert!(result.is_ok());
1895            let (_, mime) = result.unwrap();
1896            assert_eq!(mime, "application/json");
1897
1898            let result = doc
1899                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1900            assert!(result.is_ok());
1901            let (_, mime) = result.unwrap();
1902            assert_eq!(mime, "application/xhtml+xml");
1903        }
1904
1905        /// ID: pub-foreign_xml-spine
1906        ///
1907        /// This EPUB uses an ordinary XML content file with mimetype application/xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1908        #[test]
1909        fn test_pub_foreign_xml_spine() {
1910            let epub_file = Path::new("./test_case/pub-foreign_xml-spine.epub");
1911            let doc = EpubDoc::new(epub_file);
1912            assert!(doc.is_ok());
1913
1914            let mut doc = doc.unwrap();
1915            let result = doc.get_manifest_item_with_fallback(
1916                "content_primary",
1917                vec!["application/xhtml+xml", "application/xml"],
1918            );
1919            assert!(result.is_ok());
1920            let (_, mime) = result.unwrap();
1921            assert_eq!(mime, "application/xml");
1922
1923            let result = doc
1924                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1925            assert!(result.is_ok());
1926            let (_, mime) = result.unwrap();
1927            assert_eq!(mime, "application/xhtml+xml");
1928        }
1929
1930        /// ID: pub-foreign_xml-suffix-spine
1931        ///
1932        /// This EPUB uses an custom XML content file with mimetype application/dtc+xml in the spine, with a manifest fallback to an HTML document. If the reading system does not support XML, it should display the HTML.
1933        #[test]
1934        fn test_pub_foreign_xml_suffix_spine() {
1935            let epub_file = Path::new("./test_case/pub-foreign_xml-suffix-spine.epub");
1936            let doc = EpubDoc::new(epub_file);
1937            assert!(doc.is_ok());
1938
1939            let mut doc = doc.unwrap();
1940            let result = doc.get_manifest_item_with_fallback(
1941                "content_primary",
1942                vec!["application/xhtml+xml", "application/dtc+xml"],
1943            );
1944            assert!(result.is_ok());
1945            let (_, mime) = result.unwrap();
1946            assert_eq!(mime, "application/dtc+xml");
1947
1948            let result = doc
1949                .get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
1950            assert!(result.is_ok());
1951            let (_, mime) = result.unwrap();
1952            assert_eq!(mime, "application/xhtml+xml");
1953        }
1954    }
1955
1956    /// Section 3.9 open container format
1957    mod open_container_format_tests {
1958        use std::{cmp::min, io::Read, path::Path};
1959
1960        use sha1::{Digest, Sha1};
1961
1962        use crate::epub::EpubDoc;
1963
1964        /// ID: ocf-metainf-inc
1965        ///
1966        /// An extra configuration file, not in the reserved files' list, is added to the META-INF folder; this file must be ignored.
1967        #[test]
1968        fn test_ocf_metainf_inc() {
1969            let epub_file = Path::new("./test_case/ocf-metainf-inc.epub");
1970            let doc = EpubDoc::new(epub_file);
1971            assert!(doc.is_ok());
1972        }
1973
1974        /// ID: ocf-metainf-manifest
1975        ///
1976        /// An ancillary manifest file, containing an extra spine item, is present in the META-INF directory; this extra item must be ignored by the reading system.
1977        #[test]
1978        fn test_ocf_metainf_manifest() {
1979            let epub_file = Path::new("./test_case/ocf-metainf-manifest.epub");
1980            let doc = EpubDoc::new(epub_file);
1981            assert!(doc.is_ok());
1982        }
1983
1984        /// ID: ocf-package_arbitrary
1985        ///
1986        /// The EPUB contains three valid package files and three corresponding sets of content documents, but only one of the packages, in an unusual subdirectory, is referenced by the container.xml file. The reading system must use this package.
1987        #[test]
1988        fn test_ocf_package_arbitrary() {
1989            let epub_file = Path::new("./test_case/ocf-package_arbitrary.epub");
1990            let doc = EpubDoc::new(epub_file);
1991            assert!(doc.is_ok());
1992
1993            let doc = doc.unwrap();
1994            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
1995        }
1996
1997        /// ID: ocf-package_multiple
1998        ///
1999        /// The EPUB contains three valid package files and three corresponding sets of content documents, all referenced by the container.xml file. The reading system must use the first package.
2000        #[test]
2001        fn test_ocf_package_multiple() {
2002            let epub_file = Path::new("./test_case/ocf-package_multiple.epub");
2003            let doc = EpubDoc::new(epub_file);
2004            assert!(doc.is_ok());
2005
2006            let doc = doc.unwrap();
2007            assert_eq!(doc.package_path, Path::new("FOO/BAR/package.opf"));
2008            assert_eq!(doc.base_path, Path::new("FOO/BAR"));
2009        }
2010
2011        /// ID: ocf-url_link-leaking-relative
2012        ///
2013        /// Use a relative link with several double-dot path segments from the content to a photograph. The folder hierarchy containing the photograph starts at the root level; the relative image reference exceeds depth of hierarchy.
2014        #[test]
2015        fn test_ocf_url_link_leaking_relative() {
2016            let epub_file = Path::new("./test_case/ocf-url_link-leaking-relative.epub");
2017            let doc = EpubDoc::new(epub_file);
2018            assert!(doc.is_err());
2019            assert_eq!(
2020                doc.err().unwrap().to_string(),
2021                String::from(
2022                    "Relative link leakage: Path \"../../../../media/imgs/monastery.jpg\" is out of container range."
2023                )
2024            )
2025        }
2026
2027        /// ID: ocf-url_link-path-absolute
2028        ///
2029        /// Use a path-absolute link, i.e., beginning with a leading slash, from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2030        #[test]
2031        fn test_ocf_url_link_path_absolute() {
2032            let epub_file = Path::new("./test_case/ocf-url_link-path-absolute.epub");
2033            let doc = EpubDoc::new(epub_file);
2034            assert!(doc.is_ok());
2035
2036            let doc = doc.unwrap();
2037            let resource = doc.manifest.get("photo").unwrap();
2038            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2039        }
2040
2041        /// ID: ocf-url_link-relative
2042        ///
2043        /// A simple relative link from the content to a photograph. The folder hierarchy containing the photograph starts at the root level.
2044        #[test]
2045        fn test_ocf_url_link_relative() {
2046            let epub_file = Path::new("./test_case/ocf-url_link-relative.epub");
2047            let doc = EpubDoc::new(epub_file);
2048            assert!(doc.is_ok());
2049
2050            let doc = doc.unwrap();
2051            let resource = doc.manifest.get("photo").unwrap();
2052            assert_eq!(resource.path, Path::new("media/imgs/monastery.jpg"));
2053        }
2054
2055        /// ID: ocf-url_manifest
2056        ///
2057        /// The manifest refers to an XHTML file in an arbitrary subfolder. The reading system must be able to find the content.
2058        #[test]
2059        fn test_ocf_url_manifest() {
2060            let epub_file = Path::new("./test_case/ocf-url_manifest.epub");
2061            let doc = EpubDoc::new(epub_file);
2062            assert!(doc.is_ok());
2063
2064            let mut doc = doc.unwrap();
2065            assert!(doc.get_manifest_item("nav").is_ok());
2066            assert!(doc.get_manifest_item("content_001").is_ok());
2067            assert!(doc.get_manifest_item("content_002").is_err());
2068        }
2069
2070        /// ID: ocf-url_relative
2071        ///
2072        /// The manifest refers to an XHTML file in an arbitrary subfolder that is relative to the package's own arbitrary folder. The reading system must be able to find the content.
2073        #[test]
2074        fn test_ocf_url_relative() {
2075            let epub_file = Path::new("./test_case/ocf-url_relative.epub");
2076            let doc = EpubDoc::new(epub_file);
2077            assert!(doc.is_ok());
2078
2079            let mut doc = doc.unwrap();
2080            assert_eq!(doc.package_path, Path::new("foo/BAR/baz.opf"));
2081            assert_eq!(doc.base_path, Path::new("foo/BAR"));
2082            assert_eq!(
2083                doc.manifest.get("nav").unwrap().path,
2084                Path::new("foo/BAR/nav.xhtml")
2085            );
2086            assert_eq!(
2087                doc.manifest.get("content_001").unwrap().path,
2088                Path::new("foo/BAR/qux/content_001.xhtml")
2089            );
2090            assert!(doc.get_manifest_item("nav").is_ok());
2091            assert!(doc.get_manifest_item("content_001").is_ok());
2092        }
2093
2094        /// ID: ocf-zip-comp
2095        ///
2096        /// MUST treat any OCF ZIP container that uses compression techniques other than Deflate as in error.
2097        /// This test case does not use compression methods other than Deflate and cannot detect whether it is effective.
2098        #[test]
2099        fn test_ocf_zip_comp() {
2100            let epub_file = Path::new("./test_case/ocf-zip-comp.epub");
2101            let doc = EpubDoc::new(epub_file);
2102            assert!(doc.is_ok());
2103        }
2104
2105        /// ID: ocf-zip-mult
2106        ///
2107        /// MUST treat any OCF ZIP container that splits the content into segments as in error.
2108        /// This test case is not a segmented OCF ZIP container and cannot be tested to see if it is valid.
2109        #[test]
2110        fn test_ocf_zip_mult() {
2111            let epub_file = Path::new("./test_case/ocf-zip-mult.epub");
2112            let doc = EpubDoc::new(epub_file);
2113            assert!(doc.is_ok());
2114        }
2115
2116        /// ID: ocf-font_obfuscation
2117        ///
2118        /// An obfuscated (TrueType) font should be displayed after de-obfuscation.
2119        #[test]
2120        fn test_ocf_font_obfuscation() {
2121            let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
2122            let doc = EpubDoc::new(epub_file);
2123            assert!(doc.is_ok());
2124
2125            let mut doc = doc.unwrap();
2126            let unique_id = doc.unique_identifier.clone();
2127
2128            let mut hasher = Sha1::new();
2129            hasher.update(unique_id.as_bytes());
2130            let hash = hasher.finalize();
2131            let mut key = vec![0u8; 1040];
2132            for i in 0..1040 {
2133                key[i] = hash[i % hash.len()];
2134            }
2135
2136            assert!(doc.encryption.is_some());
2137            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2138
2139            let data = &doc.encryption.unwrap()[0];
2140            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2141
2142            let font_file = doc
2143                .archive
2144                .by_name(&data.data)
2145                .unwrap()
2146                .bytes()
2147                .collect::<Result<Vec<u8>, _>>();
2148            assert!(font_file.is_ok());
2149            let font_file = font_file.unwrap();
2150
2151            // 根据EPUB规范，字体混淆是直接对字体文件进行的，不需要解压步骤，直接进行去混淆处理
2152            let mut deobfuscated = font_file.clone();
2153            for i in 0..min(1040, deobfuscated.len()) {
2154                deobfuscated[i] ^= key[i];
2155            }
2156
2157            assert!(is_valid_font(&deobfuscated));
2158        }
2159
2160        /// ID: ocf-font_obfuscation-bis
2161        ///
2162        /// An obfuscated (TrueType) font should not be displayed after de-obfuscation, because the obfuscation used a different publication id.
2163        #[test]
2164        fn test_ocf_font_obfuscation_bis() {
2165            let epub_file = Path::new("./test_case/ocf-font_obfuscation_bis.epub");
2166            let doc = EpubDoc::new(epub_file);
2167            assert!(doc.is_ok());
2168
2169            let mut doc = doc.unwrap();
2170
2171            let wrong_unique_id = "wrong-publication-id";
2172            let mut hasher = Sha1::new();
2173            hasher.update(wrong_unique_id.as_bytes());
2174            let hash = hasher.finalize();
2175            let mut wrong_key = vec![0u8; 1040];
2176            for i in 0..1040 {
2177                wrong_key[i] = hash[i % hash.len()];
2178            }
2179
2180            assert!(doc.encryption.is_some());
2181            assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);
2182
2183            let data = &doc.encryption.unwrap()[0];
2184            assert_eq!(data.method, "http://www.idpf.org/2008/embedding");
2185
2186            let font_file = doc
2187                .archive
2188                .by_name(&data.data)
2189                .unwrap()
2190                .bytes()
2191                .collect::<Result<Vec<u8>, _>>();
2192            assert!(font_file.is_ok());
2193            let font_file = font_file.unwrap();
2194
2195            // 使用错误的密钥进行去混淆
2196            let mut deobfuscated_with_wrong_key = font_file.clone();
2197            for i in 0..std::cmp::min(1040, deobfuscated_with_wrong_key.len()) {
2198                deobfuscated_with_wrong_key[i] ^= wrong_key[i];
2199            }
2200
2201            assert!(!is_valid_font(&deobfuscated_with_wrong_key));
2202        }
2203
2204        fn is_valid_font(data: &[u8]) -> bool {
2205            if data.len() < 4 {
2206                return false;
2207            }
2208            let sig = &data[0..4];
2209            // OTF: "OTTO"
2210            // TTF: 0x00010000, 0x00020000, "true", "typ1"
2211            sig == b"OTTO"
2212                || sig == b"\x00\x01\x00\x00"
2213                || sig == b"\x00\x02\x00\x00"
2214                || sig == b"true"
2215                || sig == b"typ1"
2216        }
2217    }
2218
/// Feeds three `container.xml` payloads to `EpubDoc::parse_container`: an
/// empty `<rootfiles>` list, a `<rootfile>` lacking `full-path`, and a
/// well-formed document.
#[test]
fn test_parse_container() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-zip-mult.epub"));
    assert!(opened.is_ok());

    // The call is made without an instance, so the reader type parameter is
    // pinned explicitly.
    let parse = |xml: &str| EpubDoc::<BufReader<File>>::parse_container(xml.to_string());

    // No `<rootfile>` element at all.
    let outcome = parse(
        r#"
        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
            <rootfiles></rootfiles>
        </container>
        "#,
    );
    assert!(outcome.is_err());
    assert_eq!(
        outcome.unwrap_err(),
        EpubError::NonCanonicalFile {
            tag: "rootfile".to_string()
        }
    );

    // `<rootfile>` is present but carries no `full-path` attribute.
    let outcome = parse(
        r#"
        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
            <rootfiles>
                <rootfile media-type="application/oebps-package+xml"/>
            </rootfiles>
        </container>
        "#,
    );
    assert!(outcome.is_err());
    assert_eq!(
        outcome.unwrap_err(),
        EpubError::MissingRequiredAttribute {
            tag: "rootfile".to_string(),
            attribute: "full-path".to_string(),
        }
    );

    // Complete `<rootfile>`: the `full-path` value comes back as a path.
    let outcome = parse(
        r#"
        <container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
            <rootfiles>
                <rootfile media-type="application/oebps-package+xml" full-path="EPUB/content.opf"/>
            </rootfiles>
        </container>
        "#,
    );
    assert!(outcome.is_ok());
    assert_eq!(outcome.unwrap(), PathBuf::from("EPUB/content.opf"))
}
2274
/// Checks that `parse_manifest` rejects `<item>` elements missing `id`,
/// `href` or `media-type` (in that order) and accepts a complete manifest.
#[test]
fn test_parse_manifest() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-package_multiple.epub"));
    assert!(opened.is_ok());
    let mut book = opened.unwrap();

    // Each entry pairs a manifest fragment with the attribute it leaves out.
    let rejected = [
        (
            r#"
        <manifest>
            <item href="content_001.xhtml" media-type="application/xhtml+xml"/>
            <item properties="nav" href="nav.xhtml" media-type="application/xhtml+xml"/>
        </manifest>
        "#,
            "id",
        ),
        (
            r#"
        <manifest>
            <item id="content_001" media-type="application/xhtml+xml"/>
            <item id="nav" properties="nav" media-type="application/xhtml+xml"/>
        </manifest>
        "#,
            "href",
        ),
        (
            r#"
        <manifest>
            <item id="content_001" href="content_001.xhtml"/>
            <item id="nav" properties="nav" href="nav.xhtml"/>
        </manifest>
        "#,
            "media-type",
        ),
    ];

    for (fragment, missing) in rejected {
        let parsed = XmlReader::parse(fragment);
        assert!(parsed.is_ok());

        let root = parsed.unwrap();
        let outcome = book.parse_manifest(&root);
        assert!(outcome.is_err());
        assert_eq!(
            outcome.unwrap_err(),
            EpubError::MissingRequiredAttribute {
                tag: "item".to_string(),
                attribute: missing.to_string(),
            },
        );
    }

    // With every required attribute present the manifest parses cleanly.
    let complete = r#"
        <manifest>
            <item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
            <item id="nav" properties="nav" href="nav.xhtml" media-type="application/xhtml+xml"/>
        </manifest>
        "#;
    let parsed = XmlReader::parse(complete);
    assert!(parsed.is_ok());

    let root = parsed.unwrap();
    let outcome = book.parse_manifest(&root);
    assert!(outcome.is_ok());
}
2355
/// Verifies that `has_encryption` reports `true` for the font-obfuscation
/// fixture.
#[test]
fn test_fn_has_encryption() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-font_obfuscation.epub"));
    assert!(opened.is_ok());

    let mut book = opened.unwrap();
    let encrypted = book.has_encryption();
    assert!(encrypted);
}
2366
/// Verifies that the encryption data loaded from the font-obfuscation
/// fixture holds exactly one entry: the IDPF embedding method applied to
/// `EPUB/fonts/Lobster.ttf`.
#[test]
fn test_fn_parse_encryption() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-font_obfuscation.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    assert!(book.encryption.is_some());

    let entries = book.encryption.unwrap();
    assert_eq!(entries.len(), 1);

    let entry = &entries[0];
    assert_eq!(entry.method, "http://www.idpf.org/2008/embedding");
    assert_eq!(entry.data, "EPUB/fonts/Lobster.ttf");
}
2382
/// Looks up the `title` and `language` metadata of the EPUB 3.3 fixture and
/// checks that each yields exactly one item with the expected value, then
/// cross-checks `get_language`.
#[test]
fn test_get_metadata_existing_key() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();

    // (metadata key, the single value expected under it)
    for (key, expected) in [("title", "EPUB 3.3"), ("language", "en-us")] {
        let found = book.get_metadata(key);
        assert!(found.is_some());

        let found = found.unwrap();
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].property, key);
        assert_eq!(found[0].value, expected);
    }

    let languages = book.get_language();
    assert!(languages.is_ok());
    assert_eq!(languages.unwrap(), vec!["en-us"]);
}
2411
/// Verifies that `get_metadata` returns `None` for a key the fixture does
/// not define.
#[test]
fn test_get_metadata_nonexistent_key() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    let lookup = book.get_metadata("nonexistent");
    assert!(lookup.is_none());
}
2422
/// Verifies that all three `creator` entries of the EPUB 3.3 fixture are
/// returned in order, each with its id, property name and value.
#[test]
fn test_get_metadata_multiple_items_same_type() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();

    let creators = book.get_metadata("creator");
    assert!(creators.is_some());

    let creators = creators.unwrap();
    assert_eq!(creators.len(), 3);

    // (expected id, expected value), in the order the items are returned.
    let expected = [
        ("creator_id_0", "Matt Garrish, DAISY Consortium"),
        ("creator_id_1", "Ivan Herman, W3C"),
        ("creator_id_2", "Dave Cramer, Invited Expert"),
    ];

    for (creator, (id, value)) in creators.iter().zip(expected) {
        assert_eq!(creator.id, Some(id.to_string()));
        assert_eq!(creator.property, "creator");
        assert_eq!(creator.value, value);
    }
}
2449
/// Verifies that the single `title` item of the EPUB 3.3 fixture carries
/// one refinement: `title-type` with the value `main`.
#[test]
fn test_get_metadata_with_refinement() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();

    let titles = book.get_metadata("title");
    assert!(titles.is_some());

    let titles = titles.unwrap();
    assert_eq!(titles.len(), 1);

    let refinements = &titles[0].refined;
    assert_eq!(refinements.len(), 1);

    let refinement = &refinements[0];
    assert_eq!(refinement.property, "title-type");
    assert_eq!(refinement.value, "main");
}
2467
/// Exercises `get_manifest_item_with_fallback` against a fixture whose
/// manifest declares a fallback chain.
///
/// Two cases are covered: a supported media type that is reachable through
/// the chain, and a supported media type that is not.
#[test]
fn test_get_manifest_item_with_fallback() {
    let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
    let doc = EpubDoc::new(epub_file);
    assert!(doc.is_ok());

    let mut doc = doc.unwrap();
    assert!(doc.get_manifest_item("content_001").is_ok());
    assert!(doc.get_manifest_item("bar").is_ok());

    // A resource is returned when the fallback chain offers a supported format.
    // Failing through `panic!` (rather than a constant `assert!(false, ..)`)
    // lets the report include the underlying error.
    match doc.get_manifest_item_with_fallback("content_001", vec!["image/psd"]) {
        Ok((_, mime)) => assert_eq!(mime, "image/psd"),
        Err(err) => panic!("get_manifest_item_with_fallback failed: {}", err),
    }

    // No resource is returned when the fallback chain offers no supported format.
    assert_eq!(
        doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
            .unwrap_err()
            .to_string(),
        "No supported file format: The fallback resource does not contain the file format you support."
    );
}
2494
/// Verifies that `get_cover` returns the cover of the cover-image fixture:
/// 5785 bytes with media type `image/jpeg`.
#[test]
fn test_get_cover() {
    let opened = EpubDoc::new(Path::new("./test_case/pkg-cover-image.epub"));
    // Print the load error before the assertion below fails on it.
    if let Err(failure) = &opened {
        println!("{}", failure);
    }
    assert!(opened.is_ok());

    let mut book = opened.unwrap();
    let cover = book.get_cover();
    assert!(cover.is_some());

    let (bytes, media_type) = cover.unwrap();
    assert_eq!(bytes.len(), 5785);
    assert_eq!(media_type, "image/jpeg");
}
2512
/// Verifies that the EPUB 2 fixture loads and that `get_title` returns its
/// single title.
#[test]
fn test_epub_2() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-2.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();

    let found_titles = book.get_title();
    assert!(found_titles.is_ok());
    assert_eq!(found_titles.unwrap(), vec!["Minimal EPUB 2.0"]);
}
2525}
lib_epub/epub.rs

lib_epub/
epub.rs