lib_epub/epub.rs
1//! The core module of the EPUB parsing library
2//!
3//! This module provides complete parsing functionality for EPUB ebook files
4//! and is the core component of the entire library. The `EpubDoc` structure
5//! encapsulates all the parsing logic and data access interfaces for EPUB files.
6//!
7//! ## Main references to EPUB specs:
8//! - <https://www.w3.org/TR/epub-33>
9//! - <https://idpf.org/epub/201>
10//!
11//! ## Potential Issues
12//! - The generic parameter `R: Read + Seek` increases complexity, particularly
13//! in asynchronous environments. The current design is not conducive to multi-threaded
14//! concurrent access and requires an external synchronization mechanism.
15//! - Some error handling may not be sufficiently nuanced, and certain edge cases
16//! may not be adequately considered.
17//! - Loading the entire EPUB document at once may result in significant memory consumption,
18//! especially for large publications.
19//!
20//! ## Future Work
//! - Add support for asynchronous I/O to improve the user experience in asynchronous
//!   environments, and consider adding support for multi-threaded access.
//! - Support more EPUB specification features, such as media overlays and scripting.
24
25use std::{
26 collections::HashMap,
27 fs::{File, canonicalize},
28 io::{BufReader, Read, Seek},
29 path::{Path, PathBuf},
30};
31
32use log::warn;
33use zip::{ZipArchive, result::ZipError};
34
35use crate::{
36 error::EpubError,
37 types::{
38 EncryptionData, EpubVersion, ManifestItem, MetadataItem, MetadataLinkItem,
39 MetadataRefinement, NavPoint, SpineItem,
40 },
41 utils::{
42 DecodeBytes, NormalizeWhitespace, XmlElement, XmlReader, adobe_font_dencryption,
43 check_realtive_link_leakage, compression_method_check, get_file_in_zip_archive,
44 idpf_font_dencryption,
45 },
46};
47
48/// EPUB document parser, representing a loaded and parsed EPUB publication
49///
50/// The `EpubDoc` structure is the core of the entire EPUB parsing library.
51/// It encapsulates all the parsing logic and data access interfaces for EPUB files.
52/// It is responsible for parsing various components of an EPUB, including metadata,
53/// manifests, reading order, table of contents navigation, and encrypted information,
54/// and provides methods for accessing this data.
55///
56/// Provides a unified data access interface for EPUB files, hiding the underlying
57/// file structure and parsing details. Strictly adheres to the EPUB specification
58/// in implementing the parsing logic to ensure compatibility with the standard.
59///
60/// # Usage
61///
62/// ```rust
63/// use lib_epub::epub::EpubDoc;
64///
65/// let doc = EpubDoc::new("./test_case/epub-33.epub");
66/// assert!(doc.is_ok());
67/// ```
68pub struct EpubDoc<R: Read + Seek> {
69 /// The structure of the epub file that actually holds it
70 pub(crate) archive: ZipArchive<R>,
71
72 /// The path to the target epub file
73 pub(crate) epub_path: PathBuf,
74
75 /// The path to the OPF file
76 pub package_path: PathBuf,
77
78 /// The path to the directory where the opf file is located
79 pub base_path: PathBuf,
80
81 /// The epub version
82 pub version: EpubVersion,
83
84 /// The unique identifier of the epub file
85 ///
86 /// This identifier is the actual value of the unique-identifier attribute of the package.
87 pub unique_identifier: String,
88
89 /// Epub metadata extracted from OPF
90 pub metadata: Vec<MetadataItem>,
91
92 /// Data in metadata that points to external files
93 pub metadata_link: Vec<MetadataLinkItem>,
94
95 /// A list of resources contained inside an epub extracted from OPF
96 ///
97 /// All resources in the epub file are declared here,
98 /// and undeclared resources should not be stored in the epub file and cannot be obtained from it.
99 pub manifest: HashMap<String, ManifestItem>,
100
101 /// Physical reading order of publications extracted from OPF
102 ///
103 /// This attribute declares the order in which multiple files
104 /// containing published content should be displayed.
105 pub spine: Vec<SpineItem>,
106
107 /// The encryption.xml extracted from the META-INF directory
108 pub encryption: Option<Vec<EncryptionData>>,
109
110 /// The navigation data of the epub file
111 pub catalog: Vec<NavPoint>,
112
113 /// The title of the catalog
114 pub catalog_title: String,
115
116 /// The index of the current reading spine
117 pub current_spine_index: usize,
118}
119
120impl<R: Read + Seek> EpubDoc<R> {
121 /// Creates a new EPUB document instance from a reader
122 ///
123 /// This function is responsible for the core logic of parsing EPUB files,
124 /// including verifying the file format, parsing container information,
125 /// loading the OPF package document, and extracting metadata, manifest,
126 /// reading order, and other core information.
127 ///
128 /// # Parameters
129 /// - `reader`: The data source that implements the `Read` and `Seek` traits,
130 /// usually a file or memory buffer
131 /// - `epub_path`: The path to the EPUB file, used for path resolution and validation
132 ///
133 /// # Return
134 /// - `Ok(EpubDoc<R>)`: The successfully parsed EPUB document object
135 /// - `Err(EpubError)`: Errors encountered during parsing
136 ///
137 /// # Notes
138 /// - This function assumes the EPUB file structure is valid
139 pub fn from_reader(reader: R, epub_path: PathBuf) -> Result<Self, EpubError> {
140 // Parsing process
141 // 1. Verify that the ZIP compression method conforms to the EPUB specification
142 // 2. Parse `META-INF/container.xml` retrieves the location of the OPF file
143 // 3. Parses the OPF file to obtain package documentation information
144 // 4. Extracts version information
145 // 5. Parses metadata, manifest, and spine
146 // 6. Parses encrypted information and directory navigation
147 // 7. Verifies and extracts the unique identifier
148
149 let mut archive = ZipArchive::new(reader).map_err(EpubError::from)?;
150 let epub_path = canonicalize(epub_path)?;
151
152 compression_method_check(&mut archive)?;
153
154 let container =
155 get_file_in_zip_archive(&mut archive, "META-INF/container.xml")?.decode()?;
156 let package_path = Self::parse_container(container)?;
157 let base_path = package_path
158 .parent()
159 .expect("所有文件的父目录不能为空")
160 .to_path_buf();
161
162 let opf_file =
163 get_file_in_zip_archive(&mut archive, package_path.to_str().unwrap())?.decode()?;
164 let package = XmlReader::parse(&opf_file)?;
165 // let document = kiss_xml::parse_str(opf_file).unwrap();
166
167 // let package = document.root_element();
168 let version = Self::determine_epub_version(&package)?;
169
170 let mut doc = Self {
171 archive,
172 epub_path,
173 package_path,
174 base_path,
175 version,
176 unique_identifier: String::new(),
177 metadata: vec![],
178 metadata_link: vec![],
179 manifest: HashMap::new(),
180 spine: vec![],
181 encryption: None,
182 catalog: vec![],
183 catalog_title: String::new(),
184 current_spine_index: 0,
185 };
186
187 let metadata_element = package.find_elements_by_name("metadata").next().unwrap();
188 let manifest_element = package.find_elements_by_name("manifest").next().unwrap();
189 let spine_element = package.find_elements_by_name("spine").next().unwrap();
190
191 doc.parse_metadata(metadata_element)?;
192 doc.parse_manifest(manifest_element)?;
193 doc.parse_spine(spine_element)?;
194 doc.parse_encryption()?;
195 doc.parse_catalog()?;
196
197 // 断言必有唯一标识符
198 doc.unique_identifier = if let Some(uid) = package.get_attr("unique-identifier") {
199 doc.metadata.iter().find(|item| {
200 item.property == "identifier" && item.id.as_ref().is_some_and(|id| id == &uid)
201 })
202 } else {
203 doc.metadata
204 .iter()
205 .find(|item| item.property == "identifier")
206 }
207 .map(|item| item.value.clone())
208 .ok_or_else(|| EpubError::NonCanonicalFile {
209 tag: "dc:identifier".to_string(),
210 })?;
211
212 Ok(doc)
213 }
214
215 /// Parse the EPUB container file (META-INF/container.xml)
216 ///
217 /// This function parses the container information in the EPUB file 、
218 /// to extract the path to the OPF package file. According to the EPUB
219 /// specification, the `container.xml` file must exist in the `META-INF`
220 /// directory and contain at least one `rootfile` element pointing to
221 /// the main OPF file. When multiple `rootfile` elements exist, the first
222 /// element pointing to the OPF file is used as the default.
223 ///
224 /// # Parameters
225 /// - `data`: The content string of the container.xml
226 ///
227 /// # Return
228 /// - `Ok(PathBuf)`: The path to the successfully parsed OPF file
229 /// - `Err(EpubError)`: Errors encountered during parsing
230 fn parse_container(data: String) -> Result<PathBuf, EpubError> {
231 let root = XmlReader::parse(&data)?;
232 let rootfile = root
233 .find_elements_by_name("rootfile")
234 .next()
235 .ok_or_else(|| EpubError::NonCanonicalFile {
236 tag: "rootfile".to_string(),
237 })?;
238
239 let attr =
240 rootfile
241 .get_attr("full-path")
242 .ok_or_else(|| EpubError::MissingRequiredAttribute {
243 tag: "rootfile".to_string(),
244 attribute: "full-path".to_string(),
245 })?;
246
247 Ok(PathBuf::from(attr))
248 }
249
250 /// Parse the EPUB metadata section
251 ///
252 /// This function is responsible for parsing the `<metadata>` elements
253 /// in the OPF file to extract basic information about the publication.
254 /// It handles metadata elements from different namespaces:
255 /// - Elements in the Dublin Core namespace (`http://purl.org/dc/elements/1.1/`)
256 /// - Elements in the OPF namespace (`http://www.idpf.org/2007/opf`)
257 ///
258 /// # Parameters
259 /// - `metadata_element`: A reference to the `<metadata>` element in the OPF file
260 fn parse_metadata(&mut self, metadata_element: &XmlElement) -> Result<(), EpubError> {
261 const DC_NAMESPACE: &str = "http://purl.org/dc/elements/1.1/";
262 const OPF_NAMESPACE: &str = "http://www.idpf.org/2007/opf";
263
264 let mut metadata = Vec::new();
265 let mut metadata_link = Vec::new();
266 let mut refinements = HashMap::<String, Vec<MetadataRefinement>>::new();
267
268 for element in metadata_element.children() {
269 match &element.namespace {
270 Some(namespace) if namespace == DC_NAMESPACE => {
271 self.parse_dc_metadata(element, &mut metadata)?
272 }
273
274 Some(namespace) if namespace == OPF_NAMESPACE => self.parse_opf_metadata(
275 element,
276 &mut metadata,
277 &mut metadata_link,
278 &mut refinements,
279 )?,
280
281 _ => {}
282 };
283 }
284
285 for item in metadata.iter_mut() {
286 if let Some(id) = &item.id {
287 if let Some(refinements) = refinements.remove(id) {
288 item.refined = refinements;
289 }
290 }
291 }
292
293 self.metadata = metadata;
294 self.metadata_link = metadata_link;
295 Ok(())
296 }
297
298 /// Parse the EPUB manifest section
299 ///
300 /// This function parses the `<manifest>` element in the OPF file, extracting
301 /// information about all resource files in the publication. Each resource contains
302 /// basic information such as id, file path, MIME type, as well as optional
303 /// attributes and fallback resource information.
304 ///
305 /// # Parameters
306 /// - `manifest_element`: A reference to the `<manifest>` element in the OPF file
307 fn parse_manifest(&mut self, manifest_element: &XmlElement) -> Result<(), EpubError> {
308 let estimated_items = manifest_element.children().count();
309 let mut resources = HashMap::with_capacity(estimated_items);
310
311 for element in manifest_element.children() {
312 let id = element
313 .get_attr("id")
314 .ok_or_else(|| EpubError::MissingRequiredAttribute {
315 tag: element.tag_name(),
316 attribute: "id".to_string(),
317 })?
318 .to_string();
319 let path = element
320 .get_attr("href")
321 .ok_or_else(|| EpubError::MissingRequiredAttribute {
322 tag: element.tag_name(),
323 attribute: "href".to_string(),
324 })?
325 .to_string();
326 let mime = element
327 .get_attr("media-type")
328 .ok_or_else(|| EpubError::MissingRequiredAttribute {
329 tag: element.tag_name(),
330 attribute: "media-type".to_string(),
331 })?
332 .to_string();
333 let properties = element.get_attr("properties");
334 let fallback = element.get_attr("fallback");
335
336 resources.insert(
337 id.clone(),
338 ManifestItem {
339 id,
340 path: self.normalize_manifest_path(&path)?,
341 mime,
342 properties,
343 fallback,
344 },
345 );
346 }
347
348 self.manifest = resources;
349 self.validate_fallback_chains();
350 Ok(())
351 }
352
353 /// Parse the EPUB spine section
354 ///
355 /// This function parses the `<spine>` elements in the OPF file to extract
356 /// the reading order information of the publication. The spine defines the
357 /// linear reading order of the publication's content documents, and each
358 /// spine item references resources in the manifest.
359 ///
360 /// # Parameters
361 /// - `spine_element`: A reference to the `<spine>` element in the OPF file
362 fn parse_spine(&mut self, spine_element: &XmlElement) -> Result<(), EpubError> {
363 let mut spine = Vec::new();
364 for element in spine_element.children() {
365 let idref = element
366 .get_attr("idref")
367 .ok_or_else(|| EpubError::MissingRequiredAttribute {
368 tag: element.tag_name(),
369 attribute: "idref".to_string(),
370 })?
371 .to_string();
372 let id = element.get_attr("id");
373 let linear = element
374 .get_attr("linear")
375 .map(|linear| linear == "yes")
376 .unwrap_or(true);
377 let properties = element.get_attr("properties");
378
379 spine.push(SpineItem {
380 idref,
381 id,
382 linear,
383 properties,
384 });
385 }
386
387 self.spine = spine;
388 Ok(())
389 }
390
391 /// Parse the EPUB encryption file (META-INF/encryption.xml)
392 ///
393 /// This function is responsible for parsing the `encryption.xml` file
394 /// in the `META-INF` directory to extract information about encrypted
395 /// resources in the publication. According to the EPUB specification,
396 /// the encryption information describes which resources are encrypted
397 /// and the encryption methods used.
398 ///
399 /// TODO: 需要对使用非对称加密数据的加密项进行额外处理,以获取非对称加密密钥
400 fn parse_encryption(&mut self) -> Result<(), EpubError> {
401 if !self.has_encryption() {
402 return Ok(());
403 }
404
405 let encryption_file =
406 get_file_in_zip_archive(&mut self.archive, "META-INF/encryption.xml")?.decode()?;
407
408 let root = XmlReader::parse(&encryption_file)?;
409
410 let mut encryption_data = Vec::new();
411 for data in root.children() {
412 if data.name != "EncryptedData" {
413 continue;
414 }
415
416 let method = data
417 .find_elements_by_name("EncryptionMethod")
418 .next()
419 .ok_or_else(|| EpubError::NonCanonicalFile {
420 tag: "EncryptionMethod".to_string(),
421 })?;
422 let reference = data
423 .find_elements_by_name("CipherReference")
424 .next()
425 .ok_or_else(|| EpubError::NonCanonicalFile {
426 tag: "CipherReference".to_string(),
427 })?;
428
429 encryption_data.push(EncryptionData {
430 method: method
431 .get_attr("Algorithm")
432 .ok_or_else(|| EpubError::MissingRequiredAttribute {
433 tag: "EncryptionMethod".to_string(),
434 attribute: "Algorithm".to_string(),
435 })?
436 .to_string(),
437 data: reference
438 .get_attr("URI")
439 .ok_or_else(|| EpubError::MissingRequiredAttribute {
440 tag: "CipherReference".to_string(),
441 attribute: "URI".to_string(),
442 })?
443 .to_string(),
444 });
445 }
446
447 if !encryption_data.is_empty() {
448 self.encryption = Some(encryption_data);
449 }
450
451 Ok(())
452 }
453
454 /// Parse the EPUB navigation information
455 ///
456 /// This function is responsible for parsing the navigation information of EPUB
457 /// publications. Different parsing strategies are used depending on the EPUB version:
458 /// - EPUB 2.0: Parses the NCX file to obtain directory information
459 /// - EPUB 3.0: Parses the Navigation Document (NAV) file to obtain directory information
460 fn parse_catalog(&mut self) -> Result<(), EpubError> {
461 const HEAD_TAGS: [&str; 6] = ["h1", "h2", "h3", "h4", "h5", "h6"];
462
463 match self.version {
464 EpubVersion::Version2_0 => {
465 let opf_file = get_file_in_zip_archive(
466 &mut self.archive,
467 self.package_path.to_str().unwrap(),
468 )?
469 .decode()?;
470 let opf_element = XmlReader::parse(&opf_file)?;
471
472 let toc_id = opf_element
473 .find_children_by_name("spine")
474 .next()
475 .ok_or_else(|| EpubError::NonCanonicalFile {
476 tag: "spine".to_string(),
477 })?
478 .get_attr("toc")
479 .ok_or_else(|| EpubError::MissingRequiredAttribute {
480 tag: "spine".to_string(),
481 attribute: "toc".to_string(),
482 })?
483 .to_owned();
484 let toc_path = self
485 .manifest
486 .get(&toc_id)
487 .ok_or(EpubError::ResourceIdNotExist { id: toc_id })?
488 .path
489 .to_str()
490 .unwrap();
491
492 let ncx_file = get_file_in_zip_archive(&mut self.archive, toc_path)?.decode()?;
493 let ncx = XmlReader::parse(&ncx_file)?;
494
495 match ncx.find_elements_by_name("docTitle").next() {
496 Some(element) => self.catalog_title = element.text(),
497 None => warn!(
498 "Expecting to get docTitle information from the ncx file, but it's missing."
499 ),
500 };
501
502 let nav_map = ncx.find_elements_by_name("navMap").next().ok_or_else(|| {
503 EpubError::NonCanonicalFile {
504 tag: "navMap".to_string(),
505 }
506 })?;
507
508 self.catalog = self.parse_nav_points(nav_map)?;
509
510 Ok(())
511 }
512
513 EpubVersion::Version3_0 => {
514 let nav_path = self
515 .manifest
516 .values()
517 .find(|item| {
518 if let Some(property) = &item.properties {
519 return property.contains("nav");
520 }
521 false
522 })
523 .map(|item| item.path.clone())
524 .ok_or_else(|| EpubError::NonCanonicalEpub {
525 expected_file: "Navigation Document".to_string(),
526 })?;
527
528 let nav_file =
529 get_file_in_zip_archive(&mut self.archive, nav_path.to_str().unwrap())?
530 .decode()?;
531
532 let nav_element = XmlReader::parse(&nav_file)?;
533 let nav = nav_element
534 .find_elements_by_name("nav")
535 .find(|&element| element.get_attr("epub:type") == Some(String::from("toc")))
536 .ok_or_else(|| EpubError::NonCanonicalFile {
537 tag: "nav".to_string(),
538 })?;
539 let nav_title = nav.find_children_by_names(&HEAD_TAGS).next();
540 let nav_list = nav.find_children_by_name("ol").next().ok_or_else(|| {
541 EpubError::NonCanonicalFile {
542 tag: "ol".to_string(),
543 }
544 })?;
545
546 self.catalog = self.parse_catalog_list(nav_list)?;
547 if let Some(nav_title) = nav_title {
548 self.catalog_title = nav_title.text();
549 };
550 Ok(())
551 }
552 }
553 }
554
555 /// Check if the EPUB file contains `encryption.xml`
556 ///
557 /// This function determines whether a publication contains encrypted resources
558 /// by checking if a `META-INF/encryption.xml` file exists in the EPUB package.
559 /// According to the EPUB specification, when resources in a publication are
560 /// encrypted, the corresponding encryption information must be declared in
561 /// the `META-INF/encryption.xml` file.
562 ///
563 /// # Return
564 /// - `true` if the publication contains encrypted resources
565 /// - `false` if the publication does not contain encrypted resources
566 ///
567 /// # Notes
568 /// - This function only checks the existence of the encrypted file;
569 /// it does not verify the validity of the encrypted information.
570 pub fn has_encryption(&mut self) -> bool {
571 self.archive
572 .by_path(Path::new("META-INF/encryption.xml"))
573 .is_ok()
574 }
575
576 /// Retrieves a list of metadata items
577 ///
578 /// This function retrieves all matching metadata items from the EPUB metadata
579 /// based on the specified attribute name (key). Metadata items may come from
580 /// the DC (Dublin Core) namespace or the OPF namespace and contain basic
581 /// information about the publication, such as title, author, identifier, etc.
582 ///
583 /// # Parameters
584 /// - `key`: The name of the metadata attribute to retrieve
585 ///
586 /// # Return
587 /// - `Some(Vec<MetadataItem>)`: A vector containing all matching metadata items
588 /// - `None`: If no matching metadata items are found
589 pub fn get_metadata(&self, key: &str) -> Option<Vec<MetadataItem>> {
590 let metadatas = self
591 .metadata
592 .iter()
593 .filter(|item| item.property == key)
594 .cloned()
595 .collect::<Vec<MetadataItem>>();
596
597 (!metadatas.is_empty()).then_some(metadatas)
598 }
599
600 /// Retrieves a list of values for specific metadata items
601 ///
602 /// This function retrieves the values of all matching metadata items from
603 /// the EPUB metadata based on the given property name (key).
604 ///
605 /// # Parameters
606 /// - `key`: The name of the metadata attribute to retrieve
607 ///
608 /// # Return
609 /// - `Some(Vec<String>)`: A vector containing all matching metadata item values
610 /// - `None`: If no matching metadata items are found
611 pub fn get_metadata_value(&self, key: &str) -> Option<Vec<String>> {
612 let values = self
613 .metadata
614 .iter()
615 .filter(|item| item.property == key)
616 .map(|item| item.value.clone())
617 .collect::<Vec<String>>();
618
619 (!values.is_empty()).then_some(values)
620 }
621
622 /// Retrieves the title of the publication
623 ///
624 /// This function retrieves all title information from the EPUB metadata.
625 /// According to the EPUB specification, a publication can have multiple titles,
626 /// which are returned in the order they appear in the metadata.
627 ///
628 /// # Return
629 /// - `Result<Vec<String>, EpubError>`: A vector containing all title information
630 /// - `EpubError`: If and only if the OPF file does not contain `<dc:title>`
631 ///
632 /// # Notes
633 /// - The EPUB specification requires each publication to have at least one title.
634 pub fn get_title(&self) -> Result<Vec<String>, EpubError> {
635 self.get_metadata_value("title")
636 .ok_or_else(|| EpubError::NonCanonicalFile {
637 tag: "title".to_string(),
638 })
639 }
640
641 /// Retrieves the language used in the publication
642 ///
643 /// This function retrieves the language information of a publication from the EPUB
644 /// metadata. According to the EPUB specification, language information identifies
645 /// the primary language of the publication and can have multiple language identifiers.
646 ///
647 /// # Return
648 /// - `Ok(Vec<String>)`: A vector containing all language identifiers
649 /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:language>`
650 ///
651 /// # Notes
652 /// - The EPUB specification requires that each publication specify at least one primary language.
653 /// - Language identifiers should conform to RFC 3066 or later standards.
654 pub fn get_language(&self) -> Result<Vec<String>, EpubError> {
655 self.get_metadata_value("language")
656 .ok_or_else(|| EpubError::NonCanonicalFile {
657 tag: "language".to_string(),
658 })
659 }
660
661 /// Retrieves the identifier of a publication
662 ///
663 /// This function retrieves the identifier information of a publication from
664 /// the EPUB metadata. According to the EPUB specification, each publication
665 /// must have a identifier, typically an ISBN, UUID, or other unique identifier.
666 ///
667 /// # Return
668 /// - `Ok(Vec<String>)`: A vector containing all identifier information
669 /// - `Err(EpubError)`: If and only if the OPF file does not contain `<dc:identifier>`
670 ///
671 /// # Notes
672 /// - The EPUB specification requires each publication to have at least one identifier.
673 /// - In the OPF file, the `unique-identifier` attribute of the `<package>` element
674 /// should point to a `<dc:identifier>` element used to uniquely identify the publication.
675 /// This means that `unique-identifier` is not exactly equal to `<dc:identifier>`.
676 pub fn get_identifier(&self) -> Result<Vec<String>, EpubError> {
677 self.get_metadata_value("identifier")
678 .ok_or_else(|| EpubError::NonCanonicalFile {
679 tag: "identifier".to_string(),
680 })
681 }
682
683 /// Retrieve resource data by resource ID
684 ///
685 /// This function will find the resource with the specified ID in the manifest.
686 /// If the resource is encrypted, it will be automatically decrypted.
687 ///
688 /// # Parameters
689 /// - `id`: The ID of the resource to retrieve
690 ///
691 /// # Return
692 /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
693 /// the MIME type
694 /// - `Err(EpubError)`: Errors that occurred during the retrieval process
695 ///
696 /// # Notes
697 /// - This function will automatically decrypt the resource if it is encrypted.
698 /// - For unsupported encryption methods, the corresponding error will be returned.
699 pub fn get_manifest_item(&mut self, id: &str) -> Result<(Vec<u8>, String), EpubError> {
700 let resource_item = self
701 .manifest
702 .get(id)
703 .cloned()
704 .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
705
706 let path = resource_item.path.to_str().unwrap();
707
708 let mut data = match self.archive.by_name(path) {
709 Ok(mut file) => {
710 let mut entry = Vec::<u8>::new();
711 file.read_to_end(&mut entry)?;
712
713 Ok(entry)
714 }
715 Err(ZipError::FileNotFound) => Err(EpubError::ResourceNotFound {
716 resource: path.to_string(),
717 }),
718 Err(err) => Err(EpubError::from(err)),
719 }?;
720
721 if let Some(method) = self.is_encryption_file(path) {
722 data = self.auto_dencrypt(&method, &mut data)?;
723 }
724
725 Ok((data, resource_item.mime))
726 }
727
728 /// Retrieves resource item data by resource path
729 ///
730 /// This function retrieves resources from the manifest based on the input path.
731 /// The input path must be a relative path to the root directory of the EPUB container;
732 /// using an absolute path or a relative path to another location will result in an error.
733 ///
734 /// # Parameters
735 /// - `path`: The path of the resource to retrieve
736 ///
737 /// # Return
738 /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
739 /// the MIME type
740 /// - `Err(EpubError)`: Errors that occurred during the retrieval process
741 ///
742 /// # Notes
743 /// - This function will automatically decrypt the resource if it is encrypted.
744 /// - For unsupported encryption methods, the corresponding error will be returned.
745 /// - Relative paths other than the root directory of the Epub container are not supported.
746 pub fn get_manifest_item_by_path(
747 &mut self,
748 path: &str,
749 ) -> Result<(Vec<u8>, String), EpubError> {
750 let id = self
751 .manifest
752 .iter()
753 .find(|(_, item)| item.path.to_str().unwrap() == path)
754 .map(|(id, _)| id.to_string())
755 .ok_or_else(|| EpubError::ResourceNotFound {
756 resource: path.to_string(),
757 })?;
758
759 self.get_manifest_item(&id)
760 }
761
762 /// Retrieves supported resource items by resource ID, with fallback mechanism supported
763 ///
764 /// This function attempts to retrieve the resource item with the specified ID and
765 /// checks if its MIME type is in the list of supported formats. If the current resource
766 /// format is not supported, it searches for a supported resource format along the
767 /// fallback chain according to the fallback mechanism defined in the EPUB specification.
768 ///
769 /// # Parameters
770 /// - `id`: The ID of the resource to retrieve
771 /// - `supported_format`: A vector of supported MIME types
772 ///
773 /// # Return
774 /// - `Ok((Vec<u8>, String))`: Successfully retrieved and decrypted resource data and
775 /// the MIME type
776 /// - `Err(EpubError)`: Errors that occurred during the retrieval process
777 pub fn get_manifest_item_with_fallback(
778 &mut self,
779 id: &str,
780 supported_format: Vec<&str>,
781 ) -> Result<(Vec<u8>, String), EpubError> {
782 let mut manifest_item = self
783 .manifest
784 .get(id)
785 .cloned()
786 .ok_or_else(|| EpubError::ResourceIdNotExist { id: id.to_string() })?;
787
788 let mut current_manifest_id = id.to_string();
789 let mut fallback_chain = Vec::<String>::new();
790 'fallback: loop {
791 if supported_format.contains(&manifest_item.mime.as_str()) {
792 return self.get_manifest_item(¤t_manifest_id);
793 }
794
795 let fallback_id = manifest_item.fallback.clone();
796
797 match fallback_id {
798 // The loop ends when no fallback resource exists
799 None => break 'fallback,
800
801 // End the loop when the loop continues to fallback if a fallback resource exists
802 Some(id) if fallback_chain.contains(&id) => break 'fallback,
803
804 Some(id) => {
805 fallback_chain.push(id.clone());
806
807 // Since only warnings are issued for fallback resource checks
808 // during initialization, the issue of fallback resources possibly
809 // not existing needs to be handled here.
810 manifest_item = self
811 .manifest
812 .get(&manifest_item.fallback.unwrap())
813 .cloned()
814 .ok_or(EpubError::ResourceIdNotExist { id: id.clone() })?;
815 current_manifest_id = id;
816 }
817 };
818 }
819
820 Err(EpubError::NoSupportedFileFormat)
821 }
822
823 /// Navigate to a specified chapter using the spine index
824 ///
825 /// This function retrieves the content data of the corresponding chapter based
826 /// on the index position in the EPUB spine. The spine defines the linear reading
827 /// order of the publication's content documents, and each spine item references
828 /// resources in the manifest.
829 ///
830 /// # Parameters
831 /// - `index`: The index position in the spine, starting from 0
832 ///
833 /// # Return
834 /// - `Some((Vec<u8>, String))`: Successfully retrieved chapter content data and the MIME type
835 /// - `None`: Index out of range or data retrieval error
836 ///
837 /// # Notes
838 /// - The index must be less than the total number of spine projects.
839 /// - If the resource is encrypted, it will be automatically decrypted before returning.(TODO)
840 /// - It does not check whether the Spine project follows a linear reading order.
841 pub fn navigate_by_spine_index(&mut self, index: usize) -> Option<(Vec<u8>, String)> {
842 if index >= self.spine.len() {
843 return None;
844 }
845
846 let manifest_id = self.spine[index].idref.clone();
847 self.current_spine_index = index;
848 self.get_manifest_item(&manifest_id).ok()
849 }
850
851 /// Navigate to the previous linear reading chapter
852 ///
853 /// This function searches backwards in the EPUB spine for the previous linear
854 /// reading chapter and returns the content data of that chapter. It only navigates
855 /// to chapters marked as linear reading.
856 ///
857 /// # Return
858 /// - `Some((Vec<u8>, String))`: Successfully retrieved previous chapter content data and
859 /// the MIME type
860 /// - `None`: Already in the first chapter, the current chapter is not linear,
861 /// or data retrieval failed
862 pub fn spine_prev(&mut self) -> Option<(Vec<u8>, String)> {
863 if self.current_spine_index == 0 || !self.spine[self.current_spine_index].linear {
864 return None;
865 }
866
867 let prev_index = (0..self.current_spine_index)
868 .rev()
869 .find(|&index| self.spine[index].linear)?;
870
871 self.current_spine_index = prev_index;
872 let manifest_id = self.spine[prev_index].idref.clone();
873 self.get_manifest_item(&manifest_id).ok()
874 }
875
876 /// Navigate to the next linear reading chapter
877 ///
878 /// This function searches forwards in the EPUB spine for the next linear reading
879 /// chapter and returns the content data of that chapter. It only navigates to
880 /// chapters marked as linear reading.
881 ///
882 /// # Return
883 /// - `Some((Vec<u8>, String))`: Successfully retrieved next chapter content data and
884 /// the MIME type
885 /// - `None`: Already in the last chapter, the current chapter is not linear,
886 /// or data retrieval failed
887 pub fn spine_next(&mut self) -> Option<(Vec<u8>, String)> {
888 if self.current_spine_index >= self.spine.len() - 1
889 || !self.spine[self.current_spine_index].linear
890 {
891 return None;
892 }
893
894 let next_index = (self.current_spine_index + 1..self.spine.len())
895 .find(|&index| self.spine[index].linear)?;
896
897 self.current_spine_index = next_index;
898 let manifest_id = self.spine[next_index].idref.clone();
899 self.get_manifest_item(&manifest_id).ok()
900 }
901
902 /// Retrieves the content data of the current chapter
903 ///
904 /// This function returns the content data of the chapter at the current
905 /// index position in the EPUB spine.
906 ///
907 /// # Return
908 /// - `Some((Vec<u8>, String))`: Successfully retrieved current chapter content data and
909 /// the MIME type
910 /// - `None`: Data retrieval failed
911 pub fn spine_current(&mut self) -> Option<(Vec<u8>, String)> {
912 let manifest_id = self.spine[self.current_spine_index].idref.clone();
913 self.get_manifest_item(&manifest_id).ok()
914 }
915
916 /// Determine the EPUB version from the OPF file
917 ///
918 /// This function is used to detect the version of an epub file from an OPF file.
919 /// When the version attribute in the package is abnormal, version information will
920 /// be identified through some version characteristics of the epub file. An error is
921 /// returned when neither direct nor indirect methods can identify the version.
922 ///
923 /// # Parameters
924 /// - `opf_element`: A reference to the OPF file element
925 fn determine_epub_version(opf_element: &XmlElement) -> Result<EpubVersion, EpubError> {
926 // Check the explicit version attribute
927 if let Some(version) = opf_element.get_attr("version") {
928 match version.as_str() {
929 "2.0" => return Ok(EpubVersion::Version2_0),
930 "3.0" => return Ok(EpubVersion::Version3_0),
931 _ => {}
932 }
933 }
934
935 let spine_element = opf_element
936 .find_elements_by_name("spine")
937 .next()
938 .ok_or_else(|| EpubError::NonCanonicalFile {
939 tag: "spine".to_string(),
940 })?;
941
942 // Look for EPUB 2.x specific features
943 if spine_element.get_attr("toc").is_some() {
944 return Ok(EpubVersion::Version2_0);
945 }
946
947 let manifest_element = opf_element
948 .find_elements_by_name("manifest")
949 .next()
950 .ok_or_else(|| EpubError::NonCanonicalFile {
951 tag: "manifest".to_string(),
952 })?;
953
954 // Look for EPUB 3.x specific features
955 manifest_element
956 .children()
957 .find_map(|element| {
958 if let Some(id) = element.get_attr("id") {
959 if id.eq("nav") {
960 return Some(EpubVersion::Version3_0);
961 }
962 }
963
964 None
965 })
966 .ok_or(EpubError::UnrecognizedEpubVersion)
967 }
968
969 /// Parse metadata elements under the Dublin Core namespace
970 ///
971 /// This function handles the `<metadata>` Dublin Core element in the OPF file (namespace
972 /// is "http://purl.org/dc/elements/1.1/"). These elements usually contain the basic
973 /// information of the publication, such as title, author, publication date, etc.
974 ///
975 /// # Notes
976 /// - In EPUB 3.0, granular information is handled by separate '<meta>' elements and 'refines' attributes
977 /// - All text content is normalized by whitespace
978 #[inline]
979 fn parse_dc_metadata(
980 &mut self,
981 element: &XmlElement,
982 metadata: &mut Vec<MetadataItem>,
983 // refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
984 ) -> Result<(), EpubError> {
985 let id = element.get_attr("id");
986 let lang = element.get_attr("lang");
987 let property = element.name.clone();
988 let value = element.text().normalize_whitespace();
989
990 let refined = match self.version {
991 // In EPUB 2.0, supplementary metadata (refinements) are represented
992 // through other attribute data pairs of the tag.
993 EpubVersion::Version2_0 => element
994 .attributes
995 .iter()
996 .map(|(name, value)| {
997 let property = name.to_string();
998 let value = value.to_string().normalize_whitespace();
999
1000 MetadataRefinement {
1001 refines: id.clone().unwrap(),
1002 property,
1003 value,
1004 lang: None,
1005 scheme: None,
1006 }
1007 })
1008 .collect(),
1009 EpubVersion::Version3_0 => vec![],
1010 };
1011
1012 metadata.push(MetadataItem {
1013 id,
1014 property,
1015 value,
1016 lang,
1017 refined,
1018 });
1019
1020 Ok(())
1021 }
1022
1023 /// Parse metadata elements under the OPF namespace
1024 ///
1025 /// This function handles the `<metadata>` OPF element in the OPF file (namespace
1026 /// is "http://www.idpf.org/2007/opf"). These elements include '<meta>' and '<link>',
1027 /// which are used to provide extended metadata and links to external resources for EPUB publications.
1028 ///
1029 /// # Notes
1030 /// - The function is only responsible for distribution processing, and the
1031 /// specific parsing logic is implemented in the dedicated function
1032 /// - All parsing results are added directly to the incoming collection and no new collection is returned
1033 #[inline]
1034 fn parse_opf_metadata(
1035 &mut self,
1036 element: &XmlElement,
1037 metadata: &mut Vec<MetadataItem>,
1038 metadata_link: &mut Vec<MetadataLinkItem>,
1039 refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1040 ) -> Result<(), EpubError> {
1041 match element.name.as_str() {
1042 "meta" => self.parse_meta_element(element, metadata, refinements),
1043 "link" => self.parse_link_element(element, metadata_link),
1044 _ => Ok(()),
1045 }
1046 }
1047
1048 #[inline]
1049 fn parse_meta_element(
1050 &mut self,
1051 element: &XmlElement,
1052 metadata: &mut Vec<MetadataItem>,
1053 refinements: &mut HashMap<String, Vec<MetadataRefinement>>,
1054 ) -> Result<(), EpubError> {
1055 match self.version {
1056 EpubVersion::Version2_0 => {
1057 let property =
1058 element
1059 .get_attr("name")
1060 .ok_or_else(|| EpubError::NonCanonicalFile {
1061 tag: element.tag_name(),
1062 })?;
1063 let value = element
1064 .get_attr("content")
1065 .ok_or_else(|| EpubError::MissingRequiredAttribute {
1066 tag: element.tag_name(),
1067 attribute: "content".to_string(),
1068 })?
1069 .normalize_whitespace();
1070
1071 metadata.push(MetadataItem {
1072 id: None,
1073 property,
1074 value,
1075 lang: None,
1076 refined: vec![],
1077 });
1078 }
1079
1080 EpubVersion::Version3_0 => {
1081 let property = element.get_attr("property").ok_or_else(|| {
1082 EpubError::MissingRequiredAttribute {
1083 tag: element.tag_name(),
1084 attribute: "property".to_string(),
1085 }
1086 })?;
1087 let value = element.text().normalize_whitespace();
1088 let lang = element.get_attr("lang");
1089
1090 if let Some(refines) = element.get_attr("refines") {
1091 let id = refines.strip_prefix("#").unwrap_or(&refines).to_string();
1092 let scheme = element.get_attr("scheme");
1093 let refinement = MetadataRefinement {
1094 refines: id.clone(),
1095 property,
1096 value,
1097 lang,
1098 scheme,
1099 };
1100
1101 if let Some(refinements) = refinements.get_mut(&id) {
1102 refinements.push(refinement);
1103 } else {
1104 refinements.insert(id, vec![refinement]);
1105 }
1106 } else {
1107 let id = element.get_attr("id");
1108 let item = MetadataItem {
1109 id,
1110 property,
1111 value,
1112 lang,
1113 refined: vec![],
1114 };
1115
1116 metadata.push(item);
1117 };
1118 }
1119 }
1120 Ok(())
1121 }
1122
1123 #[inline]
1124 fn parse_link_element(
1125 &mut self,
1126 element: &XmlElement,
1127 metadata_link: &mut Vec<MetadataLinkItem>,
1128 ) -> Result<(), EpubError> {
1129 let href = element
1130 .get_attr("href")
1131 .ok_or_else(|| EpubError::MissingRequiredAttribute {
1132 tag: element.tag_name(),
1133 attribute: "href".to_string(),
1134 })?;
1135 let rel = element
1136 .get_attr("rel")
1137 .ok_or_else(|| EpubError::MissingRequiredAttribute {
1138 tag: element.tag_name(),
1139 attribute: "rel".to_string(),
1140 })?;
1141 let hreflang = element.get_attr("hreflang");
1142 let id = element.get_attr("id");
1143 let mime = element.get_attr("media-type");
1144 let properties = element.get_attr("properties");
1145
1146 metadata_link.push(MetadataLinkItem {
1147 href,
1148 rel,
1149 hreflang,
1150 id,
1151 mime,
1152 properties,
1153 refines: None,
1154 });
1155 Ok(())
1156 }
1157
1158 /// Recursively parse NCX navigation points from navMap or nested navPoint elements
1159 ///
1160 /// This function parses the hierarchical navigation structure defined in NCX files
1161 /// for EPUB 2.x documents. It handles nested navPoint elements to build a complete
1162 /// tree representation of the publication's table of contents.
1163 fn parse_nav_points(&self, parent_element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1164 let mut nav_points = Vec::new();
1165 for nav_point in parent_element.find_children_by_name("navPoint") {
1166 let label = match nav_point.find_children_by_name("navLabel").next() {
1167 Some(element) => element.text(),
1168 None => String::new(),
1169 };
1170
1171 let content = nav_point
1172 .find_children_by_name("content")
1173 .next()
1174 .map(|element| PathBuf::from(element.text()));
1175
1176 let play_order = nav_point
1177 .get_attr("playOrder")
1178 .and_then(|order| order.parse::<usize>().ok());
1179
1180 let children = self.parse_nav_points(nav_point)?;
1181
1182 nav_points.push(NavPoint {
1183 label,
1184 content,
1185 play_order,
1186 children,
1187 });
1188 }
1189
1190 nav_points.sort();
1191 Ok(nav_points)
1192 }
1193
1194 /// Recursively parses directory list structures
1195 ///
1196 /// This function recursively parses HTML navigation list structures,
1197 /// converting `<ol>` and `<li>` elements into NavPoint structures.
1198 /// Multi-level nested directory structures are supported.
1199 fn parse_catalog_list(&self, element: &XmlElement) -> Result<Vec<NavPoint>, EpubError> {
1200 let mut catalog = Vec::new();
1201 for item in element.children() {
1202 if item.tag_name() != "li" {
1203 return Err(EpubError::NonCanonicalFile {
1204 tag: "li".to_string(),
1205 });
1206 }
1207
1208 let title_element = item
1209 .find_children_by_names(&["span", "a"])
1210 .next()
1211 .ok_or_else(|| EpubError::NonCanonicalFile {
1212 tag: "span/a".to_string(),
1213 })?;
1214 let content_href = title_element.get_attr("href").map(PathBuf::from);
1215 let sub_list = if let Some(list) = item.find_children_by_name("ol").next() {
1216 self.parse_catalog_list(list)?
1217 } else {
1218 vec![]
1219 };
1220
1221 catalog.push(NavPoint {
1222 label: title_element.text(),
1223 content: content_href,
1224 children: sub_list,
1225 play_order: None,
1226 });
1227 }
1228
1229 Ok(catalog)
1230 }
1231
1232 /// Converts relative paths in the manifest to normalized paths
1233 /// relative to the EPUB root directory
1234 ///
1235 /// This function processes the href attribute of resources in the EPUB
1236 /// manifest and converts it to a normalized path representation.
1237 /// It handles three types of paths:
1238 /// - Relative paths starting with `../` (checks if they exceed the EPUB package scope)
1239 /// - Absolute paths starting with `/` (relative to the EPUB root directory)
1240 /// - Other relative paths (relative to the directory containing the OPF file)
1241 ///
1242 /// # Parameters
1243 /// - `path`: The href attribute value of the resource in the manifest
1244 ///
1245 /// # Return
1246 /// - `Ok(PathBuf)`: The parsed normalized path
1247 /// - `Err(EpubError)`: Relative link leakage
1248 #[inline]
1249 fn normalize_manifest_path(&self, path: &str) -> Result<PathBuf, EpubError> {
1250 let mut path = if path.starts_with("../") {
1251 let mut current_dir = self.epub_path.join(&self.package_path);
1252 current_dir.pop();
1253
1254 check_realtive_link_leakage(self.epub_path.clone(), current_dir, path)
1255 .map(PathBuf::from)
1256 .ok_or_else(|| EpubError::RealtiveLinkLeakage {
1257 path: path.to_string(),
1258 })?
1259 } else if let Some(path) = path.strip_prefix("/") {
1260 PathBuf::from(path.to_string())
1261 } else {
1262 self.base_path.join(path)
1263 };
1264
1265 #[cfg(windows)]
1266 {
1267 path = PathBuf::from(path.to_string_lossy().replace('\\', "/"));
1268 }
1269
1270 Ok(path)
1271 }
1272
1273 /// Verify the fallback chain of all manifest items
1274 ///
1275 /// This function iterates through all manifest items with the fallback
1276 /// attribute and verifies the validity of their fallback chains, including checking:
1277 /// - Whether circular references exist
1278 /// - Whether the fallback resource exists in the manifest
1279 ///
1280 /// # Notes
1281 /// If an invalid fallback chain is found, a warning log will be logged
1282 /// but the processing flow will not be interrupted.
1283 fn validate_fallback_chains(&self) {
1284 for (id, item) in &self.manifest {
1285 if item.fallback.is_none() {
1286 continue;
1287 }
1288
1289 let mut fallback_chain = Vec::new();
1290 if let Err(msg) = self.validate_fallback_chain(id, &mut fallback_chain) {
1291 warn!("Invalid fallback chain for item {}: {}", id, msg);
1292 }
1293 }
1294 }
1295
1296 /// Recursively verify the validity of a single fallback chain
1297 ///
1298 /// This function recursively traces the fallback chain to check for the following issues:
1299 /// - Circular reference
1300 /// - The referenced fallback resource does not exist
1301 ///
1302 /// # Parameters
1303 /// - `manifest_id`: The id of the manifest item currently being verified
1304 /// - `fallback_chain`: The visited fallback chain paths used to detect circular references
1305 ///
1306 /// # Return
1307 /// - `Ok(())`: The fallback chain is valid
1308 /// - `Err(String)`: A string containing error information
1309 fn validate_fallback_chain(
1310 &self,
1311 manifest_id: &str,
1312 fallback_chain: &mut Vec<String>,
1313 ) -> Result<(), String> {
1314 if fallback_chain.contains(&manifest_id.to_string()) {
1315 fallback_chain.push(manifest_id.to_string());
1316
1317 return Err(format!(
1318 "Circular reference detected in fallback chain for {}",
1319 fallback_chain.join("->")
1320 ));
1321 }
1322
1323 // Get the current item; its existence can be ensured based on the calling context.
1324 let item = self.manifest.get(manifest_id).unwrap();
1325
1326 if let Some(fallback_id) = &item.fallback {
1327 if !self.manifest.contains_key(fallback_id) {
1328 return Err(format!(
1329 "Fallback resource {} does not exist in manifest",
1330 fallback_id
1331 ));
1332 }
1333
1334 fallback_chain.push(manifest_id.to_string());
1335 self.validate_fallback_chain(fallback_id, fallback_chain)
1336 } else {
1337 // The end of the fallback chain
1338 Ok(())
1339 }
1340 }
1341
1342 /// Checks if a resource at the specified path is an encrypted file
1343 ///
1344 /// This function queries whether a specific resource path is marked as an encrypted
1345 /// file in the EPUB encryption information. It checks the encrypted data stored in
1346 /// `self.encryption`, looking for an entry that matches the given path.
1347 ///
1348 /// # Parameters
1349 /// - `path`: The path of the resource to check
1350 ///
1351 /// # Return
1352 /// - `Some(String)`: The encryption method used for the resource
1353 /// - `None`: The resource is not encrypted
1354 fn is_encryption_file(&self, path: &str) -> Option<String> {
1355 self.encryption.as_ref().and_then(|encryptions| {
1356 encryptions
1357 .iter()
1358 .find(|encryption| encryption.data == path)
1359 .map(|encryption| encryption.method.clone())
1360 })
1361 }
1362
1363 /// Automatically decrypts encrypted resource data
1364 ///
1365 /// Automatically decrypts data based on the provided encryption method.
1366 /// This function supports various encryption methods defined by the EPUB
1367 /// specification, including font obfuscation and the XML encryption standard.
1368 ///
1369 /// # Parameters
1370 /// - `method`: The encryption method used for the resource
1371 /// - `data`: The encrypted resource data
1372 ///
1373 /// # Return
1374 /// - `Ok(Vec<u8>)`: The decrypted resource data
1375 /// - `Err(EpubError)`: Unsupported encryption method
1376 ///
1377 /// # Supported Encryption Methods
1378 /// - IDPF font obfuscation: `http://www.idpf.org/2008/embedding`
1379 /// - Adobe font obfuscation: `http://ns.adobe.com/pdf/enc#RC`
1380 #[inline]
1381 fn auto_dencrypt(&self, method: &str, data: &mut [u8]) -> Result<Vec<u8>, EpubError> {
1382 match method {
1383 "http://www.idpf.org/2008/embedding" => {
1384 Ok(idpf_font_dencryption(data, &self.unique_identifier))
1385 }
1386 "http://ns.adobe.com/pdf/enc#RC" => {
1387 Ok(adobe_font_dencryption(data, &self.unique_identifier))
1388 }
1389 _ => Err(EpubError::UnsupportedEncryptedMethod {
1390 method: method.to_string(),
1391 }),
1392 }
1393 }
1394}
1395
1396impl EpubDoc<BufReader<File>> {
1397 /// Creates a new EPUB document instance
1398 ///
1399 /// This function is a convenience constructor for `EpubDoc`,
1400 /// used to create an EPUB parser instance directly from a file path.
1401 ///
1402 /// # Parameters
1403 /// - `path`: The path to the EPUB file
1404 ///
1405 /// # Return
1406 /// - `Ok(EpubDoc)`: The created EPUB document instance
1407 /// - `Err(EpubError)`: An error occurred during initialization
1408 pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, EpubError> {
1409 let file = File::open(&path).map_err(EpubError::from)?;
1410 let path = canonicalize(path)?;
1411
1412 Self::from_reader(BufReader::new(file), path)
1413 }
1414}
1415
1416#[cfg(test)]
1417mod tests {
1418 use std::path::Path;
1419
1420 use crate::epub::EpubDoc;
1421
1422 /// Section 3.3 package documents
1423 mod package_documents_tests {
1424 use std::path::Path;
1425
1426 use crate::epub::{EpubDoc, EpubVersion};
1427
1428 /// ID: pkg-collections-unknown
1429 ///
1430 /// The package document contains a collection with an unknown role. The reading system must open the EPUB successfully.
1431 #[test]
1432 fn test_pkg_collections_unknown() {
1433 let epub_file = Path::new("./test_case/pkg-collections-unknown.epub");
1434 let doc = EpubDoc::new(epub_file);
1435 assert!(doc.is_ok());
1436 }
1437
1438 /// ID: pkg-creator-order
1439 ///
1440 /// Several creators are listed in the package document. The reading system must not display them out of order (but it may display only the first).
1441 #[test]
1442 fn test_pkg_creator_order() {
1443 let epub_file = Path::new("./test_case/pkg-creator-order.epub");
1444 let doc = EpubDoc::new(epub_file);
1445 assert!(doc.is_ok());
1446
1447 let doc = doc.unwrap();
1448 let creators = doc.get_metadata_value("creator");
1449 assert!(creators.is_some());
1450
1451 let creators = creators.unwrap();
1452 assert_eq!(creators.len(), 5);
1453 assert_eq!(
1454 creators,
1455 vec![
1456 "Dave Cramer",
1457 "Wendy Reid",
1458 "Dan Lazin",
1459 "Ivan Herman",
1460 "Brady Duga",
1461 ]
1462 );
1463 }
1464
1465 /// ID: pkg-manifest-unknown
1466 ///
1467 /// The package document contains a manifest item with unknown properties. The reading system must open the EPUB successfully.
1468 #[test]
1469 fn test_pkg_manifest_order() {
1470 let epub_file = Path::new("./test_case/pkg-manifest-unknown.epub");
1471 let doc = EpubDoc::new(epub_file);
1472 assert!(doc.is_ok());
1473
1474 let mut doc = doc.unwrap();
1475 assert_eq!(doc.manifest.len(), 2);
1476 assert!(doc.get_manifest_item("nav").is_ok());
1477 assert!(doc.get_manifest_item("content_001").is_ok());
1478 assert!(doc.get_manifest_item("content_002").is_err());
1479 }
1480
1481 /// ID: pkg-meta-unknown
1482 ///
1483 /// The package document contains a meta tag with an unknown property. The reading system must open the EPUB successfully.
1484 #[test]
1485 fn test_pkg_meta_unknown() {
1486 let epub_file = Path::new("./test_case/pkg-meta-unknown.epub");
1487 let doc = EpubDoc::new(epub_file);
1488 assert!(doc.is_ok());
1489
1490 let doc = doc.unwrap();
1491 let value = doc.get_metadata_value("dcterms:isReferencedBy");
1492 assert!(value.is_some());
1493 let value = value.unwrap();
1494 assert_eq!(value.len(), 1);
1495 assert_eq!(
1496 value,
1497 vec!["https://www.w3.org/TR/epub-rs/#confreq-rs-pkg-meta-unknown"]
1498 );
1499
1500 let value = doc.get_metadata_value("dcterms:modified");
1501 assert!(value.is_some());
1502 let value = value.unwrap();
1503 assert_eq!(value.len(), 1);
1504 assert_eq!(value, vec!["2021-01-11T00:00:00Z"]);
1505
1506 let value = doc.get_metadata_value("dcterms:title");
1507 assert!(value.is_none());
1508 }
1509
1510 /// ID: pkg-meta-whitespace
1511 ///
1512 /// The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases.
1513 #[test]
1514 fn test_pkg_meta_white_space() {
1515 let epub_file = Path::new("./test_case/pkg-meta-whitespace.epub");
1516 let doc = EpubDoc::new(epub_file);
1517 assert!(doc.is_ok());
1518
1519 let doc = doc.unwrap();
1520 let value = doc.get_metadata_value("creator");
1521 assert!(value.is_some());
1522 let value = value.unwrap();
1523 assert_eq!(value.len(), 1);
1524 assert_eq!(value, vec!["Dave Cramer"]);
1525
1526 let value = doc.get_metadata_value("description");
1527 assert!(value.is_some());
1528 let value = value.unwrap();
1529 assert_eq!(value.len(), 1);
1530 assert_eq!(
1531 value,
1532 vec![
1533 "The package document's title and creator contain leading and trailing spaces along with excess internal whitespace. The reading system must render only a single space in all cases."
1534 ]
1535 );
1536 }
1537
1538 /// ID: pkg-spine-duplicate-item-hyperlink
1539 ///
1540 /// The spine contains several references to the same content document. The reading system must move to the position of the first duplicate in the reading order when following a hyperlink.
1541 #[test]
1542 fn test_pkg_spine_duplicate_item_hyperlink() {
1543 let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-hyperlink.epub");
1544 let doc = EpubDoc::new(epub_file);
1545 assert!(doc.is_ok());
1546
1547 let mut doc = doc.unwrap();
1548 assert_eq!(doc.spine.len(), 4);
1549 assert_eq!(
1550 doc.navigate_by_spine_index(0).unwrap(),
1551 doc.get_manifest_item("content_001").unwrap()
1552 );
1553 assert_eq!(
1554 doc.navigate_by_spine_index(1).unwrap(),
1555 doc.get_manifest_item("content_002").unwrap()
1556 );
1557 assert_eq!(
1558 doc.navigate_by_spine_index(2).unwrap(),
1559 doc.get_manifest_item("content_002").unwrap()
1560 );
1561 assert_eq!(
1562 doc.navigate_by_spine_index(3).unwrap(),
1563 doc.get_manifest_item("content_002").unwrap()
1564 );
1565 }
1566
1567 /// ID: pkg-spine-duplicate-item-rendering
1568 ///
1569 /// The spine contains several references to the same content document. The reading system must not skip the duplicates when rendering the reading order.
1570 #[test]
1571 fn test_pkg_spine_duplicate_item_rendering() {
1572 let epub_file = Path::new("./test_case/pkg-spine-duplicate-item-rendering.epub");
1573 let doc = EpubDoc::new(epub_file);
1574 assert!(doc.is_ok());
1575
1576 let mut doc = doc.unwrap();
1577 assert_eq!(doc.spine.len(), 4);
1578
1579 let result = doc.spine_prev();
1580 assert!(result.is_none());
1581
1582 let result = doc.spine_next();
1583 assert!(result.is_some());
1584
1585 doc.spine_next();
1586 doc.spine_next();
1587 let result = doc.spine_next();
1588 assert!(result.is_none());
1589 }
1590
1591 /// ID: pkg-spine-nonlinear-activation
1592 ///
1593 /// An itemref in the spine is marked as non-linear. Although it (possibly) cannot be accessed through the table of contents, it can be reached from a link in the XHTML content.
1594 #[test]
1595 fn test_pkg_spine_nonlinear_activation() {
1596 let epub_file = Path::new("./test_case/pkg-spine-nonlinear-activation.epub");
1597 let doc = EpubDoc::new(epub_file);
1598 assert!(doc.is_ok());
1599
1600 let mut doc = doc.unwrap();
1601 assert!(doc.spine_prev().is_none());
1602 assert!(doc.spine_next().is_none());
1603
1604 assert!(doc.navigate_by_spine_index(1).is_some());
1605 assert!(doc.spine_prev().is_none());
1606 assert!(doc.spine_next().is_none());
1607 }
1608
1609 /// ID: pkg-spine-order
1610 ///
1611 /// Basic test of whether a reading system can display spine items in the correct order. The test fails if the reading system presents content in the order in which the file names sort, or if it presents files in manifest order rather than spine order.
1612 #[test]
1613 fn test_pkg_spine_order() {
1614 let epub_file = Path::new("./test_case/pkg-spine-order.epub");
1615 let doc = EpubDoc::new(epub_file);
1616 assert!(doc.is_ok());
1617
1618 let doc = doc.unwrap();
1619 assert_eq!(doc.spine.len(), 4);
1620 assert_eq!(
1621 doc.spine
1622 .iter()
1623 .map(|item| item.idref.clone())
1624 .collect::<Vec<String>>(),
1625 vec![
1626 "d-content_001",
1627 "c-content_002",
1628 "b-content_003",
1629 "a-content_004",
1630 ]
1631 );
1632 }
1633
1634 /// ID: pkg-spine-order-svg
1635 ///
1636 /// Basic test of whether a reading system can display SVG spine items in the correct order.
1637 #[test]
1638 fn test_spine_order_svg() {
1639 let epub_file = Path::new("./test_case/pkg-spine-order-svg.epub");
1640 let doc = EpubDoc::new(epub_file);
1641 assert!(doc.is_ok());
1642
1643 let mut doc = doc.unwrap();
1644 assert_eq!(doc.spine.len(), 4);
1645
1646 loop {
1647 if let Some(spine) = doc.spine_next() {
1648 let idref = doc.spine[doc.current_spine_index].idref.clone();
1649 let resource = doc.get_manifest_item(&idref);
1650 assert!(resource.is_ok());
1651
1652 let resource = resource.unwrap();
1653 assert_eq!(spine, resource);
1654 } else {
1655 break;
1656 }
1657 }
1658
1659 assert_eq!(doc.current_spine_index, 3);
1660 }
1661
1662 /// ID: pkg-spine-unknown
1663 ///
1664 /// The package document contains a spine item with unknown properties. The reading system must open the EPUB successfully.
1665 #[test]
1666 fn test_pkg_spine_unknown() {
1667 let epub_file = Path::new("./test_case/pkg-spine-unknown.epub");
1668 let doc = EpubDoc::new(epub_file);
1669 assert!(doc.is_ok());
1670
1671 let doc = doc.unwrap();
1672 assert_eq!(doc.spine.len(), 1);
1673 assert_eq!(doc.spine[0].idref, "content_001");
1674 assert_eq!(doc.spine[0].id, None);
1675 assert_eq!(doc.spine[0].linear, true);
1676 assert_eq!(doc.spine[0].properties, Some("untrustworthy".to_string()));
1677 }
1678
1679 /// ID: pkg-title-order
1680 ///
1681 /// Several titles are listed in the package document. The reading system must use the first title (and whether to use other titles is not defined).
1682 #[test]
1683 fn test_pkg_title_order() {
1684 let epub_file = Path::new("./test_case/pkg-title-order.epub");
1685 let doc = EpubDoc::new(epub_file);
1686 assert!(doc.is_ok());
1687
1688 let doc = doc.unwrap();
1689 let title_list = doc.get_title();
1690 assert!(title_list.is_ok());
1691
1692 let title_list = title_list.unwrap();
1693 assert_eq!(title_list.len(), 6);
1694 assert_eq!(
1695 title_list,
1696 vec![
1697 "pkg-title-order",
1698 "This title must not display first",
1699 "Also, this title must not display first",
1700 "This title also must not display first",
1701 "This title must also not display first",
1702 "This title must not display first, also",
1703 ]
1704 );
1705 }
1706
1707 /// ID: pkg-unique-id
1708 ///
1709 /// The package document's dc:identifier is identical across two publications. The reading system should display both publications independently.
1710 #[test]
1711 fn test_pkg_unique_id() {
1712 let epub_file = Path::new("./test_case/pkg-unique-id.epub");
1713 let doc_1 = EpubDoc::new(epub_file);
1714 assert!(doc_1.is_ok());
1715
1716 let epub_file = Path::new("./test_case/pkg-unique-id_duplicate.epub");
1717 let doc_2 = EpubDoc::new(epub_file);
1718 assert!(doc_2.is_ok());
1719
1720 let doc_1 = doc_1.unwrap();
1721 let doc_2 = doc_2.unwrap();
1722
1723 assert_eq!(
1724 doc_1.get_identifier().unwrap(),
1725 doc_2.get_identifier().unwrap()
1726 );
1727 assert_eq!(doc_1.unique_identifier, "pkg-unique-id");
1728 assert_eq!(doc_2.unique_identifier, "pkg-unique-id");
1729 }
1730
1731 /// ID: pkg-version-backward
1732 ///
1733 /// “Reading Systems MUST attempt to process an EPUB Publication whose Package Document version attribute is less than "3.0"”. This is an EPUB with package version attribute set to "0", to see if a reading system will open it.
1734 #[test]
1735 fn test_pkg_version_backward() {
1736 let epub_file = Path::new("./test_case/pkg-version-backward.epub");
1737 let doc = EpubDoc::new(epub_file);
1738 assert!(doc.is_ok());
1739
1740 let doc = doc.unwrap();
1741 assert_eq!(doc.version, EpubVersion::Version3_0);
1742 }
1743
1744 /// ID: pkg-linked-records
1745 ///
1746 /// Reading System must process and display the title and creator metadata from the package document. An ONIX 3.0 format linked metadata record exists, but contains neither title nor creator metadata.
1747 #[test]
1748 fn test_pkg_linked_records() {
1749 let epub_file = Path::new("./test_case/pkg-linked-records.epub");
1750 let doc = EpubDoc::new(epub_file);
1751 assert!(doc.is_ok());
1752
1753 let doc = doc.unwrap();
1754 assert_eq!(doc.metadata_link.len(), 3);
1755
1756 let item = doc.metadata_link.iter().find(|&item| {
1757 if let Some(properties) = &item.properties {
1758 properties.eq("onix")
1759 } else {
1760 false
1761 }
1762 });
1763 assert!(item.is_some());
1764 }
1765
1766 /// ID: pkg-manifest-unlisted-resource
1767 ///
1768 /// The XHTML content references an image that does not appear in the manifest. The image should not be shown.
1769 #[test]
1770 fn test_pkg_manifest_unlisted_resource() {
1771 let epub_file = Path::new("./test_case/pkg-manifest-unlisted-resource.epub");
1772 let doc = EpubDoc::new(epub_file);
1773 assert!(doc.is_ok());
1774
1775 let mut doc = doc.unwrap();
1776 assert!(
1777 doc.get_manifest_item_by_path("EPUB/content_001.xhtml")
1778 .is_ok()
1779 );
1780
1781 assert!(doc.get_manifest_item_by_path("EPUB/red.png").is_err());
1782 let err = doc.get_manifest_item_by_path("EPUB/red.png").unwrap_err();
1783 assert_eq!(
1784 err.to_string(),
1785 "Resource not found: Unable to find resource from \"EPUB/red.png\"."
1786 );
1787 }
1788 }
1789
    /// Section 3.4 manifest fallbacks
    ///
    /// The tests in this module mostly exercise reading-system behavior rather than
    /// the EPUB format itself.
1793 mod manifest_fallbacks_tests {
1794 use std::path::Path;
1795
1796 use crate::epub::EpubDoc;
1797
1798 /// ID: pub-foreign_bad-fallback
1799 ///
1800 /// This is a test of manifest fallbacks where both the spine item and the fallback are likely to be unsupported. The spine item is a DMG, with a fallback to a PSD file. Reading systems may raise an error on the ingenstion workflow.
1801 #[test]
1802 fn test_pub_foreign_bad_fallback() {
1803 let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
1804 let doc = EpubDoc::new(epub_file);
1805 assert!(doc.is_ok());
1806
1807 let mut doc = doc.unwrap();
1808 assert!(doc.get_manifest_item("content_001").is_ok());
1809 assert!(doc.get_manifest_item("bar").is_ok());
1810
1811 assert_eq!(
1812 doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
1813 .unwrap_err()
1814 .to_string(),
1815 "No supported file format: The fallback resource does not contain the file format you support."
1816 );
1817 }
1818
1819 /// ID: pub-foreign_image
1820 ///
1821 /// An HTML content file contains a PSD image, with a manifest fallback to a PNG image. This tests fallbacks for resources that are not in the spine.
1822 #[test]
1823 fn test_pub_foreign_image() {
1824 let epub_file = Path::new("./test_case/pub-foreign_image.epub");
1825 let doc = EpubDoc::new(epub_file);
1826 assert!(doc.is_ok());
1827
1828 let mut doc = doc.unwrap();
1829 let result = doc.get_manifest_item_with_fallback(
1830 "image-tiff",
1831 vec!["image/png", "application/xhtml+xml"],
1832 );
1833 assert!(result.is_ok());
1834
1835 let (_, mime) = result.unwrap();
1836 assert_eq!(mime, "image/png");
1837 }
1838
/// ID: pub-foreign_json-spine
///
/// The spine of this EPUB contains a JSON content file whose manifest fallback is an HTML document. A reading system without JSON support should display the HTML instead.
#[test]
fn test_pub_foreign_json_spine() {
    let opened = EpubDoc::new(Path::new("./test_case/pub-foreign_json-spine.epub"));
    assert!(opened.is_ok());
    let mut book = opened.unwrap();

    // With JSON accepted, the lookup is expected to report the JSON media type.
    let with_json = book.get_manifest_item_with_fallback(
        "content_primary",
        vec!["application/xhtml+xml", "application/json"],
    );
    assert!(with_json.is_ok());
    let (_, json_mime) = with_json.unwrap();
    assert_eq!(json_mime, "application/json");

    // With only XHTML accepted, the lookup is expected to report the XHTML media type.
    let xhtml_only =
        book.get_manifest_item_with_fallback("content_primary", vec!["application/xhtml+xml"]);
    assert!(xhtml_only.is_ok());
    let (_, xhtml_mime) = xhtml_only.unwrap();
    assert_eq!(xhtml_mime, "application/xhtml+xml");
}
1863
/// ID: pub-foreign_xml-spine
///
/// The spine of this EPUB contains an ordinary XML content file (mimetype application/xml) whose manifest fallback is an HTML document. A reading system without XML support should display the HTML instead.
#[test]
fn test_pub_foreign_xml_spine() {
    let opened = EpubDoc::new(Path::new("./test_case/pub-foreign_xml-spine.epub"));
    assert!(opened.is_ok());
    let mut book = opened.unwrap();

    // Each case pairs the accepted media types with the media type the lookup must report.
    let cases = [
        (
            vec!["application/xhtml+xml", "application/xml"],
            "application/xml",
        ),
        (vec!["application/xhtml+xml"], "application/xhtml+xml"),
    ];

    for (accepted, expected_mime) in cases {
        let lookup = book.get_manifest_item_with_fallback("content_primary", accepted);
        assert!(lookup.is_ok());
        let (_, mime) = lookup.unwrap();
        assert_eq!(mime, expected_mime);
    }
}
1888
/// ID: pub-foreign_xml-suffix-spine
///
/// The spine of this EPUB contains a custom XML content file (mimetype application/dtc+xml) whose manifest fallback is an HTML document. A reading system without XML support should display the HTML instead.
#[test]
fn test_pub_foreign_xml_suffix_spine() {
    let opened = EpubDoc::new(Path::new("./test_case/pub-foreign_xml-suffix-spine.epub"));
    assert!(opened.is_ok());
    let mut book = opened.unwrap();

    // With the custom XML type accepted, the lookup is expected to report that type.
    let accepts_custom_xml = vec!["application/xhtml+xml", "application/dtc+xml"];
    let first = book.get_manifest_item_with_fallback("content_primary", accepts_custom_xml);
    assert!(first.is_ok());
    let (_, first_mime) = first.unwrap();
    assert_eq!(first_mime, "application/dtc+xml");

    // With only XHTML accepted, the lookup is expected to report the XHTML media type.
    let accepts_xhtml_only = vec!["application/xhtml+xml"];
    let second = book.get_manifest_item_with_fallback("content_primary", accepts_xhtml_only);
    assert!(second.is_ok());
    let (_, second_mime) = second.unwrap();
    assert_eq!(second_mime, "application/xhtml+xml");
}
1913 }
1914
1915 /// Section 3.9 open container format
1916 mod open_container_format_tests {
1917 use std::{cmp::min, io::Read, path::Path};
1918
1919 use sha1::{Digest, Sha1};
1920
1921 use crate::epub::EpubDoc;
1922
/// ID: ocf-metainf-inc
///
/// The META-INF folder holds an extra configuration file that is not in the list of reserved files; that file must be ignored.
#[test]
fn test_ocf_metainf_inc() {
    let source = Path::new("./test_case/ocf-metainf-inc.epub");
    // Opening must succeed despite the extra file.
    assert!(EpubDoc::new(source).is_ok());
}
1932
/// ID: ocf-metainf-manifest
///
/// The META-INF directory holds an ancillary manifest file that lists an extra spine item; the reading system must ignore that extra item.
#[test]
fn test_ocf_metainf_manifest() {
    let source = Path::new("./test_case/ocf-metainf-manifest.epub");
    // Opening must succeed despite the ancillary manifest.
    assert!(EpubDoc::new(source).is_ok());
}
1942
/// ID: ocf-package_arbitrary
///
/// Three valid package files, each with its own set of content documents, are present, but container.xml references only one of them, located in an unusual subdirectory. The reading system must use that package.
#[test]
fn test_ocf_package_arbitrary() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-package_arbitrary.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    let expected_package = Path::new("FOO/BAR/package.opf");
    assert_eq!(book.package_path, expected_package);
}
1955
/// ID: ocf-package_multiple
///
/// Three valid package files, each with its own set of content documents, are all referenced by container.xml. The reading system must use the first package.
#[test]
fn test_ocf_package_multiple() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-package_multiple.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    let expected_package = Path::new("FOO/BAR/package.opf");
    let expected_base = Path::new("FOO/BAR");
    assert_eq!(book.package_path, expected_package);
    assert_eq!(book.base_path, expected_base);
}
1969
/// ID: ocf-url_link-leaking-relative
///
/// The content links to a photograph through a relative link with several double-dot path segments. The folder hierarchy holding the photograph starts at the root level, and the relative image reference exceeds the depth of that hierarchy.
#[test]
fn test_ocf_url_link_leaking_relative() {
    let outcome = EpubDoc::new(Path::new("./test_case/ocf-url_link-leaking-relative.epub"));
    assert!(outcome.is_err());

    // The error must name the offending path.
    let message = outcome.err().unwrap().to_string();
    assert_eq!(
        message,
        "Relative link leakage: Path \"../../../../media/imgs/monastery.jpg\" is out of container range."
    );
}
1985
/// ID: ocf-url_link-path-absolute
///
/// The content links to a photograph through a path-absolute link, i.e. one beginning with a leading slash. The folder hierarchy holding the photograph starts at the root level.
#[test]
fn test_ocf_url_link_path_absolute() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-url_link-path-absolute.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    // The manifest entry "photo" must carry the path without a leading slash.
    assert_eq!(
        book.manifest.get("photo").unwrap().path,
        Path::new("media/imgs/monastery.jpg")
    );
}
1999
/// ID: ocf-url_link-relative
///
/// The content links to a photograph through a simple relative link. The folder hierarchy holding the photograph starts at the root level.
#[test]
fn test_ocf_url_link_relative() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-url_link-relative.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    let photo = book.manifest.get("photo").unwrap();
    let expected_path = Path::new("media/imgs/monastery.jpg");
    assert_eq!(photo.path, expected_path);
}
2013
/// ID: ocf-url_manifest
///
/// The manifest points at an XHTML file inside an arbitrary subfolder; the reading system must be able to locate that content.
#[test]
fn test_ocf_url_manifest() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-url_manifest.epub"));
    assert!(opened.is_ok());
    let mut book = opened.unwrap();

    // These two ids must be retrievable.
    for id in ["nav", "content_001"] {
        assert!(book.get_manifest_item(id).is_ok());
    }

    // Looking up "content_002" must fail.
    assert!(book.get_manifest_item("content_002").is_err());
}
2028
/// ID: ocf-url_relative
///
/// The manifest points at an XHTML file in an arbitrary subfolder that is relative to the package's own arbitrary folder; the reading system must be able to locate that content.
#[test]
fn test_ocf_url_relative() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-url_relative.epub"));
    assert!(opened.is_ok());
    let mut book = opened.unwrap();

    assert_eq!(book.package_path, Path::new("foo/BAR/baz.opf"));
    assert_eq!(book.base_path, Path::new("foo/BAR"));

    // Manifest id paired with the path it must carry.
    let expected_paths = [
        ("nav", "foo/BAR/nav.xhtml"),
        ("content_001", "foo/BAR/qux/content_001.xhtml"),
    ];

    for (id, path) in expected_paths {
        assert_eq!(book.manifest.get(id).unwrap().path, Path::new(path));
    }

    // The same entries must also be readable through `get_manifest_item`.
    for (id, _) in expected_paths {
        assert!(book.get_manifest_item(id).is_ok());
    }
}
2052
/// ID: ocf-zip-comp
///
/// Reading systems MUST treat any OCF ZIP container that uses compression techniques other than Deflate as in error.
/// Note: this test case contains no compression method other than Deflate, so it cannot detect whether that check is effective.
#[test]
fn test_ocf_zip_comp() {
    let source = Path::new("./test_case/ocf-zip-comp.epub");
    assert!(EpubDoc::new(source).is_ok());
}
2063
/// ID: ocf-zip-mult
///
/// Reading systems MUST treat any OCF ZIP container that splits the content into segments as in error.
/// Note: this test case is not a segmented OCF ZIP container, so it cannot show whether that check is effective.
#[test]
fn test_ocf_zip_mult() {
    let source = Path::new("./test_case/ocf-zip-mult.epub");
    assert!(EpubDoc::new(source).is_ok());
}
2074
/// ID: ocf-font_obfuscation
///
/// An obfuscated (TrueType) font should be displayed after de-obfuscation.
///
/// The test derives the key from the SHA-1 digest of the publication's unique identifier,
/// XORs it over the start of the stored font, and checks that the result begins with a
/// known font signature.
#[test]
fn test_ocf_font_obfuscation() {
    let epub_file = Path::new("./test_case/ocf-font_obfuscation.epub");
    let doc = EpubDoc::new(epub_file);
    assert!(doc.is_ok());

    let mut doc = doc.unwrap();

    // Hash the identifier in place; no owned copy of the string is needed.
    let mut hasher = Sha1::new();
    hasher.update(doc.unique_identifier.as_bytes());
    let hash = hasher.finalize();

    assert!(doc.encryption.is_some());
    assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);

    let data = &doc.encryption.unwrap()[0];
    assert_eq!(data.method, "http://www.idpf.org/2008/embedding");

    // Read the whole entry with one call instead of iterating it byte by byte.
    let mut font = Vec::new();
    let read = doc
        .archive
        .by_name(&data.data)
        .unwrap()
        .read_to_end(&mut font);
    assert!(read.is_ok());

    // According to the EPUB specification, font obfuscation is applied directly to the font
    // file, so no decompression step is needed before de-obfuscating. At most the first
    // 1040 bytes are XOR-ed, with the digest repeated cyclically as the key.
    let masked_len = min(1040, font.len());
    for (byte, mask) in font[..masked_len].iter_mut().zip(hash.iter().cycle()) {
        *byte ^= *mask;
    }

    assert!(is_valid_font(&font));
}
2118
/// ID: ocf-font_obfuscation-bis
///
/// An obfuscated (TrueType) font should not be displayed after de-obfuscation, because the obfuscation used a different publication id.
///
/// The test derives the key from the SHA-1 digest of a deliberately wrong identifier and
/// checks that the result does not begin with a known font signature.
#[test]
fn test_ocf_font_obfuscation_bis() {
    let epub_file = Path::new("./test_case/ocf-font_obfuscation_bis.epub");
    let doc = EpubDoc::new(epub_file);
    assert!(doc.is_ok());

    let mut doc = doc.unwrap();

    let wrong_unique_id = "wrong-publication-id";
    let mut hasher = Sha1::new();
    hasher.update(wrong_unique_id.as_bytes());
    let hash = hasher.finalize();

    assert!(doc.encryption.is_some());
    assert_eq!(doc.encryption.as_ref().unwrap().len(), 1);

    let data = &doc.encryption.unwrap()[0];
    assert_eq!(data.method, "http://www.idpf.org/2008/embedding");

    // Read the whole entry with one call instead of iterating it byte by byte.
    let mut font = Vec::new();
    let read = doc
        .archive
        .by_name(&data.data)
        .unwrap()
        .read_to_end(&mut font);
    assert!(read.is_ok());

    // De-obfuscate with the wrong key: at most the first 1040 bytes are XOR-ed, with the
    // digest repeated cyclically as the key.
    let masked_len = min(1040, font.len());
    for (byte, mask) in font[..masked_len].iter_mut().zip(hash.iter().cycle()) {
        *byte ^= *mask;
    }

    assert!(!is_valid_font(&font));
}
2162
/// Reports whether `data` starts with one of the four-byte font signatures accepted here.
///
/// Accepted signatures are `OTTO` (OTF) and, for TTF, `0x00010000`, `0x00020000`, `true`
/// and `typ1`. Input shorter than four bytes is never valid.
fn is_valid_font(data: &[u8]) -> bool {
    const SIGNATURES: [&[u8]; 5] = [
        b"OTTO",
        b"\x00\x01\x00\x00",
        b"\x00\x02\x00\x00",
        b"true",
        b"typ1",
    ];

    match data.get(..4) {
        Some(magic) => SIGNATURES.iter().any(|signature| *signature == magic),
        None => false,
    }
}
2176 }
2177
/// Checks that `has_encryption` reports `true` for the font-obfuscation test publication.
#[test]
fn test_fn_has_encryption() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-font_obfuscation.epub"));
    assert!(opened.is_ok());

    let mut book = opened.unwrap();
    let encrypted = book.has_encryption();
    assert!(encrypted);
}
2188
/// Verifies that the "META-INF/encryption.xml" file is parsed correctly.
#[test]
fn test_fn_parse_encryption() {
    let opened = EpubDoc::new(Path::new("./test_case/ocf-font_obfuscation.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    assert!(book.encryption.is_some());

    // Exactly one entry is expected: the obfuscated Lobster font.
    let entries = book.encryption.unwrap();
    assert_eq!(entries.len(), 1);

    let entry = &entries[0];
    assert_eq!(entry.method, "http://www.idpf.org/2008/embedding");
    assert_eq!(entry.data, "EPUB/fonts/Lobster.ttf");
}
2204
/// Checks that `get_metadata` returns the single expected item for the "title" and "language" keys.
#[test]
fn test_get_metadata_existing_key() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());
    let book = opened.unwrap();

    // Metadata key paired with the one value it must hold.
    for (key, expected_value) in [("title", "EPUB 3.3"), ("language", "en-us")] {
        let found = book.get_metadata(key);
        assert!(found.is_some());

        let items = found.unwrap();
        assert_eq!(items.len(), 1);
        assert_eq!(items[0].property, key);
        assert_eq!(items[0].value, expected_value);
    }
}
2229
/// Checks that `get_metadata` returns `None` for a key the publication does not define.
#[test]
fn test_get_metadata_nonexistent_key() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());

    let book = opened.unwrap();
    assert!(book.get_metadata("nonexistent").is_none());
}
2240
/// Checks that `get_metadata` returns every "creator" item, in order, with its id and value.
#[test]
fn test_get_metadata_multiple_items_same_type() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());
    let book = opened.unwrap();

    let found = book.get_metadata("creator");
    assert!(found.is_some());

    let creators = found.unwrap();
    assert_eq!(creators.len(), 3);

    // Expected (id, value) of each creator, in document order.
    let expected = [
        ("creator_id_0", "Matt Garrish, DAISY Consortium"),
        ("creator_id_1", "Ivan Herman, W3C"),
        ("creator_id_2", "Dave Cramer, Invited Expert"),
    ];

    for (creator, (id, name)) in creators.iter().zip(expected) {
        assert_eq!(creator.id, Some(id.to_string()));
        assert_eq!(creator.property, "creator");
        assert_eq!(creator.value, name);
    }
}
2267
/// Checks that the "title" metadata item carries its single `title-type` refinement.
#[test]
fn test_get_metadata_with_refinement() {
    let opened = EpubDoc::new(Path::new("./test_case/epub-33.epub"));
    assert!(opened.is_ok());
    let book = opened.unwrap();

    let found = book.get_metadata("title");
    assert!(found.is_some());

    let titles = found.unwrap();
    assert_eq!(titles.len(), 1);

    let refinements = &titles[0].refined;
    assert_eq!(refinements.len(), 1);
    assert_eq!(refinements[0].property, "title-type");
    assert_eq!(refinements[0].value, "main");
}
2285
/// Exercises `get_manifest_item_with_fallback` for both outcomes: a lookup that accepts
/// `image/psd` must succeed, and a lookup that accepts only XHTML must fail with the
/// "no supported file format" error.
#[test]
fn test_get_manifest_item_with_fallback() {
    let epub_file = Path::new("./test_case/pub-foreign_bad-fallback.epub");
    let doc = EpubDoc::new(epub_file);
    assert!(doc.is_ok());

    let mut doc = doc.unwrap();
    assert!(doc.get_manifest_item("content_001").is_ok());
    assert!(doc.get_manifest_item("bar").is_ok());

    // The resource can be obtained when the fallback chain contains a supported resource.
    // A failure panics directly and reports the underlying error, rather than asserting on
    // a constant `false`.
    match doc.get_manifest_item_with_fallback("content_001", vec!["image/psd"]) {
        Ok((_, mime)) => assert_eq!(mime, "image/psd"),
        Err(err) => panic!("get_manifest_item_with_fallback failed: {}", err),
    }

    // The resource cannot be obtained when the fallback chain has no supported resource.
    assert_eq!(
        doc.get_manifest_item_with_fallback("content_001", vec!["application/xhtml+xml"])
            .unwrap_err()
            .to_string(),
        "No supported file format: The fallback resource does not contain the file format you support."
    );
}
2312}