lib_epub/utils.rs
1use std::{
2 cmp::min,
3 collections::HashMap,
4 io::{Read, Seek},
5 path::PathBuf,
6};
7
8#[cfg(feature = "builder")]
9use chrono::Local;
10use quick_xml::{NsReader, events::Event};
11use sha1::{Digest, Sha1};
12use sha2::Sha256;
13use zip::{CompressionMethod, ZipArchive};
14
15use crate::error::EpubError;
16
17#[cfg(feature = "builder")]
/// Local names of the elements this crate places in the `dc` namespace.
///
/// The list is built lazily on first access and holds fifteen entries in
/// alphabetical order, from `contributor` to `type`.
pub static ELEMENT_IN_DC_NAMESPACE: std::sync::LazyLock<Vec<&str>> =
    std::sync::LazyLock::new(|| {
        Vec::from([
            "contributor",
            "coverage",
            "creator",
            "date",
            "description",
            "format",
            "identifier",
            "language",
            "publisher",
            "relation",
            "rights",
            "source",
            "subject",
            "title",
            "type",
        ])
    });
38
#[cfg(feature = "builder")]
/// Formats the current local time as a string.
///
/// The pattern handed to chrono is `%Y-%m-%dT%H-%M-%S.%fU%z`: the date and
/// time fields are separated by `-`, followed by `.` and the fractional-second
/// field, a literal `U`, and finally the UTC offset.
///
/// ## Return
/// - `String`: The formatted timestamp
pub fn local_time() -> String {
    let now = Local::now();
    now.format("%Y-%m-%dT%H-%M-%S.%fU%z").to_string()
}
44
45/// Extracts the contents of a specified file from a ZIP archive
46///
47/// This function reads the raw byte data of a specified file from an EPUB file (which
48/// is essentially a ZIP archive). This is a fundamental utility function for handling
49/// files within an EPUB (such as OPF, NCX, container files, etc.).
50///
51/// ## Parameters
52/// - `zip_file`: A mutable reference to a ZIP archive object
53/// - `file_name`: The path to the file to extract (relative to the ZIP archive root directory)
54///
55/// ## Return
56/// - `Ok(Vec<u8>)`: Returns a byte vector containing the file data
57/// if the file content was successfully read
58/// - `Err(EpubError)`: The file does not exist or an error occurred during the read operation
59///
60/// ## Notes
61/// - The returned data is raw bytes; the caller needs to perform
62/// appropriate decoding based on the file type.
63/// - For text files, further decoding using the `DecodeBytes` trait is usually required.
64pub fn get_file_in_zip_archive<R: Read + Seek>(
65 zip_file: &mut ZipArchive<R>,
66 file_name: &str,
67) -> Result<Vec<u8>, EpubError> {
68 let mut buffer = Vec::<u8>::new();
69 match zip_file.by_name(file_name) {
70 Ok(mut file) => {
71 let _ = file.read_to_end(&mut buffer).map_err(EpubError::from)?;
72 Ok(buffer)
73 }
74 Err(err) => Err(EpubError::from(err)),
75 }
76}
77
78/// Checks if the compression method of all entries in the EPUB file
79/// conforms to the specification requirements.
80///
81/// According to the OCF (Open Container Format) specification, EPUB files
82/// can only use either Stored (uncompressed) or Deflated (deflate compression).
83/// If any other compression method is found, an error will be returned.
84///
85/// ## Parameters
86/// - `zip_archive`: The ZIP archive to check.
87///
88/// ## Return
89/// - `Ok(())`: All files use the supported compression method
90/// - `Err(EpubError)`: Unsupported compression method found
91///
92/// ## Specification Reference
93/// According to the EPUB OCF 3.2 specification: "OCF ZIP containers
94/// MUST only use compression techniques that are supported
95/// by the ZIP format specification (ISO/IEC 21320-1)"
96/// Currently only Stored and Deflated methods are supported.
97pub fn compression_method_check<R: Read + Seek>(
98 zip_archive: &mut ZipArchive<R>,
99) -> Result<(), EpubError> {
100 for index in 0..zip_archive.len() {
101 let file = zip_archive.by_index(index)?;
102
103 match file.compression() {
104 CompressionMethod::Stored | CompressionMethod::Deflated => continue,
105 _ => {
106 return Err(EpubError::UnusableCompressionMethod {
107 file: file.name().to_string(),
108 method: file.compression().to_string(),
109 });
110 }
111 };
112 }
113
114 Ok(())
115}
116
/// Resolves a relative link and rejects it when it climbs out of the EPUB package.
///
/// Every leading `../` of `check_file` moves one directory up from
/// `epub_path/current_dir`. The link is accepted only if the directory reached
/// that way is still located under `epub_path`.
///
/// ## Parameters
/// - `epub_path`: The root path of the EPUB package
/// - `current_dir`: Directory of the referencing file, relative to `epub_path`
///   (an absolute `current_dir` replaces `epub_path` when joined and is
///   therefore rejected unless it lies under `epub_path`)
/// - `check_file`: The relative link to resolve
///
/// ## Return
/// - `Some(String)`: The link expressed relative to the package root, with `/`
///   between the directory part and the remainder of the link
/// - `None`: The link leaves the package, or the resolved directory is not
///   valid UTF-8 and cannot be represented as a `String`
///
/// ## Notes
/// - Only *leading* `../` segments are interpreted.
/// - NOTE(review): `..` segments that appear later in `check_file` (for example
///   `a/../../b`) or a bare `..` are passed through untouched — confirm that
///   callers never feed such links, or normalize them before calling.
pub fn check_realtive_link_leakage(
    epub_path: PathBuf,
    current_dir: PathBuf,
    check_file: &str,
) -> Option<String> {
    // Peel off the leading "../" segments and count how far up we have to go.
    let mut folder_depth = 0usize;
    let mut remaining = check_file;
    while let Some(rest) = remaining.strip_prefix("../") {
        folder_depth += 1;
        remaining = rest;
    }

    // Walk up from the directory of the referencing file.
    let mut current_path = epub_path.join(current_dir);
    for _ in 0..folder_depth {
        if !current_path.pop() {
            // Nothing left to pop: the link tries to go above the filesystem root.
            return None;
        }
    }

    // The directory we ended up in must still be inside the package. A path
    // that is not valid UTF-8 yields `None` instead of panicking.
    let prefix_path = current_path.strip_prefix(&epub_path).ok()?.to_str()?;

    let resolved = if prefix_path.is_empty() {
        remaining.to_string()
    } else {
        format!("{}/{}", prefix_path, remaining)
    };
    Some(resolved)
}
169
170/// Removes leading slash from a path
171///
172/// This function removes the leading slash from a path if it exists.
173#[cfg(feature = "builder")]
///
/// ## Parameters
/// - `path`: Any value that can be viewed as a `Path`
///
/// ## Return
/// - `PathBuf`: An owned copy of `path` without its leading `/`; a path that
///   does not start with `/` is copied unchanged
pub fn remove_leading_slash<P: AsRef<std::path::Path>>(path: P) -> PathBuf {
    let original = path.as_ref();

    // `strip_prefix` fails when the path is not rooted at "/"; keep it as-is then.
    original.strip_prefix("/").unwrap_or(original).to_path_buf()
}
181
182/// Encrypts the font file using the IDPF font obfuscation algorithm
183///
184/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
185/// with the publication's unique identifier. Due to the integrability of the XOR
186/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
187///
188/// ## Parameters
189/// - `data`: Original font data
190/// - `key`: The unique identifier of the EPUB publication
191///
192/// ## Return
193/// - `Vec<u8>`: Encrypted font data
194///
195/// ## Notes
196/// - This function applies to the IDPF font obfuscation algorithm
197/// (http://www.idpf.org/2008/embedding).
198/// - Only processes the first 1040 bytes of the font file; the rest remains unchanged.
199pub fn idpf_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
200 if data.is_empty() {
201 return Vec::new();
202 }
203
204 let mut hasher = Sha1::new();
205 hasher.update(key.as_bytes());
206 let hash = hasher.finalize();
207
208 let mut key = vec![0u8; 1040];
209 for index in 0..1040 {
210 key[index] = hash[index % hash.len()];
211 }
212
213 let mut obfuscated_data = data.to_vec();
214 for index in 0..min(1040, data.len()) {
215 obfuscated_data[index] ^= key[index];
216 }
217
218 obfuscated_data
219}
220
221/// Decrypts a file encrypted using the IDPF obfuscation algorithm
222///
223/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
224/// with the publication's unique identifier. Due to the integrability of the XOR
225/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
226///
227/// ## Parameters
228/// - `data`: Original font data
229/// - `key`: The unique identifier of the EPUB publication
230///
231/// ## Return
232/// - `Vec<u8>`: Decrypted font data
233pub fn idpf_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
234 idpf_font_encryption(data, key)
235}
236
/// Obfuscates font data following the Adobe font obfuscation scheme.
///
/// A 16-byte mask is taken from the UTF-8 bytes of `key` (repeated when the key
/// is shorter than 16 bytes, truncated when it is longer) and XORed cyclically
/// onto the first 1024 bytes of `data`. Because XOR with the same mask is its
/// own inverse (A XOR B XOR B = A), running the function twice restores the input.
///
/// ## Parameters
/// - `data`: The font bytes to obfuscate
/// - `key`: The unique identifier of the EPUB publication
///
/// ## Return
/// - `Vec<u8>`: A copy of `data` whose first 1024 bytes (or fewer, for shorter
///   input) are masked. Empty `data` yields an empty vector. An empty `key`
///   provides no mask, so `data` is returned unchanged.
///
/// ## Notes
/// - This is the algorithm identified by `http://ns.adobe.com/pdf/enc#RC`.
/// - Bytes after the first 1024 are copied unchanged.
/// - NOTE(review): the mask is built from the raw text bytes of `key`; confirm
///   that callers pass the identifier in the form the scheme expects.
pub fn adobe_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    let key_bytes = key.as_bytes();

    // Without key material there is nothing to XOR with. Returning early also
    // avoids looping forever while trying to stretch an empty key to 16 bytes.
    if data.is_empty() || key_bytes.is_empty() {
        return data.to_vec();
    }

    // Stretch or truncate the key to exactly 16 bytes.
    let mut mask = [0u8; 16];
    for (slot, &byte) in mask.iter_mut().zip(key_bytes.iter().cycle()) {
        *slot = byte;
    }

    // Only the head of the file is masked.
    let span = min(1024, data.len());
    let mut obfuscated_data = data.to_vec();
    for (byte, mask_byte) in obfuscated_data[..span].iter_mut().zip(mask.iter().cycle()) {
        *byte ^= mask_byte;
    }

    obfuscated_data
}
274
275/// Decrypts a file encrypted using the Adobe font obfuscation algorithm
276///
277/// The Adobe font obfuscation algorithm XORs the first 1024 bytes of the font file
278/// with a 16-byte key derived from the publication's unique identifier. Due to the
279/// integrability of the XOR operation (A XOR B XOR B = A), encryption and decryption
280/// use the same algorithm.
281///
282/// ## Parameters
283/// - `data`: Obfuscated font data
284/// - `key`: The unique identifier of the EPUB publication
285///
286/// ## Return
287/// - `Vec<u8>`: Deobfuscated font data
288pub fn adobe_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
289 adobe_font_encryption(data, key)
290}
291
292mod unused_method {
293 #![allow(dead_code)]
294
295 use super::*;
296
297 /// Encrypts data using the XML Encryption AES-128-CBC algorithm
298 ///
299 /// This function encrypts the provided data using the AES-128 algorithm
300 /// in CBC mode, following the XML Encryption specification.
301 ///
302 /// ## Parameters
303 /// - `data`: The raw byte data to encrypt
304 /// - `key`: The encryption key string which will be processed to
305 /// generate the actual encryption key
306 ///
307 /// ## Return
308 /// - `Vec<u8>`: The encrypted data
309 ///
310 /// ## Notes
311 /// - Uses SHA-256 hashing to derive a 16-byte key from the provided key string
312 /// - Implements http://www.w3.org/2001/04/xmlenc#aes128-cbc algorithm
313 pub fn xml_encryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
314 xml_encryotion_algorithm(data, key, 16)
315 }
316
317 /// Decrypts data using the XML Encryption AES-128-CBC algorithm
318 ///
319 /// This function decrypts the provided data using the AES-128 algorithm
320 /// in CBC mode, following the XML Encryption specification.
321 ///
322 /// ## Parameters
323 /// - `data`: The encrypted byte data to decrypt
324 /// - `key`: The decryption key string which will be processed to
325 /// generate the actual decryption key
326 ///
327 /// ## Return
328 /// - `Vec<u8>`: The decrypted data
329 pub fn xml_decryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
330 xml_encryotion_algorithm(data, key, 16)
331 }
332
333 /// Encrypts data using the XML Encryption AES-192-CBC algorithm
334 ///
335 /// This function encrypts the provided data using the AES-192 algorithm
336 /// in CBC mode, following the XML Encryption specification.
337 ///
338 /// ## Parameters
339 /// - `data`: The raw byte data to encrypt
340 /// - `key`: The encryption key string which will be processed to
341 /// generate the actual encryption key
342 ///
343 /// ## Return
344 /// - `Vec<u8>`: The encrypted data
345 ///
346 /// ## Notes
347 /// - Uses SHA-256 hashing to derive a 24-byte key from the provided key string
348 /// - Implements http://www.w3.org/2001/04/xmlenc#aes192-cbc algorithm
349 pub fn xml_encryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
350 xml_encryotion_algorithm(data, key, 24)
351 }
352
353 /// Decrypts data using the XML Encryption AES-192-CBC algorithm
354 ///
355 /// This function decrypts the provided data using the AES-192 algorithm
356 /// in CBC mode, following the XML Encryption specification.
357 ///
358 /// ## Parameters
359 /// - `data`: The encrypted byte data to decrypt
360 /// - `key`: The decryption key string which will be processed to
361 /// generate the actual decryption key
362 ///
363 /// ## Return
364 /// - `Vec<u8>`: The decrypted data
365 pub fn xml_decryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
366 xml_encryotion_algorithm(data, key, 24)
367 }
368
369 /// Encrypts data using the XML Encryption AES-256-CBC algorithm
370 ///
371 /// This function encrypts the provided data using the AES-256 algorithm
372 /// in CBC mode, following the XML Encryption specification.
373 ///
374 /// ## Parameters
375 /// - `data`: The raw byte data to encrypt
376 /// - `key`: The encryption key string which will be processed to
377 /// generate the actual encryption key
378 ///
379 /// ## Return
380 /// - `Vec<u8>`: The encrypted data
381 ///
382 /// ## Notes
383 /// - Uses SHA-256 hashing to derive a 32-byte key from the provided key string
384 /// - Implements http://www.w3.org/2001/04/xmlenc#aes256-cbc algorithm
385 pub fn xml_encryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
386 xml_encryotion_algorithm(data, key, 32)
387 }
388
389 /// Decrypts data using the XML Encryption AES-256-CBC algorithm
390 ///
391 /// This function decrypts the provided data using the AES-256 algorithm
392 /// in CBC mode, following the XML Encryption specification.
393 ///
394 /// ## Parameters
395 /// - `data`: The encrypted byte data to decrypt
396 /// - `key`: The decryption key string which will be processed to
397 /// generate the actual decryption key
398 ///
399 /// ## Return
400 /// - `Vec<u8>`: The decrypted data
401 pub fn xml_decryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
402 xml_encryotion_algorithm(data, key, 32)
403 }
404
405 /// Internal helper function for XML encryption/decryption operations
406 ///
407 /// This function performs XOR-based encryption/decryption on the provided data
408 /// using a key derived from the provided key string via SHA-256 hashing.
409 ///
410 /// ## Parameters
411 /// - `data`: The raw byte data to process
412 /// - `key`: The key string which will be processed to generate the actual encryption/decryption key
413 /// - `key_size`: The desired size of the key in bytes (16 for AES-128, 24 for AES-192, 32 for AES-256)
414 ///
415 /// ## Return
416 /// - `Vec<u8>`: The processed data (encrypted or decrypted)
417 fn xml_encryotion_algorithm(data: &[u8], key: &str, key_size: usize) -> Vec<u8> {
418 if data.is_empty() {
419 return Vec::new();
420 }
421
422 let mut hasher = Sha256::new();
423 hasher.update(key.as_bytes());
424 let hash = hasher.finalize();
425
426 let ecryption_key = &hash[..min(key_size, hash.len())];
427
428 data.iter()
429 .enumerate()
430 .map(|(index, &byte)| byte ^ ecryption_key[index % key_size])
431 .collect()
432 }
433}
434
435/// Provides functionality to decode byte data into strings
436///
437/// This trait is primarily used to decode raw byte data (such as
438/// text files read from EPUB files) into a suitable string representation.
439/// It supports automatic detection of multiple encoding formats,
440/// including UTF-8 (with or without BOM), UTF-16 BE, and UTF-16 LE.
441///
442/// ## Implementation
443/// Currently, this trait is implemented for the `Vec<u8>` type,
444/// primarily used for processing text content in EPUB files.
445///
446/// ## Notes
447/// - When attempting to parse a byte stream lacking a BOM (Byte Order Mark), the parsing
448/// results may be unreadable; caution should be exercised when using such streams.
449pub trait DecodeBytes {
450 fn decode(&self) -> Result<String, EpubError>;
451}
452
453impl DecodeBytes for Vec<u8> {
454 fn decode(&self) -> Result<String, EpubError> {
455 if self.is_empty() || self.len() < 4 {
456 return Err(EpubError::EmptyDataError);
457 }
458
459 match self[0..3] {
460 // Check UTF-8 BOM (0xEF, 0xBB, 0xBF)
461 [0xEF, 0xBB, 0xBF, ..] => {
462 String::from_utf8(self[3..].to_vec()).map_err(EpubError::from)
463 }
464
465 // Check UTF-16 BE BOM (0xFE, 0xFF)
466 [0xFE, 0xFF, ..] => {
467 let utf16_bytes = &self[2..];
468 let utf16_units: Vec<u16> = utf16_bytes
469 .chunks_exact(2)
470 .map(|b| u16::from_be_bytes([b[0], b[1]]))
471 .collect();
472
473 String::from_utf16(&utf16_units).map_err(EpubError::from)
474 }
475
476 // Check UTF-16 LE BOM (0xFF, 0xFE)
477 [0xFF, 0xFE, ..] => {
478 let utf16_bytes = &self[2..];
479 let utf16_units: Vec<u16> = utf16_bytes
480 .chunks_exact(2)
481 .map(|b| u16::from_le_bytes([b[0], b[1]]))
482 .collect();
483
484 String::from_utf16(&utf16_units).map_err(EpubError::from)
485 }
486
487 // Try without BOM
488 // The analytical results for this branch are unpredictable,
489 // making it difficult to cover all possibilities when testing it.
490 _ => {
491 if let Ok(utf8_str) = String::from_utf8(self.to_vec()) {
492 return Ok(utf8_str);
493 }
494
495 if self.len() % 2 == 0 {
496 let utf16_units: Vec<u16> = self
497 .chunks_exact(2)
498 .map(|b| u16::from_be_bytes([b[0], b[1]]))
499 .collect();
500
501 if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
502 return Ok(utf16_str);
503 }
504 }
505
506 if self.len() % 2 == 0 {
507 let utf16_units: Vec<u16> = self
508 .chunks_exact(2)
509 .map(|b| u16::from_le_bytes([b[0], b[1]]))
510 .collect();
511
512 if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
513 return Ok(utf16_str);
514 }
515 }
516
517 // Final fallback
518 Ok(String::from_utf8_lossy(self).to_string())
519 }
520 }
521 }
522}
523
/// Collapses whitespace in text.
///
/// Every run of whitespace characters (spaces, tabs, line breaks, ...) is
/// replaced by one space, and whitespace at the start and end is dropped.
///
/// ## Implementation
/// The trait is implemented for `&str` and `String`.
pub trait NormalizeWhitespace {
    /// Returns a new string with whitespace collapsed as described on the trait.
    fn normalize_whitespace(&self) -> String;
}

impl NormalizeWhitespace for &str {
    fn normalize_whitespace(&self) -> String {
        let mut normalized = String::with_capacity(self.len());

        for word in self.split_whitespace() {
            // A single space goes between words, never before the first one.
            if !normalized.is_empty() {
                normalized.push(' ');
            }
            normalized.push_str(word);
        }

        normalized
    }
}

impl NormalizeWhitespace for String {
    fn normalize_whitespace(&self) -> String {
        // Reuse the `&str` implementation.
        let borrowed: &str = self.as_str();
        borrowed.normalize_whitespace()
    }
}
547
/// One element node of a parsed XML document, together with its subtree.
#[derive(Debug)]
pub struct XmlElement {
    /// Local name of the element, i.e. the tag name without a namespace prefix
    pub name: String,

    /// Namespace prefix written in front of the name, if any
    pub prefix: Option<String>,

    /// Namespace the element belongs to, if one has been assigned
    pub namespace: Option<String>,

    /// Attributes of the element, keyed by attribute name
    pub attributes: HashMap<String, String>,

    /// Text content that belongs directly to this element
    pub text: Option<String>,

    /// CDATA content that belongs directly to this element
    pub cdata: Option<String>,

    /// Child elements, in the order they were added
    pub children: Vec<XmlElement>,
}

impl XmlElement {
    /// Creates an element with the given name and nothing else set:
    /// no prefix, namespace, attributes, text, CDATA or children.
    pub fn new(name: String) -> Self {
        Self {
            name,
            prefix: None,
            namespace: None,
            attributes: HashMap::new(),
            text: None,
            cdata: None,
            children: Vec::new(),
        }
    }

    /// Returns the tag name as it would be written in a document:
    /// `prefix:name` when a prefix is set, otherwise just `name`.
    pub fn tag_name(&self) -> String {
        match &self.prefix {
            Some(prefix) => format!("{}:{}", prefix, self.name),
            None => self.name.clone(),
        }
    }

    /// Returns the text of this element followed by the text of all descendants.
    ///
    /// The element's own text comes first, then the (already trimmed) text of
    /// each child in order. Leading and trailing whitespace of the combined
    /// string is removed.
    pub fn text(&self) -> String {
        let mut combined = self.text.as_deref().unwrap_or("").to_string();

        for child in &self.children {
            combined.push_str(&child.text());
        }

        combined.trim().to_string()
    }

    /// Returns a copy of the value of attribute `name`, or `None` if it is not set.
    pub fn get_attr(&self, name: &str) -> Option<String> {
        self.attributes.get(name).cloned()
    }

    /// Iterates over every element of this subtree — this element included —
    /// whose local name equals `name`, in document (pre-order) order.
    pub fn find_elements_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        SearchElementsByNameIter::new(self, name)
    }

    /// Iterates over the direct children whose local name equals `name`.
    pub fn find_children_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        // The iterator owns the wanted name, as `find_elements_by_name` does.
        let wanted = name.to_string();
        self.children
            .iter()
            .filter(move |child| child.name == wanted)
    }

    /// Iterates over the direct children whose local name is one of `names`.
    pub fn find_children_by_names(&self, names: &[&str]) -> impl Iterator<Item = &XmlElement> {
        // The iterator owns the wanted names, as `find_elements_by_name` does.
        let wanted: Vec<String> = names.iter().map(|name| name.to_string()).collect();
        self.children
            .iter()
            .filter(move |child| wanted.iter().any(|name| *name == child.name))
    }

    /// Iterates over the direct children of this element.
    pub fn children(&self) -> impl Iterator<Item = &XmlElement> {
        self.children.iter()
    }
}

/// Pre-order walk over an element subtree that yields only the elements
/// carrying a given local name.
struct SearchElementsByNameIter<'a> {
    /// Elements still to be visited; the next one to visit is at the end
    pending: Vec<&'a XmlElement>,
    /// Local name an element must have to be yielded
    target_name: String,
}

impl<'a> SearchElementsByNameIter<'a> {
    /// Starts a search at `root` (which is itself a candidate) for `name`.
    fn new(root: &'a XmlElement, name: &str) -> Self {
        Self {
            pending: vec![root],
            target_name: name.to_string(),
        }
    }
}

impl<'a> Iterator for SearchElementsByNameIter<'a> {
    type Item = &'a XmlElement;

    fn next(&mut self) -> Option<Self::Item> {
        while let Some(element) = self.pending.pop() {
            // Children are pushed in reverse so that the first child is popped
            // first, which keeps the traversal in document order.
            self.pending.extend(element.children.iter().rev());

            if element.name == self.target_name {
                return Some(element);
            }
        }

        None
    }
}
686
687/// XML parser used to parse XML content and build an XML element tree
688pub struct XmlReader {}
689
690#[allow(unused)]
691impl XmlReader {
692 /// Parses an XML from string and builds the root element
693 ///
694 /// This function takes an XML string, parses its content using the `quick_xml` library,
695 /// and builds an `XmlElement` tree representing the structure of the entire XML document.
696 ///
697 /// ## Parameters
698 /// - `content`: The XML string to be parsed
699 ///
700 /// ## Return
701 /// - `Ok(XmlElement)`: The root element of the XML element tree
702 /// - `Err(EpubError)`: An error occurred during parsing
703 pub fn parse(content: &str) -> Result<XmlElement, EpubError> {
704 if content.is_empty() {
705 return Err(EpubError::EmptyDataError);
706 }
707
708 // Create a XML reader with namespace support
709 let mut reader = NsReader::from_str(content);
710 reader.config_mut().trim_text(true);
711
712 let mut buf = Vec::new();
713 let mut stack = Vec::<XmlElement>::new();
714 let mut root = None;
715 let mut namespace_map = HashMap::new();
716
717 // Read XML events
718 loop {
719 match reader.read_event_into(&mut buf) {
720 // End of file, stop the loop
721 Ok(Event::Eof) => break,
722
723 // Start of an element
724 Ok(Event::Start(e)) => {
725 let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
726 let mut element = XmlElement::new(name);
727
728 if let Some(prefix) = e.name().prefix() {
729 element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
730 }
731
732 for attr in e.attributes().flatten() {
733 let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
734 let attr_value = String::from_utf8_lossy(&attr.value).to_string();
735
736 // Handle namespace attributes
737 if attr_key.contains("xmlns") {
738 let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
739 if attr_keys.len() >= 2 {
740 namespace_map.insert(attr_keys[1].to_string(), attr_value);
741 } else {
742 namespace_map.insert(attr_key, attr_value);
743 }
744
745 continue;
746 }
747
748 element.attributes.insert(attr_key, attr_value);
749 }
750
751 stack.push(element);
752 }
753
754 // End of an element
755 Ok(Event::End(_)) => {
756 if let Some(element) = stack.pop() {
757 // If the stack is empty,
758 // the current element is the root element
759 if stack.is_empty() {
760 root = Some(element);
761 } else if let Some(parent) = stack.last_mut() {
762 // If the stack is not empty,
763 // the current element is a child element of the last element in the stack
764 parent.children.push(element);
765 }
766 }
767 }
768
769 // Self-closing element
770 Ok(Event::Empty(e)) => {
771 let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
772 let mut element = XmlElement::new(name);
773
774 if let Some(prefix) = e.name().prefix() {
775 element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
776 }
777
778 for attr in e.attributes().flatten() {
779 let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
780 let attr_value = String::from_utf8_lossy(&attr.value).to_string();
781
782 if attr_key.contains("xmlns") {
783 let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
784 if attr_keys.len() >= 2 {
785 namespace_map.insert(attr_keys[1].to_string(), attr_value);
786 } else {
787 namespace_map.insert(attr_key, attr_value);
788 }
789
790 continue;
791 }
792
793 element.attributes.insert(attr_key, attr_value);
794 }
795
796 // We can almost certainly assert that a self-closing element cannot be
797 // the root node of an XML file, so this will definitely be executed.
798 if let Some(parent) = stack.last_mut() {
799 parent.children.push(element);
800 }
801 }
802
803 // Text node
804 Ok(Event::Text(e)) => {
805 if let Some(element) = stack.last_mut() {
806 let text = String::from_utf8_lossy(e.as_ref()).to_string();
807 if !text.trim().is_empty() {
808 element.text = Some(text);
809 }
810 }
811 }
812
813 // CDATA node
814 Ok(Event::CData(e)) => {
815 if let Some(element) = stack.last_mut() {
816 element.cdata = Some(String::from_utf8_lossy(e.as_ref()).to_string());
817 }
818 }
819
820 Err(err) => return Err(err.into()),
821
822 // Ignore the following events (elements):
823 // Comment, PI, Declaration, Doctype, GeneralRef
824 _ => continue,
825 }
826 }
827
828 if let Some(element) = root.as_mut() {
829 Self::assign_namespace(element, &namespace_map);
830 }
831
832 // TODO: handle this error with a proper error
833 root.ok_or(EpubError::EmptyDataError)
834 }
835
836 /// Parse XML from bytes and builds the root element
837 pub fn parse_bytes(bytes: Vec<u8>) -> Result<XmlElement, EpubError> {
838 let content = bytes.decode()?;
839 Self::parse(&content)
840 }
841
842 /// Assign namespace to element recursively
843 ///
844 /// ## Parameters
845 /// - `element`: The element to assign namespace
846 /// - `namespace_map`: The prefix-namespace map
847 fn assign_namespace(element: &mut XmlElement, namespace_map: &HashMap<String, String>) {
848 if let Some(prefix) = &element.prefix {
849 if let Some(namespace) = namespace_map.get(prefix) {
850 element.namespace = Some(namespace.clone());
851 }
852 } else if let Some(namespace) = namespace_map.get("xmlns") {
853 element.namespace = Some(namespace.clone());
854 }
855
856 for chiled in element.children.iter_mut() {
857 Self::assign_namespace(chiled, namespace_map);
858 }
859 }
860}
861
#[cfg(test)]
mod tests {
    use crate::{
        error::EpubError,
        utils::{DecodeBytes, NormalizeWhitespace},
    };

    /// An empty buffer must be rejected
    #[test]
    fn test_decode_empty_data() {
        let data: Vec<u8> = Vec::new();
        let error = data.decode().expect_err("empty input must not decode");
        assert_eq!(error, EpubError::EmptyDataError);
    }

    /// A buffer shorter than 4 bytes must be rejected
    #[test]
    fn test_decode_short_data() {
        let data: Vec<u8> = vec![0xEF, 0xBB];
        let error = data.decode().expect_err("short input must not decode");
        assert_eq!(error, EpubError::EmptyDataError);
    }

    /// UTF-8 text behind a UTF-8 BOM is decoded and the BOM is dropped
    #[test]
    fn test_decode_utf8_with_bom() {
        let mut data: Vec<u8> = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(b"Hello");

        let decoded = data.decode().expect("UTF-8 with BOM should decode");
        assert_eq!(decoded, "Hello");
    }

    /// UTF-16 text behind a big-endian BOM is decoded
    #[test]
    fn test_decode_utf16_be_with_bom() {
        let mut data: Vec<u8> = vec![0xFE, 0xFF]; // BOM
        for letter in b"Hello" {
            data.extend_from_slice(&[0x00, *letter]);
        }

        let decoded = data.decode().expect("UTF-16 BE with BOM should decode");
        assert_eq!(decoded, "Hello");
    }

    /// UTF-16 text behind a little-endian BOM is decoded
    #[test]
    fn test_decode_utf16_le_with_bom() {
        let mut data: Vec<u8> = vec![0xFF, 0xFE]; // BOM
        for letter in b"Hello" {
            data.extend_from_slice(&[*letter, 0x00]);
        }

        let decoded = data.decode().expect("UTF-16 LE with BOM should decode");
        assert_eq!(decoded, "Hello");
    }

    /// Plain UTF-8 text without a BOM is returned unchanged
    #[test]
    fn test_decode_plain_utf8() {
        let data: Vec<u8> = b"Hello, World!".to_vec();

        let decoded = data.decode().expect("plain UTF-8 should decode");
        assert_eq!(decoded, "Hello, World!");
    }

    /// Mixed whitespace collapses to single spaces, for `&str` and `String` alike
    #[test]
    fn test_normalize_whitespace_trait() {
        let expected = "Hello, World! Rust";

        // Test for &str
        let borrowed = "  Hello,\tWorld!\n\nRust  ";
        assert_eq!(borrowed.normalize_whitespace(), expected);

        // Test for String
        let owned = String::from("  Hello,\tWorld!\n\nRust  ");
        assert_eq!(owned.normalize_whitespace(), expected);
    }
}