lib_epub/utils.rs
1use std::{
2 cmp::min,
3 collections::HashMap,
4 io::{Read, Seek},
5 path::PathBuf,
6};
7
8#[cfg(feature = "builder")]
9use chrono::Local;
10use quick_xml::{NsReader, events::Event};
11use sha1::{Digest, Sha1};
12use sha2::Sha256;
13use zip::{CompressionMethod, ZipArchive};
14
15use crate::error::EpubError;
16
17#[cfg(feature = "builder")]
/// Local names of the elements that live in the `dc` namespace
/// (the fifteen terms of the Dublin Core Metadata Element Set).
///
/// This is a `static`, not a `const`: a `const` is instantiated afresh at every
/// use site, so each access would build a brand-new `LazyLock` and allocate a
/// new `Vec`, which defeats the one-time lazy initialization.
pub static ELEMENT_IN_DC_NAMESPACE: std::sync::LazyLock<Vec<&str>> =
    std::sync::LazyLock::new(|| {
        vec![
            "contributor",
            "coverage",
            "creator",
            "date",
            "description",
            "format",
            "identifier",
            "language",
            "publisher",
            "relation",
            "rights",
            "source",
            "subject",
            "title",
            "type",
        ]
    });
38
39#[cfg(feature = "builder")]
40/// Returns the current time with custom format
41pub fn local_time() -> String {
42 Local::now().format("%Y-%m-%dT%H-%M-%S.%fU%z").to_string()
43}
44
45/// Extracts the contents of a specified file from a ZIP archive
46///
47/// This function reads the raw byte data of a specified file from an EPUB file (which
48/// is essentially a ZIP archive). This is a fundamental utility function for handling
49/// files within an EPUB (such as OPF, NCX, container files, etc.).
50///
51/// # Parameters
52/// - `zip_file`: A mutable reference to a ZIP archive object
53/// - `file_name`: The path to the file to extract (relative to the ZIP archive root directory)
54///
55/// # Return
56/// - `Ok(Vec<u8>)`: Returns a byte vector containing the file data
57/// if the file content was successfully read
58/// - `Err(EpubError)`: The file does not exist or an error occurred during the read operation
59///
60/// # Notes
61/// - The returned data is raw bytes; the caller needs to perform
62/// appropriate decoding based on the file type.
63/// - For text files, further decoding using the `DecodeBytes` trait is usually required.
64pub fn get_file_in_zip_archive<R: Read + Seek>(
65 zip_file: &mut ZipArchive<R>,
66 file_name: &str,
67) -> Result<Vec<u8>, EpubError> {
68 let mut buffer = Vec::<u8>::new();
69 match zip_file.by_name(file_name) {
70 Ok(mut file) => {
71 let _ = file.read_to_end(&mut buffer).map_err(EpubError::from)?;
72 Ok(buffer)
73 }
74 Err(err) => Err(EpubError::from(err)),
75 }
76}
77
78/// Checks if the compression method of all entries in the EPUB file
79/// conforms to the specification requirements.
80///
81/// According to the OCF (Open Container Format) specification, EPUB files
82/// can only use either Stored (uncompressed) or Deflated (deflate compression).
83/// If any other compression method is found, an error will be returned.
84///
85/// # Parameters
86/// - `zip_archive`: The ZIP archive to check.
87///
88/// # Return
89/// - `Ok(())`: All files use the supported compression method
90/// - `Err(EpubError)`: Unsupported compression method found
91///
92/// # Specification Reference
93/// According to the EPUB OCF 3.2 specification: "OCF ZIP containers
94/// MUST only use compression techniques that are supported
95/// by the ZIP format specification (ISO/IEC 21320-1)"
96/// Currently only Stored and Deflated methods are supported.
97pub fn compression_method_check<R: Read + Seek>(
98 zip_archive: &mut ZipArchive<R>,
99) -> Result<(), EpubError> {
100 for index in 0..zip_archive.len() {
101 let file = zip_archive.by_index(index)?;
102
103 match file.compression() {
104 CompressionMethod::Stored | CompressionMethod::Deflated => continue,
105 _ => {
106 return Err(EpubError::UnusableCompressionMethod {
107 file: file.name().to_string(),
108 method: file.compression().to_string(),
109 });
110 }
111 };
112 }
113
114 Ok(())
115}
116
/// Check if relative link is outside the EPUB package scope
///
/// Every leading `../` of `check_file` is counted and stripped, the same number
/// of components is popped from `epub_path` joined with `current_dir`, and the
/// result must still have `epub_path` as a prefix. Only *leading* `../` segments
/// are interpreted; any `..` appearing later in `check_file` is copied through
/// unchanged.
///
/// # Parameters
/// - `epub_path`: The root path of the EPUB file
/// - `current_dir`: The directory path where the current file is located
/// - `check_file`: The relative path to check
///
/// # Return
/// - `Some(String)`: The path relative to `epub_path`, with components joined
///   by `/`, if the link stays inside the EPUB package scope
/// - `None`: If the link climbs out of the EPUB package scope, or if the
///   resolved directory is not valid UTF-8 and cannot be returned as a `String`
pub fn check_realtive_link_leakage(
    epub_path: PathBuf,
    current_dir: PathBuf,
    check_file: &str,
) -> Option<String> {
    // Count how many levels we need to go up, consuming each leading "../"
    let mut folder_depth = 0usize;
    let mut remaining = check_file;
    while let Some(rest) = remaining.strip_prefix("../") {
        folder_depth += 1;
        remaining = rest;
    }

    // Navigate up the directory tree according to folder_depth
    let mut current_path = epub_path.join(current_dir);
    for _ in 0..folder_depth {
        if !current_path.pop() {
            // Nothing left to pop: the link tries to climb above the filesystem root
            return None;
        }
    }

    // The resolved directory must still live under the EPUB root. A directory
    // that is not valid UTF-8 yields `None` rather than panicking.
    let prefix_path = current_path.strip_prefix(&epub_path).ok()?.to_str()?;

    // construct the final path
    let path = if prefix_path.is_empty() {
        remaining.to_string()
    } else {
        format!("{}/{}", prefix_path, remaining)
    };
    Some(path)
}
169
170/// Encrypts the font file using the IDPF font obfuscation algorithm
171///
172/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
173/// with the publication's unique identifier. Due to the integrability of the XOR
174/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
175///
176/// # Parameters
177/// - `data`: Original font data
178/// - `key`: The unique identifier of the EPUB publication
179///
180/// # Return
181/// - `Vec<u8>`: Encrypted font data
182///
183/// # Notes
184/// - This function applies to the IDPF font obfuscation algorithm
185/// (http://www.idpf.org/2008/embedding).
186/// - Only processes the first 1040 bytes of the font file; the rest remains unchanged.
187pub fn idpf_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
188 if data.is_empty() {
189 return Vec::new();
190 }
191
192 let mut hasher = Sha1::new();
193 hasher.update(key.as_bytes());
194 let hash = hasher.finalize();
195
196 let mut key = vec![0u8; 1040];
197 for index in 0..1040 {
198 key[index] = hash[index % hash.len()];
199 }
200
201 let mut obfuscated_data = data.to_vec();
202 for index in 0..min(1040, data.len()) {
203 obfuscated_data[index] ^= key[index];
204 }
205
206 obfuscated_data
207}
208
209/// Decrypts a file encrypted using the IDPF obfuscation algorithm
210///
211/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
212/// with the publication's unique identifier. Due to the integrability of the XOR
213/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
214///
215/// # Parameters
216/// - `data`: Original font data
217/// - `key`: The unique identifier of the EPUB publication
218///
219/// # Return
220/// - `Vec<u8>`: Decrypted font data
221pub fn idpf_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
222 idpf_font_encryption(data, key)
223}
224
/// Encrypts the font file using the Adobe font obfuscation algorithm
///
/// The first 1024 bytes of the font are XORed with a repeating 16-byte mask.
/// The mask is the first 16 bytes of `key`'s UTF-8 bytes, with `key` repeated
/// as often as needed when it is shorter than 16 bytes. XOR is self-inverse
/// (A XOR B XOR B = A), so encryption and decryption use the same algorithm.
///
/// # Parameters
/// - `data`: Original font data to be obfuscated
/// - `key`: The unique identifier of the EPUB publication
///
/// # Return
/// - `Vec<u8>`: Obfuscated font data, the same length as `data`. Empty input
///   gives an empty vector; an empty `key` gives an unmodified copy of `data`,
///   because there is no key material to build a mask from.
///
/// # Notes
/// - This function applies to the adobe font obfuscation algorithm
///   (http://ns.adobe.com/pdf/enc#RC).
/// - Only processes the first 1024 bytes of the font file; the rest remains unchanged.
pub fn adobe_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    if data.is_empty() {
        return Vec::new();
    }

    let key_bytes = key.as_bytes();
    if key_bytes.is_empty() {
        // Repeating an empty key can never reach 16 bytes (the previous padding
        // loop never terminated here), so leave the data as it is.
        return data.to_vec();
    }

    // First 16 bytes of the key repeated end to end.
    let mask: Vec<u8> = key_bytes.iter().copied().cycle().take(16).collect();

    let limit = min(1024, data.len());
    let mut obfuscated_data = data.to_vec();
    for (byte, mask_byte) in obfuscated_data[..limit].iter_mut().zip(mask.iter().cycle()) {
        *byte ^= mask_byte;
    }

    obfuscated_data
}
262
263/// Decrypts a file encrypted using the Adobe font obfuscation algorithm
264///
265/// The Adobe font obfuscation algorithm XORs the first 1024 bytes of the font file
266/// with a 16-byte key derived from the publication's unique identifier. Due to the
267/// integrability of the XOR operation (A XOR B XOR B = A), encryption and decryption
268/// use the same algorithm.
269///
270/// # Parameters
271/// - `data`: Obfuscated font data
272/// - `key`: The unique identifier of the EPUB publication
273///
274/// # Return
275/// - `Vec<u8>`: Deobfuscated font data
276pub fn adobe_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
277 adobe_font_encryption(data, key)
278}
279
280mod unused_method {
281 #![allow(dead_code)]
282
283 use super::*;
284
285 /// Encrypts data using the XML Encryption AES-128-CBC algorithm
286 ///
287 /// This function encrypts the provided data using the AES-128 algorithm
288 /// in CBC mode, following the XML Encryption specification.
289 ///
290 /// # Parameters
291 /// - `data`: The raw byte data to encrypt
292 /// - `key`: The encryption key string which will be processed to
293 /// generate the actual encryption key
294 ///
295 /// # Return
296 /// - `Vec<u8>`: The encrypted data
297 ///
298 /// # Notes
299 /// - Uses SHA-256 hashing to derive a 16-byte key from the provided key string
300 /// - Implements http://www.w3.org/2001/04/xmlenc#aes128-cbc algorithm
301 pub fn xml_encryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
302 xml_encryotion_algorithm(data, key, 16)
303 }
304
305 /// Decrypts data using the XML Encryption AES-128-CBC algorithm
306 ///
307 /// This function decrypts the provided data using the AES-128 algorithm
308 /// in CBC mode, following the XML Encryption specification.
309 ///
310 /// # Parameters
311 /// - `data`: The encrypted byte data to decrypt
312 /// - `key`: The decryption key string which will be processed to
313 /// generate the actual decryption key
314 ///
315 /// # Return
316 /// - `Vec<u8>`: The decrypted data
317 pub fn xml_decryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
318 xml_encryotion_algorithm(data, key, 16)
319 }
320
321 /// Encrypts data using the XML Encryption AES-192-CBC algorithm
322 ///
323 /// This function encrypts the provided data using the AES-192 algorithm
324 /// in CBC mode, following the XML Encryption specification.
325 ///
326 /// # Parameters
327 /// - `data`: The raw byte data to encrypt
328 /// - `key`: The encryption key string which will be processed to
329 /// generate the actual encryption key
330 ///
331 /// # Return
332 /// - `Vec<u8>`: The encrypted data
333 ///
334 /// # Notes
335 /// - Uses SHA-256 hashing to derive a 24-byte key from the provided key string
336 /// - Implements http://www.w3.org/2001/04/xmlenc#aes192-cbc algorithm
337 pub fn xml_encryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
338 xml_encryotion_algorithm(data, key, 24)
339 }
340
341 /// Decrypts data using the XML Encryption AES-192-CBC algorithm
342 ///
343 /// This function decrypts the provided data using the AES-192 algorithm
344 /// in CBC mode, following the XML Encryption specification.
345 ///
346 /// # Parameters
347 /// - `data`: The encrypted byte data to decrypt
348 /// - `key`: The decryption key string which will be processed to
349 /// generate the actual decryption key
350 ///
351 /// # Return
352 /// - `Vec<u8>`: The decrypted data
353 pub fn xml_decryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
354 xml_encryotion_algorithm(data, key, 24)
355 }
356
357 /// Encrypts data using the XML Encryption AES-256-CBC algorithm
358 ///
359 /// This function encrypts the provided data using the AES-256 algorithm
360 /// in CBC mode, following the XML Encryption specification.
361 ///
362 /// # Parameters
363 /// - `data`: The raw byte data to encrypt
364 /// - `key`: The encryption key string which will be processed to
365 /// generate the actual encryption key
366 ///
367 /// # Return
368 /// - `Vec<u8>`: The encrypted data
369 ///
370 /// # Notes
371 /// - Uses SHA-256 hashing to derive a 32-byte key from the provided key string
372 /// - Implements http://www.w3.org/2001/04/xmlenc#aes256-cbc algorithm
373 pub fn xml_encryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
374 xml_encryotion_algorithm(data, key, 32)
375 }
376
377 /// Decrypts data using the XML Encryption AES-256-CBC algorithm
378 ///
379 /// This function decrypts the provided data using the AES-256 algorithm
380 /// in CBC mode, following the XML Encryption specification.
381 ///
382 /// # Parameters
383 /// - `data`: The encrypted byte data to decrypt
384 /// - `key`: The decryption key string which will be processed to
385 /// generate the actual decryption key
386 ///
387 /// # Return
388 /// - `Vec<u8>`: The decrypted data
389 pub fn xml_decryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
390 xml_encryotion_algorithm(data, key, 32)
391 }
392
393 /// Internal helper function for XML encryption/decryption operations
394 ///
395 /// This function performs XOR-based encryption/decryption on the provided data
396 /// using a key derived from the provided key string via SHA-256 hashing.
397 ///
398 /// # Parameters
399 /// - `data`: The raw byte data to process
400 /// - `key`: The key string which will be processed to generate the actual encryption/decryption key
401 /// - `key_size`: The desired size of the key in bytes (16 for AES-128, 24 for AES-192, 32 for AES-256)
402 ///
403 /// # Return
404 /// - `Vec<u8>`: The processed data (encrypted or decrypted)
405 fn xml_encryotion_algorithm(data: &[u8], key: &str, key_size: usize) -> Vec<u8> {
406 if data.is_empty() {
407 return Vec::new();
408 }
409
410 let mut hasher = Sha256::new();
411 hasher.update(key.as_bytes());
412 let hash = hasher.finalize();
413
414 let ecryption_key = &hash[..min(key_size, hash.len())];
415
416 data.iter()
417 .enumerate()
418 .map(|(index, &byte)| byte ^ ecryption_key[index % key_size])
419 .collect()
420 }
421}
422
423/// Provides functionality to decode byte data into strings
424///
425/// This trait is primarily used to decode raw byte data (such as
426/// text files read from EPUB files) into a suitable string representation.
427/// It supports automatic detection of multiple encoding formats,
428/// including UTF-8 (with or without BOM), UTF-16 BE, and UTF-16 LE.
429///
430/// # Implementation
431/// Currently, this trait is implemented for the `Vec<u8>` type,
432/// primarily used for processing text content in EPUB files.
433///
434/// # Notes
435/// - When attempting to parse a byte stream lacking a BOM (Byte Order Mark), the parsing
436/// results may be unreadable; caution should be exercised when using such streams.
437pub trait DecodeBytes {
438 fn decode(&self) -> Result<String, EpubError>;
439}
440
441impl DecodeBytes for Vec<u8> {
442 fn decode(&self) -> Result<String, EpubError> {
443 if self.is_empty() || self.len() < 4 {
444 return Err(EpubError::EmptyDataError);
445 }
446
447 match self[0..3] {
448 // Check UTF-8 BOM (0xEF, 0xBB, 0xBF)
449 [0xEF, 0xBB, 0xBF, ..] => {
450 String::from_utf8(self[3..].to_vec()).map_err(EpubError::from)
451 }
452
453 // Check UTF-16 BE BOM (0xFE, 0xFF)
454 [0xFE, 0xFF, ..] => {
455 let utf16_bytes = &self[2..];
456 let utf16_units: Vec<u16> = utf16_bytes
457 .chunks_exact(2)
458 .map(|b| u16::from_be_bytes([b[0], b[1]]))
459 .collect();
460
461 String::from_utf16(&utf16_units).map_err(EpubError::from)
462 }
463
464 // Check UTF-16 LE BOM (0xFF, 0xFE)
465 [0xFF, 0xFE, ..] => {
466 let utf16_bytes = &self[2..];
467 let utf16_units: Vec<u16> = utf16_bytes
468 .chunks_exact(2)
469 .map(|b| u16::from_le_bytes([b[0], b[1]]))
470 .collect();
471
472 String::from_utf16(&utf16_units).map_err(EpubError::from)
473 }
474
475 // Try without BOM
476 // The analytical results for this branch are unpredictable,
477 // making it difficult to cover all possibilities when testing it.
478 _ => {
479 if let Ok(utf8_str) = String::from_utf8(self.to_vec()) {
480 return Ok(utf8_str);
481 }
482
483 if self.len() % 2 == 0 {
484 let utf16_units: Vec<u16> = self
485 .chunks_exact(2)
486 .map(|b| u16::from_be_bytes([b[0], b[1]]))
487 .collect();
488
489 if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
490 return Ok(utf16_str);
491 }
492 }
493
494 if self.len() % 2 == 0 {
495 let utf16_units: Vec<u16> = self
496 .chunks_exact(2)
497 .map(|b| u16::from_le_bytes([b[0], b[1]]))
498 .collect();
499
500 if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
501 return Ok(utf16_str);
502 }
503 }
504
505 // Final fallback
506 Ok(String::from_utf8_lossy(self).to_string())
507 }
508 }
509 }
510}
511
/// Collapses whitespace inside a string
///
/// Every run of whitespace characters (spaces, tabs, newlines, etc.) is
/// replaced by a single space, and whitespace at the start and at the end
/// of the string is removed entirely.
///
/// # Implementation
/// This trait is implemented for both `&str` and `String` types.
pub trait NormalizeWhitespace {
    /// Returns a copy of the text with its whitespace normalized
    fn normalize_whitespace(&self) -> String;
}

impl NormalizeWhitespace for &str {
    fn normalize_whitespace(&self) -> String {
        let mut normalized = String::with_capacity(self.len());

        for word in self.split_whitespace() {
            // One separating space between words, none before the first one.
            if !normalized.is_empty() {
                normalized.push(' ');
            }
            normalized.push_str(word);
        }

        normalized
    }
}

impl NormalizeWhitespace for String {
    fn normalize_whitespace(&self) -> String {
        // Same rules as for `&str`; reuse that implementation.
        NormalizeWhitespace::normalize_whitespace(&self.as_str())
    }
}
535
/// A single element of a parsed XML document, together with its subtree
#[derive(Debug)]
pub struct XmlElement {
    /// Local name of the element, i.e. the tag name without a namespace prefix
    pub name: String,

    /// Namespace prefix of the element, if it has one
    pub prefix: Option<String>,

    /// Namespace the element belongs to, if known
    pub namespace: Option<String>,

    /// Attributes of the element, keyed by attribute name
    pub attributes: HashMap<String, String>,

    /// Text content stored directly on this element
    pub text: Option<String>,

    /// CDATA content stored directly on this element
    pub cdata: Option<String>,

    /// Child elements, in document order
    pub children: Vec<XmlElement>,
}

impl XmlElement {
    /// Creates an element with the given name and nothing else:
    /// no prefix, namespace, attributes, text, CDATA or children
    pub fn new(name: String) -> Self {
        Self {
            name,
            prefix: None,
            namespace: None,
            attributes: HashMap::new(),
            text: None,
            cdata: None,
            children: Vec::new(),
        }
    }

    /// Returns the tag name as written in the document
    ///
    /// That is `prefix:name` when the element has a namespace prefix,
    /// and just the element name otherwise.
    pub fn tag_name(&self) -> String {
        match &self.prefix {
            Some(prefix) => format!("{}:{}", prefix, self.name),
            None => self.name.clone(),
        }
    }

    /// Returns the text of this element followed by the text of its children
    ///
    /// The element's own text comes first, then the (recursively collected)
    /// text of every child in order. Leading and trailing whitespace of the
    /// result is removed; since children are collected with this same method,
    /// each child's contribution is trimmed as well. CDATA content is not
    /// included.
    pub fn text(&self) -> String {
        let mut collected = self.text.clone().unwrap_or_default();

        for child in &self.children {
            collected.push_str(&child.text());
        }

        collected.trim().to_string()
    }

    /// Returns a copy of the value of the attribute `name`, if it is present
    pub fn get_attr(&self, name: &str) -> Option<String> {
        self.attributes.get(name).map(|value| value.to_owned())
    }

    /// Finds every element called `name` in this subtree
    ///
    /// The search covers this element itself and all of its descendants;
    /// matches are yielded in document (depth-first, pre-order) order.
    pub fn find_elements_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        SearchElementsByNameIter::new(self, name)
    }

    /// Finds the direct children of this element that are called `name`
    pub fn find_children_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        // The closure owns its copy of the name, so the iterator itself only
        // holds on to the borrow of `self`.
        let wanted = name.to_owned();
        self.children.iter().filter(move |child| child.name == wanted)
    }

    /// Finds the direct children of this element whose name is one of `names`
    pub fn find_children_by_names(&self, names: &[&str]) -> impl Iterator<Item = &XmlElement> {
        // Owned copies for the same reason as in `find_children_by_name`.
        let wanted: Vec<String> = names.iter().map(|name| name.to_string()).collect();
        self.children
            .iter()
            .filter(move |child| wanted.iter().any(|name| *name == child.name))
    }

    /// Iterates over the direct children of this element
    pub fn children(&self) -> impl Iterator<Item = &XmlElement> {
        self.children.iter()
    }
}

/// Depth-first, pre-order search for elements with a given name
struct SearchElementsByNameIter<'a> {
    /// Elements still to be visited; the next one to visit is at the end
    pending: Vec<&'a XmlElement>,
    /// The element name being searched for
    target_name: String,
}

impl<'a> SearchElementsByNameIter<'a> {
    /// Starts a search at `root`; `root` itself is a candidate
    fn new(root: &'a XmlElement, name: &str) -> Self {
        Self {
            pending: vec![root],
            target_name: name.to_string(),
        }
    }
}

impl<'a> Iterator for SearchElementsByNameIter<'a> {
    type Item = &'a XmlElement;

    fn next(&mut self) -> Option<Self::Item> {
        while let Some(element) = self.pending.pop() {
            // Children go on the stack in reverse, so that the first child is
            // popped (visited) first and document order is kept.
            self.pending.extend(element.children.iter().rev());

            if element.name == self.target_name {
                return Some(element);
            }
        }

        None
    }
}
674
675/// XML parser used to parse XML content and build an XML element tree
676pub struct XmlReader {}
677
678#[allow(unused)]
679impl XmlReader {
680 /// Parses an XML from string and builds the root element
681 ///
682 /// This function takes an XML string, parses its content using the `quick_xml` library,
683 /// and builds an `XmlElement` tree representing the structure of the entire XML document.
684 ///
685 /// # Parameters
686 /// - `content`: The XML string to be parsed
687 ///
688 /// # Return
689 /// - `Ok(XmlElement)`: The root element of the XML element tree
690 /// - `Err(EpubError)`: An error occurred during parsing
691 pub fn parse(content: &str) -> Result<XmlElement, EpubError> {
692 if content.is_empty() {
693 return Err(EpubError::EmptyDataError);
694 }
695
696 // Create a XML reader with namespace support
697 let mut reader = NsReader::from_str(content);
698 reader.config_mut().trim_text(true);
699
700 let mut buf = Vec::new();
701 let mut stack = Vec::<XmlElement>::new();
702 let mut root = None;
703 let mut namespace_map = HashMap::new();
704
705 // Read XML events
706 loop {
707 match reader.read_event_into(&mut buf) {
708 // End of file, stop the loop
709 Ok(Event::Eof) => break,
710
711 // Start of an element
712 Ok(Event::Start(e)) => {
713 let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
714 let mut element = XmlElement::new(name);
715
716 if let Some(prefix) = e.name().prefix() {
717 element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
718 }
719
720 for attr in e.attributes().flatten() {
721 let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
722 let attr_value = String::from_utf8_lossy(&attr.value).to_string();
723
724 // Handle namespace attributes
725 if attr_key.contains("xmlns") {
726 let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
727 if attr_keys.len() >= 2 {
728 namespace_map.insert(attr_keys[1].to_string(), attr_value);
729 } else {
730 namespace_map.insert(attr_key, attr_value);
731 }
732
733 continue;
734 }
735
736 element.attributes.insert(attr_key, attr_value);
737 }
738
739 stack.push(element);
740 }
741
742 // End of an element
743 Ok(Event::End(_)) => {
744 if let Some(element) = stack.pop() {
745 // If the stack is empty,
746 // the current element is the root element
747 if stack.is_empty() {
748 root = Some(element);
749 } else if let Some(parent) = stack.last_mut() {
750 // If the stack is not empty,
751 // the current element is a child element of the last element in the stack
752 parent.children.push(element);
753 }
754 }
755 }
756
757 // Self-closing element
758 Ok(Event::Empty(e)) => {
759 let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
760 let mut element = XmlElement::new(name);
761
762 if let Some(prefix) = e.name().prefix() {
763 element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
764 }
765
766 for attr in e.attributes().flatten() {
767 let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
768 let attr_value = String::from_utf8_lossy(&attr.value).to_string();
769
770 if attr_key.contains("xmlns") {
771 let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
772 if attr_keys.len() >= 2 {
773 namespace_map.insert(attr_keys[1].to_string(), attr_value);
774 } else {
775 namespace_map.insert(attr_key, attr_value);
776 }
777
778 continue;
779 }
780
781 element.attributes.insert(attr_key, attr_value);
782 }
783
784 // We can almost certainly assert that a self-closing element cannot be
785 // the root node of an XML file, so this will definitely be executed.
786 if let Some(parent) = stack.last_mut() {
787 parent.children.push(element);
788 }
789 }
790
791 // Text node
792 Ok(Event::Text(e)) => {
793 if let Some(element) = stack.last_mut() {
794 let text = String::from_utf8_lossy(e.as_ref()).to_string();
795 if !text.trim().is_empty() {
796 element.text = Some(text);
797 }
798 }
799 }
800
801 // CDATA node
802 Ok(Event::CData(e)) => {
803 if let Some(element) = stack.last_mut() {
804 element.cdata = Some(String::from_utf8_lossy(e.as_ref()).to_string());
805 }
806 }
807
808 Err(err) => return Err(err.into()),
809
810 // Ignore the following events (elements):
811 // Comment, PI, Declaration, Doctype, GeneralRef
812 _ => continue,
813 }
814 }
815
816 if let Some(element) = root.as_mut() {
817 Self::assign_namespace(element, &namespace_map);
818 }
819
820 // TODO: handle this error with a proper error
821 root.ok_or(EpubError::EmptyDataError)
822 }
823
824 /// Parse XML from bytes and builds the root element
825 pub fn parse_bytes(bytes: Vec<u8>) -> Result<XmlElement, EpubError> {
826 let content = bytes.decode()?;
827 Self::parse(&content)
828 }
829
830 /// Assign namespace to element recursively
831 ///
832 /// # Parameters
833 /// - `element`: The element to assign namespace
834 /// - `namespace_map`: The prefix-namespace map
835 fn assign_namespace(element: &mut XmlElement, namespace_map: &HashMap<String, String>) {
836 if let Some(prefix) = &element.prefix {
837 if let Some(namespace) = namespace_map.get(prefix) {
838 element.namespace = Some(namespace.clone());
839 }
840 } else if let Some(namespace) = namespace_map.get("xmlns") {
841 element.namespace = Some(namespace.clone());
842 }
843
844 for chiled in element.children.iter_mut() {
845 Self::assign_namespace(chiled, namespace_map);
846 }
847 }
848}
849
850#[cfg(test)]
851mod tests {
852 use crate::{
853 error::EpubError,
854 utils::{DecodeBytes, NormalizeWhitespace},
855 };
856
857 /// Test with empty data
858 #[test]
859 fn test_decode_empty_data() {
860 let data = vec![];
861 let result = data.decode();
862 assert!(result.is_err());
863 assert_eq!(result.unwrap_err(), EpubError::EmptyDataError);
864 }
865
866 /// Test data with a length of less than 4 bytes
867 #[test]
868 fn test_decode_short_data() {
869 let data = vec![0xEF, 0xBB];
870 let result = data.decode();
871 assert!(result.is_err());
872 assert_eq!(result.unwrap_err(), EpubError::EmptyDataError);
873 }
874
875 /// Testing text decoding with UTF-8 BOM
876 #[test]
877 fn test_decode_utf8_with_bom() {
878 let data: Vec<u8> = vec![0xEF, 0xBB, 0xBF, b'H', b'e', b'l', b'l', b'o'];
879 let result = data.decode();
880 assert!(result.is_ok());
881 assert_eq!(result.unwrap(), "Hello");
882 }
883
884 /// Test text decoding with UTF-16 BE BOM
885 #[test]
886 fn test_decode_utf16_be_with_bom() {
887 let data = vec![
888 0xFE, 0xFF, // BOM
889 0x00, b'H', // H
890 0x00, b'e', // e
891 0x00, b'l', // l
892 0x00, b'l', // l
893 0x00, b'o', // o
894 ];
895 let result = data.decode();
896 assert!(result.is_ok());
897 assert_eq!(result.unwrap(), "Hello");
898 }
899
900 /// Testing text decoding with UTF-16 LE BOM
901 #[test]
902 fn test_decode_utf16_le_with_bom() {
903 let data = vec![
904 0xFF, 0xFE, // BOM
905 b'H', 0x00, // H
906 b'e', 0x00, // e
907 b'l', 0x00, // l
908 b'l', 0x00, // l
909 b'o', 0x00, // o
910 ];
911 let result = data.decode();
912 assert!(result.is_ok());
913 assert_eq!(result.unwrap(), "Hello");
914 }
915
916 /// Testing ordinary UTF-8 text (without BOM)
917 #[test]
918 fn test_decode_plain_utf8() {
919 let data = b"Hello, World!".to_vec();
920 let result = data.decode();
921 assert!(result.is_ok());
922 assert_eq!(result.unwrap(), "Hello, World!");
923 }
924
925 /// Test text standardization containing various whitespace characters
926 #[test]
927 fn test_normalize_whitespace_trait() {
928 // Test for &str
929 let text = " Hello,\tWorld!\n\nRust ";
930 let normalized = text.normalize_whitespace();
931 assert_eq!(normalized, "Hello, World! Rust");
932
933 // Test for String
934 let text_string = String::from(" Hello,\tWorld!\n\nRust ");
935 let normalized = text_string.normalize_whitespace();
936 assert_eq!(normalized, "Hello, World! Rust");
937 }
938}