lib_epub/
utils.rs

1use std::{
2    cmp::min,
3    collections::HashMap,
4    io::{Read, Seek},
5    path::PathBuf,
6};
7
8#[cfg(feature = "builder")]
9use chrono::Local;
10use quick_xml::{NsReader, events::Event};
11use sha1::{Digest, Sha1};
12use sha2::Sha256;
13use zip::{CompressionMethod, ZipArchive};
14
15use crate::error::EpubError;
16
/// Local names of the elements that belong to the `dc` (Dublin Core) namespace
///
/// The list is created on first access and is kept in alphabetical order.
#[cfg(feature = "builder")]
pub static ELEMENT_IN_DC_NAMESPACE: std::sync::LazyLock<Vec<&str>> =
    std::sync::LazyLock::new(|| {
        Vec::from([
            "contributor",
            "coverage",
            "creator",
            "date",
            "description",
            "format",
            "identifier",
            "language",
            "publisher",
            "relation",
            "rights",
            "source",
            "subject",
            "title",
            "type",
        ])
    });
38
#[cfg(feature = "builder")]
/// Formats the current local time as a timestamp string
///
/// ## Return
/// - `String`: The current local time rendered with the pattern
///   `%Y-%m-%dT%H-%M-%S.%fU%z` (date, `T`, time separated by `-`,
///   the `%f` sub-second field, a literal `U`, then the UTC offset)
pub fn local_time() -> String {
    const TIMESTAMP_FORMAT: &str = "%Y-%m-%dT%H-%M-%S.%fU%z";

    let now = Local::now();
    now.format(TIMESTAMP_FORMAT).to_string()
}
44
45/// Extracts the contents of a specified file from a ZIP archive
46///
47/// This function reads the raw byte data of a specified file from an EPUB file (which
48/// is essentially a ZIP archive). This is a fundamental utility function for handling
49/// files within an EPUB (such as OPF, NCX, container files, etc.).
50///
51/// ## Parameters
52/// - `zip_file`: A mutable reference to a ZIP archive object
53/// - `file_name`: The path to the file to extract (relative to the ZIP archive root directory)
54///
55/// ## Return
56/// - `Ok(Vec<u8>)`: Returns a byte vector containing the file data
57///   if the file content was successfully read
58/// - `Err(EpubError)`: The file does not exist or an error occurred during the read operation
59///
60/// ## Notes
61/// - The returned data is raw bytes; the caller needs to perform
62///   appropriate decoding based on the file type.
63/// - For text files, further decoding using the `DecodeBytes` trait is usually required.
64pub fn get_file_in_zip_archive<R: Read + Seek>(
65    zip_file: &mut ZipArchive<R>,
66    file_name: &str,
67) -> Result<Vec<u8>, EpubError> {
68    let mut buffer = Vec::<u8>::new();
69    match zip_file.by_name(file_name) {
70        Ok(mut file) => {
71            let _ = file.read_to_end(&mut buffer).map_err(EpubError::from)?;
72            Ok(buffer)
73        }
74        Err(err) => Err(EpubError::from(err)),
75    }
76}
77
78/// Checks if the compression method of all entries in the EPUB file
79/// conforms to the specification requirements.
80///
81/// According to the OCF (Open Container Format) specification, EPUB files
82/// can only use either Stored (uncompressed) or Deflated (deflate compression).
83/// If any other compression method is found, an error will be returned.
84///
85/// ## Parameters
86/// - `zip_archive`: The ZIP archive to check.
87///
88/// ## Return
89/// - `Ok(())`: All files use the supported compression method
90/// - `Err(EpubError)`: Unsupported compression method found
91///
92/// ## Specification Reference
93/// According to the EPUB OCF 3.2 specification: "OCF ZIP containers
94/// MUST only use compression techniques that are supported
95/// by the ZIP format specification (ISO/IEC 21320-1)"
96/// Currently only Stored and Deflated methods are supported.
97pub fn compression_method_check<R: Read + Seek>(
98    zip_archive: &mut ZipArchive<R>,
99) -> Result<(), EpubError> {
100    for index in 0..zip_archive.len() {
101        let file = zip_archive.by_index(index)?;
102
103        match file.compression() {
104            CompressionMethod::Stored | CompressionMethod::Deflated => continue,
105            _ => {
106                return Err(EpubError::UnusableCompressionMethod {
107                    file: file.name().to_string(),
108                    method: file.compression().to_string(),
109                });
110            }
111        };
112    }
113
114    Ok(())
115}
116
/// Check if relative link is outside the EPUB package scope
///
/// The leading `../` segments of `check_file` are counted and the same number
/// of directory levels is removed from `epub_path/current_dir`. The link is
/// accepted only if the directory reached this way is still located below
/// `epub_path`.
///
/// ## Parameters
/// - `epub_path`: The root path of the EPUB file
/// - `current_dir`: The directory (relative to the root) of the file containing the link
/// - `check_file`: The relative path to check
///
/// ## Return
/// - `Some(String)`: The link rewritten relative to the package root, using `/`
///   between the resolved directory and the rest of the link
/// - `None`: The link climbs out of the package, or the resolved directory is
///   not valid UTF-8 and therefore cannot be returned as a `String`
///
/// ## Notes
/// - Only `../` segments at the very beginning of `check_file` are resolved;
///   `..` segments appearing later in the link are passed through unchanged.
pub fn check_realtive_link_leakage(
    epub_path: PathBuf,
    current_dir: PathBuf,
    check_file: &str,
) -> Option<String> {
    // Peel off every leading "../" and remember how many levels to climb
    let mut folder_depth = 0usize;
    let mut remaining = check_file;
    while let Some(rest) = remaining.strip_prefix("../") {
        folder_depth += 1;
        remaining = rest;
    }

    // Climb up from the directory of the linking file
    let mut current_path = epub_path.join(current_dir);
    for _ in 0..folder_depth {
        if !current_path.pop() {
            // Nothing left to remove: the link tries to go above the filesystem root
            return None;
        }
    }

    // The directory must still live below the package root. A directory that is
    // not valid UTF-8 yields `None` instead of panicking.
    let prefix_path = current_path.strip_prefix(&epub_path).ok()?.to_str()?;

    let path = if prefix_path.is_empty() {
        remaining.to_string()
    } else {
        format!("{prefix_path}/{remaining}")
    };
    Some(path)
}
169
/// Strips a leading `/` from a path
///
/// ## Parameters
/// - `path`: The path to process
///
/// ## Return
/// - `PathBuf`: The path without its leading `/`, or an unchanged copy of the
///   path when it does not start with `/`
#[cfg(feature = "builder")]
pub fn remove_leading_slash<P: AsRef<std::path::Path>>(path: P) -> PathBuf {
    let original = path.as_ref();
    original.strip_prefix("/").unwrap_or(original).to_path_buf()
}
181
182/// Encrypts the font file using the IDPF font obfuscation algorithm
183///
184/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
185/// with the publication's unique identifier. Due to the integrability of the XOR
186/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
187///
188/// ## Parameters
189/// - `data`: Original font data
190/// - `key`: The unique identifier of the EPUB publication
191///
192/// ## Return
193/// - `Vec<u8>`: Encrypted font data
194///
195/// ## Notes
196/// - This function applies to the IDPF font obfuscation algorithm
197///   (http://www.idpf.org/2008/embedding).
198/// - Only processes the first 1040 bytes of the font file; the rest remains unchanged.
199pub fn idpf_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
200    if data.is_empty() {
201        return Vec::new();
202    }
203
204    let mut hasher = Sha1::new();
205    hasher.update(key.as_bytes());
206    let hash = hasher.finalize();
207
208    let mut key = vec![0u8; 1040];
209    for index in 0..1040 {
210        key[index] = hash[index % hash.len()];
211    }
212
213    let mut obfuscated_data = data.to_vec();
214    for index in 0..min(1040, data.len()) {
215        obfuscated_data[index] ^= key[index];
216    }
217
218    obfuscated_data
219}
220
221/// Decrypts a file encrypted using the IDPF obfuscation algorithm
222///
223/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
224/// with the publication's unique identifier. Due to the integrability of the XOR
225/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
226///
227/// ## Parameters
228/// - `data`: Original font data
229/// - `key`: The unique identifier of the EPUB publication
230///
231/// ## Return
232/// - `Vec<u8>`: Decrypted font data
233pub fn idpf_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
234    idpf_font_encryption(data, key)
235}
236
/// Encrypts the font file using the Adobe font obfuscation algorithm
///
/// A 16-byte key is built from the bytes of `key`: shorter keys are repeated
/// until 16 bytes are available, longer keys are cut after 16 bytes. The first
/// 1024 bytes of the font are XORed with that key, repeating it every 16 bytes.
/// Because XOR is its own inverse (A XOR B XOR B = A), encryption and
/// decryption use the same algorithm.
///
/// ## Parameters
/// - `data`: Original font data to be obfuscated
/// - `key`: The unique identifier of the EPUB publication
///
/// ## Return
/// - `Vec<u8>`: Obfuscated font data, always the same length as `data`.
///   When `key` is empty there is nothing to XOR with and an unchanged copy of
///   `data` is returned.
///
/// ## Notes
/// - This function applies to the adobe font obfuscation algorithm
///   (http://ns.adobe.com/pdf/enc#RC).
/// - Only processes the first 1024 bytes of the font file; the rest remains unchanged.
/// - NOTE(review): the key bytes are the raw UTF-8 bytes of `key`; whether the
///   caller is expected to pre-process the identifier (e.g. hex-decode a UUID)
///   cannot be determined from this function and should be confirmed.
pub fn adobe_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    /// Length of the effective XOR key
    const KEY_LEN: usize = 16;
    /// Number of leading bytes the algorithm obfuscates
    const OBFUSCATED_PREFIX_LEN: usize = 1024;

    if data.is_empty() {
        return Vec::new();
    }

    let key_bytes = key.as_bytes();
    if key_bytes.is_empty() {
        // An empty key can never be extended to 16 bytes; repeating it would
        // loop forever, so the data is returned as-is.
        return data.to_vec();
    }

    // Repeat the key as often as needed and keep exactly the first 16 bytes
    let key: Vec<u8> = key_bytes.iter().copied().cycle().take(KEY_LEN).collect();

    let mut obfuscated_data = data.to_vec();
    obfuscated_data
        .iter_mut()
        .take(OBFUSCATED_PREFIX_LEN)
        .zip(key.iter().cycle())
        .for_each(|(byte, key_byte)| *byte ^= *key_byte);

    obfuscated_data
}
274
275/// Decrypts a file encrypted using the Adobe font obfuscation algorithm
276///
277/// The Adobe font obfuscation algorithm XORs the first 1024 bytes of the font file
278/// with a 16-byte key derived from the publication's unique identifier. Due to the
279/// integrability of the XOR operation (A XOR B XOR B = A), encryption and decryption
280/// use the same algorithm.
281///
282/// ## Parameters
283/// - `data`: Obfuscated font data
284/// - `key`: The unique identifier of the EPUB publication
285///
286/// ## Return
287/// - `Vec<u8>`: Deobfuscated font data
288pub fn adobe_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
289    adobe_font_encryption(data, key)
290}
291
292mod unused_method {
293    #![allow(dead_code)]
294
295    use super::*;
296
297    /// Encrypts data using the XML Encryption AES-128-CBC algorithm
298    ///
299    /// This function encrypts the provided data using the AES-128 algorithm
300    /// in CBC mode, following the XML Encryption specification.
301    ///
302    /// ## Parameters
303    /// - `data`: The raw byte data to encrypt
304    /// - `key`: The encryption key string which will be processed to
305    ///   generate the actual encryption key
306    ///
307    /// ## Return
308    /// - `Vec<u8>`: The encrypted data
309    ///
310    /// ## Notes
311    /// - Uses SHA-256 hashing to derive a 16-byte key from the provided key string
312    /// - Implements http://www.w3.org/2001/04/xmlenc#aes128-cbc algorithm
313    pub fn xml_encryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
314        xml_encryotion_algorithm(data, key, 16)
315    }
316
317    /// Decrypts data using the XML Encryption AES-128-CBC algorithm
318    ///
319    /// This function decrypts the provided data using the AES-128 algorithm
320    /// in CBC mode, following the XML Encryption specification.
321    ///
322    /// ## Parameters
323    /// - `data`: The encrypted byte data to decrypt
324    /// - `key`: The decryption key string which will be processed to
325    ///   generate the actual decryption key
326    ///
327    /// ## Return
328    /// - `Vec<u8>`: The decrypted data
329    pub fn xml_decryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
330        xml_encryotion_algorithm(data, key, 16)
331    }
332
333    /// Encrypts data using the XML Encryption AES-192-CBC algorithm
334    ///
335    /// This function encrypts the provided data using the AES-192 algorithm
336    /// in CBC mode, following the XML Encryption specification.
337    ///
338    /// ## Parameters
339    /// - `data`: The raw byte data to encrypt
340    /// - `key`: The encryption key string which will be processed to
341    ///   generate the actual encryption key
342    ///
343    /// ## Return
344    /// - `Vec<u8>`: The encrypted data
345    ///
346    /// ## Notes
347    /// - Uses SHA-256 hashing to derive a 24-byte key from the provided key string
348    /// - Implements http://www.w3.org/2001/04/xmlenc#aes192-cbc algorithm
349    pub fn xml_encryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
350        xml_encryotion_algorithm(data, key, 24)
351    }
352
353    /// Decrypts data using the XML Encryption AES-192-CBC algorithm
354    ///
355    /// This function decrypts the provided data using the AES-192 algorithm
356    /// in CBC mode, following the XML Encryption specification.
357    ///
358    /// ## Parameters
359    /// - `data`: The encrypted byte data to decrypt
360    /// - `key`: The decryption key string which will be processed to
361    ///   generate the actual decryption key
362    ///
363    /// ## Return
364    /// - `Vec<u8>`: The decrypted data
365    pub fn xml_decryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
366        xml_encryotion_algorithm(data, key, 24)
367    }
368
369    /// Encrypts data using the XML Encryption AES-256-CBC algorithm
370    ///
371    /// This function encrypts the provided data using the AES-256 algorithm
372    /// in CBC mode, following the XML Encryption specification.
373    ///
374    /// ## Parameters
375    /// - `data`: The raw byte data to encrypt
376    /// - `key`: The encryption key string which will be processed to
377    ///   generate the actual encryption key
378    ///
379    /// ## Return
380    /// - `Vec<u8>`: The encrypted data
381    ///
382    /// ## Notes
383    /// - Uses SHA-256 hashing to derive a 32-byte key from the provided key string
384    /// - Implements http://www.w3.org/2001/04/xmlenc#aes256-cbc algorithm
385    pub fn xml_encryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
386        xml_encryotion_algorithm(data, key, 32)
387    }
388
389    /// Decrypts data using the XML Encryption AES-256-CBC algorithm
390    ///
391    /// This function decrypts the provided data using the AES-256 algorithm
392    /// in CBC mode, following the XML Encryption specification.
393    ///
394    /// ## Parameters
395    /// - `data`: The encrypted byte data to decrypt
396    /// - `key`: The decryption key string which will be processed to
397    ///   generate the actual decryption key
398    ///
399    /// ## Return
400    /// - `Vec<u8>`: The decrypted data
401    pub fn xml_decryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
402        xml_encryotion_algorithm(data, key, 32)
403    }
404
405    /// Internal helper function for XML encryption/decryption operations
406    ///
407    /// This function performs XOR-based encryption/decryption on the provided data
408    /// using a key derived from the provided key string via SHA-256 hashing.
409    ///
410    /// ## Parameters
411    /// - `data`: The raw byte data to process
412    /// - `key`: The key string which will be processed to generate the actual encryption/decryption key
413    /// - `key_size`: The desired size of the key in bytes (16 for AES-128, 24 for AES-192, 32 for AES-256)
414    ///
415    /// ## Return
416    /// - `Vec<u8>`: The processed data (encrypted or decrypted)
417    fn xml_encryotion_algorithm(data: &[u8], key: &str, key_size: usize) -> Vec<u8> {
418        if data.is_empty() {
419            return Vec::new();
420        }
421
422        let mut hasher = Sha256::new();
423        hasher.update(key.as_bytes());
424        let hash = hasher.finalize();
425
426        let ecryption_key = &hash[..min(key_size, hash.len())];
427
428        data.iter()
429            .enumerate()
430            .map(|(index, &byte)| byte ^ ecryption_key[index % key_size])
431            .collect()
432    }
433}
434
435/// Provides functionality to decode byte data into strings
436///
437/// This trait is primarily used to decode raw byte data (such as
438/// text files read from EPUB files) into a suitable string representation.
439/// It supports automatic detection of multiple encoding formats,
440/// including UTF-8 (with or without BOM), UTF-16 BE, and UTF-16 LE.
441///
442/// ## Implementation
443/// Currently, this trait is implemented for the `Vec<u8>` type,
444/// primarily used for processing text content in EPUB files.
445///
446/// ## Notes
447/// - When attempting to parse a byte stream lacking a BOM (Byte Order Mark), the parsing
448///   results may be unreadable; caution should be exercised when using such streams.
449pub trait DecodeBytes {
450    fn decode(&self) -> Result<String, EpubError>;
451}
452
453impl DecodeBytes for Vec<u8> {
454    fn decode(&self) -> Result<String, EpubError> {
455        if self.is_empty() || self.len() < 4 {
456            return Err(EpubError::EmptyDataError);
457        }
458
459        match self[0..3] {
460            // Check UTF-8 BOM (0xEF, 0xBB, 0xBF)
461            [0xEF, 0xBB, 0xBF, ..] => {
462                String::from_utf8(self[3..].to_vec()).map_err(EpubError::from)
463            }
464
465            // Check UTF-16 BE BOM (0xFE, 0xFF)
466            [0xFE, 0xFF, ..] => {
467                let utf16_bytes = &self[2..];
468                let utf16_units: Vec<u16> = utf16_bytes
469                    .chunks_exact(2)
470                    .map(|b| u16::from_be_bytes([b[0], b[1]]))
471                    .collect();
472
473                String::from_utf16(&utf16_units).map_err(EpubError::from)
474            }
475
476            // Check UTF-16 LE BOM (0xFF, 0xFE)
477            [0xFF, 0xFE, ..] => {
478                let utf16_bytes = &self[2..];
479                let utf16_units: Vec<u16> = utf16_bytes
480                    .chunks_exact(2)
481                    .map(|b| u16::from_le_bytes([b[0], b[1]]))
482                    .collect();
483
484                String::from_utf16(&utf16_units).map_err(EpubError::from)
485            }
486
487            // Try without BOM
488            // The analytical results for this branch are unpredictable,
489            // making it difficult to cover all possibilities when testing it.
490            _ => {
491                if let Ok(utf8_str) = String::from_utf8(self.to_vec()) {
492                    return Ok(utf8_str);
493                }
494
495                if self.len() % 2 == 0 {
496                    let utf16_units: Vec<u16> = self
497                        .chunks_exact(2)
498                        .map(|b| u16::from_be_bytes([b[0], b[1]]))
499                        .collect();
500
501                    if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
502                        return Ok(utf16_str);
503                    }
504                }
505
506                if self.len() % 2 == 0 {
507                    let utf16_units: Vec<u16> = self
508                        .chunks_exact(2)
509                        .map(|b| u16::from_le_bytes([b[0], b[1]]))
510                        .collect();
511
512                    if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
513                        return Ok(utf16_str);
514                    }
515                }
516
517                // Final fallback
518                Ok(String::from_utf8_lossy(self).to_string())
519            }
520        }
521    }
522}
523
/// Collapses whitespace inside a piece of text
///
/// Every run of whitespace characters (spaces, tabs, newlines, etc.) is
/// replaced by a single whitespace character, and whitespace at the beginning
/// and at the end of the text is removed.
///
/// ## Implementation
/// This trait is implemented for both `&str` and `String` types.
pub trait NormalizeWhitespace {
    /// Returns a whitespace-normalized copy of `self`
    fn normalize_whitespace(&self) -> String;
}
535
536impl NormalizeWhitespace for &str {
537    fn normalize_whitespace(&self) -> String {
538        self.split_whitespace().collect::<Vec<_>>().join(" ")
539    }
540}
541
542impl NormalizeWhitespace for String {
543    fn normalize_whitespace(&self) -> String {
544        self.as_str().normalize_whitespace()
545    }
546}
547
/// A single element of a parsed XML document, together with its subtree
#[derive(Debug)]
pub struct XmlElement {
    /// Local name of the element, i.e. the tag name without namespace prefix
    pub name: String,

    /// Namespace prefix written in front of the name, if there is one
    pub prefix: Option<String>,

    /// Namespace the element belongs to, if one has been assigned
    pub namespace: Option<String>,

    /// Attributes of the element
    ///
    /// Maps each attribute name to its value
    pub attributes: HashMap<String, String>,

    /// Text content directly inside the element, if any
    pub text: Option<String>,

    /// CDATA content directly inside the element, if any
    pub cdata: Option<String>,

    /// Child elements, in the order they were added
    pub children: Vec<XmlElement>,
}
574
575impl XmlElement {
576    /// Create a new element
577    pub fn new(name: String) -> Self {
578        Self {
579            name,
580            prefix: None,
581            namespace: None,
582            attributes: HashMap::new(),
583            text: None,
584            cdata: None,
585            children: Vec::new(),
586        }
587    }
588
589    /// Get the full tag name of the element
590    ///
591    /// If the element has a namespace prefix, return "prefix:name" format;
592    /// otherwise, return only the element name.
593    pub fn tag_name(&self) -> String {
594        if let Some(prefix) = &self.prefix {
595            format!("{}:{}", prefix, self.name)
596        } else {
597            self.name.clone()
598        }
599    }
600
601    /// Gets the text content of the element and all its child elements
602    ///
603    /// Collects the text content of the current element and the text content of
604    /// all its child elements, removing leading and trailing whitespace.
605    pub fn text(&self) -> String {
606        let mut result = String::new();
607
608        if let Some(text_value) = &self.text {
609            result.push_str(text_value);
610        }
611
612        for child in &self.children {
613            result.push_str(&child.text());
614        }
615
616        result.trim().to_string()
617    }
618
619    /// Returns the value of the specified attribute
620    pub fn get_attr(&self, name: &str) -> Option<String> {
621        self.attributes.get(name).cloned()
622    }
623
624    /// Find all elements with the specified name
625    pub fn find_elements_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
626        SearchElementsByNameIter::new(self, name)
627    }
628
629    /// Find all elements with the specified name among the child elements of the current element
630    pub fn find_children_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
631        self.children.iter().filter(move |child| child.name == name)
632    }
633
634    /// Find all elements with the specified name list among the child elements of the current element
635    pub fn find_children_by_names(&self, names: &[&str]) -> impl Iterator<Item = &XmlElement> {
636        self.children
637            .iter()
638            .filter(move |child| names.contains(&child.name.as_str()))
639    }
640
641    /// Get children elements
642    pub fn children(&self) -> impl Iterator<Item = &XmlElement> {
643        self.children.iter()
644    }
645}
646
647struct SearchElementsByNameIter<'a> {
648    elements: Vec<&'a XmlElement>,
649    current_index: usize,
650    target_name: String,
651}
652
653impl<'a> SearchElementsByNameIter<'a> {
654    fn new(root: &'a XmlElement, name: &str) -> Self {
655        let mut elements = Vec::new();
656        Self::collect_elements(root, &mut elements);
657        Self {
658            elements,
659            current_index: 0,
660            target_name: name.to_string(),
661        }
662    }
663
664    fn collect_elements(element: &'a XmlElement, collection: &mut Vec<&'a XmlElement>) {
665        collection.push(element);
666        for child in &element.children {
667            Self::collect_elements(child, collection);
668        }
669    }
670}
671
672impl<'a> Iterator for SearchElementsByNameIter<'a> {
673    type Item = &'a XmlElement;
674
675    fn next(&mut self) -> Option<Self::Item> {
676        while self.current_index < self.elements.len() {
677            let element = self.elements[self.current_index];
678            self.current_index += 1;
679            if element.name == self.target_name {
680                return Some(element);
681            }
682        }
683        None
684    }
685}
686
/// Entry point for turning XML content into an `XmlElement` tree
///
/// The type has no fields; it only groups the parsing functions.
pub struct XmlReader {}
689
690#[allow(unused)]
691impl XmlReader {
692    /// Parses an XML from string and builds the root element
693    ///
694    /// This function takes an XML string, parses its content using the `quick_xml` library,
695    /// and builds an `XmlElement` tree representing the structure of the entire XML document.
696    ///
697    /// ## Parameters
698    /// - `content`: The XML string to be parsed
699    ///
700    /// ## Return
701    /// - `Ok(XmlElement)`: The root element of the XML element tree
702    /// - `Err(EpubError)`: An error occurred during parsing
703    pub fn parse(content: &str) -> Result<XmlElement, EpubError> {
704        if content.is_empty() {
705            return Err(EpubError::EmptyDataError);
706        }
707
708        // Create a XML reader with namespace support
709        let mut reader = NsReader::from_str(content);
710        reader.config_mut().trim_text(true);
711
712        let mut buf = Vec::new();
713        let mut stack = Vec::<XmlElement>::new();
714        let mut root = None;
715        let mut namespace_map = HashMap::new();
716
717        // Read XML events
718        loop {
719            match reader.read_event_into(&mut buf) {
720                // End of file, stop the loop
721                Ok(Event::Eof) => break,
722
723                // Start of an element
724                Ok(Event::Start(e)) => {
725                    let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
726                    let mut element = XmlElement::new(name);
727
728                    if let Some(prefix) = e.name().prefix() {
729                        element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
730                    }
731
732                    for attr in e.attributes().flatten() {
733                        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
734                        let attr_value = String::from_utf8_lossy(&attr.value).to_string();
735
736                        // Handle namespace attributes
737                        if attr_key.contains("xmlns") {
738                            let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
739                            if attr_keys.len() >= 2 {
740                                namespace_map.insert(attr_keys[1].to_string(), attr_value);
741                            } else {
742                                namespace_map.insert(attr_key, attr_value);
743                            }
744
745                            continue;
746                        }
747
748                        element.attributes.insert(attr_key, attr_value);
749                    }
750
751                    stack.push(element);
752                }
753
754                // End of an element
755                Ok(Event::End(_)) => {
756                    if let Some(element) = stack.pop() {
757                        // If the stack is empty,
758                        // the current element is the root element
759                        if stack.is_empty() {
760                            root = Some(element);
761                        } else if let Some(parent) = stack.last_mut() {
762                            // If the stack is not empty,
763                            // the current element is a child element of the last element in the stack
764                            parent.children.push(element);
765                        }
766                    }
767                }
768
769                // Self-closing element
770                Ok(Event::Empty(e)) => {
771                    let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
772                    let mut element = XmlElement::new(name);
773
774                    if let Some(prefix) = e.name().prefix() {
775                        element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
776                    }
777
778                    for attr in e.attributes().flatten() {
779                        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
780                        let attr_value = String::from_utf8_lossy(&attr.value).to_string();
781
782                        if attr_key.contains("xmlns") {
783                            let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
784                            if attr_keys.len() >= 2 {
785                                namespace_map.insert(attr_keys[1].to_string(), attr_value);
786                            } else {
787                                namespace_map.insert(attr_key, attr_value);
788                            }
789
790                            continue;
791                        }
792
793                        element.attributes.insert(attr_key, attr_value);
794                    }
795
796                    // We can almost certainly assert that a self-closing element cannot be
797                    // the root node of an XML file, so this will definitely be executed.
798                    if let Some(parent) = stack.last_mut() {
799                        parent.children.push(element);
800                    }
801                }
802
803                // Text node
804                Ok(Event::Text(e)) => {
805                    if let Some(element) = stack.last_mut() {
806                        let text = String::from_utf8_lossy(e.as_ref()).to_string();
807                        if !text.trim().is_empty() {
808                            element.text = Some(text);
809                        }
810                    }
811                }
812
813                // CDATA node
814                Ok(Event::CData(e)) => {
815                    if let Some(element) = stack.last_mut() {
816                        element.cdata = Some(String::from_utf8_lossy(e.as_ref()).to_string());
817                    }
818                }
819
820                Err(err) => return Err(err.into()),
821
822                // Ignore the following events (elements):
823                // Comment, PI, Declaration, Doctype, GeneralRef
824                _ => continue,
825            }
826        }
827
828        if let Some(element) = root.as_mut() {
829            Self::assign_namespace(element, &namespace_map);
830        }
831
832        // TODO: handle this error with a proper error
833        root.ok_or(EpubError::EmptyDataError)
834    }
835
836    /// Parse XML from bytes and builds the root element
837    pub fn parse_bytes(bytes: Vec<u8>) -> Result<XmlElement, EpubError> {
838        let content = bytes.decode()?;
839        Self::parse(&content)
840    }
841
842    /// Assign namespace to element recursively
843    ///
844    /// ## Parameters
845    /// - `element`: The element to assign namespace
846    /// - `namespace_map`: The prefix-namespace map
847    fn assign_namespace(element: &mut XmlElement, namespace_map: &HashMap<String, String>) {
848        if let Some(prefix) = &element.prefix {
849            if let Some(namespace) = namespace_map.get(prefix) {
850                element.namespace = Some(namespace.clone());
851            }
852        } else if let Some(namespace) = namespace_map.get("xmlns") {
853            element.namespace = Some(namespace.clone());
854        }
855
856        for chiled in element.children.iter_mut() {
857            Self::assign_namespace(chiled, namespace_map);
858        }
859    }
860}
861
#[cfg(test)]
mod tests {
    use crate::{
        error::EpubError,
        utils::{DecodeBytes, NormalizeWhitespace},
    };

    /// Builds `bom` followed by the UTF-16 encoding of `text`, turning each
    /// code unit into bytes with `unit_to_bytes`
    fn utf16_bytes(bom: [u8; 2], text: &str, unit_to_bytes: fn(u16) -> [u8; 2]) -> Vec<u8> {
        let mut bytes = bom.to_vec();
        for unit in text.encode_utf16() {
            bytes.extend_from_slice(&unit_to_bytes(unit));
        }
        bytes
    }

    /// An empty buffer is rejected
    #[test]
    fn test_decode_empty_data() {
        let data: Vec<u8> = Vec::new();
        assert_eq!(data.decode(), Err(EpubError::EmptyDataError));
    }

    /// A buffer shorter than 4 bytes is rejected like an empty one
    #[test]
    fn test_decode_short_data() {
        let data: Vec<u8> = vec![0xEF, 0xBB];
        assert_eq!(data.decode(), Err(EpubError::EmptyDataError));
    }

    /// UTF-8 text preceded by a BOM is decoded and the BOM is removed
    #[test]
    fn test_decode_utf8_with_bom() {
        let mut data: Vec<u8> = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(b"Hello");
        assert_eq!(data.decode().unwrap(), "Hello");
    }

    /// UTF-16 big endian text preceded by its BOM is decoded
    #[test]
    fn test_decode_utf16_be_with_bom() {
        let data = utf16_bytes([0xFE, 0xFF], "Hello", u16::to_be_bytes);
        assert_eq!(data.decode().unwrap(), "Hello");
    }

    /// UTF-16 little endian text preceded by its BOM is decoded
    #[test]
    fn test_decode_utf16_le_with_bom() {
        let data = utf16_bytes([0xFF, 0xFE], "Hello", u16::to_le_bytes);
        assert_eq!(data.decode().unwrap(), "Hello");
    }

    /// Plain UTF-8 text without BOM is decoded unchanged
    #[test]
    fn test_decode_plain_utf8() {
        let data = Vec::from(&b"Hello, World!"[..]);
        assert_eq!(data.decode().unwrap(), "Hello, World!");
    }

    /// Mixed whitespace is collapsed for both `&str` and `String`
    #[test]
    fn test_normalize_whitespace_trait() {
        let expected = "Hello, World! Rust";

        // Test for &str
        let borrowed: &str = "  Hello,\tWorld!\n\nRust  ";
        assert_eq!(borrowed.normalize_whitespace(), expected);

        // Test for String
        let owned = borrowed.to_string();
        assert_eq!(owned.normalize_whitespace(), expected);
    }
}
lib_epub/utils.rs

lib_epub/
utils.rs