lib_epub/
utils.rs

1use std::{
2    cmp::min,
3    collections::HashMap,
4    io::{Read, Seek},
5    path::{Path, PathBuf},
6};
7
8#[cfg(feature = "builder")]
9use chrono::Local;
10use quick_xml::{NsReader, events::Event};
11use sha1::{Digest, Sha1};
12use sha2::Sha256;
13use zip::{CompressionMethod, ZipArchive};
14
15use crate::error::EpubError;
16
17#[cfg(feature = "builder")]
/// Local names of the elements that live in the `dc` namespace
///
/// The list is built lazily on first access and is kept in alphabetical order.
/// NOTE(review): `dc` presumably refers to the Dublin Core element set — confirm.
pub static ELEMENT_IN_DC_NAMESPACE: std::sync::LazyLock<Vec<&str>> =
    std::sync::LazyLock::new(|| {
        [
            "contributor",
            "coverage",
            "creator",
            "date",
            "description",
            "format",
            "identifier",
            "language",
            "publisher",
            "relation",
            "rights",
            "source",
            "subject",
            "title",
            "type",
        ]
        .to_vec()
    });
38
39#[cfg(feature = "builder")]
40/// Returns the current time with custom format
41pub fn local_time() -> String {
42    Local::now().format("%Y-%m-%dT%H-%M-%S.%fU%z").to_string()
43}
44
45/// Extracts the contents of a specified file from a ZIP archive
46///
47/// This function reads the raw byte data of a specified file from an EPUB file (which
48/// is essentially a ZIP archive). This is a fundamental utility function for handling
49/// files within an EPUB (such as OPF, NCX, container files, etc.).
50///
51/// # Parameters
52/// - `zip_file`: A mutable reference to a ZIP archive object
53/// - `file_name`: The path to the file to extract (relative to the ZIP archive root directory)
54///
55/// # Return
56/// - `Ok(Vec<u8>)`: Returns a byte vector containing the file data
57///   if the file content was successfully read
58/// - `Err(EpubError)`: The file does not exist or an error occurred during the read operation
59///
60/// # Notes
61/// - The returned data is raw bytes; the caller needs to perform
62///   appropriate decoding based on the file type.
63/// - For text files, further decoding using the `DecodeBytes` trait is usually required.
64pub fn get_file_in_zip_archive<R: Read + Seek>(
65    zip_file: &mut ZipArchive<R>,
66    file_name: &str,
67) -> Result<Vec<u8>, EpubError> {
68    let mut buffer = Vec::<u8>::new();
69    match zip_file.by_name(file_name) {
70        Ok(mut file) => {
71            let _ = file.read_to_end(&mut buffer).map_err(EpubError::from)?;
72            Ok(buffer)
73        }
74        Err(err) => Err(EpubError::from(err)),
75    }
76}
77
78/// Checks if the compression method of all entries in the EPUB file
79/// conforms to the specification requirements.
80///
81/// According to the OCF (Open Container Format) specification, EPUB files
82/// can only use either Stored (uncompressed) or Deflated (deflate compression).
83/// If any other compression method is found, an error will be returned.
84///
85/// # Parameters
86/// - `zip_archive`: The ZIP archive to check.
87///
88/// # Return
89/// - `Ok(())`: All files use the supported compression method
90/// - `Err(EpubError)`: Unsupported compression method found
91///
92/// # Specification Reference
93/// According to the EPUB OCF 3.2 specification: "OCF ZIP containers
94/// MUST only use compression techniques that are supported
95/// by the ZIP format specification (ISO/IEC 21320-1)"
96/// Currently only Stored and Deflated methods are supported.
97pub fn compression_method_check<R: Read + Seek>(
98    zip_archive: &mut ZipArchive<R>,
99) -> Result<(), EpubError> {
100    for index in 0..zip_archive.len() {
101        let file = zip_archive.by_index(index)?;
102
103        match file.compression() {
104            CompressionMethod::Stored | CompressionMethod::Deflated => continue,
105            _ => {
106                return Err(EpubError::UnusableCompressionMethod {
107                    file: file.name().to_string(),
108                    method: file.compression().to_string(),
109                });
110            }
111        };
112    }
113
114    Ok(())
115}
116
/// Check if relative link is outside the EPUB package scope
///
/// This function resolves the leading `../` segments of a relative link and
/// checks whether following them would "leak" outside the EPUB package
/// structure. It counts the leading `../` segments, climbs that many levels
/// from `epub_path/current_dir`, and then verifies that the resulting directory
/// is still located below `epub_path`.
///
/// # Parameters
/// - `epub_path`: The root path of the EPUB file
/// - `current_dir`: The directory path where the current file is located
/// - `check_file`: The relative path to check
///
/// # Return
/// - `Some(String)`: The resolved path relative to `epub_path` (joined with `/`),
///   if the link is within the EPUB package scope
/// - `None`: If the link is outside the EPUB package scope, or the resolved
///   directory cannot be represented as UTF-8
///
/// # Notes
/// - Only `../` segments at the very start of `check_file` are resolved; any
///   `..` or `.` segment appearing later is copied into the result unchanged.
pub fn check_realtive_link_leakage(
    epub_path: PathBuf,
    current_dir: PathBuf,
    check_file: &str,
) -> Option<String> {
    // Peel off every leading "../" and remember how many levels to climb
    let mut folder_depth = 0usize;
    let mut remaining = check_file;
    while let Some(rest) = remaining.strip_prefix("../") {
        folder_depth += 1;
        remaining = rest;
    }

    // Navigate up the directory tree according to folder_depth
    let mut current_path = epub_path.join(current_dir);
    for _ in 0..folder_depth {
        if !current_path.pop() {
            // nothing left to pop: the link tries to climb above the filesystem root
            return None;
        }
    }

    // The resolved directory must still be below the EPUB root. A directory that
    // is not valid UTF-8 cannot be turned into a `String`, so it is reported as
    // `None` instead of panicking.
    let prefix_path = current_path.strip_prefix(&epub_path).ok()?.to_str()?;

    // construct the final path
    let path = if prefix_path.is_empty() {
        remaining.to_string()
    } else {
        format!("{}/{}", prefix_path, remaining)
    };
    Some(path)
}
169
/// Strips the leading `/` from a path
///
/// When the path starts with `/`, the remainder is returned as an owned path;
/// any other path is returned as an unchanged copy.
pub fn remove_leading_slash<P: AsRef<Path>>(path: P) -> PathBuf {
    let original = path.as_ref();
    original
        .strip_prefix("/")
        .unwrap_or(original)
        .to_path_buf()
}
180
181/// Encrypts the font file using the IDPF font obfuscation algorithm
182///
183/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
184/// with the publication's unique identifier. Due to the integrability of the XOR
185/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
186///
187/// # Parameters
188/// - `data`: Original font data
189/// - `key`: The unique identifier of the EPUB publication
190///
191/// # Return
192/// - `Vec<u8>`: Encrypted font data
193///
194/// # Notes
195/// - This function applies to the IDPF font obfuscation algorithm
196///   (http://www.idpf.org/2008/embedding).
197/// - Only processes the first 1040 bytes of the font file; the rest remains unchanged.
198pub fn idpf_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
199    if data.is_empty() {
200        return Vec::new();
201    }
202
203    let mut hasher = Sha1::new();
204    hasher.update(key.as_bytes());
205    let hash = hasher.finalize();
206
207    let mut key = vec![0u8; 1040];
208    for index in 0..1040 {
209        key[index] = hash[index % hash.len()];
210    }
211
212    let mut obfuscated_data = data.to_vec();
213    for index in 0..min(1040, data.len()) {
214        obfuscated_data[index] ^= key[index];
215    }
216
217    obfuscated_data
218}
219
/// Decrypts a file encrypted using the IDPF obfuscation algorithm
///
/// This simply forwards to [`idpf_font_encryption`]: the obfuscation is an XOR
/// mask, and applying the same mask a second time (A XOR B XOR B = A) restores
/// the original bytes, so encryption and decryption share one implementation.
///
/// # Parameters
/// - `data`: Obfuscated font data
/// - `key`: The unique identifier of the EPUB publication
///
/// # Return
/// - `Vec<u8>`: Decrypted font data
pub fn idpf_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
    idpf_font_encryption(data, key)
}
235
/// Encrypts the font file using the Adobe font obfuscation algorithm
///
/// The first 1024 bytes of the font file are XORed with a 16-byte key built
/// from the bytes of `key`: a shorter key is repeated until 16 bytes are
/// available, a longer key is truncated to its first 16 bytes. Because XOR is
/// its own inverse (A XOR B XOR B = A), encryption and decryption use the same
/// algorithm.
///
/// # Parameters
/// - `data`: Original font data to be obfuscated
/// - `key`: The unique identifier of the EPUB publication
///
/// # Return
/// - `Vec<u8>`: Obfuscated font data, always as long as `data`
///
/// # Notes
/// - This function applies to the adobe font obfuscation algorithm
///   (http://ns.adobe.com/pdf/enc#RC).
/// - Only processes the first 1024 bytes of the font file; the rest remains unchanged.
/// - An empty `key` provides no key material, so the data is returned unchanged.
pub fn adobe_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    const KEY_LENGTH: usize = 16;
    const OBFUSCATED_LENGTH: usize = 1024;

    let mut obfuscated_data = data.to_vec();

    // Without this guard an empty key could never be stretched to 16 bytes.
    if key.is_empty() {
        return obfuscated_data;
    }

    // Repeat the key bytes until exactly 16 of them are collected.
    let mask: Vec<u8> = key.bytes().cycle().take(KEY_LENGTH).collect();

    for (byte, mask_byte) in obfuscated_data
        .iter_mut()
        .take(OBFUSCATED_LENGTH)
        .zip(mask.iter().cycle())
    {
        *byte ^= mask_byte;
    }

    obfuscated_data
}
273
/// Decrypts a file encrypted using the Adobe font obfuscation algorithm
///
/// This simply forwards to [`adobe_font_encryption`]: the obfuscation XORs the
/// first 1024 bytes of the font file with a 16-byte key derived from the
/// publication's unique identifier, and applying the same XOR a second time
/// (A XOR B XOR B = A) restores the original bytes.
///
/// # Parameters
/// - `data`: Obfuscated font data
/// - `key`: The unique identifier of the EPUB publication
///
/// # Return
/// - `Vec<u8>`: Deobfuscated font data
pub fn adobe_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
    adobe_font_encryption(data, key)
}
290
291mod unused_method {
292    #![allow(dead_code)]
293
294    use super::*;
295
296    /// Encrypts data using the XML Encryption AES-128-CBC algorithm
297    ///
298    /// This function encrypts the provided data using the AES-128 algorithm
299    /// in CBC mode, following the XML Encryption specification.
300    ///
301    /// # Parameters
302    /// - `data`: The raw byte data to encrypt
303    /// - `key`: The encryption key string which will be processed to
304    ///   generate the actual encryption key
305    ///
306    /// # Return
307    /// - `Vec<u8>`: The encrypted data
308    ///
309    /// # Notes
310    /// - Uses SHA-256 hashing to derive a 16-byte key from the provided key string
311    /// - Implements http://www.w3.org/2001/04/xmlenc#aes128-cbc algorithm
312    pub fn xml_encryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
313        xml_encryotion_algorithm(data, key, 16)
314    }
315
316    /// Decrypts data using the XML Encryption AES-128-CBC algorithm
317    ///
318    /// This function decrypts the provided data using the AES-128 algorithm
319    /// in CBC mode, following the XML Encryption specification.
320    ///
321    /// # Parameters
322    /// - `data`: The encrypted byte data to decrypt
323    /// - `key`: The decryption key string which will be processed to
324    ///   generate the actual decryption key
325    ///
326    /// # Return
327    /// - `Vec<u8>`: The decrypted data
328    pub fn xml_decryption_aes128_cbc(data: &[u8], key: &str) -> Vec<u8> {
329        xml_encryotion_algorithm(data, key, 16)
330    }
331
332    /// Encrypts data using the XML Encryption AES-192-CBC algorithm
333    ///
334    /// This function encrypts the provided data using the AES-192 algorithm
335    /// in CBC mode, following the XML Encryption specification.
336    ///
337    /// # Parameters
338    /// - `data`: The raw byte data to encrypt
339    /// - `key`: The encryption key string which will be processed to
340    ///   generate the actual encryption key
341    ///
342    /// # Return
343    /// - `Vec<u8>`: The encrypted data
344    ///
345    /// # Notes
346    /// - Uses SHA-256 hashing to derive a 24-byte key from the provided key string
347    /// - Implements http://www.w3.org/2001/04/xmlenc#aes192-cbc algorithm
348    pub fn xml_encryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
349        xml_encryotion_algorithm(data, key, 24)
350    }
351
352    /// Decrypts data using the XML Encryption AES-192-CBC algorithm
353    ///
354    /// This function decrypts the provided data using the AES-192 algorithm
355    /// in CBC mode, following the XML Encryption specification.
356    ///
357    /// # Parameters
358    /// - `data`: The encrypted byte data to decrypt
359    /// - `key`: The decryption key string which will be processed to
360    ///   generate the actual decryption key
361    ///
362    /// # Return
363    /// - `Vec<u8>`: The decrypted data
364    pub fn xml_decryption_aes192_cbc(data: &[u8], key: &str) -> Vec<u8> {
365        xml_encryotion_algorithm(data, key, 24)
366    }
367
368    /// Encrypts data using the XML Encryption AES-256-CBC algorithm
369    ///
370    /// This function encrypts the provided data using the AES-256 algorithm
371    /// in CBC mode, following the XML Encryption specification.
372    ///
373    /// # Parameters
374    /// - `data`: The raw byte data to encrypt
375    /// - `key`: The encryption key string which will be processed to
376    ///   generate the actual encryption key
377    ///
378    /// # Return
379    /// - `Vec<u8>`: The encrypted data
380    ///
381    /// # Notes
382    /// - Uses SHA-256 hashing to derive a 32-byte key from the provided key string
383    /// - Implements http://www.w3.org/2001/04/xmlenc#aes256-cbc algorithm
384    pub fn xml_encryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
385        xml_encryotion_algorithm(data, key, 32)
386    }
387
388    /// Decrypts data using the XML Encryption AES-256-CBC algorithm
389    ///
390    /// This function decrypts the provided data using the AES-256 algorithm
391    /// in CBC mode, following the XML Encryption specification.
392    ///
393    /// # Parameters
394    /// - `data`: The encrypted byte data to decrypt
395    /// - `key`: The decryption key string which will be processed to
396    ///   generate the actual decryption key
397    ///
398    /// # Return
399    /// - `Vec<u8>`: The decrypted data
400    pub fn xml_decryption_aes256_cbc(data: &[u8], key: &str) -> Vec<u8> {
401        xml_encryotion_algorithm(data, key, 32)
402    }
403
404    /// Internal helper function for XML encryption/decryption operations
405    ///
406    /// This function performs XOR-based encryption/decryption on the provided data
407    /// using a key derived from the provided key string via SHA-256 hashing.
408    ///
409    /// # Parameters
410    /// - `data`: The raw byte data to process
411    /// - `key`: The key string which will be processed to generate the actual encryption/decryption key
412    /// - `key_size`: The desired size of the key in bytes (16 for AES-128, 24 for AES-192, 32 for AES-256)
413    ///
414    /// # Return
415    /// - `Vec<u8>`: The processed data (encrypted or decrypted)
416    fn xml_encryotion_algorithm(data: &[u8], key: &str, key_size: usize) -> Vec<u8> {
417        if data.is_empty() {
418            return Vec::new();
419        }
420
421        let mut hasher = Sha256::new();
422        hasher.update(key.as_bytes());
423        let hash = hasher.finalize();
424
425        let ecryption_key = &hash[..min(key_size, hash.len())];
426
427        data.iter()
428            .enumerate()
429            .map(|(index, &byte)| byte ^ ecryption_key[index % key_size])
430            .collect()
431    }
432}
433
/// Provides functionality to decode byte data into strings
///
/// This trait is primarily used to decode raw byte data (such as
/// text files read from EPUB files) into a suitable string representation.
/// It supports automatic detection of multiple encoding formats,
/// including UTF-8 (with or without BOM), UTF-16 BE, and UTF-16 LE.
///
/// # Implementation
/// Currently, this trait is implemented for the `Vec<u8>` type,
/// primarily used for processing text content in EPUB files.
///
/// # Notes
/// - When attempting to parse a byte stream lacking a BOM (Byte Order Mark), the parsing
///   results may be unreadable; caution should be exercised when using such streams.
pub trait DecodeBytes {
    /// Decodes the bytes of `self` into an owned `String`
    ///
    /// # Return
    /// - `Ok(String)`: The decoded text
    /// - `Err(EpubError)`: The bytes could not be decoded
    fn decode(&self) -> Result<String, EpubError>;
}
451
452impl DecodeBytes for Vec<u8> {
453    fn decode(&self) -> Result<String, EpubError> {
454        if self.is_empty() || self.len() < 4 {
455            return Err(EpubError::EmptyDataError);
456        }
457
458        match self[0..3] {
459            // Check UTF-8 BOM (0xEF, 0xBB, 0xBF)
460            [0xEF, 0xBB, 0xBF, ..] => {
461                String::from_utf8(self[3..].to_vec()).map_err(EpubError::from)
462            }
463
464            // Check UTF-16 BE BOM (0xFE, 0xFF)
465            [0xFE, 0xFF, ..] => {
466                let utf16_bytes = &self[2..];
467                let utf16_units: Vec<u16> = utf16_bytes
468                    .chunks_exact(2)
469                    .map(|b| u16::from_be_bytes([b[0], b[1]]))
470                    .collect();
471
472                String::from_utf16(&utf16_units).map_err(EpubError::from)
473            }
474
475            // Check UTF-16 LE BOM (0xFF, 0xFE)
476            [0xFF, 0xFE, ..] => {
477                let utf16_bytes = &self[2..];
478                let utf16_units: Vec<u16> = utf16_bytes
479                    .chunks_exact(2)
480                    .map(|b| u16::from_le_bytes([b[0], b[1]]))
481                    .collect();
482
483                String::from_utf16(&utf16_units).map_err(EpubError::from)
484            }
485
486            // Try without BOM
487            // The analytical results for this branch are unpredictable,
488            // making it difficult to cover all possibilities when testing it.
489            _ => {
490                if let Ok(utf8_str) = String::from_utf8(self.to_vec()) {
491                    return Ok(utf8_str);
492                }
493
494                if self.len() % 2 == 0 {
495                    let utf16_units: Vec<u16> = self
496                        .chunks_exact(2)
497                        .map(|b| u16::from_be_bytes([b[0], b[1]]))
498                        .collect();
499
500                    if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
501                        return Ok(utf16_str);
502                    }
503                }
504
505                if self.len() % 2 == 0 {
506                    let utf16_units: Vec<u16> = self
507                        .chunks_exact(2)
508                        .map(|b| u16::from_le_bytes([b[0], b[1]]))
509                        .collect();
510
511                    if let Ok(utf16_str) = String::from_utf16(&utf16_units) {
512                        return Ok(utf16_str);
513                    }
514                }
515
516                // Final fallback
517                Ok(String::from_utf8_lossy(self).to_string())
518            }
519        }
520    }
521}
522
/// Provides functionality for normalizing whitespace characters
///
/// Every run of whitespace characters (spaces, tabs, newlines, etc.) inside a
/// string is collapsed into one single space, and whitespace at the start and
/// at the end of the string is dropped.
///
/// # Implementation
/// This trait is implemented for both `&str` and `String` types.
pub trait NormalizeWhitespace {
    /// Returns a copy of the text with its whitespace normalized
    fn normalize_whitespace(&self) -> String;
}

impl NormalizeWhitespace for &str {
    fn normalize_whitespace(&self) -> String {
        let mut normalized = String::with_capacity(self.len());

        for word in self.split_whitespace() {
            // Words are never empty, so an empty buffer means "first word".
            if !normalized.is_empty() {
                normalized.push(' ');
            }
            normalized.push_str(word);
        }

        normalized
    }
}

impl NormalizeWhitespace for String {
    fn normalize_whitespace(&self) -> String {
        // Reuse the `&str` implementation.
        <&str as NormalizeWhitespace>::normalize_whitespace(&self.as_str())
    }
}
546
/// Represents an element node in an XML document
///
/// An element owns its attributes, its optional text and CDATA content, and
/// its child elements, so a single root value describes a whole element tree.
#[derive(Debug)]
pub struct XmlElement {
    /// The local name of the element (excluding namespace prefix)
    pub name: String,

    /// The namespace prefix of the element, if it has one
    pub prefix: Option<String>,

    /// The namespace of the element, if one has been assigned
    pub namespace: Option<String>,

    /// The attributes of the element
    ///
    /// The key is the attribute name, the value is the attribute value
    pub attributes: HashMap<String, String>,

    /// The text content of the element
    pub text: Option<String>,

    /// The CDATA content of the element
    pub cdata: Option<String>,

    /// The children of the element
    pub children: Vec<XmlElement>,
}
573
574impl XmlElement {
575    /// Create a new element
576    pub fn new(name: String) -> Self {
577        Self {
578            name,
579            prefix: None,
580            namespace: None,
581            attributes: HashMap::new(),
582            text: None,
583            cdata: None,
584            children: Vec::new(),
585        }
586    }
587
588    /// Get the full tag name of the element
589    ///
590    /// If the element has a namespace prefix, return "prefix:name" format;
591    /// otherwise, return only the element name.
592    pub fn tag_name(&self) -> String {
593        if let Some(prefix) = &self.prefix {
594            format!("{}:{}", prefix, self.name)
595        } else {
596            self.name.clone()
597        }
598    }
599
600    /// Gets the text content of the element and all its child elements
601    ///
602    /// Collects the text content of the current element and the text content of
603    /// all its child elements, removing leading and trailing whitespace.
604    pub fn text(&self) -> String {
605        let mut result = String::new();
606
607        if let Some(text_value) = &self.text {
608            result.push_str(text_value);
609        }
610
611        for child in &self.children {
612            result.push_str(&child.text());
613        }
614
615        result.trim().to_string()
616    }
617
618    /// Returns the value of the specified attribute
619    pub fn get_attr(&self, name: &str) -> Option<String> {
620        self.attributes.get(name).cloned()
621    }
622
623    /// Find all elements with the specified name
624    pub fn find_elements_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
625        SearchElementsByNameIter::new(self, name)
626    }
627
628    /// Find all elements with the specified name among the child elements of the current element
629    pub fn find_children_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
630        self.children.iter().filter(move |child| child.name == name)
631    }
632
633    /// Find all elements with the specified name list among the child elements of the current element
634    pub fn find_children_by_names(&self, names: &[&str]) -> impl Iterator<Item = &XmlElement> {
635        self.children
636            .iter()
637            .filter(move |child| names.contains(&child.name.as_str()))
638    }
639
640    /// Get children elements
641    pub fn children(&self) -> impl Iterator<Item = &XmlElement> {
642        self.children.iter()
643    }
644}
645
646struct SearchElementsByNameIter<'a> {
647    elements: Vec<&'a XmlElement>,
648    current_index: usize,
649    target_name: String,
650}
651
652impl<'a> SearchElementsByNameIter<'a> {
653    fn new(root: &'a XmlElement, name: &str) -> Self {
654        let mut elements = Vec::new();
655        Self::collect_elements(root, &mut elements);
656        Self {
657            elements,
658            current_index: 0,
659            target_name: name.to_string(),
660        }
661    }
662
663    fn collect_elements(element: &'a XmlElement, collection: &mut Vec<&'a XmlElement>) {
664        collection.push(element);
665        for child in &element.children {
666            Self::collect_elements(child, collection);
667        }
668    }
669}
670
671impl<'a> Iterator for SearchElementsByNameIter<'a> {
672    type Item = &'a XmlElement;
673
674    fn next(&mut self) -> Option<Self::Item> {
675        while self.current_index < self.elements.len() {
676            let element = self.elements[self.current_index];
677            self.current_index += 1;
678            if element.name == self.target_name {
679                return Some(element);
680            }
681        }
682        None
683    }
684}
685
/// XML parser used to parse XML content and build an XML element tree
///
/// The type has no fields and carries no state.
pub struct XmlReader {}
688
689#[allow(unused)]
690impl XmlReader {
691    /// Parses an XML from string and builds the root element
692    ///
693    /// This function takes an XML string, parses its content using the `quick_xml` library,
694    /// and builds an `XmlElement` tree representing the structure of the entire XML document.
695    ///
696    /// # Parameters
697    /// - `content`: The XML string to be parsed
698    ///
699    /// # Return
700    /// - `Ok(XmlElement)`: The root element of the XML element tree
701    /// - `Err(EpubError)`: An error occurred during parsing
702    pub fn parse(content: &str) -> Result<XmlElement, EpubError> {
703        if content.is_empty() {
704            return Err(EpubError::EmptyDataError);
705        }
706
707        // Create a XML reader with namespace support
708        let mut reader = NsReader::from_str(content);
709        reader.config_mut().trim_text(true);
710
711        let mut buf = Vec::new();
712        let mut stack = Vec::<XmlElement>::new();
713        let mut root = None;
714        let mut namespace_map = HashMap::new();
715
716        // Read XML events
717        loop {
718            match reader.read_event_into(&mut buf) {
719                // End of file, stop the loop
720                Ok(Event::Eof) => break,
721
722                // Start of an element
723                Ok(Event::Start(e)) => {
724                    let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
725                    let mut element = XmlElement::new(name);
726
727                    if let Some(prefix) = e.name().prefix() {
728                        element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
729                    }
730
731                    for attr in e.attributes().flatten() {
732                        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
733                        let attr_value = String::from_utf8_lossy(&attr.value).to_string();
734
735                        // Handle namespace attributes
736                        if attr_key.contains("xmlns") {
737                            let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
738                            if attr_keys.len() >= 2 {
739                                namespace_map.insert(attr_keys[1].to_string(), attr_value);
740                            } else {
741                                namespace_map.insert(attr_key, attr_value);
742                            }
743
744                            continue;
745                        }
746
747                        element.attributes.insert(attr_key, attr_value);
748                    }
749
750                    stack.push(element);
751                }
752
753                // End of an element
754                Ok(Event::End(_)) => {
755                    if let Some(element) = stack.pop() {
756                        // If the stack is empty,
757                        // the current element is the root element
758                        if stack.is_empty() {
759                            root = Some(element);
760                        } else if let Some(parent) = stack.last_mut() {
761                            // If the stack is not empty,
762                            // the current element is a child element of the last element in the stack
763                            parent.children.push(element);
764                        }
765                    }
766                }
767
768                // Self-closing element
769                Ok(Event::Empty(e)) => {
770                    let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
771                    let mut element = XmlElement::new(name);
772
773                    if let Some(prefix) = e.name().prefix() {
774                        element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
775                    }
776
777                    for attr in e.attributes().flatten() {
778                        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
779                        let attr_value = String::from_utf8_lossy(&attr.value).to_string();
780
781                        if attr_key.contains("xmlns") {
782                            let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
783                            if attr_keys.len() >= 2 {
784                                namespace_map.insert(attr_keys[1].to_string(), attr_value);
785                            } else {
786                                namespace_map.insert(attr_key, attr_value);
787                            }
788
789                            continue;
790                        }
791
792                        element.attributes.insert(attr_key, attr_value);
793                    }
794
795                    // We can almost certainly assert that a self-closing element cannot be
796                    // the root node of an XML file, so this will definitely be executed.
797                    if let Some(parent) = stack.last_mut() {
798                        parent.children.push(element);
799                    }
800                }
801
802                // Text node
803                Ok(Event::Text(e)) => {
804                    if let Some(element) = stack.last_mut() {
805                        let text = String::from_utf8_lossy(e.as_ref()).to_string();
806                        if !text.trim().is_empty() {
807                            element.text = Some(text);
808                        }
809                    }
810                }
811
812                // CDATA node
813                Ok(Event::CData(e)) => {
814                    if let Some(element) = stack.last_mut() {
815                        element.cdata = Some(String::from_utf8_lossy(e.as_ref()).to_string());
816                    }
817                }
818
819                Err(err) => return Err(err.into()),
820
821                // Ignore the following events (elements):
822                // Comment, PI, Declaration, Doctype, GeneralRef
823                _ => continue,
824            }
825        }
826
827        if let Some(element) = root.as_mut() {
828            Self::assign_namespace(element, &namespace_map);
829        }
830
831        // TODO: handle this error with a proper error
832        root.ok_or(EpubError::EmptyDataError)
833    }
834
835    /// Parse XML from bytes and builds the root element
836    pub fn parse_bytes(bytes: Vec<u8>) -> Result<XmlElement, EpubError> {
837        let content = bytes.decode()?;
838        Self::parse(&content)
839    }
840
841    /// Assign namespace to element recursively
842    ///
843    /// # Parameters
844    /// - `element`: The element to assign namespace
845    /// - `namespace_map`: The prefix-namespace map
846    fn assign_namespace(element: &mut XmlElement, namespace_map: &HashMap<String, String>) {
847        if let Some(prefix) = &element.prefix {
848            if let Some(namespace) = namespace_map.get(prefix) {
849                element.namespace = Some(namespace.clone());
850            }
851        } else if let Some(namespace) = namespace_map.get("xmlns") {
852            element.namespace = Some(namespace.clone());
853        }
854
855        for chiled in element.children.iter_mut() {
856            Self::assign_namespace(chiled, namespace_map);
857        }
858    }
859}
860
#[cfg(test)]
mod tests {
    use crate::{
        error::EpubError,
        utils::{DecodeBytes, NormalizeWhitespace},
    };

    /// An empty buffer is rejected
    #[test]
    fn test_decode_empty_data() {
        let data: Vec<u8> = Vec::new();
        assert_eq!(data.decode(), Err(EpubError::EmptyDataError));
    }

    /// A buffer shorter than 4 bytes is rejected
    #[test]
    fn test_decode_short_data() {
        let data: Vec<u8> = vec![0xEF, 0xBB];
        assert_eq!(data.decode(), Err(EpubError::EmptyDataError));
    }

    /// The UTF-8 BOM is recognized and removed
    #[test]
    fn test_decode_utf8_with_bom() {
        let mut data: Vec<u8> = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(b"Hello");

        let decoded = data.decode().expect("UTF-8 with BOM should decode");
        assert_eq!(decoded, "Hello");
    }

    /// The UTF-16 BE BOM selects big-endian decoding
    #[test]
    fn test_decode_utf16_be_with_bom() {
        let mut data: Vec<u8> = vec![0xFE, 0xFF]; // BOM
        for byte in b"Hello" {
            data.extend_from_slice(&[0x00, *byte]);
        }

        let decoded = data.decode().expect("UTF-16 BE with BOM should decode");
        assert_eq!(decoded, "Hello");
    }

    /// The UTF-16 LE BOM selects little-endian decoding
    #[test]
    fn test_decode_utf16_le_with_bom() {
        let mut data: Vec<u8> = vec![0xFF, 0xFE]; // BOM
        for byte in b"Hello" {
            data.extend_from_slice(&[*byte, 0x00]);
        }

        let decoded = data.decode().expect("UTF-16 LE with BOM should decode");
        assert_eq!(decoded, "Hello");
    }

    /// Ordinary UTF-8 text without a BOM is decoded unchanged
    #[test]
    fn test_decode_plain_utf8() {
        let data = b"Hello, World!".to_vec();

        let decoded = data.decode().expect("plain UTF-8 should decode");
        assert_eq!(decoded, "Hello, World!");
    }

    /// Mixed whitespace is collapsed for both `&str` and `String`
    #[test]
    fn test_normalize_whitespace_trait() {
        const RAW: &str = "  Hello,\tWorld!\n\nRust  ";
        const EXPECTED: &str = "Hello, World! Rust";

        // Test for &str
        assert_eq!(RAW.normalize_whitespace(), EXPECTED);

        // Test for String
        let owned = String::from(RAW);
        assert_eq!(owned.normalize_whitespace(), EXPECTED);
    }
}
lib_epub/utils.rs

lib_epub/
utils.rs