// lib_epub/utils.rs

1use std::{
2    cmp,
3    collections::HashMap,
4    io::{Read, Seek},
5    path::PathBuf,
6};
7
8#[cfg(feature = "builder")]
9use chrono::Local;
10use quick_xml::{NsReader, events::Event};
11use sha1::{Digest, Sha1};
12use zip::{CompressionMethod, ZipArchive};
13
14use crate::error::EpubError;
15
16#[cfg(feature = "builder")]
/// Local names of the elements that live in the `dc` (Dublin Core) namespace
///
/// The list is built lazily on first access and is kept in alphabetical order.
pub static ELEMENT_IN_DC_NAMESPACE: std::sync::LazyLock<Vec<&str>> =
    std::sync::LazyLock::new(|| {
        const NAMES: [&str; 15] = [
            "contributor",
            "coverage",
            "creator",
            "date",
            "description",
            "format",
            "identifier",
            "language",
            "publisher",
            "relation",
            "rights",
            "source",
            "subject",
            "title",
            "type",
        ];

        NAMES.to_vec()
    });
37
/// Returns the current local time rendered with a custom timestamp layout
///
/// The layout is `%Y-%m-%dT%H-%M-%S.%fU%z`: the date, the time written with
/// `-` separators, the fractional-second field (`%f`), a literal `U`, and the
/// UTC offset (`%z`).
#[cfg(feature = "builder")]
pub fn local_time() -> String {
    let now = Local::now();
    now.format("%Y-%m-%dT%H-%M-%S.%fU%z").to_string()
}
43
44/// Extracts the contents of a specified file from a ZIP archive
45///
46/// This function reads the raw byte data of a specified file from an EPUB file (which
47/// is essentially a ZIP archive). This is a fundamental utility function for handling
48/// files within an EPUB (such as OPF, NCX, container files, etc.).
49///
50/// ## Parameters
51/// - `zip_file`: A mutable reference to a ZIP archive object
52/// - `file_name`: The path to the file to extract (relative to the ZIP archive root directory)
53///
54/// ## Return
55/// - `Ok(Vec<u8>)`: Returns a byte vector containing the file data
56///   if the file content was successfully read
57/// - `Err(EpubError)`: The file does not exist or an error occurred during the read operation
58///
59/// ## Notes
60/// - The returned data is raw bytes; the caller needs to perform
61///   appropriate decoding based on the file type.
62/// - For text files, further decoding using the `DecodeBytes` trait is usually required.
63pub fn get_file_in_zip_archive<R: Read + Seek>(
64    zip_file: &mut ZipArchive<R>,
65    file_name: &str,
66) -> Result<Vec<u8>, EpubError> {
67    let mut buffer = Vec::<u8>::new();
68    match zip_file.by_name(file_name) {
69        Ok(mut file) => {
70            let _ = file.read_to_end(&mut buffer).map_err(EpubError::from)?;
71            Ok(buffer)
72        }
73        Err(err) => Err(EpubError::from(err)),
74    }
75}
76
77/// Checks if the compression method of all entries in the EPUB file
78/// conforms to the specification requirements.
79///
80/// According to the OCF (Open Container Format) specification, EPUB files
81/// can only use either Stored (uncompressed) or Deflated (deflate compression).
82/// If any other compression method is found, an error will be returned.
83///
84/// ## Parameters
85/// - `zip_archive`: The ZIP archive to check.
86///
87/// ## Return
88/// - `Ok(())`: All files use the supported compression method
89/// - `Err(EpubError)`: Unsupported compression method found
90///
91/// ## Specification Reference
92/// According to the EPUB OCF 3.2 specification: "OCF ZIP containers
93/// MUST only use compression techniques that are supported
94/// by the ZIP format specification (ISO/IEC 21320-1)"
95/// Currently only Stored and Deflated methods are supported.
96pub fn compression_method_check<R: Read + Seek>(
97    zip_archive: &mut ZipArchive<R>,
98) -> Result<(), EpubError> {
99    for index in 0..zip_archive.len() {
100        let file = zip_archive.by_index(index)?;
101
102        match file.compression() {
103            CompressionMethod::Stored | CompressionMethod::Deflated => continue,
104            method => {
105                return Err(EpubError::UnusableCompressionMethod {
106                    file: file.name().to_string(),
107                    method: method.to_string(),
108                });
109            }
110        };
111    }
112
113    Ok(())
114}
115
/// Check if relative link is outside the EPUB package scope
///
/// This function resolves a relative link against the directory of the file
/// that contains it and checks that the result does not "leak" outside the
/// EPUB package. The link is processed segment by segment, so `..` is honored
/// wherever it appears (leading, in the middle, or as the last segment).
///
/// ## Parameters
/// - `epub_path`: The root path of the EPUB file
/// - `current_dir`: The directory path where the current file is located
/// - `check_file`: The relative path to check
///
/// ## Return
/// - `Some(String)`: The normalized, `/`-separated path relative to the package
///   root, if the link is within the EPUB package scope
/// - `None`: If the link (or `current_dir` itself) is outside the EPUB package
///   scope, or a path component is not valid UTF-8
///
/// ## Notes
/// - `.` segments and empty segments (doubled or trailing `/`) are dropped
///   from the result.
pub fn check_realtive_link_leakage(
    epub_path: PathBuf,
    current_dir: PathBuf,
    check_file: &str,
) -> Option<String> {
    use std::path::Component;

    // The starting directory must itself be located inside the package.
    let base_dir = epub_path.join(current_dir);
    let inside_package = base_dir.strip_prefix(&epub_path).ok()?;

    // Stack of path segments below the package root.
    // Popping from an empty stack means the path tries to climb above the root.
    let mut segments: Vec<&str> = Vec::new();
    for component in inside_package.components() {
        match component {
            Component::Normal(name) => segments.push(name.to_str()?),
            Component::CurDir => {}
            Component::ParentDir => {
                segments.pop()?;
            }
            Component::RootDir | Component::Prefix(_) => return None,
        }
    }

    // Apply the link one segment at a time so that `..` is resolved against
    // whatever precedes it, not only when it is at the start of the link.
    for segment in check_file.split('/') {
        match segment {
            "" | "." => {}
            ".." => {
                segments.pop()?;
            }
            name => segments.push(name),
        }
    }

    Some(segments.join("/"))
}
165
166/// Removes leading slash from a path
167///
168/// This function removes the leading slash from a path if it exists.
169#[cfg(feature = "builder")]
pub fn remove_leading_slash<P: AsRef<std::path::Path>>(path: P) -> PathBuf {
    let original = path.as_ref();

    // `strip_prefix` fails when the path does not start with `/`;
    // in that case the path is returned as it was given.
    original.strip_prefix("/").unwrap_or(original).to_path_buf()
}
177
178/// Encrypts the font file using the IDPF font obfuscation algorithm
179///
180/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
181/// with the publication's unique identifier. Due to the integrability of the XOR
182/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
183///
184/// ## Parameters
185/// - `data`: Original font data
186/// - `key`: The unique identifier of the EPUB publication
187///
188/// ## Return
189/// - `Vec<u8>`: Encrypted font data
190///
191/// ## Notes
192/// - This function applies to the IDPF font obfuscation algorithm
193///   (http://www.idpf.org/2008/embedding).
194/// - Only processes the first 1040 bytes of the font file; the rest remains unchanged.
195pub fn idpf_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
196    if data.is_empty() {
197        return Vec::new();
198    }
199
200    let hash = {
201        let mut hasher = Sha1::new();
202        hasher.update(key.as_bytes());
203        hasher.finalize()
204    };
205
206    let mut obfuscated_data = data.to_vec();
207    let limit = cmp::min(1040, data.len());
208
209    for (index, byte) in obfuscated_data.iter_mut().take(limit).enumerate() {
210        *byte ^= hash[index % hash.len()]
211    }
212
213    obfuscated_data
214}
215
216/// Decrypts a file encrypted using the IDPF obfuscation algorithm
217///
218/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
219/// with the publication's unique identifier. Due to the integrability of the XOR
220/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
221///
222/// ## Parameters
223/// - `data`: Original font data
224/// - `key`: The unique identifier of the EPUB publication
225///
226/// ## Return
227/// - `Vec<u8>`: Decrypted font data
228pub fn idpf_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
229    idpf_font_encryption(data, key)
230}
231
/// Encrypts the font file using the Adobe font obfuscation algorithm
///
/// The first 1024 bytes of the data are XORed with the first 16 bytes of
/// `key`, repeated cyclically. Because XOR is self-inverse
/// (A XOR B XOR B = A), encryption and decryption use the same algorithm.
///
/// ## Parameters
/// - `data`: Original font data to be obfuscated
/// - `key`: The unique identifier of the EPUB publication
///
/// ## Return
/// - `Vec<u8>`: Obfuscated font data
///
/// ## Notes
/// - This function applies to the adobe font obfuscation algorithm
///   (http://ns.adobe.com/pdf/enc#RC).
/// - Only processes the first 1024 bytes of the font file; the rest remains unchanged.
/// - A key shorter than 16 bytes is repeated over its own length instead of
///   causing an out-of-bounds panic; an empty key leaves the data unchanged.
pub fn adobe_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    let mut obfuscated_data = data.to_vec();

    // Use at most the first 16 key bytes; never index past the end of a short key.
    let key_bytes = key.as_bytes();
    let mask = &key_bytes[..cmp::min(16, key_bytes.len())];
    if mask.is_empty() {
        return obfuscated_data;
    }

    let limit = cmp::min(1024, data.len());
    for (byte, mask_byte) in obfuscated_data
        .iter_mut()
        .take(limit)
        .zip(mask.iter().cycle())
    {
        *byte ^= *mask_byte;
    }

    obfuscated_data
}
264
265/// Decrypts a file encrypted using the Adobe font obfuscation algorithm
266///
267/// The Adobe font obfuscation algorithm XORs the first 1024 bytes of the font file
268/// with a 16-byte key derived from the publication's unique identifier. Due to the
269/// integrability of the XOR operation (A XOR B XOR B = A), encryption and decryption
270/// use the same algorithm.
271///
272/// ## Parameters
273/// - `data`: Obfuscated font data
274/// - `key`: The unique identifier of the EPUB publication
275///
276/// ## Return
277/// - `Vec<u8>`: Deobfuscated font data
278pub fn adobe_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
279    adobe_font_encryption(data, key)
280}
281
282/// Provides functionality to decode byte data into strings
283///
284/// This trait is primarily used to decode raw byte data (such as
285/// text files read from EPUB files) into a suitable string representation.
286/// It supports automatic detection of multiple encoding formats,
287/// including UTF-8 (with or without BOM), UTF-16 BE, and UTF-16 LE.
288///
289/// ## Implementation
290/// Currently, this trait is implemented for the `Vec<u8>` type,
291/// primarily used for processing text content in EPUB files.
292///
293/// ## Notes
294/// - When attempting to parse a byte stream lacking a BOM (Byte Order Mark), the parsing
295///   results may be unreadable; caution should be exercised when using such streams.
296pub trait DecodeBytes {
297    fn decode(&self) -> Result<String, EpubError>;
298}
299
300impl DecodeBytes for Vec<u8> {
301    fn decode(&self) -> Result<String, EpubError> {
302        if self.is_empty() || self.len() < 4 {
303            return Err(EpubError::EmptyDataError);
304        }
305
306        match self.as_slice() {
307            // Check UTF-8 BOM (0xEF, 0xBB, 0xBF)
308            [0xEF, 0xBB, 0xBF, rest @ ..] => {
309                String::from_utf8(rest.to_vec()).map_err(EpubError::from)
310            }
311
312            // Check UTF-16 BE BOM (0xFE, 0xFF)
313            [0xFE, 0xFF, rest @ ..] => {
314                let utf16_units = rest
315                    .chunks_exact(2)
316                    .map(|b| u16::from_be_bytes([b[0], b[1]]))
317                    .collect::<Vec<u16>>();
318
319                String::from_utf16(&utf16_units).map_err(EpubError::from)
320            }
321
322            // Check UTF-16 LE BOM (0xFF, 0xFE)
323            [0xFF, 0xFE, rest @ ..] => {
324                let utf16_units = rest
325                    .chunks_exact(2)
326                    .map(|b| u16::from_le_bytes([b[0], b[1]]))
327                    .collect::<Vec<u16>>();
328
329                String::from_utf16(&utf16_units).map_err(EpubError::from)
330            }
331
332            // Try without BOM
333            // The analytical results for this branch are unpredictable,
334            // making it difficult to cover all possibilities when testing it.
335            _ => {
336                // try UTF-8 first
337                // if the byte stream is not valid UTF-8,
338                // it will be replaced with the replacement character (U+FFFD)
339                let lossless = String::from_utf8_lossy(self);
340                if !lossless.contains('\u{FFFD}') {
341                    return Ok(lossless.into_owned());
342                }
343
344                if self.len() % 2 == 0 {
345                    // try UTF-16 BE
346                    if let Ok(str) = String::from_utf16(
347                        &self
348                            .chunks_exact(2)
349                            .map(|b| u16::from_be_bytes([b[0], b[1]]))
350                            .collect::<Vec<u16>>(),
351                    ) {
352                        return Ok(str);
353                    }
354
355                    // try UTF-16 LE
356                    if let Ok(str) = String::from_utf16(
357                        &self
358                            .chunks_exact(2)
359                            .map(|b| u16::from_le_bytes([b[0], b[1]]))
360                            .collect::<Vec<u16>>(),
361                    ) {
362                        return Ok(str);
363                    }
364                }
365
366                // Final fallback
367                Ok(String::from_utf8_lossy(self).to_string())
368            }
369        }
370    }
371}
372
/// Collapses whitespace inside a string
///
/// Every run of whitespace characters (spaces, tabs, newlines, etc.) becomes
/// a single space, and leading and trailing whitespace is removed.
///
/// ## Implementation
/// This trait is implemented for both `&str` and `String` types.
pub trait NormalizeWhitespace {
    /// Returns a copy of the text with its whitespace normalized
    fn normalize_whitespace(&self) -> String;
}

impl NormalizeWhitespace for &str {
    fn normalize_whitespace(&self) -> String {
        // `split_whitespace` already skips leading, trailing and repeated
        // whitespace, so joining the words with one space is all that is left.
        self.split_whitespace().collect::<Vec<&str>>().join(" ")
    }
}

impl NormalizeWhitespace for String {
    fn normalize_whitespace(&self) -> String {
        // Reuse the `&str` implementation.
        <&str as NormalizeWhitespace>::normalize_whitespace(&self.as_str())
    }
}
407
/// A single element node of a parsed XML document
#[derive(Debug)]
pub struct XmlElement {
    /// Local name of the element (without the namespace prefix)
    pub name: String,

    /// Namespace prefix of the element, if it has one
    pub prefix: Option<String>,

    /// Namespace of the element, if one has been assigned
    pub namespace: Option<String>,

    /// Attributes of the element
    ///
    /// Maps the attribute name to the attribute value
    pub attributes: HashMap<String, String>,

    /// Text content that belongs directly to this element
    pub text: Option<String>,

    /// CDATA content of the element
    pub cdata: Option<String>,

    /// Child elements, in document order
    pub children: Vec<XmlElement>,
}

impl XmlElement {
    /// Creates an element that has the given name and nothing else
    pub fn new(name: String) -> Self {
        Self {
            name,
            prefix: None,
            namespace: None,
            attributes: HashMap::new(),
            text: None,
            cdata: None,
            children: Vec::new(),
        }
    }

    /// Returns the full tag name of the element
    ///
    /// The result is "prefix:name" when the element has a namespace prefix
    /// and just the element name otherwise.
    pub fn tag_name(&self) -> String {
        if let Some(prefix) = &self.prefix {
            return format!("{}:{}", prefix, self.name);
        }

        self.name.clone()
    }

    /// Returns the text of the element followed by the text of its children
    ///
    /// The element's own text comes first, then the (recursively collected)
    /// text of every child in order. Leading and trailing whitespace of the
    /// combined string is removed.
    pub fn text(&self) -> String {
        let mut combined = String::from(self.text.as_deref().unwrap_or(""));
        combined.extend(self.children.iter().map(XmlElement::text));

        combined.trim().to_string()
    }

    /// Returns a copy of the value of the specified attribute
    pub fn get_attr(&self, name: &str) -> Option<String> {
        self.attributes.get(name).cloned()
    }

    /// Finds every element with the specified name in this subtree
    ///
    /// The search includes the element itself and visits the tree in
    /// document order (parent first, then its children from first to last).
    pub fn find_elements_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        SearchElementsByNameIter::new(self, name)
    }

    /// Finds the direct children that have the specified name
    pub fn find_children_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
        // Own the name so the returned iterator borrows only from `self`.
        let wanted = name.to_owned();
        self.children
            .iter()
            .filter(move |child| child.name == wanted)
    }

    /// Finds the direct children whose name is one of the specified names
    pub fn find_children_by_names(&self, names: &[&str]) -> impl Iterator<Item = &XmlElement> {
        // Own the names so the returned iterator borrows only from `self`.
        let wanted: Vec<String> = names.iter().map(|name| (*name).to_owned()).collect();
        self.children
            .iter()
            .filter(move |child| wanted.iter().any(|name| *name == child.name))
    }

    /// Iterates over the direct children of the element
    pub fn children(&self) -> impl Iterator<Item = &XmlElement> {
        self.children.iter()
    }
}

/// Depth-first (parent before children) search for elements with a given name
struct SearchElementsByNameIter<'a> {
    /// Elements still to be visited; the next one to visit is at the end
    pending: Vec<&'a XmlElement>,
    target_name: String,
}

impl<'a> SearchElementsByNameIter<'a> {
    fn new(root: &'a XmlElement, name: &str) -> Self {
        Self {
            pending: vec![root],
            target_name: name.to_string(),
        }
    }
}

impl<'a> Iterator for SearchElementsByNameIter<'a> {
    type Item = &'a XmlElement;

    fn next(&mut self) -> Option<Self::Item> {
        while let Some(element) = self.pending.pop() {
            // Push the children reversed so the first child is visited next.
            self.pending.extend(element.children.iter().rev());

            if element.name == self.target_name {
                return Some(element);
            }
        }

        None
    }
}
545
546/// XML parser used to parse XML content and build an XML element tree
547pub struct XmlReader {}
548
549#[allow(unused)]
550impl XmlReader {
551    /// Parses an XML from string and builds the root element
552    ///
553    /// This function takes an XML string, parses its content using the `quick_xml` library,
554    /// and builds an `XmlElement` tree representing the structure of the entire XML document.
555    ///
556    /// ## Parameters
557    /// - `content`: The XML string to be parsed
558    ///
559    /// ## Return
560    /// - `Ok(XmlElement)`: The root element of the XML element tree
561    /// - `Err(EpubError)`: An error occurred during parsing
562    pub fn parse(content: &str) -> Result<XmlElement, EpubError> {
563        if content.is_empty() {
564            return Err(EpubError::EmptyDataError);
565        }
566
567        // Create a XML reader with namespace support
568        let mut reader = NsReader::from_str(content);
569        reader.config_mut().trim_text(true);
570
571        let mut buf = Vec::new();
572        let mut stack = Vec::<XmlElement>::new();
573        let mut root = None;
574        let mut namespace_map = HashMap::new();
575
576        // Read XML events
577        loop {
578            match reader.read_event_into(&mut buf) {
579                // End of file, stop the loop
580                Ok(Event::Eof) => break,
581
582                // Start of an element
583                Ok(Event::Start(e)) => {
584                    let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
585                    let mut element = XmlElement::new(name);
586
587                    if let Some(prefix) = e.name().prefix() {
588                        element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
589                    }
590
591                    for attr in e.attributes().flatten() {
592                        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
593                        let attr_value = String::from_utf8_lossy(&attr.value).to_string();
594
595                        // Handle namespace attributes
596                        if attr_key.contains("xmlns") {
597                            let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
598                            if attr_keys.len() >= 2 {
599                                namespace_map.insert(attr_keys[1].to_string(), attr_value);
600                            } else {
601                                namespace_map.insert(attr_key, attr_value);
602                            }
603
604                            continue;
605                        }
606
607                        element.attributes.insert(attr_key, attr_value);
608                    }
609
610                    stack.push(element);
611                }
612
613                // End of an element
614                Ok(Event::End(_)) => {
615                    if let Some(element) = stack.pop() {
616                        // If the stack is empty,
617                        // the current element is the root element
618                        if stack.is_empty() {
619                            root = Some(element);
620                        } else if let Some(parent) = stack.last_mut() {
621                            // If the stack is not empty,
622                            // the current element is a child element of the last element in the stack
623                            parent.children.push(element);
624                        }
625                    }
626                }
627
628                // Self-closing element
629                Ok(Event::Empty(e)) => {
630                    let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
631                    let mut element = XmlElement::new(name);
632
633                    if let Some(prefix) = e.name().prefix() {
634                        element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
635                    }
636
637                    for attr in e.attributes().flatten() {
638                        let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
639                        let attr_value = String::from_utf8_lossy(&attr.value).to_string();
640
641                        if attr_key.contains("xmlns") {
642                            let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
643                            if attr_keys.len() >= 2 {
644                                namespace_map.insert(attr_keys[1].to_string(), attr_value);
645                            } else {
646                                namespace_map.insert(attr_key, attr_value);
647                            }
648
649                            continue;
650                        }
651
652                        element.attributes.insert(attr_key, attr_value);
653                    }
654
655                    // We can almost certainly assert that a self-closing element cannot be
656                    // the root node of an XML file, so this will definitely be executed.
657                    if let Some(parent) = stack.last_mut() {
658                        parent.children.push(element);
659                    }
660                }
661
662                // Text node
663                Ok(Event::Text(e)) => {
664                    if let Some(element) = stack.last_mut() {
665                        let text = String::from_utf8_lossy(e.as_ref()).to_string();
666                        if !text.trim().is_empty() {
667                            element.text = Some(text);
668                        }
669                    }
670                }
671
672                // CDATA node
673                Ok(Event::CData(e)) => {
674                    if let Some(element) = stack.last_mut() {
675                        element.cdata = Some(String::from_utf8_lossy(e.as_ref()).to_string());
676                    }
677                }
678
679                Err(err) => return Err(err.into()),
680
681                // Ignore the following events (elements):
682                // Comment, PI, Declaration, Doctype, GeneralRef
683                _ => continue,
684            }
685        }
686
687        if let Some(element) = root.as_mut() {
688            Self::assign_namespace(element, &namespace_map);
689        }
690
691        // TODO: handle this error with a proper error
692        root.ok_or(EpubError::EmptyDataError)
693    }
694
695    /// Parse XML from bytes and builds the root element
696    pub fn parse_bytes(bytes: Vec<u8>) -> Result<XmlElement, EpubError> {
697        let content = bytes.decode()?;
698        Self::parse(&content)
699    }
700
701    /// Assign namespace to element recursively
702    ///
703    /// ## Parameters
704    /// - `element`: The element to assign namespace
705    /// - `namespace_map`: The prefix-namespace map
706    fn assign_namespace(element: &mut XmlElement, namespace_map: &HashMap<String, String>) {
707        if let Some(prefix) = &element.prefix {
708            if let Some(namespace) = namespace_map.get(prefix) {
709                element.namespace = Some(namespace.clone());
710            }
711        } else if let Some(namespace) = namespace_map.get("xmlns") {
712            element.namespace = Some(namespace.clone());
713        }
714
715        for chiled in element.children.iter_mut() {
716            Self::assign_namespace(chiled, namespace_map);
717        }
718    }
719}
720
#[cfg(test)]
mod tests {
    use crate::{
        error::EpubError,
        utils::{DecodeBytes, NormalizeWhitespace},
    };

    /// An empty buffer is rejected
    #[test]
    fn test_decode_empty_data() {
        let data: Vec<u8> = Vec::new();
        assert_eq!(data.decode().unwrap_err(), EpubError::EmptyDataError);
    }

    /// A buffer shorter than 4 bytes is rejected
    #[test]
    fn test_decode_short_data() {
        let data: Vec<u8> = vec![0xEF, 0xBB];
        assert_eq!(data.decode().unwrap_err(), EpubError::EmptyDataError);
    }

    /// UTF-8 text that starts with a BOM is decoded without the BOM
    #[test]
    fn test_decode_utf8_with_bom() {
        let mut data: Vec<u8> = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(b"Hello");
        assert_eq!(data.decode().unwrap(), "Hello");
    }

    /// UTF-16 BE text that starts with a BOM is decoded
    #[test]
    fn test_decode_utf16_be_with_bom() {
        let mut data: Vec<u8> = vec![0xFE, 0xFF];
        data.extend("Hello".encode_utf16().flat_map(|unit| unit.to_be_bytes()));
        assert_eq!(data.decode().unwrap(), "Hello");
    }

    /// UTF-16 LE text that starts with a BOM is decoded
    #[test]
    fn test_decode_utf16_le_with_bom() {
        let mut data: Vec<u8> = vec![0xFF, 0xFE];
        data.extend("Hello".encode_utf16().flat_map(|unit| unit.to_le_bytes()));
        assert_eq!(data.decode().unwrap(), "Hello");
    }

    /// Ordinary UTF-8 text (without BOM) is decoded as is
    #[test]
    fn test_decode_plain_utf8() {
        let data = b"Hello, World!".to_vec();
        assert_eq!(data.decode().unwrap(), "Hello, World!");
    }

    /// Mixed whitespace is normalized for both `&str` and `String`
    #[test]
    fn test_normalize_whitespace_trait() {
        let raw = "  Hello,\tWorld!\n\nRust  ";
        let expected = "Hello, World! Rust";

        // Test for &str
        assert_eq!(raw.normalize_whitespace(), expected);

        // Test for String
        assert_eq!(String::from(raw).normalize_whitespace(), expected);
    }
}