lib_epub/utils.rs
1use std::{
2 cmp,
3 collections::HashMap,
4 io::{Read, Seek},
5 path::PathBuf,
6};
7
8#[cfg(feature = "builder")]
9use chrono::Local;
10use quick_xml::{NsReader, events::Event};
11use sha1::{Digest, Sha1};
12use zip::{CompressionMethod, ZipArchive};
13
14use crate::error::EpubError;
15
16#[cfg(feature = "builder")]
/// Local names of the elements this crate treats as belonging to the
/// Dublin Core (`dc`) namespace, listed in alphabetical order.
///
/// The list is built once, on first access.
pub static ELEMENT_IN_DC_NAMESPACE: std::sync::LazyLock<Vec<&str>> =
    std::sync::LazyLock::new(|| {
        const DC_ELEMENTS: [&str; 15] = [
            "contributor",
            "coverage",
            "creator",
            "date",
            "description",
            "format",
            "identifier",
            "language",
            "publisher",
            "relation",
            "rights",
            "source",
            "subject",
            "title",
            "type",
        ];

        DC_ELEMENTS.to_vec()
    });
37
#[cfg(feature = "builder")]
/// Formats the current local time as a string.
///
/// The pattern is `%Y-%m-%dT%H-%M-%S.%fU%z`: date, a literal `T`, the time
/// with `-` (not `:`) between its fields, a `.` followed by the sub-second
/// part, a literal `U`, and finally the UTC offset.
pub fn local_time() -> String {
    let now = Local::now();
    let formatted = now.format("%Y-%m-%dT%H-%M-%S.%fU%z");
    formatted.to_string()
}
43
44/// Extracts the contents of a specified file from a ZIP archive
45///
46/// This function reads the raw byte data of a specified file from an EPUB file (which
47/// is essentially a ZIP archive). This is a fundamental utility function for handling
48/// files within an EPUB (such as OPF, NCX, container files, etc.).
49///
50/// ## Parameters
51/// - `zip_file`: A mutable reference to a ZIP archive object
52/// - `file_name`: The path to the file to extract (relative to the ZIP archive root directory)
53///
54/// ## Return
55/// - `Ok(Vec<u8>)`: Returns a byte vector containing the file data
56/// if the file content was successfully read
57/// - `Err(EpubError)`: The file does not exist or an error occurred during the read operation
58///
59/// ## Notes
60/// - The returned data is raw bytes; the caller needs to perform
61/// appropriate decoding based on the file type.
62/// - For text files, further decoding using the `DecodeBytes` trait is usually required.
63pub fn get_file_in_zip_archive<R: Read + Seek>(
64 zip_file: &mut ZipArchive<R>,
65 file_name: &str,
66) -> Result<Vec<u8>, EpubError> {
67 let mut buffer = Vec::<u8>::new();
68 match zip_file.by_name(file_name) {
69 Ok(mut file) => {
70 let _ = file.read_to_end(&mut buffer).map_err(EpubError::from)?;
71 Ok(buffer)
72 }
73 Err(err) => Err(EpubError::from(err)),
74 }
75}
76
77/// Checks if the compression method of all entries in the EPUB file
78/// conforms to the specification requirements.
79///
80/// According to the OCF (Open Container Format) specification, EPUB files
81/// can only use either Stored (uncompressed) or Deflated (deflate compression).
82/// If any other compression method is found, an error will be returned.
83///
84/// ## Parameters
85/// - `zip_archive`: The ZIP archive to check.
86///
87/// ## Return
88/// - `Ok(())`: All files use the supported compression method
89/// - `Err(EpubError)`: Unsupported compression method found
90///
91/// ## Specification Reference
92/// According to the EPUB OCF 3.2 specification: "OCF ZIP containers
93/// MUST only use compression techniques that are supported
94/// by the ZIP format specification (ISO/IEC 21320-1)"
95/// Currently only Stored and Deflated methods are supported.
96pub fn compression_method_check<R: Read + Seek>(
97 zip_archive: &mut ZipArchive<R>,
98) -> Result<(), EpubError> {
99 for index in 0..zip_archive.len() {
100 let file = zip_archive.by_index(index)?;
101
102 match file.compression() {
103 CompressionMethod::Stored | CompressionMethod::Deflated => continue,
104 method => {
105 return Err(EpubError::UnusableCompressionMethod {
106 file: file.name().to_string(),
107 method: method.to_string(),
108 });
109 }
110 };
111 }
112
113 Ok(())
114}
115
/// Check if a relative link points outside the EPUB package scope
///
/// The link is split on every `"../"`: the number of occurrences is the
/// number of directory levels to climb from `epub_path/current_dir`, and the
/// text after the last occurrence is the remaining path. The climbed-to
/// directory must still lie under `epub_path`; if it does, the result is that
/// directory (relative to `epub_path`) joined with the remaining path by `/`.
///
/// ## Parameters
/// - `epub_path`: The root path of the EPUB file
/// - `current_dir`: The directory path where the current file is located
/// - `check_file`: The relative path to check
///
/// ## Return
/// - `Some(String)`: The resolved path relative to `epub_path`, if the link
///   stays within the EPUB package scope
/// - `None`: If the link leaves the EPUB package scope, or the resolved
///   directory is not valid UTF-8 and therefore cannot be returned as a `String`
///
/// ## Notes
/// - Every `"../"` is counted, wherever it appears in `check_file`; segments
///   that precede the last `"../"` are discarded.
pub fn check_realtive_link_leakage(
    epub_path: PathBuf,
    current_dir: PathBuf,
    check_file: &str,
) -> Option<String> {
    // `split` always yields at least one item, so `len() - 1` cannot underflow.
    let parts = check_file.split("../").collect::<Vec<&str>>();
    let folder_depth = parts.len() - 1;
    let remaining = parts.last().copied().unwrap_or("");

    // Navigate up the directory tree according to folder_depth
    let mut current_path = epub_path.join(current_dir);
    for _ in 0..folder_depth {
        if !current_path.pop() {
            // Nothing left to pop: the link tries to climb above the filesystem root.
            return None;
        }
    }

    // The directory we ended up in must still be below the EPUB root.
    // A non-UTF-8 directory yields `None` instead of panicking.
    let prefix_path = current_path.strip_prefix(&epub_path).ok()?.to_str()?;

    let path = if prefix_path.is_empty() {
        remaining.to_string()
    } else {
        format!("{}/{}", prefix_path, remaining)
    };
    Some(path)
}
165
166/// Removes leading slash from a path
167///
168/// This function removes the leading slash from a path if it exists.
169#[cfg(feature = "builder")]
pub fn remove_leading_slash<P: AsRef<std::path::Path>>(path: P) -> PathBuf {
    // `strip_prefix` only succeeds when the path starts at the root; a path
    // without that prefix falls through and is copied unchanged.
    let original = path.as_ref();
    original
        .strip_prefix("/")
        .unwrap_or(original)
        .to_path_buf()
}
177
178/// Encrypts the font file using the IDPF font obfuscation algorithm
179///
180/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
181/// with the publication's unique identifier. Due to the integrability of the XOR
182/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
183///
184/// ## Parameters
185/// - `data`: Original font data
186/// - `key`: The unique identifier of the EPUB publication
187///
188/// ## Return
189/// - `Vec<u8>`: Encrypted font data
190///
191/// ## Notes
192/// - This function applies to the IDPF font obfuscation algorithm
193/// (http://www.idpf.org/2008/embedding).
194/// - Only processes the first 1040 bytes of the font file; the rest remains unchanged.
195pub fn idpf_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
196 if data.is_empty() {
197 return Vec::new();
198 }
199
200 let hash = {
201 let mut hasher = Sha1::new();
202 hasher.update(key.as_bytes());
203 hasher.finalize()
204 };
205
206 let mut obfuscated_data = data.to_vec();
207 let limit = cmp::min(1040, data.len());
208
209 for (index, byte) in obfuscated_data.iter_mut().take(limit).enumerate() {
210 *byte ^= hash[index % hash.len()]
211 }
212
213 obfuscated_data
214}
215
216/// Decrypts a file encrypted using the IDPF obfuscation algorithm
217///
218/// The IDPF font obfuscation algorithm XORs the first 1040 bytes of the font file
219/// with the publication's unique identifier. Due to the integrability of the XOR
220/// operation (A XOR B XOR B = A), encryption and decryption use the same algorithm.
221///
222/// ## Parameters
223/// - `data`: Original font data
224/// - `key`: The unique identifier of the EPUB publication
225///
226/// ## Return
227/// - `Vec<u8>`: Decrypted font data
228pub fn idpf_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
229 idpf_font_encryption(data, key)
230}
231
/// Obfuscates font data with the Adobe font obfuscation algorithm
///
/// The first 1024 bytes of the data (or all of it, if shorter) are XOR-ed
/// with a repeating mask made of the first 16 bytes of `key`. Because XOR is
/// its own inverse (A XOR B XOR B = A), running the function a second time
/// with the same key restores the input.
///
/// ## Parameters
/// - `data`: Font data to process
/// - `key`: The key string; only its first 16 bytes are used
///
/// ## Return
/// - `Vec<u8>`: A copy of `data` with its leading bytes XOR-ed with the mask
///
/// ## Notes
/// - Applies to the Adobe font obfuscation algorithm
///   (http://ns.adobe.com/pdf/enc#RC).
/// - Bytes beyond the first 1024 are copied through untouched.
/// - A key shorter than 16 bytes no longer panics: the bytes that are
///   available are repeated instead. An empty key leaves the data unchanged.
pub fn adobe_font_encryption(data: &[u8], key: &str) -> Vec<u8> {
    let mut obfuscated_data = data.to_vec();

    // Never index past the end of a short key.
    let key_bytes = key.as_bytes();
    let mask = &key_bytes[..key_bytes.len().min(16)];
    if mask.is_empty() {
        return obfuscated_data;
    }

    for (byte, mask_byte) in obfuscated_data
        .iter_mut()
        .take(1024)
        .zip(mask.iter().cycle())
    {
        *byte ^= *mask_byte;
    }

    obfuscated_data
}
264
265/// Decrypts a file encrypted using the Adobe font obfuscation algorithm
266///
267/// The Adobe font obfuscation algorithm XORs the first 1024 bytes of the font file
268/// with a 16-byte key derived from the publication's unique identifier. Due to the
269/// integrability of the XOR operation (A XOR B XOR B = A), encryption and decryption
270/// use the same algorithm.
271///
272/// ## Parameters
273/// - `data`: Obfuscated font data
274/// - `key`: The unique identifier of the EPUB publication
275///
276/// ## Return
277/// - `Vec<u8>`: Deobfuscated font data
278pub fn adobe_font_dencryption(data: &[u8], key: &str) -> Vec<u8> {
279 adobe_font_encryption(data, key)
280}
281
282/// Provides functionality to decode byte data into strings
283///
284/// This trait is primarily used to decode raw byte data (such as
285/// text files read from EPUB files) into a suitable string representation.
286/// It supports automatic detection of multiple encoding formats,
287/// including UTF-8 (with or without BOM), UTF-16 BE, and UTF-16 LE.
288///
289/// ## Implementation
290/// Currently, this trait is implemented for the `Vec<u8>` type,
291/// primarily used for processing text content in EPUB files.
292///
293/// ## Notes
294/// - When attempting to parse a byte stream lacking a BOM (Byte Order Mark), the parsing
295/// results may be unreadable; caution should be exercised when using such streams.
296pub trait DecodeBytes {
297 fn decode(&self) -> Result<String, EpubError>;
298}
299
300impl DecodeBytes for Vec<u8> {
301 fn decode(&self) -> Result<String, EpubError> {
302 if self.is_empty() || self.len() < 4 {
303 return Err(EpubError::EmptyDataError);
304 }
305
306 match self.as_slice() {
307 // Check UTF-8 BOM (0xEF, 0xBB, 0xBF)
308 [0xEF, 0xBB, 0xBF, rest @ ..] => {
309 String::from_utf8(rest.to_vec()).map_err(EpubError::from)
310 }
311
312 // Check UTF-16 BE BOM (0xFE, 0xFF)
313 [0xFE, 0xFF, rest @ ..] => {
314 let utf16_units = rest
315 .chunks_exact(2)
316 .map(|b| u16::from_be_bytes([b[0], b[1]]))
317 .collect::<Vec<u16>>();
318
319 String::from_utf16(&utf16_units).map_err(EpubError::from)
320 }
321
322 // Check UTF-16 LE BOM (0xFF, 0xFE)
323 [0xFF, 0xFE, rest @ ..] => {
324 let utf16_units = rest
325 .chunks_exact(2)
326 .map(|b| u16::from_le_bytes([b[0], b[1]]))
327 .collect::<Vec<u16>>();
328
329 String::from_utf16(&utf16_units).map_err(EpubError::from)
330 }
331
332 // Try without BOM
333 // The analytical results for this branch are unpredictable,
334 // making it difficult to cover all possibilities when testing it.
335 _ => {
336 // try UTF-8 first
337 // if the byte stream is not valid UTF-8,
338 // it will be replaced with the replacement character (U+FFFD)
339 let lossless = String::from_utf8_lossy(self);
340 if !lossless.contains('\u{FFFD}') {
341 return Ok(lossless.into_owned());
342 }
343
344 if self.len() % 2 == 0 {
345 // try UTF-16 BE
346 if let Ok(str) = String::from_utf16(
347 &self
348 .chunks_exact(2)
349 .map(|b| u16::from_be_bytes([b[0], b[1]]))
350 .collect::<Vec<u16>>(),
351 ) {
352 return Ok(str);
353 }
354
355 // try UTF-16 LE
356 if let Ok(str) = String::from_utf16(
357 &self
358 .chunks_exact(2)
359 .map(|b| u16::from_le_bytes([b[0], b[1]]))
360 .collect::<Vec<u16>>(),
361 ) {
362 return Ok(str);
363 }
364 }
365
366 // Final fallback
367 Ok(String::from_utf8_lossy(self).to_string())
368 }
369 }
370 }
371}
372
/// Collapses whitespace in text
///
/// Every run of whitespace (spaces, tabs, newlines, ...) is reduced to one
/// space character, and whitespace at the start and end is dropped.
///
/// ## Implementation
/// The trait is implemented for `&str` and for `String`.
pub trait NormalizeWhitespace {
    /// Returns a new string with the whitespace of `self` normalized.
    fn normalize_whitespace(&self) -> String;
}
384
385impl NormalizeWhitespace for &str {
386 fn normalize_whitespace(&self) -> String {
387 let mut result = String::new();
388 let mut is_first = true;
389
390 for word in self.split_whitespace() {
391 if !is_first {
392 result.push(' ');
393 }
394 result.push_str(word);
395 is_first = false;
396 }
397
398 result
399 }
400}
401
402impl NormalizeWhitespace for String {
403 fn normalize_whitespace(&self) -> String {
404 self.as_str().normalize_whitespace()
405 }
406}
407
/// A single element of a parsed XML document, including its subtree
#[derive(Debug)]
pub struct XmlElement {
    /// Local name of the element, i.e. without any namespace prefix
    pub name: String,

    /// Namespace prefix, if the element has one
    pub prefix: Option<String>,

    /// Namespace the element belongs to, if known
    pub namespace: Option<String>,

    /// Attributes of the element, keyed by attribute name
    pub attributes: HashMap<String, String>,

    /// Text content directly inside the element
    pub text: Option<String>,

    /// CDATA content directly inside the element
    pub cdata: Option<String>,

    /// Child elements, in the order they were added
    pub children: Vec<XmlElement>,
}
434
435impl XmlElement {
436 /// Create a new element
437 pub fn new(name: String) -> Self {
438 Self {
439 name,
440 prefix: None,
441 namespace: None,
442 attributes: HashMap::new(),
443 text: None,
444 cdata: None,
445 children: Vec::new(),
446 }
447 }
448
449 /// Get the full tag name of the element
450 ///
451 /// If the element has a namespace prefix, return "prefix:name" format;
452 /// otherwise, return only the element name.
453 pub fn tag_name(&self) -> String {
454 match &self.prefix {
455 Some(prefix) => format!("{}:{}", prefix, self.name),
456 None => self.name.clone(),
457 }
458 }
459
460 /// Gets the text content of the element and all its child elements
461 ///
462 /// Collects the text content of the current element and the text content of
463 /// all its child elements, removing leading and trailing whitespace.
464 pub fn text(&self) -> String {
465 let mut result = String::new();
466
467 if let Some(text_value) = &self.text {
468 result.push_str(text_value);
469 }
470
471 for child in &self.children {
472 result.push_str(&child.text());
473 }
474
475 result.trim().to_string()
476 }
477
478 /// Returns the value of the specified attribute
479 pub fn get_attr(&self, name: &str) -> Option<String> {
480 self.attributes.get(name).cloned()
481 }
482
483 /// Find all elements with the specified name
484 pub fn find_elements_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
485 SearchElementsByNameIter::new(self, name)
486 }
487
488 /// Find all elements with the specified name among the child elements of the current element
489 pub fn find_children_by_name(&self, name: &str) -> impl Iterator<Item = &XmlElement> {
490 self.children.iter().filter(move |child| child.name == name)
491 }
492
493 /// Find all elements with the specified name list among the child elements of the current element
494 pub fn find_children_by_names(&self, names: &[&str]) -> impl Iterator<Item = &XmlElement> {
495 self.children
496 .iter()
497 .filter(move |child| names.contains(&child.name.as_str()))
498 }
499
500 /// Get children elements
501 pub fn children(&self) -> impl Iterator<Item = &XmlElement> {
502 self.children.iter()
503 }
504}
505
506struct SearchElementsByNameIter<'a> {
507 elements: Vec<&'a XmlElement>,
508 current_index: usize,
509 target_name: String,
510}
511
512impl<'a> SearchElementsByNameIter<'a> {
513 fn new(root: &'a XmlElement, name: &str) -> Self {
514 let mut elements = Vec::new();
515 Self::collect_elements(root, &mut elements);
516 Self {
517 elements,
518 current_index: 0,
519 target_name: name.to_string(),
520 }
521 }
522
523 fn collect_elements(element: &'a XmlElement, collection: &mut Vec<&'a XmlElement>) {
524 collection.push(element);
525 for child in &element.children {
526 Self::collect_elements(child, collection);
527 }
528 }
529}
530
531impl<'a> Iterator for SearchElementsByNameIter<'a> {
532 type Item = &'a XmlElement;
533
534 fn next(&mut self) -> Option<Self::Item> {
535 while self.current_index < self.elements.len() {
536 let element = self.elements[self.current_index];
537 self.current_index += 1;
538 if element.name == self.target_name {
539 return Some(element);
540 }
541 }
542 None
543 }
544}
545
/// Stateless XML parser that turns XML content into an `XmlElement` tree
///
/// The type carries no data; its functionality is exposed through
/// associated functions.
pub struct XmlReader {}
548
549#[allow(unused)]
550impl XmlReader {
551 /// Parses an XML from string and builds the root element
552 ///
553 /// This function takes an XML string, parses its content using the `quick_xml` library,
554 /// and builds an `XmlElement` tree representing the structure of the entire XML document.
555 ///
556 /// ## Parameters
557 /// - `content`: The XML string to be parsed
558 ///
559 /// ## Return
560 /// - `Ok(XmlElement)`: The root element of the XML element tree
561 /// - `Err(EpubError)`: An error occurred during parsing
562 pub fn parse(content: &str) -> Result<XmlElement, EpubError> {
563 if content.is_empty() {
564 return Err(EpubError::EmptyDataError);
565 }
566
567 // Create a XML reader with namespace support
568 let mut reader = NsReader::from_str(content);
569 reader.config_mut().trim_text(true);
570
571 let mut buf = Vec::new();
572 let mut stack = Vec::<XmlElement>::new();
573 let mut root = None;
574 let mut namespace_map = HashMap::new();
575
576 // Read XML events
577 loop {
578 match reader.read_event_into(&mut buf) {
579 // End of file, stop the loop
580 Ok(Event::Eof) => break,
581
582 // Start of an element
583 Ok(Event::Start(e)) => {
584 let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
585 let mut element = XmlElement::new(name);
586
587 if let Some(prefix) = e.name().prefix() {
588 element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
589 }
590
591 for attr in e.attributes().flatten() {
592 let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
593 let attr_value = String::from_utf8_lossy(&attr.value).to_string();
594
595 // Handle namespace attributes
596 if attr_key.contains("xmlns") {
597 let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
598 if attr_keys.len() >= 2 {
599 namespace_map.insert(attr_keys[1].to_string(), attr_value);
600 } else {
601 namespace_map.insert(attr_key, attr_value);
602 }
603
604 continue;
605 }
606
607 element.attributes.insert(attr_key, attr_value);
608 }
609
610 stack.push(element);
611 }
612
613 // End of an element
614 Ok(Event::End(_)) => {
615 if let Some(element) = stack.pop() {
616 // If the stack is empty,
617 // the current element is the root element
618 if stack.is_empty() {
619 root = Some(element);
620 } else if let Some(parent) = stack.last_mut() {
621 // If the stack is not empty,
622 // the current element is a child element of the last element in the stack
623 parent.children.push(element);
624 }
625 }
626 }
627
628 // Self-closing element
629 Ok(Event::Empty(e)) => {
630 let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
631 let mut element = XmlElement::new(name);
632
633 if let Some(prefix) = e.name().prefix() {
634 element.prefix = Some(String::from_utf8_lossy(prefix.as_ref()).to_string());
635 }
636
637 for attr in e.attributes().flatten() {
638 let attr_key = String::from_utf8_lossy(attr.key.as_ref()).to_string();
639 let attr_value = String::from_utf8_lossy(&attr.value).to_string();
640
641 if attr_key.contains("xmlns") {
642 let attr_keys = attr_key.split(":").collect::<Vec<&str>>();
643 if attr_keys.len() >= 2 {
644 namespace_map.insert(attr_keys[1].to_string(), attr_value);
645 } else {
646 namespace_map.insert(attr_key, attr_value);
647 }
648
649 continue;
650 }
651
652 element.attributes.insert(attr_key, attr_value);
653 }
654
655 // We can almost certainly assert that a self-closing element cannot be
656 // the root node of an XML file, so this will definitely be executed.
657 if let Some(parent) = stack.last_mut() {
658 parent.children.push(element);
659 }
660 }
661
662 // Text node
663 Ok(Event::Text(e)) => {
664 if let Some(element) = stack.last_mut() {
665 let text = String::from_utf8_lossy(e.as_ref()).to_string();
666 if !text.trim().is_empty() {
667 element.text = Some(text);
668 }
669 }
670 }
671
672 // CDATA node
673 Ok(Event::CData(e)) => {
674 if let Some(element) = stack.last_mut() {
675 element.cdata = Some(String::from_utf8_lossy(e.as_ref()).to_string());
676 }
677 }
678
679 Err(err) => return Err(err.into()),
680
681 // Ignore the following events (elements):
682 // Comment, PI, Declaration, Doctype, GeneralRef
683 _ => continue,
684 }
685 }
686
687 if let Some(element) = root.as_mut() {
688 Self::assign_namespace(element, &namespace_map);
689 }
690
691 // TODO: handle this error with a proper error
692 root.ok_or(EpubError::EmptyDataError)
693 }
694
695 /// Parse XML from bytes and builds the root element
696 pub fn parse_bytes(bytes: Vec<u8>) -> Result<XmlElement, EpubError> {
697 let content = bytes.decode()?;
698 Self::parse(&content)
699 }
700
701 /// Assign namespace to element recursively
702 ///
703 /// ## Parameters
704 /// - `element`: The element to assign namespace
705 /// - `namespace_map`: The prefix-namespace map
706 fn assign_namespace(element: &mut XmlElement, namespace_map: &HashMap<String, String>) {
707 if let Some(prefix) = &element.prefix {
708 if let Some(namespace) = namespace_map.get(prefix) {
709 element.namespace = Some(namespace.clone());
710 }
711 } else if let Some(namespace) = namespace_map.get("xmlns") {
712 element.namespace = Some(namespace.clone());
713 }
714
715 for chiled in element.children.iter_mut() {
716 Self::assign_namespace(chiled, namespace_map);
717 }
718 }
719}
720
#[cfg(test)]
mod tests {
    use crate::{
        error::EpubError,
        utils::{DecodeBytes, NormalizeWhitespace},
    };

    /// Empty input is rejected
    #[test]
    fn test_decode_empty_data() {
        let data: Vec<u8> = Vec::new();
        let outcome = data.decode();
        assert!(outcome.is_err());
        assert_eq!(outcome.unwrap_err(), EpubError::EmptyDataError);
    }

    /// Input shorter than 4 bytes is rejected
    #[test]
    fn test_decode_short_data() {
        let data: Vec<u8> = vec![0xEF, 0xBB];
        let outcome = data.decode();
        assert!(outcome.is_err());
        assert_eq!(outcome.unwrap_err(), EpubError::EmptyDataError);
    }

    /// UTF-8 text preceded by a BOM
    #[test]
    fn test_decode_utf8_with_bom() {
        let mut data: Vec<u8> = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(b"Hello");

        let outcome = data.decode();
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "Hello");
    }

    /// UTF-16 big-endian text preceded by a BOM
    #[test]
    fn test_decode_utf16_be_with_bom() {
        let mut data: Vec<u8> = vec![0xFE, 0xFF];
        for letter in *b"Hello" {
            data.extend_from_slice(&[0x00, letter]);
        }

        let outcome = data.decode();
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "Hello");
    }

    /// UTF-16 little-endian text preceded by a BOM
    #[test]
    fn test_decode_utf16_le_with_bom() {
        let mut data: Vec<u8> = vec![0xFF, 0xFE];
        for letter in *b"Hello" {
            data.extend_from_slice(&[letter, 0x00]);
        }

        let outcome = data.decode();
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "Hello");
    }

    /// Plain UTF-8 text without a BOM
    #[test]
    fn test_decode_plain_utf8() {
        let data: Vec<u8> = Vec::from(&b"Hello, World!"[..]);
        let outcome = data.decode();
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "Hello, World!");
    }

    /// Mixed whitespace is collapsed for both `&str` and `String`
    #[test]
    fn test_normalize_whitespace_trait() {
        let expected = "Hello, World! Rust";

        // &str
        let borrowed = "  Hello,\tWorld!\n\nRust  ";
        assert_eq!(borrowed.normalize_whitespace(), expected);

        // String
        let owned = String::from("  Hello,\tWorld!\n\nRust  ");
        assert_eq!(owned.normalize_whitespace(), expected);
    }
}