ddex_parser/
utf8_utils.rs1use crate::error::ParseError;
4use quick_xml::events::BytesText;
5
6#[allow(dead_code)]
8pub fn process_text_content(raw_bytes: &[u8]) -> Result<String, ParseError> {
9 String::from_utf8(raw_bytes.to_vec()).map_err(|e| ParseError::InvalidUtf8 {
10 message: format!("UTF-8 decoding error at position 0: {}", e),
11 })
12}
13
/// Decodes `raw_bytes` as UTF-8 without ever failing.
///
/// Invalid byte sequences are replaced with U+FFFD (the Unicode replacement
/// character); valid input is returned unchanged as an owned `String`.
#[allow(dead_code)]
pub fn process_text_content_lossy(raw_bytes: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(raw_bytes);
    match decoded {
        // Input was already valid: copy the borrowed text.
        std::borrow::Cow::Borrowed(valid) => valid.to_owned(),
        // Input needed repair: the decoder already built an owned string.
        std::borrow::Cow::Owned(repaired) => repaired,
    }
}
19
20pub fn decode_utf8_at_position(bytes: &[u8], position: usize) -> Result<String, ParseError> {
22 std::str::from_utf8(bytes)
23 .map(|s| s.to_string())
24 .map_err(|e| ParseError::InvalidUtf8 {
25 message: format!("UTF-8 decoding error at position {}: {}", position, e),
26 })
27}
28
29#[allow(dead_code)]
31pub fn handle_text_node(event: &BytesText, position: usize) -> Result<String, ParseError> {
32 let unescaped = event.unescape().map_err(|e| {
33 ParseError::SimpleXmlError(format!("Unescape error at {}: {}", position, e))
34 })?;
35
36 process_text_content(unescaped.as_bytes())
37}
38
39#[allow(dead_code)]
41pub fn decode_attribute_name(bytes: &[u8], position: usize) -> Result<String, ParseError> {
42 decode_utf8_at_position(bytes, position)
43}
44
45#[allow(dead_code)]
47pub fn decode_attribute_value(bytes: &[u8], position: usize) -> Result<String, ParseError> {
48 let utf8_str = std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
50 message: format!("UTF-8 decoding error at position {}: {}", position, e),
51 })?;
52
53 quick_xml::escape::unescape(utf8_str)
55 .map(|cow| cow.into_owned())
56 .map_err(|e| ParseError::SimpleXmlError(format!("Attribute unescape error: {}", e)))
57}
58
59pub fn validate_utf8(bytes: &[u8]) -> Result<&str, ParseError> {
61 std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
62 message: format!("UTF-8 validation error: {}", e),
63 })
64}
65
66pub fn validate_utf8_string(text: &str) -> Result<(), ParseError> {
68 for (pos, ch) in text.char_indices() {
71 if ch == '\u{FFFD}' {
72 return Err(ParseError::InvalidUtf8 {
74 message: format!("Found Unicode replacement character at position {} indicating invalid UTF-8", pos),
75 });
76 }
77
78 if ch.is_control() && ch != '\t' && ch != '\n' && ch != '\r' {
80 return Err(ParseError::InvalidUtf8 {
82 message: format!("Found invalid control character at position {}: U+{:04X}", pos, ch as u32),
83 });
84 }
85 }
86 Ok(())
87}
88
#[cfg(test)]
mod tests {
    use super::*;

    /// Valid multi-byte input must decode to exactly the same text.
    #[test]
    fn test_valid_utf8() {
        // Written with escapes (U+4E16 U+754C and U+1F3B5) so the literal stays
        // correct even if the source file is re-encoded; it covers 3-byte and
        // 4-byte UTF-8 sequences.
        let expected = "Hello, \u{4E16}\u{754C}! \u{1F3B5}";
        assert_eq!(process_text_content(expected.as_bytes()).unwrap(), expected);
    }

    /// Bytes that can never appear in UTF-8 must be rejected by strict decoding.
    #[test]
    fn test_invalid_utf8() {
        let invalid = vec![0xFF, 0xFE, 0xFD];
        assert!(process_text_content(&invalid).is_err());
    }

    /// Lossy decoding keeps the valid prefix and replaces each invalid byte.
    #[test]
    fn test_lossy_conversion() {
        let mixed = vec![72, 101, 108, 108, 111, 0xFF, 0xFE];
        let result = process_text_content_lossy(&mixed);
        assert!(result.starts_with("Hello"));
        // Each of the two invalid trailing bytes becomes one U+FFFD.
        assert_eq!(result, "Hello\u{FFFD}\u{FFFD}");
    }
}