ddex_parser/
utf8_utils.rs1use crate::error::ParseError;
4use quick_xml::events::BytesText;
5
6#[allow(dead_code)]
8pub fn process_text_content(raw_bytes: &[u8]) -> Result<String, ParseError> {
9 String::from_utf8(raw_bytes.to_vec()).map_err(|e| ParseError::InvalidUtf8 {
10 position: 0,
11 error: e.to_string(),
12 })
13}
14
/// Converts raw bytes into a `String` without ever failing.
///
/// Invalid UTF-8 sequences are replaced with U+FFFD (the Unicode replacement
/// character), as done by [`String::from_utf8_lossy`].
#[allow(dead_code)]
pub fn process_text_content_lossy(raw_bytes: &[u8]) -> String {
    let decoded = String::from_utf8_lossy(raw_bytes);
    String::from(decoded)
}
20
21pub fn decode_utf8_at_position(bytes: &[u8], position: usize) -> Result<String, ParseError> {
23 std::str::from_utf8(bytes)
24 .map(|s| s.to_string())
25 .map_err(|e| ParseError::InvalidUtf8 {
26 position,
27 error: e.to_string(),
28 })
29}
30
31#[allow(dead_code)]
33pub fn handle_text_node(event: &BytesText, position: usize) -> Result<String, ParseError> {
34 let unescaped = event.unescape().map_err(|e| {
35 ParseError::SimpleXmlError(format!("Unescape error at {}: {}", position, e))
36 })?;
37
38 process_text_content(unescaped.as_bytes())
39}
40
41#[allow(dead_code)]
43pub fn decode_attribute_name(bytes: &[u8], position: usize) -> Result<String, ParseError> {
44 decode_utf8_at_position(bytes, position)
45}
46
47#[allow(dead_code)]
49pub fn decode_attribute_value(bytes: &[u8], position: usize) -> Result<String, ParseError> {
50 let utf8_str = std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
52 position,
53 error: e.to_string(),
54 })?;
55
56 quick_xml::escape::unescape(utf8_str)
58 .map(|cow| cow.into_owned())
59 .map_err(|e| ParseError::SimpleXmlError(format!("Attribute unescape error: {}", e)))
60}
61
62pub fn validate_utf8(bytes: &[u8]) -> Result<&str, ParseError> {
64 std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
65 position: 0,
66 error: e.to_string(),
67 })
68}
69
70pub fn validate_utf8_string(text: &str) -> Result<(), ParseError> {
72 for (pos, ch) in text.char_indices() {
75 if ch == '\u{FFFD}' {
76 return Err(ParseError::InvalidUtf8 {
78 position: pos,
79 error: "Found Unicode replacement character indicating invalid UTF-8".to_string(),
80 });
81 }
82
83 if ch.is_control() && ch != '\t' && ch != '\n' && ch != '\r' {
85 return Err(ParseError::InvalidUtf8 {
87 position: pos,
88 error: format!("Found invalid control character: U+{:04X}", ch as u32),
89 });
90 }
91 }
92 Ok(())
93}
94
#[cfg(test)]
mod tests {
    use super::*;

    /// Multi-byte input (three-byte CJK characters and a four-byte emoji) must
    /// round-trip through strict decoding unchanged.
    #[test]
    fn test_valid_utf8() {
        let expected = "Hello, 世界! 🎵";
        assert_eq!(process_text_content(expected.as_bytes()).unwrap(), expected);
    }

    /// Bytes that can never appear in UTF-8 must be reported as `InvalidUtf8`.
    #[test]
    fn test_invalid_utf8() {
        let invalid: [u8; 3] = [0xFF, 0xFE, 0xFD];
        assert!(matches!(
            process_text_content(&invalid),
            Err(ParseError::InvalidUtf8 { .. })
        ));
    }

    /// Lossy decoding keeps the valid prefix and substitutes one U+FFFD for
    /// each invalid byte.
    #[test]
    fn test_lossy_conversion() {
        let mixed: [u8; 7] = [72, 101, 108, 108, 111, 0xFF, 0xFE];
        assert_eq!(process_text_content_lossy(&mixed), "Hello\u{FFFD}\u{FFFD}");
    }

    /// The caller-supplied position must be carried into the error.
    #[test]
    fn test_decode_reports_position() {
        assert_eq!(decode_utf8_at_position(b"ok", 7).unwrap(), "ok");
        assert!(matches!(
            decode_utf8_at_position(&[0xFF], 7),
            Err(ParseError::InvalidUtf8 { position: 7, .. })
        ));
    }

    /// Tab, LF and CR are accepted; other control characters and U+FFFD are
    /// rejected at their byte offset.
    #[test]
    fn test_validate_utf8_string() {
        assert!(validate_utf8_string("line one\r\n\tline two").is_ok());
        assert!(matches!(
            validate_utf8_string("ab\u{0000}"),
            Err(ParseError::InvalidUtf8 { position: 2, .. })
        ));
        assert!(matches!(
            validate_utf8_string("a\u{FFFD}"),
            Err(ParseError::InvalidUtf8 { position: 1, .. })
        ));
    }
}