ddex_parser/
utf8_utils.rs

1//! UTF-8 handling utilities for safe text processing
2
3use crate::error::ParseError;
4use quick_xml::events::BytesText;
5
6/// Process text content from raw bytes, ensuring valid UTF-8
7#[allow(dead_code)]
8pub fn process_text_content(raw_bytes: &[u8]) -> Result<String, ParseError> {
9    String::from_utf8(raw_bytes.to_vec()).map_err(|e| ParseError::InvalidUtf8 {
10        message: format!("UTF-8 decoding error at position 0: {}", e),
11    })
12}
13
/// Decodes `raw_bytes` into an owned `String` without ever failing.
///
/// Every malformed UTF-8 sequence is replaced by U+FFFD (the Unicode
/// replacement character); well-formed input is copied through unchanged.
#[allow(dead_code)]
pub fn process_text_content_lossy(raw_bytes: &[u8]) -> String {
    match String::from_utf8_lossy(raw_bytes) {
        // Input was already valid: copy the borrowed slice.
        std::borrow::Cow::Borrowed(valid) => valid.to_owned(),
        // Input needed repair: the lossy conversion already built the string.
        std::borrow::Cow::Owned(repaired) => repaired,
    }
}
19
20/// Decode UTF-8 at a specific position with error reporting
21pub fn decode_utf8_at_position(bytes: &[u8], position: usize) -> Result<String, ParseError> {
22    std::str::from_utf8(bytes)
23        .map(|s| s.to_string())
24        .map_err(|e| ParseError::InvalidUtf8 {
25            message: format!("UTF-8 decoding error at position {}: {}", position, e),
26        })
27}
28
29/// Handle text node from XML event
30#[allow(dead_code)]
31pub fn handle_text_node(event: &BytesText, position: usize) -> Result<String, ParseError> {
32    let unescaped = event.unescape().map_err(|e| {
33        ParseError::SimpleXmlError(format!("Unescape error at {}: {}", position, e))
34    })?;
35
36    process_text_content(unescaped.as_bytes())
37}
38
39/// Decode attribute name ensuring valid UTF-8
40#[allow(dead_code)]
41pub fn decode_attribute_name(bytes: &[u8], position: usize) -> Result<String, ParseError> {
42    decode_utf8_at_position(bytes, position)
43}
44
45/// Decode attribute value with unescaping
46#[allow(dead_code)]
47pub fn decode_attribute_value(bytes: &[u8], position: usize) -> Result<String, ParseError> {
48    // First decode UTF-8
49    let utf8_str = std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
50        message: format!("UTF-8 decoding error at position {}: {}", position, e),
51    })?;
52
53    // Then unescape XML entities
54    quick_xml::escape::unescape(utf8_str)
55        .map(|cow| cow.into_owned())
56        .map_err(|e| ParseError::SimpleXmlError(format!("Attribute unescape error: {}", e)))
57}
58
59/// Validate UTF-8 string without copying
60pub fn validate_utf8(bytes: &[u8]) -> Result<&str, ParseError> {
61    std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
62        message: format!("UTF-8 validation error: {}", e),
63    })
64}
65
66/// Validate that a string contains only valid UTF-8 characters
67pub fn validate_utf8_string(text: &str) -> Result<(), ParseError> {
68    // Check if the string is valid UTF-8 (this should always pass for &str)
69    // But we also check for any invalid Unicode scalar values
70    for (pos, ch) in text.char_indices() {
71        if ch == '\u{FFFD}' {
72            // Replacement character indicates invalid UTF-8 was present
73            return Err(ParseError::InvalidUtf8 {
74                message: format!("Found Unicode replacement character at position {} indicating invalid UTF-8", pos),
75            });
76        }
77
78        // Check for other problematic characters that might indicate encoding issues
79        if ch.is_control() && ch != '\t' && ch != '\n' && ch != '\r' {
80            // Allow common whitespace control characters but reject others
81            return Err(ParseError::InvalidUtf8 {
82                message: format!("Found invalid control character at position {}: U+{:04X}", pos, ch as u32),
83            });
84        }
85    }
86    Ok(())
87}
88
#[cfg(test)]
mod tests {
    use super::*;

    /// Multi-byte text (CJK plus an emoji outside the BMP) round-trips unchanged.
    #[test]
    fn test_valid_utf8() {
        let text = "Hello, 世界! 🎵";
        assert_eq!(process_text_content(text.as_bytes()).unwrap(), text);
    }

    /// Bytes that can never appear in UTF-8 are rejected by the strict decoder.
    #[test]
    fn test_invalid_utf8() {
        let invalid: [u8; 3] = [0xFF, 0xFE, 0xFD];
        assert!(process_text_content(&invalid).is_err());
    }

    /// The lossy decoder keeps the valid prefix and substitutes U+FFFD for
    /// each invalid byte.
    #[test]
    fn test_lossy_conversion() {
        let mixed: [u8; 7] = [72, 101, 108, 108, 111, 0xFF, 0xFE];
        let result = process_text_content_lossy(&mixed);
        assert_eq!(result, "Hello\u{FFFD}\u{FFFD}");
    }

    /// The caller-supplied position is echoed in the decoding error.
    #[test]
    fn test_decode_reports_position() {
        let invalid: [u8; 1] = [0xFF];
        match decode_utf8_at_position(&invalid, 42) {
            Err(ParseError::InvalidUtf8 { message }) => {
                assert!(message.contains("position 42"));
            }
            other => panic!("expected InvalidUtf8, got {:?}", other),
        }
    }

    /// Common whitespace controls pass; U+FFFD and other controls do not.
    #[test]
    fn test_validate_utf8_string() {
        assert!(validate_utf8_string("line one\n\tline two\r\n").is_ok());
        assert!(validate_utf8_string("bad \u{FFFD} char").is_err());
        assert!(validate_utf8_string("nul\u{0}").is_err());
    }
}