ddex_parser/
utf8_utils.rs

1//! UTF-8 handling utilities for safe text processing
2
3use crate::error::ParseError;
4use quick_xml::events::BytesText;
5
6/// Process text content from raw bytes, ensuring valid UTF-8
7#[allow(dead_code)]
8pub fn process_text_content(raw_bytes: &[u8]) -> Result<String, ParseError> {
9    String::from_utf8(raw_bytes.to_vec()).map_err(|e| ParseError::InvalidUtf8 {
10        position: 0,
11        error: e.to_string(),
12    })
13}
14
/// Convert a raw byte slice into an owned `String`, never failing.
///
/// Any byte sequence that is not valid UTF-8 is replaced with U+FFFD
/// (the Unicode replacement character) by `String::from_utf8_lossy`.
#[allow(dead_code)]
pub fn process_text_content_lossy(raw_bytes: &[u8]) -> String {
    match String::from_utf8_lossy(raw_bytes) {
        // Input was already valid: copy the borrowed text.
        std::borrow::Cow::Borrowed(valid) => valid.to_owned(),
        // Input needed repair: the repaired buffer is already owned.
        std::borrow::Cow::Owned(repaired) => repaired,
    }
}
20
21/// Decode UTF-8 at a specific position with error reporting
22pub fn decode_utf8_at_position(bytes: &[u8], position: usize) -> Result<String, ParseError> {
23    std::str::from_utf8(bytes)
24        .map(|s| s.to_string())
25        .map_err(|e| ParseError::InvalidUtf8 {
26            position,
27            error: e.to_string(),
28        })
29}
30
31/// Handle text node from XML event
32#[allow(dead_code)]
33pub fn handle_text_node(event: &BytesText, position: usize) -> Result<String, ParseError> {
34    let unescaped = event.unescape().map_err(|e| {
35        ParseError::SimpleXmlError(format!("Unescape error at {}: {}", position, e))
36    })?;
37
38    process_text_content(unescaped.as_bytes())
39}
40
41/// Decode attribute name ensuring valid UTF-8
42#[allow(dead_code)]
43pub fn decode_attribute_name(bytes: &[u8], position: usize) -> Result<String, ParseError> {
44    decode_utf8_at_position(bytes, position)
45}
46
47/// Decode attribute value with unescaping
48#[allow(dead_code)]
49pub fn decode_attribute_value(bytes: &[u8], position: usize) -> Result<String, ParseError> {
50    // First decode UTF-8
51    let utf8_str = std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
52        position,
53        error: e.to_string(),
54    })?;
55
56    // Then unescape XML entities
57    quick_xml::escape::unescape(utf8_str)
58        .map(|cow| cow.into_owned())
59        .map_err(|e| ParseError::SimpleXmlError(format!("Attribute unescape error: {}", e)))
60}
61
62/// Validate UTF-8 string without copying
63pub fn validate_utf8(bytes: &[u8]) -> Result<&str, ParseError> {
64    std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
65        position: 0,
66        error: e.to_string(),
67    })
68}
69
70/// Validate that a string contains only valid UTF-8 characters
71pub fn validate_utf8_string(text: &str) -> Result<(), ParseError> {
72    // Check if the string is valid UTF-8 (this should always pass for &str)
73    // But we also check for any invalid Unicode scalar values
74    for (pos, ch) in text.char_indices() {
75        if ch == '\u{FFFD}' {
76            // Replacement character indicates invalid UTF-8 was present
77            return Err(ParseError::InvalidUtf8 {
78                position: pos,
79                error: "Found Unicode replacement character indicating invalid UTF-8".to_string(),
80            });
81        }
82
83        // Check for other problematic characters that might indicate encoding issues
84        if ch.is_control() && ch != '\t' && ch != '\n' && ch != '\r' {
85            // Allow common whitespace control characters but reject others
86            return Err(ParseError::InvalidUtf8 {
87                position: pos,
88                error: format!("Found invalid control character: U+{:04X}", ch as u32),
89            });
90        }
91    }
92    Ok(())
93}
94
#[cfg(test)]
mod tests {
    use super::*;

    // Multi-byte sample spelled with Unicode escapes so the test source stays
    // correct even if the file is re-encoded: two 3-byte CJK characters and
    // one 4-byte emoji.
    const MULTIBYTE_SAMPLE: &str = "Hello, \u{4E16}\u{754C}! \u{1F3B5}";

    #[test]
    fn test_valid_utf8() {
        let text = MULTIBYTE_SAMPLE.as_bytes();
        assert_eq!(process_text_content(text).unwrap(), MULTIBYTE_SAMPLE);
    }

    #[test]
    fn test_invalid_utf8() {
        let invalid: [u8; 3] = [0xFF, 0xFE, 0xFD];
        assert!(process_text_content(&invalid).is_err());
    }

    #[test]
    fn test_lossy_conversion() {
        let mixed: [u8; 7] = [72, 101, 108, 108, 111, 0xFF, 0xFE];
        let result = process_text_content_lossy(&mixed);
        // Each invalid byte is replaced by exactly one U+FFFD.
        assert_eq!(result, "Hello\u{FFFD}\u{FFFD}");
    }

    #[test]
    fn test_decode_reports_caller_position() {
        // 0xC3 starts a two-byte sequence that is never completed.
        match decode_utf8_at_position(&[0xC3], 42) {
            Err(ParseError::InvalidUtf8 { position, .. }) => assert_eq!(position, 42),
            _ => panic!("expected an InvalidUtf8 error"),
        }
    }

    #[test]
    fn test_validate_utf8_string() {
        assert!(validate_utf8_string("tab\tnewline\nreturn\r").is_ok());
        assert!(validate_utf8_string(MULTIBYTE_SAMPLE).is_ok());
        assert!(validate_utf8_string("bad\u{FFFD}").is_err());
        assert!(validate_utf8_string("bell\u{0007}").is_err());
    }
}
117}