use crate::error::ParseError;
use quick_xml::events::BytesText;
/// Decodes `raw_bytes` as UTF-8 and returns the text as an owned `String`.
///
/// # Errors
///
/// Returns [`ParseError::InvalidUtf8`] when the bytes are not valid UTF-8.
/// The message always says "position 0" because this function is not given
/// a position by its caller.
#[allow(dead_code)]
pub fn process_text_content(raw_bytes: &[u8]) -> Result<String, ParseError> {
    match std::str::from_utf8(raw_bytes) {
        Ok(text) => Ok(text.to_owned()),
        Err(cause) => Err(ParseError::InvalidUtf8 {
            message: format!("UTF-8 decoding error at position 0: {}", cause),
        }),
    }
}
/// Converts `raw_bytes` to a `String` without failing.
///
/// Invalid UTF-8 sequences are replaced with U+FFFD by
/// `String::from_utf8_lossy`; valid input is copied unchanged.
#[allow(dead_code)]
pub fn process_text_content_lossy(raw_bytes: &[u8]) -> String {
    match String::from_utf8_lossy(raw_bytes) {
        // Input was already valid: copy the borrowed text.
        std::borrow::Cow::Borrowed(valid) => valid.to_owned(),
        // Input needed repair: the replacement string is already owned.
        std::borrow::Cow::Owned(repaired) => repaired,
    }
}
/// Decodes `bytes` as UTF-8 into an owned `String`.
///
/// `position` is not used for decoding; it is only embedded in the error
/// message so the caller can tell where the failure occurred.
///
/// # Errors
///
/// Returns [`ParseError::InvalidUtf8`] when `bytes` is not valid UTF-8.
pub fn decode_utf8_at_position(bytes: &[u8], position: usize) -> Result<String, ParseError> {
    match std::str::from_utf8(bytes) {
        Ok(text) => Ok(text.to_string()),
        Err(cause) => Err(ParseError::InvalidUtf8 {
            message: format!("UTF-8 decoding error at position {}: {}", position, cause),
        }),
    }
}
/// Unescapes the content of an XML text event and returns it as an owned `String`.
///
/// `position` is only used to label the error message.
///
/// # Errors
///
/// Returns [`ParseError::SimpleXmlError`] when `event.unescape()` fails.
#[allow(dead_code)]
pub fn handle_text_node(event: &BytesText, position: usize) -> Result<String, ParseError> {
    let unescaped = event.unescape().map_err(|e| {
        ParseError::SimpleXmlError(format!("Unescape error at {}: {}", position, e))
    })?;
    // The unescaped value is already a `str`, i.e. valid UTF-8, so take ownership
    // directly instead of re-validating its bytes and copying them a second time.
    Ok(unescaped.into_owned())
}
/// Decodes the raw bytes of an attribute name as UTF-8.
///
/// This forwards to [`decode_utf8_at_position`]; no unescaping is applied.
/// `position` is passed through so it appears in the error message.
///
/// # Errors
///
/// Propagates the error returned by [`decode_utf8_at_position`].
#[allow(dead_code)]
pub fn decode_attribute_name(bytes: &[u8], position: usize) -> Result<String, ParseError> {
    let name = decode_utf8_at_position(bytes, position)?;
    Ok(name)
}
/// Decodes the raw bytes of an attribute value as UTF-8 and unescapes it
/// with `quick_xml::escape::unescape`.
///
/// `position` is only used to label error messages.
///
/// # Errors
///
/// * [`ParseError::InvalidUtf8`] when `bytes` is not valid UTF-8.
/// * [`ParseError::SimpleXmlError`] when unescaping fails.
#[allow(dead_code)]
pub fn decode_attribute_value(bytes: &[u8], position: usize) -> Result<String, ParseError> {
    // Validate on the borrowed slice first so nothing is allocated for invalid input.
    let utf8_str = std::str::from_utf8(bytes).map_err(|e| ParseError::InvalidUtf8 {
        message: format!("UTF-8 decoding error at position {}: {}", position, e),
    })?;
    quick_xml::escape::unescape(utf8_str)
        .map(|text| text.into_owned())
        .map_err(|e| {
            // Report the position here too, matching the UTF-8 error above and the
            // unescape error produced by `handle_text_node`.
            ParseError::SimpleXmlError(format!(
                "Attribute unescape error at {}: {}",
                position, e
            ))
        })
}
/// Checks that `bytes` is valid UTF-8 and returns it as a borrowed `&str`.
///
/// No data is copied; the returned slice borrows from `bytes`.
///
/// # Errors
///
/// Returns [`ParseError::InvalidUtf8`] when `bytes` is not valid UTF-8.
pub fn validate_utf8(bytes: &[u8]) -> Result<&str, ParseError> {
    let text = std::str::from_utf8(bytes).map_err(|cause| {
        let message = format!("UTF-8 validation error: {}", cause);
        ParseError::InvalidUtf8 { message }
    })?;
    Ok(text)
}
/// Scans `text` for characters this module treats as invalid content.
///
/// Rejected characters are U+FFFD (the Unicode replacement character) and
/// any control character other than tab, line feed and carriage return.
/// The scan stops at the first offending character; reported positions are
/// byte offsets into `text`, as produced by `char_indices`.
///
/// # Errors
///
/// Returns [`ParseError::InvalidUtf8`] describing the first rejected character.
pub fn validate_utf8_string(text: &str) -> Result<(), ParseError> {
    text.char_indices().try_for_each(|(offset, c)| match c {
        '\u{FFFD}' => Err(ParseError::InvalidUtf8 {
            message: format!(
                "Found Unicode replacement character at position {} indicating invalid UTF-8",
                offset
            ),
        }),
        // The only control characters that are let through.
        '\t' | '\n' | '\r' => Ok(()),
        other if other.is_control() => Err(ParseError::InvalidUtf8 {
            message: format!(
                "Found invalid control character at position {}: U+{:04X}",
                offset,
                u32::from(other)
            ),
        }),
        _ => Ok(()),
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Text containing 3-byte and 4-byte UTF-8 sequences must decode unchanged.
    #[test]
    fn test_valid_utf8() {
        // Non-ASCII characters are written as escapes (two CJK characters and
        // U+1F3B5) so the literal cannot be damaged by re-encoding this file.
        let expected = "Hello, \u{4E16}\u{754C}! \u{1F3B5}";
        assert_eq!(process_text_content(expected.as_bytes()).unwrap(), expected);
    }

    /// Bytes that can never appear in UTF-8 must be rejected by the strict decoder.
    #[test]
    fn test_invalid_utf8() {
        let invalid: [u8; 3] = [0xFF, 0xFE, 0xFD];
        assert!(process_text_content(&invalid).is_err());
    }

    /// The lossy decoder keeps the valid prefix and replaces each invalid byte with U+FFFD.
    #[test]
    fn test_lossy_conversion() {
        // "Hello" followed by two bytes that are invalid in UTF-8.
        let mixed: [u8; 7] = [72, 101, 108, 108, 111, 0xFF, 0xFE];
        let result = process_text_content_lossy(&mixed);
        assert!(result.starts_with("Hello"));
        assert_eq!(result, "Hello\u{FFFD}\u{FFFD}");
    }
}