use crate::common::truncate_text;
use crate::files::llm_output_extraction::xsd_validation::{XsdErrorType, XsdValidationError};
/// Rejects text containing a character that XML 1.0 content may not hold.
///
/// Tab (`0x09`), line feed (`0x0A`) and carriage return (`0x0D`) are the only
/// code points below `0x20` that are accepted; `0xFFFE` and `0xFFFF` are
/// rejected as well.
///
/// # Errors
///
/// Returns the error built by `illegal_character_error` for the first
/// offending character, identified by its byte offset in `content`.
pub fn check_for_illegal_xml_characters(content: &str) -> Result<(), XsdValidationError> {
    for (offset, candidate) in content.char_indices() {
        // The surrogate range is listed for completeness; a Rust `char` can
        // never hold a surrogate, so that arm does not match in practice.
        let forbidden = matches!(
            u32::from(candidate),
            0x00..=0x08 | 0x0B | 0x0C | 0x0E..=0x1F | 0xD800..=0xDFFF | 0xFFFE | 0xFFFF
        );
        if forbidden {
            return Err(illegal_character_error(candidate, offset, content));
        }
    }
    Ok(())
}
/// Builds the [`XsdValidationError`] reported for an illegal XML character.
///
/// * `ch` - the offending character.
/// * `byte_index` - byte offset of `ch` within `content`.
/// * `content` - the full text being validated; a window of up to 50 bytes on
///   either side of `byte_index` is quoted in the suggestion.
///
/// The returned error has type `MalformedXml`, element path `"xml"`, and a
/// suggestion that differs for NUL versus every other illegal character.
fn illegal_character_error(ch: char, byte_index: usize, content: &str) -> XsdValidationError {
    let char_display = match ch {
        '\0' => "NUL (null byte)".to_string(),
        '\u{0001}'..='\u{001F}' => format!("control character 0x{:02X}", ch as u32),
        _ => format!("0x{:04X}", ch as u32),
    };

    // The window is measured in bytes, so each end is snapped outward to a
    // char boundary before slicing; `get` plus the fallback keeps this
    // panic-free even if the range were somehow invalid.
    let context_start = byte_index.saturating_sub(50);
    let context_end = byte_index.saturating_add(50).min(content.len());
    let safe_start = floor_char_boundary(content, context_start);
    let safe_end = ceil_char_boundary(content, context_end.max(safe_start));
    let error_context = content.get(safe_start..safe_end).unwrap_or(content);
    // NOTE(review): `truncate_text` lives in `crate::common`; presumably it
    // caps the preview at 100 units — confirm whether that is bytes or chars.
    let preview = truncate_text(error_context, 100);

    let suggestion = if ch == '\0' {
        format!(
            "NUL byte found at position {byte_index}. Common causes:\n\
             - Intended to use non-breaking space (\\u00A0) but wrote \\u0000 instead\n\
             - Binary data mixed into text content\n\
             - Incorrect escape sequence\n\n\
             Near: {preview}"
        )
    } else {
        // The escape is written as literal text (`\\u00A0`) so the reader sees
        // the sequence to type; an actual U+00A0 here would render as an
        // invisible blank in the message.
        format!(
            "Illegal character {char_display} found at position {byte_index}. Options to fix:\n\
             - Remove the illegal character\n\
             - Replace with a valid character (e.g., space or \\u00A0)\n\n\
             Near: {preview}"
        )
    };

    XsdValidationError {
        error_type: XsdErrorType::MalformedXml,
        element_path: "xml".to_string(),
        expected: "valid XML 1.0 content (no illegal control characters)".to_string(),
        found: format!("illegal character {char_display} at byte position {byte_index}"),
        suggestion,
        example: None,
    }
}
/// Returns the largest char boundary of `content` that is `<= index`.
///
/// An `index` at or beyond the end of `content` clamps to `content.len()`.
fn floor_char_boundary(content: &str, index: usize) -> usize {
    if index >= content.len() {
        return content.len();
    }
    // A UTF-8 scalar is at most four bytes long, so probing backwards reaches
    // a boundary within three steps instead of walking the string from its
    // start. Offset 0 is always a boundary, so the fallback is never taken.
    (0..=index)
        .rev()
        .find(|&pos| content.is_char_boundary(pos))
        .unwrap_or(0)
}
/// Returns the smallest char boundary of `content` that is `>= index`.
///
/// An `index` at or beyond the end of `content` clamps to `content.len()`,
/// which is also the result when `index` falls inside the final character.
fn ceil_char_boundary(content: &str, index: usize) -> usize {
    if index >= content.len() {
        return content.len();
    }
    // A UTF-8 scalar is at most four bytes long, so probing forwards reaches a
    // boundary within three steps instead of walking the string from its
    // start.
    (index..content.len())
        .find(|&pos| content.is_char_boundary(pos))
        .unwrap_or(content.len())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text using only the permitted control characters is accepted.
    #[test]
    fn test_check_for_illegal_xml_characters_accepts_valid_content() {
        let valid = "Hello world\nNew line\tTab\rCarriage return";
        assert!(
            check_for_illegal_xml_characters(valid).is_ok(),
            "Valid content should pass"
        );
    }

    /// Multi-byte and astral-plane characters are accepted.
    #[test]
    fn test_check_for_illegal_xml_characters_accepts_unicode() {
        let valid = "Hello 世界 🌍 Ωμέγα";
        assert!(
            check_for_illegal_xml_characters(valid).is_ok(),
            "Valid Unicode should pass"
        );
    }

    /// NUL is rejected and the suggestion points at the NBSP mix-up.
    #[test]
    fn test_check_for_illegal_xml_characters_rejects_nul() {
        let invalid = "text\0here";
        let result = check_for_illegal_xml_characters(invalid);
        assert!(result.is_err(), "NUL byte should be rejected");
        let error = result.unwrap_err();
        assert!(
            error.found.contains("NUL") || error.found.contains("0x00"),
            "Error should mention NUL or 0x00, got: {}",
            error.found
        );
        assert!(
            error.suggestion.contains("\\u00A0") || error.suggestion.contains("non-breaking space"),
            "Error should suggest NBSP as common fix, got: {}",
            error.suggestion
        );
    }

    /// Each forbidden C0 control character is rejected and named by its hex
    /// code, so a wrong code in the message cannot slip through.
    #[test]
    fn test_check_for_illegal_xml_characters_rejects_control_chars() {
        let test_cases = [
            ("\u{0001}", "0x01"),
            ("\u{0008}", "0x08"),
            ("\u{000B}", "0x0B"),
            ("\u{000C}", "0x0C"),
            ("\u{000E}", "0x0E"),
            ("\u{001F}", "0x1F"),
        ];
        for (invalid_str, expected_code) in test_cases {
            let content = format!("text{invalid_str}here");
            let result = check_for_illegal_xml_characters(&content);
            assert!(
                result.is_err(),
                "Control character {expected_code} should be rejected"
            );
            let error = result.unwrap_err();
            assert!(
                error.found.contains("control character"),
                "Error should mention control character, got: {}",
                error.found
            );
            assert!(
                error.found.contains(expected_code),
                "Error should mention code {expected_code}, got: {}",
                error.found
            );
        }
    }

    /// The noncharacters U+FFFE and U+FFFF are rejected and reported in hex.
    #[test]
    fn test_check_for_illegal_xml_characters_rejects_noncharacters() {
        let test_cases = [("\u{FFFE}", "0xFFFE"), ("\u{FFFF}", "0xFFFF")];
        for (invalid_str, expected_code) in test_cases {
            let content = format!("text{invalid_str}here");
            let result = check_for_illegal_xml_characters(&content);
            assert!(
                result.is_err(),
                "Noncharacter {expected_code} should be rejected"
            );
            let error = result.unwrap_err();
            assert!(
                error.found.contains(expected_code),
                "Error should mention code {expected_code}, got: {}",
                error.found
            );
        }
    }

    /// CDATA cannot carry control characters, so it must not be suggested.
    #[test]
    fn test_illegal_character_error_does_not_suggest_cdata_for_control_chars() {
        let invalid = "text\u{0001}here";
        let result = check_for_illegal_xml_characters(invalid);
        assert!(result.is_err(), "Control character should be rejected");
        let error = result.unwrap_err();
        assert!(
            !error.suggestion.contains("CDATA"),
            "Control character suggestions should not mention CDATA, got: {}",
            error.suggestion
        );
    }

    /// The error quotes surrounding text and reports the exact byte offset.
    #[test]
    fn test_check_for_illegal_xml_characters_provides_context() {
        let invalid = "Valid text before\0invalid character after";
        let result = check_for_illegal_xml_characters(invalid);
        assert!(result.is_err());
        let error = result.unwrap_err();
        assert!(
            error.suggestion.contains("before") || error.suggestion.contains("after"),
            "Error should include context, got: {}",
            error.suggestion
        );
        // "Valid text before" is 17 bytes, so the NUL sits at offset 17.
        assert!(
            error.found.contains("byte position 17"),
            "Error should mention position, got: {}",
            error.found
        );
    }

    /// The 50-byte look-back from offset 60 starts at byte 10, which is the
    /// second byte of 'é' (bytes 9..11); slicing there must not panic.
    #[test]
    fn test_illegal_character_error_handles_multibyte_context_without_panic() {
        let mut prefix = String::from("aaaaaaaaa");
        prefix.push('é');
        let remaining = 60 - prefix.len();
        prefix.push_str(&"b".repeat(remaining));
        assert_eq!(prefix.len(), 60);
        let content = format!("{prefix}\0tail");
        let result = std::panic::catch_unwind(|| check_for_illegal_xml_characters(&content));
        assert!(result.is_ok(), "Should not panic on multibyte boundaries");
        let error = result.unwrap().unwrap_err();
        assert!(
            error.suggestion.contains("Near:"),
            "Error should include context preview, got: {}",
            error.suggestion
        );
    }

    /// Tab, LF and CR are the control characters XML 1.0 permits.
    #[test]
    fn test_check_for_illegal_xml_characters_allows_tab_newline_cr() {
        let valid = "text\twith\ntab\rand\nnewlines";
        assert!(
            check_for_illegal_xml_characters(valid).is_ok(),
            "Tab, LF, CR should be allowed"
        );
    }

    /// The retry-formatted error names the character and carries guidance.
    #[test]
    fn test_illegal_character_error_format_is_actionable() {
        let invalid = "git\0diff";
        let result = check_for_illegal_xml_characters(invalid);
        assert!(result.is_err());
        let error = result.unwrap_err();
        let formatted = error.format_for_ai_retry();
        assert!(
            formatted.contains("NUL") || formatted.contains("0x00"),
            "Formatted error should mention NUL"
        );
        assert!(
            formatted.contains("How to fix") || formatted.contains("suggestion"),
            "Formatted error should include fix guidance"
        );
    }
}