use crate::error::Result;
use crate::traits::{Repair, RepairStrategy, Validator};
use regex::Regex;
use std::sync::OnceLock;
#[allow(dead_code)]
struct XmlRegexCache {
unclosed_tags: Regex,
malformed_attributes: Regex,
invalid_characters: Regex,
missing_quotes: Regex,
self_closing_tags: Regex,
}
impl XmlRegexCache {
fn new() -> Result<Self> {
Ok(Self {
unclosed_tags: Regex::new(r"<(\w+)([^>]*)>")?,
malformed_attributes: Regex::new(r#"(\w+)=([^"'\s>]+)"#)?,
invalid_characters: Regex::new(r"[<>&]")?,
missing_quotes: Regex::new(r#"(\w+)=([^"'\s>]+)"#)?,
self_closing_tags: Regex::new(r"<(\w+)([^>]*)/>")?,
})
}
}
static XML_REGEX_CACHE: OnceLock<XmlRegexCache> = OnceLock::new();
fn get_xml_regex_cache() -> &'static XmlRegexCache {
XML_REGEX_CACHE
.get_or_init(|| XmlRegexCache::new().expect("Failed to initialize XML regex cache"))
}
pub struct XmlRepairer {
inner: crate::repairer_base::GenericRepairer,
}
impl XmlRepairer {
pub fn new() -> Self {
let strategies: Vec<Box<dyn RepairStrategy>> = vec![
Box::new(FixUnclosedTagsStrategy),
Box::new(FixMalformedAttributesStrategy),
Box::new(FixInvalidCharactersStrategy),
Box::new(FixMissingQuotesStrategy),
Box::new(FixSelfClosingTagsStrategy),
Box::new(AddXmlDeclarationStrategy),
];
let validator: Box<dyn Validator> = Box::new(XmlValidator);
let inner = crate::repairer_base::GenericRepairer::new(validator, strategies);
Self { inner }
}
}
impl Default for XmlRepairer {
fn default() -> Self {
Self::new()
}
}
impl Repair for XmlRepairer {
fn repair(&mut self, content: &str) -> Result<String> {
self.inner.repair(content)
}
fn needs_repair(&self, content: &str) -> bool {
self.inner.needs_repair(content)
}
fn confidence(&self, content: &str) -> f64 {
if content.trim().is_empty() {
return 0.0;
}
let mut score: f64 = 0.0;
if content.trim().starts_with("<?xml") {
score += 0.3;
}
if content.contains('<') && content.contains('>') {
score += 0.3;
}
let open_tags = content.matches('<').count();
let close_tags = content.matches('>').count();
if open_tags == close_tags {
score += 0.2;
}
if content.contains('=') {
score += 0.1;
}
if content.contains("</") {
score += 0.1;
}
score.min(1.0)
}
}
pub struct XmlValidator;
impl Validator for XmlValidator {
fn is_valid(&self, content: &str) -> bool {
if content.trim().is_empty() {
return false;
}
let lines: Vec<&str> = content.lines().collect();
for line in lines {
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
if trimmed.contains('=') && !trimmed.contains('"') {
return false;
}
}
quick_xml::Reader::from_str(content).read_event().is_ok()
}
fn validate(&self, content: &str) -> Vec<String> {
let mut errors = Vec::new();
if content.trim().is_empty() {
errors.push("Empty XML content".to_string());
return errors;
}
let mut reader = quick_xml::Reader::from_str(content);
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Eof) => break,
Ok(_) => continue,
Err(e) => {
errors.push(format!("XML parsing error: {e}"));
break;
}
}
}
errors
}
}
struct FixUnclosedTagsStrategy;
impl RepairStrategy for FixUnclosedTagsStrategy {
fn apply(&self, content: &str) -> Result<String> {
let cache = get_xml_regex_cache();
let mut result = content.to_string();
let mut open_tags = Vec::new();
for cap in cache.unclosed_tags.captures_iter(&result) {
let tag_name = &cap[1];
let attributes = &cap[2];
if attributes.ends_with('/') {
continue;
}
if let Some(stripped) = tag_name.strip_prefix('/') {
if let Some(expected_tag) = open_tags.pop()
&& expected_tag != stripped {
open_tags.push(expected_tag);
}
} else {
open_tags.push(tag_name.to_string());
}
}
for tag in open_tags.iter().rev() {
result.push_str(&format!("</{tag}>"));
}
Ok(result)
}
fn priority(&self) -> u8 {
6
}
fn name(&self) -> &str {
"FixUnclosedTagsStrategy"
}
}
struct FixMalformedAttributesStrategy;
impl RepairStrategy for FixMalformedAttributesStrategy {
fn apply(&self, content: &str) -> Result<String> {
let cache = get_xml_regex_cache();
let result = cache
.malformed_attributes
.replace_all(content, |caps: ®ex::Captures| {
let attr_name = &caps[1];
let attr_value = &caps[2];
format!("{attr_name}=\"{attr_value}\"")
});
Ok(result.to_string())
}
fn priority(&self) -> u8 {
5
}
fn name(&self) -> &str {
"FixMalformedAttributesStrategy"
}
}
struct FixInvalidCharactersStrategy;
impl RepairStrategy for FixInvalidCharactersStrategy {
fn apply(&self, content: &str) -> Result<String> {
let mut result = content.to_string();
result = result.replace('&', "&");
result = result.replace('<', "<");
result = result.replace('>', ">");
result = result.replace("<", "<");
result = result.replace(">", ">");
Ok(result)
}
fn priority(&self) -> u8 {
4
}
fn name(&self) -> &str {
"FixInvalidCharactersStrategy"
}
}
struct FixMissingQuotesStrategy;
impl RepairStrategy for FixMissingQuotesStrategy {
fn apply(&self, content: &str) -> Result<String> {
let cache = get_xml_regex_cache();
let result = cache
.missing_quotes
.replace_all(content, |caps: ®ex::Captures| {
let attr_name = &caps[1];
let attr_value = &caps[2];
format!("{attr_name}=\"{attr_value}\"")
});
Ok(result.to_string())
}
fn priority(&self) -> u8 {
3
}
fn name(&self) -> &str {
"FixMissingQuotesStrategy"
}
}
struct FixSelfClosingTagsStrategy;
impl RepairStrategy for FixSelfClosingTagsStrategy {
fn apply(&self, content: &str) -> Result<String> {
let cache = get_xml_regex_cache();
let result = cache
.self_closing_tags
.replace_all(content, |caps: ®ex::Captures| {
let tag_name = &caps[1];
let attributes = &caps[2];
format!("<{tag_name}{attributes}/>")
});
Ok(result.to_string())
}
fn priority(&self) -> u8 {
2
}
fn name(&self) -> &str {
"FixSelfClosingTagsStrategy"
}
}
struct AddXmlDeclarationStrategy;
impl RepairStrategy for AddXmlDeclarationStrategy {
fn apply(&self, content: &str) -> Result<String> {
let trimmed = content.trim();
if !trimmed.starts_with("<?xml") {
Ok(format!(
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n{trimmed}"
))
} else {
Ok(trimmed.to_string())
}
}
fn priority(&self) -> u8 {
1
}
fn name(&self) -> &str {
"AddXmlDeclarationStrategy"
}
}