use std::borrow::Cow;
use quick_xml::events::BytesText;
use crate::error::{Error, Result};
/// Size in bytes of the window, starting at an `&`, that `lenient_slow_path`
/// scans for the terminating `;`.
///
/// The window includes the `&` itself, so a complete `&…;` token handed to the
/// decoder is at most this many bytes long. An `&` with no `;` inside the
/// window is emitted as a literal character.
const MAX_ENTITY_LEN: usize = 16;
/// Unescapes XML entity references in `input`, tolerating malformed ones.
///
/// The whole string is first handed to `quick_xml::escape::unescape`; when
/// that succeeds its result is returned unchanged. When it reports an error,
/// the input is re-processed by `lenient_slow_path`, which decodes the
/// references it can and keeps everything else verbatim, and the resulting
/// `String` is returned as `Cow::Owned`.
pub(crate) fn lenient_unescape(input: &str) -> Cow<'_, str> {
    quick_xml::escape::unescape(input).unwrap_or_else(|_| Cow::Owned(lenient_slow_path(input)))
}
/// Decodes `input` one `&` at a time, keeping anything that is not a
/// decodable entity reference exactly as written.
///
/// For every `&` encountered:
/// * the text preceding it is copied through unchanged;
/// * a `;` is searched for inside the `MAX_ENTITY_LEN`-byte window that starts
///   at the `&`;
/// * if one is found, the `&…;` token is passed to
///   `quick_xml::escape::unescape`; on success the decoded text is appended
///   and scanning continues after the `;`;
/// * otherwise (no `;` in the window, or the token is rejected) the `&` is
///   emitted literally and scanning resumes at the very next byte, so a
///   reference that starts inside a rejected token still gets its own chance
///   to be decoded.
fn lenient_slow_path(input: &str) -> String {
    let mut decoded = String::with_capacity(input.len());
    // Unprocessed tail of `input`; it only ever shrinks from the front.
    let mut rest = input;

    while let Some(amp) = rest.find('&') {
        // Literal text in front of the ampersand.
        decoded.push_str(&rest[..amp]);

        // `candidate` starts with the `&`, so it is never empty.
        let candidate = &rest[amp..];
        let window = candidate.len().min(MAX_ENTITY_LEN);
        let terminator = candidate.as_bytes()[1..window]
            .iter()
            .position(|&byte| byte == b';');

        // `offset` is relative to the byte after `&`, hence `offset + 2` is
        // the exclusive end of the `&…;` token. Both delimiters are ASCII, so
        // every slice below lands on a character boundary.
        let resolved = terminator.and_then(|offset| {
            let token = &candidate[..offset + 2];
            quick_xml::escape::unescape(token)
                .ok()
                .map(|text| (text, token.len()))
        });

        match resolved {
            Some((text, consumed)) => {
                decoded.push_str(&text);
                rest = &candidate[consumed..];
            }
            None => {
                // Stray or undecodable ampersand: keep it and move one byte on.
                decoded.push('&');
                rest = &candidate[1..];
            }
        }
    }

    // Whatever is left contains no ampersand.
    decoded.push_str(rest);
    decoded
}
/// Converts a text event to an owned `String` without ever failing.
///
/// The event's bytes are interpreted with `String::from_utf8_lossy` and the
/// result is passed through `lenient_unescape`.
pub(crate) fn decode_text_lossy(text: &BytesText<'_>) -> String {
    let utf8 = String::from_utf8_lossy(text.as_ref());
    let unescaped = lenient_unescape(&utf8);
    unescaped.into_owned()
}
pub(crate) fn decode_text_strict(text: &BytesText<'_>, location: &str) -> Result<String> {
let raw = std::str::from_utf8(text.as_ref())
.map_err(|err| Error::xml_parse_with_context(err.to_string(), location))?;
Ok(lenient_unescape(raw).into_owned())
}
#[cfg(test)]
mod tests {
    //! Unit tests for `lenient_unescape`.
    //!
    //! Inputs are written with their entity references still escaped
    //! (`&amp;`, `&lt;`, `&#65;`, …) so each assertion actually exercises
    //! decoding instead of comparing already-decoded text with itself.

    use super::*;

    /// Input without any `&` is returned as a borrow of the original string.
    #[test]
    fn fast_path_no_entities_borrows() {
        let input = "plain text without references";
        let out = lenient_unescape(input);
        assert_eq!(out, "plain text without references");
        assert!(
            matches!(out, Cow::Borrowed(_)),
            "expected Borrowed, got Owned"
        );
    }

    /// The five predefined XML entities are decoded.
    #[test]
    fn fast_path_legitimate_only_decodes() {
        assert_eq!(lenient_unescape("A &amp; B"), "A & B");
        assert_eq!(lenient_unescape("&lt;tag&gt;"), "<tag>");
        assert_eq!(lenient_unescape("&quot;x&quot;"), "\"x\"");
        assert_eq!(lenient_unescape("don&apos;t"), "don't");
    }

    /// Decimal and hexadecimal character references are decoded.
    #[test]
    fn fast_path_numeric_refs_decode() {
        assert_eq!(lenient_unescape("&#65;&#x41;"), "AA");
        assert_eq!(lenient_unescape("&#x1F600;"), "\u{1F600}");
    }

    /// A valid entity is decoded even when an unknown one appears later.
    #[test]
    fn slow_path_mixed_legitimate_and_malformed() {
        assert_eq!(
            lenient_unescape("A &amp; B &bogus; C"),
            "A & B &bogus; C"
        );
    }

    /// Unknown entities are passed through verbatim.
    #[test]
    fn slow_path_multiple_malformed_preserved() {
        assert_eq!(lenient_unescape("&foo;&bar;"), "&foo;&bar;");
    }

    /// A bare `&` followed by an unknown entity leaves the text unchanged.
    #[test]
    fn slow_path_stray_ampersand_then_malformed_entity() {
        assert_eq!(
            lenient_unescape("R&D and &bogus; tail"),
            "R&D and &bogus; tail"
        );
    }

    /// A bare `&` must not prevent a later valid entity from being decoded.
    #[test]
    fn slow_path_stray_ampersand_then_legitimate_entity() {
        assert_eq!(lenient_unescape("R&D &amp; tail"), "R&D & tail");
    }

    /// A bare `&` directly in front of a valid entity is kept; the entity is decoded.
    #[test]
    fn slow_path_adjacent_ampersand_before_legitimate_entity() {
        assert_eq!(lenient_unescape("&&lt;"), "&<");
    }

    /// An `&` with no `;` inside the scan window is treated as a literal.
    #[test]
    fn slow_path_unterminated_ampersand_bounded() {
        let padding = "x".repeat(MAX_ENTITY_LEN + 8);
        let input = format!("&{padding}&bogus;");
        assert_eq!(lenient_unescape(&input), input);
    }

    /// Numeric references around an unknown entity are still decoded.
    #[test]
    fn slow_path_numeric_mixed_with_malformed() {
        assert_eq!(lenient_unescape("&#65;&bogus;&#x42;"), "A&bogus;B");
    }

    /// Multi-byte text between tokens survives the byte-oriented scan.
    #[test]
    fn slow_path_preserves_non_ascii_between_tokens() {
        assert_eq!(
            lenient_unescape("한글 &amp; &bogus; 日本語"),
            "한글 & &bogus; 日本語"
        );
    }

    /// The empty reference `&;` is not decodable and is kept as written.
    #[test]
    fn slow_path_empty_entity_reference() {
        assert_eq!(lenient_unescape("a &; b &bogus;"), "a &; b &bogus;");
    }
}