use crate::shared::syntax::*;
use std::fmt::{Display, Formatter, Result as FmtResult};
use std::str::FromStr;
use std::sync::OnceLock;
/// White-space handling mode with two states, `Default` and `Preserve`.
///
/// The `Display` impl in this file renders a value as an attribute assembled from
/// `XML_NS_ATTRIBUTE`, `XML_NS_SEPARATOR` and `XML_NS_ATTR_SPACE`
/// (presumably `xml:space` -- confirm against `crate::shared::syntax`).
#[derive(Clone, Debug, Default, PartialEq)]
pub(crate) enum SpaceHandling {
/// Rendered with `XML_NS_ATTR_SPACE_DEFAULT`; also the value produced by `SpaceHandling::default()`.
#[default]
Default,
/// Rendered with `XML_NS_ATTR_SPACE_PRESERVE`.
Preserve,
}
/// Supplies replacement text for entity references met while normalizing attribute values.
pub(crate) trait EntityResolver {
/// Returns the replacement text for `entity`, or `None` when the entity is not known.
///
/// `normalize_attribute_value` passes the reference exactly as it was matched, i.e.
/// including the leading `&` or `%` and the trailing `;`, and panics when `None` is returned.
fn resolve(&self, entity: &str) -> Option<String>;
}
/// Normalizes an attribute value following XML 1.0 section 3.3.3.
///
/// 1. Line ends are normalized with [`normalize_end_of_lines`].
/// 2. Character references are replaced by the referenced character, entity
///    references by their (recursively expanded) replacement text, and each
///    literal tab, line feed or carriage return by a single space (U+0020).
/// 3. When `is_cdata` is `false`, leading and trailing spaces are discarded and
///    every run of spaces is collapsed to one space.
///
/// # Parameters
///
/// * `value` - the raw attribute value.
/// * `resolver` - looked up with the full text of each entity reference
///   (including the `&`/`%` prefix and the trailing `;`).
/// * `is_cdata` - `true` when the attribute type is CDATA, in which case the
///   space trimming/collapsing of step 3 is skipped.
///
/// # Panics
///
/// Panics if `resolver` does not know an entity referenced by `value`, or if a
/// character reference does not denote a valid Unicode scalar value.
pub(crate) fn normalize_attribute_value(
    value: impl AsRef<str>,
    resolver: &dyn EntityResolver,
    is_cdata: bool,
) -> String {
    let expanded = expand_attribute_references(value.as_ref(), resolver);
    if is_cdata {
        expanded
    } else {
        // Splitting on U+0020 and dropping the empty pieces both trims the ends
        // and collapses interior runs of spaces; other white space characters
        // (which can only come from character references) are left untouched.
        expanded
            .split(' ')
            .filter(|token| !token.is_empty())
            .collect::<Vec<_>>()
            .join(" ")
    }
}

/// Performs the reference-expansion part of attribute-value normalization.
///
/// The trimming required for non-CDATA attributes is deliberately NOT done here:
/// it must happen once, on the complete value, so that spaces at the edges of an
/// entity's replacement text survive when the reference sits mid-value.
fn expand_attribute_references(value: &str, resolver: &dyn EntityResolver) -> String {
    let step_1 = normalize_end_of_lines(value);
    if step_1.is_empty() {
        return step_1;
    }

    static FIND: OnceLock<regex::Regex> = OnceLock::new();
    let find = FIND.get_or_init(|| {
        regex::Regex::new(
            r"(?P<entity_ref>[&%][\pL_][\pL\.\d_\-]*;)|(?P<char>&#\d+;)|(?P<char_hex>&#x[0-9a-fA-F]+;)|(?P<ws>[\u{09}\u{0A}\u{0D}])",
        )
        .unwrap()
    });

    let mut step_2 = String::with_capacity(step_1.len());
    let mut last_end = 0;
    for capture in find.captures_iter(&step_1) {
        let (a_match, replacement) = if let Some(a_match) = capture.name("entity_ref") {
            let replacement = match resolver.resolve(a_match.as_str()) {
                None => panic!("unknown entity reference {}", a_match.as_str()),
                Some(replacement) => expand_attribute_references(&replacement, resolver),
            };
            (a_match, replacement)
        } else if let Some(a_match) = capture
            .name("char")
            .or_else(|| capture.name("char_hex"))
        {
            (a_match, char_from_entity(a_match.as_str()))
        } else if let Some(a_match) = capture.name("ws") {
            (a_match, "\u{20}".to_string())
        } else {
            unreachable!("every alternative of the pattern is a named group");
        };
        // Copy the untouched text between the previous match and this one.
        step_2.push_str(&step_1[last_end..a_match.start()]);
        step_2.push_str(&replacement);
        last_end = a_match.end();
    }
    // Offsets are relative to the normalized text, so the tail comes from it too.
    step_2.push_str(&step_1[last_end..]);
    step_2
}
/// Rewrites every supported line-end sequence in `value` as a single line feed (U+000A).
///
/// The sequences replaced are: CR followed by LF or by U+0085, a lone CR,
/// a lone U+0085, and U+2028. An empty input yields an empty string.
pub(crate) fn normalize_end_of_lines(value: impl AsRef<str>) -> String {
    static LINE_ENDS: OnceLock<regex::Regex> = OnceLock::new();

    let text = value.as_ref();
    if text.is_empty() {
        return String::new();
    }

    // Compiled once, on first use.
    let pattern = LINE_ENDS
        .get_or_init(|| regex::Regex::new(r"\u{0D}[\u{0A}\u{85}]?|\u{85}|\u{2028}").unwrap());
    pattern.replace_all(text, "\u{0A}").into_owned()
}
pub(crate) fn escape(input: impl AsRef<str>) -> String {
let input = input.as_ref();
let mut result = String::with_capacity(input.len());
for c in input.chars() {
match c {
XML_ESC_AMP_CHAR => result.push_str(&to_entity(XML_ESC_AMP_CHAR)),
XML_ESC_APOS_CHAR => result.push_str(&to_entity(XML_ESC_APOS_CHAR)),
XML_ESC_GT_CHAR => result.push_str(&to_entity(XML_ESC_GT_CHAR)),
XML_ESC_LT_CHAR => result.push_str(&to_entity(XML_ESC_LT_CHAR)),
XML_ESC_QUOT_CHAR => result.push_str(&to_entity(XML_ESC_QUOT_CHAR)),
o => result.push(o),
}
}
result
}
/// Formats `c` as a decimal numeric character reference:
/// `XML_NUMBERED_ENTITYREF_START`, the code point in decimal, `XML_ENTITYREF_END`.
///
/// The full 32-bit scalar value is used so that characters above U+FFFF are not
/// truncated to their low 16 bits.
pub(crate) fn to_entity(c: char) -> String {
    format!(
        "{}{}{}",
        XML_NUMBERED_ENTITYREF_START,
        u32::from(c),
        XML_ENTITYREF_END
    )
}
/// Formats `c` as a hexadecimal numeric character reference:
/// `XML_HEX_NUMBERED_ENTITYREF_START`, the code point in upper-case hex,
/// `XML_ENTITYREF_END`.
///
/// The full 32-bit scalar value is used so that characters above U+FFFF are not
/// truncated to their low 16 bits.
#[allow(dead_code)]
pub(crate) fn to_entity_hex(c: char) -> String {
    format!(
        "{}{:X}{}",
        XML_HEX_NUMBERED_ENTITYREF_START,
        u32::from(c),
        XML_ENTITYREF_END
    )
}
/// Decodes a numeric character reference (`&#NNN;` or `&#xHHH;`) into a
/// one-character `String`.
///
/// # Panics
///
/// Panics when `entity` is not wrapped in `&#` ... `;`, when the digits do not
/// parse as a `u32`, or when the number is not a valid Unicode scalar value.
fn char_from_entity(entity: impl AsRef<str>) -> String {
    let reference = entity.as_ref();
    assert!(reference.starts_with("&#"));
    assert!(reference.ends_with(';'));

    // Everything between the "&#" prefix and the ";" terminator.
    let digits = &reference[2..reference.len() - 1];
    let scalar = match digits.strip_prefix('x') {
        Some(hex_digits) => u32::from_str_radix(hex_digits, 16).unwrap(),
        None => digits.parse::<u32>().unwrap(),
    };
    char::from_u32(scalar).unwrap().to_string()
}
/// Returns `true` when `c` matches the XML 1.0 `Char` production:
/// `#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`.
#[allow(dead_code)]
pub(crate) fn is_xml_10_char(c: char) -> bool {
    matches!(
        c,
        '\u{0009}'
            | '\u{000A}'
            | '\u{000D}'
            | '\u{0020}'..='\u{D7FF}'
            | '\u{E000}'..='\u{FFFD}'
            // The supplementary planes run up to U+10FFFF, not U+10FFF.
            | '\u{10000}'..='\u{10FFFF}'
    )
}
#[allow(dead_code)]
pub(crate) fn is_xml_10_restricted_char(c: char) -> bool {
c == XML_ESC_AMP_CHAR
|| c == XML_ESC_APOS_CHAR
|| c == XML_ESC_GT_CHAR
|| c == XML_ESC_LT_CHAR
|| c == XML_ESC_QUOT_CHAR
}
/// Returns `true` when `c` matches the XML 1.1 `Char` production:
/// `[#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`.
#[allow(dead_code)]
pub(crate) fn is_xml_11_char(c: char) -> bool {
    matches!(
        c,
        '\u{0001}'..='\u{D7FF}'
            | '\u{E000}'..='\u{FFFD}'
            // The supplementary planes run up to U+10FFFF, not U+10FFF.
            | '\u{10000}'..='\u{10FFFF}'
    )
}
/// Returns `true` when `c` lies in one of the ranges
/// `[#x1-#x8]`, `[#xB-#xC]`, `[#xE-#x1F]`, `[#x7F-#x84]` or `[#x86-#x9F]`.
#[allow(dead_code)]
pub(crate) fn is_xml_11_restricted_char(c: char) -> bool {
    matches!(
        c,
        '\u{01}'..='\u{08}'
            | '\u{0B}'..='\u{0C}'
            | '\u{0E}'..='\u{1F}'
            | '\u{7F}'..='\u{84}'
            | '\u{86}'..='\u{9F}'
    )
}
/// Returns `true` when `c` is a tab (U+0009), line feed (U+000A),
/// carriage return (U+000D) or space (U+0020).
#[allow(dead_code)]
pub(crate) fn is_xml_space(c: char) -> bool {
    matches!(c, '\u{09}' | '\u{0A}' | '\u{0D}' | '\u{20}')
}
/// Returns `true` when `c` may begin a name: `:`, `_`, an ASCII letter, or a
/// character in one of the non-ASCII ranges listed below.
#[allow(dead_code)]
pub(crate) fn is_xml_name_start_char(c: char) -> bool {
    matches!(
        c,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
/// Returns `true` when `c` may appear inside a name: anything accepted by
/// [`is_xml_name_start_char`], plus `-`, `.`, an ASCII digit, U+00B7, and the
/// ranges U+0300..=U+036F and U+203F..=U+2040.
pub(crate) fn is_xml_name_char(c: char) -> bool {
    let is_extra_name_char = matches!(
        c,
        '-' | '.' | '0'..='9' | '\u{B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}'
    );
    is_extra_name_char || is_xml_name_start_char(c)
}
pub(crate) fn is_xml_name(s: impl AsRef<str>) -> bool {
let s = s.as_ref();
!s.is_empty() && s.starts_with(is_xml_name_start_char) && s[1..].chars().all(is_xml_name_char)
}
#[allow(dead_code)]
pub(crate) fn is_xml_names(s: impl AsRef<str>) -> bool {
let s = s.as_ref();
!s.is_empty() && s.split(' ').all(is_xml_name)
}
#[allow(dead_code)]
pub(crate) fn is_xml_nmtoken(s: impl AsRef<str>) -> bool {
let s = s.as_ref();
!s.is_empty() && s.chars().all(is_xml_name_char)
}
#[allow(dead_code)]
pub(crate) fn is_xml_nmtokens(s: impl AsRef<str>) -> bool {
let s = s.as_ref();
!s.is_empty() && s.split(' ').all(is_xml_nmtoken)
}
/// Renders the value as an attribute: `XML_NS_ATTRIBUTE`, `XML_NS_SEPARATOR`,
/// `XML_NS_ATTR_SPACE`, then `="..."` holding the constant for the variant.
impl Display for SpaceHandling {
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        // Pick the attribute value first so the format call stays flat.
        let setting = match self {
            SpaceHandling::Default => XML_NS_ATTR_SPACE_DEFAULT,
            SpaceHandling::Preserve => XML_NS_ATTR_SPACE_PRESERVE,
        };
        write!(
            f,
            "{}{}{}=\"{}\"",
            XML_NS_ATTRIBUTE, XML_NS_SEPARATOR, XML_NS_ATTR_SPACE, setting
        )
    }
}
/// Parses the bare attribute value: `XML_NS_ATTR_SPACE_DEFAULT` or
/// `XML_NS_ATTR_SPACE_PRESERVE`. The comparison is exact; any other input,
/// including the empty string, is rejected with `Err(())`.
impl FromStr for SpaceHandling {
    type Err = ();

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == XML_NS_ATTR_SPACE_DEFAULT {
            return Ok(SpaceHandling::Default);
        }
        if s == XML_NS_ATTR_SPACE_PRESERVE {
            return Ok(SpaceHandling::Preserve);
        }
        Err(())
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::borrow::Borrow;
    use std::collections::HashMap;

    // ------------------------------------------------------------------
    // SpaceHandling
    // ------------------------------------------------------------------

    #[test]
    fn test_space_handling_default() {
        let sh = SpaceHandling::default();
        assert_eq!(sh, SpaceHandling::Default);
    }

    #[test]
    fn test_space_handling_display() {
        assert_eq!(
            format!("{}", SpaceHandling::Default),
            format!(
                "{}{}{}=\"{}\"",
                XML_NS_ATTRIBUTE, XML_NS_SEPARATOR, XML_NS_ATTR_SPACE, XML_NS_ATTR_SPACE_DEFAULT
            )
        );
        assert_eq!(
            format!("{}", SpaceHandling::Preserve),
            format!(
                "{}{}{}=\"{}\"",
                XML_NS_ATTRIBUTE, XML_NS_SEPARATOR, XML_NS_ATTR_SPACE, XML_NS_ATTR_SPACE_PRESERVE
            )
        );
    }

    #[test]
    fn test_space_handling_from_str() {
        assert_eq!(
            SpaceHandling::from_str(XML_NS_ATTR_SPACE_DEFAULT).unwrap(),
            SpaceHandling::Default
        );
        assert_eq!(
            SpaceHandling::from_str(XML_NS_ATTR_SPACE_PRESERVE).unwrap(),
            SpaceHandling::Preserve
        );
        assert!(SpaceHandling::from_str("").is_err());
        assert!(SpaceHandling::from_str("other").is_err());
    }

    // ------------------------------------------------------------------
    // End-of-line handling
    // ------------------------------------------------------------------

    #[test]
    fn test_end_of_line_handling() {
        let input = "one\u{0D}two\u{0D}\u{0A}\u{0A}three\u{0A}\u{0D}\u{85}four\u{85}five\u{2028}";
        let output = normalize_end_of_lines(input);
        assert_eq!(
            output,
            "one\u{0A}two\u{0A}\u{0A}three\u{0A}\u{0A}four\u{0A}five\u{0A}".to_string()
        )
    }

    // ------------------------------------------------------------------
    // Attribute-value normalization
    // ------------------------------------------------------------------

    /// A resolver that knows no entities at all; it logs every lookup.
    struct NoneEntityResolver {}

    impl EntityResolver for NoneEntityResolver {
        fn resolve(&self, name: &str) -> Option<String> {
            let result: Option<String> = None;
            println!("EntityResolver::resolve({:?}) -> {:?}", name, result);
            result
        }
    }

    pub(crate) fn none_entity_resolver() -> Box<dyn EntityResolver> {
        let resolver = NoneEntityResolver {};
        Box::new(resolver)
    }

    #[test]
    fn test_normalize_avalue_trim() {
        let resolver = none_entity_resolver();
        let resolver = resolver.borrow();
        assert_eq!(
            normalize_attribute_value(" abc ", resolver, true),
            " abc "
        );
        assert_eq!(normalize_attribute_value(" abc ", resolver, false), "abc");
    }

    /// A resolver backed by a fixed table. Keys are complete entity references
    /// (leading `&` and trailing `;` included), because that is the text
    /// `normalize_attribute_value` hands to `resolve`.
    struct TestResolver {
        entity_map: HashMap<String, String>,
    }

    impl EntityResolver for TestResolver {
        fn resolve(&self, entity: &str) -> Option<String> {
            self.entity_map.get(entity).cloned()
        }
    }

    impl TestResolver {
        pub(crate) fn new() -> Self {
            let entity_map = [
                ("&pound;", "\u{A3}"),
                ("&yen;", "\u{A5}"),
                ("&euro;", "\u{20AC}"),
                // Replacement text that itself contains references, to exercise
                // recursive expansion.
                ("&currency;", "$, &pound;, &euro;, and &yen;"),
            ]
            .iter()
            .map(|&(reference, replacement)| (reference.to_string(), replacement.to_string()))
            .collect();
            Self { entity_map }
        }
    }

    fn test_resolver() -> Box<dyn EntityResolver> {
        let resolver = TestResolver::new();
        Box::new(resolver)
    }

    #[test]
    fn test_normalize_avalue_entity_resolver() {
        let resolver = test_resolver();
        let resolver = resolver.borrow();
        assert_eq!(
            normalize_attribute_value("10$ in &pound;s please", resolver, true),
            "10$ in \u{A3}s please"
        );
        assert_eq!(
            normalize_attribute_value("&yen; to &euro;", resolver, false),
            "\u{A5} to \u{20AC}"
        );
        assert_eq!(
            normalize_attribute_value("&currency;", resolver, false),
            "$, \u{A3}, \u{20AC}, and \u{A5}"
        );
    }
}