use std::borrow::Cow;
use std::char;
use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::character::complete::{digit1, hex_digit1};
use nom::combinator::{map, opt, recognize};
use nom::sequence::{preceded, terminated};
use nom::IResult;
use crate::parser::raw::name;
/// Result alias used by this module: the error type is always [`EntityError`]
/// and the success type defaults to `()` when omitted.
pub type Result<T = ()> = std::result::Result<T, EntityError>;
/// Errors reported while expanding entity and character references.
#[derive(Clone, Debug, PartialEq, thiserror::Error)]
pub enum EntityError {
/// A numeric character reference (`&#...;` / `&#x...;`) whose digits do not
/// parse as a `u32`, or whose value is not a valid `char`.
#[error("invalid character reference")]
InvalidCharacterRef,
/// The resolver callback returned `None` for a named entity; the payload is
/// the entity name that was looked up.
#[error("entity '{0}' is not defined")]
UndefinedEntity(String),
}
/// Expands numeric character references (`&#NNN;` and `&#xHHH;`) in `text`.
///
/// This delegates to [`expand_entities`] with a resolver that never knows any
/// named entity.
///
/// # Errors
///
/// * [`EntityError::InvalidCharacterRef`] if a numeric reference does not
///   denote a valid `char`.
/// * [`EntityError::UndefinedEntity`] if `text` contains a named reference,
///   because the resolver used here always answers `None`.
pub fn expand_character_references(text: &str) -> Result<Cow<str>> {
    /// Resolver that treats every named entity as undefined.
    fn no_named_entities(_name: &str) -> Option<&'static str> {
        None
    }

    expand_entities(text, no_named_entities)
}
/// Expands entity and character references in `text`.
///
/// The input is cut at every `&`. The text following each `&` is handed to
/// `entity_or_char_ref`:
///
/// * a numeric character reference is replaced by the character it denotes;
/// * a named reference is replaced by whatever `f` returns for its name (the
///   name is passed without the leading `&` and without a trailing `;`);
/// * if the text does not parse as a reference at all, the `&` and the text
///   after it are copied to the output unchanged.
///
/// Replacement values returned by `f` are inserted verbatim and are not
/// scanned for further references.
///
/// When `text` contains no `&`, it is returned as `Cow::Borrowed` without
/// allocating; otherwise the result is `Cow::Owned`.
///
/// # Errors
///
/// * [`EntityError::InvalidCharacterRef`] if a numeric reference does not
///   denote a valid `char`.
/// * [`EntityError::UndefinedEntity`] if `f` returns `None` for a name.
pub fn expand_entities<F, T>(text: &str, mut f: F) -> Result<Cow<str>>
where
    F: FnMut(&str) -> Option<T>,
    T: AsRef<str>,
{
    // Fast path: nothing to expand, so hand the input back untouched.
    let first_amp = match text.find('&') {
        Some(index) => index,
        None => return Ok(Cow::Borrowed(text)),
    };

    let mut expanded = String::new();
    expanded.push_str(&text[..first_amp]);

    // `&` is a single byte in UTF-8, so `first_amp + 1` is a char boundary.
    for segment in text[first_amp + 1..].split('&') {
        let (remainder, reference) = match entity_or_char_ref(segment) {
            Ok(parsed) => parsed,
            Err(_) => {
                // Not a reference: put back the `&` that the split removed.
                expanded.push('&');
                expanded.push_str(segment);
                continue;
            }
        };

        match reference {
            EntityRef::Char(c) => expanded.push(c),
            EntityRef::InvalidChar => return Err(EntityError::InvalidCharacterRef),
            EntityRef::Entity(entity_name) => {
                let replacement = f(entity_name)
                    .ok_or_else(|| EntityError::UndefinedEntity(entity_name.to_owned()))?;
                expanded.push_str(replacement.as_ref());
            }
        }

        // Everything after the reference (and its optional `;`) is plain text.
        expanded.push_str(remainder);
    }

    Ok(Cow::Owned(expanded))
}
/// Parses one reference body, i.e. the text that immediately follows a `&`.
///
/// Alternatives are tried in this order:
///
/// 1. `#` followed by decimal digits, or `#x` followed by hex digits. The
///    result is [`EntityRef::Char`] when the number fits in a `u32` and is a
///    valid `char`, otherwise [`EntityRef::InvalidChar`].
/// 2. An optional `#` followed by whatever the `name` parser accepts. The
///    result is [`EntityRef::Entity`] holding the matched text, including the
///    `#` if one was present.
///
/// A single trailing `;` is consumed when present; it is not required. The
/// unconsumed input is returned alongside the parsed reference.
fn entity_or_char_ref(input: &str) -> IResult<&str, EntityRef> {
    // Numeric code in decimal notation; `None` if it does not fit the target integer type.
    let decimal_code = map(digit1, |digits: &str| digits.parse().ok());
    // Numeric code in hexadecimal notation, introduced by a lowercase `x`.
    let hex_code = preceded(
        tag("x"),
        map(hex_digit1, |digits| u32::from_str_radix(digits, 16).ok()),
    );
    let char_ref = map(
        preceded(tag("#"), alt((decimal_code, hex_code))),
        |code| match code.and_then(char::from_u32) {
            Some(c) => EntityRef::Char(c),
            None => EntityRef::InvalidChar,
        },
    );
    // Fallback: anything `name`-like (optionally prefixed by `#`) is a named entity.
    let named_ref = map(recognize(preceded(opt(tag("#")), name)), EntityRef::Entity);

    terminated(alt((char_ref, named_ref)), opt(tag(";")))(input)
}
/// Outcome of parsing the text that follows a `&`.
enum EntityRef<'a> {
/// A named entity reference; holds the matched name text (borrowed from the
/// input, including a leading `#` if one was matched).
Entity(&'a str),
/// A numeric character reference that denotes a valid `char`.
Char(char),
/// A numeric character reference whose code could not be parsed or is not a
/// valid `char`.
InvalidChar,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Inputs containing `&` that do not form a reference must pass through unchanged.
    #[test]
    fn test_invalid_refs() {
        fn assert_noop(s: &str) {
            let result = expand_character_references(s);
            assert_eq!(result, Ok(s.into()));
        }
        assert_noop("foo&");
        assert_noop("foo&&");
        assert_noop("foo&;bar");
        assert_noop("foo&&;bar");
        assert_noop("foo&#");
        assert_noop("foo&#;");
        assert_noop("foo&#;bar");
        assert_noop("foo&##bar");
    }

    /// Numeric references that do not denote a valid `char` are rejected.
    ///
    /// The references are spelled out literally so the input really contains
    /// a `&#...;` sequence for the expander to parse.
    #[test]
    fn test_invalid_character_ref() {
        // A UTF-16 surrogate is not a valid `char`.
        let result = expand_character_references("foo&#xd800;bar");
        assert_eq!(result, Err(EntityError::InvalidCharacterRef));

        // A code that does not even fit in a `u32`.
        let result = expand_character_references("foo&#99999999999;bar");
        assert_eq!(result, Err(EntityError::InvalidCharacterRef));
    }

    #[test]
    fn test_expand_character_references_hex() {
        let result = expand_character_references("foo bar &#xfeff;");
        assert_eq!(result, Ok("foo bar \u{feff}".into()));
    }

    /// The terminating `;` is optional.
    #[test]
    fn test_expand_character_references_missing_semicolon() {
        let result = expand_character_references("foo bar &#xfeff");
        assert_eq!(result, Ok("foo bar \u{feff}".into()));
    }

    /// Input without any `&` is returned borrowed and the resolver is never called.
    #[test]
    fn test_expand_entities_noop() {
        let result = expand_entities("this string has no references", |_| -> Option<&str> {
            unreachable!()
        });
        assert!(matches!(result.unwrap(), Cow::Borrowed(_)));
    }

    #[test]
    fn test_expand_entities_lookup() {
        let result = expand_entities("test &foo;&bar.x; &baz&qu-ux\n", |key| match key {
            "foo" => Some("x"),
            "bar.x" => Some("y"),
            "baz" => Some("z"),
            "qu-ux" => Some("w"),
            x => panic!("unexpected reference: {:?}", x),
        });
        assert_eq!(result, Ok("test xy zw\n".into()));
    }

    #[test]
    fn test_expand_entities_invalid_entity() {
        let result = expand_entities("test &foo;&bar;", |key| match key {
            "foo" => Some("x"),
            "bar" => None,
            x => panic!("unexpected reference: {:?}", x),
        });
        assert_eq!(result, Err(EntityError::UndefinedEntity("bar".to_owned())));
    }

    /// `#` followed by a name (rather than digits) is handed to the resolver as an entity.
    #[test]
    fn test_expand_entities_invalid_function() {
        let mut called = false;
        let result = expand_entities("foo&#test;bar", |x| {
            called = true;
            assert_eq!(x, "#test");
            None::<&str>
        });
        assert!(called);
        assert_eq!(
            result,
            Err(EntityError::UndefinedEntity("#test".to_owned()))
        );
    }
}