use std::borrow::Cow;
use crate::error::ParseError;
use crate::output::Normalizer;
use crate::Span;
/// Decodes the raw character data of a text node.
///
/// Entity and character references are replaced by the characters they
/// stand for, and `\r` / `\r\n` line endings become `\n`. Tabs and newlines
/// are kept as they are. `base_position` is added to every position
/// reported in a returned [`ParseError`].
pub(crate) fn parse_text(content: Cow<str>, base_position: usize) -> Result<Cow<str>, ParseError> {
    let is_attribute = false;
    parse_content(content, is_attribute, base_position)
}
/// Decodes the raw character data of an attribute value.
///
/// Entity and character references are replaced by the characters they
/// stand for, and every literal tab, newline, `\r` or `\r\n` pair becomes a
/// single space. `base_position` is added to every position reported in a
/// returned [`ParseError`].
pub(crate) fn parse_attribute(
    content: Cow<str>,
    base_position: usize,
) -> Result<Cow<str>, ParseError> {
    let is_attribute = true;
    parse_content(content, is_attribute, base_position)
}
/// Decodes raw XML character data into the text it represents.
///
/// * The predefined entities (`&amp;`, `&apos;`, `&gt;`, `&lt;`, `&quot;`)
///   and numeric character references (`&#38;`, `&#x26;`) are replaced by
///   the character they denote.
/// * `\r` and `\r\n` are normalized to `\n`, or to a single space when
///   `attribute` is `true`.
/// * When `attribute` is `true`, literal tabs and newlines also become a
///   space. Characters produced by a reference are never normalized.
///
/// The input is handed back untouched (no allocation) when none of the
/// above applies.
///
/// # Errors
///
/// * [`ParseError::UnclosedEntity`] when an `&` is not followed by a `;`.
///   It carries everything after the `&` and the position of the `&`.
/// * [`ParseError::InvalidEntity`] for an unknown entity name or a malformed
///   or out-of-range character reference. It carries the offending name
///   (for numeric references: the text after the `#`) and the span from the
///   `&` up to and including the `;`.
///
/// All reported positions are byte offsets into `content` shifted by
/// `base_position`.
fn parse_content(
    content: Cow<str>,
    attribute: bool,
    base_position: usize,
) -> Result<Cow<str>, ParseError> {
    // These are the only characters that can make the output differ from the
    // input; without any of them the input is returned as is.
    let needs_rewrite =
        |c: char| c == '&' || c == '\r' || (attribute && (c == '\t' || c == '\n'));
    if !content.contains(needs_rewrite) {
        return Ok(content);
    }

    let text: &str = &content;
    let mut result = String::with_capacity(text.len());
    // Byte offset of the next character to read; always on a char boundary.
    let mut index = 0;
    while let Some(c) = text[index..].chars().next() {
        let position = index;
        index += c.len_utf8();
        match c {
            '\r' => {
                // "\r\n" counts as a single line ending.
                if text[index..].starts_with('\n') {
                    index += 1;
                }
                result.push(if attribute { ' ' } else { '\n' });
            }
            '&' => {
                let rest = &text[index..];
                let length = match rest.find(';') {
                    Some(length) => length,
                    None => {
                        return Err(ParseError::UnclosedEntity(
                            rest.to_string(),
                            base_position + position,
                        ))
                    }
                };
                let entity = &rest[..length];
                // Continue after the terminating ';'.
                index += length + 1;
                let end_position = index;
                let invalid = |name: &str| {
                    ParseError::InvalidEntity(
                        name.to_string(),
                        Span::new(base_position + position, base_position + end_position),
                    )
                };
                let decoded = match entity.strip_prefix('#') {
                    Some(reference) => {
                        decode_char_reference(reference).ok_or_else(|| invalid(reference))?
                    }
                    None => match entity {
                        "amp" => '&',
                        "apos" => '\'',
                        "gt" => '>',
                        "lt" => '<',
                        "quot" => '"',
                        _ => return Err(invalid(entity)),
                    },
                };
                result.push(decoded);
            }
            '\t' | '\n' if attribute => result.push(' '),
            _ => result.push(c),
        }
    }
    Ok(Cow::Owned(result))
}

/// Decodes the part of a numeric character reference that follows `&#`
/// (without the trailing `;`): decimal digits, or `x` followed by hex digits.
///
/// Returns `None` when the digits are missing or malformed, or when the
/// value is not a Unicode scalar value.
fn decode_char_reference(reference: &str) -> Option<char> {
    let (digits, radix) = match reference.strip_prefix('x') {
        Some(hex) => (hex, 16),
        None => (reference, 10),
    };
    // `from_str_radix` tolerates a leading '+', which a character reference
    // must not have, so the digits are validated up front.
    if digits.is_empty() || !digits.chars().all(|digit| digit.is_digit(radix)) {
        return None;
    }
    u32::from_str_radix(digits, radix)
        .ok()
        .and_then(std::char::from_u32)
}
/// Escapes text content so it can be written inside an XML element.
///
/// The content is first passed through `normalizer`. After that `&` becomes
/// `&amp;` and `<` becomes `&lt;`. A `>` becomes `&gt;` unless `unescaped_gt`
/// is `true`; even then it is still escaped when it directly follows `]]`,
/// so the output never contains the CDATA end marker `]]>`.
///
/// When nothing had to be escaped the normalized content is returned as is.
pub(crate) fn serialize_text<'a, N: Normalizer>(
    content: Cow<'a, str>,
    normalizer: &N,
    unescaped_gt: bool,
) -> Cow<'a, str> {
    let normalized_content = normalizer.normalize(content);
    let mut result = String::with_capacity(normalized_content.len());
    let mut change = false;
    for c in normalized_content.chars() {
        let escaped = match c {
            '&' => "&amp;",
            '<' => "&lt;",
            // None of the escapes end in ']', so looking at the output built
            // so far is the same as looking at the preceding input.
            '>' if !unescaped_gt || result.ends_with("]]") => "&gt;",
            _ => {
                result.push(c);
                continue;
            }
        };
        result.push_str(escaped);
        change = true;
    }
    if change {
        Cow::Owned(result)
    } else {
        normalized_content
    }
}
/// Wraps content in a CDATA section.
///
/// The content is first passed through `normalizer`. A CDATA section cannot
/// contain its own end marker `]]>`, so every occurrence is split across two
/// adjacent sections: the first one ends after the `]]`, the next one starts
/// with the `>`.
pub(crate) fn serialize_cdata<'a, N: Normalizer>(
    content: Cow<'a, str>,
    normalizer: &N,
) -> Cow<'a, str> {
    let normalized_content = normalizer.normalize(content);
    // Occurrences of "]]>" cannot overlap, so a plain left-to-right
    // replacement handles runs such as "]]]>" as well.
    let body = normalized_content.replace("]]>", "]]]]><![CDATA[>");

    let mut section = String::from("<![CDATA[");
    section.push_str(&body);
    section.push_str("]]>");
    Cow::Owned(section)
}
pub(crate) fn serialize_attribute<'a, N: Normalizer>(
content: Cow<'a, str>,
normalizer: &N,
) -> Cow<'a, str> {
let mut result = String::new();
let mut change = false;
let normalized_content = normalizer.normalize(content);
for c in normalized_content.chars() {
match c {
'&' => {
change = true;
result.push_str("&")
}
'<' => {
change = true;
result.push_str("<")
}
'\'' => {
change = true;
result.push_str("'")
}
'"' => {
change = true;
result.push_str(""")
}
_ => result.push(c),
}
}
if !change {
normalized_content
} else {
result.into()
}
}
/// Unit tests for entity parsing and for text, attribute and CDATA
/// serialization.
#[cfg(test)]
mod tests {
    use crate::output::NoopNormalizer;

    use super::*;

    #[test]
    fn test_parse() {
        let text = "A &amp; B";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "A & B");
    }

    #[test]
    fn test_parse_multiple() {
        let text = "&amp;&apos;&gt;&lt;&quot;";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "&'><\"");
    }

    #[test]
    fn test_parse_unknown_entity() {
        let text = "&unknown;";
        match parse_text(text.into(), 0) {
            Err(ParseError::InvalidEntity(entity, span)) => {
                assert_eq!(entity, "unknown");
                assert_eq!(span, Span::new(0, 9));
            }
            other => panic!("expected InvalidEntity, got {:?}", other),
        }
    }

    #[test]
    fn test_parse_unfinished_entity() {
        // The terminating ';' is missing on purpose.
        let text = "&amp";
        match parse_text(text.into(), 0) {
            Err(ParseError::UnclosedEntity(entity, position)) => {
                assert_eq!(entity, "amp");
                assert_eq!(position, 0);
            }
            other => panic!("expected UnclosedEntity, got {:?}", other),
        }
    }

    #[test]
    fn test_parse_no_entities() {
        let text = "hello";
        let result = parse_text(text.into(), 0).unwrap();
        // Unchanged input must be handed back without copying.
        assert!(std::ptr::eq(text, result.as_ref()));
    }

    #[test]
    fn test_parse_newline_r() {
        let text = "A \r B";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "A \n B");
    }

    #[test]
    fn test_parse_newline_rn() {
        let text = "A \r\n B";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "A \n B");
    }

    #[test]
    fn test_do_not_normalize_text_tab() {
        let text = "A \t B";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "A \t B");
    }

    #[test]
    fn test_do_not_normalize_text_newline() {
        let text = "A \n B";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "A \n B");
    }

    #[test]
    fn test_normalize_attribute_tab() {
        let text = "A \t B";
        assert_eq!(parse_attribute(text.into(), 0).unwrap(), "A   B");
    }

    #[test]
    fn test_normalize_attribute_r_newline() {
        let text = "A \r B";
        assert_eq!(parse_attribute(text.into(), 0).unwrap(), "A   B");
    }

    #[test]
    fn test_normalize_attribute_rn_newline() {
        let text = "A \r\n B";
        assert_eq!(parse_attribute(text.into(), 0).unwrap(), "A   B");
    }

    #[test]
    fn test_normalize_attribute_newline() {
        let text = "A \n B";
        assert_eq!(parse_attribute(text.into(), 0).unwrap(), "A   B");
    }

    #[test]
    fn test_serialize_text() {
        let text = "A & B";
        assert_eq!(
            serialize_text(text.into(), &NoopNormalizer, false),
            "A &amp; B"
        );
    }

    #[test]
    fn test_serialize_text_multiple() {
        // Quotes need no escaping in text content.
        let text = "&<'\">";
        assert_eq!(
            serialize_text(text.into(), &NoopNormalizer, false),
            "&amp;&lt;'\"&gt;"
        );
    }

    #[test]
    fn test_serialize_text_gt_escaped() {
        let text = ">";
        assert_eq!(serialize_text(text.into(), &NoopNormalizer, false), "&gt;");
    }

    #[test]
    fn test_serialize_text_gt_unescaped() {
        let text = ">";
        assert_eq!(serialize_text(text.into(), &NoopNormalizer, true), ">");
    }

    #[test]
    fn test_serialize_text_like_cdata_section_close_delimiter() {
        let text = "]]>";
        assert_eq!(
            serialize_text(text.into(), &NoopNormalizer, false),
            "]]&gt;"
        );
    }

    #[test]
    fn test_serialize_text_like_cdata_section_close_delimiter_gt_unescaped() {
        // Even with unescaped `>`, the CDATA end marker must not appear.
        let text = "]]>";
        assert_eq!(
            serialize_text(text.into(), &NoopNormalizer, true),
            "]]&gt;"
        );
    }

    #[test]
    fn test_serialize_text_like_cdata_section_close_delimiter_split() {
        let text = "]]extra>";
        assert_eq!(
            serialize_text(text.into(), &NoopNormalizer, true),
            "]]extra>"
        );
    }

    #[test]
    fn test_serialize_text_like_cdata_section_close_delimiter_prefix() {
        let text = "extra]]>";
        assert_eq!(
            serialize_text(text.into(), &NoopNormalizer, true),
            "extra]]&gt;"
        );
    }

    #[test]
    fn test_serialize_text_like_cdata_section_close_delimiter_postfix() {
        let text = "]]>extra";
        assert_eq!(
            serialize_text(text.into(), &NoopNormalizer, true),
            "]]&gt;extra"
        );
    }

    #[test]
    fn test_serialize_text_no_entities() {
        let text = "hello";
        let result = serialize_text(text.into(), &NoopNormalizer, false);
        assert!(std::ptr::eq(text, result.as_ref()));
    }

    #[test]
    fn test_serialize_attribute() {
        let text = "A & B";
        assert_eq!(
            serialize_attribute(text.into(), &NoopNormalizer),
            "A &amp; B"
        );
    }

    #[test]
    fn test_serialize_attribute_multiple_single() {
        let text = "&<'";
        assert_eq!(
            serialize_attribute(text.into(), &NoopNormalizer),
            "&amp;&lt;&apos;"
        );
    }

    #[test]
    fn test_serialize_attribute_multiple_double() {
        let text = "&<\"";
        assert_eq!(
            serialize_attribute(text.into(), &NoopNormalizer),
            "&amp;&lt;&quot;"
        );
    }

    #[test]
    fn test_serialize_attribute_no_entities() {
        let text = "hello";
        let result = serialize_attribute(text.into(), &NoopNormalizer);
        assert!(std::ptr::eq(text, result.as_ref()));
    }

    #[test]
    fn test_parse_character_hex_entity() {
        let text = "A &#x26; B";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "A & B");
    }

    #[test]
    fn test_parse_character_decimal_entity() {
        let text = "A &#38; B";
        assert_eq!(parse_text(text.into(), 0).unwrap(), "A & B");
    }

    #[test]
    fn test_parse_character_empty_entity() {
        let text = "A &#; B";
        assert!(parse_text(text.into(), 0).is_err());
    }

    #[test]
    fn test_parse_character_empty_hex_entity() {
        // A hex reference with no digits after the `x`.
        let text = "A &#x; B";
        assert!(parse_text(text.into(), 0).is_err());
    }

    #[test]
    fn test_parse_character_broken_hex_entity() {
        // A hex reference whose digits are not hexadecimal.
        let text = "A &#xflub; B";
        assert!(parse_text(text.into(), 0).is_err());
    }

    #[test]
    fn test_serialize_cdata_simple() {
        let text = "hello";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[hello]]>"
        );
    }

    #[test]
    fn test_serialize_cdata_end_sequence() {
        let text = "hello]]>world";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[hello]]]]><![CDATA[>world]]>"
        );
    }

    #[test]
    fn test_serialize_cdata_two_square_brackets() {
        let text = "hello]]world";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[hello]]world]]>"
        );
    }

    #[test]
    fn test_serialize_cdata_two_square_brackets_end() {
        let text = "hello]]";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[hello]]]]>"
        );
    }

    #[test]
    fn test_serialize_cdata_greater_than() {
        let text = ">";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[>]]>"
        );
    }

    #[test]
    fn test_serialize_cdata_special_sequence() {
        let text = "]]>";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[]]]]><![CDATA[>]]>"
        );
    }

    #[test]
    fn test_serialize_cdata_three_square_brackets() {
        let text = "hello]]]world";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[hello]]]world]]>"
        );
    }

    #[test]
    fn test_serialize_cdata_three_square_brackets_end_sequence() {
        let text = "hello]]]>world";
        assert_eq!(
            serialize_cdata(text.into(), &NoopNormalizer),
            "<![CDATA[hello]]]]]><![CDATA[>world]]>"
        );
    }
}