use std::borrow::Cow;
use std::collections::HashMap;
use crate::dom::{
Attribute, Document, Element, NodeId, NodeKind, ProcessingInstruction, QName, XmlDeclaration,
};
use crate::error::{XmlError, XmlResult};
use crate::namespace::NamespaceResolver;
/// Maps a general-entity name to its (unexpanded) replacement text.
type EntityMap = HashMap<String, String>;
/// Maps an entity name to the expanded text produced the first time it was resolved.
type EntityCache = HashMap<String, String>;
/// Default value of the parser's `max_depth` setting.
pub const DEFAULT_MAX_DEPTH: u32 = 128;
/// Default entity-expansion budget in bytes (`1 << 20`, i.e. 1 MiB).
pub const DEFAULT_MAX_ENTITY_EXPANSION: usize = 1 << 20;
/// Configurable XML parser; build one with [`Parser::new`] and the `with_*` methods.
pub struct Parser {
// When `true`, `parse` creates a `NamespaceResolver` and hands it to element parsing.
namespace_aware: bool,
// Limit forwarded to `parse_element`; presumably the maximum element nesting depth — confirm
// against `parse_element`, which is not visible here.
max_depth: u32,
// Total number of bytes entity expansion may produce during one `parse` call.
max_entity_expansion: usize,
}
impl Parser {
/// Creates a namespace-aware parser using `DEFAULT_MAX_DEPTH` and
/// `DEFAULT_MAX_ENTITY_EXPANSION`.
pub fn new() -> Self {
Parser {
namespace_aware: true,
max_depth: DEFAULT_MAX_DEPTH,
max_entity_expansion: DEFAULT_MAX_ENTITY_EXPANSION,
}
}
/// Creates a parser with namespace processing switched on or off and default limits.
pub fn with_namespace_aware(namespace_aware: bool) -> Self {
Parser {
namespace_aware,
max_depth: DEFAULT_MAX_DEPTH,
max_entity_expansion: DEFAULT_MAX_ENTITY_EXPANSION,
}
}
/// Builder-style setter: replaces the `max_depth` limit and returns the parser.
pub fn with_max_depth(mut self, max_depth: u32) -> Self {
self.max_depth = max_depth;
self
}
/// Builder-style setter: replaces the entity-expansion budget (in bytes) and returns the parser.
pub fn with_max_entity_expansion(mut self, max_bytes: usize) -> Self {
self.max_entity_expansion = max_bytes;
self
}
/// Parses `input` into a [`Document`] that borrows from `input`.
///
/// The steps are: skip an optional byte-order mark, parse an optional XML declaration,
/// parse the prolog (comments, processing instructions, DOCTYPE) via `parse_misc`, then
/// loop over the remaining top-level items, which may be comments, processing
/// instructions and exactly one root element.
///
/// # Errors
///
/// Returns a well-formedness error when a second root element is found, when anything
/// other than markup and whitespace appears at the top level, or when no root element
/// exists. Errors from the individual sub-parsers are propagated unchanged.
pub fn parse<'a>(&self, input: &'a str) -> XmlResult<Document<'a>> {
let mut cursor = Cursor::new(input);
let mut doc = Document::new();
doc.input = input;
// Pre-size the node storage using a rough guess of one node per 40 input bytes.
doc.nodes.reserve(input.len() / 40);
let mut ns_resolver = if self.namespace_aware {
Some(NamespaceResolver::new())
} else {
None
};
let mut entities = EntityMap::new();
let mut entity_cache = EntityCache::new();
// The budget is shared by every entity expansion performed during this call.
let mut entity_budget: usize = self.max_entity_expansion;
cursor.skip_bom();
// "<?xml" must be followed by whitespace; otherwise it is an ordinary processing
// instruction whose target merely begins with "xml".
if cursor.starts_with("<?xml ")
|| cursor.starts_with("<?xml\t")
|| cursor.starts_with("<?xml\r")
|| cursor.starts_with("<?xml\n")
{
let decl = parse_xml_declaration(&mut cursor)?;
doc.xml_declaration = Some(decl);
}
let root_id = doc.root();
parse_misc(
&mut cursor,
&mut doc,
root_id,
&mut entities,
&mut entity_budget,
)?;
let mut found_root = false;
while !cursor.is_eof() {
cursor.skip_whitespace();
if cursor.is_eof() {
break;
}
if cursor.starts_with("<!--") {
let start = cursor.pos;
let comment = parse_comment(&mut cursor)?;
let id = doc.alloc_node(NodeKind::Comment(comment), start);
doc.set_byte_end_pos(id, cursor.pos);
doc.append_child_unchecked(root_id, id);
} else if cursor.starts_with("<?") {
let start = cursor.pos;
let pi = parse_pi(&mut cursor)?;
let id = doc.alloc_node(NodeKind::ProcessingInstruction(pi), start);
doc.set_byte_end_pos(id, cursor.pos);
doc.append_child_unchecked(root_id, id);
} else if cursor.starts_with("<") {
// Any other markup is treated as the start of an element.
if found_root {
return Err(XmlError::well_formedness(
"Only one root element is allowed",
cursor.line(),
cursor.column(),
));
}
// The literal 0 and `self.max_depth` are presumably the starting depth and the
// depth limit — confirm against `parse_element`.
parse_element(
&mut cursor,
&mut doc,
root_id,
&mut ns_resolver,
&entities,
&mut entity_cache,
&mut entity_budget,
0,
self.max_depth,
)?;
found_root = true;
} else {
return Err(XmlError::well_formedness(
"Content found outside of root element",
cursor.line(),
cursor.column(),
));
}
}
if !found_root {
// NOTE(review): this error is reported at position (0, 0) rather than at the cursor.
return Err(XmlError::well_formedness(
"Document must have a root element",
0,
0,
));
}
doc.set_byte_end_pos(root_id, input.len());
Ok(doc)
}
}
impl Default for Parser {
fn default() -> Self {
Self::new()
}
}
/// Read position over the borrowed input text.
struct Cursor<'a> {
// The complete input being parsed.
input: &'a str,
// Byte offset into `input` of the next unread byte.
pos: usize,
}
impl<'a> Cursor<'a> {
/// Creates a cursor positioned at the start of `input`.
fn new(input: &'a str) -> Self {
Cursor { input, pos: 0 }
}
/// Returns the 1-based line number of the current position, computed by counting the
/// `\n` bytes before `pos`. This rescans the consumed input on every call.
#[inline(never)]
fn line(&self) -> usize {
self.input.as_bytes()[..self.pos]
.iter()
.filter(|&&b| b == b'\n')
.count()
+ 1
}
/// Returns the 1-based column of the current position, measured in bytes from the most
/// recent `\n` (or from the start of the input when there is none).
#[inline(never)]
fn column(&self) -> usize {
let bytes = &self.input.as_bytes()[..self.pos];
match bytes.iter().rposition(|&b| b == b'\n') {
Some(nl_pos) => self.pos - nl_pos,
None => self.pos + 1,
}
}
/// Returns `true` once the position has reached (or passed) the end of the input.
fn is_eof(&self) -> bool {
self.pos >= self.input.len()
}
/// Returns the unread part of the input.
fn remaining(&self) -> &'a str {
&self.input[self.pos..]
}
/// Returns the next character without consuming it, or `None` at end of input.
fn peek(&self) -> Option<char> {
self.remaining().chars().next()
}
/// Returns the next byte without consuming it, or `None` at end of input.
#[inline(always)]
fn peek_byte(&self) -> Option<u8> {
self.input.as_bytes().get(self.pos).copied()
}
/// Returns `true` when the unread input begins with `prefix`.
fn starts_with(&self, prefix: &str) -> bool {
self.remaining().starts_with(prefix)
}
/// Moves the position forward by `n` bytes; no bounds or char-boundary check is made.
#[inline(always)]
fn advance(&mut self, n: usize) {
self.pos += n;
}
/// Moves the position forward by `n` bytes. The body is identical to `advance`; the name
/// suggests callers use it for spans known to contain no newline.
#[inline(always)]
fn advance_no_newlines(&mut self, n: usize) {
self.pos += n;
}
/// Consumes and returns the next character, or returns `None` at end of input.
fn advance_char(&mut self) -> Option<char> {
let c = self.peek()?;
self.pos += c.len_utf8();
Some(c)
}
/// Skips a leading U+FEFF byte-order mark if the unread input starts with one.
fn skip_bom(&mut self) {
if self.remaining().starts_with('\u{FEFF}') {
self.pos += '\u{FEFF}'.len_utf8();
}
}
/// Skips any run of space, tab, line feed and carriage return bytes.
fn skip_whitespace(&mut self) {
let bytes = &self.input.as_bytes()[self.pos..];
let mut i = 0;
while i < bytes.len() {
match bytes[i] {
b' ' | b'\t' | b'\n' | b'\r' => i += 1,
_ => break,
}
}
self.pos += i;
}
/// Consumes `expected` if the unread input starts with it; otherwise returns a parse
/// error at the current position and consumes nothing.
fn expect(&mut self, expected: &str) -> XmlResult<()> {
if self.starts_with(expected) {
self.advance_no_newlines(expected.len());
Ok(())
} else {
Err(XmlError::parse(
format!("Expected '{}'", expected),
self.line(),
self.column(),
))
}
}
/// Returns the text before the first occurrence of `delimiter`, borrowed from the input,
/// and moves the position past the delimiter. Returns a parse error, consuming nothing,
/// when the delimiter does not occur.
fn read_until(&mut self, delimiter: &str) -> XmlResult<Cow<'a, str>> {
if let Some(idx) = self.remaining().find(delimiter) {
let text = &self.input[self.pos..self.pos + idx];
self.advance(idx + delimiter.len());
Ok(Cow::Borrowed(text))
} else {
Err(XmlError::parse(
format!("Expected '{}'", delimiter),
self.line(),
self.column(),
))
}
}
/// Same as `read_until`, but returns the text as a newly allocated `String`.
fn read_until_owned(&mut self, delimiter: &str) -> XmlResult<String> {
if let Some(idx) = self.remaining().find(delimiter) {
let text = self.remaining()[..idx].to_string();
self.advance(idx + delimiter.len());
Ok(text)
} else {
Err(XmlError::parse(
format!("Expected '{}'", delimiter),
self.line(),
self.column(),
))
}
}
}
/// Reports whether `c` is XML whitespace: space, tab, carriage return or line feed.
fn is_xml_whitespace(c: char) -> bool {
    c == ' ' || c == '\t' || c == '\r' || c == '\n'
}
/// Reports whether `c` may begin an XML name.
///
/// Accepts `:`, `_`, the ASCII letters and the non-ASCII code point ranges listed in the
/// match arms below.
fn is_name_start_char(c: char) -> bool {
    match u32::from(c) {
        // ':' and '_'
        0x3A | 0x5F => true,
        // 'A'..='Z' and 'a'..='z'
        0x41..=0x5A | 0x61..=0x7A => true,
        0xC0..=0xD6 | 0xD8..=0xF6 | 0xF8..=0x2FF => true,
        0x370..=0x37D | 0x37F..=0x1FFF => true,
        0x200C..=0x200D | 0x2070..=0x218F | 0x2C00..=0x2FEF => true,
        0x3001..=0xD7FF | 0xF900..=0xFDCF | 0xFDF0..=0xFFFD => true,
        0x1_0000..=0xE_FFFF => true,
        _ => false,
    }
}
/// Reports whether `c` may appear inside an XML name after its first character.
///
/// That is every name-start character plus `-`, `.`, the ASCII digits, U+00B7 and the
/// ranges U+0300..=U+036F and U+203F..=U+2040.
fn is_name_char(c: char) -> bool {
    match c {
        '-' | '.' | '0'..='9' | '\u{B7}' => true,
        '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}' => true,
        other => is_name_start_char(other),
    }
}
/// Reports whether `c` is a character this parser allows in document text: tab, line feed,
/// carriage return, U+0020..=U+D7FF, U+E000..=U+FFFD or U+10000..=U+10FFFF.
fn is_xml_char(c: char) -> bool {
    match u32::from(c) {
        0x9 | 0xA | 0xD => true,
        0x20..=0xD7FF => true,
        0xE000..=0xFFFD => true,
        0x1_0000..=0x10_FFFF => true,
        _ => false,
    }
}
/// Byte-level fast path: reports whether the ASCII byte `b` may begin an XML name
/// (a letter, `_` or `:`).
#[inline(always)]
fn is_ascii_name_start(b: u8) -> bool {
    b.is_ascii_alphabetic() || b == b'_' || b == b':'
}
/// Byte-level fast path: reports whether the ASCII byte `b` may appear inside an XML name
/// (a letter, a digit, `_`, `:`, `-` or `.`).
#[inline(always)]
fn is_ascii_name_char(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'_' | b':' | b'-' | b'.')
}
/// Consumes an XML name at the cursor and returns it borrowed from the input.
///
/// ASCII characters are classified with the byte-level helpers and everything else with
/// the full Unicode tables. Returns an "Expected XML name" parse error, consuming
/// nothing, when the cursor is at end of input or the next character cannot start a name.
fn parse_name<'a>(cursor: &mut Cursor<'a>) -> XmlResult<Cow<'a, str>> {
    let tail = cursor.remaining();
    let mut chars = tail.char_indices();

    let leads_name = match chars.next() {
        Some((_, c)) if c.is_ascii() => is_ascii_name_start(c as u8),
        Some((_, c)) => is_name_start_char(c),
        None => false,
    };
    if !leads_name {
        return Err(XmlError::parse(
            "Expected XML name",
            cursor.line(),
            cursor.column(),
        ));
    }

    // The name ends at the first character that is not a name character, or at end of input.
    let name_len = chars
        .find(|&(_, c)| {
            if c.is_ascii() {
                !is_ascii_name_char(c as u8)
            } else {
                !is_name_char(c)
            }
        })
        .map_or(tail.len(), |(offset, _)| offset);

    cursor.advance_no_newlines(name_len);
    Ok(Cow::Borrowed(&tail[..name_len]))
}
/// Re-borrows `slice` with the lifetime of `source` when that is possible.
///
/// When `source` is `Cow::Borrowed` and `slice` lies inside it, the result is a
/// `Cow::Borrowed` view of the same bytes carrying the longer `'a` lifetime. In every
/// other case (`source` is owned, or `slice` points outside the borrowed text) the result
/// is an owned copy of `slice`.
///
/// The containment check makes a `slice` that is not a sub-slice of `source` safe to pass:
/// it yields an owned copy instead of an arithmetic overflow or out-of-range slicing panic.
#[inline]
fn borrow_from_cow<'a>(source: &Cow<'a, str>, slice: &str) -> Cow<'a, str> {
    if let Cow::Borrowed(text) = source {
        let text: &'a str = *text;
        let base = text.as_ptr() as usize;
        let addr = slice.as_ptr() as usize;
        // `checked_sub` rejects slices that start before `text`; `str::get` rejects
        // ranges that run past its end.
        let reborrowed = addr.checked_sub(base).and_then(|offset| {
            let end = offset.checked_add(slice.len())?;
            text.get(offset..end)
        });
        if let Some(sub) = reborrowed {
            return Cow::Borrowed(sub);
        }
    }
    Cow::Owned(slice.to_string())
}
/// Splits a qualified name at its first `:` into `(prefix, local_name)`.
///
/// The prefix is `None`, and the whole input is returned as the local part, when there is
/// no colon or when either side of the first colon is empty.
fn split_qname(name: &str) -> (Option<&str>, &str) {
    match name.split_once(':') {
        Some((prefix, local)) if !prefix.is_empty() && !local.is_empty() => {
            (Some(prefix), local)
        }
        _ => (None, name),
    }
}
/// Parses an XML declaration: `<?xml version="..." [encoding="..."] [standalone="..."] ?>`.
///
/// `version` is mandatory and must be `1.0` or `1.1`. `encoding` and `standalone` are
/// optional, must appear in that order, and each must be separated from the previous
/// pseudo-attribute by whitespace.
///
/// # Errors
///
/// Returns a parse error when expected syntax (keyword, `=`, quote, whitespace, `?>`) is
/// missing, and a well-formedness error for an unsupported version, an invalid encoding
/// name or an invalid `standalone` value.
fn parse_xml_declaration<'a>(cursor: &mut Cursor<'a>) -> XmlResult<XmlDeclaration<'a>> {
cursor.expect("<?xml")?;
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::parse(
"Expected whitespace after '<?xml'",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
cursor.expect("version")?;
cursor.skip_whitespace();
cursor.expect("=")?;
cursor.skip_whitespace();
let version = parse_quoted_value(cursor)?;
if &*version != "1.0" && &*version != "1.1" {
return Err(XmlError::well_formedness(
format!("Invalid XML version: '{}'", version),
cursor.line(),
cursor.column(),
));
}
let mut encoding = None;
let mut standalone = None;
// Record whether whitespace follows the version literal before skipping it; the next
// pseudo-attribute is only legal if it does.
let has_ws_after_version = cursor.peek().map(is_xml_whitespace).unwrap_or(false);
cursor.skip_whitespace();
if cursor.starts_with("encoding") {
if !has_ws_after_version {
return Err(XmlError::parse(
"Expected whitespace before 'encoding'",
cursor.line(),
cursor.column(),
));
}
cursor.expect("encoding")?;
cursor.skip_whitespace();
cursor.expect("=")?;
cursor.skip_whitespace();
let enc = parse_quoted_value(cursor)?;
if !is_valid_encoding_name(&enc) {
return Err(XmlError::well_formedness(
format!("Invalid encoding name: '{}'", enc),
cursor.line(),
cursor.column(),
));
}
encoding = Some(enc);
// Same whitespace rule for `standalone` when it follows `encoding`.
let has_ws_after_encoding = cursor.peek().map(is_xml_whitespace).unwrap_or(false);
cursor.skip_whitespace();
if cursor.starts_with("standalone") {
if !has_ws_after_encoding {
return Err(XmlError::parse(
"Expected whitespace before 'standalone'",
cursor.line(),
cursor.column(),
));
}
let val = parse_standalone(cursor)?;
standalone = Some(val);
}
} else if cursor.starts_with("standalone") {
// `standalone` directly after `version`, with no `encoding` in between.
if !has_ws_after_version {
return Err(XmlError::parse(
"Expected whitespace before 'standalone'",
cursor.line(),
cursor.column(),
));
}
let val = parse_standalone(cursor)?;
standalone = Some(val);
}
cursor.skip_whitespace();
cursor.expect("?>")?;
Ok(XmlDeclaration {
version,
encoding,
standalone,
})
}
/// Checks an encoding name from the XML declaration: it must start with an ASCII letter
/// and continue with ASCII letters, ASCII digits, `.`, `_` or `-` only.
fn is_valid_encoding_name(name: &str) -> bool {
    let mut rest = name.chars();
    let leads_with_letter = matches!(rest.next(), Some(first) if first.is_ascii_alphabetic());
    leads_with_letter && rest.all(|c| c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-'))
}
/// Parses `standalone = "yes" | "no"` from an XML declaration and returns `true` for `yes`.
///
/// Any other quoted value is a well-formedness error; missing syntax is reported by the
/// `expect` / quoted-value helpers.
fn parse_standalone(cursor: &mut Cursor) -> XmlResult<bool> {
    cursor.expect("standalone")?;
    cursor.skip_whitespace();
    cursor.expect("=")?;
    cursor.skip_whitespace();
    let literal = parse_quoted_value(cursor)?;
    match &*literal {
        "yes" => Ok(true),
        "no" => Ok(false),
        other => Err(XmlError::well_formedness(
            format!(
                "Invalid standalone value: '{}' (must be 'yes' or 'no')",
                other
            ),
            cursor.line(),
            cursor.column(),
        )),
    }
}
/// Deducts `n` bytes from the remaining entity-expansion `budget`.
///
/// When `n` exceeds what is left, the budget is left untouched and a parse error at
/// (`line`, `col`) that mentions the remaining byte count is returned.
#[inline]
fn charge_entity_budget(budget: &mut usize, n: usize, line: usize, col: usize) -> XmlResult<()> {
    if n > *budget {
        return Err(XmlError::parse(
            format!(
                "Entity expansion exceeds configured limit ({} bytes remaining)",
                *budget
            ),
            line,
            col,
        ));
    }
    *budget -= n;
    Ok(())
}
fn parse_quoted_value<'a>(cursor: &mut Cursor<'a>) -> XmlResult<Cow<'a, str>> {
let mut budget = DEFAULT_MAX_ENTITY_EXPANSION;
parse_quoted_value_with_entities(
cursor,
&HashMap::new(),
&mut EntityCache::new(),
&mut budget,
)
}
/// Parses a single- or double-quoted value (such as an attribute value) at the cursor,
/// resolving character and entity references.
///
/// Fast path: when the value contains no `&`, the text between the quotes is returned
/// borrowed from the input. Otherwise the value is assembled into an owned `String`,
/// with each reference resolved through `parse_reference_with_entities`.
///
/// Every literal character is checked with `is_xml_char`. That includes the prefix the
/// fast scan covers before it stops at a `&` or `<`, so an invalid character placed ahead
/// of the first reference is rejected as well.
///
/// # Errors
///
/// * parse error when the cursor is not at a quote character;
/// * `XmlError::UnexpectedEof` when the closing quote is missing;
/// * well-formedness error for an invalid XML character or a literal `<`;
/// * any error produced while resolving a reference.
fn parse_quoted_value_with_entities<'a>(
    cursor: &mut Cursor<'a>,
    entities: &EntityMap,
    entity_cache: &mut EntityCache,
    budget: &mut usize,
) -> XmlResult<Cow<'a, str>> {
    let quote = match cursor.peek() {
        Some('"') => '"',
        Some('\'') => '\'',
        _ => {
            return Err(XmlError::parse(
                "Expected quote character",
                cursor.line(),
                cursor.column(),
            ));
        }
    };
    cursor.advance_char();
    let start = cursor.pos;
    let bytes = cursor.input.as_bytes();
    let qb = quote as u8;

    // The scan reports how many bytes precede the first delimiter and whether any of them
    // needs the slower per-character validation.
    let (advance, has_non_ascii_or_control) =
        crate::simd::scan_attr_delimiters(&bytes[cursor.pos..], qb);
    let fast_end = cursor.pos + advance;
    if fast_end >= bytes.len() {
        return Err(XmlError::UnexpectedEof);
    }

    // Validate the scanned prefix no matter which delimiter ended the scan.
    let prefix = &cursor.input[start..fast_end];
    if has_non_ascii_or_control {
        if let Some(bad) = prefix.chars().find(|&c| !is_xml_char(c)) {
            return Err(XmlError::well_formedness(
                format!("Invalid XML character U+{:04X}", bad as u32),
                cursor.line(),
                cursor.column(),
            ));
        }
    }

    if bytes[fast_end] == qb {
        // No references: hand back the input slice without copying.
        cursor.pos = fast_end + 1;
        return Ok(Cow::Borrowed(prefix));
    }

    // Slow path: accumulate literal chunks and resolved references.
    let mut value = String::from(prefix);
    cursor.advance(fast_end - cursor.pos);
    loop {
        let bytes = cursor.input.as_bytes();
        let scan_start = cursor.pos;
        let mut scan_pos = scan_start;
        while scan_pos < bytes.len() {
            let b = bytes[scan_pos];
            if b == qb || b == b'&' || b == b'<' {
                break;
            }
            scan_pos += 1;
        }
        if scan_pos > scan_start {
            let chunk = &cursor.input[scan_start..scan_pos];
            if let Some(bad) = chunk.chars().find(|&c| !is_xml_char(c)) {
                return Err(XmlError::well_formedness(
                    format!("Invalid XML character U+{:04X}", bad as u32),
                    cursor.line(),
                    cursor.column(),
                ));
            }
            value.push_str(chunk);
            cursor.advance(scan_pos - scan_start);
        }
        match cursor.peek_byte() {
            None => return Err(XmlError::UnexpectedEof),
            Some(b) if b == qb => {
                cursor.advance_no_newlines(1);
                break;
            }
            Some(b'&') => {
                let resolved =
                    parse_reference_with_entities(cursor, entities, entity_cache, budget)?;
                value.push_str(&resolved);
            }
            Some(b'<') => {
                return Err(XmlError::well_formedness(
                    "'<' not allowed in attribute values",
                    cursor.line(),
                    cursor.column(),
                ));
            }
            // The scan loop above only stops at the quote, '&', '<' or end of input.
            Some(_) => unreachable!(),
        }
    }
    Ok(Cow::Owned(value))
}
fn parse_reference(cursor: &mut Cursor) -> XmlResult<String> {
let mut budget = DEFAULT_MAX_ENTITY_EXPANSION;
parse_reference_with_entities(
cursor,
&HashMap::new(),
&mut EntityCache::new(),
&mut budget,
)
}
/// Resolves the reference at the cursor (which must be at `&`) and returns its text.
///
/// * `&#N;` / `&#xH;`: decimal or hexadecimal character reference. The digits must be
///   one or more ASCII digits of the chosen radix; a sign is rejected even though Rust's
///   integer parsers would accept a leading `+`. The resulting character must also pass
///   `is_xml_char`.
/// * `&lt;` `&gt;` `&amp;` `&apos;` `&quot;`: the five predefined entities.
/// * any other name: looked up in `entity_cache` first, then expanded from `entities`,
///   validated as well-formed content, and cached.
///
/// # Errors
///
/// Returns `XmlError::UnexpectedEof` for a character reference with no `;`, a parse error
/// for malformed or out-of-range character references and for an exhausted expansion
/// budget, and a well-formedness error for disallowed characters, unknown entities,
/// circular entities and entity text that is not well-formed content.
fn parse_reference_with_entities(
    cursor: &mut Cursor,
    entities: &EntityMap,
    entity_cache: &mut EntityCache,
    budget: &mut usize,
) -> XmlResult<String> {
    cursor.expect("&")?;
    if cursor.peek_byte() == Some(b'#') {
        cursor.advance_no_newlines(1);
        let is_hex = cursor.peek_byte() == Some(b'x');
        if is_hex {
            cursor.advance_no_newlines(1);
        }
        let start = cursor.pos;
        let bytes = cursor.input.as_bytes();
        let mut end = start;
        while end < bytes.len() && bytes[end] != b';' {
            end += 1;
        }
        if end >= bytes.len() {
            return Err(XmlError::UnexpectedEof);
        }
        let digits = &cursor.input[start..end];
        // Consume the digits and the ';' before validating, so errors point past the reference.
        cursor.advance_no_newlines(end - start + 1);

        let radix = if is_hex { 16 } else { 10 };
        // Check the alphabet explicitly: `from_str_radix` alone would let "+41" through.
        let only_digits =
            !digits.is_empty() && digits.bytes().all(|b| char::from(b).is_digit(radix));
        let parsed = if only_digits {
            u32::from_str_radix(digits, radix).ok()
        } else {
            None
        };
        let code = match parsed {
            Some(code) => code,
            None => {
                let message = if is_hex {
                    format!("Invalid hex character reference: {}", digits)
                } else {
                    format!("Invalid decimal character reference: {}", digits)
                };
                return Err(XmlError::parse(message, cursor.line(), cursor.column()));
            }
        };
        let c = char::from_u32(code).ok_or_else(|| {
            XmlError::parse(
                format!("Invalid character reference: U+{:04X}", code),
                cursor.line(),
                cursor.column(),
            )
        })?;
        if !is_xml_char(c) {
            return Err(XmlError::well_formedness(
                format!(
                    "Character reference U+{:04X} is not a valid XML character",
                    code
                ),
                cursor.line(),
                cursor.column(),
            ));
        }
        Ok(c.to_string())
    } else {
        let name = parse_name(cursor)?;
        cursor.expect(";")?;
        match &*name {
            "lt" => Ok("<".to_string()),
            "gt" => Ok(">".to_string()),
            "amp" => Ok("&".to_string()),
            "apos" => Ok("'".to_string()),
            "quot" => Ok("\"".to_string()),
            _ => {
                // A cache hit still costs budget: the cached text is emitted again.
                if let Some(cached) = entity_cache.get(&*name) {
                    charge_entity_budget(budget, cached.len(), cursor.line(), cursor.column())?;
                    return Ok(cached.clone());
                }
                if let Some(value) = entities.get(&*name) {
                    let expanded = expand_entity_value(
                        value,
                        entities,
                        &mut vec![name.to_string()],
                        budget,
                        cursor.line(),
                        cursor.column(),
                    )?;
                    // Second expansion that keeps the predefined entities escaped, so the
                    // replacement text can be checked as markup.
                    // NOTE(review): this pass charges the same budget again, so the first
                    // use of an entity costs roughly twice its expanded size — confirm
                    // that this is intended.
                    let validation_text = expand_entity_value_no_builtins(
                        value,
                        entities,
                        &mut vec![name.to_string()],
                        budget,
                        cursor.line(),
                        cursor.column(),
                    )?;
                    validate_entity_as_content(
                        &validation_text,
                        entities,
                        cursor.line(),
                        cursor.column(),
                    )?;
                    entity_cache.insert(name.to_string(), expanded.clone());
                    Ok(expanded)
                } else {
                    Err(XmlError::well_formedness(
                        format!("Unknown entity reference: &{};", name),
                        cursor.line(),
                        cursor.column(),
                    ))
                }
            }
        }
    }
}
/// Recursively expands the entity references inside an entity's replacement text.
///
/// * `<![CDATA[ ... ]]>` sections are copied verbatim.
/// * Character references (`&#...;`) are copied unchanged, not decoded.
/// * The five predefined entities are replaced by their literal character.
/// * Other named references are looked up in `entities` and expanded recursively; `seen`
///   holds the names currently being expanded and is used to detect cycles.
/// * A `&` with no following `;` is copied as a literal `&`.
///
/// Every byte appended to the result is charged against `budget`. `line` and `col` are
/// only used to position errors.
///
/// # Errors
///
/// Returns a well-formedness error for a circular or unknown entity reference, and the
/// error from `charge_entity_budget` when the budget runs out.
fn expand_entity_value(
value: &str,
entities: &EntityMap,
seen: &mut Vec<String>,
budget: &mut usize,
line: usize,
col: usize,
) -> XmlResult<String> {
let mut result = String::new();
let mut pos = 0;
let bytes = value.as_bytes();
while pos < bytes.len() {
if value[pos..].starts_with("<![CDATA[") {
// An unterminated CDATA opener falls through and is copied character by character.
if let Some(end) = value[pos..].find("]]>") {
let cdata_end = pos + end + 3;
charge_entity_budget(budget, cdata_end - pos, line, col)?;
result.push_str(&value[pos..cdata_end]);
pos = cdata_end;
continue;
}
}
if bytes[pos] == b'&' {
// `semi` is the offset of ';' relative to the byte after '&'.
if let Some(semi) = value[pos + 1..].find(';') {
let ref_content = &value[pos + 1..pos + 1 + semi];
if ref_content.starts_with('#') {
// `semi + 2` is the full length of the reference, including '&' and ';'.
charge_entity_budget(budget, semi + 2, line, col)?;
result.push_str(&value[pos..pos + 2 + semi]);
pos = pos + 2 + semi;
} else {
match ref_content {
"lt" => {
charge_entity_budget(budget, 1, line, col)?;
result.push('<');
pos = pos + 2 + semi;
}
"gt" => {
charge_entity_budget(budget, 1, line, col)?;
result.push('>');
pos = pos + 2 + semi;
}
"amp" => {
charge_entity_budget(budget, 1, line, col)?;
result.push('&');
pos = pos + 2 + semi;
}
"apos" => {
charge_entity_budget(budget, 1, line, col)?;
result.push('\'');
pos = pos + 2 + semi;
}
"quot" => {
charge_entity_budget(budget, 1, line, col)?;
result.push('"');
pos = pos + 2 + semi;
}
_ => {
let ref_name = ref_content.to_string();
if seen.contains(&ref_name) {
return Err(XmlError::well_formedness(
format!("Circular entity reference: &{};", ref_name),
line,
col,
));
}
if let Some(ref_value) = entities.get(&ref_name) {
// Push/pop keeps `seen` equal to the current expansion path only.
seen.push(ref_name);
let expanded = expand_entity_value(
ref_value, entities, seen, budget, line, col,
)?;
seen.pop();
result.push_str(&expanded);
} else {
return Err(XmlError::well_formedness(
format!("Unknown entity reference: &{};", ref_name),
line,
col,
));
}
pos = pos + 2 + semi;
}
}
}
} else {
charge_entity_budget(budget, 1, line, col)?;
result.push('&');
pos += 1;
}
} else {
let c = value[pos..].chars().next().unwrap();
charge_entity_budget(budget, c.len_utf8(), line, col)?;
result.push(c);
pos += c.len_utf8();
}
}
Ok(result)
}
/// Variant of `expand_entity_value` that leaves the five predefined entity references
/// (`&lt;`, `&gt;`, `&amp;`, `&apos;`, `&quot;`) in their escaped form.
///
/// CDATA sections and character references are copied verbatim, user-defined entities
/// are expanded recursively with cycle detection via `seen`, and a `&` with no following
/// `;` is copied literally. Every byte appended to the result is charged against `budget`.
///
/// # Errors
///
/// Returns a well-formedness error for a circular or unknown entity reference, and the
/// error from `charge_entity_budget` when the budget runs out.
fn expand_entity_value_no_builtins(
value: &str,
entities: &EntityMap,
seen: &mut Vec<String>,
budget: &mut usize,
line: usize,
col: usize,
) -> XmlResult<String> {
let mut result = String::new();
let mut pos = 0;
let bytes = value.as_bytes();
while pos < bytes.len() {
if value[pos..].starts_with("<![CDATA[") {
// An unterminated CDATA opener falls through and is copied character by character.
if let Some(end) = value[pos..].find("]]>") {
let cdata_end = pos + end + 3;
charge_entity_budget(budget, cdata_end - pos, line, col)?;
result.push_str(&value[pos..cdata_end]);
pos = cdata_end;
continue;
}
}
if bytes[pos] == b'&' {
// `semi` is the offset of ';' relative to the byte after '&'.
if let Some(semi) = value[pos + 1..].find(';') {
let ref_content = &value[pos + 1..pos + 1 + semi];
if ref_content.starts_with('#') {
charge_entity_budget(budget, semi + 2, line, col)?;
result.push_str(&value[pos..pos + 2 + semi]);
pos = pos + 2 + semi;
} else {
match ref_content {
// Predefined entities stay escaped: the whole reference text is copied.
"lt" | "gt" | "amp" | "apos" | "quot" => {
charge_entity_budget(budget, semi + 2, line, col)?;
result.push_str(&value[pos..pos + 2 + semi]);
pos = pos + 2 + semi;
}
_ => {
let ref_name = ref_content.to_string();
if seen.contains(&ref_name) {
return Err(XmlError::well_formedness(
format!("Circular entity reference: &{};", ref_name),
line,
col,
));
}
if let Some(ref_value) = entities.get(&ref_name) {
// Push/pop keeps `seen` equal to the current expansion path only.
seen.push(ref_name);
let expanded = expand_entity_value_no_builtins(
ref_value, entities, seen, budget, line, col,
)?;
seen.pop();
result.push_str(&expanded);
} else {
return Err(XmlError::well_formedness(
format!("Unknown entity reference: &{};", ref_name),
line,
col,
));
}
pos = pos + 2 + semi;
}
}
}
} else {
charge_entity_budget(budget, 1, line, col)?;
result.push('&');
pos += 1;
}
} else {
let c = value[pos..].chars().next().unwrap();
charge_entity_budget(budget, c.len_utf8(), line, col)?;
result.push(c);
pos += c.len_utf8();
}
}
Ok(result)
}
/// Checks that `text` is well-formed element content.
///
/// The text is wrapped in a synthetic `<__entity_wrapper__>` element and parsed with a
/// fresh non-namespace-aware parser. Any parse failure is reported as a single
/// well-formedness error at (`line`, `col`); the underlying error is discarded.
/// `_entities` is accepted but not used.
fn validate_entity_as_content(
    text: &str,
    _entities: &EntityMap,
    line: usize,
    col: usize,
) -> XmlResult<()> {
    const OPEN_TAG: &str = "<__entity_wrapper__>";
    const CLOSE_TAG: &str = "</__entity_wrapper__>";

    let mut probe = String::with_capacity(OPEN_TAG.len() + text.len() + CLOSE_TAG.len());
    probe.push_str(OPEN_TAG);
    probe.push_str(text);
    probe.push_str(CLOSE_TAG);

    let well_formed = Parser::with_namespace_aware(false).parse(&probe).is_ok();
    if well_formed {
        Ok(())
    } else {
        Err(XmlError::well_formedness(
            "Entity replacement text is not well-formed content",
            line,
            col,
        ))
    }
}
/// Parses `<!-- ... -->` at the cursor and returns the comment body borrowed from the input.
///
/// The body must not contain `--`, must not end with `-`, and every character must pass
/// `is_xml_char`. Violations are well-formedness errors positioned after the comment.
fn parse_comment<'a>(cursor: &mut Cursor<'a>) -> XmlResult<Cow<'a, str>> {
    cursor.expect("<!--")?;
    let body = cursor.read_until("-->")?;

    // Work out the first violated rule, if any, and then build a single error from it.
    let problem: Option<String> = if body.contains("--") {
        Some("Comments must not contain '--'".to_string())
    } else if body.ends_with('-') {
        Some("Comments must not end with '-'".to_string())
    } else {
        body.chars()
            .find(|&c| !is_xml_char(c))
            .map(|c| format!("Invalid XML character U+{:04X} in comment", c as u32))
    };

    match problem {
        Some(message) => Err(XmlError::well_formedness(
            message,
            cursor.line(),
            cursor.column(),
        )),
        None => Ok(body),
    }
}
fn parse_pi<'a>(cursor: &mut Cursor<'a>) -> XmlResult<ProcessingInstruction<'a>> {
cursor.expect("<?")?;
let target = parse_name(cursor)?;
if target.eq_ignore_ascii_case("xml") {
return Err(XmlError::well_formedness(
"Processing instruction target must not be 'xml'",
cursor.line(),
cursor.column(),
));
}
if cursor.starts_with("?>") {
cursor.expect("?>")?;
return Ok(ProcessingInstruction { target, data: None });
}
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::parse(
"Expected whitespace after PI target",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
let data = cursor.read_until("?>")?;
for c in data.chars() {
if !is_xml_char(c) {
return Err(XmlError::well_formedness(
format!(
"Invalid XML character U+{:04X} in processing instruction",
c as u32
),
cursor.line(),
cursor.column(),
));
}
}
Ok(ProcessingInstruction {
target,
data: Some(data),
})
}
fn parse_misc<'a>(
cursor: &mut Cursor<'a>,
doc: &mut Document<'a>,
parent: NodeId,
entities: &mut EntityMap,
entity_budget: &mut usize,
) -> XmlResult<()> {
loop {
cursor.skip_whitespace();
if cursor.is_eof() {
break;
}
if cursor.starts_with("<!--") {
let start = cursor.pos;
let comment = parse_comment(cursor)?;
let id = doc.alloc_node(NodeKind::Comment(comment), start);
doc.set_byte_end_pos(id, cursor.pos);
doc.append_child_unchecked(parent, id);
} else if cursor.starts_with("<?") {
let start = cursor.pos;
let pi = parse_pi(cursor)?;
let id = doc.alloc_node(NodeKind::ProcessingInstruction(pi), start);
doc.set_byte_end_pos(id, cursor.pos);
doc.append_child_unchecked(parent, id);
} else if cursor.starts_with("<!DOCTYPE") {
parse_doctype(cursor, doc, entities, entity_budget)?;
} else {
break;
}
}
Ok(())
}
/// Parses a `<!DOCTYPE ...>` declaration and stores its raw source text in `doc.doctype`.
///
/// Accepted form: `<!DOCTYPE name [SYSTEM "sys" | PUBLIC "pub" "sys"] [ '[' subset ']' ] >`.
/// The root name and the external identifiers are validated but not kept. Entity
/// declarations in the internal subset are collected into `entities`.
///
/// # Errors
///
/// Returns a well-formedness error when mandatory whitespace is missing, and propagates
/// errors from the name, literal and internal-subset parsers and from `expect`.
fn parse_doctype<'a>(
cursor: &mut Cursor<'a>,
doc: &mut Document<'a>,
entities: &mut EntityMap,
entity_budget: &mut usize,
) -> XmlResult<()> {
// Remember where the declaration begins so the whole span can be captured at the end.
let start_pos = cursor.pos;
cursor.expect("<!DOCTYPE")?;
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace after '<!DOCTYPE'",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_name(cursor)?;
cursor.skip_whitespace();
if cursor.starts_with("SYSTEM") {
// 6 == "SYSTEM".len()
cursor.advance(6);
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace after 'SYSTEM'",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_system_literal(cursor)?;
cursor.skip_whitespace();
} else if cursor.starts_with("PUBLIC") {
// 6 == "PUBLIC".len()
cursor.advance(6);
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace after 'PUBLIC'",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_pubid_literal(cursor)?;
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace between public and system literal",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_system_literal(cursor)?;
cursor.skip_whitespace();
}
// Optional internal subset in square brackets.
if cursor.peek() == Some('[') {
cursor.advance_char();
parse_internal_subset(cursor, entities, entity_budget)?;
cursor.expect("]")?;
cursor.skip_whitespace();
}
cursor.expect(">")?;
doc.doctype = Some(Cow::Borrowed(&cursor.input[start_pos..cursor.pos]));
Ok(())
}
/// Parses a quoted system literal and returns the text between the quotes.
///
/// The literal may be delimited by `"` or `'` and runs to the next occurrence of the same
/// quote; its contents are not validated. Returns a parse error when the cursor is not
/// at a quote and `XmlError::UnexpectedEof` when the closing quote is missing.
fn parse_system_literal(cursor: &mut Cursor) -> XmlResult<String> {
    let quote = match cursor.peek() {
        Some(q) if q == '"' || q == '\'' => q,
        _ => {
            return Err(XmlError::parse(
                "Expected quote for system literal",
                cursor.line(),
                cursor.column(),
            ));
        }
    };
    cursor.advance_char();

    let rest = cursor.remaining();
    match rest.find(quote) {
        Some(len) => {
            let literal = rest[..len].to_string();
            cursor.advance(len + quote.len_utf8());
            Ok(literal)
        }
        None => {
            // Leave the cursor at end of input, as a character-by-character scan would.
            cursor.advance(rest.len());
            Err(XmlError::UnexpectedEof)
        }
    }
}
/// Parses a quoted public identifier literal and returns the text between the quotes.
///
/// Every character before the closing quote must satisfy `is_pubid_char`. On an invalid
/// character the cursor is left on that character and a well-formedness error is
/// returned. Returns a parse error when the cursor is not at a quote and
/// `XmlError::UnexpectedEof` when the closing quote is missing.
fn parse_pubid_literal(cursor: &mut Cursor) -> XmlResult<String> {
    let quote = match cursor.peek() {
        Some(q) if q == '"' || q == '\'' => q,
        _ => {
            return Err(XmlError::parse(
                "Expected quote for public ID literal",
                cursor.line(),
                cursor.column(),
            ));
        }
    };
    cursor.advance_char();

    let rest = cursor.remaining();
    for (offset, c) in rest.char_indices() {
        if c == quote {
            let literal = rest[..offset].to_string();
            cursor.advance(offset + c.len_utf8());
            return Ok(literal);
        }
        if !is_pubid_char(c) {
            cursor.advance(offset);
            return Err(XmlError::well_formedness(
                format!("Invalid character in public ID: U+{:04X}", c as u32),
                cursor.line(),
                cursor.column(),
            ));
        }
    }

    cursor.advance(rest.len());
    Err(XmlError::UnexpectedEof)
}
/// Reports whether `c` is allowed inside a public identifier literal: ASCII letters and
/// digits, space, carriage return, line feed, and the punctuation ``-'()+,./:=?;!*#@$_%``.
fn is_pubid_char(c: char) -> bool {
    const WHITESPACE_AND_PUNCTUATION: &str = " \r\n-'()+,./:=?;!*#@$_%";
    c.is_ascii_alphanumeric() || WHITESPACE_AND_PUNCTUATION.contains(c)
}
/// Parses the contents of a DOCTYPE internal subset, up to but not including the closing `]`.
///
/// Each item is dispatched by its opening token: comments, processing instructions,
/// `<!ELEMENT`, `<!ATTLIST`, `<!ENTITY`, `<!NOTATION` declarations and `%name;`
/// parameter-entity references. Conditional sections (`<![`) and any other content are
/// well-formedness errors. End of input before `]` yields `XmlError::UnexpectedEof`.
fn parse_internal_subset(
    cursor: &mut Cursor,
    entities: &mut EntityMap,
    entity_budget: &mut usize,
) -> XmlResult<()> {
    loop {
        cursor.skip_whitespace();
        if cursor.is_eof() {
            return Err(XmlError::UnexpectedEof);
        }
        let next = cursor.peek();
        if next == Some(']') {
            // The caller consumes the closing bracket.
            return Ok(());
        }

        if cursor.starts_with("<!--") {
            parse_comment(cursor)?;
        } else if cursor.starts_with("<?") {
            parse_pi_in_dtd(cursor)?;
        } else if cursor.starts_with("<!ELEMENT") {
            parse_element_decl(cursor)?;
        } else if cursor.starts_with("<!ATTLIST") {
            parse_attlist_decl(cursor, entities, entity_budget)?;
        } else if cursor.starts_with("<!ENTITY") {
            parse_entity_decl(cursor, entities)?;
        } else if cursor.starts_with("<!NOTATION") {
            parse_notation_decl(cursor)?;
        } else if cursor.starts_with("<![") {
            return Err(XmlError::well_formedness(
                "Conditional sections not allowed in internal subset",
                cursor.line(),
                cursor.column(),
            ));
        } else if next == Some('%') {
            parse_pe_reference(cursor)?;
        } else {
            return Err(XmlError::well_formedness(
                format!(
                    "Unexpected character '{}' in internal subset",
                    next.unwrap_or('\0')
                ),
                cursor.line(),
                cursor.column(),
            ));
        }
    }
}
fn parse_pi_in_dtd(cursor: &mut Cursor) -> XmlResult<()> {
cursor.expect("<?")?;
let target = parse_name(cursor)?;
if target.eq_ignore_ascii_case("xml") {
return Err(XmlError::well_formedness(
"Processing instruction target must not be 'xml'",
cursor.line(),
cursor.column(),
));
}
if cursor.starts_with("?>") {
cursor.expect("?>")?;
return Ok(());
}
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::parse(
"Expected whitespace after PI target",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
cursor.read_until_owned("?>")?;
Ok(())
}
/// Parses a parameter-entity reference `%name;` and returns the entity name as an owned
/// string.
fn parse_pe_reference(cursor: &mut Cursor) -> XmlResult<String> {
    cursor.expect("%")?;
    let entity_name = parse_name(cursor)?;
    cursor.expect(";")?;
    Ok(String::from(entity_name))
}
/// Always fails: builds the well-formedness error for a parameter-entity reference found
/// inside a markup declaration of the internal subset, positioned at the cursor.
fn reject_pe_in_markup_decl(cursor: &Cursor) -> XmlResult<()> {
    let (line, column) = (cursor.line(), cursor.column());
    Err(XmlError::well_formedness(
        "Parameter entity reference not allowed within markup declaration in internal subset",
        line,
        column,
    ))
}
fn parse_element_decl(cursor: &mut Cursor) -> XmlResult<()> {
cursor.expect("<!ELEMENT")?;
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace after '<!ELEMENT'",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_name(cursor)?;
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace after element name in ELEMENT declaration",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_content_spec(cursor)?;
cursor.skip_whitespace();
cursor.expect(">")?;
Ok(())
}
/// Parses the content specification of an `<!ELEMENT>` declaration: the keyword `EMPTY`,
/// the keyword `ANY`, or a parenthesised content model.
///
/// A parameter-entity reference at this position is rejected; anything else is a
/// well-formedness error.
fn parse_content_spec(cursor: &mut Cursor) -> XmlResult<()> {
    for keyword in ["EMPTY", "ANY"].iter() {
        if cursor.starts_with(keyword) {
            cursor.advance(keyword.len());
            return Ok(());
        }
    }
    match cursor.peek() {
        Some('(') => parse_content_model(cursor),
        Some('%') => reject_pe_in_markup_decl(cursor),
        _ => Err(XmlError::well_formedness(
            "Expected content specification (EMPTY, ANY, or content model)",
            cursor.line(),
            cursor.column(),
        )),
    }
}
/// Parses the outermost parenthesised group of an element content model.
///
/// Two forms are accepted:
///
/// * Mixed content: `(#PCDATA)`, optionally followed by `*`, or
///   `(#PCDATA | name | ...)*`, where the trailing `*` is mandatory and the alternatives
///   must be plain names without occurrence indicators or nested groups.
/// * Element content: one or more content particles separated consistently by `,` or by
///   `|`, with an optional `*`, `+` or `?` after the closing parenthesis.
///
/// Nothing is built from the model; it is only checked for syntax.
/// NOTE(review): nested groups recurse through `parse_cp` / `parse_children_group` with no
/// visible depth limit.
fn parse_content_model(cursor: &mut Cursor) -> XmlResult<()> {
cursor.expect("(")?;
cursor.skip_whitespace();
if cursor.starts_with("#PCDATA") {
// 7 == "#PCDATA".len()
cursor.advance(7);
cursor.skip_whitespace();
if cursor.peek() == Some(')') {
// "(#PCDATA)": the trailing '*' is optional in this form.
cursor.advance_char();
if cursor.peek() == Some('*') {
cursor.advance_char();
}
return Ok(());
}
loop {
cursor.skip_whitespace();
if cursor.peek() == Some(')') {
cursor.advance_char();
// With alternatives present, the group has to be closed by ")*".
if cursor.peek() != Some('*') {
return Err(XmlError::well_formedness(
"Mixed content model with alternatives must end with ')*'",
cursor.line(),
cursor.column(),
));
}
cursor.advance_char();
return Ok(());
}
cursor.expect("|")?;
cursor.skip_whitespace();
if cursor.starts_with("%") {
reject_pe_in_markup_decl(cursor)?;
} else {
if cursor.peek() == Some('(') {
return Err(XmlError::well_formedness(
"Parenthesized group not allowed in Mixed content model",
cursor.line(),
cursor.column(),
));
}
parse_name(cursor)?;
cursor.skip_whitespace();
if cursor.peek() == Some('*')
|| cursor.peek() == Some('+')
|| cursor.peek() == Some('?')
{
return Err(XmlError::well_formedness(
"Occurrence indicator not allowed on elements in Mixed content model",
cursor.line(),
cursor.column(),
));
}
}
}
}
// Element content: first particle, then either ')' or a separator.
parse_cp(cursor)?;
cursor.skip_whitespace();
if cursor.peek() == Some(')') {
cursor.advance_char();
if matches!(cursor.peek(), Some('*') | Some('+') | Some('?')) {
cursor.advance_char();
}
return Ok(());
}
// The first separator seen fixes the kind (sequence or choice) for the whole group.
let sep = match cursor.peek() {
Some(',') => ',',
Some('|') => '|',
_ => {
return Err(XmlError::well_formedness(
"Expected ',' or '|' or ')' in content model",
cursor.line(),
cursor.column(),
));
}
};
loop {
cursor.skip_whitespace();
if cursor.peek() == Some(')') {
cursor.advance_char();
if matches!(cursor.peek(), Some('*') | Some('+') | Some('?')) {
cursor.advance_char();
}
return Ok(());
}
if cursor.peek() == Some(sep) {
cursor.advance_char();
} else if cursor.peek() == Some(',') || cursor.peek() == Some('|') {
return Err(XmlError::well_formedness(
"Cannot mix ',' and '|' in content model group",
cursor.line(),
cursor.column(),
));
} else {
return Err(XmlError::well_formedness(
format!("Expected '{}' or ')' in content model", sep),
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_cp(cursor)?;
}
}
/// Parses one content particle of an element content model: either a nested
/// parenthesised group, or an element name with an optional `*`, `+` or `?` directly
/// after it. A parameter-entity reference here is rejected.
fn parse_cp(cursor: &mut Cursor) -> XmlResult<()> {
    match cursor.peek() {
        Some('(') => parse_children_group(cursor)?,
        Some('%') => reject_pe_in_markup_decl(cursor)?,
        _ => {
            parse_name(cursor)?;
            let has_occurrence =
                matches!(cursor.peek(), Some('*') | Some('+') | Some('?'));
            if has_occurrence {
                cursor.advance_char();
            }
        }
    }
    Ok(())
}
/// Parses a nested parenthesised group inside an element content model.
///
/// The group holds one or more content particles separated consistently by `,` or by
/// `|`, and may be followed by `*`, `+` or `?`. `#PCDATA` is not allowed in a nested
/// group. Nothing is built from the group; it is only checked for syntax.
fn parse_children_group(cursor: &mut Cursor) -> XmlResult<()> {
cursor.expect("(")?;
cursor.skip_whitespace();
if cursor.starts_with("#PCDATA") {
return Err(XmlError::well_formedness(
"#PCDATA not allowed in nested content model group",
cursor.line(),
cursor.column(),
));
}
parse_cp(cursor)?;
cursor.skip_whitespace();
if cursor.peek() == Some(')') {
// Single-particle group, optionally followed by an occurrence indicator.
cursor.advance_char();
if matches!(cursor.peek(), Some('*') | Some('+') | Some('?')) {
cursor.advance_char();
}
return Ok(());
}
// The first separator seen fixes the kind (sequence or choice) for the whole group.
let sep = match cursor.peek() {
Some(',') => ',',
Some('|') => '|',
_ => {
return Err(XmlError::well_formedness(
"Expected ',' or '|' or ')' in content model",
cursor.line(),
cursor.column(),
));
}
};
loop {
cursor.skip_whitespace();
if cursor.peek() == Some(')') {
cursor.advance_char();
if matches!(cursor.peek(), Some('*') | Some('+') | Some('?')) {
cursor.advance_char();
}
return Ok(());
}
if cursor.peek() == Some(sep) {
cursor.advance_char();
} else if cursor.peek() == Some(',') || cursor.peek() == Some('|') {
return Err(XmlError::well_formedness(
"Cannot mix ',' and '|' in content model group",
cursor.line(),
cursor.column(),
));
} else {
return Err(XmlError::well_formedness(
format!("Expected '{}' or ')' in content model", sep),
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_cp(cursor)?;
}
}
/// Parses an `<!ATTLIST ...>` declaration: the keyword, mandatory whitespace,
/// the element name, and then attribute definitions until the closing `>`.
///
/// Anything starting with `%` in the definition list is handed to
/// `reject_pe_in_markup_decl`. Reaching end of input before `>` yields
/// `XmlError::UnexpectedEof`. Nothing from the declaration is retained; the
/// entity map and budget are only forwarded to `parse_att_def`.
fn parse_attlist_decl(
    cursor: &mut Cursor,
    entities: &EntityMap,
    entity_budget: &mut usize,
) -> XmlResult<()> {
    cursor.expect("<!ATTLIST")?;
    let keyword_followed_by_space = cursor.peek().map_or(false, is_xml_whitespace);
    if !keyword_followed_by_space {
        return Err(XmlError::well_formedness(
            "Expected whitespace after '<!ATTLIST'",
            cursor.line(),
            cursor.column(),
        ));
    }
    cursor.skip_whitespace();
    parse_name(cursor)?;

    loop {
        cursor.skip_whitespace();
        if cursor.is_eof() {
            return Err(XmlError::UnexpectedEof);
        }
        if cursor.peek() == Some('>') {
            cursor.advance_char();
            return Ok(());
        }
        if cursor.starts_with("%") {
            reject_pe_in_markup_decl(cursor)?;
        } else {
            parse_att_def(cursor, entities, entity_budget)?;
        }
    }
}
fn parse_att_def(
cursor: &mut Cursor,
entities: &EntityMap,
entity_budget: &mut usize,
) -> XmlResult<()> {
parse_name(cursor)?;
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace after attribute name",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_att_type(cursor)?;
if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
return Err(XmlError::well_formedness(
"Expected whitespace after attribute type",
cursor.line(),
cursor.column(),
));
}
cursor.skip_whitespace();
parse_default_decl(cursor, entities, entity_budget)?;
Ok(())
}
/// Parses the type part of an attribute definition.
///
/// Accepts the keyword types (`CDATA`, `ID`, `IDREF`, `IDREFS`, `ENTITY`,
/// `ENTITIES`, `NMTOKEN`, `NMTOKENS`), a `NOTATION` type followed by
/// whitespace and an enumeration, or a bare enumeration. Input starting with
/// `%` is handed to `reject_pe_in_markup_decl`; anything else is a
/// well-formedness error.
fn parse_att_type(cursor: &mut Cursor) -> XmlResult<()> {
    // Order matters: a keyword must come before any keyword that is a prefix
    // of it (IDREFS > IDREF > ID, ENTITIES > ENTITY, NMTOKENS > NMTOKEN).
    const KEYWORD_TYPES: [&str; 8] = [
        "CDATA", "IDREFS", "IDREF", "ID", "ENTITIES", "ENTITY", "NMTOKENS", "NMTOKEN",
    ];
    for &keyword in KEYWORD_TYPES.iter() {
        if cursor.starts_with(keyword) {
            cursor.advance(keyword.len());
            return Ok(());
        }
    }

    if cursor.starts_with("NOTATION") {
        cursor.advance(8);
        let followed_by_space = cursor.peek().map_or(false, is_xml_whitespace);
        if !followed_by_space {
            return Err(XmlError::well_formedness(
                "Expected whitespace after 'NOTATION'",
                cursor.line(),
                cursor.column(),
            ));
        }
        cursor.skip_whitespace();
        return parse_enumeration(cursor);
    }
    if cursor.peek() == Some('(') {
        return parse_enumeration(cursor);
    }
    if cursor.starts_with("%") {
        reject_pe_in_markup_decl(cursor)?;
        return Ok(());
    }

    Err(XmlError::well_formedness(
        "Expected attribute type (CDATA, ID, IDREF, etc.)",
        cursor.line(),
        cursor.column(),
    ))
}
/// Parses an enumerated attribute type: `( Nmtoken ( | Nmtoken )* )`, with
/// optional whitespace around every token. The tokens are validated and
/// discarded.
fn parse_enumeration(cursor: &mut Cursor) -> XmlResult<()> {
    cursor.expect("(")?;
    loop {
        cursor.skip_whitespace();
        parse_nmtoken(cursor)?;
        cursor.skip_whitespace();
        if cursor.peek() == Some(')') {
            cursor.advance_char();
            return Ok(());
        }
        // Anything other than ')' after a token must be the '|' separator.
        cursor.expect("|")?;
    }
}
/// Reads the longest run of name characters at the cursor and returns it as
/// an owned string.
///
/// # Errors
/// Returns a parse error ("Expected Nmtoken") when the run is empty, i.e.
/// the next character is not a name character or the input has ended.
fn parse_nmtoken(cursor: &mut Cursor) -> XmlResult<String> {
    let mut token = String::new();
    while let Some(c) = cursor.peek().filter(|&c| is_name_char(c)) {
        cursor.advance_char();
        token.push(c);
    }

    if token.is_empty() {
        Err(XmlError::parse(
            "Expected Nmtoken",
            cursor.line(),
            cursor.column(),
        ))
    } else {
        Ok(token)
    }
}
/// Parses the default declaration of an attribute definition:
/// `#REQUIRED`, `#IMPLIED`, `#FIXED S AttValue`, or a bare quoted `AttValue`.
///
/// Quoted values are parsed by `parse_att_value_in_dtd` (which receives the
/// entity map and the shared expansion budget); the resulting value is
/// discarded.
fn parse_default_decl(
    cursor: &mut Cursor,
    entities: &EntityMap,
    entity_budget: &mut usize,
) -> XmlResult<()> {
    if cursor.starts_with("#REQUIRED") {
        cursor.advance(9);
        return Ok(());
    }
    if cursor.starts_with("#IMPLIED") {
        cursor.advance(8);
        return Ok(());
    }
    if cursor.starts_with("#FIXED") {
        cursor.advance(6);
        let followed_by_space = cursor.peek().map_or(false, is_xml_whitespace);
        if !followed_by_space {
            return Err(XmlError::well_formedness(
                "Expected whitespace after '#FIXED'",
                cursor.line(),
                cursor.column(),
            ));
        }
        cursor.skip_whitespace();
        parse_att_value_in_dtd(cursor, entities, entity_budget)?;
        return Ok(());
    }
    if matches!(cursor.peek(), Some('"') | Some('\'')) {
        parse_att_value_in_dtd(cursor, entities, entity_budget)?;
        return Ok(());
    }

    Err(XmlError::well_formedness(
        "Expected default declaration (#REQUIRED, #IMPLIED, #FIXED, or default value)",
        cursor.line(),
        cursor.column(),
    ))
}
/// Parses a quoted attribute value that appears inside a DTD declaration and
/// returns its text with references resolved.
///
/// The value must start with `"` or `'` and runs to the matching quote.
/// Each `&...;` reference is resolved through `parse_reference_with_entities`
/// with a fresh, throw-away `EntityCache` and the caller's `entity_budget`.
///
/// # Errors
/// * parse error if the cursor is not at a quote character;
/// * `XmlError::UnexpectedEof` if the closing quote is missing;
/// * well-formedness error on a literal `<` or on a character rejected by
///   `is_xml_char`.
fn parse_att_value_in_dtd(
    cursor: &mut Cursor,
    entities: &EntityMap,
    entity_budget: &mut usize,
) -> XmlResult<String> {
    let quote = match cursor.peek() {
        Some(q) if q == '"' || q == '\'' => q,
        _ => {
            return Err(XmlError::parse(
                "Expected quote character for attribute value",
                cursor.line(),
                cursor.column(),
            ));
        }
    };
    cursor.advance_char();

    let mut value = String::new();
    loop {
        let c = match cursor.peek() {
            Some(c) => c,
            None => return Err(XmlError::UnexpectedEof),
        };
        if c == quote {
            cursor.advance_char();
            return Ok(value);
        }
        match c {
            '&' => {
                let mut scratch_cache = EntityCache::new();
                let resolved = parse_reference_with_entities(
                    cursor,
                    entities,
                    &mut scratch_cache,
                    entity_budget,
                )?;
                value.push_str(&resolved);
            }
            '<' => {
                return Err(XmlError::well_formedness(
                    "'<' not allowed in attribute value",
                    cursor.line(),
                    cursor.column(),
                ));
            }
            _ if !is_xml_char(c) => {
                return Err(XmlError::well_formedness(
                    format!("Invalid XML character U+{:04X} in DTD", c as u32),
                    cursor.line(),
                    cursor.column(),
                ));
            }
            _ => {
                cursor.advance_char();
                value.push(c);
            }
        }
    }
}
/// Parses an `<!ENTITY ...>` declaration.
///
/// Handles general entities and parameter entities (`<!ENTITY % name ...>`).
/// Only general entities with a literal value are recorded in `entities`,
/// and only the first declaration of a given name is kept. External
/// entities (`SYSTEM` / `PUBLIC`, optionally with `NDATA` for general
/// entities) are validated syntactically and otherwise ignored.
///
/// # Errors
/// Well-formedness errors for missing mandatory whitespace, `NDATA` on a
/// parameter entity, or a declaration that has neither a literal value nor
/// an external identifier; plus whatever the sub-parsers report.
fn parse_entity_decl(cursor: &mut Cursor, entities: &mut EntityMap) -> XmlResult<()> {
    /// Requires at least one whitespace character at the cursor and skips the
    /// whole whitespace run; otherwise fails with `message`.
    fn require_space(cursor: &mut Cursor, message: &'static str) -> XmlResult<()> {
        if cursor.peek().map_or(false, is_xml_whitespace) {
            cursor.skip_whitespace();
            Ok(())
        } else {
            Err(XmlError::well_formedness(
                message,
                cursor.line(),
                cursor.column(),
            ))
        }
    }

    /// Parses the optional `S NDATA S Name` tail that may follow an external
    /// identifier. Always skips whitespace first; `NDATA` is rejected for
    /// parameter entities.
    fn parse_ndata_suffix(cursor: &mut Cursor, is_pe: bool) -> XmlResult<()> {
        let space_before_keyword = cursor.peek().map_or(false, is_xml_whitespace);
        cursor.skip_whitespace();
        if !cursor.starts_with("NDATA") {
            return Ok(());
        }
        if is_pe {
            return Err(XmlError::well_formedness(
                "NDATA not allowed on parameter entity declarations",
                cursor.line(),
                cursor.column(),
            ));
        }
        if !space_before_keyword {
            return Err(XmlError::well_formedness(
                "Expected whitespace before 'NDATA'",
                cursor.line(),
                cursor.column(),
            ));
        }
        cursor.advance(5);
        require_space(cursor, "Expected whitespace after 'NDATA'")?;
        parse_name(cursor)?;
        cursor.skip_whitespace();
        Ok(())
    }

    cursor.expect("<!ENTITY")?;
    require_space(cursor, "Expected whitespace after '<!ENTITY'")?;

    let is_pe = cursor.peek() == Some('%');
    if is_pe {
        cursor.advance_char();
        require_space(
            cursor,
            "Expected whitespace after '%' in parameter entity declaration",
        )?;
    }

    let name = parse_name(cursor)?;
    require_space(cursor, "Expected whitespace after entity name")?;

    if matches!(cursor.peek(), Some('"') | Some('\'')) {
        let value = parse_entity_value(cursor)?;
        cursor.skip_whitespace();
        if !is_pe {
            // First declaration wins; later redeclarations are ignored.
            entities.entry(name.into_owned()).or_insert(value);
        }
    } else if cursor.starts_with("SYSTEM") {
        cursor.advance(6);
        require_space(cursor, "Expected whitespace after 'SYSTEM'")?;
        parse_system_literal(cursor)?;
        parse_ndata_suffix(cursor, is_pe)?;
    } else if cursor.starts_with("PUBLIC") {
        cursor.advance(6);
        require_space(cursor, "Expected whitespace after 'PUBLIC'")?;
        parse_pubid_literal(cursor)?;
        require_space(
            cursor,
            "Expected whitespace between public and system literal",
        )?;
        parse_system_literal(cursor)?;
        parse_ndata_suffix(cursor, is_pe)?;
    } else if cursor.starts_with("%") {
        reject_pe_in_markup_decl(cursor)?;
        cursor.skip_whitespace();
    } else {
        return Err(XmlError::well_formedness(
            "Expected entity value or external ID in ENTITY declaration",
            cursor.line(),
            cursor.column(),
        ));
    }

    cursor.skip_whitespace();
    cursor.expect(">")?;
    Ok(())
}
/// Parses the quoted literal value of an entity declaration and returns the
/// replacement text to store.
///
/// Character references (`&#...;`) are resolved immediately through
/// `parse_reference`. General entity references (`&name;`) are checked for
/// syntax and copied into the result verbatim, so they are expanded only
/// when the entity is later used.
///
/// # Errors
/// * parse error if the cursor is not at a quote character;
/// * `XmlError::UnexpectedEof` if the closing quote is missing;
/// * well-formedness error on `%` or on a character rejected by
///   `is_xml_char`.
fn parse_entity_value(cursor: &mut Cursor) -> XmlResult<String> {
    let quote = match cursor.peek() {
        Some(q) if q == '"' || q == '\'' => q,
        _ => {
            return Err(XmlError::parse(
                "Expected quote for entity value",
                cursor.line(),
                cursor.column(),
            ));
        }
    };
    cursor.advance_char();

    let mut value = String::new();
    loop {
        let c = match cursor.peek() {
            Some(c) => c,
            None => return Err(XmlError::UnexpectedEof),
        };
        if c == quote {
            cursor.advance_char();
            return Ok(value);
        }
        match c {
            // "&#" also covers the hexadecimal form "&#x".
            '&' if cursor.starts_with("&#") => {
                let resolved = parse_reference(cursor)?;
                value.push_str(&resolved);
            }
            '&' => {
                cursor.advance(1);
                let name = parse_name(cursor)?;
                cursor.expect(";")?;
                value.push('&');
                value.push_str(&name);
                value.push(';');
            }
            '%' => {
                return Err(XmlError::well_formedness(
                    "Parameter entity reference not allowed within markup declaration in internal subset",
                    cursor.line(),
                    cursor.column(),
                ));
            }
            _ if !is_xml_char(c) => {
                return Err(XmlError::well_formedness(
                    format!("Invalid XML character U+{:04X} in entity value", c as u32),
                    cursor.line(),
                    cursor.column(),
                ));
            }
            _ => {
                cursor.advance_char();
                value.push(c);
            }
        }
    }
}
/// Parses a `<!NOTATION ...>` declaration.
///
/// Grammar handled:
/// `<!NOTATION S Name S (SYSTEM S SystemLiteral
///                      | PUBLIC S PubidLiteral (S SystemLiteral)?) S? >`
///
/// The declaration is validated syntactically only; nothing is stored.
///
/// # Errors
/// Well-formedness errors for missing mandatory whitespace (including
/// between a public identifier and a following system literal), for a
/// missing `SYSTEM`/`PUBLIC` keyword, plus whatever the literal/name
/// sub-parsers and the final `>` check report.
fn parse_notation_decl(cursor: &mut Cursor) -> XmlResult<()> {
    cursor.expect("<!NOTATION")?;
    if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
        return Err(XmlError::well_formedness(
            "Expected whitespace after '<!NOTATION'",
            cursor.line(),
            cursor.column(),
        ));
    }
    cursor.skip_whitespace();
    parse_name(cursor)?;
    if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
        return Err(XmlError::well_formedness(
            "Expected whitespace after notation name",
            cursor.line(),
            cursor.column(),
        ));
    }
    cursor.skip_whitespace();

    if cursor.starts_with("SYSTEM") {
        cursor.advance(6);
        if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
            return Err(XmlError::well_formedness(
                "Expected whitespace after 'SYSTEM'",
                cursor.line(),
                cursor.column(),
            ));
        }
        cursor.skip_whitespace();
        parse_system_literal(cursor)?;
    } else if cursor.starts_with("PUBLIC") {
        cursor.advance(6);
        if !cursor.peek().map(is_xml_whitespace).unwrap_or(false) {
            return Err(XmlError::well_formedness(
                "Expected whitespace after 'PUBLIC'",
                cursor.line(),
                cursor.column(),
            ));
        }
        cursor.skip_whitespace();
        parse_pubid_literal(cursor)?;

        // The system literal is optional in a notation declaration, but when
        // it is present it must be separated from the public identifier by
        // whitespace (same rule `parse_entity_decl` enforces).
        let space_after_pubid = cursor.peek().map(is_xml_whitespace).unwrap_or(false);
        cursor.skip_whitespace();
        if cursor.peek() == Some('"') || cursor.peek() == Some('\'') {
            if !space_after_pubid {
                return Err(XmlError::well_formedness(
                    "Expected whitespace between public and system literal",
                    cursor.line(),
                    cursor.column(),
                ));
            }
            parse_system_literal(cursor)?;
        }
    } else {
        return Err(XmlError::well_formedness(
            "Expected 'SYSTEM' or 'PUBLIC' in NOTATION declaration",
            cursor.line(),
            cursor.column(),
        ));
    }

    cursor.skip_whitespace();
    cursor.expect(">")?;
    Ok(())
}
/// Parses one element starting at `<`, including its attributes, its content
/// and its end tag, and appends the resulting node to `parent`.
///
/// Steps, in order:
/// 1. Reject the element if `depth >= max_depth`.
/// 2. Read the tag name and the attribute list. `xmlns` / `xmlns:*`
///    attributes are collected as namespace declarations, everything else as
///    ordinary attributes; duplicates are rejected in both groups.
/// 3. If a namespace resolver is present, open a scope, register the
///    declarations, and resolve the element and attribute names.
/// 4. Allocate the element node and attach it to `parent`.
/// 5. For `<name/>` return immediately; otherwise parse the content via
///    `parse_content` and require a matching end tag.
///
/// Returns the id of the new element node.
///
/// NOTE(review): `xmlns` attributes are separated out (and the `xmlns:xmlns`
/// / `xmlns:xml` checks run) even when `ns_resolver` is `None` — confirm this
/// is intended for non-namespace-aware parsing.
/// NOTE(review): the resolver scope opened in step 3 is popped only on the
/// two success paths; error returns leave it pushed.
#[allow(clippy::too_many_arguments)]
fn parse_element<'a>(
cursor: &mut Cursor<'a>,
doc: &mut Document<'a>,
parent: NodeId,
ns_resolver: &mut Option<NamespaceResolver<'a>>,
entities: &EntityMap,
entity_cache: &mut EntityCache,
budget: &mut usize,
depth: u32,
max_depth: u32,
) -> XmlResult<NodeId> {
// Nesting guard: checked before any input is consumed for this element.
if depth >= max_depth {
return Err(XmlError::parse(
format!("Element nesting exceeds maximum depth of {}", max_depth),
cursor.line(),
cursor.column(),
));
}
// Byte offset of the '<'; recorded as the node's start position below.
let start_pos = cursor.pos;
cursor.expect("<")?;
let tag_name = parse_name(cursor)?;
// Ordinary attributes as (qualified name, value) pairs, in document order.
let mut raw_attrs: Vec<(Cow<'a, str>, Cow<'a, str>)> = Vec::with_capacity(8);
// Namespace declarations as (prefix, uri); the default namespace uses "".
let mut ns_decls: Vec<(Cow<'a, str>, Cow<'a, str>)> = Vec::new();
// Attribute loop: runs until the tag is closed by '>' or '/'.
loop {
cursor.skip_whitespace();
if cursor.is_eof() {
return Err(XmlError::UnexpectedEof);
}
if matches!(cursor.peek_byte(), Some(b'>') | Some(b'/')) {
break;
}
let attr_name = parse_name(cursor)?;
cursor.skip_whitespace();
cursor.expect("=")?;
cursor.skip_whitespace();
let attr_value = parse_quoted_value_with_entities(cursor, entities, entity_cache, budget)?;
if &*attr_name == "xmlns" {
// Default-namespace declaration; a second `xmlns` is a duplicate attribute.
if ns_decls.iter().any(|(p, _)| p.is_empty()) {
return Err(XmlError::well_formedness(
format!("Duplicate attribute: {}", attr_name),
cursor.line(),
cursor.column(),
));
}
ns_decls.push((Cow::Borrowed(""), attr_value));
} else if let Some(prefix) = attr_name.strip_prefix("xmlns:") {
// Prefixed namespace declaration (`xmlns:prefix="uri"`).
if prefix == "xmlns" {
return Err(XmlError::namespace(
"The prefix 'xmlns' must not be declared",
cursor.line(),
cursor.column(),
));
}
if prefix == "xml" && &*attr_value != "http://www.w3.org/XML/1998/namespace" {
return Err(XmlError::namespace(
"The prefix 'xml' must not be bound to any other namespace",
cursor.line(),
cursor.column(),
));
}
if ns_decls.iter().any(|(p, _)| &**p == prefix) {
return Err(XmlError::well_formedness(
format!("Duplicate attribute: {}", attr_name),
cursor.line(),
cursor.column(),
));
}
// Drop the 6-byte "xmlns:" lead-in, keeping the borrowed/owned kind of the name.
let prefix_cow: Cow<'a, str> = match &attr_name {
Cow::Borrowed(s) => Cow::Borrowed(&s[6..]),
Cow::Owned(s) => Cow::Owned(s[6..].to_string()),
};
ns_decls.push((prefix_cow, attr_value));
} else {
// Ordinary attribute; duplicates are detected by qualified name.
if raw_attrs.iter().any(|(n, _)| *n == *attr_name) {
return Err(XmlError::well_formedness(
format!("Duplicate attribute: {}", attr_name),
cursor.line(),
cursor.column(),
));
}
raw_attrs.push((attr_name, attr_value));
}
// After a value, only '>', '/' or whitespace may follow (end of input is
// left for the next loop iteration to report).
if let Some(b) = cursor.peek_byte() {
if b != b'>' && b != b'/' && b != b' ' && b != b'\t' && b != b'\n' && b != b'\r' {
return Err(XmlError::well_formedness(
"Expected whitespace between attributes",
cursor.line(),
cursor.column(),
));
}
}
}
// Open a namespace scope for this element and register its declarations
// before any name on the element is resolved.
if let Some(resolver) = ns_resolver.as_mut() {
resolver.push_scope();
for (prefix, uri) in &ns_decls {
resolver.declare(prefix.clone(), uri.clone());
}
}
// Element name: a prefix must resolve; an unprefixed name takes the
// default namespace, if any. Without a resolver the name is kept as-is.
let (prefix, local_name) = split_qname(&tag_name);
let qname = if let Some(resolver) = ns_resolver.as_ref() {
let ns: Option<Cow<'a, str>> = if let Some(p) = prefix {
let uri = resolver.resolve(p).ok_or_else(|| {
XmlError::namespace(
format!("Undeclared namespace prefix: {}", p),
cursor.line(),
cursor.column(),
)
})?;
Some(uri.clone())
} else {
resolver.resolve_default().cloned()
};
QName {
namespace_uri: ns,
prefix: prefix.map(|s| borrow_from_cow(&tag_name, s)),
local_name: borrow_from_cow(&tag_name, local_name),
}
} else {
QName::local(tag_name.clone())
};
// Attribute names: only prefixed attributes get a namespace; unprefixed
// ones are built with `QName::local` and do not use the default namespace.
let mut resolved_attrs = Vec::with_capacity(raw_attrs.len());
for (attr_name, attr_value) in raw_attrs {
let (a_prefix, a_local) = split_qname(&attr_name);
let a_qname = if let Some(resolver) = ns_resolver.as_ref() {
if let Some(p) = a_prefix {
let ns_uri = resolver.resolve(p).ok_or_else(|| {
XmlError::namespace(
format!("Undeclared namespace prefix: {}", p),
cursor.line(),
cursor.column(),
)
})?;
QName {
namespace_uri: Some(ns_uri.clone()),
prefix: Some(borrow_from_cow(&attr_name, p)),
local_name: borrow_from_cow(&attr_name, a_local),
}
} else {
QName::local(borrow_from_cow(&attr_name, a_local))
}
} else {
QName::local(attr_name)
};
resolved_attrs.push(Attribute {
name: a_qname,
value: attr_value,
});
}
let elem = Element {
name: qname,
attributes: resolved_attrs,
namespace_declarations: ns_decls,
};
// The node is attached to the tree before its content is parsed.
let elem_id = doc.alloc_node(NodeKind::Element(elem), start_pos);
doc.append_child_unchecked(parent, elem_id);
// Empty-element tag (`<name ... />`): no content and no end tag.
if cursor.peek_byte() == Some(b'/') {
cursor.expect("/>")?;
doc.set_byte_end_pos(elem_id, cursor.pos);
if let Some(resolver) = ns_resolver.as_mut() {
resolver.pop_scope();
}
return Ok(elem_id);
}
cursor.expect(">")?;
// `depth` is passed through unchanged; `parse_content` is responsible for
// what it hands to nested `parse_element` calls.
parse_content(
cursor,
doc,
elem_id,
ns_resolver,
entities,
entity_cache,
budget,
depth,
max_depth,
)?;
cursor.expect("</")?;
let end_tag_name = parse_name(cursor)?;
cursor.skip_whitespace();
cursor.expect(">")?;
doc.set_byte_end_pos(elem_id, cursor.pos);
// The names are compared after the end tag has been consumed, so the
// reported line/column point just past its closing '>'.
if *end_tag_name != *tag_name {
return Err(XmlError::well_formedness(
format!(
"Mismatched end tag: expected </{}>, found </{}>",
tag_name, end_tag_name
),
cursor.line(),
cursor.column(),
));
}
if let Some(resolver) = ns_resolver.as_mut() {
resolver.pop_scope();
}
Ok(elem_id)
}
/// Parses the content of an element (everything between the start tag and
/// the end tag) and appends the resulting child nodes to `parent`.
///
/// Handles character data, entity/character references, CDATA sections,
/// comments, processing instructions and child elements. Returns with the
/// cursor positioned on the `</` of the end tag, which the caller consumes.
///
/// Text is kept as a borrowed slice of the input for as long as possible and
/// is copied into an owned buffer only when a reference is expanded or a
/// `\r` / `\r\n` line ending is rewritten to `\n`.
///
/// Child elements are parsed with `depth + 1`.
///
/// # Errors
/// * `XmlError::UnexpectedEof` if the input ends before an end tag;
/// * well-formedness errors for invalid characters, `]]>` in character data
///   and unrecognised `<!` markup;
/// * any error from the nested parsers.
// The parser state is threaded explicitly through every recursive call, so
// the long argument list is deliberate.
#[allow(clippy::too_many_arguments)]
fn parse_content<'a>(
    cursor: &mut Cursor<'a>,
    doc: &mut Document<'a>,
    parent: NodeId,
    ns_resolver: &mut Option<NamespaceResolver<'a>>,
    entities: &EntityMap,
    entity_cache: &mut EntityCache,
    budget: &mut usize,
    depth: u32,
    max_depth: u32,
) -> XmlResult<()> {
    /// Accumulator for the text run currently being read.
    enum TextBuf {
        /// No text seen since the last flush.
        Empty,
        /// The run is exactly `input[start..current position]`.
        Borrowed { start: usize },
        /// The run had to be rewritten. `start` is the byte offset in the
        /// input where the run began, so each owned text node reports its
        /// own start position rather than the start of the parent's content.
        Owned { start: usize, buf: String },
    }
    impl TextBuf {
        /// Emits the accumulated run (if non-empty) as a text node ending at
        /// `end_pos` and appends it to `parent`.
        fn flush<'a>(
            self,
            input: &'a str,
            doc: &mut Document<'a>,
            parent: NodeId,
            end_pos: usize,
        ) {
            let (text, start) = match self {
                TextBuf::Empty => return,
                TextBuf::Borrowed { start } => {
                    if start >= end_pos {
                        return;
                    }
                    (Cow::Borrowed(&input[start..end_pos]), start)
                }
                TextBuf::Owned { start, buf } => {
                    if buf.is_empty() {
                        return;
                    }
                    (Cow::Owned(buf), start)
                }
            };
            let id = doc.alloc_node(NodeKind::Text(text), start);
            doc.set_byte_end_pos(id, end_pos);
            doc.append_child_unchecked(parent, id);
        }

        /// Converts the buffer to the owned form. `end_pos` is the current
        /// input offset: the end of the borrowed slice so far, or the start
        /// of the run when nothing has been buffered yet.
        fn switch_to_owned(&mut self, input: &str, end_pos: usize) {
            match self {
                TextBuf::Empty => {
                    *self = TextBuf::Owned {
                        start: end_pos,
                        buf: String::new(),
                    };
                }
                TextBuf::Borrowed { start } => {
                    let start = *start;
                    *self = TextBuf::Owned {
                        start,
                        buf: input[start..end_pos].to_string(),
                    };
                }
                TextBuf::Owned { .. } => {}
            }
        }

        /// Appends `s`, converting to the owned form first if necessary.
        fn push_str(&mut self, input: &str, end_pos: usize, s: &str) {
            self.switch_to_owned(input, end_pos);
            if let TextBuf::Owned { buf, .. } = self {
                buf.push_str(s);
            }
        }

        /// Appends `c`, converting to the owned form first if necessary.
        fn push_char(&mut self, input: &str, end_pos: usize, c: char) {
            self.switch_to_owned(input, end_pos);
            if let TextBuf::Owned { buf, .. } = self {
                buf.push(c);
            }
        }
    }

    let mut text_buf: TextBuf = TextBuf::Empty;
    loop {
        if cursor.pos >= cursor.input.len() {
            return Err(XmlError::UnexpectedEof);
        }
        let bytes = cursor.input.as_bytes();
        let scan_start = cursor.pos;
        // Skip ahead to the next byte that needs individual handling; the
        // flag says whether the skipped chunk needs a per-character check.
        let (advance, has_non_ascii_or_control) =
            crate::simd::scan_content_delimiters(&bytes[scan_start..]);
        let i = scan_start + advance;
        if i > scan_start {
            if has_non_ascii_or_control {
                let chunk = &cursor.input[scan_start..i];
                for c in chunk.chars() {
                    if !is_xml_char(c) {
                        return Err(XmlError::well_formedness(
                            format!("Invalid XML character U+{:04X}", c as u32),
                            cursor.line(),
                            cursor.column(),
                        ));
                    }
                }
            }
            match &mut text_buf {
                TextBuf::Empty => {
                    text_buf = TextBuf::Borrowed { start: scan_start };
                }
                // A borrowed run grows implicitly as the cursor advances.
                TextBuf::Borrowed { .. } => {}
                TextBuf::Owned { buf, .. } => {
                    buf.push_str(&cursor.input[scan_start..i]);
                }
            }
            cursor.pos = i;
        }
        if cursor.pos >= cursor.input.len() {
            return Err(XmlError::UnexpectedEof);
        }
        match bytes[cursor.pos] {
            b'<' => match bytes.get(cursor.pos + 1) {
                Some(b'/') => {
                    // End tag of the enclosing element: the caller consumes it.
                    text_buf.flush(cursor.input, doc, parent, cursor.pos);
                    return Ok(());
                }
                Some(b'!') => {
                    text_buf.flush(cursor.input, doc, parent, cursor.pos);
                    text_buf = TextBuf::Empty;
                    if cursor.starts_with("<![CDATA[") {
                        let start = cursor.pos;
                        let cdata = parse_cdata(cursor)?;
                        let id = doc.alloc_node(NodeKind::CData(cdata), start);
                        doc.set_byte_end_pos(id, cursor.pos);
                        doc.append_child_unchecked(parent, id);
                    } else if cursor.starts_with("<!--") {
                        let start = cursor.pos;
                        let comment = parse_comment(cursor)?;
                        let id = doc.alloc_node(NodeKind::Comment(comment), start);
                        doc.set_byte_end_pos(id, cursor.pos);
                        doc.append_child_unchecked(parent, id);
                    } else {
                        return Err(XmlError::well_formedness(
                            "Invalid markup in element content",
                            cursor.line(),
                            cursor.column(),
                        ));
                    }
                }
                Some(b'?') => {
                    text_buf.flush(cursor.input, doc, parent, cursor.pos);
                    text_buf = TextBuf::Empty;
                    let start = cursor.pos;
                    let pi = parse_pi(cursor)?;
                    let id = doc.alloc_node(NodeKind::ProcessingInstruction(pi), start);
                    doc.set_byte_end_pos(id, cursor.pos);
                    doc.append_child_unchecked(parent, id);
                }
                _ => {
                    text_buf.flush(cursor.input, doc, parent, cursor.pos);
                    text_buf = TextBuf::Empty;
                    parse_element(
                        cursor,
                        doc,
                        parent,
                        ns_resolver,
                        entities,
                        entity_cache,
                        budget,
                        depth + 1,
                        max_depth,
                    )?;
                }
            },
            b'&' => {
                let before_pos = cursor.pos;
                let resolved =
                    parse_reference_with_entities(cursor, entities, entity_cache, budget)?;
                text_buf.push_str(cursor.input, before_pos, &resolved);
            }
            b'\r' => {
                // Rewrite "\r\n" and a lone "\r" to a single "\n".
                let before_pos = cursor.pos;
                cursor.pos += 1;
                if cursor.peek_byte() == Some(b'\n') {
                    cursor.pos += 1;
                }
                text_buf.push_char(cursor.input, before_pos, '\n');
            }
            b']' => {
                if cursor.starts_with("]]>") {
                    return Err(XmlError::well_formedness(
                        "']]>' not allowed in element content",
                        cursor.line(),
                        cursor.column(),
                    ));
                }
                match &mut text_buf {
                    TextBuf::Empty => {
                        text_buf = TextBuf::Borrowed { start: cursor.pos };
                    }
                    TextBuf::Borrowed { .. } => {}
                    TextBuf::Owned { buf, .. } => buf.push(']'),
                }
                cursor.advance_no_newlines(1);
            }
            // Relies on `scan_content_delimiters` stopping only at one of the
            // bytes handled above (or at end of input).
            _ => unreachable!(),
        }
    }
}
/// Parses a `<![CDATA[ ... ]]>` section and returns its content.
///
/// The content is returned as obtained from `Cursor::read_until` when it
/// contains no carriage returns. Otherwise `\r\n` and lone `\r` are
/// rewritten to `\n`, matching the line-end handling `parse_content` applies
/// to ordinary character data, and an owned copy is returned.
///
/// # Errors
/// Propagates failures from `expect` / `read_until`, and reports a
/// well-formedness error for any character rejected by `is_xml_char`.
fn parse_cdata<'a>(cursor: &mut Cursor<'a>) -> XmlResult<Cow<'a, str>> {
    cursor.expect("<![CDATA[")?;
    let content = cursor.read_until("]]>")?;

    if let Some(bad) = content.chars().find(|&c| !is_xml_char(c)) {
        return Err(XmlError::well_formedness(
            format!("Invalid XML character U+{:04X} in CDATA section", bad as u32),
            cursor.line(),
            cursor.column(),
        ));
    }

    // Line ends are normalised for all parsed input, CDATA included.
    if content.contains('\r') {
        let normalized = content.replace("\r\n", "\n").replace('\r', "\n");
        return Ok(Cow::Owned(normalized));
    }
    Ok(content)
}
/// Unit tests for the parser: basic document structure, references,
/// zero-copy behaviour, the nesting-depth cap and the entity-expansion
/// budget.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_simple_element() {
        let doc = Parser::new().parse("<root/>").unwrap();
        let root = doc.document_element().unwrap();
        let elem = doc.element(root).unwrap();
        assert_eq!(&*elem.name.local_name, "root");
    }

    #[test]
    fn test_parse_text_content() {
        let doc = Parser::new().parse("<root>hello world</root>").unwrap();
        let root = doc.document_element().unwrap();
        let text = doc.text_content_deep(root);
        assert_eq!(text, "hello world");
    }

    #[test]
    fn test_parse_attributes() {
        let doc = Parser::new()
            .parse(r#"<root attr="value" foo='bar'/>"#)
            .unwrap();
        let root = doc.document_element().unwrap();
        let elem = doc.element(root).unwrap();
        assert_eq!(elem.get_attribute("attr"), Some("value"));
        assert_eq!(elem.get_attribute("foo"), Some("bar"));
    }

    /// The five predefined entities must expand to their characters.
    #[test]
    fn test_parse_entity_references() {
        let doc = Parser::new()
            .parse("<root>&lt;&gt;&amp;&apos;&quot;</root>")
            .unwrap();
        let root = doc.document_element().unwrap();
        assert_eq!(doc.text_content_deep(root), "<>&'\"");
    }

    /// Decimal and hexadecimal character references must both be resolved.
    #[test]
    fn test_parse_character_references() {
        let doc = Parser::new().parse("<root>&#65;&#x42;</root>").unwrap();
        let root = doc.document_element().unwrap();
        assert_eq!(doc.text_content_deep(root), "AB");
    }

    #[test]
    fn test_mismatched_end_tag() {
        let result = Parser::new().parse("<root></other>");
        assert!(result.is_err());
    }

    #[test]
    fn test_duplicate_attribute() {
        let result = Parser::new().parse(r#"<root a="1" a="2"/>"#);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_xml_declaration() {
        let doc = Parser::new()
            .parse(r#"<?xml version="1.0" encoding="UTF-8"?><root/>"#)
            .unwrap();
        let decl = doc.xml_declaration.as_ref().unwrap();
        assert_eq!(&*decl.version, "1.0");
        assert_eq!(decl.encoding.as_deref(), Some("UTF-8"));
    }

    /// Markup-like text inside a CDATA section is returned verbatim.
    #[test]
    fn test_parse_cdata() {
        let doc = Parser::new()
            .parse("<root><![CDATA[<not>&xml;]]></root>")
            .unwrap();
        let root = doc.document_element().unwrap();
        assert_eq!(doc.text_content_deep(root), "<not>&xml;");
    }

    #[test]
    fn test_parse_comment() {
        let doc = Parser::new()
            .parse("<root><!-- a comment --></root>")
            .unwrap();
        let root = doc.document_element().unwrap();
        let children = doc.children(root);
        assert_eq!(children.len(), 1);
        assert!(matches!(
            doc.node_kind(children[0]),
            Some(NodeKind::Comment(_))
        ));
    }

    #[test]
    fn test_no_root_element() {
        let result = Parser::new().parse("");
        assert!(result.is_err());
    }

    #[test]
    fn test_two_root_elements() {
        let result = Parser::new().parse("<a/><b/>");
        assert!(result.is_err());
    }

    /// Plain text with no references must be borrowed from the input.
    #[test]
    fn test_zero_copy_text() {
        let input = "<root>hello</root>";
        let doc = Parser::new().parse(input).unwrap();
        let root = doc.document_element().unwrap();
        let children = doc.children(root);
        match doc.node_kind(children[0]) {
            Some(NodeKind::Text(t)) => {
                assert!(matches!(t, Cow::Borrowed(_)), "Expected borrowed text");
            }
            // Fail loudly instead of passing when the node is not text.
            _ => panic!("expected the first child of <root> to be a text node"),
        }
    }

    /// With namespace processing disabled the element name must be borrowed
    /// from the input.
    #[test]
    fn test_zero_copy_name() {
        let input = "<root/>";
        // The namespace-aware parse is only a smoke check; no borrowing
        // guarantee is asserted for it.
        let doc = Parser::new().parse(input).unwrap();
        let root = doc.document_element().unwrap();
        assert!(doc.element(root).is_some());

        let doc2 = Parser::with_namespace_aware(false).parse(input).unwrap();
        let root2 = doc2.document_element().unwrap();
        let elem2 = doc2.element(root2).unwrap();
        assert!(
            matches!(elem2.name.local_name, Cow::Borrowed(_)),
            "Expected borrowed name"
        );
    }

    /// Builds `depth` nested `<a>` elements around the single character `x`.
    fn nested_xml(depth: usize) -> String {
        let mut s = String::with_capacity(depth * 8);
        for _ in 0..depth {
            s.push_str("<a>");
        }
        s.push('x');
        for _ in 0..depth {
            s.push_str("</a>");
        }
        s
    }

    #[test]
    fn test_depth_cap_rejects_deep_input() {
        let xml = nested_xml(5_000);
        let err = Parser::new()
            .parse(&xml)
            .expect_err("deep input must be rejected");
        assert!(
            format!("{}", err).contains("maximum depth"),
            "expected depth-cap error, got: {}",
            err
        );
    }

    #[test]
    fn test_depth_within_cap_parses() {
        let xml = nested_xml(100);
        let doc = Parser::new().parse(&xml).expect("within cap must parse");
        assert!(doc.document_element().is_some());
    }

    #[test]
    fn test_custom_max_depth() {
        let xml = nested_xml(10);
        assert!(
            Parser::new().with_max_depth(5).parse(&xml).is_err(),
            "cap of 5 must reject 10-deep input"
        );
        assert!(
            Parser::new().with_max_depth(20).parse(&xml).is_ok(),
            "cap of 20 must admit 10-deep input"
        );
    }

    /// Exponentially nested entities must be stopped by the default budget.
    #[test]
    fn test_entity_budget_rejects_billion_laughs() {
        let xml = r#"<?xml version="1.0"?>
<!DOCTYPE lolz [
<!ENTITY lol "lol">
<!ENTITY lol1 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
<!ENTITY lol2 "&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;&lol1;">
<!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
<!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
<!ENTITY lol5 "&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;">
<!ENTITY lol6 "&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;&lol5;">
]>
<lolz>&lol6;</lolz>"#;
        let err = Parser::new()
            .parse(xml)
            .expect_err("billion-laughs must be rejected by the default budget");
        let msg = format!("{}", err);
        assert!(
            msg.contains("Entity expansion"),
            "expected entity-budget error, got: {}",
            msg
        );
    }

    /// A large entity referenced many times must also exhaust the budget.
    #[test]
    fn test_entity_budget_rejects_quadratic_blowup() {
        let big_a = "A".repeat(10_000);
        let xml = format!(
            r#"<?xml version="1.0"?>
<!DOCTYPE doc [
<!ENTITY a "{}">
<!ENTITY b "&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;">
]>
<doc>{}</doc>"#,
            big_a,
            "&b;".repeat(50),
        );
        let err = Parser::new()
            .parse(&xml)
            .expect_err("quadratic blow-up must be rejected");
        assert!(
            format!("{}", err).contains("Entity expansion"),
            "expected entity-budget error, got: {}",
            err
        );
    }

    #[test]
    fn test_entity_budget_allows_legitimate_use() {
        let xml = r#"<?xml version="1.0"?>
<!DOCTYPE doc [
<!ENTITY name "Alice">
<!ENTITY greeting "Hello, &name;!">
]>
<doc>&greeting; &greeting; &greeting;</doc>"#;
        let doc = Parser::new().parse(xml).expect("legitimate entities OK");
        let root = doc.document_element().unwrap();
        let text = doc.text_content_deep(root);
        assert_eq!(text, "Hello, Alice! Hello, Alice! Hello, Alice!");
    }

    /// Entity expansion inside an ATTLIST default value draws on the same
    /// budget as expansion in the document body.
    #[test]
    fn test_entity_budget_covers_attlist_defaults() {
        let big_a = "A".repeat(500_000);
        let xml = format!(
            r#"<?xml version="1.0"?>
<!DOCTYPE r [
<!ENTITY a "{}">
<!ATTLIST r foo CDATA "&a;&a;&a;">
]>
<r/>"#,
            big_a
        );
        let err = Parser::new()
            .parse(&xml)
            .expect_err("ATTLIST default must charge against the shared budget");
        assert!(
            format!("{}", err).contains("Entity expansion"),
            "expected entity-budget error, got: {}",
            err
        );
    }

    #[test]
    fn test_custom_max_entity_expansion() {
        let xml = r#"<?xml version="1.0"?>
<!DOCTYPE doc [<!ENTITY s "XXXXXXXXXXXXXXXX">]>
<doc>&s;&s;&s;&s;</doc>"#;
        assert!(
            Parser::new()
                .with_max_entity_expansion(32)
                .parse(xml)
                .is_err(),
            "tight budget must fire"
        );
        assert!(
            Parser::new()
                .with_max_entity_expansion(1 << 16)
                .parse(xml)
                .is_ok(),
            "loose budget must admit the same input"
        );
    }
}