use alloc::string::{String, ToString};
use alloc::{vec, vec::Vec};
use alloc::borrow::Cow;
use core::ops::Range;
use core::mem::take;
use core::fmt;
use memchr::{memchr, memchr2, memchr_iter};
use crate::{
AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
NodeKind, ShortRange, StringStorage, TextPos, NS_XMLNS_URI, NS_XML_PREFIX, NS_XML_URI, PI,
XMLNS,
};
use crate::tokenizer::{self, Reference, StrSpan, Stream};
/// A convenience alias for this module's fallible operations.
type Result<T> = core::result::Result<T, Error>;

/// A list of all possible parsing errors.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub enum Error {
    /// The `xml` namespace prefix was mapped to a URI other than the standard one.
    InvalidXmlPrefixUri(TextPos),
    /// The `xml` namespace URI was bound to a prefix other than `xml`.
    UnexpectedXmlUri(TextPos),
    /// The `xmlns` URI was declared, which is never allowed.
    UnexpectedXmlnsUri(TextPos),
    /// An element name uses the reserved `xmlns` prefix.
    InvalidElementNamePrefix(TextPos),
    /// The namespace prefix was declared twice on the same element.
    DuplicatedNamespace(String, TextPos),
    /// A name uses a namespace prefix that was never declared.
    UnknownNamespace(String, TextPos),
    /// A close tag did not match the open tag: `(expected, actual, pos)`.
    UnexpectedCloseTag(String, String, TextPos),
    /// A close tag without a matching open tag (seen during entity expansion).
    UnexpectedEntityCloseTag(TextPos),
    /// A reference to an entity that was never declared.
    UnknownEntityReference(String, TextPos),
    /// A `&...;` sequence that is not a valid reference.
    MalformedEntityReference(TextPos),
    /// A possible loop between entity references was detected.
    EntityReferenceLoop(TextPos),
    /// An attribute value contains an unescaped `<`.
    InvalidAttributeValue(TextPos),
    /// The same expanded attribute name appears twice on one element.
    DuplicatedAttribute(String, TextPos),
    /// The document has no root element.
    NoRootNode,
    /// The root element was opened but never closed.
    UnclosedRootNode,
    /// An XML declaration appeared where it is not allowed.
    UnexpectedDeclaration(TextPos),
    /// The document contains a DTD, but DTD parsing is disabled.
    DtdDetected,
    /// The configured `ParsingOptions::nodes_limit` was reached.
    NodesLimitReached,
    /// More than 2^32 attributes were parsed.
    AttributesLimitReached,
    /// More than 2^16 unique namespaces were parsed.
    NamespacesLimitReached,
    /// An invalid name token was encountered.
    InvalidName(TextPos),
    /// A character outside the XML character range was encountered.
    NonXmlChar(char, TextPos),
    /// An unexpected byte: `(expected, actual, pos)`.
    InvalidChar(u8, u8, TextPos),
    /// An unexpected byte: `(expected description, actual, pos)`.
    InvalidChar2(&'static str, u8, TextPos),
    /// An expected string was not found.
    InvalidString(&'static str, TextPos),
    /// An invalid ExternalID inside the DTD.
    InvalidExternalID(TextPos),
    /// The user-supplied entity resolver returned an error message.
    EntityResolver(TextPos, String),
    /// A comment contains `--`.
    InvalidComment(TextPos),
    /// Character data contains `]]>`.
    InvalidCharacterData(TextPos),
    /// An unrecognized token.
    UnknownToken(TextPos),
    /// The input ended unexpectedly.
    UnexpectedEndOfStream,
}
impl Error {
pub fn pos(&self) -> TextPos {
match *self {
Error::InvalidXmlPrefixUri(pos) => pos,
Error::UnexpectedXmlUri(pos) => pos,
Error::UnexpectedXmlnsUri(pos) => pos,
Error::InvalidElementNamePrefix(pos) => pos,
Error::DuplicatedNamespace(_, pos) => pos,
Error::UnknownNamespace(_, pos) => pos,
Error::UnexpectedCloseTag(_, _, pos) => pos,
Error::UnexpectedEntityCloseTag(pos) => pos,
Error::UnknownEntityReference(_, pos) => pos,
Error::MalformedEntityReference(pos) => pos,
Error::EntityReferenceLoop(pos) => pos,
Error::InvalidAttributeValue(pos) => pos,
Error::DuplicatedAttribute(_, pos) => pos,
Error::NoRootNode => TextPos::new(1, 1),
Error::UnclosedRootNode => TextPos::new(1, 1),
Error::UnexpectedDeclaration(pos) => pos,
Error::DtdDetected => TextPos::new(1, 1),
Error::NodesLimitReached => TextPos::new(1, 1),
Error::AttributesLimitReached => TextPos::new(1, 1),
Error::NamespacesLimitReached => TextPos::new(1, 1),
Error::InvalidName(pos) => pos,
Error::NonXmlChar(_, pos) => pos,
Error::InvalidChar(_, _, pos) => pos,
Error::InvalidChar2(_, _, pos) => pos,
Error::InvalidString(_, pos) => pos,
Error::InvalidExternalID(pos) => pos,
Error::EntityResolver(pos, _) => pos,
Error::InvalidComment(pos) => pos,
Error::InvalidCharacterData(pos) => pos,
Error::UnknownToken(pos) => pos,
Error::UnexpectedEndOfStream => TextPos::new(1, 1),
}
}
}
impl core::fmt::Display for Error {
    // Human-readable messages for every variant; the stored `TextPos` is
    // interpolated where one is available. These strings are user-facing —
    // do not change them without intent.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        match self {
            Error::InvalidXmlPrefixUri(pos) => {
                write!(f, "'xml' namespace prefix mapped to wrong URI at {}", pos)
            }
            Error::UnexpectedXmlUri(pos) => {
                write!(
                    f,
                    "the 'xml' namespace URI is used for not 'xml' prefix at {}",
                    pos
                )
            }
            Error::UnexpectedXmlnsUri(pos) => {
                write!(
                    f,
                    "the 'xmlns' URI is used at {}, but it must not be declared",
                    pos
                )
            }
            Error::InvalidElementNamePrefix(pos) => {
                write!(
                    f,
                    "the 'xmlns' prefix is used at {}, but it must not be",
                    pos
                )
            }
            Error::DuplicatedNamespace(ref name, pos) => {
                write!(f, "namespace '{}' at {} is already defined", name, pos)
            }
            Error::UnknownNamespace(ref name, pos) => {
                write!(f, "an unknown namespace prefix '{}' at {}", name, pos)
            }
            Error::UnexpectedCloseTag(ref expected, ref actual, pos) => {
                write!(
                    f,
                    "expected '{}' tag, not '{}' at {}",
                    expected, actual, pos
                )
            }
            Error::UnexpectedEntityCloseTag(pos) => {
                write!(f, "unexpected close tag at {}", pos)
            }
            Error::MalformedEntityReference(pos) => {
                write!(f, "malformed entity reference at {}", pos)
            }
            Error::UnknownEntityReference(ref name, pos) => {
                write!(f, "unknown entity reference '{}' at {}", name, pos)
            }
            Error::EntityReferenceLoop(pos) => {
                write!(f, "a possible entity reference loop is detected at {}", pos)
            }
            Error::InvalidAttributeValue(pos) => {
                write!(f, "unescaped '<' found at {}", pos)
            }
            Error::DuplicatedAttribute(ref name, pos) => {
                write!(f, "attribute '{}' at {} is already defined", name, pos)
            }
            Error::NoRootNode => {
                write!(f, "the document does not have a root node")
            }
            Error::UnclosedRootNode => {
                write!(f, "the root node was opened but never closed")
            }
            Error::UnexpectedDeclaration(pos) => {
                write!(f, "unexpected XML declaration at {}", pos)
            }
            Error::DtdDetected => {
                write!(f, "XML with DTD detected")
            }
            Error::NodesLimitReached => {
                write!(f, "nodes limit reached")
            }
            Error::AttributesLimitReached => {
                write!(f, "more than 2^32 attributes were parsed")
            }
            Error::NamespacesLimitReached => {
                write!(f, "more than 2^16 unique namespaces were parsed")
            }
            Error::InvalidName(pos) => {
                write!(f, "invalid name token at {}", pos)
            }
            Error::NonXmlChar(c, pos) => {
                write!(f, "a non-XML character {:?} found at {}", c, pos)
            }
            Error::InvalidChar(expected, actual, pos) => {
                write!(
                    f,
                    "expected '{}' not '{}' at {}",
                    *expected as char, *actual as char, pos
                )
            }
            Error::InvalidChar2(expected, actual, pos) => {
                write!(
                    f,
                    "expected {} not '{}' at {}",
                    expected, *actual as char, pos
                )
            }
            Error::InvalidString(expected, pos) => {
                write!(f, "expected '{}' at {}", expected, pos)
            }
            Error::InvalidExternalID(pos) => {
                write!(f, "invalid ExternalID at {}", pos)
            }
            Error::EntityResolver(pos, msg) => {
                write!(f, "entity resolver failed at {}: {}", pos, msg)
            }
            Error::InvalidComment(pos) => {
                write!(f, "comment at {} contains '--'", pos)
            }
            Error::InvalidCharacterData(pos) => {
                write!(f, "']]>' at {} is not allowed inside a character data", pos)
            }
            Error::UnknownToken(pos) => {
                write!(f, "unknown token at {}", pos)
            }
            Error::UnexpectedEndOfStream => {
                write!(f, "unexpected end of stream")
            }
        }
    }
}
#[cfg(feature = "std")]
impl std::error::Error for Error {}
// NOTE: the previous implementation overrode `description()`, which has been
// deprecated since Rust 1.42 in favor of `Display`. The default trait
// implementations are sufficient, since `Error` already implements
// `Display` and `Debug`.
/// Parsing options.
pub struct ParsingOptions<'input> {
    /// Allow DTDs to be parsed; when `false`, a document containing a DTD
    /// fails with `Error::DtdDetected`.
    pub allow_dtd: bool,
    /// Maximum number of nodes to create before failing with
    /// `Error::NodesLimitReached`.
    pub nodes_limit: u32,
    /// Optional callback used to resolve entities that this parser cannot
    /// expand on its own (see `resolve_entity` below); it receives an
    /// optional public ID and a URI.
    pub entity_resolver: Option<&'input EntityResolver<'input>>,
}

/// An entity-resolver callback: `(public_id, uri)` in, replacement text
/// (borrowed from the `'input` lifetime) or an error message out.
pub type EntityResolver<'input> =
    dyn Fn(Option<&str>, &str) -> core::result::Result<Option<&'input str>, String> + 'input;
impl Default for ParsingOptions<'_> {
fn default() -> Self {
ParsingOptions {
allow_dtd: false,
nodes_limit: u32::MAX,
entity_resolver: None,
}
}
}
impl fmt::Debug for ParsingOptions<'_> {
    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The resolver is a bare `dyn Fn` with no `Debug` impl, so only its
        // presence can be reported.
        let resolver_repr = match self.entity_resolver {
            Some(_) => "Some(..)",
            None => "None",
        };
        fmt.debug_struct("ParsingOptions")
            .field("allow_dtd", &self.allow_dtd)
            .field("nodes_limit", &self.nodes_limit)
            .field("entity_resolver", &resolver_repr)
            .finish()
    }
}
/// An attribute captured from a start tag, buffered until the tag is complete
/// and its namespaces can be resolved (see `resolve_attributes`).
struct TempAttributeData<'input> {
    /// Namespace prefix part of the qualified name; empty when absent.
    prefix: &'input str,
    /// Local part of the qualified name.
    local: &'input str,
    /// The already-normalized attribute value.
    value: StringStorage<'input>,
    /// Byte range of the attribute in the input text.
    range: Range<usize>,
    #[cfg(feature = "positions")]
    qname_len: u16,
    #[cfg(feature = "positions")]
    eq_len: u8,
}
impl<'input> Document<'input> {
    /// Parses `text` into a `Document` using the default `ParsingOptions`.
    #[inline]
    pub fn parse(text: &'input str) -> Result<Self> {
        Self::parse_with_options(text, ParsingOptions::default())
    }

    /// Parses `text` into a `Document` using the supplied options.
    #[inline]
    pub fn parse_with_options(text: &'input str, opt: ParsingOptions<'input>) -> Result<Self> {
        parse(text, opt)
    }
}
/// An entity collected from a DTD `EntityDeclaration` token; `value` is its
/// replacement text, expanded on reference.
struct Entity<'input> {
    name: &'input str,
    value: StrSpan<'input>,
}
/// The qualified name of the start tag currently being parsed, plus the byte
/// offsets needed for node ranges and error positions.
#[derive(Clone, Copy)]
struct TagNameSpan<'input> {
    prefix: &'input str,
    name: &'input str,
    /// Start offset reported by the `ElementStart` token.
    pos: usize,
    /// Always `pos + 1`; used for prefix-related error positions.
    prefix_pos: usize,
}
impl<'input> TagNameSpan<'input> {
#[inline]
fn new_null() -> Self {
Self {
prefix: "",
name: "",
pos: 0,
prefix_pos: 0,
}
}
}
/// Guards against runaway entity expansion (e.g. billion-laughs style loops).
#[derive(Default)]
struct LoopDetector {
    /// Current entity-expansion nesting depth (capped at 10 by `inc_depth`).
    depth: u8,
    /// References expanded while `depth > 0`; reset when depth returns to 0.
    references: u8,
}
impl LoopDetector {
#[inline]
fn inc_depth(&mut self, stream: &Stream) -> Result<()> {
if self.depth < 10 {
self.depth += 1;
Ok(())
} else {
Err(Error::EntityReferenceLoop(stream.gen_text_pos()))
}
}
#[inline]
fn dec_depth(&mut self) {
if self.depth > 0 {
self.depth -= 1;
}
if self.depth == 0 {
self.references = 0;
}
}
#[inline]
fn inc_references(&mut self, stream: &Stream) -> Result<()> {
if self.depth == 0 {
Ok(())
} else {
if self.references == u8::MAX {
return Err(Error::EntityReferenceLoop(stream.gen_text_pos()));
}
self.references += 1;
Ok(())
}
}
}
/// Mutable state shared between tokenizer callbacks while building a `Document`.
struct Context<'input> {
    opt: ParsingOptions<'input>,
    /// Index into `doc.namespaces.tree_order` where the current element's own
    /// namespace declarations begin.
    namespace_start_idx: usize,
    /// Attributes of the currently open start tag, pending namespace resolution.
    current_attributes: Vec<TempAttributeData<'input>>,
    /// Nodes whose `next_subtree` link should point at the next appended node.
    awaiting_subtree: Vec<NodeId>,
    /// Prefix of every currently open element; starts with `""` for the root.
    parent_prefixes: Vec<&'input str>,
    /// Entities declared in the DTD, used for reference expansion.
    entities: Vec<Entity<'input>>,
    /// Text fragments gathered since the last non-text node; merged on flush.
    after_text: Vec<Cow<'input, str>>,
    /// The element currently receiving children; node 0 is the root.
    parent_id: NodeId,
    /// Name of the start tag currently being processed.
    tag_name: TagNameSpan<'input>,
    loop_detector: LoopDetector,
    doc: Document<'input>,
}
impl<'input> Context<'input> {
    /// Appends a node of `kind` as the last child of the current parent and
    /// links it into the sibling/subtree chains.
    ///
    /// Fails with `Error::NodesLimitReached` once `nodes_limit` is hit.
    fn append_node(&mut self, kind: NodeKind<'input>, range: Range<usize>) -> Result<NodeId> {
        if self.doc.nodes.len() >= self.opt.nodes_limit as usize {
            return Err(Error::NodesLimitReached);
        }

        #[cfg(not(feature = "positions"))]
        let _ = range;

        let new_child_id = NodeId::from(self.doc.nodes.len());
        let appending_element = matches!(kind, NodeKind::Element { .. });
        self.doc.nodes.push(NodeData {
            parent: Some(self.parent_id),
            prev_sibling: None,
            next_subtree: None,
            last_child: None,
            kind,
            #[cfg(feature = "positions")]
            range,
        });

        // Link the new node as the parent's last child.
        let last_child_id = self.doc.nodes[self.parent_id.get_usize()].last_child;
        self.doc.nodes[new_child_id.get_usize()].prev_sibling = last_child_id;
        self.doc.nodes[self.parent_id.get_usize()].last_child = Some(new_child_id);

        // Nodes whose subtree just finished were waiting for the next node in
        // document order — this one.
        for id in &self.awaiting_subtree {
            self.doc.nodes[id.get_usize()].next_subtree = Some(new_child_id);
        }
        self.awaiting_subtree.clear();

        // Non-element nodes cannot have children, so their subtree ends here.
        if !appending_element {
            self.awaiting_subtree
                .push(NodeId::from(self.doc.nodes.len() - 1));
        }

        Ok(new_child_id)
    }

    /// Appends a text node for the first fragment of a run; later fragments
    /// are only buffered in `after_text` and merged by `reset_after_text`.
    fn append_text(
        &mut self,
        text: Cow<'input, str>,
        range: Range<usize>,
    ) -> Result<()> {
        if self.after_text.is_empty() {
            let text = match &text {
                Cow::Borrowed(text) => StringStorage::Borrowed(text),
                Cow::Owned(text) => StringStorage::new_owned(text.as_str()),
            };
            self.append_node(NodeKind::Text(text), range)?;
        }
        self.after_text.push(text);
        Ok(())
    }

    /// Replaces the last node's text with all buffered fragments joined
    /// together. Only called when more than one fragment accumulated, in
    /// which case the last node is the text node created by `append_text`.
    #[cold]
    #[inline(never)]
    fn merge_text(&mut self) {
        // Fix: was `&mut self.doc.nodes.last_mut().unwrap()`, which produced a
        // redundant `&mut &mut NodeData` (clippy `needless_borrow`);
        // `last_mut()` already yields a mutable reference.
        let node = self.doc.nodes.last_mut().unwrap();
        let text = match &mut node.kind {
            NodeKind::Text(text) => text,
            _ => unreachable!(),
        };
        *text = StringStorage::new_owned(&self.after_text.join(""));
    }

    /// Flushes the buffered text fragments, merging them when needed.
    #[inline]
    fn reset_after_text(&mut self) {
        if self.after_text.is_empty() {
            return;
        }
        if self.after_text.len() > 1 {
            self.merge_text();
        }
        self.after_text.clear();
    }
}
/// Parses `text` into a `Document`, wiring the tokenizer to a fresh `Context`.
fn parse<'input>(text: &'input str, opt: ParsingOptions<'input>) -> Result<Document<'input>> {
    // Capacity heuristics from cheap byte scans: every tag starts with '<'
    // and every attribute contains '='. Both are upper bounds.
    let nodes_capacity = memchr_iter(b'<', text.as_bytes()).count();
    let attributes_capacity = memchr_iter(b'=', text.as_bytes()).count();

    let mut doc = Document {
        text,
        nodes: Vec::with_capacity(nodes_capacity),
        attributes: Vec::with_capacity(attributes_capacity),
        namespaces: Namespaces::default(),
    };

    // Node 0 is the implicit root spanning the whole input.
    doc.nodes.push(NodeData {
        parent: None,
        prev_sibling: None,
        next_subtree: None,
        last_child: None,
        kind: NodeKind::Root,
        #[cfg(feature = "positions")]
        range: 0..text.len(),
    });

    // The `xml` prefix is predefined, at namespace index 0.
    doc.namespaces
        .push_ns(Some(NS_XML_PREFIX), StringStorage::Borrowed(NS_XML_URI))?;

    let allow_dtd = opt.allow_dtd;
    let mut ctx = Context {
        opt,
        namespace_start_idx: 1,
        current_attributes: Vec::with_capacity(16),
        entities: Vec::new(),
        awaiting_subtree: Vec::new(),
        parent_prefixes: vec![""],
        after_text: Vec::with_capacity(1),
        parent_id: NodeId::new(0),
        tag_name: TagNameSpan::new_null(),
        loop_detector: LoopDetector::default(),
        doc,
    };

    tokenizer::parse(text, allow_dtd, &mut ctx)?;

    let mut doc = ctx.doc;
    if !doc.root().children().any(|n| n.is_element()) {
        return Err(Error::NoRootNode);
    }
    // Every opened element pushes a prefix; anything beyond the initial ""
    // means an element was never closed.
    if ctx.parent_prefixes.len() > 1 {
        return Err(Error::UnclosedRootNode);
    }

    doc.nodes.shrink_to_fit();
    doc.attributes.shrink_to_fit();
    doc.namespaces.shrink_to_fit();
    Ok(doc)
}
impl<'input> tokenizer::XmlEvents<'input> for Context<'input> {
    /// Dispatches a single tokenizer event into the document tree.
    #[inline(always)]
    fn token(&mut self, token: tokenizer::Token<'input>) -> Result<()> {
        match token {
            tokenizer::Token::ProcessingInstruction(target, value, range) => {
                self.reset_after_text();
                let pi = NodeKind::PI(PI { target, value });
                self.append_node(pi, range)?;
            }
            tokenizer::Token::Comment(text, range) => {
                self.reset_after_text();
                self.append_node(NodeKind::Comment(StringStorage::Borrowed(text)), range)?;
            }
            tokenizer::Token::EntityDeclaration(name, definition) => {
                self.entities.push(Entity {
                    name,
                    value: definition,
                });
            }
            tokenizer::Token::ElementStart(prefix, local, start) => {
                self.reset_after_text();
                // `xmlns` is reserved and must not be used as an element prefix.
                if prefix == XMLNS {
                    let pos = self.doc.text_pos_at(start + 1);
                    return Err(Error::InvalidElementNamePrefix(pos));
                }

                // Remember the tag name; attribute and element-end events follow.
                self.tag_name = TagNameSpan {
                    prefix,
                    name: local,
                    pos: start,
                    prefix_pos: start + 1,
                };
            }
            tokenizer::Token::Attribute(range, qname_len, eq_len, prefix, local, value) => {
                process_attribute(range, qname_len, eq_len, prefix, local, value, self)?;
            }
            tokenizer::Token::ElementEnd(end, range) => {
                self.reset_after_text();
                process_element(end, range, self)?;
            }
            tokenizer::Token::Text(text, range) => {
                process_text(text, range, self)?;
            }
            tokenizer::Token::Cdata(text, range) => {
                process_cdata(text, range, self)?;
            }
        }

        Ok(())
    }

    /// Forwards external entity lookups to the user-supplied resolver, if any.
    fn resolve_entity(
        &mut self,
        pub_id: Option<&str>,
        uri: &str,
    ) -> core::result::Result<Option<&'input str>, String> {
        match &mut self.opt.entity_resolver {
            Some(entity_resolver) => entity_resolver(pub_id, uri),
            None => Ok(None),
        }
    }
}
/// Handles one attribute token: namespace declarations (`xmlns:*`, `xmlns`)
/// go straight into the namespace table, everything else is buffered in
/// `current_attributes` until the element end is reached.
#[allow(clippy::too_many_arguments)]
fn process_attribute<'input>(
    range: Range<usize>,
    qname_len: u16,
    eq_len: u8,
    prefix: &'input str,
    local: &'input str,
    value: StrSpan<'input>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    let value = normalize_attribute(value, ctx)?;

    if prefix == XMLNS {
        // `xmlns:local="uri"` — a prefixed namespace declaration.
        // The `xmlns` URI itself must never be declared.
        if value.as_str() == NS_XMLNS_URI {
            let pos = ctx.doc.text_pos_at(range.start);
            return Err(Error::UnexpectedXmlnsUri(pos));
        }

        let is_xml_ns_uri = value.as_str() == NS_XML_URI;
        if local == NS_XML_PREFIX {
            // The `xml` prefix may only be bound to the canonical XML URI...
            if !is_xml_ns_uri {
                let pos = ctx.doc.text_pos_at(range.start);
                return Err(Error::InvalidXmlPrefixUri(pos));
            }
        } else {
            // ...and no other prefix may be bound to that URI.
            if is_xml_ns_uri {
                let pos = ctx.doc.text_pos_at(range.start);
                return Err(Error::UnexpectedXmlUri(pos));
            }
        }

        // Only one declaration per prefix is allowed on a single element.
        if ctx
            .doc
            .namespaces
            .exists(ctx.namespace_start_idx, Some(local))
        {
            let pos = ctx.doc.text_pos_at(range.start);
            return Err(Error::DuplicatedNamespace(local.to_string(), pos));
        }

        // The `xml` namespace is predefined, so re-declaring it is a no-op.
        if !is_xml_ns_uri {
            ctx.doc.namespaces.push_ns(Some(local), value)?;
        }
    } else if local == XMLNS {
        // `xmlns="uri"` — the default namespace; the reserved `xml`/`xmlns`
        // URIs are not allowed here.
        if value.as_str() == NS_XML_URI {
            let pos = ctx.doc.text_pos_at(range.start);
            return Err(Error::UnexpectedXmlUri(pos));
        }
        if value.as_str() == NS_XMLNS_URI {
            let pos = ctx.doc.text_pos_at(range.start);
            return Err(Error::UnexpectedXmlnsUri(pos));
        }

        ctx.doc.namespaces.push_ns(None, value)?;
    } else {
        // An ordinary attribute; namespace resolution happens later in
        // `resolve_attributes`, once all declarations on this tag are known.
        #[cfg(not(feature = "positions"))]
        let _ = (qname_len, eq_len);

        ctx.current_attributes.push(TempAttributeData {
            prefix,
            local,
            value,
            range,
            #[cfg(feature = "positions")]
            qname_len,
            #[cfg(feature = "positions")]
            eq_len,
        });
    }

    Ok(())
}
/// Handles an element-end token: creates element nodes for `<e/>` and `<e>`,
/// and pops the current parent for `</e>` after validating the tag name.
fn process_element<'input>(
    end_token: tokenizer::ElementEnd<'input>,
    token_range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    if ctx.tag_name.name.is_empty() {
        // No `ElementStart` preceded this token; per the error variant this
        // is a stray close tag produced during entity expansion.
        if let tokenizer::ElementEnd::Close(..) = end_token {
            return Err(Error::UnexpectedEntityCloseTag(
                ctx.doc.text_pos_at(token_range.start),
            ));
        } else {
            unreachable!("should be already checked by the tokenizer");
        }
    }

    let namespaces = ctx.resolve_namespaces();
    ctx.namespace_start_idx = ctx.doc.namespaces.tree_order.len();
    let attributes = resolve_attributes(namespaces, ctx)?;

    match end_token {
        tokenizer::ElementEnd::Empty => {
            // `<e/>`: append and immediately mark the subtree as finished.
            let tag_ns_idx = get_ns_idx_by_prefix(
                namespaces,
                ctx.tag_name.prefix_pos,
                ctx.tag_name.prefix,
                &ctx.doc,
            )?;
            let new_element_id = ctx.append_node(
                NodeKind::Element {
                    tag_name: ExpandedNameIndexed {
                        namespace_idx: tag_ns_idx,
                        local_name: ctx.tag_name.name,
                    },
                    attributes,
                    namespaces,
                },
                ctx.tag_name.pos..token_range.end,
            )?;
            ctx.awaiting_subtree.push(new_element_id);
        }
        tokenizer::ElementEnd::Close(prefix, local) => {
            // `</e>`: must match the currently open element's qualified name.
            let parent_node = &mut ctx.doc.nodes[ctx.parent_id.get_usize()];
            let parent_prefix = *ctx.parent_prefixes.last().unwrap();

            #[cfg(feature = "positions")]
            {
                parent_node.range.end = token_range.end;
            }

            if let NodeKind::Element { ref tag_name, .. } = parent_node.kind {
                if prefix != parent_prefix || local != tag_name.local_name {
                    return Err(Error::UnexpectedCloseTag(
                        gen_qname_string(parent_prefix, tag_name.local_name),
                        gen_qname_string(prefix, local),
                        ctx.doc.text_pos_at(token_range.start),
                    ));
                }
            }

            ctx.awaiting_subtree.push(ctx.parent_id);

            if let Some(id) = parent_node.parent {
                ctx.parent_id = id;
                ctx.parent_prefixes.pop();
                debug_assert!(!ctx.parent_prefixes.is_empty());
            } else {
                // Attempt to close the implicit root node: a stray close tag
                // from entity expansion.
                return Err(Error::UnexpectedEntityCloseTag(
                    ctx.doc.text_pos_at(token_range.start),
                ));
            }
        }
        tokenizer::ElementEnd::Open => {
            // `<e>`: the new element becomes the current parent.
            let tag_ns_idx = get_ns_idx_by_prefix(
                namespaces,
                ctx.tag_name.prefix_pos,
                ctx.tag_name.prefix,
                &ctx.doc,
            )?;
            ctx.parent_id = ctx.append_node(
                NodeKind::Element {
                    tag_name: ExpandedNameIndexed {
                        namespace_idx: tag_ns_idx,
                        local_name: ctx.tag_name.name,
                    },
                    attributes,
                    namespaces,
                },
                ctx.tag_name.pos..token_range.end,
            )?;
            ctx.parent_prefixes.push(ctx.tag_name.prefix);
        }
    }

    Ok(())
}
impl Context<'_> {
    /// Computes the namespace range for the element being finished: its own
    /// declarations plus references to the parent's namespaces that it did
    /// not shadow.
    fn resolve_namespaces(&mut self) -> ShortRange {
        if let NodeKind::Element { ref namespaces, .. } =
            self.doc.nodes[self.parent_id.get_usize()].kind
        {
            let parent_ns = *namespaces;
            // No new declarations on this element: reuse the parent's range.
            if self.namespace_start_idx == self.doc.namespaces.tree_order.len() {
                return parent_ns;
            }

            // Re-reference each parent namespace whose name this element
            // didn't redeclare.
            for i in parent_ns.to_urange() {
                if !self.doc.namespaces.exists(
                    self.namespace_start_idx,
                    self.doc
                        .namespaces
                        .get(self.doc.namespaces.tree_order[i])
                        .name,
                ) {
                    self.doc.namespaces.push_ref(i);
                }
            }
        }

        (self.namespace_start_idx..self.doc.namespaces.tree_order.len()).into()
    }
}
/// Moves the buffered attributes into the document, resolving each one's
/// namespace prefix and rejecting duplicate expanded names. Returns the range
/// of the appended attributes.
fn resolve_attributes(namespaces: ShortRange, ctx: &mut Context) -> Result<ShortRange> {
    if ctx.current_attributes.is_empty() {
        return Ok(ShortRange::new(0, 0));
    }

    // Attribute indices must fit in u32.
    if ctx.doc.attributes.len() + ctx.current_attributes.len() >= u32::MAX as usize {
        return Err(Error::AttributesLimitReached);
    }

    let start_idx = ctx.doc.attributes.len();

    for attr in ctx.current_attributes.drain(..) {
        let namespace_idx = if attr.prefix == NS_XML_PREFIX {
            // The `xml` prefix is predefined at namespace index 0 (see `parse`).
            Some(NamespaceIdx(0))
        } else if attr.prefix.is_empty() {
            // An unprefixed attribute has no namespace; the default namespace
            // does not apply to attributes.
            None
        } else {
            get_ns_idx_by_prefix(namespaces, attr.range.start, attr.prefix, &ctx.doc)?
        };

        let attr_name = ExpandedNameIndexed {
            namespace_idx,
            local_name: attr.local,
        };

        // Two attributes on one element must not share an expanded name.
        if ctx.doc.attributes[start_idx..].iter().any(|attr| {
            attr.name.as_expanded_name(&ctx.doc) == attr_name.as_expanded_name(&ctx.doc)
        }) {
            let pos = ctx.doc.text_pos_at(attr.range.start);
            return Err(Error::DuplicatedAttribute(attr.local.to_string(), pos));
        }

        ctx.doc.attributes.push(AttributeData {
            name: attr_name,
            value: attr.value,
            #[cfg(feature = "positions")]
            range: attr.range,
            #[cfg(feature = "positions")]
            qname_len: attr.qname_len,
            #[cfg(feature = "positions")]
            eq_len: attr.eq_len,
        });
    }

    Ok((start_idx..ctx.doc.attributes.len()).into())
}
/// Handles a text token. The fast path borrows the input unchanged; the slow
/// path expands references and normalizes `\r`/`\r\n` line endings.
fn process_text<'input>(
    text: &'input str,
    range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    // Fast path: no entity references and no carriage returns to normalize.
    if memchr2(b'&', b'\r', text.as_bytes()).is_none() {
        ctx.append_text(Cow::Borrowed(text), range)?;
        return Ok(());
    }

    let mut text_buffer = TextBuffer::new();
    // `is_as_is` marks bytes that came from a character reference at the top
    // level (depth 0); those are pushed raw, bypassing newline normalization.
    let mut is_as_is = false;
    let mut stream = Stream::from_substr(ctx.doc.text, range.clone());
    while !stream.at_end() {
        match parse_next_chunk(&mut stream, &ctx.entities)? {
            NextChunk::Byte(c) => {
                if is_as_is {
                    text_buffer.push_raw(c);
                    is_as_is = false;
                } else {
                    text_buffer.push_from_text(c, stream.at_end());
                }
            }
            NextChunk::Char(c) => {
                for b in CharToBytes::new(c) {
                    if ctx.loop_detector.depth > 0 {
                        // Inside an entity expansion: treat as regular text.
                        text_buffer.push_from_text(b, stream.at_end());
                    } else {
                        text_buffer.push_raw(b);
                        is_as_is = true;
                    }
                }
            }
            NextChunk::Text(fragment) => {
                // An entity reference: flush what we have, then recursively
                // tokenize the entity's replacement text with loop protection.
                is_as_is = false;

                if !text_buffer.is_empty() {
                    ctx.append_text(Cow::Owned(text_buffer.finish()), range.clone())?;
                }

                ctx.loop_detector.inc_references(&stream)?;
                ctx.loop_detector.inc_depth(&stream)?;

                // NOTE(review): a zero start offset appears to distinguish
                // resolver-provided (external) replacement text from a span of
                // the original document — confirm against the tokenizer.
                let text = if fragment.range().start == 0 {
                    fragment.as_str()
                } else {
                    ctx.doc.text
                };
                let mut stream = Stream::from_substr(text, fragment.range());
                let prev_tag_name = ctx.tag_name;
                ctx.tag_name = TagNameSpan::new_null();
                tokenizer::parse_content(&mut stream, ctx)?;
                ctx.tag_name = prev_tag_name;
                text_buffer.clear();
                ctx.loop_detector.dec_depth();
            }
        }
    }

    if !text_buffer.is_empty() {
        ctx.append_text(Cow::Owned(text_buffer.finish()), range)?;
    }

    Ok(())
}
/// Handles a CDATA token, rewriting `\r\n` pairs and lone `\r` bytes to `\n`.
/// When no `\r` is present the input is borrowed as-is.
fn process_cdata<'input>(
    mut text: &'input str,
    range: Range<usize>,
    ctx: &mut Context<'input>,
) -> Result<()> {
    // Fast path: nothing to normalize.
    if memchr(b'\r', text.as_bytes()).is_none() {
        ctx.append_text(Cow::Borrowed(text), range)?;
        return Ok(());
    }

    let mut normalized = String::new();
    while let Some(cr_at) = memchr(b'\r', text.as_bytes()) {
        normalized.push_str(&text[..cr_at]);
        normalized.push('\n');
        let tail = &text[cr_at..];
        // Skip a "\r\n" pair as a whole, otherwise just the lone '\r'.
        text = if tail.as_bytes().get(1) == Some(&b'\n') {
            &tail[2..]
        } else {
            &tail[1..]
        };
    }
    normalized.push_str(text);

    ctx.append_text(Cow::Owned(normalized), range)?;
    Ok(())
}
/// One unit of text content produced by `parse_next_chunk`.
enum NextChunk<'a> {
    /// A plain byte copied from the input.
    Byte(u8),
    /// A character produced by a character reference (`&#...;`).
    Char(char),
    /// The replacement text of an entity reference.
    Text(StrSpan<'a>),
}
fn parse_next_chunk<'a>(stream: &mut Stream<'a>, entities: &[Entity<'a>]) -> Result<NextChunk<'a>> {
debug_assert!(!stream.at_end());
let c = stream.curr_byte_unchecked();
if c == b'&' {
let start = stream.pos();
match stream.consume_reference() {
Some(Reference::Char(ch)) => Ok(NextChunk::Char(ch)),
Some(Reference::Entity(name)) => entities
.iter()
.find(|e| e.name == name)
.map(|e| NextChunk::Text(e.value))
.ok_or_else(|| {
let pos = stream.gen_text_pos_from(start);
Error::UnknownEntityReference(name.into(), pos)
}),
None => {
let pos = stream.gen_text_pos_from(start);
Err(Error::MalformedEntityReference(pos))
}
}
} else {
stream.advance(1);
Ok(NextChunk::Byte(c))
}
}
/// Normalizes an attribute value. Values without references or whitespace
/// controls are borrowed as-is; everything else goes through
/// `_normalize_attribute` into a fresh buffer.
fn normalize_attribute<'input>(
    text: StrSpan<'input>,
    ctx: &mut Context<'input>,
) -> Result<StringStorage<'input>> {
    let bytes = text.as_str().as_bytes();
    let needs_normalization =
        memchr2(b'&', b'\t', bytes).is_some() || memchr2(b'\n', b'\r', bytes).is_some();
    if !needs_normalization {
        return Ok(StringStorage::Borrowed(text.as_str()));
    }

    let mut buffer = TextBuffer::new();
    _normalize_attribute(text, &mut buffer, ctx)?;
    Ok(StringStorage::new_owned(&buffer.finish()))
}
/// Recursively normalizes an attribute value into `buffer`, expanding
/// character and entity references with loop protection.
fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
    let mut stream = Stream::from_substr(ctx.doc.text, text.range());
    while !stream.at_end() {
        let c = stream.curr_byte_unchecked();
        if c != b'&' {
            // Plain byte: whitespace-normalize it (see `TextBuffer::push_from_attr`).
            stream.advance(1);
            buffer.push_from_attr(c, stream.curr_byte().ok());
            continue;
        }

        let start = stream.pos();
        match stream.consume_reference() {
            Some(Reference::Char(ch)) => {
                for b in CharToBytes::new(ch) {
                    if ctx.loop_detector.depth > 0 {
                        // A character reference produced inside an entity
                        // expansion counts as regular text, so `<` is forbidden.
                        if b == b'<' {
                            return Err(Error::InvalidAttributeValue(
                                stream.gen_text_pos_from(start),
                            ));
                        }

                        buffer.push_from_attr(b, None);
                    } else {
                        // A direct character reference is kept verbatim.
                        buffer.push_raw(b);
                    }
                }
            }
            Some(Reference::Entity(name)) => match ctx.entities.iter().find(|e| e.name == name) {
                Some(entity) => {
                    // Expand the entity's value recursively, guarded against loops.
                    ctx.loop_detector.inc_references(&stream)?;
                    ctx.loop_detector.inc_depth(&stream)?;
                    _normalize_attribute(entity.value, buffer, ctx)?;
                    ctx.loop_detector.dec_depth();
                }
                None => {
                    let pos = stream.gen_text_pos_from(start);
                    return Err(Error::UnknownEntityReference(name.into(), pos));
                }
            },
            None => {
                let pos = stream.gen_text_pos_from(start);
                return Err(Error::MalformedEntityReference(pos));
            }
        }
    }

    Ok(())
}
/// Looks up `prefix` among the element's in-scope namespaces.
///
/// An unknown non-empty prefix is an error; an empty prefix with no default
/// namespace simply resolves to `None`.
fn get_ns_idx_by_prefix(
    namespaces: ShortRange,
    prefix_pos: usize,
    prefix: &str,
    doc: &Document<'_>,
) -> Result<Option<NamespaceIdx>> {
    let prefix_opt = if prefix.is_empty() { None } else { Some(prefix) };

    let found = doc.namespaces.tree_order[namespaces.to_urange()]
        .iter()
        .find(|idx| doc.namespaces.get(**idx).name == prefix_opt)
        .copied();

    if let Some(idx) = found {
        return Ok(Some(idx));
    }

    if prefix.is_empty() {
        // An empty prefix with no default namespace is perfectly valid.
        Ok(None)
    } else {
        let pos = doc.text_pos_at(prefix_pos);
        Err(Error::UnknownNamespace(prefix.to_string(), pos))
    }
}
/// Renders a qualified name as `prefix:local`, or just `local` when there is
/// no prefix.
fn gen_qname_string(prefix: &str, local: &str) -> String {
    if prefix.is_empty() {
        return local.to_string();
    }
    let mut qname = String::with_capacity(prefix.len() + 1 + local.len());
    qname.push_str(prefix);
    qname.push(':');
    qname.push_str(local);
    qname
}
/// Iterates over the UTF-8 bytes of a single `char`.
///
/// Unused buffer slots are pre-filled with `0xFF`, a byte that never occurs
/// in valid UTF-8, so it doubles as the end-of-sequence sentinel.
struct CharToBytes {
    buf: [u8; 4],
    idx: u8,
}

impl CharToBytes {
    #[inline]
    fn new(c: char) -> Self {
        let mut buf = [0xFF; 4];
        c.encode_utf8(&mut buf);
        CharToBytes { buf, idx: 0 }
    }
}

impl Iterator for CharToBytes {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<u8> {
        let b = *self.buf.get(self.idx as usize)?;
        if b == 0xFF {
            // Hit the sentinel: the character had fewer than 4 bytes.
            self.idx = 4;
            return None;
        }
        self.idx += 1;
        Some(b)
    }
}
/// A byte buffer that performs newline/whitespace normalization while
/// collecting text and attribute content.
struct TextBuffer {
    buffer: Vec<u8>,
}

impl TextBuffer {
    #[inline]
    fn new() -> Self {
        TextBuffer {
            buffer: Vec::with_capacity(32),
        }
    }

    /// Appends a byte with no normalization at all.
    #[inline]
    fn push_raw(&mut self, c: u8) {
        self.buffer.push(c);
    }

    /// Appends an attribute-value byte: a `\r\n` pair collapses into a single
    /// space, and `\n`/`\r`/`\t` each become a space.
    fn push_from_attr(&mut self, current: u8, next: Option<u8>) {
        // Skip the '\r' of a "\r\n" pair; the '\n' that follows produces the space.
        if current == b'\r' && next == Some(b'\n') {
            return;
        }

        let normalized = if matches!(current, b'\n' | b'\r' | b'\t') {
            b' '
        } else {
            current
        };
        self.buffer.push(normalized);
    }

    /// Appends a text-content byte, rewriting "\r\n" and lone "\r" to "\n".
    fn push_from_text(&mut self, c: u8, at_end: bool) {
        let pending_cr = self.buffer.last() == Some(&b'\r');
        if pending_cr {
            // Resolve the previously buffered '\r' to '\n'.
            let last_idx = self.buffer.len() - 1;
            self.buffer[last_idx] = b'\n';

            if at_end && c == b'\r' {
                // The stream ends with another '\r': it becomes '\n' too.
                self.buffer.push(b'\n');
            } else if c != b'\n' {
                // A '\n' right after '\r' is swallowed (the pair already
                // produced one '\n'); any other byte is kept.
                self.buffer.push(c);
            }
        } else if at_end && c == b'\r' {
            self.buffer.push(b'\n');
        } else {
            self.buffer.push(c);
        }
    }

    #[inline]
    fn clear(&mut self) {
        self.buffer.clear();
    }

    #[inline]
    fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Consumes the accumulated bytes and returns them as a `String`.
    #[inline]
    fn finish(&mut self) -> String {
        // Normalization only ever replaces ASCII bytes with ASCII bytes,
        // so UTF-8 validity of the collected input is preserved.
        String::from_utf8(take(&mut self.buffer)).unwrap()
    }
}