use crate::bitstream::{self, TransposeFn};
use crate::classify::{self, CharClassMasks};
use crate::state::{DoctypeSubState, ParserState, QuoteStyle};
use crate::types::{is_xml_whitespace, Error, ErrorKind, ParseError, Span};
use crate::visitor::Visitor;
/// Maximum length, in bytes, accepted for a markup name. DOCTYPE names longer
/// than this are rejected with `ErrorKind::NameTooLong`; the inline fast paths
/// decline to handle names above this limit.
const MAX_NAME_LENGTH: usize = 1_000;
/// NOTE(review): presumably the maximum byte length of a character reference
/// value; it is not referenced in the visible part of this file - confirm.
const MAX_CHAR_REF_LENGTH: usize = 7;
/// Parser state that is carried from one `Reader::parse` call to the next.
///
/// All `usize` positions are relative to the buffer most recently passed to
/// `parse`; `parse` shifts them down by the number of bytes it consumed.
pub struct Reader {
/// Current state of the markup state machine.
state: ParserState,
/// Transpose routine chosen by `bitstream::select_transpose`; handed to
/// `bitstream::transpose_block` for every block that is classified.
transpose: TransposeFn,
/// Buffer position where the construct currently being recognised began
/// (a `<`, a `&`, or the first byte of an attribute name). While this is set,
/// `parse` consumes only up to this position.
markup_start: Option<usize>,
/// Buffer position where the pending, not yet reported, text run begins.
text_start: Option<usize>,
/// Buffer position at which scanning continues.
resume_pos: usize,
/// Number of consecutive `]` bytes seen at the end of text content
/// (stored capped at 2); used to detect `]]>` in content.
content_bracket_count: u8,
/// Buffer position where the pending body of a comment, CDATA section,
/// processing instruction, DOCTYPE, or attribute value begins.
content_start: Option<usize>,
/// Stream offset of the start of the markup construct being parsed once
/// `markup_start` has been cleared; reported as the offset of an
/// `UnexpectedEof` error at end of input.
markup_stream_offset: Option<u64>,
/// Set by `finish_markup`, cleared by `reset`. It is not read in the visible
/// part of this file.
had_markup: bool,
/// NOTE(review): the four fields below are only initialised and reset in the
/// visible code; presumably they buffer an XML declaration - confirm.
in_xml_decl: bool,
xml_decl_buf: [u8; 256],
xml_decl_buf_len: usize,
xml_decl_span_start: u64,
}
impl Reader {
/// Creates a reader in the `Content` state with no pending markup, text,
/// content body, or XML-declaration data.
pub fn new() -> Self {
    // Select the transpose routine once; it is stored and reused afterwards.
    let transpose = bitstream::select_transpose();
    Self {
        transpose,
        state: ParserState::Content,
        // Nothing is pending at the start of a document.
        markup_start: None,
        text_start: None,
        content_start: None,
        markup_stream_offset: None,
        resume_pos: 0,
        content_bracket_count: 0,
        had_markup: false,
        // XML-declaration scratch space starts out empty.
        in_xml_decl: false,
        xml_decl_buf: [0; 256],
        xml_decl_buf_len: 0,
        xml_decl_span_start: 0,
    }
}
/// Returns the reader to the state produced by [`Reader::new`] so that it
/// can be reused for another document.
///
/// The selected transpose routine is kept. The bytes of `xml_decl_buf` are
/// not cleared; only its length is set back to zero.
pub fn reset(&mut self) {
    // Clears `state`, `markup_start`, `text_start`, `markup_stream_offset`
    // and `content_start`. It also raises `had_markup`, which is lowered
    // again below, so this call must come first.
    self.finish_content_body();
    self.resume_pos = 0;
    self.content_bracket_count = 0;
    self.had_markup = false;
    self.in_xml_decl = false;
    self.xml_decl_buf_len = 0;
    // Do not leave the previous document's declaration offset behind.
    self.xml_decl_span_start = 0;
}
/// Marks the current markup construct as complete: the reader goes back to
/// the `Content` state, every pending markup/text position is dropped, and
/// `had_markup` is raised.
#[inline(always)]
fn finish_markup(&mut self) {
    self.had_markup = true;
    self.markup_stream_offset = None;
    self.markup_start = None;
    self.text_start = None;
    self.state = ParserState::Content;
}
/// Like `finish_markup`, but for constructs that carry a body: the pending
/// `content_start` position is dropped as well.
#[inline(always)]
fn finish_content_body(&mut self) {
    self.content_start = None;
    self.finish_markup();
}
/// Fast path for simple markup found by the byte scan in `parse`.
///
/// Tries to handle the delimiter at `buf[delim_pos]` with the inline tag or
/// entity-reference recognisers. When that succeeds it keeps peeking at the
/// next few bytes and handles further simple constructs the same way, so
/// runs such as `<a>text</a>` never reach the block-based scanner.
///
/// Returns `Ok(None)` when the first delimiter cannot be handled inline
/// (nothing has been reported in that case). Otherwise returns
/// `Some((resume, block))`: the buffer position where scanning must resume
/// and the start of the 64-byte block containing it. On return
/// `self.text_start` marks the start of the text run that is still pending.
///
/// # Errors
/// Propagates errors returned by the visitor callbacks.
#[inline(never)]
fn try_inline_with_peek<V: Visitor>(
    &mut self,
    buf: &[u8],
    delim_pos: usize,
    stream_offset: u64,
    visitor: &mut V,
) -> Result<Option<(usize, usize)>, ParseError<V::Error>> {
    /// How many bytes past the last construct are inspected for another one.
    const PEEK_WINDOW: usize = 16;

    let first_pos = match buf[delim_pos] {
        b'<' => self.try_inline_tag(buf, delim_pos, stream_offset, visitor)?,
        b'&' => self.try_inline_ref(buf, delim_pos, stream_offset, visitor)?,
        _ => None,
    };
    let Some(mut pos) = first_pos else {
        return Ok(None);
    };

    loop {
        // Everything from `pos` up to the next delimiter is text. Record it
        // *before* trying the next construct: the inline recognisers flush
        // `text_start` on success and leave it untouched on failure, so the
        // text is either reported right before the construct or stays
        // pending for the caller. (Recording it only on failure silently
        // dropped the text between two inline-handled constructs.)
        self.text_start = Some(pos);

        let limit = (pos + PEEK_WINDOW).min(buf.len());
        let next_delim = buf[pos..limit]
            .iter()
            .position(|&ch| matches!(ch, b'<' | b'&' | b']'))
            .map(|rel| pos + rel);
        let Some(delim) = next_delim else {
            // No delimiter inside the window: resume the normal scan there.
            return Ok(Some((limit, limit / 64 * 64)));
        };

        let handled = match buf[delim] {
            b'<' => self.try_inline_tag(buf, delim, stream_offset, visitor)?,
            b'&' => self.try_inline_ref(buf, delim, stream_offset, visitor)?,
            // `]` needs the `]]>` tracking of the block scanner.
            _ => None,
        };
        match handled {
            Some(next) => pos = next,
            // Let the block scanner deal with this delimiter.
            None => return Ok(Some((delim, delim / 64 * 64))),
        }
    }
}
/// Dispatches the `<` at `buf[lt_pos]` to the inline end-tag or start-tag
/// recogniser, based on the byte that follows it.
///
/// Returns `Ok(None)` when `<` is the last byte of `buf` or is followed by
/// anything other than `/` or a name-start byte.
#[inline]
fn try_inline_tag<V: Visitor>(
    &mut self,
    buf: &[u8],
    lt_pos: usize,
    stream_offset: u64,
    visitor: &mut V,
) -> Result<Option<usize>, ParseError<V::Error>> {
    let Some(&lead) = buf.get(lt_pos + 1) else {
        return Ok(None);
    };
    match lead {
        b'/' => self.try_inline_end_tag(buf, lt_pos, stream_offset, visitor),
        _ if is_name_start_byte(lead) => {
            self.try_inline_start_tag(buf, lt_pos, stream_offset, visitor)
        }
        _ => Ok(None),
    }
}
/// Recognises an end tag of the exact form `</name>` starting at
/// `buf[lt_pos]`.
///
/// On success the pending text before the tag is flushed, `end_tag` is
/// reported with the span of the name, and the position just past `>` is
/// returned. Returns `Ok(None)`, without reporting anything, when the tag
/// is incomplete, the name is longer than `MAX_NAME_LENGTH`, or the name is
/// not immediately followed by `>`.
#[inline]
fn try_inline_end_tag<V: Visitor>(
    &mut self,
    buf: &[u8],
    lt_pos: usize,
    stream_offset: u64,
    visitor: &mut V,
) -> Result<Option<usize>, ParseError<V::Error>> {
    let name_start = lt_pos + 2;
    match buf.get(name_start) {
        Some(&first) if is_name_start_byte(first) => {}
        _ => return Ok(None),
    }

    // Find the first byte after the name; running off the buffer means the
    // tag is not complete yet.
    let rest_from = name_start + 1;
    let Some(rel) = buf[rest_from..].iter().position(|&c| !is_name_byte(c)) else {
        return Ok(None);
    };
    let name_end = rest_from + rel;

    if name_end - name_start > MAX_NAME_LENGTH || buf[name_end] != b'>' {
        return Ok(None);
    }

    self.flush_text_before(buf, lt_pos, stream_offset, visitor)?;
    let name_span = Span::new(
        stream_offset + name_start as u64,
        stream_offset + name_end as u64,
    );
    visitor
        .end_tag(&buf[name_start..name_end], name_span)
        .map_err(ParseError::Visitor)?;
    Ok(Some(name_end + 1))
}
/// Recognises an attribute-less start tag (`<name>`) or empty-element tag
/// (`<name/>`) starting at `buf[lt_pos]`. The caller has already checked
/// that the byte after `<` is a name-start byte.
///
/// On success the pending text is flushed, `start_tag_open` is reported,
/// followed by `start_tag_close` or `empty_element_end`, and the position
/// just past `>` is returned. Returns `Ok(None)`, without reporting
/// anything, for every other shape (incomplete tag, over-long name,
/// whitespace or attributes after the name).
#[inline]
fn try_inline_start_tag<V: Visitor>(
    &mut self,
    buf: &[u8],
    lt_pos: usize,
    stream_offset: u64,
    visitor: &mut V,
) -> Result<Option<usize>, ParseError<V::Error>> {
    let name_start = lt_pos + 1;
    let rest_from = name_start + 1;
    let name_terminator = buf
        .get(rest_from..)
        .and_then(|tail| tail.iter().position(|&c| !is_name_byte(c)));
    let Some(rel) = name_terminator else {
        return Ok(None);
    };
    let name_end = rest_from + rel;
    if name_end - name_start > MAX_NAME_LENGTH {
        return Ok(None);
    }

    // `gt_pos` is where the closing `>` sits; it differs from `name_end`
    // only for the `/>` form.
    let gt_pos = match buf[name_end] {
        b'>' => name_end,
        b'/' if buf.get(name_end + 1) == Some(&b'>') => name_end + 1,
        _ => return Ok(None),
    };
    let is_empty_element = gt_pos != name_end;

    self.flush_text_before(buf, lt_pos, stream_offset, visitor)?;
    let name_span = Span::new(
        stream_offset + name_start as u64,
        stream_offset + name_end as u64,
    );
    visitor
        .start_tag_open(&buf[name_start..name_end], name_span)
        .map_err(ParseError::Visitor)?;

    let close_span = Span::new(
        stream_offset + name_end as u64,
        stream_offset + gt_pos as u64 + 1,
    );
    if is_empty_element {
        visitor
            .empty_element_end(close_span)
            .map_err(ParseError::Visitor)?;
    } else {
        visitor
            .start_tag_close(close_span)
            .map_err(ParseError::Visitor)?;
    }
    Ok(Some(gt_pos + 1))
}
/// Recognises a named entity reference `&name;` starting at `buf[amp_pos]`.
///
/// On success the pending text is flushed, `entity_ref` is reported with the
/// span of the name, and the position just past `;` is returned. Character
/// references (`&#...`), incomplete references, and over-long names yield
/// `Ok(None)` without reporting anything.
#[inline]
fn try_inline_ref<V: Visitor>(
    &mut self,
    buf: &[u8],
    amp_pos: usize,
    stream_offset: u64,
    visitor: &mut V,
) -> Result<Option<usize>, ParseError<V::Error>> {
    let name_start = amp_pos + 1;
    match buf.get(name_start) {
        Some(&first) if first != b'#' && is_name_start_byte(first) => {}
        _ => return Ok(None),
    }

    let rest_from = name_start + 1;
    let name_end = rest_from
        + buf[rest_from..]
            .iter()
            .take_while(|&&c| is_name_byte(c))
            .count();
    // The name must be terminated by `;` inside this buffer.
    if buf.get(name_end) != Some(&b';') {
        return Ok(None);
    }
    if name_end - name_start > MAX_NAME_LENGTH {
        return Ok(None);
    }

    self.flush_text_before(buf, amp_pos, stream_offset, visitor)?;
    let name_span = Span::new(
        stream_offset + name_start as u64,
        stream_offset + name_end as u64,
    );
    visitor
        .entity_ref(&buf[name_start..name_end], name_span)
        .map_err(ParseError::Visitor)?;
    Ok(Some(name_end + 1))
}
/// Reports the pending text run, if any, that ends at `end_pos`.
///
/// `self.text_start` is always cleared. `characters` is only called when the
/// run is non-empty, i.e. when the recorded start lies before `end_pos`.
#[inline]
fn flush_text_before<V: Visitor>(
    &mut self,
    buf: &[u8],
    end_pos: usize,
    stream_offset: u64,
    visitor: &mut V,
) -> Result<(), ParseError<V::Error>> {
    let Some(start) = self.text_start.take() else {
        return Ok(());
    };
    if start >= end_pos {
        return Ok(());
    }
    let span = Span::new(
        stream_offset + start as u64,
        stream_offset + end_pos as u64,
    );
    visitor
        .characters(&buf[start..end_pos], span)
        .map_err(ParseError::Visitor)
}
#[inline(always)]
fn handle_empty_element_slash<V: Visitor>(
&mut self,
buf: &[u8],
abs: usize,
block_rel_pos: usize,
stream_offset: u64,
visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
let gt_pos = abs + 1;
if gt_pos < buf.len() {
if buf[gt_pos] == b'>' {
let span = Span::new(
stream_offset + abs as u64,
stream_offset + gt_pos as u64 + 1,
);
visitor
.empty_element_end(span)
.map_err(ParseError::Visitor)?;
self.finish_markup();
Ok(block_rel_pos + 2)
} else {
Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(buf[gt_pos]),
offset: stream_offset + gt_pos as u64,
}))
}
} else {
self.state = ParserState::StartTagGotSlash;
Ok(block_rel_pos + 1)
}
}
/// Parses `buf` as one complete input: equivalent to calling
/// [`Reader::parse`] with a stream offset of zero and `is_final` set.
///
/// Returns the number of bytes consumed, or the first error encountered.
pub fn parse_slice<V: Visitor>(
    &mut self,
    buf: &[u8],
    visitor: &mut V,
) -> Result<u64, ParseError<V::Error>> {
    let stream_offset: u64 = 0;
    let is_final = true;
    self.parse(buf, stream_offset, is_final, visitor)
}
/// Parses one buffer of input and reports events to `visitor`.
///
/// * `buf` - the bytes to parse.
/// * `stream_offset` - added to every buffer position to form the spans and
///   error offsets that are reported.
/// * `is_final` - when set, markup that is still unfinished at the end of
///   `buf` is an `UnexpectedEof` error instead of being carried over.
///
/// Returns the number of bytes of `buf` that were consumed. All positions
/// kept in the reader are shifted down by that amount before returning.
/// NOTE(review): this implies the caller must present the unconsumed tail at
/// the start of the next buffer - confirm against the callers.
///
/// # Errors
/// `UnexpectedEof` for unfinished markup on the final buffer, `InvalidUtf8`
/// when `utf8_boundary_rewind` rejects the end of a non-final buffer, any
/// error produced while processing blocks, and any visitor error.
pub fn parse<V: Visitor>(
&mut self,
buf: &[u8],
stream_offset: u64,
is_final: bool,
visitor: &mut V,
) -> Result<u64, ParseError<V::Error>> {
// An empty buffer is only an error when it ends the input in mid-markup.
if buf.is_empty() {
if is_final && self.state != ParserState::Content {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedEof,
offset: stream_offset,
}));
}
return Ok(0);
}
// Work proceeds in 64-byte blocks; start with the block holding `resume_pos`.
let first_block = (self.resume_pos / 64) * 64;
let mut block_offset = first_block;
while block_offset < buf.len() {
// Fast path for plain text: skip ahead to the next `<`, `&` or `]` with a
// byte search instead of classifying every block in between.
if matches!(self.state, ParserState::Content) && self.content_bracket_count == 0 {
let scan_start = if block_offset <= self.resume_pos {
self.resume_pos
} else {
block_offset
};
if scan_start < buf.len() {
if let Some(rel) = memchr::memchr3(b'<', b'&', b']', &buf[scan_start..]) {
let delim_pos = scan_start + rel;
let delim_block = delim_pos / 64 * 64;
// Only worthwhile when the delimiter lies in a later block; otherwise the
// current block is classified as usual below.
if delim_block > block_offset {
if self.text_start.is_none() {
self.text_start = Some(scan_start);
}
// Try to handle simple tags / entity references without the block scanner.
if let Some((resume, block)) = self.try_inline_with_peek(
buf, delim_pos, stream_offset, visitor,
)? {
self.resume_pos = resume;
block_offset = block;
continue;
}
// Not handled inline: jump to the delimiter's block and scan it normally.
block_offset = delim_block;
self.resume_pos = block_offset;
continue;
}
} else {
// No delimiter in the rest of the buffer: it is all text.
if self.text_start.is_none() {
self.text_start = Some(scan_start);
}
self.resume_pos = buf.len();
block_offset = buf.len();
continue;
}
}
}
// Slow path: transpose and classify the block, then run the state machine.
let (bp, block_len) = bitstream::transpose_block(self.transpose, buf, block_offset);
let masks = classify::classify(&bp);
// `resume_pos` may point into this block; bytes before it are already done.
let start_pos = if block_offset <= self.resume_pos {
self.resume_pos - block_offset
} else {
0
};
let final_buf_pos =
self.process_block(buf, block_offset, block_len, start_pos, &masks, stream_offset, visitor)?;
self.resume_pos = final_buf_pos;
block_offset += block_len;
}
// Decide how much of the buffer counts as consumed.
let mut consumed = if let Some(start) = self.markup_start {
// A construct is still being recognised: stop right before it.
if is_final {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedEof,
offset: stream_offset + start as u64,
}));
}
start
} else if is_final {
// `markup_stream_offset` is set while inside a construct whose start has
// already been reported.
if let Some(offset) = self.markup_stream_offset {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedEof,
offset,
}));
}
buf.len()
} else {
// Non-final buffer: leave the bytes reported by `utf8_boundary_rewind`
// unconsumed.
match utf8_boundary_rewind(buf) {
Ok(rewind) => buf.len() - rewind,
Err(offset) => {
return Err(ParseError::Xml(Error {
kind: ErrorKind::InvalidUtf8,
offset: stream_offset + offset as u64,
}));
}
}
};
if !is_final {
// Trailing bytes that may belong to a terminator (`-->`, `]]>`, `?>`) are
// not consumed; the counters are cleared so they are re-scanned next time.
let exclude = match &self.state {
ParserState::CommentContent { dash_count } => *dash_count as usize,
ParserState::CdataContent { bracket_count } => *bracket_count as usize,
ParserState::PIContent { saw_qmark: true } => 1,
_ => 0,
};
if exclude > 0 {
consumed = consumed.saturating_sub(exclude);
match &mut self.state {
ParserState::CommentContent { dash_count } => *dash_count = 0,
ParserState::CdataContent { bracket_count } => *bracket_count = 0,
ParserState::PIContent { saw_qmark } => *saw_qmark = false,
_ => {}
}
self.resume_pos = consumed;
}
}
// Report the pending text run up to the consumed position.
if let Some(text_start) = self.text_start {
if text_start < consumed {
let text = &buf[text_start..consumed];
if !text.is_empty() {
let span = Span::new(
stream_offset + text_start as u64,
stream_offset + consumed as u64,
);
visitor
.characters(text, span)
.map_err(ParseError::Visitor)?;
}
}
if consumed >= buf.len() {
self.text_start = None;
} else {
// Rebase onto the next buffer.
self.text_start = Some(text_start.saturating_sub(consumed));
}
}
// Report the pending part of a content body, using the callback that
// matches the current state.
if let Some(cs) = self.content_start {
if cs < consumed {
let content = &buf[cs..consumed];
if !content.is_empty() {
let span = Span::new(
stream_offset + cs as u64,
stream_offset + consumed as u64,
);
match &self.state {
ParserState::CommentContent { .. } => {
visitor.comment_content(content, span).map_err(ParseError::Visitor)?;
}
ParserState::CdataContent { .. } => {
visitor.cdata_content(content, span).map_err(ParseError::Visitor)?;
}
ParserState::PIContent { .. } => {
self.emit_pi_content(content, span, visitor)?;
}
ParserState::DoctypeContent { .. } => {
visitor.doctype_content(content, span).map_err(ParseError::Visitor)?;
}
ParserState::AttrValue { .. } => {
visitor.attribute_value(content, span).map_err(ParseError::Visitor)?;
}
_ => {}
}
}
}
// Rebase onto the next buffer.
self.content_start = Some(cs.saturating_sub(consumed));
}
// Shift the remaining positions so they are relative to the next buffer.
if consumed > 0 {
self.markup_start = self.markup_start.map(|s| s - consumed);
self.resume_pos = self.resume_pos.saturating_sub(consumed);
self.state.adjust_positions(consumed);
}
Ok(consumed as u64)
}
/// Runs the state machine over one classified block.
///
/// * `block_offset` / `block_len` - position and length of the block in `buf`.
/// * `start_pos` - block-relative position at which to start.
/// * `masks` - character-class bit masks computed for this block.
///
/// Each loop iteration dispatches on the current state and advances `pos`.
/// Returns the buffer position where processing stopped
/// (`block_offset + pos`), which can lie past the end of the block when a
/// handler looked ahead.
///
/// NOTE(review): `find_name_end` and `find_non_whitespace` are defined
/// outside this view; from their use here they appear to return the
/// block-relative and absolute position of the first match at or after
/// `pos`, or `None` when the block has none - confirm.
fn process_block<V: Visitor>(
&mut self,
buf: &[u8],
block_offset: usize,
block_len: usize,
start_pos: usize,
masks: &CharClassMasks,
stream_offset: u64,
visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
let mut pos = start_pos;
while pos < block_len {
match self.state {
ParserState::Content => {
pos = self.scan_content(
buf, block_offset, block_len, pos, masks, stream_offset, visitor,
)?;
}
// A `<` was the last byte available earlier; classify what follows it.
ParserState::AfterLt => {
let abs = block_offset + pos;
let byte = buf[abs];
match byte {
b'/' => {
self.state = ParserState::EndTagName {
name_start: abs + 1,
};
pos += 1;
}
b'?' => {
self.state = ParserState::PITarget {
name_start: abs + 1,
};
pos += 1;
}
b'!' => {
self.state = ParserState::AfterLtBang;
pos += 1;
}
// The name starts at this byte, so `pos` is not advanced.
_ if is_name_start_byte(byte) => {
self.state = ParserState::StartTagName { name_start: abs };
}
_ => {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
}
ParserState::StartTagName { name_start } => {
let Some((next, abs)) =
find_name_end(masks.name_end, pos, block_offset, block_len)
else {
// The name continues into the next block; only enforce the length limit.
check_name_length(
block_offset + block_len,
name_start,
stream_offset,
)?;
pos = block_len;
continue;
};
let name = validate_name(buf, name_start, abs, stream_offset)?;
let name_span = Span::new(
stream_offset + name_start as u64,
stream_offset + abs as u64,
);
visitor
.start_tag_open(name, name_span)
.map_err(ParseError::Visitor)?;
// The tag has been reported, so the buffer may be consumed past its start.
// Remember the tag's stream offset for a possible end-of-input error.
self.markup_stream_offset = Some(stream_offset + self.markup_start.unwrap() as u64);
self.markup_start = None;
let byte = buf[abs];
match byte {
b'>' => {
let span = Span::new(
stream_offset + abs as u64,
stream_offset + abs as u64 + 1,
);
visitor.start_tag_close(span).map_err(ParseError::Visitor)?;
self.finish_markup();
pos = next + 1;
}
b'/' => {
pos = self.handle_empty_element_slash(
buf, abs, next, stream_offset, visitor,
)?;
}
_ => {
self.state = ParserState::StartTagPostName;
pos = next;
}
}
}
// Inside a start tag, between the name / attributes and the closing `>`.
ParserState::StartTagPostName => {
let Some((next, abs)) =
find_non_whitespace(masks.whitespace, pos, block_offset, block_len)
else {
pos = block_len;
continue;
};
let byte = buf[abs];
match byte {
b'>' => {
let span = Span::new(
stream_offset + abs as u64,
stream_offset + abs as u64 + 1,
);
visitor.start_tag_close(span).map_err(ParseError::Visitor)?;
self.finish_markup();
pos = next + 1;
}
b'/' => {
pos = self.handle_empty_element_slash(
buf, abs, next, stream_offset, visitor,
)?;
}
_ if is_name_start_byte(byte) => {
// An attribute name begins; keep the buffer from being consumed past it
// until the name is complete.
self.markup_start = Some(abs);
self.state = ParserState::AttrName { name_start: abs };
pos = next;
}
_ => {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
}
// The `/` of `/>` ended the previous buffer; only `>` may follow.
ParserState::StartTagGotSlash => {
let abs = block_offset + pos;
let byte = buf[abs];
if byte == b'>' {
// The span starts one byte earlier so that it covers the `/` as well.
let close_span = Span::new(
stream_offset + abs as u64 - 1,
stream_offset + abs as u64 + 1,
);
visitor
.empty_element_end(close_span)
.map_err(ParseError::Visitor)?;
self.finish_markup();
pos += 1;
} else {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
ParserState::AttrName { name_start } => {
let Some((next, abs)) =
find_name_end(masks.name_end, pos, block_offset, block_len)
else {
check_name_length(
block_offset + block_len,
name_start,
stream_offset,
)?;
pos = block_len;
continue;
};
let name = validate_name(buf, name_start, abs, stream_offset)?;
let name_span = Span::new(
stream_offset + name_start as u64,
stream_offset + abs as u64,
);
visitor
.attribute_name(name, name_span)
.map_err(ParseError::Visitor)?;
self.markup_start = None;
let byte = buf[abs];
if byte == b'=' {
self.state = ParserState::BeforeAttrValue;
pos = next + 1;
} else {
self.state = ParserState::AfterAttrName;
pos = next;
}
}
// After an attribute name: skip whitespace, then require `=`.
ParserState::AfterAttrName => {
let Some((next, abs)) =
find_non_whitespace(masks.whitespace, pos, block_offset, block_len)
else {
pos = block_len;
continue;
};
let byte = buf[abs];
if byte == b'=' {
self.state = ParserState::BeforeAttrValue;
pos = next + 1;
} else {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
// After `=`: skip whitespace, then require an opening quote.
ParserState::BeforeAttrValue => {
let abs = block_offset + pos;
let byte = buf[abs];
if byte == b'"' {
self.state = ParserState::AttrValue { quote: QuoteStyle::Double };
self.content_start = Some(abs + 1);
pos += 1;
} else if byte == b'\'' {
self.state = ParserState::AttrValue { quote: QuoteStyle::Single };
self.content_start = Some(abs + 1);
pos += 1;
} else if is_xml_whitespace(byte) {
pos += 1;
} else {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
ParserState::AttrValue { quote } => {
let content_start = self.content_start.unwrap();
let delim_mask = match quote {
QuoteStyle::Double => masks.attr_dq_delim,
QuoteStyle::Single => masks.attr_sq_delim,
};
let delim_byte = match quote {
QuoteStyle::Double => b'"',
QuoteStyle::Single => b'\'',
};
// `find_name_end` is used here simply to locate the next byte flagged in
// the attribute-delimiter mask.
let Some((next, abs)) =
find_name_end(delim_mask, pos, block_offset, block_len)
else {
pos = block_len;
continue;
};
let byte = buf[abs];
if byte == delim_byte {
// Closing quote: report the remaining value (if any) and end the attribute.
if content_start < abs {
let value = &buf[content_start..abs];
let span = Span::new(
stream_offset + content_start as u64,
stream_offset + abs as u64,
);
visitor
.attribute_value(value, span)
.map_err(ParseError::Visitor)?;
}
let quote_span = Span::new(
stream_offset + abs as u64,
stream_offset + abs as u64 + 1,
);
visitor
.attribute_end(quote_span)
.map_err(ParseError::Visitor)?;
self.content_start = None;
self.state = ParserState::StartTagPostName;
pos = next + 1;
} else if byte == b'<' {
// `<` is rejected inside an attribute value.
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
} else {
// Any other delimiter starts a reference: report the value so far and
// switch to the attribute reference state.
if content_start < abs {
let value = &buf[content_start..abs];
let span = Span::new(
stream_offset + content_start as u64,
stream_offset + abs as u64,
);
visitor
.attribute_value(value, span)
.map_err(ParseError::Visitor)?;
}
self.markup_start = Some(abs);
self.content_start = None;
self.state = ParserState::AttrEntityRef {
name_start: abs + 1,
quote,
};
pos = next + 1;
}
}
ParserState::EndTagName { name_start } => {
let Some((next, abs)) =
find_name_end(masks.name_end, pos, block_offset, block_len)
else {
check_name_length(
block_offset + block_len,
name_start,
stream_offset,
)?;
pos = block_len;
continue;
};
let name = validate_name(buf, name_start, abs, stream_offset)?;
let name_span = Span::new(
stream_offset + name_start as u64,
stream_offset + abs as u64,
);
visitor
.end_tag(name, name_span)
.map_err(ParseError::Visitor)?;
let byte = buf[abs];
if byte == b'>' {
self.finish_markup();
pos = next + 1;
} else {
self.state = ParserState::EndTagPostName;
pos = next;
}
}
// After an end-tag name: skip whitespace, then require `>`.
ParserState::EndTagPostName => {
let Some((next, abs)) =
find_non_whitespace(masks.whitespace, pos, block_offset, block_len)
else {
pos = block_len;
continue;
};
let byte = buf[abs];
if byte == b'>' {
self.finish_markup();
pos = next + 1;
} else {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
// After `<!`: a comment (`-`), CDATA section (`[`) or DOCTYPE (`D`).
ParserState::AfterLtBang => {
let abs = block_offset + pos;
let byte = buf[abs];
match byte {
b'-' => {
self.state = ParserState::AfterLtBangDash;
pos += 1;
}
b'[' => {
self.state = ParserState::AfterLtBangBracket { matched: 0 };
pos += 1;
}
b'D' => {
self.state = ParserState::AfterLtBangD { matched: 0 };
pos += 1;
}
_ => {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
}
// After `<!-`: a second `-` opens the comment.
ParserState::AfterLtBangDash => {
let abs = block_offset + pos;
let byte = buf[abs];
if byte == b'-' {
// The start span covers the whole `<!--`.
let start_span = Span::new(
stream_offset + self.markup_start.unwrap() as u64,
stream_offset + abs as u64 + 1,
);
visitor
.comment_start(start_span)
.map_err(ParseError::Visitor)?;
self.markup_stream_offset = Some(stream_offset + self.markup_start.unwrap() as u64);
self.markup_start = None;
self.content_start = Some(abs + 1);
self.state = ParserState::CommentContent {
dash_count: 0,
};
pos += 1;
} else {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
ParserState::CommentContent { dash_count } => {
pos = self.scan_comment_content(
buf, block_offset, block_len, pos, masks,
stream_offset, dash_count, visitor,
)?;
}
// After `<![`: match the remaining `CDATA[` one byte per iteration;
// `matched` counts the bytes matched so far.
ParserState::AfterLtBangBracket { matched } => {
let abs = block_offset + pos;
let byte = buf[abs];
const CDATA_CHARS: &[u8] = b"CDATA[";
if byte == CDATA_CHARS[matched as usize] {
let new_matched = matched + 1;
if new_matched as usize == CDATA_CHARS.len() {
// The start span covers the whole `<![CDATA[`.
let start_span = Span::new(
stream_offset + self.markup_start.unwrap() as u64,
stream_offset + abs as u64 + 1,
);
visitor
.cdata_start(start_span)
.map_err(ParseError::Visitor)?;
self.markup_stream_offset = Some(stream_offset + self.markup_start.unwrap() as u64);
self.markup_start = None;
self.content_start = Some(abs + 1);
self.state = ParserState::CdataContent {
bracket_count: 0,
};
} else {
self.state = ParserState::AfterLtBangBracket { matched: new_matched };
}
pos += 1;
} else {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
ParserState::CdataContent { bracket_count } => {
pos = self.scan_cdata_content(
buf, block_offset, block_len, pos, masks,
stream_offset, bracket_count, visitor,
)?;
}
// After `<!D`: match the remaining `OCTYPE` one byte per iteration.
ParserState::AfterLtBangD { matched } => {
let abs = block_offset + pos;
let byte = buf[abs];
const DOCTYPE_CHARS: &[u8] = b"OCTYPE";
if byte == DOCTYPE_CHARS[matched as usize] {
let new_matched = matched + 1;
if new_matched as usize == DOCTYPE_CHARS.len() {
// `usize::MAX` is a sentinel understood by `scan_doctype_name`: the
// whitespace after the keyword has not been seen yet.
self.state = ParserState::DoctypeName { name_start: usize::MAX };
} else {
self.state = ParserState::AfterLtBangD { matched: new_matched };
}
pos += 1;
} else {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(byte),
offset: stream_offset + abs as u64,
}));
}
}
// The remaining states are delegated to dedicated scanners.
ParserState::DoctypeName { name_start } => {
pos = self.scan_doctype_name(
buf, block_offset, block_len, pos, masks,
stream_offset, name_start, visitor,
)?;
}
ParserState::DoctypeContent { depth, sub } => {
pos = self.scan_doctype_content(
buf, block_offset, block_len, pos,
stream_offset, depth, sub, visitor,
)?;
}
ParserState::PITarget { name_start } => {
pos = self.scan_pi_target(
buf, block_offset, block_len, pos, masks,
stream_offset, name_start, visitor,
)?;
}
ParserState::PIContent { saw_qmark } => {
pos = self.scan_pi_content(
buf, block_offset, block_len, pos, masks,
stream_offset, saw_qmark, visitor,
)?;
}
ParserState::EntityRef { name_start } => {
pos = self.scan_entity_ref(
buf, block_offset, block_len, pos, masks,
stream_offset, name_start, visitor,
)?;
}
ParserState::CharRef { value_start } => {
pos = self.scan_char_ref(
buf, block_offset, block_len, pos, masks,
stream_offset, value_start, visitor,
)?;
}
ParserState::AttrEntityRef { name_start, quote } => {
pos = self.scan_attr_entity_ref(
buf, block_offset, block_len, pos, masks,
stream_offset, name_start, quote, visitor,
)?;
}
ParserState::AttrCharRef { value_start, quote } => {
pos = self.scan_attr_char_ref(
buf, block_offset, block_len, pos, masks,
stream_offset, value_start, quote, visitor,
)?;
}
}
}
Ok(block_offset + pos)
}
/// Scans text content in the current block, starting at block-relative
/// position `pos`.
///
/// Text runs are tracked through `self.text_start` and reported with
/// `characters` when a `<` or `&` is reached. A `<` or `&` switches the
/// state to the matching markup state and returns the block-relative
/// position at which that state continues. Runs of `]` are followed so that
/// `]]>` in content is rejected with `CdataEndInContent`; a run that reaches
/// the end of the buffer is remembered in `content_bracket_count`.
///
/// Returns `block_len` when the rest of the block is plain text.
fn scan_content<V: Visitor>(
&mut self,
buf: &[u8],
block_offset: usize,
block_len: usize,
mut pos: usize,
masks: &CharClassMasks,
stream_offset: u64,
visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
if self.text_start.is_none() {
self.text_start = Some(block_offset + pos);
}
// Continue a run of `]` left pending by an earlier scan.
if self.content_bracket_count > 0 {
let abs = block_offset + pos;
if abs < buf.len() {
let mut scan = abs;
let mut brackets = self.content_bracket_count;
// The run is followed byte by byte and may cross block boundaries.
while scan < buf.len() {
let ch = buf[scan];
if ch == b']' {
brackets = brackets.saturating_add(1);
scan += 1;
} else if ch == b'>' && brackets >= 2 {
// The reported offset points at the first `]` of `]]>`.
return Err(ParseError::Xml(Error {
kind: ErrorKind::CdataEndInContent,
offset: stream_offset + scan as u64 - 2,
}));
} else {
self.content_bracket_count = 0;
break;
}
}
// The run reached the end of the buffer: remember at most two brackets.
if scan >= buf.len() {
self.content_bracket_count = brackets.min(2);
}
let consumed_in_block = scan - block_offset;
if consumed_in_block >= block_len {
return Ok(block_len);
}
pos = consumed_in_block;
}
}
loop {
if pos >= block_len {
return Ok(block_len);
}
// Locate the next content delimiter at or after `pos` via the bit mask.
let shifted = masks.content_delim >> pos;
if shifted == 0 {
return Ok(block_len);
}
let next = shifted.trailing_zeros() as usize;
if pos + next >= block_len {
return Ok(block_len);
}
let abs = block_offset + pos + next;
let byte = buf[abs];
match byte {
b'<' => {
self.content_bracket_count = 0;
// Report the text that precedes the markup.
if let Some(text_start) = self.text_start.take() {
if text_start < abs {
let span = Span::new(
stream_offset + text_start as u64,
stream_offset + abs as u64,
);
visitor
.characters(&buf[text_start..abs], span)
.map_err(ParseError::Visitor)?;
}
}
self.markup_start = Some(abs);
// Look one byte ahead to choose the markup state directly; the returned
// position may lie past the end of the block.
let after = abs + 1;
if after < buf.len() {
let b = buf[after];
match b {
b'/' => {
self.state = ParserState::EndTagName {
name_start: after + 1,
};
return Ok(pos + next + 2);
}
b'?' => {
self.state = ParserState::PITarget {
name_start: after + 1,
};
return Ok(pos + next + 2);
}
b'!' => {
self.state = ParserState::AfterLtBang;
return Ok(pos + next + 2);
}
_ if is_name_start_byte(b) => {
self.state =
ParserState::StartTagName { name_start: after };
return Ok(pos + next + 1);
}
_ => {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(b),
offset: stream_offset + after as u64,
}));
}
}
} else {
// `<` is the last byte of the buffer; decide once more input is available.
self.state = ParserState::AfterLt;
return Ok(pos + next + 1);
}
}
b'&' => {
self.content_bracket_count = 0;
// Report the text that precedes the reference.
if let Some(text_start) = self.text_start.take() {
if text_start < abs {
let span = Span::new(
stream_offset + text_start as u64,
stream_offset + abs as u64,
);
visitor
.characters(&buf[text_start..abs], span)
.map_err(ParseError::Visitor)?;
}
}
self.state = ParserState::EntityRef {
name_start: abs + 1,
};
self.markup_start = Some(abs);
return Ok(pos + next + 1);
}
b']' => {
// Follow the run of `]` to see whether it ends in `>`.
let mut scan = abs + 1;
let mut brackets: u8 = self.content_bracket_count + 1;
while scan < buf.len() {
let ch = buf[scan];
if ch == b']' {
brackets = brackets.saturating_add(1);
scan += 1;
} else if ch == b'>' && brackets >= 2 {
return Err(ParseError::Xml(Error {
kind: ErrorKind::CdataEndInContent,
offset: stream_offset + scan as u64 - 2,
}));
} else {
self.content_bracket_count = 0;
break;
}
}
// The run reached the end of the buffer: remember at most two brackets.
if scan >= buf.len() {
self.content_bracket_count = brackets.min(2);
}
let consumed_in_block = scan - block_offset;
if consumed_in_block >= block_len {
return Ok(block_len);
}
pos = consumed_in_block;
}
// NOTE(review): relies on `content_delim` flagging only `<`, `&` and `]`;
// the mask is built outside this view - confirm.
_ => unreachable!(),
}
}
}
/// Scans the body of a comment for its `-->` terminator.
///
/// `dash_count` is the number of consecutive `-` bytes seen immediately
/// before `pos`. When `-->` is found the body (if non-empty) is reported via
/// `comment_content`, `comment_end` is reported for the terminator, and the
/// position after `>` is returned. `--` followed by anything other than `>`
/// is a `DoubleDashInComment` error. When the block ends first, the dash
/// count is stored back into the state and `block_len` is returned.
fn scan_comment_content<V: Visitor>(
&mut self,
buf: &[u8],
block_offset: usize,
block_len: usize,
mut pos: usize,
masks: &CharClassMasks,
stream_offset: u64,
mut dash_count: u8,
visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
let content_start = self.content_start.unwrap();
loop {
// Byte-by-byte phase: runs while a run of dashes is being followed.
while dash_count > 0 && pos < block_len {
let abs = block_offset + pos;
let byte = buf[abs];
if byte == b'>' && dash_count >= 2 {
// `-->`: the body ends before the dashes.
let content_end = abs - dash_count as usize;
if content_end > content_start {
let span = Span::new(
stream_offset + content_start as u64,
stream_offset + content_end as u64,
);
visitor
.comment_content(&buf[content_start..content_end], span)
.map_err(ParseError::Visitor)?;
}
let end_span = Span::new(
stream_offset + content_end as u64,
stream_offset + abs as u64 + 1,
);
visitor
.comment_end(end_span)
.map_err(ParseError::Visitor)?;
self.finish_content_body();
return Ok(pos + 1);
} else if dash_count >= 2 {
// `--` not followed by `>`; the offset points at the first dash.
return Err(ParseError::Xml(Error {
kind: ErrorKind::DoubleDashInComment,
offset: stream_offset + abs as u64 - 2,
}));
} else if byte == b'-' {
dash_count += 1;
pos += 1;
} else {
dash_count = 0;
pos += 1;
break;
}
}
if pos >= block_len {
self.state = ParserState::CommentContent { dash_count };
return Ok(block_len);
}
if dash_count > 0 {
self.state = ParserState::CommentContent { dash_count };
return Ok(block_len);
}
// Mask phase: jump straight to the next `-` in the block.
let shifted = masks.dash >> pos;
if shifted == 0 {
self.state = ParserState::CommentContent { dash_count: 0 };
return Ok(block_len);
}
let next = shifted.trailing_zeros() as usize;
if pos + next >= block_len {
self.state = ParserState::CommentContent { dash_count: 0 };
return Ok(block_len);
}
// Step past the dash that was found and count it.
pos = pos + next;
dash_count = 1;
pos += 1;
}
}
/// Scans the body of a CDATA section for its `]]>` terminator.
///
/// `bracket_count` is the number of consecutive `]` bytes seen immediately
/// before `pos`, capped at 2. When `]]>` is found the body (if non-empty) is
/// reported via `cdata_content`, `cdata_end` is reported for the terminator,
/// and the position after `>` is returned. When the block ends first, the
/// bracket count is stored back into the state and `block_len` is returned.
fn scan_cdata_content<V: Visitor>(
&mut self,
buf: &[u8],
block_offset: usize,
block_len: usize,
mut pos: usize,
masks: &CharClassMasks,
stream_offset: u64,
mut bracket_count: u8,
visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
let content_start = self.content_start.unwrap();
loop {
// Byte-by-byte phase: runs while a run of `]` is being followed.
while bracket_count > 0 && pos < block_len {
let abs = block_offset + pos;
let byte = buf[abs];
if byte == b']' {
// Capped at 2, so in a longer run only the last two brackets belong to
// the terminator and the earlier ones stay part of the body.
bracket_count = (bracket_count + 1).min(2);
pos += 1;
} else if byte == b'>' && bracket_count >= 2 {
// `]]>`: the body ends before the two brackets.
let content_end = abs - bracket_count as usize;
if content_end > content_start {
let span = Span::new(
stream_offset + content_start as u64,
stream_offset + content_end as u64,
);
visitor
.cdata_content(&buf[content_start..content_end], span)
.map_err(ParseError::Visitor)?;
}
let end_span = Span::new(
stream_offset + content_end as u64,
stream_offset + abs as u64 + 1,
);
visitor
.cdata_end(end_span)
.map_err(ParseError::Visitor)?;
self.finish_content_body();
return Ok(pos + 1);
} else {
bracket_count = 0;
pos += 1;
break;
}
}
if pos >= block_len {
self.state = ParserState::CdataContent { bracket_count };
return Ok(block_len);
}
if bracket_count > 0 {
self.state = ParserState::CdataContent { bracket_count };
return Ok(block_len);
}
// Mask phase: jump straight to the next `]` in the block.
let shifted = masks.rbracket >> pos;
if shifted == 0 {
self.state = ParserState::CdataContent { bracket_count: 0 };
return Ok(block_len);
}
let next = shifted.trailing_zeros() as usize;
if pos + next >= block_len {
self.state = ParserState::CdataContent { bracket_count: 0 };
return Ok(block_len);
}
// Step past the bracket that was found and count it.
pos = pos + next;
bracket_count = 1;
pos += 1;
}
}
/// Scans the part of a DOCTYPE declaration between the `DOCTYPE` keyword and
/// the end of its name.
///
/// `name_start` doubles as a phase marker:
/// * `usize::MAX` - the byte right after the keyword is expected; it must be
///   whitespace (`DoctypeMissingWhitespace` otherwise).
/// * `usize::MAX - 1` - whitespace is being skipped; the first other byte
///   must be a name-start byte (`DoctypeMissingName` otherwise).
/// * any other value - buffer position where the name starts; the end of
///   the name is being searched for.
///
/// Returns the block-relative position at which processing continues, or
/// `block_len` when the block was exhausted.
fn scan_doctype_name<V: Visitor>(
&mut self,
buf: &[u8],
block_offset: usize,
block_len: usize,
pos: usize,
masks: &CharClassMasks,
stream_offset: u64,
name_start: usize,
visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
// Sentinel phases (see above).
if name_start >= usize::MAX - 1 {
if name_start == usize::MAX {
let abs = block_offset + pos;
if abs >= buf.len() {
return Ok(block_len);
}
let byte = buf[abs];
if !is_xml_whitespace(byte) {
return Err(ParseError::Xml(Error {
kind: ErrorKind::DoctypeMissingWhitespace,
offset: stream_offset + abs as u64,
}));
}
self.state = ParserState::DoctypeName { name_start: usize::MAX - 1 };
return Ok(pos + 1);
}
// Skip whitespace: first byte at or after `pos` that is not flagged in the
// whitespace mask.
let non_ws = !masks.whitespace >> pos;
if non_ws == 0 {
return Ok(block_len);
}
let next = non_ws.trailing_zeros() as usize;
if pos + next >= block_len {
return Ok(block_len);
}
let new_abs = block_offset + pos + next;
let byte = buf[new_abs];
if byte == b'>' || !is_name_start_byte(byte) {
return Err(ParseError::Xml(Error {
kind: ErrorKind::DoctypeMissingName,
offset: stream_offset + new_abs as u64,
}));
}
// Record the real start of the name, then try to finish the name within
// this same block.
self.state = ParserState::DoctypeName { name_start: new_abs };
let shifted2 = masks.name_end >> (pos + next);
if shifted2 == 0 {
return Ok(block_len);
}
let next2 = shifted2.trailing_zeros() as usize;
if pos + next + next2 >= block_len {
return Ok(block_len);
}
let end_abs = block_offset + pos + next + next2;
return self.finish_doctype_name(buf, pos + next + next2, end_abs, new_abs, stream_offset, visitor);
}
// The name started earlier; look for its end in this block.
let Some((next, end_abs)) =
find_name_end(masks.name_end, pos, block_offset, block_len)
else {
check_name_length(block_offset + block_len, name_start, stream_offset)?;
return Ok(block_len);
};
self.finish_doctype_name(buf, next, end_abs, name_start, stream_offset, visitor)
}
/// Completes a DOCTYPE name that ends at `end_abs` and decides what follows.
///
/// Reports `buf[name_start..end_abs]` through `Visitor::doctype_start`, then
/// inspects the delimiter byte at `end_abs`:
///
/// * `>` closes the declaration immediately (`Visitor::doctype_end` is called
///   and the reader returns to content state);
/// * `[` opens the internal subset directly after the name (`<!DOCTYPE a[`):
///   the bracket is kept as the first content byte and nesting starts at
///   depth 1, so a `>` inside the subset cannot end the declaration early;
/// * any other byte is skipped and DOCTYPE content starts on the next byte.
///
/// Returns the block-relative position just past the delimiter.
///
/// # Errors
///
/// `ErrorKind::NameTooLong` when the name exceeds `MAX_NAME_LENGTH`; visitor
/// errors are wrapped in `ParseError::Visitor`.
fn finish_doctype_name<V: Visitor>(
    &mut self,
    buf: &[u8],
    block_rel_pos: usize,
    end_abs: usize,
    name_start: usize,
    stream_offset: u64,
    visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
    let name = &buf[name_start..end_abs];
    if name.len() > MAX_NAME_LENGTH {
        return Err(ParseError::Xml(Error {
            kind: ErrorKind::NameTooLong,
            offset: stream_offset + name_start as u64,
        }));
    }
    let name_span = Span::new(
        stream_offset + name_start as u64,
        stream_offset + end_abs as u64,
    );
    visitor
        .doctype_start(name, name_span)
        .map_err(ParseError::Visitor)?;

    let delimiter = buf[end_abs];
    if delimiter == b'>' {
        let end_span = Span::new(
            stream_offset + end_abs as u64,
            stream_offset + end_abs as u64 + 1,
        );
        visitor
            .doctype_end(end_span)
            .map_err(ParseError::Visitor)?;
        self.finish_markup();
        return Ok(block_rel_pos + 1);
    }

    // The delimiter is consumed here, so an opening bracket has to be
    // accounted for now: it will never be seen by the content scanner.
    let opens_subset = delimiter == b'[';
    self.markup_stream_offset = Some(stream_offset + self.markup_start.unwrap() as u64);
    self.markup_start = None;
    self.content_start = Some(if opens_subset { end_abs } else { end_abs + 1 });
    self.state = ParserState::DoctypeContent {
        depth: u32::from(opens_subset),
        sub: DoctypeSubState::Normal,
    };
    Ok(block_rel_pos + 1)
}
/// Scans DOCTYPE content one byte at a time, looking for the closing `>`.
///
/// `depth` and `sub` are the bracket depth and sub-state carried in
/// `ParserState::DoctypeContent`; they are written back to `self.state` when
/// the block is exhausted without finding the end.
///
/// When a `>` is seen in the `Normal` sub-state at depth 0, the bytes from
/// `self.content_start` up to (not including) that `>` are reported through
/// `Visitor::doctype_content` (only if non-empty), followed by
/// `Visitor::doctype_end`, and the block-relative position after the `>` is
/// returned. Otherwise `block_len` is returned.
///
/// # Errors
///
/// `ErrorKind::DoctypeBracketsTooDeep` when more than 1024 `[` are nested;
/// visitor errors are wrapped in `ParseError::Visitor`.
///
/// # Panics
///
/// Panics if `self.content_start` is `None`.
fn scan_doctype_content<V: Visitor>(
&mut self,
buf: &[u8],
block_offset: usize,
block_len: usize,
mut pos: usize,
stream_offset: u64,
mut depth: u32,
mut sub: DoctypeSubState,
visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
let content_start = self.content_start.unwrap();
while pos < block_len {
let abs = block_offset + pos;
let byte = buf[abs];
match sub {
// Outside of comments, PIs and quoted literals.
DoctypeSubState::Normal => match byte {
b'[' => {
depth += 1;
if depth > 1024 {
return Err(ParseError::Xml(Error {
kind: ErrorKind::DoctypeBracketsTooDeep,
offset: stream_offset + abs as u64,
}));
}
}
b']' => {
// Saturating: an unmatched `]` leaves the depth at 0.
depth = depth.saturating_sub(1);
}
b'>' => {
// Only a `>` outside all brackets ends the declaration.
if depth == 0 {
if abs > content_start {
let span = Span::new(
stream_offset + content_start as u64,
stream_offset + abs as u64,
);
visitor
.doctype_content(&buf[content_start..abs], span)
.map_err(ParseError::Visitor)?;
}
let end_span = Span::new(
stream_offset + abs as u64,
stream_offset + abs as u64 + 1,
);
visitor
.doctype_end(end_span)
.map_err(ParseError::Visitor)?;
self.finish_content_body();
return Ok(pos + 1);
}
}
b'<' => sub = DoctypeSubState::AfterLt,
b'"' => sub = DoctypeSubState::DoubleQuoted,
b'\'' => sub = DoctypeSubState::SingleQuoted,
_ => {}
},
// The next three states recognise `<?` and `<!--`. On a mismatch the
// `continue` skips `pos += 1`, so the same byte is re-examined in `Normal`.
DoctypeSubState::AfterLt => match byte {
b'!' => sub = DoctypeSubState::AfterLtBang,
b'?' => sub = DoctypeSubState::PI { saw_qmark: false },
_ => { sub = DoctypeSubState::Normal; continue; }
},
DoctypeSubState::AfterLtBang => match byte {
b'-' => sub = DoctypeSubState::AfterLtBangDash,
_ => { sub = DoctypeSubState::Normal; continue; }
},
DoctypeSubState::AfterLtBangDash => match byte {
b'-' => sub = DoctypeSubState::Comment { dash_count: 0 },
_ => { sub = DoctypeSubState::Normal; continue; }
},
// Inside a comment: it ends at a `>` preceded by at least two `-`.
DoctypeSubState::Comment { ref mut dash_count } => match byte {
b'-' => *dash_count = dash_count.saturating_add(1),
b'>' if *dash_count >= 2 => sub = DoctypeSubState::Normal,
_ => *dash_count = 0,
},
// Inside a processing instruction: it ends at `?>`.
DoctypeSubState::PI { ref mut saw_qmark } => match byte {
b'?' => *saw_qmark = true,
b'>' if *saw_qmark => sub = DoctypeSubState::Normal,
_ => *saw_qmark = false,
},
// Inside a quoted literal: everything up to the matching quote is ignored.
DoctypeSubState::DoubleQuoted => {
if byte == b'"' { sub = DoctypeSubState::Normal; }
}
DoctypeSubState::SingleQuoted => {
if byte == b'\'' { sub = DoctypeSubState::Normal; }
}
}
pos += 1;
}
// Block exhausted: remember where we are for the next call.
self.state = ParserState::DoctypeContent { depth, sub };
Ok(block_len)
}
/// Maximum number of XML-declaration content bytes `emit_pi_content` will
/// accumulate in `xml_decl_buf` before reporting a malformed declaration.
const XML_DECL_BUF_LIMIT: usize = 256;
/// Routes one chunk of processing-instruction content.
///
/// Outside an XML declaration the chunk goes straight to
/// `Visitor::pi_content`. While `in_xml_decl` is set, the bytes are appended
/// to `xml_decl_buf` instead and the visitor is not called.
///
/// # Errors
///
/// `ErrorKind::MalformedXmlDeclaration` at `span.start` when the buffered
/// declaration would exceed `XML_DECL_BUF_LIMIT`; visitor errors are wrapped
/// in `ParseError::Visitor`.
fn emit_pi_content<V: Visitor>(
    &mut self,
    content: &[u8],
    span: Span,
    visitor: &mut V,
) -> Result<(), ParseError<V::Error>> {
    if !self.in_xml_decl {
        return visitor.pi_content(content, span).map_err(ParseError::Visitor);
    }

    let filled = self.xml_decl_buf_len;
    let grown = filled + content.len();
    if grown > Self::XML_DECL_BUF_LIMIT {
        return Err(ParseError::Xml(Error {
            kind: ErrorKind::MalformedXmlDeclaration,
            offset: span.start,
        }));
    }
    self.xml_decl_buf[filled..grown].copy_from_slice(content);
    self.xml_decl_buf_len = grown;
    Ok(())
}
/// Reports the end of a processing instruction.
///
/// For an ordinary PI this forwards `end_span` to `Visitor::pi_end`. When an
/// XML declaration was being collected, the buffered bytes are parsed with
/// `parse_xml_decl` and reported through `Visitor::xml_declaration`, using a
/// span from `xml_decl_span_start` to `end_span.end`. The declaration flag and
/// buffer length are cleared before parsing, so they are reset even on error.
///
/// # Errors
///
/// Whatever `parse_xml_decl` reports; visitor errors are wrapped in
/// `ParseError::Visitor`.
fn emit_pi_end<V: Visitor>(
    &mut self,
    end_span: Span,
    visitor: &mut V,
) -> Result<(), ParseError<V::Error>> {
    if !self.in_xml_decl {
        return visitor.pi_end(end_span).map_err(ParseError::Visitor);
    }

    self.in_xml_decl = false;
    let decl_start = self.xml_decl_span_start;
    let decl_span = Span::new(decl_start, end_span.end);
    let buffered = self.xml_decl_buf_len;
    self.xml_decl_buf_len = 0;

    let (version, encoding, standalone) =
        parse_xml_decl(&self.xml_decl_buf[..buffered], decl_start)?;
    visitor
        .xml_declaration(version, encoding, standalone, decl_span)
        .map_err(ParseError::Visitor)
}
/// Scans for the end of a processing-instruction target that began at
/// `name_start`.
///
/// If the target does not end inside this block, only the running length is
/// checked and `block_len` is returned. Otherwise:
///
/// * a target equal to `xml` (ASCII case-insensitive) starts XML-declaration
///   collection, unless markup has already been seen, which is an error;
/// * any other target is reported through `Visitor::pi_start`;
/// * if the target is directly followed by `?>` the PI is closed here,
///   otherwise the reader switches to `ParserState::PIContent` with content
///   starting one byte after the delimiter.
///
/// Returns the block-relative position at which scanning should resume.
///
/// # Errors
///
/// Errors from `check_name_length` / `validate_name`,
/// `ErrorKind::ReservedPITarget`, errors from `emit_pi_end`, and visitor
/// errors wrapped in `ParseError::Visitor`.
fn scan_pi_target<V: Visitor>(
    &mut self,
    buf: &[u8],
    block_offset: usize,
    block_len: usize,
    pos: usize,
    masks: &CharClassMasks,
    stream_offset: u64,
    name_start: usize,
    visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
    let Some((next, abs)) = find_name_end(masks.name_end, pos, block_offset, block_len) else {
        check_name_length(block_offset + block_len, name_start, stream_offset)?;
        return Ok(block_len);
    };

    let target = validate_name(buf, name_start, abs, stream_offset)?;
    let target_offset = stream_offset + name_start as u64;
    let target_span = Span::new(target_offset, stream_offset + abs as u64);

    if target.eq_ignore_ascii_case(b"xml") {
        // The `xml` target is only allowed before any other markup.
        if self.had_markup {
            return Err(ParseError::Xml(Error {
                kind: ErrorKind::ReservedPITarget,
                offset: target_offset,
            }));
        }
        self.in_xml_decl = true;
        self.xml_decl_buf_len = 0;
        self.xml_decl_span_start = stream_offset + self.markup_start.unwrap() as u64;
    } else {
        visitor
            .pi_start(target, target_span)
            .map_err(ParseError::Visitor)?;
    }

    let saw_qmark = buf[abs] == b'?';
    let gt_pos = abs + 1;
    if saw_qmark && buf.get(gt_pos) == Some(&b'>') {
        // `<?target?>`: no content at all.
        let end_span = Span::new(
            stream_offset + abs as u64,
            stream_offset + gt_pos as u64 + 1,
        );
        self.emit_pi_end(end_span, visitor)?;
        self.finish_markup();
        return Ok(next + 2);
    }

    // Content follows; a trailing `?` is remembered so that a `>` at the
    // start of the next scan still closes the PI.
    self.markup_stream_offset = Some(stream_offset + self.markup_start.unwrap() as u64);
    self.markup_start = None;
    self.content_start = Some(abs + 1);
    self.state = ParserState::PIContent { saw_qmark };
    Ok(next + 1)
}
/// Scans processing-instruction content for the closing `?>`.
///
/// `saw_qmark` says whether the byte just before `pos` was a `?`. When the
/// terminator is found, the bytes from `self.content_start` up to the `?` are
/// passed to `emit_pi_content` (only if non-empty), `emit_pi_end` is called
/// with a span covering `?>`, and the block-relative position after `>` is
/// returned. When the block (or `buf`) runs out first, the current
/// `saw_qmark` is stored in `ParserState::PIContent` and the position reached
/// is returned.
///
/// # Errors
///
/// Errors from `emit_pi_content` / `emit_pi_end`.
///
/// # Panics
///
/// Panics if `self.content_start` is `None`.
fn scan_pi_content<V: Visitor>(
    &mut self,
    buf: &[u8],
    block_offset: usize,
    block_len: usize,
    mut pos: usize,
    masks: &CharClassMasks,
    stream_offset: u64,
    mut saw_qmark: bool,
    visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
    let content_start = self.content_start.unwrap();

    // The loop yields the absolute index of the closing `?` and the
    // block-relative position just past the `>`.
    let (qmark_abs, resume_at) = loop {
        if pos >= block_len {
            self.state = ParserState::PIContent { saw_qmark };
            return Ok(block_len);
        }

        if saw_qmark {
            // The previous byte was `?`; only the current byte matters.
            let abs = block_offset + pos;
            let byte = buf[abs];
            if byte == b'>' {
                break (abs - 1, pos + 1);
            }
            saw_qmark = byte == b'?';
            pos += 1;
            continue;
        }

        // Jump to the next `?` in this block using the bitmask.
        let pending = masks.qmark >> pos;
        let hit = pos + pending.trailing_zeros() as usize;
        if pending == 0 || hit >= block_len {
            self.state = ParserState::PIContent { saw_qmark: false };
            return Ok(block_len);
        }

        let candidate = block_offset + hit;
        match buf.get(candidate + 1) {
            Some(b'>') => break (candidate, hit + 2),
            Some(_) => pos = hit + 1,
            None => {
                // `?` is the last byte of `buf`; decide on the next call.
                self.state = ParserState::PIContent { saw_qmark: true };
                return Ok(hit + 1);
            }
        }
    };

    if qmark_abs > content_start {
        let span = Span::new(
            stream_offset + content_start as u64,
            stream_offset + qmark_abs as u64,
        );
        self.emit_pi_content(&buf[content_start..qmark_abs], span, visitor)?;
    }
    let end_span = Span::new(
        stream_offset + qmark_abs as u64,
        stream_offset + qmark_abs as u64 + 2,
    );
    self.emit_pi_end(end_span, visitor)?;
    self.finish_content_body();
    Ok(resume_at)
}
/// Scans an entity reference in content whose name starts at `name_start`.
///
/// A `#` as the very first byte switches to `ParserState::CharRef`. Otherwise
/// the name up to the next `;` is validated and reported through
/// `Visitor::entity_ref`. Returns `block_len` when the `;` is not in this
/// block, else the block-relative position after it.
///
/// # Errors
///
/// Errors from `find_and_validate_entity_name`; visitor errors are wrapped in
/// `ParseError::Visitor`.
fn scan_entity_ref<V: Visitor>(
    &mut self,
    buf: &[u8],
    block_offset: usize,
    block_len: usize,
    pos: usize,
    masks: &CharClassMasks,
    stream_offset: u64,
    name_start: usize,
    visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
    let cursor = block_offset + pos;
    let starts_char_ref = cursor == name_start && buf.get(cursor) == Some(&b'#');
    if starts_char_ref {
        self.state = ParserState::CharRef {
            value_start: cursor + 1,
        };
        return Ok(pos + 1);
    }

    let found = find_and_validate_entity_name(
        buf,
        block_offset,
        block_len,
        pos,
        masks.semicolon,
        stream_offset,
        name_start,
    )?;
    match found {
        Some((name, span, next_pos)) => {
            visitor
                .entity_ref(name, span)
                .map_err(ParseError::Visitor)?;
            self.finish_markup();
            Ok(next_pos)
        }
        None => Ok(block_len),
    }
}
/// Scans a character reference in content whose digits start at
/// `value_start`.
///
/// Once the terminating `;` is found, the validated value is reported through
/// `Visitor::char_ref` and the reader returns to content state. Returns
/// `block_len` when the `;` is not in this block, else the block-relative
/// position after it.
///
/// # Errors
///
/// Errors from `find_and_validate_char_ref`; visitor errors are wrapped in
/// `ParseError::Visitor`.
fn scan_char_ref<V: Visitor>(
    &mut self,
    buf: &[u8],
    block_offset: usize,
    block_len: usize,
    pos: usize,
    masks: &CharClassMasks,
    stream_offset: u64,
    value_start: usize,
    visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
    let found = find_and_validate_char_ref(
        buf,
        block_offset,
        block_len,
        pos,
        masks.semicolon,
        stream_offset,
        value_start,
    )?;
    match found {
        Some((value, span, next_pos)) => {
            visitor
                .char_ref(value, span)
                .map_err(ParseError::Visitor)?;
            self.finish_markup();
            Ok(next_pos)
        }
        None => Ok(block_len),
    }
}
/// Scans an entity reference inside an attribute value quoted with `quote`.
///
/// A `#` as the very first byte switches to `ParserState::AttrCharRef`.
/// Otherwise the name up to the next `;` is validated and reported through
/// `Visitor::attribute_entity_ref`, after which the reader resumes
/// `ParserState::AttrValue` with content starting right after the `;`.
/// Returns `block_len` when the `;` is not in this block.
///
/// # Errors
///
/// Errors from `find_and_validate_entity_name`; visitor errors are wrapped in
/// `ParseError::Visitor`.
fn scan_attr_entity_ref<V: Visitor>(
    &mut self,
    buf: &[u8],
    block_offset: usize,
    block_len: usize,
    pos: usize,
    masks: &CharClassMasks,
    stream_offset: u64,
    name_start: usize,
    quote: QuoteStyle,
    visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
    let cursor = block_offset + pos;
    let starts_char_ref = cursor == name_start && buf.get(cursor) == Some(&b'#');
    if starts_char_ref {
        self.state = ParserState::AttrCharRef {
            value_start: cursor + 1,
            quote,
        };
        return Ok(pos + 1);
    }

    let found = find_and_validate_entity_name(
        buf,
        block_offset,
        block_len,
        pos,
        masks.semicolon,
        stream_offset,
        name_start,
    )?;
    match found {
        Some((name, span, next_pos)) => {
            visitor
                .attribute_entity_ref(name, span)
                .map_err(ParseError::Visitor)?;
            // Back to the surrounding attribute value.
            self.markup_start = None;
            self.content_start = Some(block_offset + next_pos);
            self.state = ParserState::AttrValue { quote };
            Ok(next_pos)
        }
        None => Ok(block_len),
    }
}
/// Scans a character reference inside an attribute value quoted with `quote`.
///
/// Once the terminating `;` is found, the validated value is reported through
/// `Visitor::attribute_char_ref` and the reader resumes
/// `ParserState::AttrValue` with content starting right after the `;`.
/// Returns `block_len` when the `;` is not in this block.
///
/// # Errors
///
/// Errors from `find_and_validate_char_ref`; visitor errors are wrapped in
/// `ParseError::Visitor`.
fn scan_attr_char_ref<V: Visitor>(
    &mut self,
    buf: &[u8],
    block_offset: usize,
    block_len: usize,
    pos: usize,
    masks: &CharClassMasks,
    stream_offset: u64,
    value_start: usize,
    quote: QuoteStyle,
    visitor: &mut V,
) -> Result<usize, ParseError<V::Error>> {
    let found = find_and_validate_char_ref(
        buf,
        block_offset,
        block_len,
        pos,
        masks.semicolon,
        stream_offset,
        value_start,
    )?;
    match found {
        Some((value, span, next_pos)) => {
            visitor
                .attribute_char_ref(value, span)
                .map_err(ParseError::Visitor)?;
            // Back to the surrounding attribute value.
            self.markup_start = None;
            self.content_start = Some(block_offset + next_pos);
            self.state = ParserState::AttrValue { quote };
            Ok(next_pos)
        }
        None => Ok(block_len),
    }
}
}
/// Finds the first set bit of `name_end_mask` at or after bit `pos` that lies
/// inside the block.
///
/// Returns `(block_relative_index, block_offset + block_relative_index)` for
/// that bit, or `None` when no set bit exists below `block_len`.
///
/// A `pos` of 64 or more is past the end of the 64-bit mask and yields `None`
/// (a plain `>>` would overflow the shift and panic in debug builds).
#[inline(always)]
fn find_name_end(
    name_end_mask: u64,
    pos: usize,
    block_offset: usize,
    block_len: usize,
) -> Option<(usize, usize)> {
    let shift = u32::try_from(pos).ok()?;
    let remaining = name_end_mask.checked_shr(shift)?;
    if remaining == 0 {
        return None;
    }
    let rel = pos + remaining.trailing_zeros() as usize;
    (rel < block_len).then_some((rel, block_offset + rel))
}
#[inline]
fn check_name_length<E>(
block_end: usize,
name_start: usize,
stream_offset: u64,
) -> Result<(), ParseError<E>> {
if block_end - name_start > MAX_NAME_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::NameTooLong,
offset: stream_offset + name_start as u64,
}));
}
Ok(())
}
#[inline]
fn validate_name<'a, E>(
buf: &'a [u8],
name_start: usize,
name_end: usize,
stream_offset: u64,
) -> Result<&'a [u8], ParseError<E>> {
if name_start == name_end {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(buf[name_end]),
offset: stream_offset + name_end as u64,
}));
}
if name_end - name_start > MAX_NAME_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::NameTooLong,
offset: stream_offset + name_start as u64,
}));
}
Ok(&buf[name_start..name_end])
}
#[inline]
fn find_and_validate_entity_name<'a, E>(
buf: &'a [u8],
block_offset: usize,
block_len: usize,
pos: usize,
semicolon_mask: u64,
stream_offset: u64,
name_start: usize,
) -> Result<Option<(&'a [u8], Span, usize)>, ParseError<E>> {
let shifted = semicolon_mask >> pos;
if shifted == 0 {
if block_offset + block_len - name_start > MAX_NAME_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::NameTooLong,
offset: stream_offset + name_start as u64,
}));
}
return Ok(None);
}
let next = shifted.trailing_zeros() as usize;
if pos + next >= block_len {
if block_offset + block_len - name_start > MAX_NAME_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::NameTooLong,
offset: stream_offset + name_start as u64,
}));
}
return Ok(None);
}
let semi_abs = block_offset + pos + next;
let name = &buf[name_start..semi_abs];
if name.is_empty() {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(b';'),
offset: stream_offset + semi_abs as u64,
}));
}
if name.len() > MAX_NAME_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::NameTooLong,
offset: stream_offset + name_start as u64,
}));
}
if !is_name_start_byte(name[0]) {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(name[0]),
offset: stream_offset + name_start as u64,
}));
}
for (i, &b) in name[1..].iter().enumerate() {
if !is_name_byte(b) {
return Err(ParseError::Xml(Error {
kind: ErrorKind::UnexpectedByte(b),
offset: stream_offset + name_start as u64 + 1 + i as u64,
}));
}
}
let span = Span::new(
stream_offset + name_start as u64,
stream_offset + semi_abs as u64,
);
Ok(Some((name, span, pos + next + 1)))
}
#[inline]
fn find_and_validate_char_ref<'a, E>(
buf: &'a [u8],
block_offset: usize,
block_len: usize,
pos: usize,
semicolon_mask: u64,
stream_offset: u64,
value_start: usize,
) -> Result<Option<(&'a [u8], Span, usize)>, ParseError<E>> {
let shifted = semicolon_mask >> pos;
if shifted == 0 {
if block_offset + block_len - value_start > MAX_CHAR_REF_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::CharRefTooLong,
offset: stream_offset + value_start as u64,
}));
}
return Ok(None);
}
let next = shifted.trailing_zeros() as usize;
if pos + next >= block_len {
if block_offset + block_len - value_start > MAX_CHAR_REF_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::CharRefTooLong,
offset: stream_offset + value_start as u64,
}));
}
return Ok(None);
}
let semi_abs = block_offset + pos + next;
let value = &buf[value_start..semi_abs];
if value.is_empty() {
return Err(ParseError::Xml(Error {
kind: ErrorKind::InvalidCharRef,
offset: stream_offset + semi_abs as u64,
}));
}
if value.len() > MAX_CHAR_REF_LENGTH {
return Err(ParseError::Xml(Error {
kind: ErrorKind::CharRefTooLong,
offset: stream_offset + value_start as u64,
}));
}
if value[0] == b'x' {
let hex_digits = &value[1..];
if hex_digits.is_empty() || !hex_digits.iter().all(|b| b.is_ascii_hexdigit()) {
return Err(ParseError::Xml(Error {
kind: ErrorKind::InvalidCharRef,
offset: stream_offset + value_start as u64,
}));
}
} else if !value.iter().all(|b| b.is_ascii_digit()) {
return Err(ParseError::Xml(Error {
kind: ErrorKind::InvalidCharRef,
offset: stream_offset + value_start as u64,
}));
}
let span = Span::new(
stream_offset + value_start as u64,
stream_offset + semi_abs as u64,
);
Ok(Some((value, span, pos + next + 1)))
}
/// Finds the first byte at or after `pos` whose bit is clear in
/// `whitespace_mask`.
///
/// Delegates to `find_name_end` on the inverted mask, so the result has the
/// same shape: `(block_relative_index, absolute_index)`, or `None` when no
/// such bit exists below `block_len`.
#[inline(always)]
fn find_non_whitespace(
whitespace_mask: u64,
pos: usize,
block_offset: usize,
block_len: usize,
) -> Option<(usize, usize)> {
find_name_end(!whitespace_mask, pos, block_offset, block_len)
}
/// Returns `true` for bytes accepted as the first byte of a name: ASCII
/// letters, `_`, `:`, and every byte `>= 0x80`.
#[inline]
fn is_name_start_byte(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'_' | b':' | 0x80..=0xFF)
}
/// Returns `true` for bytes accepted after the first byte of a name:
/// everything `is_name_start_byte` accepts, plus ASCII digits, `-` and `.`.
#[inline]
fn is_name_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'-' | b'.') || is_name_start_byte(b)
}
/// Parses the pseudo-attributes of an XML declaration.
///
/// `buf` holds the declaration content to parse; `offset` is the stream
/// offset attached to any error. The accepted shape is a mandatory `version`,
/// then an optional `encoding`, then an optional `standalone` (`yes` / `no`),
/// in that order, with optional trailing whitespace. Each pseudo-attribute
/// after `version` must be separated from the previous one by whitespace, as
/// the XMLDecl grammar requires.
///
/// Returns `(version, encoding, standalone)`; the byte slices borrow from
/// `buf`.
///
/// # Errors
///
/// `ErrorKind::MalformedXmlDeclaration` at `offset` for any deviation from
/// the shape above.
fn parse_xml_decl<E>(
    buf: &[u8],
    offset: u64,
) -> Result<(&[u8], Option<&[u8]>, Option<bool>), ParseError<E>> {
    /// Advances `pos` past whitespace; returns whether anything was skipped.
    fn skip_whitespace(buf: &[u8], pos: &mut usize) -> bool {
        let before = *pos;
        while *pos < buf.len() && is_xml_whitespace(buf[*pos]) {
            *pos += 1;
        }
        *pos > before
    }

    let err = || {
        ParseError::Xml(Error {
            kind: ErrorKind::MalformedXmlDeclaration,
            offset,
        })
    };

    let mut pos = 0;
    skip_whitespace(buf, &mut pos);
    let version = parse_pseudo_attr(buf, &mut pos, b"version").ok_or_else(err)?;
    // Tracks whether the next pseudo-attribute is properly separated.
    let mut separated = skip_whitespace(buf, &mut pos);

    let mut encoding = None;
    if buf[pos..].starts_with(b"encoding") {
        if !separated {
            return Err(err());
        }
        encoding = Some(parse_pseudo_attr(buf, &mut pos, b"encoding").ok_or_else(err)?);
        separated = skip_whitespace(buf, &mut pos);
    }

    let mut standalone = None;
    if buf[pos..].starts_with(b"standalone") {
        if !separated {
            return Err(err());
        }
        let val = parse_pseudo_attr(buf, &mut pos, b"standalone").ok_or_else(err)?;
        standalone = Some(match val {
            b"yes" => true,
            b"no" => false,
            _ => return Err(err()),
        });
        skip_whitespace(buf, &mut pos);
    }

    // Anything left over (unknown or misordered pseudo-attributes) is an error.
    if pos < buf.len() {
        return Err(err());
    }
    Ok((version, encoding, standalone))
}
fn parse_pseudo_attr<'a>(buf: &'a [u8], pos: &mut usize, expected_name: &[u8]) -> Option<&'a [u8]> {
let end = *pos + expected_name.len();
if end > buf.len() || &buf[*pos..end] != expected_name {
return None;
}
*pos = end;
while *pos < buf.len() && is_xml_whitespace(buf[*pos]) {
*pos += 1;
}
if *pos >= buf.len() || buf[*pos] != b'=' {
return None;
}
*pos += 1;
while *pos < buf.len() && is_xml_whitespace(buf[*pos]) {
*pos += 1;
}
if *pos >= buf.len() {
return None;
}
let quote = buf[*pos];
if quote != b'"' && quote != b'\'' {
return None;
}
*pos += 1;
let value_start = *pos;
while *pos < buf.len() && buf[*pos] != quote {
*pos += 1;
}
if *pos >= buf.len() {
return None;
}
let value = &buf[value_start..*pos];
*pos += 1;
Some(value)
}
/// Determines how many bytes at the end of `buf` belong to an incomplete
/// UTF-8 sequence.
///
/// Only the final bytes are inspected; the rest of `buf` is not validated.
///
/// * `Ok(0)`: `buf` is empty, ends in an ASCII byte, or ends in a sequence
///   whose lead byte promises no more bytes than are present (including a
///   complete four-byte sequence).
/// * `Ok(n)`: the last `n` bytes (1 to 3) start a sequence that is cut off by
///   the end of `buf`.
/// * `Err(i)`: the trailing bytes are all continuation bytes with no lead
///   byte that could own them; `i` is `buf.len().saturating_sub(3)`.
fn utf8_boundary_rewind(buf: &[u8]) -> Result<usize, usize> {
    if buf.is_empty() {
        return Ok(0);
    }
    let tail_start = buf.len().saturating_sub(3);
    for i in (tail_start..buf.len()).rev() {
        let b = buf[i];
        if b < 0x80 {
            // ASCII: nothing can be pending after it.
            return Ok(0);
        }
        if b >= 0xC0 {
            // Lead byte: compare the promised length with what is present.
            let expected_len = if b < 0xE0 {
                2
            } else if b < 0xF0 {
                3
            } else {
                4
            };
            let available = buf.len() - i;
            return Ok(if available >= expected_len { 0 } else { available });
        }
        // Continuation byte: keep looking backwards for its lead.
    }
    // Three trailing continuation bytes are fine when a four-byte lead sits
    // directly in front of them: that sequence is complete.
    let led_by_four_byte = tail_start
        .checked_sub(1)
        .is_some_and(|lead| buf[lead] >= 0xF0);
    if led_by_four_byte {
        return Ok(0);
    }
    Err(tail_start)
}
/// Buffer capacity, in bytes, that `parse_read` passes to
/// `parse_read_with_capacity`.
#[cfg(feature = "std")]
const DEFAULT_BUF_SIZE: usize = 8192;
/// Parses XML read from `reader`, reporting events to `visitor`.
///
/// Equivalent to [`parse_read_with_capacity`] with a capacity of
/// `DEFAULT_BUF_SIZE` bytes.
///
/// # Errors
///
/// See [`parse_read_with_capacity`].
#[cfg(feature = "std")]
pub fn parse_read<R: std::io::Read, V: Visitor>(
reader: R,
visitor: &mut V,
) -> Result<(), ReadError<V::Error>> {
parse_read_with_capacity(reader, visitor, DEFAULT_BUF_SIZE)
}
/// Parses XML read from `reader`, reporting events to `visitor`, using an
/// internal buffer that starts at `capacity` bytes (at least 64).
///
/// Data is read into the buffer, handed to a fresh [`Reader`], and whatever
/// the parser did not consume is moved to the front before the next read. A
/// read that returns 0 bytes marks the input as final. If the parser cannot
/// consume anything from a completely full buffer, the buffer is doubled so
/// that a pending construct longer than the current capacity can still be
/// completed.
///
/// # Errors
///
/// * [`ReadError::Io`] when `reader` fails (`ErrorKind::Interrupted` is
///   retried, not reported);
/// * [`ReadError::Xml`] / [`ReadError::Visitor`] for errors returned by the
///   parser.
#[cfg(feature = "std")]
pub fn parse_read_with_capacity<R: std::io::Read, V: Visitor>(
    mut reader: R,
    visitor: &mut V,
    capacity: usize,
) -> Result<(), ReadError<V::Error>> {
    let capacity = capacity.max(64);
    let mut buf = std::vec![0u8; capacity];
    let mut parser = Reader::new();
    let mut stream_offset: u64 = 0;
    let mut valid: usize = 0;
    loop {
        if valid == buf.len() {
            // No free space left. Reading into an empty slice would return
            // `Ok(0)` and be mistaken for end of input, so make room first.
            let grown = buf.len().saturating_mul(2);
            buf.resize(grown, 0);
        }
        let n = loop {
            match reader.read(&mut buf[valid..]) {
                Ok(n) => break n,
                // Interrupted reads are non-fatal and are simply retried.
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(ReadError::Io(e)),
            }
        };
        valid += n;
        let is_final = n == 0;
        if valid == 0 {
            break;
        }
        let consumed = parser
            .parse(&buf[..valid], stream_offset, is_final, visitor)
            .map_err(ReadError::from_parse)? as usize;
        // Keep the unconsumed tail at the front of the buffer.
        let leftover = valid - consumed;
        if leftover > 0 {
            buf.copy_within(consumed..valid, 0);
        }
        valid = leftover;
        stream_offset += consumed as u64;
        if consumed == 0 && is_final {
            break;
        }
    }
    Ok(())
}
/// Error returned by [`parse_read`] and [`parse_read_with_capacity`].
#[cfg(feature = "std")]
pub enum ReadError<E> {
/// The parser reported an XML error.
Xml(Error),
/// The visitor returned an error of its own type.
Visitor(E),
/// Reading from the underlying `std::io::Read` source failed.
Io(std::io::Error),
}
#[cfg(feature = "std")]
impl<E> ReadError<E> {
/// Converts a [`ParseError`] into the matching `ReadError` variant
/// (`Xml` to `Xml`, `Visitor` to `Visitor`).
fn from_parse(e: ParseError<E>) -> Self {
match e {
ParseError::Xml(e) => ReadError::Xml(e),
ParseError::Visitor(e) => ReadError::Visitor(e),
}
}
}
// Manual `Debug` impl: prints `ReadError::<Variant>(<inner Debug>)` and
// requires `Debug` only of the visitor error type `E`.
#[cfg(feature = "std")]
impl<E: core::fmt::Debug> core::fmt::Debug for ReadError<E> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
ReadError::Xml(e) => write!(f, "ReadError::Xml({e:?})"),
ReadError::Visitor(e) => write!(f, "ReadError::Visitor({e:?})"),
ReadError::Io(e) => write!(f, "ReadError::Io({e:?})"),
}
}
}
// Human-readable message: a short prefix naming the error source, followed
// by the inner error's own `Display` output.
#[cfg(feature = "std")]
impl<E: core::fmt::Display> core::fmt::Display for ReadError<E> {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
ReadError::Xml(e) => write!(f, "XML error: {e}"),
ReadError::Visitor(e) => write!(f, "visitor error: {e}"),
ReadError::Io(e) => write!(f, "I/O error: {e}"),
}
}
}
// Marks `ReadError` as an error type. No methods are overridden, so
// `source()` keeps its default and returns `None`; the inner error is
// already included in the `Display` output.
#[cfg(feature = "std")]
impl<E: core::error::Error> core::error::Error for ReadError<E> {}
// Lets an XML `Error` be converted with `?` / `.into()`; it becomes
// `ReadError::Xml`.
#[cfg(feature = "std")]
impl<E> From<Error> for ReadError<E> {
fn from(e: Error) -> Self {
ReadError::Xml(e)
}
}
// Lets a `std::io::Error` be converted with `?` / `.into()`; it becomes
// `ReadError::Io`.
#[cfg(feature = "std")]
impl<E> From<std::io::Error> for ReadError<E> {
fn from(e: std::io::Error) -> Self {
ReadError::Io(e)
}
}