use alloc::borrow::ToOwned;
use alloc::string::String;
use alloc::vec::Vec;
use core::fmt;
mod ranges;
mod read;
use crate::error::{EndOrError, Error, ErrorContext, ErrorWithContext};
use crate::parser::Result;
use crate::strings::*;
use ranges::*;
use read::Endbyte;
use rxml_validation::selectors::*;
/// Byte offsets of a token within the input stream.
#[derive(Copy, Debug, Clone, PartialEq, Eq)]
pub struct TokenMetrics {
    // Offset of the first byte of the token (counters may wrap; see len()).
    start: usize,
    // Offset one past the last byte of the token.
    end: usize,
}
impl TokenMetrics {
    /// Length of the token in bytes (wrapping-safe, as the offsets may wrap).
    pub fn len(&self) -> usize {
        self.end.wrapping_sub(self.start)
    }

    /// Byte offset of the start of the token.
    pub fn start(&self) -> usize {
        self.start
    }

    /// Byte offset just past the end of the token.
    pub fn end(&self) -> usize {
        self.end
    }

    /// Test-only constructor for building expected metrics values.
    #[cfg(test)]
    pub(crate) const fn new(start: usize, end: usize) -> TokenMetrics {
        TokenMetrics { start, end }
    }
}
/// Tokens emitted by the lexer, each carrying its byte-offset metrics.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// Name inside an element tag or the XML declaration.
    Name(TokenMetrics, Name),
    /// `=` between an attribute name and its value.
    Eq(TokenMetrics),
    /// Quoted attribute value (delimiters excluded, references resolved).
    AttributeValue(TokenMetrics, String),
    /// `?>` ending the XML declaration.
    XMLDeclEnd(TokenMetrics),
    /// `/>` ending an empty element header.
    ElementHeadClose(TokenMetrics),
    /// `>` ending an element header or footer.
    ElementHFEnd(TokenMetrics),
    /// `<?xml` opening the XML declaration.
    XMLDeclStart(TokenMetrics),
    /// `<` plus name opening an element header.
    ElementHeadStart(TokenMetrics, Name),
    /// `</` plus name opening an element footer.
    ElementFootStart(TokenMetrics, Name),
    /// Character data (text / CDATA contents); may arrive in chunks.
    Text(TokenMetrics, String),
    /// `<!--` opening a comment.
    CommentStart(TokenMetrics),
    /// `-->` closing a comment.
    CommentEnd(TokenMetrics),
}
impl Token {
    // Human-readable token-kind names used in diagnostics.
    pub const NAME_NAME: &'static str = "Name";
    pub const NAME_EQ: &'static str = "'='";
    pub const NAME_ATTRIBUTEVALUE: &'static str = "AttValue";
    pub const NAME_XMLDECLEND: &'static str = "'?>'";
    pub const NAME_ELEMENTHEADCLOSE: &'static str = "'/>'";
    pub const NAME_ELEMENTHFEND: &'static str = "'>'";
    pub const NAME_XMLDECLSTART: &'static str = "'<?xml'";
    pub const NAME_ELEMENTHEADSTART: &'static str = "'<'";
    pub const NAME_ELEMENTFOOTSTART: &'static str = "'</'";
    pub const NAME_TEXT: &'static str = "Text";
    pub const NAME_COMMENTSTART: &'static str = "'<!--'";
    pub const NAME_COMMENTEND: &'static str = "'-->'";

    /// Return the human-readable name of this token kind for diagnostics.
    pub fn name(&self) -> &'static str {
        match self {
            Self::Name(..) => Self::NAME_NAME,
            Self::Eq(..) => Self::NAME_EQ,
            Self::AttributeValue(..) => Self::NAME_ATTRIBUTEVALUE,
            Self::XMLDeclEnd(..) => Self::NAME_XMLDECLEND,
            Self::ElementHeadClose(..) => Self::NAME_ELEMENTHEADCLOSE,
            Self::ElementHFEnd(..) => Self::NAME_ELEMENTHFEND,
            Self::XMLDeclStart(..) => Self::NAME_XMLDECLSTART,
            Self::ElementHeadStart(..) => Self::NAME_ELEMENTHEADSTART,
            Self::ElementFootStart(..) => Self::NAME_ELEMENTFOOTSTART,
            Self::Text(..) => Self::NAME_TEXT,
            Self::CommentStart(..) => Self::NAME_COMMENTSTART,
            Self::CommentEnd(..) => Self::NAME_COMMENTEND,
        }
    }

    /// Return the byte-offset metrics of this token.
    pub fn metrics(&self) -> &TokenMetrics {
        match self {
            Self::Name(m, ..) => m,
            Self::Eq(m) => m,
            Self::AttributeValue(m, ..) => m,
            Self::XMLDeclEnd(m) => m,
            Self::ElementHeadClose(m) => m,
            Self::ElementHFEnd(m) => m,
            Self::XMLDeclStart(m) => m,
            Self::ElementHeadStart(m, ..) => m,
            Self::ElementFootStart(m, ..) => m,
            Self::Text(m, ..) => m,
            Self::CommentStart(m, ..) => m,
            Self::CommentEnd(m, ..) => m,
        }
    }
}
/// Radix of a numeric character reference (`&#…;` vs `&#x…;`).
#[derive(Debug, Clone, Copy, PartialEq)]
enum CharRefRadix {
    Decimal,
    Hexadecimal,
}
/// Kind of reference being lexed after a `&`.
#[derive(Debug, Clone, Copy, PartialEq)]
enum RefKind {
    /// Named entity reference (`&amp;` etc.).
    Entity,
    /// Numeric character reference with the given radix.
    Char(CharRefRadix),
}
/// Sub-states while lexing inside an element tag or the XML declaration.
#[derive(Debug, Clone, Copy, PartialEq)]
enum ElementState {
    /// Reading the tag name right after `<` / `</`.
    Start,
    /// Whitespace is required before the next name.
    SpaceRequired,
    /// Optional whitespace may follow.
    Blank,
    /// Reading an attribute name.
    Name,
    /// An `=` was just read.
    Eq,
    /// A closing `>` was just read.
    Close,
    /// Inside an attribute value: (delimiter byte, pending-`\r` flag).
    AttributeValue(u8, bool),
    /// `?` seen inside the XML declaration; expecting `>`.
    MaybeXMLDeclEnd,
    /// `/` seen inside an element header; expecting `>`.
    MaybeHeadClose,
}
/// Which kind of tag-like construct is being lexed.
#[derive(Debug, Clone, Copy, PartialEq)]
enum ElementKind {
    /// Element header (`<name …>`).
    Header,
    /// Element footer (`</name>`).
    Footer,
    /// XML declaration (`<?xml …?>`).
    XMLDecl,
}
/// Progress through the constructs introduced by `<!`.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Bang {
    /// Just read `<!`; the next byte picks comment vs. CDATA.
    Initial,
    /// Matching `<![CDATA[`; index of the next expected byte.
    CDataSectionStart(usize),
    /// Matching `<!--`; index of the next expected byte.
    CommentStart(usize),
}
/// Progress after reading a `<` in content.
#[derive(Debug, Clone, Copy, PartialEq)]
enum MaybeElementState {
    Initial,
    /// `<!…` constructs (comment / CDATA section).
    Bang(Bang),
    /// Matching `<?xml`; index of the next expected byte.
    XMLDeclStart(usize),
}
/// Where to return after CRLF normalization completes.
#[derive(Debug, Clone, Copy, PartialEq)]
enum CrlfReturn {
    /// Plain text content.
    Normal,
    /// Inside a CDATA section.
    Cdata,
    /// Inside a comment.
    Comment,
}
impl From<CrlfReturn> for ContentState {
fn from(other: CrlfReturn) -> Self {
match other {
CrlfReturn::Normal => Self::Initial,
CrlfReturn::Cdata => Self::CDataSection,
CrlfReturn::Comment => Self::Comment,
}
}
}
/// States while lexing document content (outside element tags).
#[derive(Debug, Clone, Copy, PartialEq)]
enum ContentState {
    Initial,
    /// Inside `<![CDATA[ … ]]>`.
    CDataSection,
    /// Just read `<`; deciding what follows.
    MaybeElement(MaybeElementState),
    /// Only whitespace allowed (after the XML declaration).
    Whitespace,
    /// Matching `]]>`: (currently inside a CDATA section, bytes matched).
    MaybeCDataEnd(bool, usize),
    /// A `\r` was read; normalizing line endings.
    MaybeCRLF(CrlfReturn),
    /// Inside `<!-- … -->`.
    Comment,
    /// Matching `-->`; bytes matched so far.
    MaybeCommentEnd(usize),
}
/// Where lexing resumes once a `&…;` reference has been resolved.
#[derive(Debug, Clone, Copy, PartialEq)]
enum RefReturnState {
    /// Back into an attribute value: (element kind, delimiter byte).
    AttributeValue(ElementKind, u8),
    /// Back into text content.
    Text,
}
impl RefReturnState {
    /// Convert this return marker into the lexer state to resume in.
    fn to_state(self) -> State {
        if let Self::AttributeValue(kind, delim) = self {
            State::Element {
                kind,
                state: ElementState::AttributeValue(delim, false),
            }
        } else {
            State::Content(ContentState::Initial)
        }
    }
}
/// Top-level lexer state machine.
#[derive(Debug, Clone, Copy, PartialEq)]
enum State {
    /// In document content.
    Content(ContentState),
    /// Inside an element tag or the XML declaration.
    Element {
        kind: ElementKind,
        state: ElementState,
    },
    /// Inside a `&…;` reference.
    Reference {
        /// Context used when reporting errors.
        ctx: ErrorContext,
        /// Where to resume once the reference is resolved.
        ret: RefReturnState,
        kind: RefKind,
    },
    /// End of document reached.
    Eof,
}
// Hard cap on the number of bytes accepted for an entity name or
// character-reference digits before bailing out with an error.
const MAX_REFERENCE_LENGTH: usize = 8usize;
// Multi-byte delimiters, matched one byte at a time by the state machines.
const TOK_XML_DECL_START: &[u8] = b"<?xml";
const TOK_XML_CDATA_START: &[u8] = b"<![CDATA[";
const TOK_XML_COMMENT_START: &[u8] = b"<!--";
const TOK_XML_CDATA_END: &[u8] = b"]]>";
const TOK_XML_COMMENT_END: &[u8] = b"-->";
/// Tunables for the lexer.
#[derive(Debug, Clone, PartialEq, Copy)]
pub(crate) struct LexerOptions {
    // Upper bound on the byte length buffered for a single token.
    pub max_token_length: usize,
}
fn resolve_named_entity(name: &[u8]) -> Result<u8> {
match name {
b"amp" => Ok(b'&'),
b"lt" => Ok(b'<'),
b"gt" => Ok(b'>'),
b"apos" => Ok(b'\''),
b"quot" => Ok(b'"'),
_ => Err(EndOrError::Error(Error::UndeclaredEntity)),
}
}
fn resolve_char_reference(s: &str, radix: CharRefRadix, into: &mut Vec<u8>) -> Result<()> {
let radix = match radix {
CharRefRadix::Decimal => 10,
CharRefRadix::Hexadecimal => 16,
};
let codepoint = u32::from_str_radix(s, radix).unwrap();
let ch = match char::from_u32(codepoint) {
Some(ch) => ch,
None => return Err(EndOrError::Error(Error::InvalidChar(None, codepoint, true))),
};
if !CLASS_XML_NONCHAR.select(ch) {
let mut buf = [0u8; 4];
let s = ch.encode_utf8(&mut buf[..]);
into.extend_from_slice(s.as_bytes());
Ok(())
} else {
Err(EndOrError::Error(Error::InvalidChar(None, codepoint, true)))
}
}
/// Attach `ctx` to any error carried in `r`.
fn add_context<T>(r: Result<T>, ctx: ErrorContext) -> Result<T> {
    r.map_err(|e| e.with_context(ctx))
}
/// Turn an EOF (`None`) into a well-formedness error for context `ctx`.
fn handle_eof<T>(v: Option<T>, ctx: ErrorContext) -> Result<T> {
    v.ok_or_else(|| EndOrError::Error(Error::wfeof(ctx)))
}
/// Pair of (next state, token to emit, if any) returned by lexing steps.
struct ST(State, Option<Token>);
impl ST {
    /// Install the new state and hand back the token, if any.
    fn splice(self, st: &mut State) -> Option<Token> {
        *st = self.0;
        self.1
    }
}
/// Restricted-XML lexer operating on byte slices.
pub struct Lexer {
    // Current state-machine state.
    state: State,
    // Accumulates the bytes of the token currently being lexed.
    scratchpad: Vec<u8>,
    // Secondary buffer; stashes pending text while a reference is lexed.
    swap: Vec<u8>,
    // Running byte counter over all input seen (wrapping arithmetic).
    ctr: usize,
    // `ctr` value at the end of the previously emitted token.
    last_token_end: usize,
    opts: LexerOptions,
    // Debug builds: previous (scratchpad, state) pair to detect stuck loops.
    #[cfg(debug_assertions)]
    prev_state: (Vec<u8>, State),
    // Debug builds: last byte read via read_single, for panic messages.
    #[cfg(debug_assertions)]
    last_single_read: Option<u8>,
    // First fatal error; re-returned on every later call (sticky).
    err: Option<Error>,
    // Whether the caller declared the current input to end the document.
    has_eof: bool,
    // When false, partial text is emitted eagerly instead of buffered.
    pub(crate) text_buffering: bool,
}
impl Lexer {
/// Create a fresh lexer with the given options.
pub fn new(opts: LexerOptions) -> Self {
    Self {
        state: State::Content(ContentState::Initial),
        scratchpad: Vec::new(),
        swap: Vec::new(),
        ctr: 0,
        last_token_end: 0,
        opts,
        #[cfg(debug_assertions)]
        prev_state: (Vec::new(), State::Content(ContentState::Initial)),
        #[cfg(debug_assertions)]
        last_single_read: None,
        err: None,
        has_eof: false,
        // Text is buffered into complete tokens by default.
        text_buffering: true,
    }
}
/// Turn an `Eof` end-point into `NeedMoreData` unless the caller has
/// declared that the input really ends the document here.
fn demote_eof(&self, ep: Endbyte) -> Result<Endbyte> {
    match ep {
        Endbyte::Eof if !self.has_eof => Err(EndOrError::NeedMoreData),
        other => Ok(other),
    }
}
/// Error used when a name or reference exceeds the configured length limit.
fn token_length_error() -> EndOrError {
    EndOrError::Error(Error::RestrictedXml("long name or reference"))
}
/// Advance the token-start marker past skipped input, excluding the last
/// `without` bytes (e.g. a delimiter that belongs to the next token).
fn eat_whitespace_metrics(&mut self, without: usize) {
    self.last_token_end = self.ctr.wrapping_sub(without);
}
/// Grow the scratchpad capacity to the configured maximum token length
/// up front, so pushes while lexing a token cannot reallocate repeatedly.
#[inline]
fn prep_scratchpad(&mut self) {
    let missing = self
        .opts
        .max_token_length
        .saturating_sub(self.scratchpad.capacity());
    if missing > 0 {
        self.scratchpad.reserve_exact(missing);
    }
}
/// Read bytes matching `selector` into the scratchpad, up to `limit`
/// total scratchpad bytes, updating the running byte counter.
///
/// Returns the end-point: the first non-matching byte, the limit, or
/// EOF (demoted to `NeedMoreData` unless EOF was declared).
fn read_validated<B: ByteSelect + ?Sized>(
    &mut self,
    r: &mut &[u8],
    selector: &B,
    limit: usize,
) -> Result<Endbyte> {
    let remaining = match limit.checked_sub(self.scratchpad.len()) {
        // Scratchpad already at (or over) the limit.
        None => return Ok(Endbyte::Limit),
        Some(v) => v,
    };
    let old_len = self.scratchpad.len();
    self.prep_scratchpad();
    let ep = read::read_validated_bytes(r, selector, remaining, &mut self.scratchpad);
    self.ctr = self.ctr.wrapping_add(self.scratchpad.len() - old_len);
    // A delimiter byte was consumed from the input but not stored.
    if let Endbyte::Delimiter(_) = ep {
        self.ctr = self.ctr.wrapping_add(1);
    }
    self.demote_eof(ep)
}
/// Read a single byte, counting it; `Ok(None)` only at a declared EOF.
#[inline]
fn read_single(&mut self, r: &mut &[u8]) -> Result<Option<u8>> {
    let last_read = match r.split_first() {
        Some((v, tail)) => {
            self.ctr = self.ctr.wrapping_add(1);
            *r = tail;
            Some(*v)
        }
        None => {
            if self.has_eof {
                None
            } else {
                // Input exhausted, but more may be supplied later.
                return Err(EndOrError::NeedMoreData);
            }
        }
    };
    #[cfg(debug_assertions)]
    {
        // Remembered for the stuck-state panic message in lex_bytes_raw.
        self.last_single_read = last_read;
    }
    Ok(last_read)
}
/// Skip bytes matching `selector` without storing them, counting them.
/// Returns the number of bytes skipped together with the end-point.
#[inline]
fn skip_matching<B: ByteSelect>(
    &mut self,
    r: &mut &[u8],
    selector: &B,
) -> (usize, Result<Endbyte>) {
    let (nread, ep) = read::skip_matching_bytes(r, selector);
    self.ctr = self.ctr.wrapping_add(nread);
    match self.demote_eof(ep) {
        Ok(ep) => {
            // A delimiter byte was consumed but not stored.
            if let Endbyte::Delimiter(_) = ep {
                self.ctr = self.ctr.wrapping_add(1)
            };
            (nread, Ok(ep))
        }
        Err(e) => (nread, Err(e)),
    }
}
/// Discard any buffered scratchpad contents.
fn drop_scratchpad(&mut self) -> Result<()> {
    self.scratchpad.clear();
    Ok(())
}
/// Exchange the scratchpad with the swap buffer (used to stash pending
/// text while a reference's name/digits are collected).
fn swap_scratchpad(&mut self) -> Result<()> {
    core::mem::swap(&mut self.scratchpad, &mut self.swap);
    Ok(())
}
/// Take the contents of the swap buffer, leaving it empty.
fn read_swap(&mut self) -> Vec<u8> {
    // `mem::take` replaces the buffer with a fresh empty Vec — exactly
    // what the previous manual swap-with-temporary achieved.
    core::mem::take(&mut self.swap)
}
/// Produce metrics for the token ending `without` bytes before the
/// current position, and advance the token-start marker.
fn metrics(&mut self, without: usize) -> TokenMetrics {
    let start = self.last_token_end;
    let end = self.ctr.wrapping_sub(without);
    self.last_token_end = end;
    TokenMetrics { start, end }
}
/// Convert the scratchpad contents via `conv` and clear it afterwards,
/// even if the conversion fails.
fn flush_scratchpad<U, T: FnOnce(&[u8]) -> Result<U>>(&mut self, conv: T) -> Result<U> {
    let result = conv(&self.scratchpad);
    self.scratchpad.clear();
    result
}
fn flush_scratchpad_as_name(&mut self) -> Result<Name> {
self.flush_scratchpad(|bytes| -> Result<Name> {
let s = match core::str::from_utf8(bytes) {
Ok(s) => Ok(s),
Err(e) => Err(Error::utf8err(bytes, &e)),
}?;
Ok(s.try_into()?)
})
}
fn flush_scratchpad_as_complete_cdata(&mut self) -> Result<String> {
self.flush_scratchpad(|bytes| -> Result<String> {
let s = match core::str::from_utf8(bytes) {
Ok(s) => Ok(s),
Err(e) => Err(Error::utf8err(bytes, &e)),
}?;
validate_cdata(s)?;
Ok(s.to_owned())
})
}
/// Flush as much of the scratchpad as is currently valid UTF-8 and valid
/// character data, leaving any incomplete trailing bytes behind.
///
/// Errors with `InvalidUtf8Byte` if not even the first byte is valid;
/// the caller decides whether more data could still complete a sequence.
fn flush_scratchpad_as_partial_cdata(&mut self) -> Result<String> {
    let s = match core::str::from_utf8(&self.scratchpad[..]) {
        Ok(s) => s,
        Err(e) => {
            let valid_up_to = e.valid_up_to();
            if valid_up_to == 0 {
                return Err(EndOrError::Error(Error::InvalidUtf8Byte(
                    self.scratchpad[0],
                )));
            } else {
                // SAFETY: from_utf8 validated the first `valid_up_to` bytes.
                unsafe { core::str::from_utf8_unchecked(&self.scratchpad[..valid_up_to]) }
            }
        }
    };
    validate_cdata(s)?;
    let s = s.to_owned();
    let to_drop = s.len();
    // Keep only the unconsumed (incomplete) tail in the scratchpad.
    self.scratchpad.drain(..to_drop);
    Ok(s)
}
/// Emit buffered text as a `Text` token, or `None` if nothing is
/// buffered. `without` excludes trailing delimiter bytes from the
/// metrics; `complete` selects full vs. partial validation/flushing.
fn maybe_flush_scratchpad_as_text(
    &mut self,
    without: usize,
    complete: bool,
) -> Result<Option<Token>> {
    if self.scratchpad.is_empty() {
        // Nothing to emit, but the metrics marker must still advance.
        self.eat_whitespace_metrics(without);
        Ok(None)
    } else {
        Ok(Some(Token::Text(
            self.metrics(without),
            if complete {
                self.flush_scratchpad_as_complete_cdata()?
            } else {
                self.flush_scratchpad_as_partial_cdata()?
            },
        )))
    }
}
/// Emit buffered text early if the length limit is reached, or — in
/// unbuffered mode — whenever anything at all is buffered.
fn flush_limited_scratchpad_as_text(&mut self) -> Result<Option<Token>> {
    if self.scratchpad.len() >= self.opts.max_token_length
        || (!self.text_buffering && !self.scratchpad.is_empty())
    {
        Ok(Some(Token::Text(
            self.metrics(0),
            self.flush_scratchpad_as_partial_cdata()?,
        )))
    } else {
        Ok(None)
    }
}
/// Handle a delimiter byte encountered after plain text.
///
/// Returns `Ok(None)` if `b` is not a recognized delimiter; the caller
/// then either rejects it or buffers it as literal text.
fn lex_posttext_char(&mut self, b: u8) -> Result<Option<ST>> {
    match b {
        b'<' => Ok(Some(ST(
            State::Content(ContentState::MaybeElement(MaybeElementState::Initial)),
            // Flush pending text, excluding the `<` from its metrics.
            self.maybe_flush_scratchpad_as_text(1, true)?,
        ))),
        b']' => Ok(Some(ST(
            // Could be the start of a forbidden `]]>` in text.
            State::Content(ContentState::MaybeCDataEnd(false, 1)),
            None,
        ))),
        b'&' => {
            // Flush text first; the reference's name/digits then collect in
            // the scratchpad while the swap buffer holds stashed content.
            let tok = self.maybe_flush_scratchpad_as_text(1, true)?;
            self.swap_scratchpad()?;
            Ok(Some(ST(
                State::Reference {
                    ctx: ErrorContext::Text,
                    ret: RefReturnState::Text,
                    kind: RefKind::Entity,
                },
                tok,
            )))
        }
        b'\r' => {
            // Line-ending normalization.
            Ok(Some(ST(
                State::Content(ContentState::MaybeCRLF(CrlfReturn::Normal)),
                None,
            )))
        }
        _ => Ok(None),
    }
}
fn lex_maybe_element(&mut self, state: MaybeElementState, r: &mut &[u8]) -> Result<ST> {
match state {
MaybeElementState::Initial => match self.read_single(r)? {
Some(byte) => match byte {
b'?' => {
self.drop_scratchpad()?;
Ok(ST(
State::Content(ContentState::MaybeElement(
MaybeElementState::XMLDeclStart(2),
)),
None,
))
}
b'!' => {
self.drop_scratchpad()?;
Ok(ST(
State::Content(ContentState::MaybeElement(MaybeElementState::Bang(
Bang::Initial,
))),
None,
))
}
b'/' => {
self.drop_scratchpad()?;
Ok(ST(
State::Element {
kind: ElementKind::Footer,
state: ElementState::Start,
},
None,
))
}
byte => {
if maybe_name(byte) {
self.prep_scratchpad();
self.scratchpad.push(byte);
Ok(ST(
State::Element {
kind: ElementKind::Header,
state: ElementState::Start,
},
None,
))
} else {
self.drop_scratchpad()?;
Err(EndOrError::Error(Error::UnexpectedByte(
Some(ErrorContext::NameStart),
byte,
None,
)))
}
}
},
None => Err(EndOrError::Error(Error::wfeof(ErrorContext::Element))),
},
MaybeElementState::XMLDeclStart(i) => {
debug_assert!(i < TOK_XML_DECL_START.len());
let b = handle_eof(self.read_single(r)?, ErrorContext::CdataSectionStart)?;
if b != TOK_XML_DECL_START[i] {
return Err(EndOrError::Error(Error::RestrictedXml(
"processing instructions",
)));
}
let next = i + 1;
if next == TOK_XML_DECL_START.len() {
self.drop_scratchpad()?;
Ok(ST(
State::Element {
kind: ElementKind::XMLDecl,
state: ElementState::SpaceRequired,
},
Some(Token::XMLDeclStart(self.metrics(0))),
))
} else {
Ok(ST(
State::Content(ContentState::MaybeElement(
MaybeElementState::XMLDeclStart(next),
)),
None,
))
}
}
MaybeElementState::Bang(Bang::Initial) => {
let b = handle_eof(self.read_single(r)?, ErrorContext::XmlDeclarationStart)?;
if b == b'-' {
Ok(ST(
State::Content(ContentState::MaybeElement(MaybeElementState::Bang(
Bang::CommentStart(3),
))),
None,
))
} else if b == b'[' {
Ok(ST(
State::Content(ContentState::MaybeElement(MaybeElementState::Bang(
Bang::CDataSectionStart(3),
))),
None,
))
} else {
return Err(EndOrError::Error(Error::InvalidSyntax(
"malformed cdata or comment section start",
)));
}
}
MaybeElementState::Bang(Bang::CDataSectionStart(i)) => {
debug_assert!(i < TOK_XML_CDATA_START.len());
let b = handle_eof(self.read_single(r)?, ErrorContext::XmlDeclarationStart)?;
if b != TOK_XML_CDATA_START[i] {
return Err(EndOrError::Error(Error::InvalidSyntax(
"malformed cdata section start",
)));
}
let next = i + 1;
if next == TOK_XML_CDATA_START.len() {
self.drop_scratchpad()?;
Ok(ST(
State::Content(ContentState::CDataSection),
self.maybe_flush_scratchpad_as_text(TOK_XML_CDATA_START.len(), true)?,
))
} else {
Ok(ST(
State::Content(ContentState::MaybeElement(MaybeElementState::Bang(
Bang::CDataSectionStart(next),
))),
None,
))
}
}
MaybeElementState::Bang(Bang::CommentStart(i)) => {
debug_assert!(i < TOK_XML_COMMENT_START.len());
let b = handle_eof(self.read_single(r)?, ErrorContext::XmlDeclarationStart)?;
if b != TOK_XML_COMMENT_START[i] {
return Err(EndOrError::Error(Error::InvalidSyntax(
"malformed cdata section start",
)));
}
let next = i + 1;
if next == TOK_XML_COMMENT_START.len() {
self.drop_scratchpad()?;
Ok(ST(
State::Content(ContentState::Comment),
Some(Token::CommentStart(self.metrics(0))),
))
} else {
Ok(ST(
State::Content(ContentState::MaybeElement(MaybeElementState::Bang(
Bang::CommentStart(next),
))),
None,
))
}
}
}
}
/// Process byte `b` as if it had been read in plain text state.
fn lex_resume_text(&mut self, b: u8) -> Result<ST> {
    match self.lex_posttext_char(b)? {
        Some(st) => Ok(st),
        None => {
            if is_nonchar_byte(b) {
                Err(EndOrError::Error(Error::InvalidChar(
                    Some(ErrorContext::Text),
                    b as u32,
                    false,
                )))
            } else {
                // Ordinary text byte: buffer it and continue in text state.
                self.prep_scratchpad();
                self.scratchpad.push(b);
                Ok(ST(State::Content(ContentState::Initial), None))
            }
        }
    }
}
/// Continue matching a potential `]]>`: inside a CDATA section this
/// terminates the section; in plain text it is forbidden.
///
/// `nend` is the number of `]]>` bytes matched so far (1 or 2).
fn lex_maybe_cdata_end(&mut self, in_cdata: bool, nend: usize, r: &mut &[u8]) -> Result<ST> {
    debug_assert!(nend < TOK_XML_CDATA_END.len());
    let ctx = if in_cdata {
        ErrorContext::CdataSection
    } else {
        ErrorContext::Text
    };
    let b = handle_eof(self.read_single(r)?, ctx)?;
    let expected = TOK_XML_CDATA_END[nend];
    if b == expected {
        match nend {
            1 => Ok(ST(
                State::Content(ContentState::MaybeCDataEnd(in_cdata, 2)),
                None,
            )),
            2 => {
                if !in_cdata {
                    Err(EndOrError::Error(Error::InvalidSyntax(
                        "unescaped ']]>' forbidden in text",
                    )))
                } else {
                    // End of the CDATA section: flush its contents.
                    Ok(ST(
                        State::Content(ContentState::Initial),
                        self.maybe_flush_scratchpad_as_text(0, true)?,
                    ))
                }
            }
            _ => panic!("unreachable state: cdata nend = {:?}", nend),
        }
    } else if b == b']' {
        // `]]]…`: the oldest `]` is literal; the last two may still end
        // the section, so the match position is kept.
        self.prep_scratchpad();
        self.scratchpad.push(b']');
        Ok(ST(
            State::Content(ContentState::MaybeCDataEnd(in_cdata, nend)),
            self.flush_limited_scratchpad_as_text()?,
        ))
    } else {
        // Match failed: the `]`s seen so far were literal text.
        self.prep_scratchpad();
        self.scratchpad
            .extend_from_slice(&TOK_XML_CDATA_END[..nend]);
        if in_cdata {
            if is_nonchar_byte(b) {
                Err(EndOrError::Error(Error::InvalidChar(
                    Some(ErrorContext::CdataSection),
                    b as u32,
                    false,
                )))
            } else {
                self.scratchpad.push(b);
                Ok(ST(
                    State::Content(ContentState::CDataSection),
                    self.flush_limited_scratchpad_as_text()?,
                ))
            }
        } else {
            self.lex_resume_text(b)
        }
    }
}
/// Continue matching a potential `-->` comment terminator.
///
/// `nend` is the number of `-->` bytes matched so far (1 or 2).
fn lex_maybe_comment_end(&mut self, nend: usize, r: &mut &[u8]) -> Result<ST> {
    debug_assert!(nend < TOK_XML_COMMENT_END.len());
    let b = handle_eof(self.read_single(r)?, ErrorContext::Comment)?;
    let expected = TOK_XML_COMMENT_END[nend];
    if b == expected {
        return match nend {
            1 => Ok(ST(
                State::Content(ContentState::MaybeCommentEnd(2)),
                // Flush comment text, excluding the two `-` bytes.
                self.maybe_flush_scratchpad_as_text(2, true)?,
            )),
            2 => {
                Ok(ST(
                    State::Content(ContentState::Initial),
                    Some(Token::CommentEnd(self.metrics(0))),
                ))
            }
            _ => panic!("unreachable state: comment nend = {:?}", nend),
        };
    }
    if nend == 1 {
        // A single `-` followed by an ordinary byte is literal comment text.
        self.prep_scratchpad();
        self.scratchpad.push(b'-');
        if is_nonchar_byte(b) {
            Err(EndOrError::Error(Error::InvalidChar(
                Some(ErrorContext::Comment),
                b as u32,
                false,
            )))
        } else {
            self.scratchpad.push(b);
            Ok(ST(
                State::Content(ContentState::Comment),
                self.flush_limited_scratchpad_as_text()?,
            ))
        }
    } else {
        // `--` not followed by `>` is forbidden inside comments.
        Err(EndOrError::Error(Error::InvalidSyntax("`--` in comment")))
    }
}
fn lex_content(&mut self, state: ContentState, r: &mut &[u8]) -> Result<ST> {
match state {
ContentState::MaybeElement(substate) => self.lex_maybe_element(substate, r),
ContentState::MaybeCDataEnd(in_cdata, nend) => {
self.lex_maybe_cdata_end(in_cdata, nend, r)
}
ContentState::MaybeCommentEnd(nend) => self.lex_maybe_comment_end(nend, r),
ContentState::MaybeCRLF(return_to) => {
let b = handle_eof(self.read_single(r)?, ErrorContext::Text)?;
match b {
b'\n' => {
self.prep_scratchpad();
self.scratchpad.push(b'\n');
Ok(ST(State::Content(return_to.into()), None))
}
b'\r' => {
self.prep_scratchpad();
self.scratchpad.push(b'\n');
Ok(ST(State::Content(ContentState::MaybeCRLF(return_to)), None))
}
b => {
self.prep_scratchpad();
self.scratchpad.push(b'\n');
match return_to {
CrlfReturn::Normal => self.lex_resume_text(b),
CrlfReturn::Cdata => {
if b == b']' {
Ok(ST(
State::Content(ContentState::MaybeCDataEnd(true, 1)),
None,
))
} else if !is_nonchar_byte(b) {
self.scratchpad.push(b);
Ok(ST(State::Content(ContentState::CDataSection), None))
} else {
Err(EndOrError::Error(Error::InvalidChar(
Some(ErrorContext::CdataSection),
b as u32,
false,
)))
}
}
CrlfReturn::Comment => {
if b == b'-' {
Ok(ST(State::Content(ContentState::MaybeCommentEnd(1)), None))
} else if !is_nonchar_byte(b) {
self.scratchpad.push(b);
Ok(ST(State::Content(ContentState::Comment), None))
} else {
Err(EndOrError::Error(Error::InvalidChar(
Some(ErrorContext::Comment),
b as u32,
false,
)))
}
}
}
}
}
}
ContentState::Initial => {
match self.read_validated(r, &maybe_text, self.opts.max_token_length) {
Ok(Endbyte::Eof) => Ok(ST(
State::Eof,
self.maybe_flush_scratchpad_as_text(0, true)?,
)),
Ok(Endbyte::Limit) => Ok(ST(
State::Content(ContentState::Initial),
self.maybe_flush_scratchpad_as_text(0, false)?,
)),
Ok(Endbyte::Delimiter(b)) => match self.lex_posttext_char(b)? {
Some(st) => Ok(st),
None => Err(EndOrError::Error(Error::InvalidChar(
Some(ErrorContext::Text),
b as u32,
false,
))),
},
Err(EndOrError::NeedMoreData)
if !self.text_buffering && !self.scratchpad.is_empty() =>
{
Ok(ST(
State::Content(ContentState::Initial),
match self.maybe_flush_scratchpad_as_text(0, false) {
Err(EndOrError::Error(Error::InvalidUtf8Byte(..)))
if self.scratchpad.len() < 4 =>
{
return Err(EndOrError::NeedMoreData);
}
Err(other) => return Err(other),
Ok(v) => v,
},
))
}
Err(other) => Err(other),
}
}
ContentState::CDataSection => {
match self.read_validated(r, &maybe_cdata_content, self.opts.max_token_length)? {
Endbyte::Eof => {
Err(EndOrError::Error(Error::wfeof(ErrorContext::CdataSection)))
}
Endbyte::Limit => Ok(ST(
State::Content(ContentState::CDataSection),
self.maybe_flush_scratchpad_as_text(0, false)?,
)),
Endbyte::Delimiter(b) => match b {
b']' => Ok(ST(
State::Content(ContentState::MaybeCDataEnd(true, 1)),
None,
)),
b'\r' => Ok(ST(
State::Content(ContentState::MaybeCRLF(CrlfReturn::Cdata)),
None,
)),
_ => Err(EndOrError::Error(Error::InvalidChar(
Some(ErrorContext::CdataSection),
b as u32,
false,
))),
},
}
}
ContentState::Whitespace => match self.skip_matching(r, &is_space) {
(_, Ok(Endbyte::Eof)) | (_, Ok(Endbyte::Limit)) => Ok(ST(State::Eof, None)),
(_, Ok(Endbyte::Delimiter(b))) => match b {
b'<' => Ok(ST(
State::Content(ContentState::MaybeElement(MaybeElementState::Initial)),
None,
)),
_ => Err(EndOrError::Error(Error::UnexpectedByte(
Some(ErrorContext::XmlDeclarationEnd),
b,
Some(&["Spaces", "<"]),
))),
},
(_, Err(e)) => Err(e),
},
ContentState::Comment => {
match self.read_validated(r, &maybe_comment_content, self.opts.max_token_length)? {
Endbyte::Eof => Err(EndOrError::Error(Error::wfeof(ErrorContext::Comment))),
Endbyte::Limit => Ok(ST(
State::Content(ContentState::Comment),
self.maybe_flush_scratchpad_as_text(0, false)?,
)),
Endbyte::Delimiter(b) => match b {
b'-' => Ok(ST(State::Content(ContentState::MaybeCommentEnd(1)), None)),
b'\r' => Ok(ST(
State::Content(ContentState::MaybeCRLF(CrlfReturn::Comment)),
None,
)),
_ => Err(EndOrError::Error(Error::InvalidChar(
Some(ErrorContext::CdataSection),
b as u32,
false,
))),
},
}
}
}
}
/// Classify the delimiter byte that ended a name or a blank run inside a
/// tag, and pick the next element sub-state.
fn lex_element_postblank(&mut self, kind: ElementKind, b: u8) -> Result<ElementState> {
    match b {
        b' ' | b'\t' | b'\r' | b'\n' => Ok(ElementState::Blank),
        b'"' => Ok(ElementState::AttributeValue(b'"', false)),
        b'\'' => Ok(ElementState::AttributeValue(b'\'', false)),
        b'=' => Ok(ElementState::Eq),
        b'>' => match kind {
            ElementKind::Footer | ElementKind::Header => Ok(ElementState::Close),
            // The XML declaration must be closed with `?>`, not `>`.
            ElementKind::XMLDecl => Err(EndOrError::Error(Error::UnexpectedChar(
                Some(ErrorContext::XmlDeclaration),
                '>',
                Some(&["?"]),
            ))),
        },
        b'?' => match kind {
            ElementKind::XMLDecl => Ok(ElementState::MaybeXMLDeclEnd),
            _ => Err(EndOrError::Error(Error::UnexpectedChar(
                Some(ErrorContext::Element),
                '?',
                None,
            ))),
        },
        b'/' => match kind {
            // `/` is only valid as the start of `/>` in a header.
            ElementKind::Header => Ok(ElementState::MaybeHeadClose),
            ElementKind::Footer => Err(EndOrError::Error(Error::UnexpectedChar(
                Some(ErrorContext::ElementFoot),
                '/',
                None,
            ))),
            ElementKind::XMLDecl => Err(EndOrError::Error(Error::UnexpectedChar(
                Some(ErrorContext::XmlDeclaration),
                '/',
                None,
            ))),
        },
        b if maybe_name(b) => {
            // Start of an attribute name.
            self.prep_scratchpad();
            self.scratchpad.push(b);
            Ok(ElementState::Name)
        }
        _ => Err(EndOrError::Error(Error::UnexpectedByte(
            match kind {
                ElementKind::XMLDecl => Some(ErrorContext::XmlDeclaration),
                _ => Some(ErrorContext::Element),
            },
            b,
            Some(&["whitespace", "\"", "'", "=", ">", "?", "/", "start of name"]),
        ))),
    }
}
/// Handle the delimiter byte that ended a run of attribute-value text.
fn lex_attval_next(&mut self, delim: u8, b: u8, element_kind: ElementKind) -> Result<ST> {
    match b {
        // Literal `<` is forbidden inside attribute values.
        b'<' => Err(EndOrError::Error(Error::UnexpectedChar(
            Some(ErrorContext::AttributeValue),
            '<',
            None,
        ))),
        b'&' => {
            // Stash the partial value; the reference's name/digits collect
            // in the scratchpad and are resolved back into it.
            self.swap_scratchpad()?;
            Ok(ST(
                State::Reference {
                    ctx: ErrorContext::AttributeValue,
                    ret: RefReturnState::AttributeValue(element_kind, delim),
                    kind: RefKind::Entity,
                },
                None,
            ))
        }
        b'\t' | b'\n' => {
            // Attribute-value normalization: whitespace becomes a space.
            self.prep_scratchpad();
            self.scratchpad.push(b' ');
            Ok(ST(
                State::Element {
                    kind: element_kind,
                    state: ElementState::AttributeValue(delim, false),
                },
                None,
            ))
        }
        // Defer CR normalization to the pending-\r state.
        b'\r' => Ok(ST(
            State::Element {
                kind: element_kind,
                state: ElementState::AttributeValue(delim, true),
            },
            None,
        )),
        // Matching quote: the value is complete.
        d if d == delim => Ok(ST(
            State::Element {
                kind: element_kind,
                state: ElementState::SpaceRequired,
            },
            Some(Token::AttributeValue(
                self.metrics(0),
                self.flush_scratchpad_as_complete_cdata()?,
            )),
        )),
        other => Err(EndOrError::Error(Error::InvalidChar(
            Some(ErrorContext::AttributeValue),
            other as u32,
            false,
        ))),
    }
}
/// Lex one step inside an element header, footer or the XML declaration.
fn lex_element(&mut self, kind: ElementKind, state: ElementState, r: &mut &[u8]) -> Result<ST> {
    match state {
        // Reading the element name (Start) or an attribute name (Name).
        ElementState::Start | ElementState::Name => {
            match self.read_validated(r, &maybe_name, self.opts.max_token_length)? {
                Endbyte::Eof => Err(EndOrError::Error(Error::wfeof(ErrorContext::Name))),
                Endbyte::Limit => Err(Self::token_length_error()),
                Endbyte::Delimiter(ch) => {
                    let next_state = self.lex_element_postblank(kind, ch)?;
                    let name =
                        add_context(self.flush_scratchpad_as_name(), ErrorContext::NameStart)?;
                    // Exclude the delimiter byte from the name's metrics.
                    let metrics = self.metrics(1);
                    Ok(ST(
                        State::Element {
                            kind,
                            state: next_state,
                        },
                        Some(if state == ElementState::Name {
                            Token::Name(metrics, name)
                        } else {
                            match kind {
                                ElementKind::Header => Token::ElementHeadStart(metrics, name),
                                ElementKind::Footer => Token::ElementFootStart(metrics, name),
                                // The XML declaration never starts in Start.
                                ElementKind::XMLDecl => panic!("invalid state"),
                            }
                        }),
                    ))
                }
            }
        }
        ElementState::SpaceRequired | ElementState::Blank => {
            match self.skip_matching(r, &is_space) {
                (_, Ok(Endbyte::Eof)) | (_, Ok(Endbyte::Limit)) => {
                    Err(EndOrError::Error(Error::wfeof(ErrorContext::Element)))
                }
                (nmatching, Err(EndOrError::NeedMoreData))
                    if nmatching > 0 && state == ElementState::SpaceRequired =>
                {
                    // Some space was seen: the requirement is satisfied even
                    // though the next byte is not available yet.
                    Ok(ST(
                        State::Element {
                            kind,
                            state: ElementState::Blank,
                        },
                        None,
                    ))
                }
                (nmatching, Ok(Endbyte::Delimiter(b))) => {
                    self.eat_whitespace_metrics(1);
                    let next_state = self.lex_element_postblank(kind, b)?;
                    if next_state == ElementState::Name
                        && state == ElementState::SpaceRequired
                        && nmatching == 0
                    {
                        Err(EndOrError::Error(Error::InvalidSyntax(
                            "space required before attribute names",
                        )))
                    } else {
                        Ok(ST(
                            State::Element {
                                kind,
                                state: next_state,
                            },
                            None,
                        ))
                    }
                }
                (_, Err(e)) => Err(e),
            }
        }
        // Plain attribute-value text (no pending \r).
        ElementState::AttributeValue(delim, false) => {
            let selector = if delim == b'\'' {
                &maybe_attval_apos as &dyn Fn(_) -> _
            } else {
                &maybe_attval_quot as &dyn Fn(_) -> _
            };
            match self.read_validated(r, &selector, self.opts.max_token_length)? {
                Endbyte::Eof => Err(EndOrError::Error(Error::wfeof(
                    ErrorContext::AttributeValue,
                ))),
                Endbyte::Limit => Err(Self::token_length_error()),
                Endbyte::Delimiter(utf8ch) => self.lex_attval_next(delim, utf8ch, kind),
            }
        }
        // A `\r` was read inside the value and its normalized space has
        // not been emitted yet (XML attribute-value normalization maps
        // each \r / \r\n to a single space).
        ElementState::AttributeValue(delim, true) => {
            let b = handle_eof(self.read_single(r)?, ErrorContext::AttributeValue)?;
            if b == b'\r' {
                // \r\r: a space for the first; the second stays pending.
                self.prep_scratchpad();
                self.scratchpad.push(b' ');
                Ok(ST(
                    State::Element {
                        kind,
                        state: ElementState::AttributeValue(delim, true),
                    },
                    None,
                ))
            } else if b == b'\n' {
                // \r\n folds into a single space.
                self.prep_scratchpad();
                self.scratchpad.push(b' ');
                Ok(ST(
                    State::Element {
                        kind,
                        state: ElementState::AttributeValue(delim, false),
                    },
                    None,
                ))
            } else {
                // Fix: a lone \r still normalizes to a space; previously
                // that space was dropped and an ordinary byte after the \r
                // was rejected as an invalid character.
                self.prep_scratchpad();
                self.scratchpad.push(b' ');
                let is_value_byte = if delim == b'\'' {
                    maybe_attval_apos(b)
                } else {
                    maybe_attval_quot(b)
                };
                if is_value_byte {
                    // Ordinary value byte: buffer it and continue.
                    self.scratchpad.push(b);
                    Ok(ST(
                        State::Element {
                            kind,
                            state: ElementState::AttributeValue(delim, false),
                        },
                        None,
                    ))
                } else {
                    // Delimiter (quote, `&`, `\t`, …): handle as usual.
                    self.lex_attval_next(delim, b, kind)
                }
            }
        }
        ElementState::MaybeXMLDeclEnd => match self.read_single(r)? {
            Some(b'>') => {
                self.drop_scratchpad()?;
                Ok(ST(
                    // Only whitespace may precede the document element.
                    State::Content(ContentState::Whitespace),
                    Some(Token::XMLDeclEnd(self.metrics(0))),
                ))
            }
            Some(b) => Err(EndOrError::Error(Error::UnexpectedByte(
                Some(ErrorContext::XmlDeclarationEnd),
                b,
                Some(&[">"]),
            ))),
            None => Err(EndOrError::Error(Error::wfeof(
                ErrorContext::XmlDeclarationEnd,
            ))),
        },
        ElementState::MaybeHeadClose => match self.read_single(r)? {
            Some(b'>') => {
                self.drop_scratchpad()?;
                Ok(ST(
                    State::Content(ContentState::Initial),
                    Some(Token::ElementHeadClose(self.metrics(0))),
                ))
            }
            Some(b) => Err(EndOrError::Error(Error::UnexpectedByte(
                Some(ErrorContext::ElementClose),
                b,
                Some(&[">"]),
            ))),
            None => Err(EndOrError::Error(Error::wfeof(ErrorContext::ElementClose))),
        },
        ElementState::Eq => Ok(ST(
            State::Element {
                kind,
                state: ElementState::Blank,
            },
            Some(Token::Eq(self.metrics(0))),
        )),
        ElementState::Close => Ok(ST(
            State::Content(ContentState::Initial),
            Some(Token::ElementHFEnd(self.metrics(0))),
        )),
    }
}
/// Lex the contents of a `&…;` reference and append the resolved bytes
/// to the scratchpad. On entry, the scratchpad collects only the
/// reference name/digits; the surrounding text was stashed in the swap
/// buffer by the caller.
fn lex_reference(
    &mut self,
    ctx: ErrorContext,
    ret: RefReturnState,
    kind: RefKind,
    r: &mut &[u8],
) -> Result<ST> {
    let result = match kind {
        RefKind::Entity => self.read_validated(r, &maybe_name, MAX_REFERENCE_LENGTH)?,
        RefKind::Char(CharRefRadix::Decimal) => {
            self.read_validated(r, &is_decimal_digit, MAX_REFERENCE_LENGTH)?
        }
        RefKind::Char(CharRefRadix::Hexadecimal) => {
            self.read_validated(r, &is_hexadecimal_digit, MAX_REFERENCE_LENGTH)?
        }
    };
    // `Ok(())` below means the reference resolved; `Err(byte)` carries the
    // offending byte for the error report at the bottom.
    let result = match result {
        Endbyte::Eof => return Err(EndOrError::Error(Error::wfeof(ErrorContext::Reference))),
        // Over-long references are treated as undeclared entities.
        Endbyte::Limit => return Err(EndOrError::Error(Error::UndeclaredEntity)),
        Endbyte::Delimiter(b) => match b {
            b'#' => {
                if !self.scratchpad.is_empty() {
                    Err(b'#')
                } else {
                    match kind {
                        RefKind::Entity => {
                            // `&#` switches to a decimal character reference.
                            return Ok(ST(
                                State::Reference {
                                    ctx,
                                    ret,
                                    kind: RefKind::Char(CharRefRadix::Decimal),
                                },
                                None,
                            ));
                        }
                        _ => Err(b'#'),
                    }
                }
            }
            b'x' => {
                if !self.scratchpad.is_empty() {
                    Err(b'x')
                } else {
                    match kind {
                        RefKind::Char(CharRefRadix::Decimal) => {
                            // `&#x` switches to hexadecimal.
                            return Ok(ST(
                                State::Reference {
                                    ctx,
                                    ret,
                                    kind: RefKind::Char(CharRefRadix::Hexadecimal),
                                },
                                None,
                            ));
                        }
                        _ => Err(b'x'),
                    }
                }
            }
            b';' => {
                if self.scratchpad.is_empty() {
                    return Err(EndOrError::Error(Error::InvalidSyntax("empty reference")));
                }
                // Restore the stashed text into the scratchpad and take the
                // collected reference name/digits out of the swap buffer.
                self.swap_scratchpad()?;
                let entity = self.read_swap();
                match kind {
                    RefKind::Entity => {
                        let b = add_context(resolve_named_entity(&entity[..]), ctx)?;
                        self.scratchpad.push(b);
                        Ok(())
                    }
                    RefKind::Char(radix) => {
                        // SAFETY: only ASCII digit bytes were accepted by the
                        // is_decimal_digit / is_hexadecimal_digit selectors.
                        let entity = unsafe { core::str::from_utf8_unchecked(&entity[..]) };
                        Ok(add_context(
                            resolve_char_reference(entity, radix, &mut self.scratchpad),
                            ctx,
                        )?)
                    }
                }
            }
            c => Err(c),
        },
    };
    match result {
        Ok(_) => Ok(ST(ret.to_state(), None)),
        Err(b) => Err(EndOrError::Error(Error::UnexpectedByte(
            Some(ErrorContext::Reference),
            b,
            Some(&[";"]),
        ))),
    }
}
/// Run the state machine until a token is produced, more data is needed,
/// an error occurs, or the end of the document is reached (`Ok(None)`).
fn lex_bytes_raw(&mut self, r: &mut &[u8]) -> Result<Option<Token>> {
    // Errors are sticky: once failed, the same error is returned forever.
    if let Some(e) = self.err {
        return Err(EndOrError::Error(e));
    }
    loop {
        let stresult = match self.state {
            State::Content(substate) => self.lex_content(substate, r),
            State::Element {
                kind,
                state: substate,
            } => self.lex_element(kind, substate, r),
            State::Reference { ctx, ret, kind } => self.lex_reference(ctx, ret, kind, r),
            State::Eof => return Ok(None),
        };
        let st = match stresult {
            // NeedMoreData is not sticky; the caller may supply more input.
            Err(EndOrError::NeedMoreData) => return Err(EndOrError::NeedMoreData),
            Err(EndOrError::Error(other)) => {
                self.err = Some(other);
                return Err(EndOrError::Error(other));
            }
            Ok(st) => st,
        };
        if let Some(tok) = st.splice(&mut self.state) {
            #[cfg(debug_assertions)]
            {
                self.prev_state = (self.scratchpad.clone(), self.state);
            }
            return Ok(Some(tok));
        }
        #[cfg(debug_assertions)]
        {
            // Guard against infinite loops: every tokenless iteration must
            // change the state or the scratchpad.
            if self.prev_state.0 == self.scratchpad && self.prev_state.1 == self.state {
                panic!(
                    "state has not changed in the last iteration: {:?} {:?} last read: {:?}",
                    self, self.scratchpad, self.last_single_read
                )
            } else {
                self.prev_state = (self.scratchpad.clone(), self.state)
            }
        }
    }
}
/// Lex tokens from `r`. `at_eof` declares that `r` ends the document;
/// `Ok(None)` signals the end of the token stream.
pub(crate) fn lex_bytes(&mut self, r: &mut &[u8], at_eof: bool) -> Result<Option<Token>> {
    self.has_eof = at_eof;
    self.lex_bytes_raw(r)
}
/// Release the internal buffers' spare capacity.
pub fn release_temporaries(&mut self) {
    self.scratchpad.shrink_to_fit();
    self.swap.shrink_to_fit();
}
}
impl fmt::Debug for Lexer {
    // Only the state is shown; the buffers are noise in debug output.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Lexer").field("state", &self.state).finish()
    }
}
#[cfg(test)]
mod tests {
use super::*;
use alloc::format;
use alloc::string::ToString;
use std::io;
/// Receiver for tokens produced during tests.
trait Sink {
    type ErrorType;
    fn token(&mut self, token: Token);
}
/// Drain all tokens from `l` into `s`, passing `at_eof` through, until
/// the lexer reports end-of-document or an error.
fn stream_to_sink<S: Sink>(
    l: &mut Lexer,
    r: &mut &[u8],
    s: &mut S,
    at_eof: bool,
) -> Result<()> {
    while let Some(tok) = l.lex_bytes(r, at_eof)? {
        s.token(tok);
    }
    Ok(())
}
/// Like `stream_to_sink`, but always with EOF declared.
fn stream_to_sink_from_bytes<S: Sink>(l: &mut Lexer, r: &mut &[u8], s: &mut S) -> Result<()> {
    while let Some(tok) = l.lex_bytes(r, true)? {
        s.token(tok);
    }
    Ok(())
}
/// Sink collecting tokens into a Vec, panicking past `limit` tokens.
struct VecSink {
    dest: Vec<Token>,
    limit: usize,
}
impl VecSink {
    /// Create a sink that panics once more than `limit` tokens arrive.
    fn new(limit: usize) -> VecSink {
        VecSink {
            dest: Vec::new(),
            // Field-init shorthand instead of the redundant `limit: limit`.
            limit,
        }
    }
}
impl Sink for VecSink {
    type ErrorType = io::Error;
    fn token(&mut self, token: Token) {
        // Fail loudly if a test produces unexpectedly many tokens.
        if self.dest.len() >= self.limit {
            panic!("token limit exceeded: {}", self.limit);
        }
        self.dest.push(token);
    }
}
/// Lexer configured with the default parser options, as used by all tests.
fn new_lexer() -> Lexer {
    Lexer::new(crate::parser::Options::default().into())
}
/// Lex `data` in a single pass (EOF declared); return tokens and result.
fn lex(mut data: &[u8], token_limit: usize) -> (Vec<Token>, Result<()>) {
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(token_limit);
    let result = stream_to_sink(&mut lexer, &mut data, &mut sink, true);
    (sink.dest, result)
}
/// Feed `data` chunk by chunk (EOF declared only after the last chunk),
/// asserting that each chunk is fully consumed and ends in NeedMoreData.
fn lex_chunked_with(
    data: &[&[u8]],
    token_limit: usize,
    mut lexer: Lexer,
) -> (Vec<Token>, Result<()>) {
    let mut sink = VecSink::new(token_limit);
    for chunk in data.iter() {
        let mut chunk = *chunk;
        match stream_to_sink(&mut lexer, &mut chunk, &mut sink, false) {
            // Without EOF the lexer must never report end-of-document.
            Ok(()) => panic!("unexpected end of tokens"),
            Err(EndOrError::NeedMoreData) => (),
            Err(e) => return (sink.dest, Err(e)),
        }
        assert_eq!(chunk.len(), 0);
    }
    // Final empty read with EOF declared flushes remaining tokens.
    let result = stream_to_sink(&mut lexer, &mut &[][..], &mut sink, true);
    (sink.dest, result)
}
/// Like [`lex_chunked_with`], but with a default-configured lexer.
fn lex_chunked(data: &[&[u8]], token_limit: usize) -> (Vec<Token>, Result<()>) {
    lex_chunked_with(data, token_limit, new_lexer())
}
/// Convenience wrapper: lexes `data` and returns only the error, if any.
fn lex_err(data: &[u8], token_limit: usize) -> Option<EndOrError> {
    lex(data, token_limit).1.err()
}
/// Lexes the whole input at once; returns the collected tokens, or the first
/// error the lexer reported.
fn run_fuzz_test(mut data: &[u8], token_limit: usize) -> Result<Vec<Token>> {
    let mut sink = VecSink::new(token_limit);
    let mut lexer = new_lexer();
    stream_to_sink(&mut lexer, &mut data, &mut sink, true)?;
    Ok(sink.dest)
}
#[test]
fn lexer_lex_xml_decl_start() {
// A bare "<?xml" yields XMLDeclStart (0..5); the truncated input then makes
// the overall lex fail, which the test expects.
let mut src = "<?xml".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert_eq!(
sink.dest[0],
Token::XMLDeclStart(TokenMetrics { start: 0, end: 5 })
);
}
#[test]
fn lexer_lex_rejects_invalid_xml_decl_opener() {
// "<?xmlversion" must fail with a real syntax error (not merely InvalidEof),
// after having emitted exactly the XMLDeclStart token.
let mut src = "<?xmlversion".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
let err = stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert!(!matches!(err, EndOrError::Error(Error::InvalidEof(..))));
assert_eq!(
sink.dest[0],
Token::XMLDeclStart(TokenMetrics { start: 0, end: 5 })
);
assert_eq!(sink.dest.len(), 1);
}
#[test]
fn lexer_lex_xml_decl_version_name() {
// The "version" name inside the XML declaration spans bytes 6..13.
let mut src = "<?xml version=".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert_eq!(
sink.dest[1],
Token::Name(
TokenMetrics { start: 6, end: 13 },
"version".try_into().unwrap()
)
);
}
#[test]
fn lexer_lex_xml_decl_version_eq() {
// The '=' after "version" is its own token at 13..14.
let mut src = "<?xml version=".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert_eq!(sink.dest[2], Token::Eq(TokenMetrics { start: 13, end: 14 }));
}
#[test]
fn lexer_lex_xml_decl_version_value_squot() {
// Single-quoted attribute value: token covers the quotes (14..19), the
// carried string is the unquoted content.
let mut src = "<?xml version='1.0'".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert_eq!(
sink.dest[3],
Token::AttributeValue(
TokenMetrics { start: 14, end: 19 },
"1.0".try_into().unwrap()
)
);
}
#[test]
fn lexer_lex_xml_decl_version_value_dquot() {
// Same as the squot test, but with double quotes — identical metrics.
let mut src = "<?xml version=\"1.0\"".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert_eq!(
sink.dest[3],
Token::AttributeValue(
TokenMetrics { start: 14, end: 19 },
"1.0".try_into().unwrap()
)
);
}
#[test]
fn lexer_lex_xml_decl_end() {
// "?>" closes the declaration; with it the input lexes cleanly.
let mut src = "<?xml version=\"1.0\"?>".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
assert_eq!(
sink.dest[4],
Token::XMLDeclEnd(TokenMetrics { start: 19, end: 21 })
);
}
#[test]
fn lexer_lex_xml_decl_complete() {
// Full declaration with two pseudo-attributes; checks every token and its
// exact byte range in order.
let mut src = "<?xml version=\"1.0\" encoding='utf-8'?>".as_bytes();
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
let result = stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink);
assert!(result.is_ok());
assert_eq!(
sink.dest[0],
Token::XMLDeclStart(TokenMetrics { start: 0, end: 5 })
);
assert_eq!(
sink.dest[1],
Token::Name(
TokenMetrics { start: 6, end: 13 },
"version".try_into().unwrap()
)
);
assert_eq!(sink.dest[2], Token::Eq(TokenMetrics { start: 13, end: 14 }));
assert_eq!(
sink.dest[3],
Token::AttributeValue(
TokenMetrics { start: 14, end: 19 },
"1.0".try_into().unwrap()
)
);
assert_eq!(
sink.dest[4],
Token::Name(
TokenMetrics { start: 20, end: 28 },
"encoding".try_into().unwrap()
)
);
assert_eq!(sink.dest[5], Token::Eq(TokenMetrics { start: 28, end: 29 }));
assert_eq!(
sink.dest[6],
Token::AttributeValue(
TokenMetrics { start: 29, end: 36 },
"utf-8".try_into().unwrap()
)
);
assert_eq!(
sink.dest[7],
Token::XMLDeclEnd(TokenMetrics { start: 36, end: 38 })
);
}
#[test]
fn lexer_lex_element_start() {
// "<element " yields ElementHeadStart whose metrics include the trailing
// whitespace (0..8); the dangling head then errors at EOF.
let mut src = &b"<element "[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert_eq!(
sink.dest[0],
Token::ElementHeadStart(
TokenMetrics { start: 0, end: 8 },
"element".try_into().unwrap()
)
);
}
#[test]
fn lexer_lex_element_noattr_empty() {
// Self-closing element: head start then "/>" (ElementHeadClose).
let mut src = &b"<element/>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
assert_eq!(
sink.dest[0],
Token::ElementHeadStart(
TokenMetrics { start: 0, end: 8 },
"element".try_into().unwrap()
)
);
assert_eq!(
sink.dest[1],
Token::ElementHeadClose(TokenMetrics { start: 8, end: 10 })
);
}
#[test]
fn lexer_lex_element_noattr_open() {
// Open tag: head start then '>' (ElementHFEnd).
let mut src = &b"<element>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
assert_eq!(
sink.dest[0],
Token::ElementHeadStart(
TokenMetrics { start: 0, end: 8 },
"element".try_into().unwrap()
)
);
assert_eq!(
sink.dest[1],
Token::ElementHFEnd(TokenMetrics { start: 8, end: 9 })
);
}
#[test]
fn lexer_lex_element_noattr_empty_explicit() {
// Explicit open+close pair: head, '>', foot ("</element"), '>' — with exact
// byte ranges for each.
let mut src = &b"<element></element>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
assert_eq!(
sink.dest[0],
Token::ElementHeadStart(
TokenMetrics { start: 0, end: 8 },
"element".try_into().unwrap()
)
);
assert_eq!(
sink.dest[1],
Token::ElementHFEnd(TokenMetrics { start: 8, end: 9 })
);
assert_eq!(
sink.dest[2],
Token::ElementFootStart(
TokenMetrics { start: 9, end: 18 },
"element".try_into().unwrap()
)
);
assert_eq!(
sink.dest[3],
Token::ElementHFEnd(TokenMetrics { start: 18, end: 19 })
);
}
#[test]
fn lexer_lex_element_attribute() {
// Four attributes with both quote styles, including xmlns and a prefixed
// xmlns:abc; checks the Name/Eq/AttributeValue triples in order.
let mut src = &b"<element x='foo' y=\"bar\" xmlns='baz' xmlns:abc='fnord'>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
assert!(matches!(iter.next().unwrap(), Token::ElementHeadStart(_, nm) if nm == "element"));
assert_eq!(
*iter.next().unwrap(),
Token::Name(TokenMetrics { start: 9, end: 10 }, "x".try_into().unwrap())
);
assert!(matches!(iter.next().unwrap(), Token::Eq(_)));
assert_eq!(
*iter.next().unwrap(),
Token::AttributeValue(
TokenMetrics { start: 11, end: 16 },
"foo".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::Name(TokenMetrics { start: 17, end: 18 }, "y".try_into().unwrap())
);
assert!(matches!(iter.next().unwrap(), Token::Eq(_)));
assert_eq!(
*iter.next().unwrap(),
Token::AttributeValue(
TokenMetrics { start: 19, end: 24 },
"bar".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::Name(
TokenMetrics { start: 25, end: 30 },
"xmlns".try_into().unwrap()
)
);
assert!(matches!(iter.next().unwrap(), Token::Eq(_)));
assert_eq!(
*iter.next().unwrap(),
Token::AttributeValue(
TokenMetrics { start: 31, end: 36 },
"baz".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::Name(
TokenMetrics { start: 37, end: 46 },
"xmlns:abc".try_into().unwrap()
)
);
assert!(matches!(iter.next().unwrap(), Token::Eq(_)));
assert_eq!(
*iter.next().unwrap(),
Token::AttributeValue(
TokenMetrics { start: 47, end: 54 },
"fnord".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::ElementHFEnd(TokenMetrics { start: 54, end: 55 })
);
}
#[test]
fn lexer_lex_text() {
// Plain character data between tags becomes a single Text token (6..18).
let mut src = &b"<root>Hello World!</root>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
assert!(matches!(iter.next().unwrap(), Token::ElementHeadStart(_, nm) if nm == "root"));
assert!(matches!(iter.next().unwrap(), Token::ElementHFEnd(_)));
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 6, end: 18 },
"Hello World!".try_into().unwrap()
)
);
assert!(matches!(iter.next().unwrap(), Token::ElementFootStart(_, nm) if nm == "root"));
assert!(matches!(iter.next().unwrap(), Token::ElementHFEnd(_)));
}
#[test]
fn lexer_lex_amp() {
    // An `&amp;` reference in text must expand to a single `&`; the Text
    // token metrics cover the raw five-byte reference (6..11).
    // NOTE(review): the reference literal in this file had been entity-decoded
    // by an extraction step; it was reconstructed from the asserted metrics
    // (a 5-byte reference). `&#38;` would satisfy the metrics equally well.
    let mut src = &b"<root>&amp;</root>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    assert!(matches!(iter.next().unwrap(), Token::ElementHeadStart(_, nm) if nm == "root"));
    assert!(matches!(
        iter.next().unwrap(),
        Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
    ));
    assert_eq!(
        *iter.next().unwrap(),
        Token::Text(TokenMetrics { start: 6, end: 11 }, "&".try_into().unwrap())
    );
    assert!(
        matches!(iter.next().unwrap(), Token::ElementFootStart(TokenMetrics{start: 11, end: 17}, nm) if nm == "root")
    );
    assert!(matches!(iter.next().unwrap(), Token::ElementHFEnd(_)));
}
#[test]
fn lexer_lex_decimal_charref() {
    // A decimal character reference (`&#60;` → '<') expands in text; metrics
    // cover the raw five-byte reference (6..11).
    // NOTE(review): literal reconstructed — the extraction step had decoded
    // the reference; `&#60;` is the only decimal form matching both the
    // expected '<' and the 5-byte span.
    let mut src = &b"<root>&#60;</root>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    assert!(matches!(iter.next().unwrap(), Token::ElementHeadStart(_, nm) if nm == "root"));
    assert!(matches!(
        iter.next().unwrap(),
        Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
    ));
    assert_eq!(
        *iter.next().unwrap(),
        Token::Text(TokenMetrics { start: 6, end: 11 }, "<".try_into().unwrap())
    );
    assert!(
        matches!(iter.next().unwrap(), Token::ElementFootStart(TokenMetrics{start: 11, end: 17}, nm) if nm == "root")
    );
    assert!(matches!(iter.next().unwrap(), Token::ElementHFEnd(_)));
}
#[test]
fn lexer_lex_hexadecimal_charref() {
    // A hexadecimal character reference (`&#x3e;` → '>') expands in text;
    // metrics cover the raw six-byte reference (6..12).
    // NOTE(review): literal reconstructed from the asserted metrics — the
    // extraction step had decoded the reference; a 6-byte hex form producing
    // '>' is `&#x3e;`.
    let mut src = &b"<root>&#x3e;</root>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    assert!(matches!(iter.next().unwrap(), Token::ElementHeadStart(_, nm) if nm == "root"));
    assert!(matches!(
        iter.next().unwrap(),
        Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
    ));
    assert_eq!(
        *iter.next().unwrap(),
        Token::Text(TokenMetrics { start: 6, end: 12 }, ">".try_into().unwrap())
    );
    assert!(
        matches!(iter.next().unwrap(), Token::ElementFootStart(TokenMetrics{start: 12, end: 18}, nm) if nm == "root")
    );
    assert!(matches!(iter.next().unwrap(), Token::ElementHFEnd(_)));
}
/// Consumes a run of consecutive `Token::Text` tokens from `iter`, asserting
/// that their metrics are contiguous.
///
/// Returns `(joined_text, start_of_first, end_of_last, first_non_text_token)`;
/// the last element is `None` when the iterator was exhausted by text tokens.
fn collect_texts<'x, T: Iterator<Item = &'x Token>>(
    iter: &'x mut T,
) -> (String, usize, usize, Option<&'x Token>) {
    let mut texts: Vec<String> = Vec::new();
    let mut start = 0;
    let mut had_start = false;
    let mut end = 0;
    let mut token: Option<&'x Token> = None;
    for tok in iter {
        match tok {
            Token::Text(metrics, t) => {
                if !had_start {
                    start = metrics.start();
                    had_start = true;
                } else {
                    // consecutive text tokens must join up without gaps
                    assert_eq!(metrics.start(), end);
                }
                end = metrics.end();
                texts.push(t.to_string());
            }
            other => {
                token = Some(other);
                break;
            }
        }
    }
    // tail expression instead of explicit `return`
    (texts.join(""), start, end, token)
}
#[test]
fn lexer_lex_mixed_text_entities() {
    // Mixed named entities and character references in text must expand into
    // one contiguous text run whose source span is 6..65.
    // NOTE(review): the source literal had been entity-decoded by an
    // extraction step; reconstructed so the encoded payload is the 59 bytes
    // the asserted metrics require (other 59-byte entity mixes decoding to
    // the same string would pass equally).
    let mut src =
        &b"<root>&#60;example foo=&quot;bar&quot; baz=&apos;fnord&apos;/&gt;</root>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    assert!(matches!(iter.next().unwrap(), Token::ElementHeadStart(_, nm) if nm == "root"));
    assert!(matches!(
        iter.next().unwrap(),
        Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
    ));
    let (text, start, end, _) = collect_texts(&mut iter);
    assert_eq!(start, 6);
    assert_eq!(end, 65);
    assert_eq!(text, "<example foo=\"bar\" baz='fnord'/>");
}
#[test]
fn lexer_lex_reject_charref_with_invalid_cdata() {
    // A character reference resolving to a character that is not valid XML
    // character data must produce a hard error, not NeedMoreData.
    // NOTE(review): the original reference literal was lost to entity
    // decoding during extraction; `&#x00;` (U+0000 is never a valid XML
    // char) restores an input the lexer must reject.
    let mut src = &b"<root>&#x00;</root>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    match stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink) {
        Err(EndOrError::Error(_)) => (),
        other => panic!("unexpected result: {:?}", other),
    }
}
#[test]
fn lexer_lex_attribute_amp() {
    // `&amp;` inside an attribute value expands to '&'; the value token's
    // metrics cover quotes plus the raw reference (10..17 = 7 bytes).
    // NOTE(review): reference literal reconstructed from the metrics; `&#38;`
    // would fit equally well.
    let mut src = &b"<root foo='&amp;'>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    iter.next().unwrap();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::Eq(TokenMetrics { start: 9, end: 10 })
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::AttributeValue(TokenMetrics { start: 10, end: 17 }, "&".try_into().unwrap())
    );
}
#[test]
fn lexer_lex_attribute_mixed_with_entities() {
    // Entities and character references inside an attribute value expand;
    // the value token spans quotes plus the 59-byte encoded payload (10..71).
    // NOTE(review): literal reconstructed after extraction-time entity
    // decoding; the same 59-byte payload as in lexer_lex_mixed_text_entities.
    let mut src =
        &b"<root foo='&#60;example foo=&quot;bar&quot; baz=&apos;fnord&apos;/&gt;'>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    iter.next().unwrap();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::Eq(TokenMetrics { start: 9, end: 10 })
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::AttributeValue(
            TokenMetrics { start: 10, end: 71 },
            "<example foo=\"bar\" baz='fnord'/>".try_into().unwrap()
        )
    );
}
#[test]
fn lexer_lex_cdata_section() {
// CDATA content is passed through verbatim; the Text token's metrics span
// the whole section including the "<![CDATA[" and "]]>" markers (6..50).
let mut src = &b"<root><![CDATA[<example foo=\"bar\" baz='fnord'/>]]></root>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
iter.next().unwrap();
assert_eq!(
*iter.next().unwrap(),
Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 6, end: 50 },
"<example foo=\"bar\" baz='fnord'/>".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::ElementFootStart(
TokenMetrics { start: 50, end: 56 },
"root".try_into().unwrap()
)
);
iter.next().unwrap();
}
#[test]
fn lexer_lex_cdata_section_degenerate() {
// An empty CDATA section must not emit an empty Text token; the foot start
// follows the '>' token directly (metrics jump from 6 to 18).
let mut src = &b"<root><![CDATA[]]></root>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
iter.next().unwrap();
assert_eq!(
*iter.next().unwrap(),
Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
);
assert_eq!(
*iter.next().unwrap(),
Token::ElementFootStart(
TokenMetrics { start: 18, end: 24 },
"root".try_into().unwrap()
)
);
iter.next().unwrap();
}
#[test]
fn lexer_lex_cdata_section_mixed() {
    // Text before, inside (verbatim, including '<', '&' and a stray "]]"),
    // and after a CDATA section joins into one contiguous run ending at 62.
    // NOTE(review): extraction had decoded "&amp;" inside both the input and
    // the expected string to "&"; restored so the source span arithmetic
    // (end == 62) holds and the CDATA content stays verbatim.
    let mut src = &b"<root>foobar <![CDATA[Hello <fun>]]</fun>&amp;games world!]]> </root>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
    );
    let (text, start, end, next) = collect_texts(&mut iter);
    assert_eq!(start, 6);
    assert_eq!(end, 62);
    assert_eq!(text, "foobar Hello <fun>]]</fun>&amp;games world! ");
    assert_eq!(next.unwrap().metrics().start(), 62);
}
#[test]
fn lexer_lex_comment() {
// Comment markers become CommentStart/CommentEnd tokens, the comment body is
// emitted as Text, and surrounding character data keeps exact offsets.
let mut src = &b"<root>hello<!-- not --> world</root>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
iter.next().unwrap();
assert_eq!(
*iter.next().unwrap(),
Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 6, end: 11 },
"hello".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::CommentStart(TokenMetrics { start: 11, end: 15 },)
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 15, end: 20 },
" not ".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::CommentEnd(TokenMetrics { start: 20, end: 23 },)
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 23, end: 29 },
" world".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::ElementFootStart(
TokenMetrics { start: 29, end: 35 },
"root".try_into().unwrap()
)
);
iter.next().unwrap();
}
#[test]
fn lexer_does_not_expand_references_in_comments() {
    // Inside a comment, references are NOT expanded: the body Text must
    // contain the literal "&amp;" and span the full seven bytes (10..17).
    // NOTE(review): extraction had entity-decoded both the input and the
    // expected string to " & ", which contradicted the 7-byte span and the
    // very property this test checks; restored to the literal reference.
    let mut src = &b"<root><!-- &amp; --></root>"[..];
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
    let mut iter = sink.dest.iter();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::CommentStart(TokenMetrics { start: 6, end: 10 },)
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::Text(
            TokenMetrics { start: 10, end: 17 },
            " &amp; ".try_into().unwrap()
        )
    );
    iter.next().unwrap();
}
#[test]
fn lexer_does_not_emit_empty_text_node_for_empty_comment() {
// "<!---->" yields CommentStart directly followed by CommentEnd — no empty
// Text token in between.
let mut src = &b"<root><!----></root>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
iter.next().unwrap();
assert_eq!(
*iter.next().unwrap(),
Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
);
assert_eq!(
*iter.next().unwrap(),
Token::CommentStart(TokenMetrics { start: 6, end: 10 },)
);
assert_eq!(
*iter.next().unwrap(),
Token::CommentEnd(TokenMetrics { start: 10, end: 13 },)
);
iter.next().unwrap();
}
#[test]
fn lexer_lex_restrict_element_name_by_token_length() {
// An element name longer than max_token_length must be rejected as
// RestrictedXml.
let mut src = &b"<foobar2342/>"[..];
let mut lexer = Lexer::new(LexerOptions {
max_token_length: 6,
});
let mut sink = VecSink::new(128);
match stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink) {
Err(EndOrError::Error(Error::RestrictedXml(_))) => (),
other => panic!("unexpected result: {:?}", other),
}
}
#[test]
fn lexer_lex_restrict_attribute_name_by_token_length() {
// An attribute name longer than max_token_length must be rejected.
let mut src = &b"<a foobar2342='foo'/>"[..];
let mut lexer = Lexer::new(LexerOptions {
max_token_length: 6,
});
let mut sink = VecSink::new(128);
match stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink) {
Err(EndOrError::Error(Error::RestrictedXml(_))) => (),
other => panic!("unexpected result: {:?}", other),
}
}
#[test]
fn lexer_lex_restrict_attribute_value_by_token_length() {
// An attribute value longer than max_token_length must be rejected.
let mut src = &b"<a b='foobar2342'/>"[..];
let mut lexer = Lexer::new(LexerOptions {
max_token_length: 6,
});
let mut sink = VecSink::new(128);
match stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink) {
Err(EndOrError::Error(Error::RestrictedXml(_))) => (),
other => panic!("unexpected result: {:?}", other),
}
}
#[test]
fn lexer_lex_restrict_attribute_value_by_token_length_even_with_entities() {
    // The length limit applies to the EXPANDED value: "foob&rx" is seven
    // characters after expanding `&amp;`, which exceeds the limit of six.
    // NOTE(review): `&amp;` restored after extraction-time entity decoding.
    let mut src = &b"<a b='foob&amp;rx'/>"[..];
    let mut lexer = Lexer::new(LexerOptions {
        max_token_length: 6,
    });
    let mut sink = VecSink::new(128);
    match stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink) {
        Err(EndOrError::Error(Error::RestrictedXml(_))) => (),
        other => panic!("unexpected result: {:?}", other),
    }
}
#[test]
fn lexer_lex_attribute_value_entities_do_only_count_for_expansion() {
    // The raw reference bytes do not count toward the limit — only the
    // expansion does: `foob&amp;` is nine raw bytes but expands to five
    // ("foob&"), which fits within max_token_length == 6.
    // NOTE(review): `&amp;` restored after extraction-time entity decoding.
    let mut src = &b"<a b='foob&amp;'/>"[..];
    let mut lexer = Lexer::new(LexerOptions {
        max_token_length: 6,
    });
    let mut sink = VecSink::new(128);
    stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
}
#[test]
fn lexer_lex_token_length_causes_text_nodes_to_be_split() {
// Text longer than max_token_length is emitted as multiple contiguous Text
// tokens of at most that length, rather than erroring.
let mut src = &b"<a>foo001foo002foo003</a>"[..];
let mut lexer = Lexer::new(LexerOptions {
max_token_length: 6,
});
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
iter.next().unwrap();
assert_eq!(
*iter.next().unwrap(),
Token::ElementHFEnd(TokenMetrics { start: 2, end: 3 })
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 3, end: 9 },
"foo001".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 9, end: 15 },
"foo002".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(
TokenMetrics { start: 15, end: 21 },
"foo003".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::ElementFootStart(TokenMetrics { start: 21, end: 24 }, "a".try_into().unwrap())
);
iter.next().unwrap();
}
#[test]
fn lexer_handles_broken_numeric_entity_correctly() {
    // A numeric character reference without any digits must be rejected.
    let input = &b"&#;"[..];
    assert!(run_fuzz_test(input, 128).is_err());
}
#[test]
fn lexer_limits_decimal_entities() {
    // An absurdly long decimal character reference must fail rather than be
    // accepted or loop.
    // NOTE(review): the original digit string was lost to entity decoding
    // during extraction; any value far beyond the Unicode range exercises
    // the same limit, so a 20-digit reference is used here.
    let src = &b"&#99999999999999999999;"[..];
    let result = run_fuzz_test(src, 128);
    assert!(result.is_err());
}
#[test]
fn lexer_limits_hexadecimal_entities() {
    // Hexadecimal counterpart of lexer_limits_decimal_entities.
    // NOTE(review): original digit string lost to entity decoding during
    // extraction; reconstructed with a value far beyond the Unicode range.
    let src = &b"&#x99999999999999999999;"[..];
    let result = run_fuzz_test(src, 128);
    assert!(result.is_err());
}
#[test]
fn lexer_rejects_invalid_namestarts() {
// Digits, quotes and dots cannot start an element name; note that '1' and
// '.' are reported as UnexpectedChar while '\'' is reported as
// UnexpectedByte — presumably depending on the decoding path taken.
let err = lex_err(b"<123/>", 128).unwrap();
match err {
EndOrError::Error(Error::UnexpectedChar(_, '1', None)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<'foo/>", 128).unwrap();
match err {
EndOrError::Error(Error::UnexpectedByte(_, b'\'', None)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<.bar/>", 128).unwrap();
match err {
EndOrError::Error(Error::UnexpectedChar(_, '.', None)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_rejects_invalid_names() {
// '#' and '\\' are not valid name characters mid-name.
let err = lex_err(b"<foo#/>", 128).unwrap();
match err {
EndOrError::Error(Error::UnexpectedByte(..)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<f\\a/>", 128).unwrap();
match err {
EndOrError::Error(Error::UnexpectedByte(..)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_rejects_undeclared_or_invalid_references() {
// Unknown entity names yield UndeclaredEntity; a byte that cannot even be
// part of an entity name ('?') yields UnexpectedByte.
let err = lex_err(b"&123;", 128).unwrap();
match err {
EndOrError::Error(Error::UndeclaredEntity) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"&foobar;", 128).unwrap();
match err {
EndOrError::Error(Error::UndeclaredEntity) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"&?;", 128).unwrap();
match err {
EndOrError::Error(Error::UnexpectedByte(_, b'?', _)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_rejects_non_scalar_char_refs() {
    // A character reference to a surrogate code point (not a Unicode scalar
    // value) must be reported as InvalidChar with from_ref == true.
    // NOTE(review): the original reference literal was lost to entity
    // decoding during extraction (it rendered as U+FFFD); `&#xd800;` is a
    // representative non-scalar reference.
    let err = lex_err(b"&#xd800;", 128).unwrap();
    match err {
        EndOrError::Error(Error::InvalidChar(_, _, true)) => (),
        other => panic!("unexpected error: {:?}", other),
    }
}
#[test]
fn lexer_rejects_non_xml_10_chars_via_refs_in_text() {
    // References to scalar values that are not valid XML 1.0 characters
    // (e.g. C0 controls other than TAB/LF/CR) must fail with
    // InvalidChar(..., true) — the `true` marks "came from a reference".
    // NOTE(review): the original reference literals were lost to entity
    // decoding during extraction; reconstructed with representative
    // invalid-in-XML-1.0 code points.
    let err = lex_err(b"&#x0;", 128).unwrap();
    match err {
        EndOrError::Error(Error::InvalidChar(_, _, true)) => (),
        other => panic!("unexpected error: {:?}", other),
    }
    let err = lex_err(b"&#x1f;", 128).unwrap();
    match err {
        EndOrError::Error(Error::InvalidChar(_, _, true)) => (),
        other => panic!("unexpected error: {:?}", other),
    }
}
#[test]
fn lexer_rejects_non_xml_10_chars_via_refs_in_attrs() {
    // Same as the text variant, but the invalid references occur inside
    // attribute values.
    // NOTE(review): the original reference literals were lost to entity
    // decoding during extraction; reconstructed with representative
    // invalid-in-XML-1.0 code points.
    let err = lex_err(b"<a foo='&#x0;'/>", 128).unwrap();
    match err {
        EndOrError::Error(Error::InvalidChar(_, _, true)) => (),
        other => panic!("unexpected error: {:?}", other),
    }
    let err = lex_err(b"<a foo='&#x1f;'/>", 128).unwrap();
    match err {
        EndOrError::Error(Error::InvalidChar(_, _, true)) => (),
        other => panic!("unexpected error: {:?}", other),
    }
}
#[test]
fn lexer_rejects_non_xml_10_chars_verbatim_in_text() {
// Raw control bytes in text fail with InvalidChar(..., false) — the `false`
// marks that the char did NOT come from a reference.
let err = lex_err(b"\x00", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, _, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"\x1f", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, _, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_rejects_non_xml_10_chars_verbatim_in_attrs() {
// Raw control bytes inside attribute values are rejected the same way as
// in text: InvalidChar with from_ref == false.
let err = lex_err(b"<a foo='\x00'/>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, _, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<a foo='\x1f'/>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, _, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_re_emits_error_on_next_call() {
// Once the lexer hits a hard error it must keep returning the same error on
// subsequent calls instead of producing further tokens.
let mut src = &b"<a>\x00</a>"[..];
let mut lexer = new_lexer();
let mut sink = VecSink::new(128);
let e1 = stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
let e2 = stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink)
.err()
.unwrap();
assert_eq!(e1, e2);
let mut iter = sink.dest.iter();
assert!(matches!(iter.next().unwrap(), Token::ElementHeadStart(_, nm) if nm == "a"));
assert!(matches!(iter.next().unwrap(), Token::ElementHFEnd(_)));
assert!(iter.next().is_none());
}
#[test]
fn lexer_handles_closing_brackets_in_cdata_section() {
// "<![CDATA[]]]>": the first ']' is content, the following "]]>" closes the
// section — yielding a Text token containing exactly "]".
let mut src = &b"<a><![CDATA[]]]></a>"[..];
let mut lexer = Lexer::new(LexerOptions {
max_token_length: 6,
});
let mut sink = VecSink::new(128);
stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink).unwrap();
let mut iter = sink.dest.iter();
iter.next().unwrap();
assert_eq!(
*iter.next().unwrap(),
Token::ElementHFEnd(TokenMetrics { start: 2, end: 3 })
);
assert_eq!(
*iter.next().unwrap(),
Token::Text(TokenMetrics { start: 3, end: 16 }, "]".try_into().unwrap())
);
assert_eq!(
*iter.next().unwrap(),
Token::ElementFootStart(TokenMetrics { start: 16, end: 19 }, "a".try_into().unwrap())
);
iter.next().unwrap();
}
#[test]
fn lexer_recovers_from_wouldblock() {
// Feeding the declaration in 5-byte chunks (NeedMoreData between chunks)
// must produce the same tokens and metrics as a single-shot lex.
let seq = &b"<?xml version='1.0'?>"[..];
let (sink, result) = lex_chunked(&seq.chunks(5).collect::<Vec<_>>(), 128);
result.unwrap();
let mut iter = sink.iter();
assert_eq!(
*iter.next().unwrap(),
Token::XMLDeclStart(TokenMetrics { start: 0, end: 5 })
);
assert_eq!(
*iter.next().unwrap(),
Token::Name(
TokenMetrics { start: 6, end: 13 },
"version".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::Eq(TokenMetrics { start: 13, end: 14 })
);
assert_eq!(
*iter.next().unwrap(),
Token::AttributeValue(
TokenMetrics { start: 14, end: 19 },
"1.0".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::XMLDeclEnd(TokenMetrics { start: 19, end: 21 })
);
}
#[test]
fn lexer_recovers_from_wouldblock_within_long_whitespace_with_correct_counting() {
// Chunk boundaries falling inside whitespace runs must not skew the byte
// counters — the metrics below account for the extra padding spaces.
let seq = &b"<?xml version = '1.0' ?>"[..];
let (sink, result) = lex_chunked(&seq.chunks(5).collect::<Vec<_>>(), 128);
result.unwrap();
let mut iter = sink.iter();
assert_eq!(
*iter.next().unwrap(),
Token::XMLDeclStart(TokenMetrics { start: 0, end: 5 })
);
assert_eq!(
*iter.next().unwrap(),
Token::Name(
TokenMetrics { start: 8, end: 15 },
"version".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::Eq(TokenMetrics { start: 17, end: 18 })
);
assert_eq!(
*iter.next().unwrap(),
Token::AttributeValue(
TokenMetrics { start: 20, end: 25 },
"1.0".try_into().unwrap()
)
);
assert_eq!(
*iter.next().unwrap(),
Token::XMLDeclEnd(TokenMetrics { start: 27, end: 29 })
);
}
#[test]
fn lexer_rejects_missing_whitespace_between_attrvalue_and_attrname() {
// "'x'b=" — an attribute name directly after a closing quote is a syntax
// error.
let err = lex_err(b"<a a='x'b='y'/>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidSyntax(_)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_rejects_nonchar_in_comment() {
// A NUL byte inside a comment is rejected, both directly after "<!--" and
// after a '-' (the partial-"--" lookahead path).
let err = lex_err(b"<a><!--\x00--></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, 0u32, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<a><!---\x00--></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, 0u32, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_rejects_dashdash_in_comment() {
// Per XML 1.0, "--" must not appear inside a comment body.
let err = lex_err(b"<a><!-- -- --></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidSyntax(_)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_handles_specials_comment_end() {
    // After "-->" the lexer must resume normal content handling: an element
    // foot directly after the comment, and a reference directly after it.
    let (toks, r) = lex(&b"<root><!----></root>"[..], 128);
    r.unwrap();
    let mut iter = toks.iter();
    iter.next().unwrap();
    iter.next().unwrap();
    iter.next().unwrap();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::ElementFootStart(
            TokenMetrics { start: 13, end: 19 },
            "root".try_into().unwrap()
        )
    );
    iter.next().unwrap();
    // NOTE(review): the reference literal had been entity-decoded by an
    // extraction step; reconstructed from the asserted 5-byte span 13..18
    // (`&#38;` would fit equally well).
    let (toks, r) = lex(&b"<root><!---->&amp;</root>"[..], 128);
    r.unwrap();
    let mut iter = toks.iter();
    iter.next().unwrap();
    iter.next().unwrap();
    iter.next().unwrap();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::Text(TokenMetrics { start: 13, end: 18 }, "&".try_into().unwrap())
    );
}
#[test]
fn lexer_rejects_nonchar_after_cr_in_comment() {
// CR line-ending folding inside a comment must not swallow validity checks
// on the byte that follows the CR.
let err = lex_err(b"<a><!--\r\x01--></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, 1, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_allows_cdata_end_marker_in_comment() {
// "]]>" has no special meaning inside a comment and passes through as text.
let (toks, r) = lex(b"<a><!--]]>--></a>", 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
iter.next().unwrap();
match iter.next().unwrap() {
Token::Text(_, cdata) => {
assert_eq!(cdata, "]]>");
}
other => panic!("unexpected token: {:?}", other),
}
}
#[test]
fn lexer_allows_comment_marker_in_cdata() {
// "-->" has no special meaning inside a CDATA section.
let (toks, r) = lex(b"<a><![CDATA[-->]]></a>", 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
match iter.next().unwrap() {
Token::Text(_, cdata) => {
assert_eq!(cdata, "-->");
}
other => panic!("unexpected token: {:?}", other),
}
}
#[test]
fn lexer_correctly_linefolds_with_comment_marker_in_cdata() {
// A lone CR inside CDATA is folded to LF, and the following "-->" stays
// literal content.
let (toks, r) = lex(b"<a><![CDATA[\r-->]]></a>", 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
match iter.next().unwrap() {
Token::Text(_, cdata) => {
assert_eq!(cdata, "\n-->");
}
other => panic!("unexpected token: {:?}", other),
}
}
#[test]
fn lexer_folds_crlf_to_lf_in_comment() {
// CRLF inside a comment folds to a single LF.
let (toks, r) = lex(b"<a><!--\r\n--></a>", 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
iter.next().unwrap();
match iter.next().unwrap() {
Token::Text(_, cdata) => {
assert_eq!(cdata, "\n");
}
other => panic!("unexpected token: {:?}", other),
}
}
#[test]
fn lexer_cr_folding_in_comment_does_not_break_exit() {
// A lone CR just before "-->" folds to LF and the comment still terminates.
let (toks, r) = lex(b"<a><!--\r--></a>", 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
iter.next().unwrap();
match iter.next().unwrap() {
Token::Text(_, cdata) => {
assert_eq!(cdata, "\n");
}
other => panic!("unexpected token: {:?}", other),
}
}
#[test]
fn lexer_cr_folding_in_comment_does_exit_comment() {
// CR folding must not make the lexer leave the comment early: "<>" after
// the folded CR is still comment content.
let (toks, r) = lex(b"<a><!--\r<>--></a>", 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
iter.next().unwrap();
match iter.next().unwrap() {
Token::Text(_, cdata) => {
assert_eq!(cdata, "\n<>");
}
other => panic!("unexpected token: {:?}", other),
}
}
#[test]
fn lexer_rejects_nonchar_in_cdata_section() {
// NUL is rejected inside CDATA in all lookahead states: directly, after a
// single ']', and after "]]".
let err = lex_err(b"<a><![CDATA[\x00]]></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, 0u32, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<a><![CDATA[]\x00]]></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, 0u32, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<a><![CDATA[]]\x00]]></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidChar(_, 0u32, false)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_rejects_cdata_end_in_text() {
// Per XML 1.0, the sequence "]]>" must not occur in character data outside
// a CDATA section — including with extra leading ']' characters.
let err = lex_err(b"<a>]]></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidSyntax(_)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<a>]]]></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidSyntax(_)) => (),
other => panic!("unexpected error: {:?}", other),
}
let err = lex_err(b"<a>]]]]></a>", 128).unwrap();
match err {
EndOrError::Error(Error::InvalidSyntax(_)) => (),
other => panic!("unexpected error: {:?}", other),
}
}
#[test]
fn lexer_handles_partial_cdata_end() {
// ']' runs in text that do NOT form "]]>" are legal and must be emitted as
// (possibly split) text with contiguous metrics. The "]]>"-containing
// inputs below use a CDATA-style escape per the asserted spans (e.g. source
// span 6..12 for a 3-char "]]>" text) — the lexer consumes more bytes than
// the decoded text carries.
let (toks, r) = lex(&b"<root>]]</root>"[..], 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
let (text, start, end, next) = collect_texts(&mut iter);
assert_eq!(text, "]]");
assert_eq!(start, 6);
assert_eq!(end, 8);
assert_eq!(next.unwrap().metrics().start(), 8);
let (toks, r) = lex(&b"<root>]]foo</root>"[..], 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
let (text, start, end, next) = collect_texts(&mut iter);
assert_eq!(text, "]]foo");
assert_eq!(start, 6);
assert_eq!(end, 11);
assert_eq!(next.unwrap().metrics().start(), 11);
// NOTE(review): the next input lexes "]]>" as text yet spans 6..12 in the
// source — the literal here may have been entity-decoded by extraction
// (e.g. an original "]]&gt;"); confirm against the repository.
let (toks, r) = lex(&b"<root>]]></root>"[..], 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
let (text, start, end, next) = collect_texts(&mut iter);
assert_eq!(text, "]]>");
assert_eq!(start, 6);
assert_eq!(end, 12);
assert_eq!(next.unwrap().metrics().start(), 12);
let (toks, r) = lex(&b"<root>]]]</root>"[..], 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
let (text, start, end, next) = collect_texts(&mut iter);
assert_eq!(text, "]]]");
assert_eq!(start, 6);
assert_eq!(end, 9);
assert_eq!(next.unwrap().metrics().start(), 9);
let (toks, r) = lex(&b"<root>]]]foo</root>"[..], 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
let (text, start, end, next) = collect_texts(&mut iter);
assert_eq!(text, "]]]foo");
assert_eq!(start, 6);
assert_eq!(end, 12);
assert_eq!(next.unwrap().metrics().start(), 12);
let (toks, r) = lex(&b"<root>]]]></root>"[..], 128);
r.unwrap();
let mut iter = toks.iter();
iter.next().unwrap();
iter.next().unwrap();
let (text, start, end, next) = collect_texts(&mut iter);
assert_eq!(text, "]]]>");
assert_eq!(start, 6);
assert_eq!(end, 13);
assert_eq!(next.unwrap().metrics().start(), 13);
}
#[test]
fn lexer_handles_specials_after_cdata_end() {
    // After a CDATA section ends, markup and references must be lexed
    // normally again.
    let (toks, r) = lex(&b"<root><![CDATA[]]></root>"[..], 128);
    r.unwrap();
    let mut iter = toks.iter();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::ElementHFEnd(TokenMetrics { start: 5, end: 6 })
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::ElementFootStart(
            TokenMetrics { start: 18, end: 24 },
            "root".try_into().unwrap()
        )
    );
    iter.next().unwrap();
    // The "&" must come from the five-byte entity reference "&amp;" — the
    // metrics 18..23 span exactly those five source bytes; a raw '&' would
    // start a new reference instead of being text.
    let (toks, r) = lex(&b"<root><![CDATA[]]>&amp;</root>"[..], 128);
    r.unwrap();
    let mut iter = toks.iter();
    iter.next().unwrap();
    iter.next().unwrap();
    assert_eq!(
        *iter.next().unwrap(),
        Token::Text(TokenMetrics { start: 18, end: 23 }, "&".try_into().unwrap())
    );
    // "]]" from the second CDATA section plus "&gt;" in text concatenate to
    // "]]>" without triggering the forbidden-"]]>"-in-text error; the end
    // offset 36 = 32 (end of second CDATA section) + len("&gt;").
    let (toks, r) = lex(&b"<root><![CDATA[]]><![CDATA[]]]]>&gt;</root>"[..], 128);
    r.unwrap();
    let mut iter = toks.iter();
    iter.next().unwrap();
    iter.next().unwrap();
    let (text, start, end, next) = collect_texts(&mut iter);
    assert_eq!(text, "]]>");
    assert_eq!(start, 18);
    assert_eq!(end, 36);
    assert_eq!(next.unwrap().metrics().start(), 36);
}
#[test]
fn lexer_rejects_nonchar_in_cdata_end_in_text() {
    // A NUL byte interrupting a partial "]]>" run in text is still an
    // invalid-character error, not a syntax error.
    let inputs: [&[u8]; 2] = [b"<a>]\x00]></a>", b"<a>]]\x00></a>"];
    for input in inputs.iter().copied() {
        let err = lex_err(input, 128).unwrap();
        assert!(
            matches!(&err, EndOrError::Error(Error::InvalidChar(_, 0u32, false))),
            "unexpected error: {:?}",
            err
        );
    }
}
#[test]
fn lexer_rejects_numeric_start_of_name_in_opening_tag() {
    // '4' is not a NameStartChar, so an element name may not begin with it.
    let err = lex_err(b"<4foo>", 128).unwrap();
    assert!(
        matches!(&err, EndOrError::Error(Error::UnexpectedChar(_, '4', None))),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_rejects_numeric_start_of_name_in_closing_tag() {
    // Same NameStartChar rule applies to the name in a closing tag.
    let err = lex_err(b"</4foo>", 128).unwrap();
    assert!(
        matches!(&err, EndOrError::Error(Error::UnexpectedChar(_, '4', None))),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_rejects_zero_length_name_in_opening_tag() {
    // "< >" has no element name at all; the space byte is rejected.
    let err = lex_err(b"< >", 128).unwrap();
    assert!(
        matches!(&err, EndOrError::Error(Error::UnexpectedByte(_, b' ', None))),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_rejects_zero_length_name_in_closing_tag() {
    // "</ >" is malformed; note this surfaces as InvalidSyntax rather than
    // UnexpectedByte (unlike the opening-tag case above).
    let err = lex_err(b"</ >", 128).unwrap();
    assert!(
        matches!(&err, EndOrError::Error(Error::InvalidSyntax(_))),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_lex_accounts_whitespace_between_xml_decl_and_element_to_element() {
    // Source layout: "<?xml ... ?>" ends at byte 38, then "\n\n", then
    // "<root/>" starting at byte 40. The two newlines must be accounted to
    // the element token that follows them — the ElementHeadStart metrics at
    // the bottom start at 38, not 40.
    let mut src = "<?xml version=\"1.0\" encoding='utf-8'?>\n\n<root/>".as_bytes();
    let mut lexer = new_lexer();
    let mut sink = VecSink::new(128);
    let result = stream_to_sink_from_bytes(&mut lexer, &mut src, &mut sink);
    assert!(result.is_ok());
    let mut iter = sink.dest.iter();
    // "<?xml" — bytes 0..5.
    assert_eq!(
        *iter.next().unwrap(),
        Token::XMLDeclStart(TokenMetrics { start: 0, end: 5 })
    );
    // "version" — bytes 6..13.
    assert_eq!(
        *iter.next().unwrap(),
        Token::Name(
            TokenMetrics { start: 6, end: 13 },
            "version".try_into().unwrap()
        )
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::Eq(TokenMetrics { start: 13, end: 14 })
    );
    // "\"1.0\"" — metrics include the quotes, the value does not.
    assert_eq!(
        *iter.next().unwrap(),
        Token::AttributeValue(
            TokenMetrics { start: 14, end: 19 },
            "1.0".try_into().unwrap()
        )
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::Name(
            TokenMetrics { start: 20, end: 28 },
            "encoding".try_into().unwrap()
        )
    );
    assert_eq!(
        *iter.next().unwrap(),
        Token::Eq(TokenMetrics { start: 28, end: 29 })
    );
    // "'utf-8'" — again the quotes are in the metrics only.
    assert_eq!(
        *iter.next().unwrap(),
        Token::AttributeValue(
            TokenMetrics { start: 29, end: 36 },
            "utf-8".try_into().unwrap()
        )
    );
    // "?>" — bytes 36..38.
    assert_eq!(
        *iter.next().unwrap(),
        Token::XMLDeclEnd(TokenMetrics { start: 36, end: 38 })
    );
    // "\n\n<root" — the whitespace is billed to this token: span 38..45.
    match iter.next().unwrap() {
        Token::ElementHeadStart(tm, ..) => {
            assert_eq!(*tm, TokenMetrics { start: 38, end: 45 });
        }
        other => panic!("unexpected event: {:?}", other),
    }
}
#[test]
fn lexer_folds_crlf_to_lf_in_text() {
    // Literal CRLF in character data is folded to a single LF
    // (XML 1.0 §2.11 end-of-line handling).
    let (toks, r) = lex(b"<a>\r\n</a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart and ElementHFEnd.
    iter.next().unwrap();
    iter.next().unwrap();
    let tok = iter.next().unwrap();
    if let Token::Text(_, cdata) = tok {
        assert_eq!(cdata, "\n");
    } else {
        panic!("unexpected token: {:?}", tok);
    }
}
#[test]
fn lexer_rejects_nonchar_after_cr() {
    // U+0001 is not an XML Char; CR folding must not swallow the byte that
    // follows the CR before it is validated.
    let err = lex_err(b"<a>\r\x01</a>", 128).unwrap();
    assert!(
        matches!(&err, EndOrError::Error(Error::InvalidChar(_, 1, false))),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_rejects_nonchar_after_cr_in_cdata() {
    // Same check as lexer_rejects_nonchar_after_cr, but inside a CDATA
    // section where CR folding also applies.
    let err = lex_err(b"<a><![CDATA[\r\x01]]></a>", 128).unwrap();
    assert!(
        matches!(&err, EndOrError::Error(Error::InvalidChar(_, 1, false))),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_does_not_modify_charrefs_for_line_endings() {
    // CR and LF written as character references must NOT be subject to
    // end-of-line normalization; only literal CR / CRLF is folded
    // (XML 1.0 §2.11). The input therefore uses "&#xd;&#xa;", not raw bytes.
    let (toks, r) = lex(b"<a>&#xd;&#xa;</a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    iter.next().unwrap();
    iter.next().unwrap();
    let (text, _, _, _) = collect_texts(&mut iter);
    assert_eq!(text, "\r\n");
}
#[test]
fn lexer_folds_crlf_to_lf_in_cdata() {
    // End-of-line folding applies inside CDATA sections, too.
    let (toks, r) = lex(b"<a><![CDATA[\r\n]]></a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart and ElementHFEnd.
    iter.next().unwrap();
    iter.next().unwrap();
    let tok = iter.next().unwrap();
    if let Token::Text(_, cdata) = tok {
        assert_eq!(cdata, "\n");
    } else {
        panic!("unexpected token: {:?}", tok);
    }
}
#[test]
fn lexer_cr_folding_does_not_break_specials() {
    // A lone CR right before "</a>" must fold to LF without making the lexer
    // mis-handle the following special character.
    let (toks, r) = lex(b"<a>\r</a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart and ElementHFEnd.
    iter.next().unwrap();
    iter.next().unwrap();
    let tok = iter.next().unwrap();
    if let Token::Text(_, cdata) = tok {
        assert_eq!(cdata, "\n");
    } else {
        panic!("unexpected token: {:?}", tok);
    }
}
#[test]
fn lexer_cr_folding_in_cdata_does_not_break_exit() {
    // CR folding directly before "]]>" must not prevent the CDATA section
    // from terminating.
    let (toks, r) = lex(b"<a><![CDATA[\r]]></a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart and ElementHFEnd.
    iter.next().unwrap();
    iter.next().unwrap();
    let tok = iter.next().unwrap();
    if let Token::Text(_, cdata) = tok {
        assert_eq!(cdata, "\n");
    } else {
        panic!("unexpected token: {:?}", tok);
    }
}
#[test]
fn lexer_cr_folding_in_cdata_does_exit_cdata_section() {
    // After a folded CR, "<>" inside the CDATA section is still plain text —
    // the lexer must not drop back into markup mode.
    let (toks, r) = lex(b"<a><![CDATA[\r<>]]></a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart and ElementHFEnd.
    iter.next().unwrap();
    iter.next().unwrap();
    let tok = iter.next().unwrap();
    if let Token::Text(_, cdata) = tok {
        assert_eq!(cdata, "\n<>");
    } else {
        panic!("unexpected token: {:?}", tok);
    }
}
#[test]
fn lexer_folds_crcrlf_to_lflf_in_text() {
    // CR CR LF is two line endings (CR, then CRLF) and folds to two LFs.
    let (toks, r) = lex(b"<a>\r\r\n</a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart and ElementHFEnd.
    iter.next().unwrap();
    iter.next().unwrap();
    let tok = iter.next().unwrap();
    if let Token::Text(_, cdata) = tok {
        assert_eq!(cdata, "\n\n");
    } else {
        panic!("unexpected token: {:?}", tok);
    }
}
#[test]
fn lexer_folds_cr_to_lf_in_text() {
    // A bare CR in character data is folded to LF.
    let (toks, r) = lex(b"<a>\r</a>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart and ElementHFEnd.
    iter.next().unwrap();
    iter.next().unwrap();
    let tok = iter.next().unwrap();
    if let Token::Text(_, cdata) = tok {
        assert_eq!(cdata, "\n");
    } else {
        panic!("unexpected token: {:?}", tok);
    }
}
#[test]
fn lexer_normalizes_whitespace_in_attributes() {
    // Attribute-value normalization (XML 1.0 §3.3.3): CR, CRLF (first folded
    // to LF) and TAB each become one space, so "\r\r\n\t " yields 4 spaces.
    let (toks, r) = lex(b"<a x='\r\r\n\t '/>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart, attribute Name and '='.
    iter.next().unwrap();
    iter.next().unwrap();
    iter.next().unwrap();
    match iter.next().unwrap() {
        Token::AttributeValue(_, cdata) => {
            assert_eq!(cdata, "    ");
        }
        other => panic!("unexpected token: {:?}", other),
    }
}
#[test]
fn lexer_handles_crlf_in_attribute() {
    // CRLF inside an attribute value folds to LF first, then normalizes to a
    // single space; "\r\n\t " therefore yields 3 spaces.
    let (toks, r) = lex(b"<a x='\r\n\t '/>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart, attribute Name and '='.
    iter.next().unwrap();
    iter.next().unwrap();
    iter.next().unwrap();
    match iter.next().unwrap() {
        Token::AttributeValue(_, cdata) => {
            assert_eq!(cdata, "   ");
        }
        other => panic!("unexpected token: {:?}", other),
    }
}
#[test]
fn lexer_preserves_whitespace_inserted_via_charrefs_in_attributes() {
    // Attribute-value normalization maps *literal* whitespace to spaces, but
    // whitespace produced via character references must be preserved verbatim
    // (XML 1.0 §3.3.3). Hence the input spells CR, LF, TAB and SPACE as
    // "&#xd;&#xa;&#x9;&#x20;" rather than raw bytes.
    let (toks, r) = lex(b"<a x='&#xd;&#xa;&#x9;&#x20;'/>", 128);
    r.unwrap();
    let mut iter = toks.iter();
    // Skip ElementHeadStart, attribute Name and '='.
    iter.next().unwrap();
    iter.next().unwrap();
    iter.next().unwrap();
    match iter.next().unwrap() {
        Token::AttributeValue(_, cdata) => {
            assert_eq!(cdata, "\r\n\t ");
        }
        other => panic!("unexpected token: {:?}", other),
    }
}
#[test]
fn lexer_is_resilient_to_chunking() {
    // An attribute list split across two input chunks must lex cleanly.
    let chunks: [&[u8]; 2] = [b"<foo bar='baz' ", b"fnord=''/>"];
    let (_toks, r) = lex_chunked(&chunks[..], 128);
    r.unwrap();
}
#[test]
fn lexer_emits_close_tag_token_even_at_end_of_buffer() {
    // Both tokens of "</foo>" must be emitted before the lexer reports
    // NeedMoreData, even though the buffer ends exactly at the '>'.
    let mut buf = &b"</foo>"[..];
    let mut lexer = new_lexer();
    let first = lexer.lex_bytes(&mut buf, false);
    assert!(
        matches!(&first, Ok(Some(Token::ElementFootStart(..)))),
        "unexpected result: {:?}",
        first
    );
    let second = lexer.lex_bytes(&mut buf, false);
    assert!(
        matches!(&second, Ok(Some(Token::ElementHFEnd(_)))),
        "unexpected result: {:?}",
        second
    );
    let third = lexer.lex_bytes(&mut buf, false);
    assert!(
        matches!(&third, Err(EndOrError::NeedMoreData)),
        "unexpected result: {:?}",
        third
    );
}
#[test]
fn lexer_catches_broken_utf8_sequence_at_end_of_file() {
    // A truncated multi-byte sequence at EOF (lead byte 0xf0 needs three
    // continuations, only two follow) must be reported after the valid
    // leading tokens have come out.
    let mut buf = &b"<xyz>\xf0\x9f\x8e"[..];
    let mut lexer = new_lexer();
    let head = lexer.lex_bytes(&mut buf, true);
    if let Ok(Some(Token::ElementHeadStart(_, ref name))) = head {
        assert_eq!(*name, "xyz");
    } else {
        panic!("unexpected result: {:?}", head);
    }
    let hf_end = lexer.lex_bytes(&mut buf, true);
    assert!(
        matches!(&hf_end, Ok(Some(Token::ElementHFEnd(_)))),
        "unexpected result: {:?}",
        hf_end
    );
    let err = lexer.lex_bytes(&mut buf, true);
    assert!(
        matches!(&err, Err(EndOrError::Error(Error::InvalidUtf8Byte(_)))),
        "unexpected result: {:?}",
        err
    );
}
#[test]
fn lexer_catches_incorrect_utf8_at_end_of_file() {
    // 0xff can never appear in UTF-8; the sequence must be rejected after the
    // valid leading tokens have come out.
    let mut buf = &b"<xyz>\xf0\x9f\x8e\xff"[..];
    let mut lexer = new_lexer();
    let head = lexer.lex_bytes(&mut buf, true);
    if let Ok(Some(Token::ElementHeadStart(_, ref name))) = head {
        assert_eq!(*name, "xyz");
    } else {
        panic!("unexpected result: {:?}", head);
    }
    let hf_end = lexer.lex_bytes(&mut buf, true);
    assert!(
        matches!(&hf_end, Ok(Some(Token::ElementHFEnd(_)))),
        "unexpected result: {:?}",
        hf_end
    );
    let err = lexer.lex_bytes(&mut buf, true);
    assert!(
        matches!(&err, Err(EndOrError::Error(Error::InvalidUtf8Byte(_)))),
        "unexpected result: {:?}",
        err
    );
}
#[test]
fn lexer_catches_split_utf8_sequence_at_cdata_section_boundary() {
    // A multi-byte UTF-8 sequence may not straddle a CDATA section boundary
    // in either direction; the dangling lead byte 0xe2 must be reported.
    let inputs: [&[u8]; 2] = [
        b"<xyz>\xe2<![CDATA[\x98\xba]]></xyz>",
        b"<xyz><![CDATA[\xe2\x98]]>\xba</xyz>",
    ];
    for input in inputs.iter().copied() {
        let err = lex_err(input, 128).unwrap();
        assert!(
            matches!(&err, EndOrError::Error(Error::InvalidUtf8Byte(0xe2))),
            "unexpected error: {err:?}"
        );
    }
}
#[test]
fn lexer_catches_split_utf8_sequence_at_comment_boundary() {
    // Same as the CDATA boundary check, but for comment delimiters.
    let inputs: [&[u8]; 2] = [
        b"<xyz>\xe2<!--\x98\xba--></xyz>",
        b"<xyz><!--\xe2\x98-->\xba</xyz>",
    ];
    for input in inputs.iter().copied() {
        let err = lex_err(input, 128).unwrap();
        assert!(
            matches!(&err, EndOrError::Error(Error::InvalidUtf8Byte(0xe2))),
            "unexpected error: {err:?}"
        );
    }
}
/// Drives `lexer` by feeding `src` one byte at a time and returns the first
/// complete lexing result.
///
/// Each iteration offers the lexer a single-byte window of `src` (or an empty
/// buffer once `src` is exhausted). `at_eof` is true for the last remaining
/// byte and for the empty buffer. The byte is only removed from `*src` when
/// the lexer actually consumed it (`buf.is_empty()` afterwards), so a byte the
/// lexer did not swallow is re-offered on the next iteration. `NeedMoreData`
/// keeps the loop going; any token, end-of-document result or hard error is
/// returned to the caller.
fn feed_bytewise(lexer: &mut Lexer, src: &mut &[u8]) -> Result<Option<Token>> {
    loop {
        // One-byte window plus the rest; empty window once src is drained.
        let ((mut buf, remainder), at_eof) = if !src.is_empty() {
            (src.split_at(1), src.len() == 1)
        } else {
            ((&[][..], *src), true)
        };
        let result = lexer.lex_bytes(&mut buf, at_eof);
        // lex_bytes advances `buf` past consumed bytes; only drop the byte
        // from `src` if it was fully consumed.
        if buf.is_empty() {
            *src = remainder;
        }
        match result {
            Ok(v) => return Ok(v),
            Err(EndOrError::NeedMoreData) => {
                continue;
            }
            Err(other) => return Err(other),
        };
    }
}
#[test]
fn lexer_handles_chunked_utf8_fed_bytewise() {
    // Byte-at-a-time input must still reassemble multi-byte UTF-8 sequences
    // into a single (buffered) Text token.
    let mut src = "<xyz>fööbär🎉</xyz>".as_bytes();
    let mut lexer = new_lexer();
    let head = feed_bytewise(&mut lexer, &mut src);
    if let Ok(Some(Token::ElementHeadStart(_, ref name))) = head {
        assert_eq!(*name, "xyz");
    } else {
        panic!("unexpected result: {:?}", head);
    }
    let head_end = feed_bytewise(&mut lexer, &mut src);
    assert!(
        matches!(&head_end, Ok(Some(Token::ElementHFEnd(_)))),
        "unexpected result: {:?}",
        head_end
    );
    let text = feed_bytewise(&mut lexer, &mut src);
    if let Ok(Some(Token::Text(_, ref cdata))) = text {
        assert_eq!(*cdata, "fööbär🎉");
    } else {
        panic!("unexpected result: {:?}", text);
    }
    let foot = feed_bytewise(&mut lexer, &mut src);
    if let Ok(Some(Token::ElementFootStart(_, ref name))) = foot {
        assert_eq!(*name, "xyz");
    } else {
        panic!("unexpected result: {:?}", foot);
    }
    let foot_end = feed_bytewise(&mut lexer, &mut src);
    assert!(
        matches!(&foot_end, Ok(Some(Token::ElementHFEnd(_)))),
        "unexpected result: {:?}",
        foot_end
    );
}
#[test]
fn lexer_detect_eof_in_name() {
    // EOF in the middle of an element name is reported in Name context.
    let err = lex_err(b"<aa", 128).unwrap();
    assert!(
        matches!(
            &err,
            EndOrError::Error(Error::InvalidEof(Some(ErrorContext::Name)))
        ),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_detect_eof_in_element_head_whitespace() {
    // EOF after the name but still inside the element head is reported in
    // Element context.
    let err = lex_err(b"<aa ", 128).unwrap();
    assert!(
        matches!(
            &err,
            EndOrError::Error(Error::InvalidEof(Some(ErrorContext::Element)))
        ),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_detect_eof_in_attrname() {
    // EOF in the middle of an attribute name is reported in Name context.
    let err = lex_err(b"<a xxxx", 128).unwrap();
    assert!(
        matches!(
            &err,
            EndOrError::Error(Error::InvalidEof(Some(ErrorContext::Name)))
        ),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_detect_eof_after_attrname() {
    // EOF after "=" but before the value is reported in Element context.
    let err = lex_err(b"<a x=", 128).unwrap();
    assert!(
        matches!(
            &err,
            EndOrError::Error(Error::InvalidEof(Some(ErrorContext::Element)))
        ),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_detect_eof_in_attrval() {
    // EOF inside an unterminated attribute value is reported in
    // AttributeValue context.
    let err = lex_err(b"<a x='xyz", 128).unwrap();
    assert!(
        matches!(
            &err,
            EndOrError::Error(Error::InvalidEof(Some(ErrorContext::AttributeValue)))
        ),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_detect_eof_after_attr() {
    // EOF after a complete attribute but before '>' is reported in Element
    // context.
    let err = lex_err(b"<a x=''", 128).unwrap();
    assert!(
        matches!(
            &err,
            EndOrError::Error(Error::InvalidEof(Some(ErrorContext::Element)))
        ),
        "unexpected error: {:?}",
        err
    );
}
#[test]
fn lexer_detect_eof_after_head() {
    // EOF right after an element head is fine at the lexer level; balancing
    // of tags is not the lexer's job.
    let (_, result) = lex(b"<a>", 128);
    assert!(result.is_ok(), "unexpected lex result: {:?}", result);
}
#[test]
fn lexer_detect_eof_in_text() {
    // EOF in character data is fine at the lexer level.
    let (_, result) = lex(b"<a>foo", 128);
    assert!(result.is_ok(), "unexpected lex result: {:?}", result);
}
#[test]
fn lexer_detect_eof_at_end_of_document() {
    // EOF after a complete, self-closed element is a clean end of input.
    let (_, result) = lex(b"<a/>", 128);
    assert!(result.is_ok(), "unexpected lex result: {:?}", result);
}
#[test]
fn lexer_detect_eof_at_end_of_document_with_whitespace() {
    // Trailing whitespace after the document element is tolerated.
    let (_, result) = lex(b"<a/>\n\n", 128);
    assert!(result.is_ok(), "unexpected lex result: {:?}", result);
}
#[test]
fn lexer_detect_eof_before_decl_or_el() {
    // Completely empty input is a clean end at the lexer level.
    let (_, result) = lex(b"", 128);
    assert!(result.is_ok(), "unexpected lex result: {:?}", result);
}
#[test]
fn utf8_sequence_on_token_boundary() {
    // Force token-length limits that land inside the three-byte sequences of
    // "あああ"; the lexer must split Text tokens only on character
    // boundaries and never fail because of the limit.
    let inner = "あああ";
    let document = format!("<x>{}</x>", inner);
    for limit in 3..=inner.len() {
        let mut lexer = Lexer::new(LexerOptions {
            max_token_length: limit,
        });
        let mut sink = VecSink::new(128);
        let outcome = stream_to_sink(&mut lexer, &mut document.as_bytes(), &mut sink, true);
        assert!(
            outcome.is_ok(),
            "unexpected lex result (limit={}): {:?}",
            limit,
            outcome
        );
    }
}
#[test]
fn unbuffered_text_fed_chunkwise() {
    // With text buffering disabled, each completed stretch of input flushes
    // its own Text token; UTF-8 sequences split across chunk boundaries are
    // still reassembled correctly.
    let mut lexer = new_lexer();
    lexer.text_buffering = false;
    let (tokens, result) = lex_chunked_with(
        &[
            &b"<xyz>f\xc3"[..],
            &b"\xb6\xc3\xb6"[..],
            "bär🎉</xyz>".as_bytes(),
        ],
        128,
        lexer,
    );
    result.unwrap();
    let mut tokens = tokens.into_iter();
    match tokens.next() {
        Some(Token::ElementHeadStart(_, name)) => assert_eq!(name, "xyz"),
        other => panic!("unexpected token: {:?}", other),
    }
    match tokens.next() {
        Some(Token::ElementHFEnd(_)) => (),
        other => panic!("unexpected token: {:?}", other),
    }
    // Three Text tokens: "f" (the chunk's trailing 0xc3 is held back), the
    // middle chunk's "öö", and the remainder of the third chunk.
    for expected in ["f", "öö", "bär🎉"].iter().copied() {
        match tokens.next() {
            Some(Token::Text(_, text)) => assert_eq!(text, expected),
            other => panic!("unexpected token: {:?}", other),
        }
    }
    match tokens.next() {
        Some(Token::ElementFootStart(_, name)) => assert_eq!(name, "xyz"),
        other => panic!("unexpected token: {:?}", other),
    }
    match tokens.next() {
        Some(Token::ElementHFEnd(_)) => (),
        other => panic!("unexpected token: {:?}", other),
    }
    match tokens.next() {
        None => (),
        other => panic!("unexpected token: {:?}", other),
    }
}
#[test]
fn unbuffered_text_fed_bytewise() {
    // With buffering disabled and byte-at-a-time input, every completed code
    // point is emitted as its own Text token.
    let mut src = "<xyz>fööbär🎉</xyz>".as_bytes();
    let mut lexer = new_lexer();
    lexer.text_buffering = false;
    match feed_bytewise(&mut lexer, &mut src) {
        Ok(Some(Token::ElementHeadStart(_, name))) => assert_eq!(name, "xyz"),
        other => panic!("unexpected result: {:?}", other),
    };
    match feed_bytewise(&mut lexer, &mut src) {
        Ok(Some(Token::ElementHFEnd(_))) => (),
        other => panic!("unexpected result: {:?}", other),
    };
    // One Text token per character of the element content.
    for expected in ["f", "ö", "ö", "b", "ä", "r", "🎉"].iter().copied() {
        match feed_bytewise(&mut lexer, &mut src) {
            Ok(Some(Token::Text(_, text))) => assert_eq!(text, expected),
            other => panic!("unexpected result: {:?}", other),
        };
    }
    match feed_bytewise(&mut lexer, &mut src) {
        Ok(Some(Token::ElementFootStart(_, name))) => assert_eq!(name, "xyz"),
        other => panic!("unexpected result: {:?}", other),
    };
    match feed_bytewise(&mut lexer, &mut src) {
        Ok(Some(Token::ElementHFEnd(_))) => (),
        other => panic!("unexpected result: {:?}", other),
    };
}
#[test]
fn unbuffered_text_detects_and_reports_truncated_utf8() {
    // Unbuffered mode must still refuse to emit text when a four-byte lead
    // (0xf0) is followed by only two continuation bytes and then ASCII.
    let mut src = &b"<xyz>\xf0\x9f\x8emore</xyz>"[..];
    let mut lexer = new_lexer();
    lexer.text_buffering = false;
    let head = feed_bytewise(&mut lexer, &mut src);
    if let Ok(Some(Token::ElementHeadStart(_, ref name))) = head {
        assert_eq!(*name, "xyz");
    } else {
        panic!("unexpected result: {:?}", head);
    }
    let head_end = feed_bytewise(&mut lexer, &mut src);
    assert!(
        matches!(&head_end, Ok(Some(Token::ElementHFEnd(_)))),
        "unexpected result: {:?}",
        head_end
    );
    let err = feed_bytewise(&mut lexer, &mut src);
    assert!(
        matches!(&err, Err(EndOrError::Error(Error::InvalidUtf8Byte(..)))),
        "unexpected result: {:?}",
        err
    );
}
#[test]
fn unbuffered_text_detects_and_reports_invalid_utf8() {
    // 0xff in the middle of a multi-byte sequence must be rejected even in
    // unbuffered mode.
    let mut src = &b"<xyz>\xf0\x9f\xff\x89more</xyz>"[..];
    let mut lexer = new_lexer();
    lexer.text_buffering = false;
    let head = feed_bytewise(&mut lexer, &mut src);
    if let Ok(Some(Token::ElementHeadStart(_, ref name))) = head {
        assert_eq!(*name, "xyz");
    } else {
        panic!("unexpected result: {:?}", head);
    }
    let head_end = feed_bytewise(&mut lexer, &mut src);
    assert!(
        matches!(&head_end, Ok(Some(Token::ElementHFEnd(_)))),
        "unexpected result: {:?}",
        head_end
    );
    let err = feed_bytewise(&mut lexer, &mut src);
    assert!(
        matches!(&err, Err(EndOrError::Error(Error::InvalidUtf8Byte(..)))),
        "unexpected result: {:?}",
        err
    );
}
}