//! [<img alt="github" src="https://img.shields.io/badge/github-jdrouet/htmlparser.git-8da0cb?style=for-the-badge&logo=github" height="20">](https://github.com/jdrouet/htmlparser.git)
//! [<img alt="crates.io" src="https://img.shields.io/crates/v/htmlparser.svg?style=for-the-badge&color=fc8d62&logo=rust" height="20">](https://crates.io/crates/htmlparser)
//! [<img alt="docs.rs" src="https://img.shields.io/badge/docs.rs-htmlparser-66c2a5?style=for-the-badge&logoColor=white&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgdmlld0JveD0iMCAwIDUxMiA1MTIiPjxwYXRoIGZpbGw9IiNmNWY1ZjUiIGQ9Ik00ODguNiAyNTAuMkwzOTIgMjE0VjEwNS41YzAtMTUtOS4zLTI4LjQtMjMuNC0zMy43bC0xMDAtMzcuNWMtOC4xLTMuMS0xNy4xLTMuMS0yNS4zIDBsLTEwMCAzNy41Yy0xNC4xIDUuMy0yMy40IDE4LjctMjMuNCAzMy43VjIxNGwtOTYuNiAzNi4yQzkuMyAyNTUuNSAwIDI2OC45IDAgMjgzLjlWMzk0YzAgMTMuNiA3LjcgMjYuMSAxOS45IDMyLjJsMTAwIDUwYzEwLjEgNS4xIDIyLjEgNS4xIDMyLjIgMGwxMDMuOS01MiAxMDMuOSA1MmMxMC4xIDUuMSAyMi4xIDUuMSAzMi4yIDBsMTAwLTUwYzEyLjItNi4xIDE5LjktMTguNiAxOS45LTMyLjJWMjgzLjljMC0xNS05LjMtMjguNC0yMy40LTMzLjd6TTM1OCAyMTQuOGwtODUgMzEuOXYtNjguMmw4NS0zN3Y3My4zek0xNTQgMTA0LjFsMTAyLTM4LjIgMTAyIDM4LjJ2LjZsLTEwMiA0MS40LTEwMi00MS40di0uNnptODQgMjkxLjFsLTg1IDQyLjV2LTc5LjFsODUtMzguOHY3NS40em0wLTExMmwtMTAyIDQxLjQtMTAyLTQxLjR2LS42bDEwMi0zOC4yIDEwMiAzOC4ydi42em0yNDAgMTEybC04NSA0Mi41di03OS4xbDg1LTM4Ljh2NzUuNHptMC0xMTJsLTEwMiA0MS40LTEwMi00MS40di0uNmwxMDItMzguMiAxMDIgMzguMnYuNnoiPjwvcGF0aD48L3N2Zz4K" height="20">](https://docs.rs/htmlparser)
//!
//! *htmlparser* is a low-level, pull-based, zero-allocation
//! [XML 1.0](https://www.w3.org/TR/xml/) parser.
//!
//! <br>
//!
//! ## Example
//!
//! ```rust
//! for token in htmlparser::Tokenizer::from("<tagname name='value'/>") {
//!     println!("{:?}", token);
//! }
//! ```
//!
//! <br>
//!
//! ## Why a new library?
//!
//! This library is a low-level XML tokenizer that preserves the position of every
//! token; it is not intended to be used directly.
//!
//! If you are looking for a higher level solution, check out
//! [roxmltree](https://github.com/RazrFalcon/roxmltree).
//!
//! <br>
//!
//! ## Benefits
//!
//! - All tokens contain `StrSpan` structs which represent the position of the
//! substring in the original document (see the sketch after this list).
//! - Detailed error reporting: all error types contain the position (line:column)
//! where the error occurred.
//! - No heap allocations.
//! - No dependencies.
//! - Tiny. ~1400 LOC and ~30KiB in the release build according to
//! `cargo-bloat`.
//! - Supports `no_std` builds. To use without the standard library, disable the
//! default features.
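//!
//! For example, spans can be mapped back to the original input. A minimal sketch
//! (this assumes `StrSpan::as_str`, which this crate itself uses, is part of the
//! public API):
//!
//! ```rust
//! let input = "<item a='1'/>";
//! for token in htmlparser::Tokenizer::from(input) {
//!     match token {
//!         // `local` and `value` are `StrSpan`s pointing into `input`;
//!         // `as_str` is assumed to be public.
//!         Ok(htmlparser::Token::Attribute { local, value, .. }) => {
//!             println!("{} = {:?}", local.as_str(), value.map(|v| v.as_str()));
//!         }
//!         Ok(other) => println!("{:?}", other),
//!         Err(e) => println!("error: {:?}", e),
//!     }
//! }
//! ```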
//!
//! <br>
//!
//! ## Limitations
//!
//! - Currently, only ENTITY declarations are parsed from the DOCTYPE; everything
//! else is ignored.
//! - No tree structure validation. An XML document like
//! `<root><child></root></child>` or a string without a root element will be
//! parsed without errors. You should check for this manually. On the other
//! hand, `<a/><a/>` will lead to an error.
//! - Duplicated attributes are not an error, so XML like `<item a="v1" a="v2"/>`
//! will be parsed without errors. You should check for this manually, e.g. as
//! sketched after this list.
//! - UTF-8 only.
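//!
//! For instance, a duplicate-attribute check can be layered on top of the tokenizer.
//! A minimal sketch (`has_duplicated_attributes` is not part of this crate, and
//! `StrSpan::as_str` is assumed to be public):
//!
//! ```rust
//! use htmlparser::{Token, Tokenizer};
//!
//! fn has_duplicated_attributes(xml: &str) -> bool {
//!     // Attributes seen on the element that is currently being parsed.
//!     let mut seen: Vec<(&str, &str)> = Vec::new();
//!     for token in Tokenizer::from(xml) {
//!         match token {
//!             Ok(Token::ElementStart { .. }) => seen.clear(),
//!             Ok(Token::Attribute { prefix, local, .. }) => {
//!                 let key = (prefix.as_str(), local.as_str());
//!                 if seen.contains(&key) {
//!                     return true;
//!                 }
//!                 seen.push(key);
//!             }
//!             _ => {}
//!         }
//!     }
//!     false
//! }
//!
//! assert!(has_duplicated_attributes("<item a='v1' a='v2'/>"));
//! assert!(!has_duplicated_attributes("<item a='v1' b='v2'/>"));
//! ```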
//!
//! <br>
//!
//! ## Safety
//!
//! - The library must not panic. Any panic is considered a critical bug and
//! should be reported.
//! - The library forbids unsafe code.
//!
//! <br>
//!
//! ## License
//!
//! Licensed under either of
//!
//! - Apache License, Version 2.0 ([LICENSE-APACHE] or
//! http://www.apache.org/licenses/LICENSE-2.0)
//! - MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT)
//!
//! at your option.
//!
//! <br>
//!
//! ### Contribution
//!
//! Unless you explicitly state otherwise, any contribution intentionally submitted
//! for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
//! dual licensed as above, without any additional terms or conditions.
//!
//! [LICENSE-APACHE]: https://github.com/jdrouet/htmlparser/blob/master/LICENSE-APACHE
//! [LICENSE-MIT]: https://github.com/jdrouet/htmlparser/blob/master/LICENSE-MIT
#![no_std]
#![forbid(unsafe_code)]
#![warn(missing_docs)]
#![allow(ellipsis_inclusive_range_patterns)]
#[cfg(feature = "std")]
#[macro_use]
extern crate std;
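// Local `matches!` macro; it behaves like `core::matches!` and takes precedence over
// the prelude macro within this crate.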
macro_rules! matches {
($expression:expr, $($pattern:tt)+) => {
match $expression {
$($pattern)+ => true,
_ => false
}
}
}
mod error;
mod stream;
mod strspan;
mod xmlchar;
pub use crate::error::*;
pub use crate::stream::*;
pub use crate::strspan::*;
pub use crate::xmlchar::*;
/// An XML token.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum Token<'a> {
/// Declaration token.
///
/// ```text
/// <?xml version='1.0' encoding='UTF-8' standalone='yes'?>
///                --- - version
///                               ----- - encoding?
///                                                  --- - standalone?
/// ------------------------------------------------------- - span
/// ```
Declaration {
version: StrSpan<'a>,
encoding: Option<StrSpan<'a>>,
standalone: Option<bool>,
span: StrSpan<'a>,
},
/// Processing instruction token.
///
/// ```text
/// <?target content?>
///   ------ - target
///          ------- - content?
/// ------------------ - span
/// ```
ProcessingInstruction {
target: StrSpan<'a>,
content: Option<StrSpan<'a>>,
span: StrSpan<'a>,
},
/// Conditional comment start token.
///
/// Should work with downlevel-hidden conditional comments
///
/// ```text
/// <!--[if IE 8]>
///      ------- - condition
/// -------------- - span
/// ```
///
/// or with downlevel-revealed conditional comment alternatives
///
/// ```text
/// <![if !IE]>
///    ------ - condition
/// ----------- - span
///
/// <!--[if !IE]>-->
///      ------ - condition
/// ---------------- - span
///
/// <!--[if gt IE 6]><!-->
///      ---------- - condition
/// ---------------------- - span
/// ```
ConditionalCommentStart {
condition: StrSpan<'a>,
span: StrSpan<'a>,
},
/// Conditional comment end token.
///
/// Should work for all alternatives
///
/// ```text
/// <![endif]-->
/// ------------ - span
///
/// <![endif]>
/// ---------- - span
///
/// <!--<![endif]-->
/// ---------------- - span
/// ```
ConditionalCommentEnd { span: StrSpan<'a> },
/// Comment token.
///
/// ```text
/// <!-- text -->
///     ------ - text
/// ------------- - span
/// ```
Comment {
text: StrSpan<'a>,
span: StrSpan<'a>,
},
/// DOCTYPE start token.
///
/// ```text
/// <!DOCTYPE greeting SYSTEM "hello.dtd" [
///           -------- - name
///                    ------------------ - external_id?
/// --------------------------------------- - span
/// ```
DtdStart {
name: StrSpan<'a>,
external_id: Option<ExternalId<'a>>,
span: StrSpan<'a>,
},
/// Empty DOCTYPE token.
///
/// ```text
/// <!DOCTYPE greeting SYSTEM "hello.dtd">
///           -------- - name
///                    ------------------ - external_id?
/// -------------------------------------- - span
/// ```
EmptyDtd {
name: StrSpan<'a>,
external_id: Option<ExternalId<'a>>,
span: StrSpan<'a>,
},
/// ENTITY token.
///
/// Can appear only inside the DTD.
///
/// ```text
/// <!ENTITY ns_extend "http://test.com">
///          --------- - name
///                     --------------- - definition
/// ------------------------------------- - span
/// ```
EntityDeclaration {
name: StrSpan<'a>,
definition: EntityDefinition<'a>,
span: StrSpan<'a>,
},
/// DOCTYPE end token.
///
/// ```text
/// <!DOCTYPE svg [
/// ...
/// ]>
/// -- - span
/// ```
DtdEnd { span: StrSpan<'a> },
/// Element start token.
///
/// ```text
/// <ns:elem attr="value"/>
///  -- - prefix
///     ---- - local
/// -------- - span
/// ```
ElementStart {
prefix: StrSpan<'a>,
local: StrSpan<'a>,
span: StrSpan<'a>,
},
/// Attribute token.
///
/// ```text
/// <elem ns:attr="value"/>
///       -- - prefix
///          ---- - local
///                ----- - value
///       --------------- - span
/// ```
Attribute {
prefix: StrSpan<'a>,
local: StrSpan<'a>,
value: Option<StrSpan<'a>>,
span: StrSpan<'a>,
},
/// Element end token.
///
/// ```text
/// <ns:elem>text</ns:elem>
///         - ElementEnd::Open
///         - - span
/// ```
///
/// ```text
/// <ns:elem>text</ns:elem>
///                -- ---- - ElementEnd::Close(prefix, local)
///              ---------- - span
/// ```
///
/// ```text
/// <ns:elem/>
///         - ElementEnd::Empty
///         -- - span
/// ```
ElementEnd {
end: ElementEnd<'a>,
span: StrSpan<'a>,
},
/// Text token.
///
/// Contains the text between elements, including whitespace:
/// essentially everything between `>` and `<`.
/// The sequence `]]>` is not allowed inside text and will lead to an error.
///
/// ```text
/// <p> text </p>
///    ------ - text
/// ```
///
/// The token span is equal to the `text`.
Text { text: StrSpan<'a> },
/// CDATA token.
///
/// ```text
/// <p><![CDATA[text]]></p>
///             ---- - text
///    ---------------- - span
/// ```
Cdata {
text: StrSpan<'a>,
span: StrSpan<'a>,
},
}
impl<'a> Token<'a> {
/// Returns the [`StrSpan`] that covers the entire token.
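///
/// A minimal example (assuming `StrSpan::as_str` is public, as its use in this crate suggests):
///
/// ```rust
/// let mut tokens = htmlparser::Tokenizer::from("<br/>");
/// let token = tokens.next().unwrap().unwrap();
/// // The first token is the element start; its span covers `<br`.
/// assert_eq!(token.span().as_str(), "<br");
/// ```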
pub fn span(&self) -> StrSpan<'a> {
let span = match self {
Token::Declaration { span, .. } => span,
Token::ProcessingInstruction { span, .. } => span,
Token::ConditionalCommentStart { span, .. } => span,
Token::ConditionalCommentEnd { span } => span,
Token::Comment { span, .. } => span,
Token::DtdStart { span, .. } => span,
Token::EmptyDtd { span, .. } => span,
Token::EntityDeclaration { span, .. } => span,
Token::DtdEnd { span, .. } => span,
Token::ElementStart { span, .. } => span,
Token::Attribute { span, .. } => span,
Token::ElementEnd { span, .. } => span,
Token::Text { text, .. } => text,
Token::Cdata { span, .. } => span,
};
*span
}
}
/// `ElementEnd` token.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ElementEnd<'a> {
/// Indicates `>`
Open,
/// Indicates `</name>`
Close(StrSpan<'a>, StrSpan<'a>),
/// Indicates `/>`
Empty,
}
/// Representation of the [ExternalID](https://www.w3.org/TR/xml/#NT-ExternalID) value.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ExternalId<'a> {
System(StrSpan<'a>),
Public(StrSpan<'a>, StrSpan<'a>),
}
/// Representation of the [EntityDef](https://www.w3.org/TR/xml/#NT-EntityDef) value.
#[allow(missing_docs)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum EntityDefinition<'a> {
EntityValue(StrSpan<'a>),
ExternalId(ExternalId<'a>),
}
type Result<T> = core::result::Result<T, Error>;
type StreamResult<T> = core::result::Result<T, StreamError>;
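// Tokenizer states, roughly following the document structure: the XML declaration,
// the DTD, element content and attributes, and finally trailing misc content.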
#[derive(Clone, Copy, PartialEq, Debug)]
enum State {
Declaration,
AfterDeclaration,
Dtd,
AfterDtd,
Elements,
Attributes,
AfterElements,
End,
}
/// Tokenizer for the XML structure.
#[derive(Clone)]
pub struct Tokenizer<'a> {
stream: Stream<'a>,
state: State,
depth: usize,
fragment_parsing: bool,
}
impl core::fmt::Debug for Tokenizer<'_> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "Tokenizer {{ ... }}")
}
}
impl<'a> From<&'a str> for Tokenizer<'a> {
#[inline]
fn from(text: &'a str) -> Self {
let mut stream = Stream::from(text);
// Skip UTF-8 BOM.
if stream.starts_with(&[0xEF, 0xBB, 0xBF]) {
stream.advance(3);
}
Tokenizer {
stream,
state: State::Declaration,
depth: 0,
fragment_parsing: false,
}
}
}
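// Wraps a `StreamError` returned by `$fun` into the given `Error::$err` variant,
// attaching the text position computed from the stream position captured before
// the inner parser runs.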
macro_rules! map_err_at {
($fun:expr, $stream:expr, $err:ident) => {{
let start = $stream.pos();
$fun.map_err(|e| Error::$err(e, $stream.gen_text_pos_from(start)))
}};
}
#[inline]
fn is_conditional_comment(s: &mut Stream<'_>) -> bool {
// Downlevel-hidden conditional comment
// <!--[if IE 8]>
s.starts_with(b"<!--[if")
// Downlevel-revealed conditional comment
// <![if !IE]>
|| s.starts_with(b"<![if")
// Closing downlevel-revealed conditional comment
// <![endif]>
|| s.starts_with(b"<![endif")
// Closing downlevel-revealed conditional comment
// <!--<![endif]-->
|| s.starts_with(b"<!--<![endif")
}
impl<'a> Tokenizer<'a> {
/// Enables document fragment parsing.
///
/// By default, `htmlparser` checks for the XML declaration, DTD, root element, and so on,
/// so parsing a bare XML fragment would lead to an error.
/// This method switches the parser to root-element-content mode,
/// treating the given range of `full_text` as the content of a root element.
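///
/// A minimal sketch (the byte range selects the fragment inside `full_text`):
///
/// ```rust
/// let full = "<body><p>text</p></body>";
/// // Tokenize only `<p>text</p>`, i.e. bytes 6..17 of `full`.
/// for token in htmlparser::Tokenizer::from_fragment(full, 6..17) {
///     println!("{:?}", token);
/// }
/// ```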
pub fn from_fragment(full_text: &'a str, fragment: core::ops::Range<usize>) -> Self {
Tokenizer {
stream: Stream::from_substr(full_text, fragment),
state: State::Elements,
depth: 0,
fragment_parsing: true,
}
}
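// Parses at most one token starting from the current state.
//
// Returns `None` when this step produced no token (e.g. whitespace was skipped or a
// declaration was absent); `Iterator::next` keeps calling it until a token appears
// or the stream ends.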
fn parse_next_impl(&mut self) -> Option<Result<Token<'a>>> {
let s = &mut self.stream;
if s.at_end() {
return None;
}
let start = s.pos();
match self.state {
State::Declaration => {
self.state = State::AfterDeclaration;
if s.starts_with(b"<?xml ") {
Some(Self::parse_declaration(s))
} else {
None
}
}
State::AfterDeclaration => {
if s.starts_with(b"<!DOCTYPE") || s.starts_with(b"<!doctype") {
let t = Self::parse_doctype(s);
match t {
Ok(Token::DtdStart { .. }) => self.state = State::Dtd,
Ok(Token::EmptyDtd { .. }) => self.state = State::AfterDtd,
_ => {}
}
Some(t)
} else if is_conditional_comment(s) {
Some(Self::parse_conditional_comment(s))
} else if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<?") {
if s.starts_with(b"<?xml ") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else {
Some(Self::parse_pi(s))
}
} else if s.starts_with_space() {
s.skip_spaces();
None
} else {
self.state = State::AfterDtd;
None
}
}
State::Dtd => {
if s.starts_with(b"<!ENTITY") {
Some(Self::parse_entity_decl(s))
} else if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<?") {
if s.starts_with(b"<?xml ") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else {
Some(Self::parse_pi(s))
}
} else if s.starts_with(b"]") {
// DTD ends with ']' S? '>', therefore we have to skip possible spaces.
s.advance(1);
s.skip_spaces();
match s.curr_byte() {
Ok(b'>') => {
self.state = State::AfterDtd;
s.advance(1);
Some(Ok(Token::DtdEnd {
span: s.slice_back(start),
}))
}
Ok(c) => {
let e = StreamError::InvalidChar(c, b'>', s.gen_text_pos());
Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
}
Err(_) => {
let e = StreamError::UnexpectedEndOfStream;
Some(Err(Error::InvalidDoctype(e, s.gen_text_pos_from(start))))
}
}
} else if s.starts_with_space() {
s.skip_spaces();
None
} else if s.starts_with(b"<!ELEMENT")
|| s.starts_with(b"<!ATTLIST")
|| s.starts_with(b"<!NOTATION")
{
if Self::consume_decl(s).is_err() {
let pos = s.gen_text_pos_from(start);
Some(Err(Error::UnknownToken(pos)))
} else {
None
}
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
State::AfterDtd => {
if is_conditional_comment(s) {
Some(Self::parse_conditional_comment(s))
} else if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<?") {
if s.starts_with(b"<?xml ") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else {
Some(Self::parse_pi(s))
}
} else if s.starts_with(b"<!") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else if s.starts_with(b"<") {
self.state = State::Attributes;
Some(Self::parse_element_start(s))
} else if s.starts_with_space() {
s.skip_spaces();
None
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
State::Elements => {
// Use `match` only here, because only this section is performance-critical.
match s.curr_byte() {
Ok(b'<') => match s.next_byte() {
Ok(b'!') => {
if is_conditional_comment(s) {
Some(Self::parse_conditional_comment(s))
} else if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<![CDATA[") {
Some(Self::parse_cdata(s))
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
Ok(b'?') => {
if !s.starts_with(b"<?xml ") {
Some(Self::parse_pi(s))
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
Ok(b'/') => {
if self.depth > 0 {
self.depth -= 1;
}
if self.depth == 0 && !self.fragment_parsing {
self.state = State::AfterElements;
} else {
self.state = State::Elements;
}
Some(Self::parse_close_element(s))
}
Ok(_) => {
self.state = State::Attributes;
Some(Self::parse_element_start(s))
}
Err(_) => Some(Err(Error::UnknownToken(s.gen_text_pos()))),
},
Ok(_) => Some(Self::parse_text(s)),
Err(_) => Some(Err(Error::UnknownToken(s.gen_text_pos()))),
}
}
State::Attributes => {
let t = Self::parse_attribute(s);
if let Ok(Token::ElementEnd { end, .. }) = t {
if end == ElementEnd::Open {
self.depth += 1;
}
if self.depth == 0 && !self.fragment_parsing {
self.state = State::AfterElements;
} else {
self.state = State::Elements;
}
}
Some(t.map_err(|e| Error::InvalidAttribute(e, s.gen_text_pos_from(start))))
}
State::AfterElements => {
if is_conditional_comment(s) {
Some(Self::parse_conditional_comment(s))
} else if s.starts_with(b"<!--") {
Some(Self::parse_comment(s))
} else if s.starts_with(b"<?") {
if s.starts_with(b"<?xml ") {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
} else {
Some(Self::parse_pi(s))
}
} else if s.starts_with_space() {
s.skip_spaces();
None
} else {
Some(Err(Error::UnknownToken(s.gen_text_pos())))
}
}
State::End => None,
}
}
fn parse_declaration(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_declaration_impl(s), s, InvalidDeclaration)
}
// XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
fn parse_declaration_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
fn consume_spaces(s: &mut Stream) -> StreamResult<()> {
if s.starts_with_space() {
s.skip_spaces();
} else if !s.starts_with(b"?>") && !s.at_end() {
return Err(StreamError::InvalidSpace(
s.curr_byte_unchecked(),
s.gen_text_pos(),
));
}
Ok(())
}
let start = s.pos();
s.advance(6);
let version = Self::parse_version_info(s)?;
consume_spaces(s)?;
let encoding = Self::parse_encoding_decl(s)?;
if encoding.is_some() {
consume_spaces(s)?;
}
let standalone = Self::parse_standalone(s)?;
s.skip_spaces();
s.skip_string(b"?>")?;
let span = s.slice_back(start);
Ok(Token::Declaration {
version,
encoding,
standalone,
span,
})
}
// VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
// VersionNum ::= '1.' [0-9]+
fn parse_version_info(s: &mut Stream<'a>) -> StreamResult<StrSpan<'a>> {
s.skip_spaces();
s.skip_string(b"version")?;
s.consume_eq()?;
let quote = s.consume_quote()?;
let start = s.pos();
s.skip_string(b"1.")?;
s.skip_bytes(|_, c| c.is_xml_digit());
let ver = s.slice_back(start);
s.consume_byte(quote)?;
Ok(ver)
}
// EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
// EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
fn parse_encoding_decl(s: &mut Stream<'a>) -> StreamResult<Option<StrSpan<'a>>> {
if !s.starts_with(b"encoding") {
return Ok(None);
}
s.advance(8);
s.consume_eq()?;
let quote = s.consume_quote()?;
// [A-Za-z] ([A-Za-z0-9._] | '-')*
// TODO: check that first byte is [A-Za-z]
let name = s.consume_bytes(|_, c| {
c.is_xml_letter() || c.is_xml_digit() || c == b'.' || c == b'-' || c == b'_'
});
s.consume_byte(quote)?;
Ok(Some(name))
}
// SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
fn parse_standalone(s: &mut Stream<'a>) -> StreamResult<Option<bool>> {
if !s.starts_with(b"standalone") {
return Ok(None);
}
s.advance(10);
s.consume_eq()?;
let quote = s.consume_quote()?;
let start = s.pos();
let value = s.consume_name()?.as_str();
let flag = match value {
"yes" => true,
"no" => false,
_ => {
let pos = s.gen_text_pos_from(start);
return Err(StreamError::InvalidString("yes', 'no", pos));
}
};
s.consume_byte(quote)?;
Ok(Some(flag))
}
fn parse_conditional_comment(s: &mut Stream<'a>) -> Result<Token<'a>> {
let start = s.pos();
Self::parse_conditional_comment_impl(s)
.map_err(|e| Error::InvalidConditionalComment(e, s.gen_text_pos_from(start)))
}
fn parse_conditional_comment_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
// Downlevel-hidden conditional comment
let condition = if s.starts_with(b"<!--[if") {
s.advance(5);
let text = s.consume_chars(|_s, c| c != ']')?;
Some(text)
}
// Downlevel-revealed conditional comment
// <![if !IE]>
else if s.starts_with(b"<![if") {
s.advance(3);
let text = s.consume_chars(|_s, c| c != ']')?;
Some(text)
}
// Closing downlevel-revealed conditional comment
// <![endif]>
else if s.starts_with(b"<![endif]") {
s.advance(3);
let _text = s.consume_chars(|_s, c| c != ']')?;
None
}
// Closing downlevel-revealed conditional comment
// <!--<![endif]-->
else if s.starts_with(b"<!--<![endif") {
s.advance(7);
let _text = s.consume_chars(|_s, c| c != ']')?;
None
}
// `is_conditional_comment` accepts some prefixes (e.g. `<![endif` without a `]`)
// that none of the branches above match, so return an error instead of panicking.
else {
return Err(StreamError::InvalidString("conditional comment", s.gen_text_pos()));
};
s.advance(1);
if s.starts_with(b">-->") {
s.advance(4);
} else if s.starts_with(b"><!-->") {
s.advance(6);
} else if s.starts_with(b"-->") {
s.advance(3);
} else if s.starts_with(b">") {
s.advance(1);
} else {
// Malformed conditional-comment terminator: report an error instead of panicking.
return Err(StreamError::InvalidString("conditional comment end", s.gen_text_pos()));
}
let span = s.slice_back(start);
if let Some(condition) = condition {
Ok(Token::ConditionalCommentStart { condition, span })
} else {
Ok(Token::ConditionalCommentEnd { span })
}
}
fn parse_comment(s: &mut Stream<'a>) -> Result<Token<'a>> {
let start = s.pos();
Self::parse_comment_impl(s)
.map_err(|e| Error::InvalidComment(e, s.gen_text_pos_from(start)))
}
// '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
fn parse_comment_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
s.advance(4);
let text = s.consume_chars(|s, c| !(c == '-' && s.starts_with(b"-->")))?;
s.skip_string(b"-->")?;
if text.as_str().contains("--") {
return Err(StreamError::InvalidCommentData);
}
if text.as_str().ends_with('-') {
return Err(StreamError::InvalidCommentEnd);
}
let span = s.slice_back(start);
Ok(Token::Comment { text, span })
}
fn parse_pi(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_pi_impl(s), s, InvalidPI)
}
// PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
// PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
fn parse_pi_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
s.advance(2);
let target = s.consume_name()?;
s.skip_spaces();
let content = s.consume_chars(|s, c| !(c == '?' && s.starts_with(b"?>")))?;
let content = if !content.is_empty() {
Some(content)
} else {
None
};
s.skip_string(b"?>")?;
let span = s.slice_back(start);
Ok(Token::ProcessingInstruction {
target,
content,
span,
})
}
fn parse_doctype(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_doctype_impl(s), s, InvalidDoctype)
}
// doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
fn parse_doctype_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
s.advance(9);
s.consume_spaces()?;
let name = s.consume_name()?;
s.skip_spaces();
let external_id = Self::parse_external_id(s)?;
s.skip_spaces();
let c = s.curr_byte()?;
if c != b'[' && c != b'>' {
static EXPECTED: &[u8] = b"[>";
return Err(StreamError::InvalidCharMultiple(
c,
EXPECTED,
s.gen_text_pos(),
));
}
s.advance(1);
let span = s.slice_back(start);
if c == b'[' {
Ok(Token::DtdStart {
name,
external_id,
span,
})
} else {
Ok(Token::EmptyDtd {
name,
external_id,
span,
})
}
}
// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
fn parse_external_id(s: &mut Stream<'a>) -> StreamResult<Option<ExternalId<'a>>> {
let v = if s.starts_with(b"SYSTEM") || s.starts_with(b"PUBLIC") {
let start = s.pos();
s.advance(6);
let id = s.slice_back(start);
s.consume_spaces()?;
let quote = s.consume_quote()?;
let literal1 = s.consume_bytes(|_, c| c != quote);
s.consume_byte(quote)?;
let v = if id.as_str() == "SYSTEM" {
ExternalId::System(literal1)
} else {
s.consume_spaces()?;
let quote = s.consume_quote()?;
let literal2 = s.consume_bytes(|_, c| c != quote);
s.consume_byte(quote)?;
ExternalId::Public(literal1, literal2)
};
Some(v)
} else {
None
};
Ok(v)
}
fn parse_entity_decl(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_entity_decl_impl(s), s, InvalidEntity)
}
// EntityDecl ::= GEDecl | PEDecl
// GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
// PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
fn parse_entity_decl_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
s.advance(8);
s.consume_spaces()?;
let is_ge = if s.try_consume_byte(b'%') {
s.consume_spaces()?;
false
} else {
true
};
let name = s.consume_name()?;
s.consume_spaces()?;
let definition = Self::parse_entity_def(s, is_ge)?;
s.skip_spaces();
s.consume_byte(b'>')?;
let span = s.slice_back(start);
Ok(Token::EntityDeclaration {
name,
definition,
span,
})
}
// EntityDef ::= EntityValue | (ExternalID NDataDecl?)
// PEDef ::= EntityValue | ExternalID
// EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | "'" ([^%&']
// | PEReference | Reference)* "'"
// ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
// NDataDecl ::= S 'NDATA' S Name
fn parse_entity_def(s: &mut Stream<'a>, is_ge: bool) -> StreamResult<EntityDefinition<'a>> {
let c = s.curr_byte()?;
match c {
b'"' | b'\'' => {
let quote = s.consume_quote()?;
let value = s.consume_bytes(|_, c| c != quote);
s.consume_byte(quote)?;
Ok(EntityDefinition::EntityValue(value))
}
b'S' | b'P' => {
if let Some(id) = Self::parse_external_id(s)? {
if is_ge {
s.skip_spaces();
if s.starts_with(b"NDATA") {
s.advance(5);
s.consume_spaces()?;
s.skip_name()?;
// TODO: NDataDecl is not supported
}
}
Ok(EntityDefinition::ExternalId(id))
} else {
Err(StreamError::InvalidExternalID)
}
}
_ => {
static EXPECTED: &[u8] = b"\"'SP";
let pos = s.gen_text_pos();
Err(StreamError::InvalidCharMultiple(c, EXPECTED, pos))
}
}
}
fn consume_decl(s: &mut Stream) -> StreamResult<()> {
s.skip_bytes(|_, c| c != b'>');
s.consume_byte(b'>')?;
Ok(())
}
fn parse_cdata(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_cdata_impl(s), s, InvalidCdata)
}
// CDSect ::= CDStart CData CDEnd
// CDStart ::= '<![CDATA['
// CData ::= (Char* - (Char* ']]>' Char*))
// CDEnd ::= ']]>'
fn parse_cdata_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
s.advance(9);
let text = s.consume_chars(|s, c| !(c == ']' && s.starts_with(b"]]>")))?;
s.skip_string(b"]]>")?;
let span = s.slice_back(start);
Ok(Token::Cdata { text, span })
}
fn parse_element_start(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_element_start_impl(s), s, InvalidElement)
}
// '<' Name (S Attribute)* S? '>'
fn parse_element_start_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
s.advance(1);
let (prefix, local) = s.consume_qname()?;
let span = s.slice_back(start);
Ok(Token::ElementStart {
prefix,
local,
span,
})
}
fn parse_close_element(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_close_element_impl(s), s, InvalidElement)
}
// '</' Name S? '>'
fn parse_close_element_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let start = s.pos();
s.advance(2);
let (prefix, tag_name) = s.consume_qname()?;
s.skip_spaces();
s.consume_byte(b'>')?;
let span = s.slice_back(start);
Ok(Token::ElementEnd {
end: ElementEnd::Close(prefix, tag_name),
span,
})
}
// Name Eq AttValue, or
// a bare Name (an HTML-style boolean attribute; `value` will be `None`)
fn parse_attribute(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let attr_start = s.pos();
let has_space = s.starts_with_space();
s.skip_spaces();
if let Ok(c) = s.curr_byte() {
let start = s.pos();
match c {
b'/' => {
s.advance(1);
s.consume_byte(b'>')?;
let span = s.slice_back(start);
return Ok(Token::ElementEnd {
end: ElementEnd::Empty,
span,
});
}
b'>' => {
s.advance(1);
let span = s.slice_back(start);
return Ok(Token::ElementEnd {
end: ElementEnd::Open,
span,
});
}
_ => {}
}
}
if !has_space {
if !s.at_end() {
return Err(StreamError::InvalidSpace(
s.curr_byte_unchecked(),
s.gen_text_pos_from(attr_start),
));
} else {
return Err(StreamError::UnexpectedEndOfStream);
}
}
let start = s.pos();
let (prefix, local) = s.consume_qname()?;
let value = if s.try_consume_eq() {
let quote = s.consume_quote()?;
let quote_c = quote as char;
let value = s.consume_chars(|_, c| c != quote_c)?;
s.consume_byte(quote)?;
Some(value)
} else {
None
};
let span = s.slice_back(start);
Ok(Token::Attribute {
prefix,
local,
value,
span,
})
}
fn parse_text(s: &mut Stream<'a>) -> Result<Token<'a>> {
map_err_at!(Self::parse_text_impl(s), s, InvalidCharData)
}
fn parse_text_impl(s: &mut Stream<'a>) -> StreamResult<Token<'a>> {
let text = s.consume_chars(|_, c| c != '<')?;
// According to the spec, `]]>` must not appear inside a Text node.
// https://www.w3.org/TR/xml/#syntax
//
// Check for `>` first, since a single-byte search is cheaper, and only then look
// for the full `]]>` sequence anywhere in the text.
if text.as_str().contains('>') && text.as_str().contains("]]>") {
return Err(StreamError::InvalidCharacterData);
}
Ok(Token::Text { text })
}
/// Returns a copy of the tokenizer's stream.
pub fn stream(&self) -> Stream<'a> {
self.stream
}
}
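// Iteration stops after the first error: the stream is jumped to its end and the
// state is set to `State::End`, so subsequent calls return `None`.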
impl<'a> Iterator for Tokenizer<'a> {
type Item = Result<Token<'a>>;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let mut t = None;
while !self.stream.at_end() && self.state != State::End && t.is_none() {
t = self.parse_next_impl();
}
if let Some(Err(_)) = t {
self.stream.jump_to_end();
self.state = State::End;
}
t
}
}