html5gum 0.5.2

A WHATWG-compliant HTML5 tokenizer and tag soup parser.
Documentation
use std::borrow::{Borrow, BorrowMut};
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::VecDeque;
use std::fmt::{Debug, Formatter};
use std::mem;
use std::ops::{Deref, DerefMut};

use crate::{Error, State};

/// A wrapper around a bytestring.
///
/// This newtype only exists to provide a nicer `Debug` impl
#[derive(Clone, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct HtmlString(pub Vec<u8>);

impl Deref for HtmlString {
    type Target = Vec<u8>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl DerefMut for HtmlString {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl Debug for HtmlString {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        write!(f, "b\"")?;
        for &byte in &self.0 {
            for ch in std::ascii::escape_default(byte) {
                write!(f, "{}", ch as char)?;
            }
        }

        write!(f, "\"")
    }
}

impl Borrow<[u8]> for HtmlString {
    fn borrow(&self) -> &[u8] {
        &self.0
    }
}

impl BorrowMut<[u8]> for HtmlString {
    fn borrow_mut(&mut self) -> &mut [u8] {
        &mut self.0
    }
}

#[test]
fn test_borrowing() {
    // demonstrate a usecase for Borrow/BorrowMut
    let tag = StartTag::default();
    assert!(tag.attributes.get(b"href".as_slice()).is_none());
}

impl From<Vec<u8>> for HtmlString {
    fn from(vec: Vec<u8>) -> HtmlString {
        HtmlString(vec)
    }
}

impl From<HtmlString> for Vec<u8> {
    fn from(other: HtmlString) -> Vec<u8> {
        other.0
    }
}

/// An emitter is an object providing methods to the tokenizer to produce tokens.
///
/// Domain-specific applications of the HTML tokenizer can manually implement this trait to
/// customize per-token allocations, or avoid them altogether.
///
/// An emitter is assumed to have these internal states:
///
/// * _last start tag_: The most recently emitted start tag's name
/// * _current token_: Can be a tag, doctype or comment token. There's only one current token.
/// * _current attribute_: The currently processed HTML attribute, consisting of two strings for name and value.
///
/// The following methods are describing what kind of behavior the WHATWG spec expects, but that
/// doesn't mean you need to follow it. For example:
///
/// * If your usage of the tokenizer will ignore all errors, none of the error handling and
///   validation requirements apply to you. You can implement `emit_error` as noop and omit all
///   checks that would emit errors.
///
/// * If you don't care about attributes at all, you can make all related methods a noop.
///
/// The state machine needs to have a functional implementation of
/// `current_is_appropriate_end_tag_token` to do correct transitions, however.
pub trait Emitter {
    /// The token type emitted by this emitter. This controls what type of values the [`crate::Tokenizer`]
    /// yields when used as an iterator.
    type Token;

    /// Set the name of the _last start tag_.
    ///
    /// This is primarily for testing purposes. This is *not* supposed to override the tag name of
    /// the current tag.
    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>);

    /// The state machine has reached the end of the file. It will soon call `pop_token` for the
    /// last time.
    fn emit_eof(&mut self);

    /// A (probably recoverable) parsing error has occured.
    fn emit_error(&mut self, error: Error);

    /// After every state change, the tokenizer calls this method to retrieve a new token that can
    /// be returned via the tokenizer's iterator interface.
    fn pop_token(&mut self) -> Option<Self::Token>;

    /// Emit a bunch of plain characters as character tokens.
    fn emit_string(&mut self, c: &[u8]);

    /// Set the _current token_ to a start tag.
    fn init_start_tag(&mut self);

    /// Set the _current token_ to an end tag.
    fn init_end_tag(&mut self);

    /// Set the _current token_ to a comment.
    fn init_comment(&mut self);

    /// Emit the _current token_, assuming it is a tag.
    ///
    /// Also get the current attribute and append it to the to-be-emitted tag. See docstring for
    /// [`Emitter::init_attribute`] for how duplicates should be handled.
    ///
    /// If a start tag is emitted, update the _last start tag_.
    ///
    /// If the current token is not a start/end tag, this method may panic.
    ///
    /// The return value is used to switch the tokenizer to a new state. Used in tree building.
    ///
    /// If this method always returns `None`, states are never switched, which leads to artifacts
    /// like contents of `<script>` tags being incorrectly interpreted as HTML.
    ///
    /// It's not possible to implement this method correctly in line with the spec without
    /// implementing a full-blown tree builder as per [tree
    /// construction](https://html.spec.whatwg.org/#tree-construction), which this crate does not
    /// offer.
    ///
    /// You can approximate correct behavior using [`naive_next_state`], but the caveats of doing
    /// so are not well-understood.
    ///
    /// See the `tokenize_with_state_switches` cargo example for a practical example where this
    /// matters.
    #[must_use]
    fn emit_current_tag(&mut self) -> Option<State>;

    /// Emit the _current token_, assuming it is a comment.
    ///
    /// If the current token is not a comment, this method may panic.
    fn emit_current_comment(&mut self);

    /// Emit the _current token_, assuming it is a doctype.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn emit_current_doctype(&mut self);

    /// Assuming the _current token_ is a start tag, set the self-closing flag.
    ///
    /// If the current token is not a start or end tag, this method may panic.
    ///
    /// If the current token is an end tag, the emitter should emit the
    /// [`crate::Error::EndTagWithTrailingSolidus`] error.
    fn set_self_closing(&mut self);

    /// Assuming the _current token_ is a doctype, set its "force quirks" flag to true.
    ///
    /// If the current token is not a doctype, this method pay panic.
    fn set_force_quirks(&mut self);

    /// Assuming the _current token_ is a start/end tag, append a string to the current tag's name.
    ///
    /// If the current token is not a start or end tag, this method may panic.
    fn push_tag_name(&mut self, s: &[u8]);

    /// Assuming the _current token_ is a comment, append a string to the comment's contents.
    ///
    /// If the current token is not a comment, this method may panic.
    fn push_comment(&mut self, s: &[u8]);

    /// Assuming the _current token_ is a doctype, append a string to the doctype's name.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn push_doctype_name(&mut self, s: &[u8]);

    /// Set the _current token_ to a new doctype token:
    ///
    /// * the name should be empty
    /// * the "public identifier" should be null (different from empty)
    /// * the "system identifier" should be null (different from empty)
    /// * the "force quirks" flag should be `false`
    fn init_doctype(&mut self);

    /// Set the _current attribute_ to a new one, starting with empty name and value strings.
    ///
    /// The old attribute, if any, should be put on the _current token_. If an attribute with that
    /// name already exists, WHATWG says the new one should be ignored and a
    /// [`crate::Error::DuplicateAttribute`] error should be emitted.
    ///
    /// If the current token is an end tag token, a [`crate::Error::EndTagWithAttributes`] error should be
    /// emitted.
    ///
    /// If the current token is no tag at all, this method may panic.
    fn init_attribute(&mut self);

    /// Append a string to the current attribute's name.
    ///
    /// If there is no current attribute, this method may panic.
    fn push_attribute_name(&mut self, s: &[u8]);

    /// Append a string to the current attribute's value.
    ///
    /// If there is no current attribute, this method may panic.
    fn push_attribute_value(&mut self, s: &[u8]);

    /// Assuming the _current token_ is a doctype, set its "public identifier" to the given string.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn set_doctype_public_identifier(&mut self, value: &[u8]);

    /// Assuming the _current token_ is a doctype, set its "system identifier" to the given string.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn set_doctype_system_identifier(&mut self, value: &[u8]);

    /// Assuming the _current token_ is a doctype, append a string to its "public identifier" to the given string.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn push_doctype_public_identifier(&mut self, s: &[u8]);

    /// Assuming the _current token_ is a doctype, append a string to its "system identifier" to the given string.
    ///
    /// If the current token is not a doctype, this method may panic.
    fn push_doctype_system_identifier(&mut self, s: &[u8]);

    /// Return true if all of these hold. Return false otherwise.
    ///
    /// * the _current token_ is an end tag
    /// * the _last start tag_ exists
    /// * the current end tag token's name equals to the last start tag's name.
    ///
    /// See also [WHATWG's definition of "appropriate end tag
    /// token"](https://html.spec.whatwg.org/#appropriate-end-tag-token).
    fn current_is_appropriate_end_tag_token(&mut self) -> bool;

    /// By default, this always returns false and thus
    /// all CDATA sections are tokenized as bogus comments.
    ///
    /// See [markup declaration open
    /// state](https://html.spec.whatwg.org/multipage/#markup-declaration-open-state).
    fn adjusted_current_node_present_but_not_in_html_namespace(&mut self) -> bool {
        false
    }
}

/// Take an educated guess at the next state using the name of a just-now emitted start tag.
///
/// This can be used to implement [`Emitter::emit_current_tag`] for most HTML scraping applications,
/// but is unsuitable for implementing a browser.
///
/// The mapping was inspired by `lol-html` which has additional safeguards to detect ambiguous
/// parsing state.
#[must_use]
pub fn naive_next_state(tag_name: &[u8]) -> Option<State> {
    match tag_name {
        b"textarea" | b"title" => Some(State::RcData),
        b"plaintext" => Some(State::PlainText),
        b"script" => Some(State::ScriptData),
        b"style" | b"iframe" | b"xmp" | b"noembed" | b"noframe" | b"noscript" => {
            Some(State::RawText)
        }
        _ => None,
    }
}

/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
#[derive(Debug, Default)]
pub struct DefaultEmitter {
    current_characters: HtmlString,
    current_token: Option<Token>,
    last_start_tag: HtmlString,
    current_attribute: Option<(HtmlString, HtmlString)>,
    seen_attributes: BTreeSet<HtmlString>,
    emitted_tokens: VecDeque<Token>,
    switch_states: bool,
}

impl DefaultEmitter {
    /// Whether to use [`naive_next_state`] to switch states automatically.
    ///
    /// The default is off.
    pub fn switch_states(&mut self, yes: bool) {
        self.switch_states = yes;
    }

    fn emit_token(&mut self, token: Token) {
        self.flush_current_characters();
        self.emitted_tokens.push_front(token);
    }

    fn flush_current_attribute(&mut self) {
        if let Some((k, v)) = self.current_attribute.take() {
            match self.current_token {
                Some(Token::StartTag(ref mut tag)) => {
                    let mut error = None;
                    tag.attributes
                        .entry(k)
                        .and_modify(|_| {
                            error = Some(Error::DuplicateAttribute);
                        })
                        .or_insert(v);

                    if let Some(e) = error {
                        self.emit_error(e);
                    }
                }
                Some(Token::EndTag(_)) => {
                    if !self.seen_attributes.insert(k) {
                        self.emit_error(Error::DuplicateAttribute);
                    }
                }
                _ => {
                    debug_assert!(false);
                }
            }
        }
    }

    fn flush_current_characters(&mut self) {
        if self.current_characters.is_empty() {
            return;
        }

        let s = mem::take(&mut self.current_characters);
        self.emit_token(Token::String(s));
    }
}

impl Emitter for DefaultEmitter {
    type Token = Token;

    fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
        self.last_start_tag.clear();
        self.last_start_tag
            .extend(last_start_tag.unwrap_or_default());
    }

    fn emit_eof(&mut self) {
        self.flush_current_characters();
    }

    fn emit_error(&mut self, error: Error) {
        // bypass character flushing in self.emit_token: we don't need the error location to be
        // that exact
        self.emitted_tokens.push_front(Token::Error(error));
    }

    fn pop_token(&mut self) -> Option<Self::Token> {
        self.emitted_tokens.pop_back()
    }

    fn emit_string(&mut self, s: &[u8]) {
        self.current_characters.extend(s);
    }

    fn init_start_tag(&mut self) {
        self.current_token = Some(Token::StartTag(StartTag::default()));
    }
    fn init_end_tag(&mut self) {
        self.current_token = Some(Token::EndTag(EndTag::default()));
        self.seen_attributes.clear();
    }

    fn init_comment(&mut self) {
        self.current_token = Some(Token::Comment(HtmlString::default()));
    }
    fn emit_current_tag(&mut self) -> Option<State> {
        self.flush_current_attribute();
        let mut token = self.current_token.take().unwrap();
        match token {
            Token::EndTag(_) => {
                if !self.seen_attributes.is_empty() {
                    self.emit_error(Error::EndTagWithAttributes);
                }
                self.seen_attributes.clear();
            }
            Token::StartTag(ref mut tag) => {
                self.set_last_start_tag(Some(&tag.name));
            }
            _ => debug_assert!(false),
        }
        self.emit_token(token);
        if self.switch_states {
            dbg!(naive_next_state(&*self.last_start_tag))
        } else {
            None
        }
    }
    fn emit_current_comment(&mut self) {
        let comment = self.current_token.take().unwrap();
        debug_assert!(matches!(comment, Token::Comment(_)));
        self.emit_token(comment);
    }

    fn emit_current_doctype(&mut self) {
        let doctype = self.current_token.take().unwrap();
        debug_assert!(matches!(doctype, Token::Doctype(_)));
        self.emit_token(doctype);
    }

    fn set_self_closing(&mut self) {
        let tag = self.current_token.as_mut().unwrap();
        match tag {
            Token::StartTag(StartTag {
                ref mut self_closing,
                ..
            }) => {
                *self_closing = true;
            }
            Token::EndTag(_) => {
                self.emit_error(Error::EndTagWithTrailingSolidus);
            }
            _ => {
                debug_assert!(false);
            }
        }
    }
    fn set_force_quirks(&mut self) {
        match self.current_token {
            Some(Token::Doctype(ref mut doctype)) => doctype.force_quirks = true,
            _ => debug_assert!(false),
        }
    }
    fn push_tag_name(&mut self, s: &[u8]) {
        match self.current_token {
            Some(
                Token::StartTag(StartTag { ref mut name, .. })
                | Token::EndTag(EndTag { ref mut name, .. }),
            ) => {
                name.extend(s);
            }
            _ => debug_assert!(false),
        }
    }

    fn push_comment(&mut self, s: &[u8]) {
        match self.current_token {
            Some(Token::Comment(ref mut data)) => data.extend(s),
            _ => debug_assert!(false),
        }
    }

    fn push_doctype_name(&mut self, s: &[u8]) {
        match self.current_token {
            Some(Token::Doctype(ref mut doctype)) => doctype.name.extend(s),
            _ => debug_assert!(false),
        }
    }
    fn init_doctype(&mut self) {
        self.current_token = Some(Token::Doctype(Doctype {
            name: HtmlString::default(),
            force_quirks: false,
            public_identifier: None,
            system_identifier: None,
        }));
    }

    fn init_attribute(&mut self) {
        self.flush_current_attribute();
        self.current_attribute = Some(Default::default());
    }
    fn push_attribute_name(&mut self, s: &[u8]) {
        self.current_attribute.as_mut().unwrap().0.extend(s);
    }
    fn push_attribute_value(&mut self, s: &[u8]) {
        self.current_attribute.as_mut().unwrap().1.extend(s);
    }
    fn set_doctype_public_identifier(&mut self, value: &[u8]) {
        if let Some(Token::Doctype(Doctype {
            ref mut public_identifier,
            ..
        })) = self.current_token
        {
            *public_identifier = Some(value.to_vec().into());
        } else {
            debug_assert!(false);
        }
    }
    fn set_doctype_system_identifier(&mut self, value: &[u8]) {
        if let Some(Token::Doctype(Doctype {
            ref mut system_identifier,
            ..
        })) = self.current_token
        {
            *system_identifier = Some(value.to_vec().into());
        } else {
            debug_assert!(false);
        }
    }
    fn push_doctype_public_identifier(&mut self, s: &[u8]) {
        if let Some(Token::Doctype(Doctype {
            public_identifier: Some(ref mut id),
            ..
        })) = self.current_token
        {
            id.extend(s);
        } else {
            debug_assert!(false);
        }
    }
    fn push_doctype_system_identifier(&mut self, s: &[u8]) {
        if let Some(Token::Doctype(Doctype {
            system_identifier: Some(ref mut id),
            ..
        })) = self.current_token
        {
            id.extend(s);
        } else {
            debug_assert!(false);
        }
    }

    fn current_is_appropriate_end_tag_token(&mut self) -> bool {
        match self.current_token {
            Some(Token::EndTag(ref tag)) => {
                !self.last_start_tag.is_empty() && self.last_start_tag == tag.name
            }
            _ => false,
        }
    }
}

/// A HTML end/close tag, such as `<p>` or `<a>`.
#[derive(Debug, Default, Eq, PartialEq, Clone)]
pub struct StartTag {
    /// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
    /// expected.
    pub self_closing: bool,

    /// The start tag's name, such as `"p"` or `"a"`.
    pub name: HtmlString,

    /// A mapping for any HTML attributes this start tag may have.
    ///
    /// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own
    /// [`Emitter`] to tweak this behavior.
    pub attributes: BTreeMap<HtmlString, HtmlString>,
}

/// A HTML end/close tag, such as `</p>` or `</a>`.
#[derive(Debug, Default, Eq, PartialEq, Clone)]
pub struct EndTag {
    /// The ending tag's name, such as `"p"` or `"a"`.
    pub name: HtmlString,
}

/// A doctype. Some examples:
///
/// * `<!DOCTYPE {name}>`
/// * `<!DOCTYPE {name} PUBLIC '{public_identifier}'>`
/// * `<!DOCTYPE {name} SYSTEM '{system_identifier}'>`
/// * `<!DOCTYPE {name} PUBLIC '{public_identifier}' '{system_identifier}'>`
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Doctype {
    /// The ["force quirks"](https://html.spec.whatwg.org/#force-quirks-flag) flag.
    pub force_quirks: bool,

    /// The doctype's name. For HTML documents this is "html".
    pub name: HtmlString,

    /// The doctype's public identifier.
    pub public_identifier: Option<HtmlString>,

    /// The doctype's system identifier.
    pub system_identifier: Option<HtmlString>,
}

/// The token type used by default. You can define your own token type by implementing the
/// [`crate::Emitter`] trait and using [`crate::Tokenizer::new_with_emitter`].
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Token {
    /// A HTML start tag.
    StartTag(StartTag),
    /// A HTML end tag.
    EndTag(EndTag),
    /// A literal string.
    String(HtmlString),
    /// A HTML comment.
    Comment(HtmlString),
    /// A HTML doctype declaration.
    Doctype(Doctype),
    /// A HTML parsing error.
    ///
    /// Can be skipped over, the tokenizer is supposed to recover from the error and continues with
    /// more tokens afterward.
    Error(Error),
}