htmlite 0.12.0

An HTML manipulation toolkit
Documentation
use crate::tokenizer::{self, Attribute, Tag, TagKind, TokenKind, Tokenizer};
use crate::{Node, NodeArena, NodeKind};
use annotate_snippets::{AnnotationKind, Level, Snippet};
use std::sync::Arc;

/// Tree-construction state for a single call to [`parse`].
struct Parser<'arena> {
    /// Produces the tokens consumed by the parser; also hands out the input text that is
    /// attached to every [`ParserError`].
    tokenizer: Tokenizer,
    /// Node that receives everything parsed while no element is open.
    /// It is the value returned on success.
    root: Node<'arena>,
    /// Stack of elements whose end tag has not been seen yet, innermost last.
    /// Each entry pairs the element with the start offset (`span.start`) of its start tag
    /// token, which is the position reported when the element turns out to be unclosed.
    open_elements: Vec<(usize, Node<'arena>)>,
}

/// The namespace an element belongs to, as far as this parser is concerned.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
pub(crate) enum Namespace {
    /// Regular HTML content. This is the default and what the parser uses while no element is open.
    #[default]
    Html,
    /// Assigned by the parser to `svg` and `math` elements; elements opened while a foreign
    /// element is the innermost open element inherit it.
    Foreign,
}

/// Names of the HTML void elements.
///
/// In the HTML namespace the parser closes these elements as soon as their start tag is seen
/// and rejects end tags carrying one of these names.
pub(crate) const VOID_ELEMENT_NAMES: [&str; 13] = [
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source", "track",
    "wbr",
];

/// An error that occurs while constructing an HTML tree.
///
/// The `Display` implementation will print out the error as well as the line on which it occurred.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParserError {
    /// What went wrong.
    kind: ErrorKind,
    /// Start offset (`span.start`) of the token the error is attributed to.
    offset: usize,
    /// The input that was being parsed, as handed out by the tokenizer; used to render the
    /// offending line in the `Display` output.
    source: Arc<String>,
}

/// The distinct failures tree construction can report.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(missing_docs)]
enum ErrorKind {
    /// An end tag was encountered while no element was open.
    EndTagWithoutCorrespondingStartTag,
    /// The input ended while an element was still open, or an end tag did not match the
    /// innermost open element. Reported at the offset of the unclosed start tag.
    UnclosedStartTag,
    /// An end tag named a void element (e.g. `</link>`) while in the HTML namespace.
    VoidElementAsEndTag,
    /// A start tag of a non-void element in the HTML namespace ended in `/>` (e.g. `<div />`).
    NonVoidElementStartTagWithTrailingSolidus,
}

/// Parses an HTML fragment into a tree of nodes allocated in `arena`.
///
/// Parsing happens in two phases, tokenization followed by tree construction:
///   - Tokenization is spec compliant: it never fails hard, and errors are recovered from in the
///     way the spec prescribes. The input is always turned into tokens for the next phase.
///   - Tree construction implements only a very small subset of the spec and stops at the first
///     [`ParserError`], which is returned to the caller.
///
/// On success the returned node is a fragment holding the top-level nodes of `html`.
pub fn parse<'a>(arena: &'a NodeArena, html: &str) -> Result<Node<'a>, ParserError> {
    let tokenizer = Tokenizer::new(html);
    let open_elements = Vec::new();
    let root = arena.fragment([]);

    Parser {
        tokenizer,
        root,
        open_elements,
    }
    .parse(arena)
}

impl<'a> Parser<'a> {
    /// Drives tree construction: pulls tokens from the tokenizer until `Eof` and attaches the
    /// resulting nodes (allocated in `arena`) below `self.root`.
    ///
    /// # Errors
    ///
    /// Stops at the first problem and returns it as a [`ParserError`]:
    ///   - the input ends while an element is still open,
    ///   - an end tag does not match the innermost open element, or no element is open at all,
    ///   - an end tag names a void element while in the HTML namespace,
    ///   - a non-void element in the HTML namespace is written with a trailing solidus (`<div />`).
    fn parse(mut self, arena: &'a NodeArena) -> Result<Node<'a>, ParserError> {
        loop {
            let current_token = self.tokenizer.next_token();

            match current_token.kind {
                TokenKind::Text(text) => {
                    // When parsing markup, we mark text nodes that should be serialized verbatim
                    // as such. This makes the serialization algorithm much easier.
                    // See: https://html.spec.whatwg.org/multipage/parsing.html#serialising-html-fragments
                    let node = if self.parent_holds_raw_text() {
                        arena.raw_text(text)
                    } else {
                        arena.text(text)
                    };
                    self.insert_comment_text_or_doctype(node);
                }
                TokenKind::Comment(comment) => {
                    self.insert_comment_text_or_doctype(arena.new_comment(comment));
                }
                TokenKind::Doctype(doctype) => {
                    self.insert_comment_text_or_doctype(
                        arena.new_doctype(doctype.name.unwrap_or_default()),
                    );
                }
                TokenKind::Eof => {
                    // Anything still on the stack at this point never got its end tag; report
                    // the innermost such element.
                    if let Some((token_offset, _)) = self.open_elements.last() {
                        return Err(ParserError {
                            offset: *token_offset,
                            kind: ErrorKind::UnclosedStartTag,
                            source: self.tokenizer.get_input(),
                        });
                    }
                    // We are done parsing when there are no more tokens AND the stack of open
                    // elements is empty.
                    return Ok(self.root);
                }
                TokenKind::Tag(
                    tag @ Tag {
                        kind: TagKind::Start,
                        ..
                    },
                ) => {
                    let attrs = tag
                        .attributes
                        .iter()
                        .map(|Attribute { name, value }| (name.as_str(), value.as_str()));

                    // `svg` and `math` open foreign content; everything else inherits the
                    // namespace of the innermost open element.
                    let namespace = if tag.name == "svg" || tag.name == "math" {
                        Namespace::Foreign
                    } else {
                        self.current_namespace()
                    };

                    let el = arena.new_element(tag.name, attrs, namespace);
                    self.insert_element(current_token.span.start, el);

                    // Void HTML elements and self-closing foreign elements can never have
                    // content, so they leave the stack right away.
                    let closed_immediately = if namespace == Namespace::Html
                        && VOID_ELEMENT_NAMES.contains(&el.name())
                    {
                        true
                    } else if tag.self_closing {
                        if namespace != Namespace::Foreign {
                            return Err(ParserError {
                                kind: ErrorKind::NonVoidElementStartTagWithTrailingSolidus,
                                offset: current_token.span.start,
                                source: self.tokenizer.get_input(),
                            });
                        }
                        true
                    } else {
                        false
                    };

                    if closed_immediately {
                        self.open_elements.pop();
                        // The element is already closed, so there is no content for the
                        // tokenizer to treat specially. Switching states here would make it
                        // swallow the rest of the input while waiting for an end tag that is
                        // not coming (e.g. `<svg><title /></svg>`).
                        continue;
                    }

                    // The content of these elements is not regular markup; tell the tokenizer
                    // how to read it until the matching end tag.
                    // NOTE(review): `plaintext` is treated as a raw text container when text is
                    // inserted but gets no tokenizer switch here — confirm whether that is
                    // intentional.
                    match el.name() {
                        "title" | "textarea" => {
                            self.tokenizer.switch_to(tokenizer::state_rc_data);
                        }
                        "noscript" | "style" | "xmp" | "iframe" | "noembed" | "noframes" => {
                            self.tokenizer.switch_to(tokenizer::state_raw_text);
                        }
                        "script" => {
                            self.tokenizer.switch_to(tokenizer::state_script_data);
                        }
                        _ => {}
                    }
                }

                TokenKind::Tag(
                    tag @ Tag {
                        kind: TagKind::End, ..
                    },
                ) => {
                    if self.current_namespace() == Namespace::Html
                        && VOID_ELEMENT_NAMES.contains(&tag.name.as_str())
                    {
                        return Err(ParserError {
                            kind: ErrorKind::VoidElementAsEndTag,
                            offset: current_token.span.start,
                            source: self.tokenizer.get_input(),
                        });
                    }

                    let Some((token_offset, node)) = self.open_elements.pop() else {
                        return Err(ParserError {
                            kind: ErrorKind::EndTagWithoutCorrespondingStartTag,
                            offset: current_token.span.start,
                            source: self.tokenizer.get_input(),
                        });
                    };

                    // An end tag may only close the innermost open element. On a mismatch the
                    // error points at the start tag that was left unclosed.
                    if *node.name() != tag.name {
                        return Err(ParserError {
                            kind: ErrorKind::UnclosedStartTag,
                            offset: token_offset,
                            source: self.tokenizer.get_input(),
                        });
                    }
                }
            }
        }
    }

    /// Returns `true` when the innermost open element is one whose text content must be
    /// serialized verbatim.
    ///
    /// `noscript` is included because its content is tokenized as raw text (see the tokenizer
    /// state switch in [`Parser::parse`]); its text therefore has to be written back out
    /// unescaped as well, matching the scripting-enabled case of the serialization algorithm.
    fn parent_holds_raw_text(&self) -> bool {
        let Some((_, el)) = self.open_elements.last() else {
            return false;
        };
        matches!(
            el.name(),
            "style"
                | "script"
                | "xmp"
                | "iframe"
                | "noembed"
                | "noframes"
                | "noscript"
                | "plaintext"
        )
    }

    /// The node new children are attached to: the innermost open element, or the root when no
    /// element is open.
    fn current_parent(&self) -> Node<'a> {
        self.open_elements
            .last()
            .map_or(self.root, |(_, el)| *el)
    }

    /// Appends a leaf node (text, comment or doctype) to the current parent.
    fn insert_comment_text_or_doctype(&mut self, n: Node<'a>) {
        debug_assert!(matches!(
            n.kind(),
            NodeKind::Text | NodeKind::Comment | NodeKind::Doctype
        ));
        self.current_parent().append(n);
    }

    /// Appends `el` to the current parent and pushes it onto the stack of open elements.
    ///
    /// `token_offset` is the start offset of the element's start tag token; it is kept so an
    /// unclosed element can be reported at the right position.
    fn insert_element(&mut self, token_offset: usize, el: Node<'a>) {
        self.current_parent().append(el);
        self.open_elements.push((token_offset, el));
    }

    /// Namespace of the innermost open element, or [`Namespace::Html`] when no element is open.
    fn current_namespace(&self) -> Namespace {
        let Some((_, el)) = self.open_elements.last() else {
            return Namespace::Html;
        };
        let n = &el.arena.nodes[el.id];
        n.namespace
    }
}

// The provided `Error` methods are sufficient; the required `Debug` and `Display`
// implementations exist on `ParserError` (derived and hand-written respectively).
impl std::error::Error for ParserError {}

impl std::fmt::Display for ParserError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let snippet = Snippet::source(self.source.as_str())
            .fold(true)
            .annotation(AnnotationKind::Primary.span(self.offset..self.offset));
        let report = Level::ERROR
            .primary_title(self.kind.code())
            .element(snippet);
        let renderer = annotate_snippets::Renderer::plain();
        let display = renderer.render(&[report]);
        write!(f, "{display}")
    }
}

impl ErrorKind {
    fn code(&self) -> &'static str {
        match self {
            ErrorKind::EndTagWithoutCorrespondingStartTag => "no-matching-start-tag",
            ErrorKind::UnclosedStartTag => "unclosed-start-tag",
            ErrorKind::VoidElementAsEndTag => "void-element-as-end-tag",
            ErrorKind::NonVoidElementStartTagWithTrailingSolidus => {
                "non-void-element-with-trailing-solidus"
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Appends a compact, single-line rendering of `node` and its descendants to `sink`.
    ///
    /// Format: `#comment(..)`, `#text(..)`, `#doctype(name)`, and `name[attr=value,..](children)`
    /// for elements (the bracketed part is omitted when there are no attributes). Fragments
    /// contribute only their children.
    fn print_node(sink: &mut String, node: Node<'_>) {
        use std::fmt::Write;
        match node.kind() {
            NodeKind::Comment => {
                write!(sink, "#comment({})", node.text_content()).unwrap();
            }
            NodeKind::Text => {
                write!(sink, "#text({})", node.text_content()).unwrap();
            }
            NodeKind::Doctype => {
                write!(sink, "#doctype({})", node.name()).unwrap();
            }
            NodeKind::Fragment => {
                for c in node.children() {
                    print_node(sink, c);
                }
            }
            NodeKind::Element => {
                write!(sink, "{}", node.name()).unwrap();
                let attrs: Vec<_> = node.attrs().collect();
                if !attrs.is_empty() {
                    sink.push('[');
                    for (i, (k, v)) in attrs.iter().enumerate() {
                        if i != 0 {
                            sink.push(',');
                        }
                        write!(sink, "{k}={v}").unwrap();
                    }
                    sink.push(']');
                }
                sink.push('(');
                for c in node.children() {
                    print_node(sink, c);
                }
                sink.push(')');
            }
        }
    }

    /// Parses `input` and compares the outcome with `expected`.
    ///
    /// A successful parse is rendered with [`print_node`]; a failure is rendered as
    /// `error @ <offset>: <code>`.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        use std::fmt::Write;

        let arena = NodeArena::new();
        let result = parse(&arena, input);

        let mut actual_output = String::new();
        match result {
            Ok(root) => {
                print_node(&mut actual_output, root);
            }
            Err(err) => {
                write!(actual_output, "error @ {}: {}", err.offset, err.kind.code()).unwrap();
            }
        }

        assert_eq!(actual_output, expected);
    }

    #[test]
    fn test_cases() {
        check("", "");
        check("   ", "#text(   )");
        check("just some text", "#text(just some text)");
        check("<!--a comment-->", "#comment(a comment)");
        check("<!doctype html>", "#doctype(html)");
        check("<div></div>", "div()");

        // element casing is not relevant
        check("<DIV></DIV>", "div()");

        // element with attributes
        check("<div a='b'></div>", "div[a=b]()");

        // attribute name casing is not relevant
        check("<div A='b'></div>", "div[a=b]()");

        // nested elements and text
        check("<div><span>a</span></div>", "div(span(#text(a)))");

        // void element
        check("<link><div></div>", "link()div()");

        // void element with trailing solidus
        check("<link/><div></div>", "link()div()");

        // trailing solidus on non-void elements causes an error
        check(
            "<div />",
            "error @ 0: non-void-element-with-trailing-solidus",
        );

        // trailing solidus on foreign elements is ok (svg)
        check("<svg />", "svg()");

        // trailing solidus on foreign elements is ok (svg descendants)
        check("<svg><p /></svg>", "svg(p())");

        // trailing solidus on foreign elements is ok (math)
        check("<math />", "math()");

        // trailing solidus on foreign elements is ok (math descendants)
        check("<math><variable /></math>", "math(variable())");

        // parsing switches states upon encountering certain elements
        check(
            "<title><tag></title><textarea><tag></textarea><noscript><tag></noscript><style><tag></style><xmp><tag></xmp><iframe><tag></iframe><noembed><tag></noembed><noframes><tag></noframes><script><tag></script>",
            "title(#text(<tag>))textarea(#text(<tag>))noscript(#text(<tag>))style(#text(<tag>))xmp(#text(<tag>))iframe(#text(<tag>))noembed(#text(<tag>))noframes(#text(<tag>))script(#text(<tag>))",
        );

        // unclosed start tag
        check("<title>", "error @ 0: unclosed-start-tag");

        // unclosed nested start tag: the innermost open element is reported
        check("<div><span>", "error @ 5: unclosed-start-tag");

        // unopened end tag
        check("</span>", "error @ 0: no-matching-start-tag");

        // leading solidus on void elements
        check("<link></link>", "error @ 6: void-element-as-end-tag");

        // closing tag without matching start tag
        check("<div></span>", "error @ 0: unclosed-start-tag");

        // end tag with no open element
        check("<div></div></span>", "error @ 11: no-matching-start-tag");
    }
}