use crate::tokenizer::{self, Attribute, Tag, TagKind, TokenKind, Tokenizer};
use crate::{Node, NodeArena, NodeKind};
use annotate_snippets::{AnnotationKind, Level, Snippet};
use std::sync::Arc;
/// State carried while turning the token stream into a node tree.
struct Parser<'arena> {
    /// Source of tokens; its state is switched by the parser for elements
    /// such as `title`, `style` and `script`.
    tokenizer: Tokenizer,
    /// Fragment node that receives every node inserted while no element is
    /// open; returned to the caller on success.
    root: Node<'arena>,
    /// Stack of elements whose end tag has not been seen yet, innermost last.
    /// Each entry pairs the start offset of the start-tag token with the
    /// element node, so errors can point back at the opening tag.
    open_elements: Vec<(usize, Node<'arena>)>,
}
/// Namespace an element is created in.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
pub(crate) enum Namespace {
    /// Regular HTML content. This is the default, and the namespace in effect
    /// when no element is open.
    #[default]
    Html,
    /// Assigned by the parser to `svg` and `math` elements; elements opened
    /// inside them take on the namespace of the innermost open element.
    Foreign,
}
/// Names of the HTML void elements: elements that never take an end tag.
///
/// The parser closes these immediately after their start tag (in the HTML
/// namespace) and rejects end tags that name them.
pub(crate) const VOID_ELEMENT_NAMES: [&str; 13] = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "source",
    "track",
    "wbr",
];
/// Error returned by [`parse`] when the input is rejected.
///
/// Its `Display` implementation renders the error code together with an
/// annotated snippet of the offending source.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParserError {
    /// Which rule the input violated.
    kind: ErrorKind,
    /// Offset into `source` that the error points at (taken from the start of
    /// the relevant token's span).
    offset: usize,
    /// The complete input that was being parsed, as handed out by the
    /// tokenizer.
    source: Arc<String>,
}
/// The individual reasons the parser refuses an input.
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(missing_docs)]
enum ErrorKind {
    /// An end tag was seen while no element was open.
    EndTagWithoutCorrespondingStartTag,
    /// The input ended with an element still open, or an end tag did not
    /// match the innermost open element.
    UnclosedStartTag,
    /// An end tag named a void element while in the HTML namespace.
    VoidElementAsEndTag,
    /// A non-void element in the HTML namespace was written with a trailing
    /// `/` in its start tag.
    NonVoidElementStartTagWithTrailingSolidus,
}
/// Parses `html` into a tree of nodes allocated in `arena`.
///
/// On success the returned node is a fragment whose children are the
/// top-level nodes of the input.
///
/// # Errors
///
/// Returns a [`ParserError`] when the input contains an end tag without a
/// matching start tag, an element that is never closed, an end tag for a void
/// element, or a self-closing start tag on a non-void HTML element.
pub fn parse<'a>(arena: &'a NodeArena, html: &str) -> Result<Node<'a>, ParserError> {
    let tokenizer = Tokenizer::new(html);
    let open_elements = Vec::new();
    let root = arena.fragment([]);

    Parser {
        tokenizer,
        open_elements,
        root,
    }
    .parse(arena)
}
impl<'a> Parser<'a> {
    /// Drives the tokenizer to completion and builds the node tree.
    ///
    /// Consumes the parser. Returns the root fragment once the end of input
    /// is reached with no element left open.
    ///
    /// # Errors
    ///
    /// Returns a [`ParserError`] as soon as a token violates one of the rules
    /// described by [`ErrorKind`]; parsing stops at the first error.
    fn parse(mut self, arena: &'a NodeArena) -> Result<Node<'a>, ParserError> {
        loop {
            let current_token = self.tokenizer.next_token();
            match current_token.kind {
                TokenKind::Text(text) => {
                    // Text directly inside one of these elements becomes a
                    // raw-text node; everything else is ordinary text.
                    // NOTE(review): `noscript` is tokenized as raw text below
                    // but is absent from this list, while `plaintext` is
                    // listed here without a matching tokenizer switch --
                    // confirm both are intended.
                    let parent_holds_raw_text = matches!(
                        self.open_elements.last(),
                        Some((_, el)) if matches!(
                            el.name(),
                            "style"
                                | "script"
                                | "xmp"
                                | "iframe"
                                | "noembed"
                                | "noframes"
                                | "plaintext"
                        )
                    );
                    let node = if parent_holds_raw_text {
                        arena.raw_text(text)
                    } else {
                        arena.text(text)
                    };
                    self.insert_comment_text_or_doctype(node);
                }
                TokenKind::Comment(comment) => {
                    self.insert_comment_text_or_doctype(arena.new_comment(comment));
                }
                TokenKind::Doctype(doctype) => {
                    self.insert_comment_text_or_doctype(
                        arena.new_doctype(doctype.name.unwrap_or_default()),
                    );
                }
                TokenKind::Eof => {
                    // Anything still on the stack was never closed; report
                    // the innermost open element's start tag.
                    if let Some((token_offset, _)) = self.open_elements.last() {
                        return Err(ParserError {
                            offset: *token_offset,
                            kind: ErrorKind::UnclosedStartTag,
                            source: self.tokenizer.get_input(),
                        });
                    };
                    return Ok(self.root);
                }
                TokenKind::Tag(
                    tag @ Tag {
                        kind: TagKind::Start,
                        ..
                    },
                ) => {
                    let attrs = tag
                        .attributes
                        .iter()
                        .map(|Attribute { name, value }| (name.as_str(), value.as_str()));
                    // `svg` and `math` always start foreign content; every
                    // other element inherits from the innermost open element.
                    let namespace = if tag.name == "svg" || tag.name == "math" {
                        Namespace::Foreign
                    } else {
                        self.current_namespace()
                    };
                    let el = arena.new_element(tag.name, attrs, namespace);
                    self.insert_element(current_token.span.start, el);

                    // Decide whether the element is complete right away:
                    // HTML void elements always are, and a trailing solidus
                    // closes foreign elements but is an error on any other
                    // HTML element.
                    let closed_immediately = if namespace == Namespace::Html
                        && VOID_ELEMENT_NAMES.contains(&el.name())
                    {
                        true
                    } else if tag.self_closing {
                        if namespace != Namespace::Foreign {
                            return Err(ParserError {
                                kind: ErrorKind::NonVoidElementStartTagWithTrailingSolidus,
                                offset: current_token.span.start,
                                source: self.tokenizer.get_input(),
                            });
                        }
                        true
                    } else {
                        false
                    };

                    if closed_immediately {
                        self.open_elements.pop();
                    } else {
                        // Only an element that stays open has content, so only
                        // then may the tokenizer change state. Switching after
                        // a self-closed foreign element (e.g. `<svg><title />`)
                        // would make the tokenizer treat the following markup
                        // as that element's text.
                        match el.name() {
                            "title" | "textarea" => {
                                self.tokenizer.switch_to(tokenizer::state_rc_data);
                            }
                            "noscript" | "style" | "xmp" | "iframe" | "noembed" | "noframes" => {
                                self.tokenizer.switch_to(tokenizer::state_raw_text);
                            }
                            "script" => {
                                self.tokenizer.switch_to(tokenizer::state_script_data);
                            }
                            _ => {}
                        }
                    }
                }
                TokenKind::Tag(
                    tag @ Tag {
                        kind: TagKind::End, ..
                    },
                ) => {
                    if self.current_namespace() == Namespace::Html
                        && VOID_ELEMENT_NAMES.contains(&tag.name.as_str())
                    {
                        return Err(ParserError {
                            kind: ErrorKind::VoidElementAsEndTag,
                            offset: current_token.span.start,
                            source: self.tokenizer.get_input(),
                        });
                    }
                    let Some((token_offset, node)) = self.open_elements.pop() else {
                        return Err(ParserError {
                            kind: ErrorKind::EndTagWithoutCorrespondingStartTag,
                            offset: current_token.span.start,
                            source: self.tokenizer.get_input(),
                        });
                    };
                    // The end tag must close the innermost open element; if it
                    // does not, that element is reported as unclosed, pointing
                    // at its start tag.
                    if *node.name() != tag.name {
                        return Err(ParserError {
                            kind: ErrorKind::UnclosedStartTag,
                            offset: token_offset,
                            source: self.tokenizer.get_input(),
                        });
                    }
                }
            }
        }
    }

    /// Appends a text, comment or doctype node to the innermost open element,
    /// or to the root fragment when no element is open.
    ///
    /// Such nodes never become parents, so the open-element stack is left
    /// untouched.
    fn insert_comment_text_or_doctype(&mut self, n: Node<'a>) {
        debug_assert!(matches!(
            n.kind(),
            NodeKind::Text | NodeKind::Comment | NodeKind::Doctype
        ));
        if let Some((_, parent)) = self.open_elements.last() {
            parent.append(n);
        } else {
            self.root.append(n);
        };
    }

    /// Appends `el` to the innermost open element (or the root fragment) and
    /// pushes it onto the open-element stack.
    ///
    /// `token_offset` is the start offset of the start-tag token and is kept
    /// alongside the element for error reporting.
    fn insert_element(&mut self, token_offset: usize, el: Node<'a>) {
        if let Some((_, parent)) = self.open_elements.last() {
            parent.append(el);
        } else {
            self.root.append(el);
        }
        self.open_elements.push((token_offset, el));
    }

    /// Returns the namespace of the innermost open element, or
    /// [`Namespace::Html`] when no element is open.
    fn current_namespace(&self) -> Namespace {
        let Some((_, el)) = self.open_elements.last() else {
            return Namespace::Html;
        };
        let n = &el.arena.nodes[el.id];
        n.namespace
    }
}
// Marker impl: `ParserError` relies on the trait's default methods only.
impl std::error::Error for ParserError {}
impl std::fmt::Display for ParserError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let snippet = Snippet::source(self.source.as_str())
.fold(true)
.annotation(AnnotationKind::Primary.span(self.offset..self.offset));
let report = Level::ERROR
.primary_title(self.kind.code())
.element(snippet);
let renderer = annotate_snippets::Renderer::plain();
let display = renderer.render(&[report]);
write!(f, "{display}")
}
}
impl ErrorKind {
fn code(&self) -> &'static str {
match self {
ErrorKind::EndTagWithoutCorrespondingStartTag => "no-matching-start-tag",
ErrorKind::UnclosedStartTag => "unclosed-start-tag",
ErrorKind::VoidElementAsEndTag => "void-element-as-end-tag",
ErrorKind::NonVoidElementStartTagWithTrailingSolidus => {
"non-void-element-with-trailing-solidus"
}
}
}
}
#[cfg(test)]
mod test {
    use super::*;

    /// Serialises `node` into `sink` using a compact notation:
    /// `#text(..)`, `#comment(..)`, `#doctype(name)` and
    /// `name[attr=value,..](children..)`; a fragment prints only its children.
    fn print_node<'a>(sink: &mut String, node: Node<'a>) {
        use std::fmt::Write;

        match node.kind() {
            NodeKind::Fragment => {
                for child in node.children() {
                    print_node(sink, child);
                }
            }
            NodeKind::Element => {
                write!(sink, "{}", node.name()).unwrap();

                // The bracketed attribute list is omitted entirely when the
                // element has no attributes.
                let mut attrs = node.attrs().peekable();
                if attrs.peek().is_some() {
                    sink.push('[');
                    for (index, (key, value)) in attrs.enumerate() {
                        if index != 0 {
                            sink.push(',');
                        }
                        write!(sink, "{}={}", key, value).unwrap();
                    }
                    sink.push(']');
                }

                sink.push('(');
                for child in node.children() {
                    print_node(sink, child);
                }
                sink.push(')');
            }
            NodeKind::Doctype => {
                write!(sink, "#doctype({})", node.name()).unwrap();
            }
            NodeKind::Text => {
                write!(sink, "#text({})", node.text_content()).unwrap();
            }
            NodeKind::Comment => {
                write!(sink, "#comment({})", node.text_content()).unwrap();
            }
        }
    }

    /// Parses `input` and compares either the printed tree or, on failure,
    /// `error @ <offset>: <code>` against `expected`.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        let arena = NodeArena::new();
        let actual_output = match parse(&arena, input) {
            Ok(root) => {
                let mut printed = String::new();
                print_node(&mut printed, root);
                printed
            }
            Err(err) => format!("error @ {}: {}", err.offset, err.kind.code()),
        };
        assert_eq!(actual_output, expected);
    }

    #[test]
    fn test_cases() {
        // Plain content.
        check("", "");
        check(" ", "#text( )");
        check("just some text", "#text(just some text)");
        check("<!--a comment-->", "#comment(a comment)");
        check("<!doctype html>", "#doctype(html)");

        // Elements and attributes, including case folding.
        check("<div></div>", "div()");
        check("<DIV></DIV>", "div()");
        check("<div a='b'></div>", "div[a=b]()");
        check("<div A='b'></div>", "div[a=b]()");

        // Void elements and trailing solidus handling.
        check("<link><div></div>", "link()div()");
        check("<link/><div></div>", "link()div()");
        check(
            "<div />",
            "error @ 0: non-void-element-with-trailing-solidus",
        );

        // Foreign content may self-close.
        check("<svg />", "svg()");
        check("<svg><p /></svg>", "svg(p())");
        check("<math />", "math()");
        check("<math><variable /></math>", "math(variable())");

        // Elements whose content is not tokenized as markup.
        check(
            "<title><tag></title><textarea><tag></textarea><noscript><tag></noscript><style><tag></style><xmp><tag></xmp><iframe><tag></iframe><noembed><tag></noembed><noframes><tag></noframes><script><tag></script>",
            "title(#text(<tag>))textarea(#text(<tag>))noscript(#text(<tag>))style(#text(<tag>))xmp(#text(<tag>))iframe(#text(<tag>))noembed(#text(<tag>))noframes(#text(<tag>))script(#text(<tag>))",
        );

        // Error reporting.
        check("<title>", "error @ 0: unclosed-start-tag");
        check("</span>", "error @ 0: no-matching-start-tag");
        check("<link></link>", "error @ 6: void-element-as-end-tag");
        check("<div></span>", "error @ 0: unclosed-start-tag");
        check("<div></div></span>", "error @ 11: no-matching-start-tag");
    }
}