pub mod entities;
use crate::error::{ErrorSeverity, ParseDiagnostic, ParseError};
use crate::parser::input::ParserInput;
use crate::tree::{Attribute, Document, NodeId, NodeKind};
/// Options controlling how [`parse_html_with_options`] reads HTML input.
///
/// Each flag can be set through the chainable builder method of the same
/// name. The `Default` implementation enables `recover` and leaves every
/// other flag off.
#[derive(Debug, Clone, PartialEq, Eq)]
// Every flag is an independent on/off switch, so plain bools are the clearest
// representation here.
#[allow(clippy::struct_excessive_bools)]
pub struct HtmlParseOptions {
    /// Forwarded to the input reader's recovery setting when the parser is built.
    pub recover: bool,
    /// When set, text nodes consisting only of whitespace are not added to the tree.
    pub no_blanks: bool,
    /// When set, no implied `html`/`head`/`body` elements and no default
    /// DOCTYPE are synthesized.
    pub no_implied: bool,
    /// When set, warning diagnostics are not recorded on the document.
    pub no_warnings: bool,
}
impl Default for HtmlParseOptions {
    /// Returns the standard configuration: recovery on, every other flag off.
    fn default() -> Self {
        let enabled = true;
        let disabled = false;
        Self {
            recover: enabled,
            no_blanks: disabled,
            no_implied: disabled,
            no_warnings: disabled,
        }
    }
}
impl HtmlParseOptions {
    /// Returns these options with `recover` replaced by `yes`.
    #[must_use]
    pub fn recover(self, yes: bool) -> Self {
        Self {
            recover: yes,
            ..self
        }
    }

    /// Returns these options with `no_blanks` replaced by `yes`.
    #[must_use]
    pub fn no_blanks(self, yes: bool) -> Self {
        Self {
            no_blanks: yes,
            ..self
        }
    }

    /// Returns these options with `no_implied` replaced by `yes`.
    #[must_use]
    pub fn no_implied(self, yes: bool) -> Self {
        Self {
            no_implied: yes,
            ..self
        }
    }

    /// Returns these options with `no_warnings` replaced by `yes`.
    #[must_use]
    pub fn no_warnings(self, yes: bool) -> Self {
        Self {
            no_warnings: yes,
            ..self
        }
    }
}
/// Parses `input` as HTML using the default [`HtmlParseOptions`].
///
/// # Errors
///
/// Returns whatever error [`parse_html_with_options`] reports.
pub fn parse_html(input: &str) -> Result<Document, ParseError> {
    let defaults = HtmlParseOptions::default();
    parse_html_with_options(input, &defaults)
}
/// Parses `input` as HTML, honouring the supplied `options`.
///
/// # Errors
///
/// Returns the fatal error recorded by the parser, if one occurred.
pub fn parse_html_with_options(
    input: &str,
    options: &HtmlParseOptions,
) -> Result<Document, ParseError> {
    HtmlParser::new(input, options).parse()
}
/// Reports whether `tag` names an element that never has content or an end tag.
///
/// The comparison is exact, so callers must pass a lowercase tag name.
pub(crate) fn is_void_element(tag: &str) -> bool {
    const VOID_TAGS: &[&str] = &[
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr", "basefont", "frame", "isindex",
    ];
    VOID_TAGS.iter().any(|&name| name == tag)
}
/// Reports whether a start tag named `tag` implicitly closes the currently
/// open element `open_tag`.
///
/// Both names are compared exactly, so callers must pass lowercase names.
/// Only the listed pairs close implicitly; everything else returns `false`.
fn auto_closes(open_tag: &str, tag: &str) -> bool {
    match open_tag {
        // A paragraph ends at the next block-level start tag.
        "p" => matches!(
            tag,
            "p" | "div"
                | "ul"
                | "ol"
                | "dl"
                | "pre"
                | "table"
                | "blockquote"
                | "address"
                | "h1"
                | "h2"
                | "h3"
                | "h4"
                | "h5"
                | "h6"
                | "hr"
                | "form"
                | "fieldset"
                | "section"
                | "article"
                | "aside"
                | "header"
                | "footer"
                | "nav"
                | "figure"
                | "figcaption"
                | "main"
                | "details"
                | "summary"
                | "hgroup"
                | "menu"
        ),
        "li" => tag == "li",
        // `dt` and `dd` each end at the next `dt` or `dd`; without the `dd`
        // case, `<dd>A<dd>B` would nest the second definition in the first.
        "dt" | "dd" => matches!(tag, "dt" | "dd"),
        "tr" => tag == "tr",
        "td" | "th" => matches!(tag, "td" | "th" | "tr"),
        "thead" | "tbody" => matches!(tag, "tbody" | "tfoot"),
        "tfoot" => tag == "tbody",
        "option" => matches!(tag, "option" | "optgroup"),
        "optgroup" => tag == "optgroup",
        // `col` is a child of `colgroup`, so it is deliberately absent here.
        "colgroup" => matches!(tag, "thead" | "tbody" | "tfoot" | "tr" | "colgroup"),
        "head" => matches!(tag, "body" | "frameset"),
        _ => false,
    }
}
/// Reports whether the content of `tag` is read verbatim (no markup or
/// references are interpreted) up to its end tag.
///
/// The comparison is exact, so callers must pass a lowercase tag name.
pub(crate) fn is_raw_text_element(tag: &str) -> bool {
    tag == "script" || tag == "style"
}
/// Reports whether `tag` names an element that is placed in `<head>` when it
/// appears before the body has been opened.
///
/// The comparison is exact, so callers must pass a lowercase tag name.
fn is_head_content_element(tag: &str) -> bool {
    const HEAD_TAGS: &[&str] = &[
        "title", "meta", "link", "base", "style", "script", "noscript",
    ];
    HEAD_TAGS.iter().any(|&name| name == tag)
}
/// State for one HTML parse; built by `HtmlParser::new` and consumed by `parse`.
struct HtmlParser<'a> {
/// Reader over the source text (peeking, advancing, depth and location tracking).
input: ParserInput<'a>,
/// Document under construction; moved out of the parser when `parse` succeeds.
doc: Document,
/// Private copy of the caller's options.
options: HtmlParseOptions,
/// Stack of currently open elements as (node id, lowercase tag name); the
/// last entry is the parent for newly inserted nodes.
open_elements: Vec<(NodeId, String)>,
/// Fatal error, if any; once set, the content loop stops and `parse` returns it.
fatal_error: Option<ParseError>,
}
impl<'a> HtmlParser<'a> {
/// Builds a parser over `input` with an empty document and an empty
/// open-element stack, copying `options` and applying its `recover` flag to
/// the input reader.
fn new(input: &'a str, options: &HtmlParseOptions) -> Self {
    let mut reader = ParserInput::new(input);
    reader.set_recover(options.recover);
    let doc = Document::new();
    let options = options.clone();
    Self {
        input: reader,
        doc,
        options,
        open_elements: Vec::new(),
        fatal_error: None,
    }
}
/// Runs the whole parse and hands back the finished document.
///
/// A leading DOCTYPE (after optional whitespace) is parsed first, then the
/// content loop runs. Elements still open at the end are reported as
/// warnings, a default DOCTYPE is prepended when none was seen at the start
/// and implied structure is enabled, and childless `<head>` elements are removed.
///
/// # Errors
///
/// Returns the fatal error recorded while parsing content, if any.
fn parse(&mut self) -> Result<Document, ParseError> {
    self.input.skip_whitespace();
    let has_doctype = self.input.looking_at_ci(b"<!doctype");
    if has_doctype {
        self.parse_doctype();
        self.input.skip_whitespace();
    }

    self.parse_content();
    if let Some(fatal) = self.fatal_error.take() {
        return Err(fatal);
    }

    // Report leftovers innermost-first, emptying the stack as we go.
    let unclosed = std::mem::take(&mut self.open_elements);
    for (_, tag) in unclosed.into_iter().rev() {
        self.push_warning(format!("unclosed element <{tag}> at end of document"));
    }

    let needs_default_doctype = !(has_doctype || self.options.no_implied);
    if needs_default_doctype {
        let doctype_id = self.doc.create_node(NodeKind::DocumentType {
            name: String::from("html"),
            public_id: Some(String::from("-//W3C//DTD HTML 4.0 Transitional//EN")),
            system_id: Some(String::from("http://www.w3.org/TR/REC-html40/loose.dtd")),
            internal_subset: None,
        });
        let root = self.doc.root();
        self.doc.prepend_child(root, doctype_id);
    }

    self.remove_empty_heads();
    Ok(std::mem::take(&mut self.doc))
}
/// Returns the `<html>` element directly under the document root, creating
/// it if absent.
///
/// A newly created element is appended to the root and pushed on the
/// open-element stack; an existing one is returned untouched.
fn ensure_html(&mut self) -> NodeId {
    let root = self.doc.root();
    let existing = self.doc.children(root).find(|&child| {
        matches!(&self.doc.node(child).kind, NodeKind::Element { name, .. } if name == "html")
    });
    if let Some(found) = existing {
        return found;
    }

    let html_id = self.doc.create_node(NodeKind::Element {
        name: String::from("html"),
        prefix: None,
        namespace: None,
        attributes: Vec::new(),
    });
    self.doc.append_child(root, html_id);
    self.open_elements.push((html_id, String::from("html")));
    html_id
}
/// Returns the `<body>` child of `<html>`, creating both as needed.
///
/// A newly created body is appended to `<html>` and pushed on the
/// open-element stack; an existing one is returned untouched.
fn ensure_body(&mut self) -> NodeId {
    let html_id = self.ensure_html();
    let existing = self.doc.children(html_id).find(|&child| {
        matches!(&self.doc.node(child).kind, NodeKind::Element { name, .. } if name == "body")
    });
    if let Some(found) = existing {
        return found;
    }

    let body_id = self.doc.create_node(NodeKind::Element {
        name: String::from("body"),
        prefix: None,
        namespace: None,
        attributes: Vec::new(),
    });
    self.doc.append_child(html_id, body_id);
    self.open_elements.push((body_id, String::from("body")));
    body_id
}
/// Returns the `<head>` child of `<html>`, creating it if absent.
///
/// A new head is inserted before an existing `<body>`, otherwise appended
/// to `<html>`. Unlike `ensure_html`/`ensure_body`, this never touches the
/// open-element stack; callers push the head themselves.
fn ensure_head(&mut self) -> NodeId {
    let html_id = self.ensure_html();
    let existing = self.doc.children(html_id).find(|&child| {
        matches!(&self.doc.node(child).kind, NodeKind::Element { name, .. } if name == "head")
    });
    if let Some(found) = existing {
        return found;
    }

    let head_id = self.doc.create_node(NodeKind::Element {
        name: String::from("head"),
        prefix: None,
        namespace: None,
        attributes: Vec::new(),
    });
    let body = self.doc.children(html_id).find(|&child| {
        matches!(&self.doc.node(child).kind, NodeKind::Element { name, .. } if name == "body")
    });
    match body {
        Some(body_id) => self.doc.insert_before(body_id, head_id),
        None => self.doc.append_child(html_id, head_id),
    }
    head_id
}
/// Removes every childless `<head>` that sits directly under a top-level
/// `<html>` element.
fn remove_empty_heads(&mut self) {
    let root = self.doc.root();
    // Snapshot the child lists first: nodes are removed while walking them.
    let top_level: Vec<_> = self.doc.children(root).collect();
    for html_id in top_level {
        let is_html = matches!(
            &self.doc.node(html_id).kind,
            NodeKind::Element { name, .. } if name == "html"
        );
        if !is_html {
            continue;
        }
        let candidates: Vec<_> = self.doc.children(html_id).collect();
        for child_id in candidates {
            let is_head = matches!(
                &self.doc.node(child_id).kind,
                NodeKind::Element { name, .. } if name == "head"
            );
            if is_head && self.doc.first_child(child_id).is_none() {
                self.doc.remove_node(child_id);
            }
        }
    }
}
/// Returns the node that new children are attached to: the innermost open
/// element, or the document root when no element is open.
fn current_parent(&self) -> NodeId {
    match self.open_elements.last() {
        Some((id, _)) => *id,
        None => self.doc.root(),
    }
}
/// Main content loop: repeatedly looks at the markup at the current input
/// position and hands it to the matching sub-parser, until the input is
/// exhausted or a fatal error has been recorded.
///
/// The branches are tried in order, so the more specific `<!--`,
/// `<!doctype` and `</` checks win over the generic `<` handling below them.
fn parse_content(&mut self) {
while !self.input.at_end() && self.fatal_error.is_none() {
if self.input.looking_at(b"<!--") {
self.parse_comment();
} else if self.input.looking_at_ci(b"<!doctype") {
// A DOCTYPE met inside the content loop is skipped up to the next `>`.
self.skip_to_gt();
} else if self.input.looking_at(b"</") {
self.parse_end_tag();
} else if self.input.peek() == Some(b'<')
&& self
.input
.peek_at(1)
.is_some_and(|b| b.is_ascii_alphabetic())
{
// `<` followed by an ASCII letter starts an element.
self.parse_start_tag();
} else if self.input.peek() == Some(b'<') && self.input.peek_at(1) == Some(b'!') {
if self.input.peek_at(2) == Some(b'[') {
// `<![` ...: warned about and skipped up to the next `>`.
self.skip_conditional_comment();
} else {
// Any other `<!`: warn and consume only the `<`; the rest is
// re-examined on the next iteration.
self.push_warning("malformed markup".to_string());
self.input.advance(1);
}
} else if self.input.peek() == Some(b'<') && self.input.peek_at(1) == Some(b'?') {
self.parse_processing_instruction();
} else if self.input.peek() == Some(b'<') {
// A `<` that starts no recognised construct is kept as literal text.
self.input.advance(1);
if !self.options.no_implied && self.open_elements.is_empty() {
self.ensure_body();
}
let parent = self.current_parent();
// Extend the preceding text node when there is one, so the `<` does
// not split the text into separate nodes.
if let Some(last_child) = self.doc.last_child(parent) {
if let NodeKind::Text { content } = &mut self.doc.node_mut(last_child).kind {
content.push('<');
continue;
}
}
let text_id = self.doc.create_node(NodeKind::Text {
content: "<".to_string(),
});
self.doc.append_child(parent, text_id);
} else {
self.parse_text();
}
}
}
/// Parses a `<!DOCTYPE ...>` declaration at the current position and
/// appends a document-type node to the document root.
///
/// Recognises an optional `SYSTEM "id"` or `PUBLIC "pub" "sys"` clause
/// (keywords case-insensitive); anything else before the closing `>` is
/// discarded.
fn parse_doctype(&mut self) {
    // Skip the nine bytes of `<!doctype` that the caller matched.
    self.input.advance(9);
    self.input.skip_whitespace();
    let name = self.parse_tag_name();
    self.input.skip_whitespace();

    let (public_id, system_id) = if self.input.looking_at_ci(b"system") {
        self.input.advance(6);
        self.input.skip_whitespace();
        let system = self.try_parse_quoted_value();
        self.input.skip_whitespace();
        (None, system)
    } else if self.input.looking_at_ci(b"public") {
        self.input.advance(6);
        self.input.skip_whitespace();
        let public = self.try_parse_quoted_value();
        self.input.skip_whitespace();
        let system = self.try_parse_quoted_value();
        self.input.skip_whitespace();
        (public, system)
    } else {
        (None, None)
    };

    // Drop the remainder of the declaration, including the closing `>`.
    self.skip_to_gt();

    let doctype_id = self.doc.create_node(NodeKind::DocumentType {
        name,
        system_id,
        public_id,
        internal_subset: None,
    });
    let root = self.doc.root();
    self.doc.append_child(root, doctype_id);
}
/// Parses a start tag at the current `<` and inserts the element into the tree.
///
/// The tag name is lowercased. Unless `no_implied` is set, `html`, `head`
/// and `body` tags are merged into the single implied element of that name,
/// and other elements get an implied `head` or `body` parent as needed.
/// Void and explicitly self-closed elements are not pushed on the
/// open-element stack; `script`/`style` content is read verbatim right away.
// The function is one long sequence of order-dependent steps, hence the allow.
#[allow(clippy::too_many_lines)]
fn parse_start_tag(&mut self) {
// Consume `<`, then register one more nesting level with the input reader;
// if that fails the error becomes the parse's fatal error.
self.input.advance(1); if let Err(e) = self.input.increment_depth() {
self.fatal_error = Some(e);
return;
}
let tag = self.parse_tag_name();
if tag.is_empty() {
self.push_warning("empty tag name".to_string());
self.input.decrement_depth();
self.skip_to_gt();
return;
}
let lower_tag = tag.to_ascii_lowercase();
let attributes = self.parse_attributes();
self.input.skip_whitespace();
// A `/` before `>` marks the element as self-closed, whatever its name.
let explicit_self_close = self.input.peek() == Some(b'/');
if explicit_self_close {
self.input.advance(1);
}
if self.input.peek() == Some(b'>') {
self.input.advance(1);
} else if !self.input.at_end() {
self.push_warning(format!("expected '>' after tag <{lower_tag}>"));
self.skip_to_gt();
}
if !self.options.no_implied {
// Structural tags never create a second element: their attributes are
// merged into the existing (or freshly implied) one, and the depth
// taken above is released again before returning.
if lower_tag == "html" {
let html_id = self.ensure_html();
self.merge_attributes(html_id, attributes);
if !self.open_elements.iter().any(|(_, t)| t == "html") {
self.open_elements.push((html_id, "html".to_string()));
}
self.input.decrement_depth();
return;
}
if lower_tag == "head" {
let head_id = self.ensure_head();
self.merge_attributes(head_id, attributes);
if !self.open_elements.iter().any(|(_, t)| t == "head") {
self.open_elements.push((head_id, "head".to_string()));
}
self.input.decrement_depth();
return;
}
if lower_tag == "body" && !self.is_in_frameset() {
self.close_head_if_open();
let body_id = self.ensure_body();
self.merge_attributes(body_id, attributes);
if !self.open_elements.iter().any(|(_, t)| t == "body") {
self.open_elements.push((body_id, "body".to_string()));
}
self.input.decrement_depth();
return;
}
}
// Pop open elements that this start tag implicitly closes.
self.handle_auto_close(&lower_tag);
if !self.options.no_implied {
// Choose the implied container for the new element.
if lower_tag == "frameset" {
self.close_head_if_open();
self.ensure_html();
} else if is_head_content_element(&lower_tag) && !self.is_in_body() {
let head_id = self.ensure_head();
if !self.open_elements.iter().any(|(_, t)| t == "head") {
self.open_elements.push((head_id, "head".to_string()));
}
} else if self.is_in_frameset() {
// Inside a frameset no body is implied.
} else {
self.close_head_if_open();
self.ensure_body();
}
}
let parent = self.current_parent();
// Capture the first `id` attribute before `attributes` is moved into the node.
let id_value = attributes.iter().find_map(|a| {
if a.name == "id" {
Some(a.value.clone())
} else {
None
}
});
let elem_id = self.doc.create_node(NodeKind::Element {
name: lower_tag.clone(),
prefix: None,
namespace: None,
attributes,
});
self.doc.append_child(parent, elem_id);
if let Some(id_val) = id_value {
self.doc.set_id(&id_val, elem_id);
}
// Elements without content are complete already: release the depth and
// do not push them on the stack.
if is_void_element(&lower_tag) || explicit_self_close {
self.input.decrement_depth();
return;
}
if is_raw_text_element(&lower_tag) {
// The element is on the stack only while its raw content is read, so
// the text node is attached to it.
self.open_elements.push((elem_id, lower_tag.clone()));
self.parse_raw_text(&lower_tag);
self.open_elements.pop();
self.input.decrement_depth();
return;
}
// Ordinary element: stays open (and keeps its depth) until it is closed.
self.open_elements.push((elem_id, lower_tag));
}
/// Adds each attribute in `attrs` to element `elem_id` unless the element
/// already has an attribute of that name; existing values are never
/// overwritten. Does nothing if `elem_id` is not an element node.
fn merge_attributes(&mut self, elem_id: NodeId, attrs: Vec<Attribute>) {
    if attrs.is_empty() {
        return;
    }
    let NodeKind::Element {
        attributes: existing,
        ..
    } = &mut self.doc.node_mut(elem_id).kind
    else {
        return;
    };
    for incoming in attrs {
        let already_present = existing.iter().any(|a| a.name == incoming.name);
        if !already_present {
            existing.push(incoming);
        }
    }
}
/// Pops `head` off the open-element stack when it is the innermost open element.
fn close_head_if_open(&mut self) {
    let head_on_top = matches!(self.open_elements.last(), Some((_, tag)) if tag == "head");
    if head_on_top {
        self.open_elements.pop();
    }
}
/// Reports whether a `body` element is anywhere on the open-element stack.
fn is_in_body(&self) -> bool {
    self.open_elements.iter().any(|entry| entry.1 == "body")
}
/// Reports whether a `frameset` element is anywhere on the open-element stack.
fn is_in_frameset(&self) -> bool {
    self.open_elements.iter().any(|entry| entry.1 == "frameset")
}
/// Pops open elements, innermost first, for as long as the innermost one is
/// implicitly closed by a start tag named `new_tag`, releasing one depth
/// level per popped element.
fn handle_auto_close(&mut self, new_tag: &str) {
    while self
        .open_elements
        .last()
        .is_some_and(|(_, open_tag)| auto_closes(open_tag, new_tag))
    {
        self.open_elements.pop();
        self.input.decrement_depth();
    }
}
/// Parses an end tag at the current `</` and closes the matching element.
///
/// End tags for void elements and end tags with no matching open element
/// are ignored with a warning. Otherwise the innermost open element with
/// that name is closed together with everything opened inside it; each of
/// those inner elements is reported as implicitly closed.
fn parse_end_tag(&mut self) {
    // Skip `</`.
    self.input.advance(2);
    let lower_tag = self.parse_tag_name().to_ascii_lowercase();
    self.input.skip_whitespace();

    let closed_properly = self.input.peek() == Some(b'>');
    if closed_properly {
        self.input.advance(1);
    } else if !self.input.at_end() {
        self.push_warning(format!("expected '>' after end tag </{lower_tag}>"));
        self.skip_to_gt();
    }

    if is_void_element(&lower_tag) {
        self.push_warning(format!("end tag for void element </{lower_tag}> ignored"));
        return;
    }

    let Some(idx) = self
        .open_elements
        .iter()
        .rposition(|(_, name)| *name == lower_tag)
    else {
        self.push_warning(format!("stray end tag </{lower_tag}>"));
        return;
    };

    // Everything above `idx` is closed implicitly; report innermost first.
    let implicit: Vec<String> = self.open_elements[idx..]
        .iter()
        .rev()
        .filter(|(_, name)| *name != lower_tag)
        .map(|(_, name)| format!("implicitly closing <{name}> before </{lower_tag}>"))
        .collect();
    for message in implicit {
        self.push_warning(message);
    }

    let closed = self.open_elements.len() - idx;
    for _ in 0..closed {
        self.input.decrement_depth();
    }
    self.open_elements.truncate(idx);
}
/// Parses the attribute list of a start tag, stopping (without consuming)
/// at `>`, `/` or end of input.
///
/// Attribute names are lowercased. An attribute without `=` gets its own
/// lowercased name as value. When a name occurs more than once only the
/// first occurrence is kept and a warning is recorded, so the element never
/// carries two attributes of the same name (the same first-wins rule that
/// `merge_attributes` applies).
fn parse_attributes(&mut self) -> Vec<Attribute> {
    let mut attributes: Vec<Attribute> = Vec::new();
    loop {
        self.input.skip_whitespace();
        if self.input.at_end() || matches!(self.input.peek(), Some(b'>' | b'/')) {
            break;
        }
        let name = self.parse_attr_name();
        if name.is_empty() {
            // The current byte cannot start a name (e.g. a stray quote or
            // `=`); skip it so the loop always makes progress.
            self.input.advance(1);
            continue;
        }
        let lower_name = name.to_ascii_lowercase();
        self.input.skip_whitespace();
        let value = if self.input.peek() == Some(b'=') {
            self.input.advance(1);
            self.input.skip_whitespace();
            self.parse_attr_value()
        } else {
            lower_name.clone()
        };
        // The value is parsed even for a duplicate so the input position
        // moves past it; only the first occurrence is stored.
        if attributes.iter().any(|a| a.name == lower_name) {
            self.push_warning(format!("duplicate attribute {lower_name} ignored"));
            continue;
        }
        attributes.push(Attribute {
            name: lower_name,
            value,
            prefix: None,
            namespace: None,
            raw_value: None,
        });
    }
    attributes
}
/// Reads an attribute name: every byte up to (not including) the next
/// whitespace, `=`, `>`, `/`, `<` or quote character. May return an empty
/// string. Invalid UTF-8 is replaced lossily.
fn parse_attr_name(&mut self) -> String {
    let begin = self.input.pos();
    while let Some(byte) = self.input.peek() {
        let ends_name = matches!(
            byte,
            b' ' | b'\t' | b'\r' | b'\n' | b'=' | b'>' | b'/' | b'<' | b'"' | b'\''
        );
        if ends_name {
            break;
        }
        self.input.advance(1);
    }
    String::from_utf8_lossy(self.input.slice(begin, self.input.pos())).into_owned()
}
/// Reads an attribute value starting at the current position.
///
/// A value opening with `"` or `'` runs to the matching quote (or end of
/// input). An unquoted value runs to the next whitespace, `>`, `<`,
/// backtick or quote; in it, a backslash is copied together with the
/// character that follows it. Character references are resolved in both forms.
fn parse_attr_value(&mut self) -> String {
    if self.input.at_end() {
        return String::new();
    }
    let mut value = String::new();
    match self.input.peek() {
        Some(quote @ (b'"' | b'\'')) => {
            // Skip the opening quote.
            self.input.advance(1);
            while !self.input.at_end() {
                match self.input.peek() {
                    Some(b) if b == quote => {
                        self.input.advance(1);
                        break;
                    }
                    Some(b'&') => value.push_str(&self.parse_html_reference()),
                    _ => value.push(self.next_char_html()),
                }
            }
        }
        _ => {
            while let Some(byte) = self.input.peek() {
                match byte {
                    b' ' | b'\t' | b'\r' | b'\n' | b'>' | b'<' | b'`' | b'"' | b'\'' => break,
                    b'\\' => {
                        // Keep the backslash and whatever follows it, even
                        // if that is otherwise a terminator.
                        value.push(self.next_char_html());
                        if !self.input.at_end() {
                            value.push(self.next_char_html());
                        }
                    }
                    b'&' => value.push_str(&self.parse_html_reference()),
                    _ => value.push(self.next_char_html()),
                }
            }
        }
    }
    value
}
/// Reads character data up to the next `<` (or end of input), resolving
/// character references, and appends it as a text node.
///
/// Whitespace-only text is dropped when `no_blanks` is set, and also when
/// no element is open and implied structure is enabled. Other text met with
/// no open element gets an implied `<body>` first.
fn parse_text(&mut self) {
    let mut text = String::new();
    while !self.input.at_end() {
        match self.input.peek() {
            Some(b'<') => break,
            Some(b'&') => text.push_str(&self.parse_html_reference()),
            _ => text.push(self.next_char_html()),
        }
    }
    if text.is_empty() {
        return;
    }

    let blank = text.chars().all(char::is_whitespace);
    if self.options.no_blanks && blank {
        return;
    }
    let needs_container = !self.options.no_implied && self.open_elements.is_empty();
    if needs_container {
        if blank {
            return;
        }
        self.ensure_body();
    }

    let parent = self.current_parent();
    let text_id = self.doc.create_node(NodeKind::Text { content: text });
    self.doc.append_child(parent, text_id);
}
/// Reads the verbatim content of a raw-text element (`script`/`style`) named
/// `tag` and appends it as a single text node to the current parent, then
/// consumes the element's end tag.
///
/// The content ends at `</tag` (case-insensitive) only when the name is
/// followed by whitespace, `/`, `>` or end of input. A longer name such as
/// `</scripted>` inside a script is therefore kept as content instead of
/// ending the element early. Anything between the end-tag name and its `>`
/// is skipped with a warning, as for ordinary end tags.
fn parse_raw_text(&mut self, tag: &str) {
    let end_tag = format!("</{tag}").into_bytes();
    let mut content = String::new();
    while !self.input.at_end() {
        if self.input.looking_at_ci(&end_tag) {
            // Require a delimiter after the name so that a mere prefix
            // match does not terminate the element.
            let name_ends = match self.input.peek_at(end_tag.len()) {
                None => true,
                Some(b) => b.is_ascii_whitespace() || b == b'/' || b == b'>',
            };
            if name_ends {
                break;
            }
        }
        content.push(self.next_char_html());
    }
    if !content.is_empty() {
        let parent = self.current_parent();
        let text_id = self.doc.create_node(NodeKind::Text { content });
        self.doc.append_child(parent, text_id);
    }
    // The loop stops before end of input only when it is at the end tag.
    if !self.input.at_end() {
        self.input.advance(end_tag.len());
        self.input.skip_whitespace();
        if self.input.peek() == Some(b'>') {
            self.input.advance(1);
        } else if !self.input.at_end() {
            self.push_warning(format!("expected '>' after end tag </{tag}>"));
            self.skip_to_gt();
        }
    }
}
/// Parses a comment starting at `<!--` and appends a comment node to the
/// current parent.
///
/// `<!-->` and `<!--->` produce an empty comment. Otherwise the comment runs
/// to `-->` or `--!>`. A comment that is still open at end of input is
/// reported with a warning and no node is created for it.
fn parse_comment(&mut self) {
    // Skip `<!--`.
    self.input.advance(4);

    // Abruptly closed forms: `>` or `->` right after the opener.
    let abrupt_close = if self.input.peek() == Some(b'>') {
        Some(1)
    } else if self.input.looking_at(b"->") {
        Some(2)
    } else {
        None
    };

    let content = if let Some(len) = abrupt_close {
        self.input.advance(len);
        String::new()
    } else {
        let mut body = String::new();
        loop {
            if self.input.at_end() {
                self.push_warning("unterminated comment".to_string());
                return;
            }
            if self.input.looking_at(b"-->") {
                self.input.advance(3);
                break;
            }
            if self.input.looking_at(b"--!>") {
                self.input.advance(4);
                break;
            }
            body.push(self.next_char_html());
        }
        body
    };

    let parent = self.current_parent();
    let comment_id = self.doc.create_node(NodeKind::Comment { content });
    self.doc.append_child(parent, comment_id);
}
/// Parses a `<?target data>` construct and appends a processing-instruction
/// node to the current parent.
///
/// The data is everything after the target and following whitespace up to
/// the first `>`, so a trailing `?` stays part of the data. Empty data is
/// stored as `None`. The node is created even if end of input is reached
/// first, in which case a warning is recorded.
fn parse_processing_instruction(&mut self) {
    // Skip `<?`.
    self.input.advance(2);
    let target = self.parse_tag_name();
    self.input.skip_whitespace();

    let mut data = String::new();
    let mut closed = false;
    while !self.input.at_end() {
        if self.input.peek() == Some(b'>') {
            self.input.advance(1);
            closed = true;
            break;
        }
        data.push(self.next_char_html());
    }
    if !closed {
        self.push_warning("unterminated processing instruction".to_string());
    }

    let data = (!data.is_empty()).then_some(data);
    let parent = self.current_parent();
    let pi_id = self
        .doc
        .create_node(NodeKind::ProcessingInstruction { target, data });
    self.doc.append_child(parent, pi_id);
}
/// Resolves a character reference starting at the current `&` and returns
/// its replacement text.
///
/// Handles hexadecimal (`&#x..;`), decimal (`&#..;`) and named references.
/// Whenever the reference cannot be resolved the input is rewound so that
/// only the `&` itself is consumed and `"&"` is returned, leaving the rest
/// to be read as ordinary text. The one exception is an unknown named
/// reference that ends in `;`, which is consumed and returned verbatim.
fn parse_html_reference(&mut self) -> String {
// Remember where the `&` is so a failed attempt can be rolled back.
let saved = self.input.save_position();
self.input.advance(1);
if self.input.peek() == Some(b'#') {
self.input.advance(1);
if self.input.peek() == Some(b'x') || self.input.peek() == Some(b'X') {
self.input.advance(1);
let hex = self.take_while_ascii(|b| b.is_ascii_hexdigit());
// A numeric reference counts only with digits AND a closing `;`.
if !hex.is_empty() && self.input.peek() == Some(b';') {
self.input.advance(1);
if let Ok(value) = u32::from_str_radix(&hex, 16) {
if let Some(ch) = char::from_u32(value) {
return ch.to_string();
}
}
}
// Not a usable hex reference (missing digits/`;`, overflow, or not a
// valid `char`): rewind and emit the `&` literally.
self.input.restore_position(saved);
self.input.advance(1);
return "&".to_string();
}
let dec = self.take_while_ascii(|b| b.is_ascii_digit());
if !dec.is_empty() && self.input.peek() == Some(b';') {
self.input.advance(1);
if let Ok(value) = dec.parse::<u32>() {
if let Some(ch) = char::from_u32(value) {
return ch.to_string();
}
}
}
// Same fallback as for the hexadecimal form.
self.input.restore_position(saved);
self.input.advance(1);
return "&".to_string();
}
let name = self.take_while_ascii(|b| b.is_ascii_alphanumeric());
if !name.is_empty() {
if self.input.peek() == Some(b';') {
self.input.advance(1);
if let Some(value) = entities::lookup_entity(&name) {
return value.to_string();
}
// Unknown name with `;`: keep the reference text as written.
self.push_warning(format!("unknown entity reference: &{name};"));
return format!("&{name};");
}
// No `;`: still accepted when the whole name is a known entity.
if let Some(value) = entities::lookup_entity(&name) {
self.push_warning(format!("entity reference &{name} missing semicolon"));
return value.to_string();
}
}
// Bare `&`, or a name that is not a known entity: rewind and emit `&`.
self.input.restore_position(saved);
self.input.advance(1);
"&".to_string()
}
/// Reads a tag-like name: the longest run of ASCII letters, digits, `-`,
/// `_`, `:` and `.` at the current position. May return an empty string;
/// the case of the input is preserved.
fn parse_tag_name(&mut self) -> String {
    self.take_while_ascii(|b| {
        b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b':' | b'.')
    })
}
/// Reads a `"..."` or `'...'` literal at the current position.
///
/// Returns `None` without consuming anything when the next byte is not a
/// quote. Otherwise returns the text between the quotes (no reference
/// processing) and consumes the closing quote if it is present; an
/// unterminated literal runs to end of input.
fn try_parse_quoted_value(&mut self) -> Option<String> {
    let quote = match self.input.peek() {
        Some(q @ (b'"' | b'\'')) => q,
        _ => return None,
    };
    self.input.advance(1);

    let begin = self.input.pos();
    while !self.input.at_end() {
        if self.input.peek() == Some(quote) {
            break;
        }
        self.input.advance(1);
    }
    let value = String::from_utf8_lossy(self.input.slice(begin, self.input.pos())).into_owned();

    // Step over the closing quote unless the input ended first.
    if !self.input.at_end() {
        self.input.advance(1);
    }
    Some(value)
}
/// Consumes and returns the next character of the input.
///
/// Line endings are normalised: `\r\n` and a lone `\r` both yield `\n`.
/// When no character can be decoded at the current position, one byte is
/// skipped and U+FFFD is returned. At end of input `'\0'` is returned and
/// nothing is consumed.
fn next_char_html(&mut self) -> char {
    if self.input.at_end() {
        return '\0';
    }
    let Some(ch) = self.input.peek_char() else {
        self.input.advance(1);
        return '\u{FFFD}';
    };
    self.input.advance_char(ch);
    if ch != '\r' {
        return ch;
    }
    // Swallow the `\n` of a `\r\n` pair so it is reported only once.
    if self.input.peek() == Some(b'\n') {
        self.input.advance(1);
    }
    '\n'
}
/// Handles a `<![`... construct: records a warning and discards the input
/// through the next `>`.
fn skip_conditional_comment(&mut self) {
    let message = String::from("incorrectly opened comment");
    self.push_warning(message);
    self.skip_to_gt();
}
/// Discards input up to and including the next `>`, or to end of input if
/// there is none.
fn skip_to_gt(&mut self) {
    while !self.input.at_end() {
        let at_gt = self.input.peek() == Some(b'>');
        self.input.advance(1);
        if at_gt {
            break;
        }
    }
}
/// Consumes bytes for as long as `pred` accepts them and returns the
/// consumed run as a string (possibly empty). Invalid UTF-8 is replaced lossily.
fn take_while_ascii(&mut self, pred: impl Fn(u8) -> bool) -> String {
    let begin = self.input.pos();
    while matches!(self.input.peek(), Some(byte) if pred(byte)) {
        self.input.advance(1);
    }
    String::from_utf8_lossy(self.input.slice(begin, self.input.pos())).into_owned()
}
/// Records `message` as a warning diagnostic at the current input location,
/// unless warnings are disabled through the `no_warnings` option.
fn push_warning(&mut self, message: String) {
    if !self.options.no_warnings {
        let location = self.input.location();
        let diagnostic = ParseDiagnostic {
            severity: ErrorSeverity::Warning,
            message,
            location,
        };
        self.doc.diagnostics.push(diagnostic);
    }
}
}
// Unit tests for the HTML parser. Most tests disable implied structure so
// the parsed elements hang directly off the document root.
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
// Parses with default options, panicking on a parse error.
fn parse(input: &str) -> Document {
parse_html(input).unwrap_or_else(|e| panic!("parse failed: {e}"))
}
// Parses with `no_implied` set, panicking on a parse error.
fn parse_no_implied(input: &str) -> Document {
let opts = HtmlParseOptions::default().no_implied(true);
parse_html_with_options(input, &opts).unwrap_or_else(|e| panic!("parse failed: {e}"))
}
// A complete document yields `html` as the root element.
#[test]
fn test_parse_simple_html() {
let doc = parse("<html><body><p>Hello</p></body></html>");
let html = doc.root_element().unwrap();
assert_eq!(doc.node_name(html), Some("html"));
}
// A bare fragment gets implied `html` and `body` elements.
#[test]
fn test_parse_implied_structure() {
let doc = parse("<p>Hello</p>");
let html = doc.root_element().unwrap();
assert_eq!(doc.node_name(html), Some("html"));
let children: Vec<_> = doc.children(html).collect();
assert!(!children.is_empty());
let body = children.last().unwrap();
assert_eq!(doc.node_name(*body), Some("body"));
}
// With `no_implied` the fragment's element is the first child of the root.
#[test]
fn test_parse_no_implied_option() {
let doc = parse_no_implied("<p>Hello</p>");
let root = doc.root();
let first = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(first), Some("p"));
assert_eq!(doc.text_content(first), "Hello");
}
// `<br>` has no children and does not swallow the text after it.
#[test]
fn test_void_elements() {
let doc = parse_no_implied("<p>line1<br>line2</p>");
let root = doc.root();
let p = doc.first_child(root).unwrap();
let children: Vec<_> = doc.children(p).collect();
assert_eq!(children.len(), 3); assert_eq!(doc.node_name(children[1]), Some("br"));
assert!(doc.first_child(children[1]).is_none()); }
// An end tag for a void element is ignored rather than creating a node.
#[test]
fn test_void_element_with_end_tag() {
let doc = parse_no_implied("<p>text<br></br>more</p>");
let root = doc.root();
let p = doc.first_child(root).unwrap();
let children: Vec<_> = doc.children(p).collect();
assert_eq!(children.len(), 3); }
// Attributes on a void element are kept.
#[test]
fn test_img_void_element() {
let doc = parse_no_implied("<img src=\"test.jpg\" alt=\"test\">");
let root = doc.root();
let img = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(img), Some("img"));
assert_eq!(doc.attribute(img, "src"), Some("test.jpg"));
assert_eq!(doc.attribute(img, "alt"), Some("test"));
}
// Upper-case tag names are stored in lowercase.
#[test]
fn test_case_insensitive_tags() {
let doc = parse_no_implied("<DIV><P>Hello</P></DIV>");
let root = doc.root();
let div = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(div), Some("div"));
let p = doc.first_child(div).unwrap();
assert_eq!(doc.node_name(p), Some("p"));
assert_eq!(doc.text_content(p), "Hello");
}
// Start and end tags may differ in case.
#[test]
fn test_mixed_case_tags() {
let doc = parse_no_implied("<Div><SPAN>Hi</span></div>");
let root = doc.root();
let div = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(div), Some("div"));
}
// Unquoted attribute values end at whitespace or `>`.
#[test]
fn test_unquoted_attributes() {
let doc = parse_no_implied("<div class=main id=content>text</div>");
let root = doc.root();
let div = doc.first_child(root).unwrap();
assert_eq!(doc.attribute(div, "class"), Some("main"));
assert_eq!(doc.attribute(div, "id"), Some("content"));
}
// An attribute without a value gets its own name as value.
#[test]
fn test_boolean_attributes() {
let doc = parse_no_implied("<input disabled readonly>");
let root = doc.root();
let input = doc.first_child(root).unwrap();
assert_eq!(doc.attribute(input, "disabled"), Some("disabled"));
assert_eq!(doc.attribute(input, "readonly"), Some("readonly"));
}
// A second `<p>` closes the first, giving two siblings.
#[test]
fn test_p_auto_closes_p() {
let doc = parse_no_implied("<p>First<p>Second");
let root = doc.root();
let children: Vec<_> = doc.children(root).collect();
assert_eq!(children.len(), 2);
assert_eq!(doc.node_name(children[0]), Some("p"));
assert_eq!(doc.text_content(children[0]), "First");
assert_eq!(doc.node_name(children[1]), Some("p"));
assert_eq!(doc.text_content(children[1]), "Second");
}
// Each `<li>` closes the previous one.
#[test]
fn test_li_auto_closes_li() {
let doc = parse_no_implied("<ul><li>A<li>B<li>C</ul>");
let root = doc.root();
let ul = doc.first_child(root).unwrap();
let items: Vec<_> = doc.children(ul).collect();
assert_eq!(items.len(), 3);
assert_eq!(doc.text_content(items[0]), "A");
assert_eq!(doc.text_content(items[1]), "B");
assert_eq!(doc.text_content(items[2]), "C");
}
// Alternating `<dt>`/`<dd>` close each other and stay siblings.
#[test]
fn test_dd_dt_auto_close() {
let doc = parse_no_implied("<dl><dt>Term<dd>Def<dt>Term2<dd>Def2</dl>");
let root = doc.root();
let dl = doc.first_child(root).unwrap();
let items: Vec<_> = doc.children(dl).collect();
assert_eq!(items.len(), 4);
assert_eq!(doc.node_name(items[0]), Some("dt"));
assert_eq!(doc.node_name(items[1]), Some("dd"));
assert_eq!(doc.node_name(items[2]), Some("dt"));
assert_eq!(doc.node_name(items[3]), Some("dd"));
}
// The copyright sign and em dash must appear in the paragraph's text content.
#[test]
fn test_html_entities() {
let doc = parse_no_implied("<p>© 2024 — All rights reserved</p>");
let root = doc.root();
let p = doc.first_child(root).unwrap();
let text = doc.text_content(p);
assert!(text.contains('\u{00A9}')); assert!(text.contains('\u{2014}')); }
// A lone `&` that starts no reference is kept as literal text.
#[test]
fn test_bare_ampersand() {
let doc = parse_no_implied("<p>A & B</p>");
let root = doc.root();
let p = doc.first_child(root).unwrap();
assert_eq!(doc.text_content(p), "A & B");
}
// The paragraph's text content must come out as `A B`.
#[test]
fn test_numeric_character_reference() {
let doc = parse_no_implied("<p>A B</p>");
let root = doc.root();
let p = doc.first_child(root).unwrap();
assert_eq!(doc.text_content(p), "A B");
}
// Elements left open at end of input still form the expected tree.
#[test]
fn test_missing_closing_tags() {
let doc = parse_no_implied("<div><p>Hello");
let root = doc.root();
let div = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(div), Some("div"));
let p = doc.first_child(div).unwrap();
assert_eq!(doc.node_name(p), Some("p"));
assert_eq!(doc.text_content(p), "Hello");
}
// A comment becomes a node whose text is the content between the delimiters.
#[test]
fn test_html_comment() {
let doc = parse_no_implied("<!-- hello --><p>text</p>");
let root = doc.root();
let first = doc.first_child(root).unwrap();
assert_eq!(doc.node_text(first), Some(" hello "));
}
// `<` and `&` inside `<script>` are not treated as markup.
#[test]
fn test_script_raw_text() {
let doc = parse_no_implied("<script>var x = 1 < 2 && true;</script>");
let root = doc.root();
let script = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(script), Some("script"));
assert_eq!(doc.text_content(script), "var x = 1 < 2 && true;");
}
// `>` inside `<style>` is plain content.
#[test]
fn test_style_raw_text() {
let doc = parse_no_implied("<style>p > span { color: red; }</style>");
let root = doc.root();
let style = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(style), Some("style"));
assert_eq!(doc.text_content(style), "p > span { color: red; }");
}
// A leading DOCTYPE becomes the first child of the root.
#[test]
fn test_html_doctype() {
let doc = parse_no_implied("<!DOCTYPE html><p>text</p>");
let root = doc.root();
let children: Vec<_> = doc.children(root).collect();
assert!(children.len() >= 2);
match &doc.node(children[0]).kind {
NodeKind::DocumentType { name, .. } => {
assert_eq!(name, "html");
}
other => panic!("expected DocumentType, got {other:?}"),
}
}
// Single quotes delimit attribute values just like double quotes.
#[test]
fn test_single_quoted_attributes() {
let doc = parse_no_implied("<div class='main'>text</div>");
let root = doc.root();
let div = doc.first_child(root).unwrap();
assert_eq!(doc.attribute(div, "class"), Some("main"));
}
// `<?target ...>` produces a node named after the target.
#[test]
fn test_html_processing_instruction() {
let doc = parse_no_implied("<?xml-stylesheet type=\"text/css\"?><p>text</p>");
let root = doc.root();
let first = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(first), Some("xml-stylesheet"));
}
// Attribute names are stored in lowercase.
#[test]
fn test_attribute_case_normalized() {
let doc = parse_no_implied("<div CLASS=\"main\" ID=\"1\">text</div>");
let root = doc.root();
let div = doc.first_child(root).unwrap();
assert_eq!(doc.attribute(div, "class"), Some("main"));
assert_eq!(doc.attribute(div, "id"), Some("1"));
}
// With `no_blanks`, whitespace-only text nodes are dropped.
#[test]
fn test_no_blanks_option() {
let opts = HtmlParseOptions::default().no_blanks(true).no_implied(true);
let doc = parse_html_with_options("<div> \n <p>text</p> \n </div>", &opts).unwrap();
let root = doc.root();
let div = doc.first_child(root).unwrap();
let children: Vec<_> = doc.children(div).collect();
assert_eq!(children.len(), 1);
assert_eq!(doc.node_name(children[0]), Some("p"));
}
// An end tag with no matching open element is ignored but reported.
#[test]
fn test_stray_end_tag() {
let doc = parse_no_implied("</div><p>text</p>");
let root = doc.root();
let p = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(p), Some("p"));
assert_eq!(doc.text_content(p), "text");
assert!(!doc.diagnostics.is_empty());
}
// Smoke test: a realistic document parses and has `html` as root element.
#[test]
fn test_complex_html_document() {
let doc = parse(
r#"<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<h1>Hello</h1>
<p>A paragraph with <b>bold</b> and <em>emphasis</em>.</p>
<ul>
<li>Item 1
<li>Item 2
<li>Item 3
</ul>
<img src="test.jpg">
</body>
</html>"#,
);
let html = doc.root_element().unwrap();
assert_eq!(doc.node_name(html), Some("html"));
}
// XML-style `<br/>` is accepted and yields a childless element.
#[test]
fn test_self_closing_syntax() {
let doc = parse_no_implied("<br/>");
let root = doc.root();
let br = doc.first_child(root).unwrap();
assert_eq!(doc.node_name(br), Some("br"));
assert!(doc.first_child(br).is_none());
}
// An ampersand inside a quoted attribute value ends up as `&` in the value.
#[test]
fn test_entity_in_attribute() {
let doc = parse_no_implied("<a href=\"page?a=1&b=2\">link</a>");
let root = doc.root();
let a = doc.first_child(root).unwrap();
assert_eq!(doc.attribute(a, "href"), Some("page?a=1&b=2"));
}
}