use super::{
element::Element,
iter::{select, Select},
node::{Node, NodeData, NodeId},
parser,
selectors::ParseError,
serializer::serialize_to,
InliningMode,
};
use crate::{html::DocumentStyleMap, InlineError};
use html5ever::{local_name, tendril::StrTendril, LocalName};
use rustc_hash::FxHashMap;
use selectors::context::SelectorCaches;
use std::{fmt, fmt::Formatter, io::Write, iter::successors};
/// Minimum input size (bytes) to build element indexes.
/// For smaller inputs, linear scan is faster than hash map operations.
const INDEX_SIZE_THRESHOLD: usize = 1024;
/// HTML document representation.
///
/// A `Document` holds a collection of nodes, with each node representing an HTML element.
/// Nodes are interconnected to form a tree-like structure, mimicking the HTML DOM tree.
/// The struct also keeps track of nodes that contain CSS styles or refer to CSS stylesheets.
///
/// Here is an example of how nodes within a `Document` could be arranged:
///
/// ```text
/// Document
/// ↓
/// [N1]
/// / \
/// / \
/// / \
/// / \
/// / \
/// [N2]<->[N3]<->[N4]
/// / \ / \
/// / \ / \
/// [N5]<->[N6] [N7]<->[N8]
/// ```
///
/// Each Node within the `Document` is interconnected with its siblings, and has a parent-child
/// relationship with its descendants.
pub(crate) struct Document {
pub(crate) nodes: Vec<Node>,
/// Ids of Element nodes & caches for their nth index selectors.
pub(crate) elements: Vec<NodeId>,
/// Ids of `style` nodes.
styles: Vec<NodeId>,
/// Ids of `link` nodes, specifically those with the `rel` attribute value set as `stylesheet`.
/// They represent the locations (URLs) of all linked stylesheet resources in the document.
linked_stylesheets: Vec<NodeId>,
/// Whether element indexes are enabled for this document.
use_indexes: bool,
/// Index: ID attribute value -> `NodeId` (IDs should be unique per document).
by_id: FxHashMap<LocalName, NodeId>,
/// Index: class name -> list of `NodeId`s with that class.
by_class: FxHashMap<LocalName, Vec<NodeId>>,
/// Index: tag name -> list of `NodeId`s with that tag.
by_tag: FxHashMap<LocalName, Vec<NodeId>>,
}
impl fmt::Debug for Document {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.debug_struct("Document")
.field("nodes", &self.nodes)
.field("styles", &self.styles)
.field("linked_stylesheets", &self.linked_stylesheets)
.finish_non_exhaustive()?;
Ok(())
}
}
impl Document {
pub(crate) fn parse_with_options(
bytes: &[u8],
preallocate_node_capacity: usize,
mode: InliningMode,
) -> Document {
parser::parse_with_options(bytes, preallocate_node_capacity, mode)
}
pub(super) fn with_capacity(capacity: usize, input_size: usize) -> Self {
// Dummy node at index 0 so that other indices fit in NonZero
let mut nodes = vec![Node::new(NodeData::Document), Node::new(NodeData::Document)];
// Usually there are a lot of nodes, hence, reserve some space for them
nodes.reserve(capacity);
// Only build indexes for larger documents where the lookup benefit outweighs overhead
let use_indexes = input_size >= INDEX_SIZE_THRESHOLD;
Document {
nodes,
elements: Vec::with_capacity(capacity),
styles: Vec::new(),
linked_stylesheets: Vec::new(),
use_indexes,
by_id: FxHashMap::default(),
by_class: FxHashMap::default(),
by_tag: FxHashMap::default(),
}
}
#[inline]
pub(super) fn as_element(&self, node_id: NodeId) -> Option<Element<'_>> {
if let NodeData::Element { element, .. } = &self[node_id].data {
Some(Element::new(self, node_id, element))
} else {
None
}
}
/// Add a new `style` element node.
pub(super) fn add_style(&mut self, node: NodeId) {
self.styles.push(node);
}
/// Iterator over blocks of CSS defined inside `style` tags.
pub(crate) fn styles(&self) -> impl Iterator<Item = (NodeId, &str)> + '_ {
self.styles.iter().filter_map(|node_id| {
self[*node_id]
.first_child
.and_then(|child_id| self[child_id].as_text().map(|text| (*node_id, text)))
})
}
/// Iterate over `href` attribute values of `link[rel~=stylesheet]` tags.
pub(crate) fn stylesheets(&self) -> impl Iterator<Item = &str> + '_ {
self.linked_stylesheets.iter().filter_map(|node_id| {
self[*node_id]
.as_element()
.and_then(|data| data.attributes.get(local_name!("href")))
})
}
/// Add a new linked stylesheet location.
pub(super) fn add_linked_stylesheet(&mut self, node: NodeId) {
self.linked_stylesheets.push(node);
}
/// Add a new node to the nodes vector, returning its index.
pub(super) fn push_node(&mut self, node: NodeData) -> NodeId {
let next_index = self.nodes.len();
self.nodes.push(Node::new(node));
NodeId::new(next_index)
}
#[inline]
pub(super) fn push_element_id(&mut self, node: NodeId) {
self.elements.push(node);
}
/// Whether element indexes are enabled for this document.
#[inline]
pub(crate) fn has_indexes(&self) -> bool {
self.use_indexes
}
/// Index an element by its tag name.
#[inline]
pub(super) fn index_by_tag(&mut self, tag: LocalName, node_id: NodeId) {
if self.use_indexes {
self.by_tag.entry(tag).or_default().push(node_id);
}
}
/// Index an element by its ID attribute value.
#[inline]
pub(super) fn index_by_id(&mut self, id: &StrTendril, node_id: NodeId) {
if self.use_indexes && !id.is_empty() {
let id_atom = LocalName::from(&**id);
// IDs should be unique; later declarations override earlier ones
self.by_id.insert(id_atom, node_id);
}
}
/// Index an element by its class names (space-separated).
pub(super) fn index_by_classes(&mut self, classes: &StrTendril, node_id: NodeId) {
if self.use_indexes {
for class in classes.split_whitespace() {
let class_atom = LocalName::from(class);
self.by_class.entry(class_atom).or_default().push(node_id);
}
}
}
/// Lookup an element by its ID attribute value.
#[inline]
pub(crate) fn get_by_id(&self, id: &LocalName) -> Option<NodeId> {
self.by_id.get(id).copied()
}
/// Lookup elements by class name.
#[inline]
pub(crate) fn get_by_class(&self, class: &LocalName) -> &[NodeId] {
self.by_class.get(class).map_or(&[], Vec::as_slice)
}
/// Lookup elements by tag name.
#[inline]
pub(crate) fn get_by_tag(&self, tag: &LocalName) -> &[NodeId] {
self.by_tag.get(tag).map_or(&[], Vec::as_slice)
}
/// Quick check if a selector's anchor exists in the document.
/// Returns `false` if we can determine the selector won't match anything.
///
/// This is a fast heuristic, not a full CSS parser. It extracts the leftmost
/// identifier from the selector and checks if it exists in our indexes.
/// False positives (returning `true` for non-matching selectors) are acceptable;
/// the full selector matching will filter those out later.
#[inline]
pub(crate) fn anchor_exists(&self, selector: &str) -> bool {
// Only use this optimization when indexes are enabled.
// For small documents, indexes aren't built so we can't do this check.
if !self.use_indexes {
return true;
}
let bytes = selector.trim_ascii_start().as_bytes();
match bytes.first() {
// Class selector: .classname
Some(b'.') => extract_identifier(&bytes[1..]).map_or(true, |name| {
self.by_class.contains_key(&LocalName::from(name))
}),
// ID selector: #id
Some(b'#') => extract_identifier(&bytes[1..])
.map_or(true, |name| self.by_id.contains_key(&LocalName::from(name))),
// Tag selector: starts with ASCII letter
Some(b'a'..=b'z' | b'A'..=b'Z') => extract_identifier(bytes).map_or(true, |name| {
self.by_tag.contains_key(&LocalName::from(name))
}),
// Universal (*), attribute ([), pseudo-class (:), empty, or unknown
_ => true,
}
}
pub(crate) fn detach_node(&mut self, node: NodeId) {
self.detach(node);
}
/// Detach a node from its siblings and its parent.
///
/// Before:
///
/// [Parent]
/// ↓
/// ... [Previous] <--> [Node] <--> [Next] ...
///
/// After:
///
/// [Parent]
/// ↓
/// ... [Previous] <--> [Next] ...
pub(super) fn detach(&mut self, node: NodeId) {
// Save references to the parent and sibling nodes of the node being detached.
let (parent, previous_sibling, next_sibling) = {
let node = &mut self[node];
(
node.parent.take(),
node.previous_sibling.take(),
node.next_sibling.take(),
)
};
if let Some(next_sibling) = next_sibling {
// Point next sibling one step back to bypass the detached node
self[next_sibling].previous_sibling = previous_sibling;
} else if let Some(parent) = parent {
// No next sibling - this node was the last child of the parent node, now the previous
// sibling becomes the last child
self[parent].last_child = previous_sibling;
}
if let Some(previous_sibling) = previous_sibling {
// Point the previous sibling one step forward to bypass the detached node
self[previous_sibling].next_sibling = next_sibling;
} else if let Some(parent) = parent {
// No previous sibling - this node was the first child of the parent node, now the next
// sibling becomes the first child
self[parent].first_child = next_sibling;
}
}
/// Remove all the children from node and append them to `new_parent`.
pub(super) fn reparent_children(&mut self, node: NodeId, new_parent: NodeId) {
// Re-read `first_child` each iteration: `append` calls `detach` internally, which clears
// `child.next_sibling` and updates `node.first_child` to the former second child.
while let Some(child) = self[node].first_child {
self.append(new_parent, child);
}
}
/// Append a new child node to a parent node.
///
/// If the parent node already has children. Before:
///
/// [Parent]
/// ↓
/// [Child1] <--> [Child2] <--> ...
///
/// After:
///
/// [Parent]
/// ↓
/// [Child1] <--> [Child2] <--> [New] ...
///
/// If the parent node has no children. Before:
///
/// [Parent]
///
/// After:
///
/// [Parent]
/// ↓
/// [New]
///
/// So, [New} becomes the first child of [Parent].
pub(super) fn append(&mut self, parent: NodeId, node: NodeId) {
// Detach `node` from its current parent (if any)
self.detach(node);
// Set `node` parent to the specified parent
self[node].parent = Some(parent);
if let Some(last_child) = self[parent].last_child.take() {
// Connect `node` with the last child (if any) by adding `node` after it
self[node].previous_sibling = Some(last_child);
self[last_child].next_sibling = Some(node);
} else {
// No last child - it becomes the first child
self[parent].first_child = Some(node);
}
// Now, `node` is the last child of the new parent
self[parent].last_child = Some(node);
}
/// Insert a new node right before a given sibling node.
///
/// In general, this method transforms this relationship:
///
/// ... <--> [Previous] <--> [Sibling] <--> [Next] --> ...
///
/// into this:
///
/// ... <--> [Previous] <--> [New] <--> [Sibling] <--> [Next] <--> ...
///
/// If [Sibling] node is the first child (i.e., no [Previous] exists), the method also updates the parent node:
///
/// Before:
///
/// [Parent]
/// ↓
/// [Sibling] <--> [Next] <--> ...
///
/// After:
///
/// [Parent]
/// ↓
/// [New] <--> [Sibling] <--> [Next] <--> ...
///
/// So, [New] becomes the first child of [Parent].
pub(super) fn insert_before(&mut self, sibling: NodeId, node: NodeId) {
// Detach `node` from its current parent (if any)
self.detach(node);
// Set `node` parent to `sibling` parent
self[node].parent = self[sibling].parent;
// As it is inserted before, then `next_sibling` should point to `sibling`
self[node].next_sibling = Some(sibling);
if let Some(previous_sibling) = self[sibling].previous_sibling.take() {
// Connect `node` with the previous sibling (if any)
self[node].previous_sibling = Some(previous_sibling);
self[previous_sibling].next_sibling = Some(node);
} else if let Some(parent) = self[sibling].parent {
// No previous sibling - then it is the first child of its parent, so the parent node
// should be updated too
self[parent].first_child = Some(node);
}
// Now `node` is the previous sibling of the `sibling` node
self[sibling].previous_sibling = Some(node);
}
/// Returns an iterator over the direct children of a node.
pub(super) fn children(&self, node: NodeId) -> impl Iterator<Item = NodeId> + '_ {
successors(self[node].first_child, |&node| self[node].next_sibling)
}
/// Serialize the document to HTML string.
#[allow(clippy::too_many_arguments, clippy::fn_params_excessive_bools)]
pub(crate) fn serialize<W: Write>(
&self,
writer: &mut W,
styles: DocumentStyleMap<'_>,
keep_style_tags: bool,
keep_link_tags: bool,
minify_css: bool,
at_rules: Option<&String>,
mode: InliningMode,
apply_width_attributes: bool,
apply_height_attributes: bool,
) -> Result<(), InlineError> {
serialize_to(
self,
writer,
styles,
keep_style_tags,
keep_link_tags,
minify_css,
at_rules,
mode,
apply_width_attributes,
apply_height_attributes,
)
}
/// Filter this node iterator to elements matching the given selectors.
pub(crate) fn select<'a, 'b, 'c>(
&'a self,
selectors: &'b str,
caches: &'c mut SelectorCaches,
) -> Result<Select<'a, 'c>, ParseError<'b>> {
select(self, selectors, caches)
}
}
impl std::ops::Index<NodeId> for Document {
type Output = Node;
#[inline]
fn index(&self, id: NodeId) -> &Node {
&self.nodes[id.get()]
}
}
impl std::ops::IndexMut<NodeId> for Document {
#[inline]
fn index_mut(&mut self, id: NodeId) -> &mut Node {
&mut self.nodes[id.get()]
}
}
/// Extract a CSS identifier (class/id/tag name) from a byte slice.
/// Returns the identifier as a `&str` if valid, `None` otherwise.
#[inline]
fn extract_identifier(bytes: &[u8]) -> Option<&str> {
// Find the end of the identifier (ASCII letters, digits, underscore, hyphen)
let end = bytes
.iter()
.position(|&b| !matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-'))
.unwrap_or(bytes.len());
if end == 0 {
return None;
}
// SAFETY: All matched bytes are ASCII (0x00-0x7F), which is valid UTF-8
Some(unsafe { std::str::from_utf8_unchecked(&bytes[..end]) })
}
#[cfg(test)]
mod tests {
use super::{super::node::ElementData, *};
use html5ever::{local_name, ns, QualName};
use test_case::test_case;
fn new_element() -> NodeData {
NodeData::Element {
element: ElementData::new(QualName::new(None, ns!(), local_name!("span")), vec![]),
inlining_ignored: false,
}
}
fn roundtrip(bytes: &[u8]) -> Vec<u8> {
let mut buffer = Vec::new();
let doc = Document::parse_with_options(bytes, 0, InliningMode::Document);
doc.serialize(
&mut buffer,
vec![None; doc.nodes.len()],
false,
false,
false,
None,
InliningMode::Document,
false,
false,
)
.expect("Failed to serialize");
buffer
}
#[test]
fn test_collect_styles() {
let doc = Document::parse_with_options(
r"
<head>
<style>h1 { color:blue; }</style>
<style>h1 { color:red }</style>
<style data-css-inline='ignore'>h1 { color:yellow; }</style>
</head>"
.as_bytes(),
0,
InliningMode::Document,
);
let styles = doc.styles().map(|(_, content)| content).collect::<Vec<_>>();
assert_eq!(styles.len(), 2);
assert_eq!(styles[0], "h1 { color:blue; }");
assert_eq!(styles[1], "h1 { color:red }");
}
#[test]
fn test_collect_stylesheets() {
let doc = Document::parse_with_options(
r"
<head>
<link href='styles1.css' rel='stylesheet' type='text/css'>
<link href='styles2.css' rel='stylesheet' type='text/css'>
<link href='' rel='stylesheet' type='text/css'>
<link href='styles3.css' rel='stylesheet' type='text/css' data-css-inline='ignore'>
</head>"
.as_bytes(),
0,
InliningMode::Document,
);
let links = doc.stylesheets().collect::<Vec<_>>();
assert_eq!(links.len(), 2);
assert_eq!(links[0], "styles1.css");
assert_eq!(links[1], "styles2.css");
}
#[test]
fn test_insert_before() {
let mut doc = Document::with_capacity(0, 0);
let node1_id = doc.push_node(new_element());
let node2_id = doc.push_node(new_element());
let new_node_id = doc.push_node(new_element());
let document_id = NodeId::document_id();
doc.append(document_id, node1_id);
doc.append(document_id, node2_id);
doc.insert_before(node2_id, new_node_id);
assert_eq!(doc[node2_id].previous_sibling, Some(new_node_id));
assert_eq!(doc[new_node_id].next_sibling, Some(node2_id));
}
#[test]
fn test_append() {
let mut doc = Document::with_capacity(0, 0);
let node1_id = doc.push_node(new_element());
let node2_id = doc.push_node(new_element());
let document_id = NodeId::document_id();
doc.append(document_id, node1_id);
doc.append(document_id, node2_id);
assert_eq!(doc[document_id].last_child, Some(node2_id));
assert_eq!(doc[node1_id].next_sibling, Some(node2_id));
assert_eq!(doc[node2_id].previous_sibling, Some(node1_id));
}
#[test_case(b"<!DOCTYPE html><html><head><title>Title of the document</title></head><body></body></html>")]
#[test_case(b"<!DOCTYPE html><html><head><title>Title of the document</title></head><body><hr></body></html>")]
fn test_roundtrip(input: &[u8]) {
assert_eq!(roundtrip(input), input);
}
#[test]
fn test_ignore_children() {
assert_eq!(roundtrip(b"<!DOCTYPE html><html><head><title>Title of the document</title></head><body><hr><hr></hr></hr></body></html>"), b"<!DOCTYPE html><html><head><title>Title of the document</title></head><body><hr><hr></body></html>");
}
#[test]
fn test_pseudo_class() {
let output = roundtrip(b"<!DOCTYPE html><html><head><title>Title of the document</title><style>h1:hover { color:blue; }</style></head><body><h1>Hello world!</h1></body></html>");
assert_eq!(output, b"<!DOCTYPE html><html><head><title>Title of the document</title></head><body><h1>Hello world!</h1></body></html>");
}
#[test]
fn test_comment() {
let output = roundtrip(b"<html><head><title>Title of the document</title></head><body><!--TTT--></body></html>");
assert_eq!(output, b"<html><head><title>Title of the document</title></head><body><!--TTT--></body></html>");
}
#[test]
fn test_debug() {
let doc =
Document::parse_with_options(b"<html><body></body></html>", 0, InliningMode::Document);
assert_eq!(format!("{doc:?}"), "Document { nodes: [Node { parent: None, next_sibling: None, previous_sibling: None, first_child: None, last_child: None, data: Document }, Node { parent: None, next_sibling: None, previous_sibling: None, first_child: Some(NodeId(2)), last_child: Some(NodeId(2)), data: Document }, Node { parent: Some(NodeId(1)), next_sibling: None, previous_sibling: None, first_child: Some(NodeId(3)), last_child: Some(NodeId(4)), data: Element { element: ElementData { name: QualName { prefix: None, ns: Atom('http://www.w3.org/1999/xhtml' type=static), local: Atom('html' type=inline) }, attributes: Attributes { attributes: [], class: None } }, inlining_ignored: false } }, Node { parent: Some(NodeId(2)), next_sibling: Some(NodeId(4)), previous_sibling: None, first_child: None, last_child: None, data: Element { element: ElementData { name: QualName { prefix: None, ns: Atom('http://www.w3.org/1999/xhtml' type=static), local: Atom('head' type=inline) }, attributes: Attributes { attributes: [], class: None } }, inlining_ignored: false } }, Node { parent: Some(NodeId(2)), next_sibling: None, previous_sibling: Some(NodeId(3)), first_child: None, last_child: None, data: Element { element: ElementData { name: QualName { prefix: None, ns: Atom('http://www.w3.org/1999/xhtml' type=static), local: Atom('body' type=inline) }, attributes: Attributes { attributes: [], class: None } }, inlining_ignored: false } }], styles: [], linked_stylesheets: [], .. }");
}
#[test]
fn test_edit_document() {
let mut doc = Document::parse_with_options(
b"<html><body><a></a><b></b></body></html>",
0,
InliningMode::Document,
);
let a_id = NodeId::new(5);
assert_eq!(
doc[a_id].as_element().expect("Element does not exist").name,
QualName::new(None, ns!(html), local_name!("a"))
);
let b_id = NodeId::new(6);
assert_eq!(
doc[b_id].as_element().expect("Element does not exist").name,
QualName::new(None, ns!(html), local_name!("b"))
);
// `a` is the previous sibling of `b`
// `b` is the next sibling of `a`
assert_eq!(doc[b_id].previous_sibling, Some(a_id));
assert_eq!(doc[a_id].next_sibling, Some(b_id));
// Detach `b`, so it has no previous sibling
// And `a` has not next sibling
doc.detach(b_id);
assert_eq!(doc[b_id].previous_sibling, None);
assert_eq!(doc[a_id].next_sibling, None);
let head_id = NodeId::new(3);
let body_id = NodeId::new(4);
// Detach `head`, so previous sibling of `body` is empty
doc.detach(head_id);
assert_eq!(doc[body_id].next_sibling, None);
}
}