scrape_core/parser/
html5.rs

1//! html5ever-based HTML parser implementation.
2
3use std::collections::HashMap;
4
5use html5ever::{ParseOpts, parse_document, tendril::TendrilSink};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7
8use super::{ParseConfig, ParseError, ParseResult, Parser, private::Sealed};
9use crate::dom::{Document, NodeId};
10
/// Parser backend built on the [html5ever](https://github.com/servo/html5ever) crate.
///
/// html5ever implements the HTML5 parsing algorithm, including its error
/// recovery rules, so malformed markup is handled gracefully rather than
/// rejected.
///
/// The type is a stateless unit struct: it is `Copy`, and `Default` yields the
/// only possible value.
///
/// # Example
///
/// ```rust
/// use scrape_core::{Html5everParser, Parser};
///
/// let parser = Html5everParser;
/// let document = parser.parse("<html><body><h1>Hello</h1></body></html>").unwrap();
/// assert!(document.root().is_some());
/// ```
#[derive(Clone, Copy, Debug, Default)]
pub struct Html5everParser;
28
29impl Sealed for Html5everParser {}
30
31impl Parser for Html5everParser {
32    fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document> {
33        if html.trim().is_empty() {
34            return Err(ParseError::EmptyInput);
35        }
36
37        let dom = parse_document(RcDom::default(), ParseOpts::default())
38            .from_utf8()
39            .read_from(&mut html.as_bytes())
40            .map_err(|e| ParseError::InternalError(e.to_string()))?;
41
42        convert_rcdom_to_document(&dom, config)
43    }
44}
45
46/// Converts an html5ever `RcDom` to our Document representation.
47fn convert_rcdom_to_document(dom: &RcDom, config: &ParseConfig) -> ParseResult<Document> {
48    let mut document = Document::new();
49    let mut depth = 0;
50
51    convert_node(&dom.document, &mut document, None, &mut depth, config)?;
52
53    Ok(document)
54}
55
56/// Recursively converts an `RcDom` node and its children to our DOM representation.
57fn convert_node(
58    handle: &Handle,
59    document: &mut Document,
60    parent: Option<NodeId>,
61    depth: &mut usize,
62    config: &ParseConfig,
63) -> ParseResult<Option<NodeId>> {
64    if *depth > config.max_depth {
65        return Err(ParseError::MaxDepthExceeded { max_depth: config.max_depth });
66    }
67    *depth = depth.saturating_add(1);
68
69    let result = match &handle.data {
70        NodeData::Document => {
71            // Process children of document node without creating a node
72            for child in handle.children.borrow().iter() {
73                if let Some(child_id) = convert_node(child, document, None, depth, config)?
74                    && document.root().is_none()
75                {
76                    document.set_root(child_id);
77                }
78            }
79            *depth = depth.saturating_sub(1);
80            return Ok(None);
81        }
82
83        NodeData::Element { name, attrs, .. } => {
84            // html5ever normalizes tag names to lowercase during parsing
85            let tag_name = name.local.to_string();
86
87            let attrs_ref = attrs.borrow();
88            let mut attributes = HashMap::with_capacity(attrs_ref.len());
89            for attr in attrs_ref.iter() {
90                let key = if attr.name.ns.is_empty() {
91                    attr.name.local.to_string()
92                } else {
93                    format!("{}:{}", attr.name.ns, attr.name.local)
94                };
95                attributes.insert(key, attr.value.to_string());
96            }
97
98            let node_id = document.create_element(tag_name, attributes);
99
100            if let Some(parent_id) = parent {
101                document.append_child(parent_id, node_id);
102            } else if document.root().is_none() {
103                document.set_root(node_id);
104            }
105
106            // Process children
107            for child in handle.children.borrow().iter() {
108                convert_node(child, document, Some(node_id), depth, config)?;
109            }
110
111            Some(node_id)
112        }
113
114        NodeData::Text { contents } => {
115            let text = contents.borrow().to_string();
116
117            // Skip whitespace-only text nodes unless configured to preserve
118            if !config.preserve_whitespace && text.trim().is_empty() {
119                *depth = depth.saturating_sub(1);
120                return Ok(None);
121            }
122
123            let node_id = document.create_text(text);
124
125            if let Some(parent_id) = parent {
126                document.append_child(parent_id, node_id);
127            }
128
129            Some(node_id)
130        }
131
132        NodeData::Comment { contents } => {
133            if !config.include_comments {
134                *depth = depth.saturating_sub(1);
135                return Ok(None);
136            }
137
138            let node_id = document.create_comment(contents.to_string());
139
140            if let Some(parent_id) = parent {
141                document.append_child(parent_id, node_id);
142            }
143
144            Some(node_id)
145        }
146
147        NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {
148            // Skip doctype and processing instructions
149            *depth = depth.saturating_sub(1);
150            return Ok(None);
151        }
152    };
153
154    *depth = depth.saturating_sub(1);
155    Ok(result)
156}