Skip to main content

scrape_core/parser/
html5.rs

1//! html5ever-based HTML parser implementation.
2
3use std::collections::HashMap;
4
5use html5ever::{ParseOpts, parse_document, tendril::TendrilSink};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7
8use super::{ParseConfig, ParseError, ParseResult, Parser, private::Sealed};
9use crate::dom::{Document, DocumentIndex, NodeId};
10
/// HTML5 spec-compliant parser using html5ever.
///
/// This parser uses the [html5ever](https://github.com/servo/html5ever) crate
/// for spec-compliant HTML5 parsing. It handles malformed HTML gracefully
/// using the HTML5 error recovery algorithm.
///
/// The type is a field-less unit struct: it carries no state, so it is
/// `Copy` and can be constructed either by name (`Html5everParser`) or via
/// [`Default`].
///
/// # Example
///
/// ```rust
/// use scrape_core::{Html5everParser, Parser};
///
/// let parser = Html5everParser;
/// let document = parser.parse("<html><body><h1>Hello</h1></body></html>").unwrap();
/// assert!(document.root().is_some());
/// ```
#[derive(Debug, Default, Clone, Copy)]
pub struct Html5everParser;
28
// Marker impl of the `Sealed` trait imported from `super::private`.
// NOTE(review): presumably `Parser` requires `Sealed` so that only types in
// this crate can implement it — confirm against the `Parser` definition.
impl Sealed for Html5everParser {}
30
31impl Parser for Html5everParser {
32    fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document> {
33        self.parse_with_config_and_capacity(html, config, 256)
34    }
35}
36
37impl Html5everParser {
38    /// Parses HTML with the given configuration and pre-allocated capacity.
39    ///
40    /// # Errors
41    ///
42    /// Returns [`ParseError`] if parsing fails.
43    pub fn parse_with_config_and_capacity(
44        &self,
45        html: &str,
46        config: &ParseConfig,
47        capacity: usize,
48    ) -> ParseResult<Document> {
49        if html.trim().is_empty() {
50            return Err(ParseError::EmptyInput);
51        }
52
53        let dom = parse_document(RcDom::default(), ParseOpts::default())
54            .from_utf8()
55            .read_from(&mut html.as_bytes())
56            .map_err(|e| ParseError::InternalError(e.to_string()))?;
57
58        convert_rcdom_to_document_with_capacity(&dom, config, capacity)
59    }
60}
61
62/// Converts an html5ever `RcDom` to our Document representation.
63fn convert_rcdom_to_document(dom: &RcDom, config: &ParseConfig) -> ParseResult<Document> {
64    convert_rcdom_to_document_with_capacity(dom, config, 256)
65}
66
67/// Converts an html5ever `RcDom` to our Document representation with pre-allocated capacity.
68fn convert_rcdom_to_document_with_capacity(
69    dom: &RcDom,
70    config: &ParseConfig,
71    capacity: usize,
72) -> ParseResult<Document> {
73    let mut document = crate::dom::DocumentImpl::<crate::dom::Building>::with_capacity(capacity);
74    let mut depth = 0;
75    let mut index = DocumentIndex::new();
76
77    convert_node(&dom.document, &mut document, None, &mut depth, config, &mut index)?;
78
79    let mut document = document.build();
80    document.set_index(index);
81    Ok(document)
82}
83
84/// Recursively converts an `RcDom` node and its children to our DOM representation.
85fn convert_node(
86    handle: &Handle,
87    document: &mut crate::dom::DocumentImpl<crate::dom::Building>,
88    parent: Option<NodeId>,
89    depth: &mut usize,
90    config: &ParseConfig,
91    index: &mut DocumentIndex,
92) -> ParseResult<Option<NodeId>> {
93    if *depth > config.max_depth {
94        return Err(ParseError::MaxDepthExceeded { max_depth: config.max_depth, span: None });
95    }
96    *depth = depth.saturating_add(1);
97
98    let result = match &handle.data {
99        NodeData::Document => {
100            // Process children of document node without creating a node
101            for child in handle.children.borrow().iter() {
102                if let Some(child_id) = convert_node(child, document, None, depth, config, index)?
103                    && document.root().is_none()
104                {
105                    document.set_root(child_id);
106                }
107            }
108            *depth = depth.saturating_sub(1);
109            return Ok(None);
110        }
111
112        NodeData::Element { name, attrs, .. } => {
113            // html5ever normalizes tag names to lowercase during parsing
114            let tag_name = name.local.to_string();
115
116            let attrs_ref = attrs.borrow();
117            let mut attributes = HashMap::with_capacity(attrs_ref.len());
118            for attr in attrs_ref.iter() {
119                let key = if attr.name.ns.is_empty() {
120                    attr.name.local.to_string()
121                } else {
122                    format!("{}:{}", attr.name.ns, attr.name.local)
123                };
124                attributes.insert(key, attr.value.to_string());
125            }
126
127            let node_id = document.create_element(tag_name, attributes.clone());
128
129            if let Some(id_attr) = attributes.get("id") {
130                index.register_id(id_attr.clone(), node_id);
131            }
132            if let Some(class_attr) = attributes.get("class") {
133                index.register_classes(class_attr, node_id);
134            }
135
136            if let Some(parent_id) = parent {
137                document.append_child(parent_id, node_id);
138            } else if document.root().is_none() {
139                document.set_root(node_id);
140            }
141
142            // Process children
143            for child in handle.children.borrow().iter() {
144                convert_node(child, document, Some(node_id), depth, config, index)?;
145            }
146
147            Some(node_id)
148        }
149
150        NodeData::Text { contents } => {
151            let text = contents.borrow().to_string();
152
153            // Skip whitespace-only text nodes unless configured to preserve
154            if !config.preserve_whitespace && text.trim().is_empty() {
155                *depth = depth.saturating_sub(1);
156                return Ok(None);
157            }
158
159            let node_id = document.create_text(text);
160
161            if let Some(parent_id) = parent {
162                document.append_child(parent_id, node_id);
163            }
164
165            Some(node_id)
166        }
167
168        NodeData::Comment { contents } => {
169            if !config.include_comments {
170                *depth = depth.saturating_sub(1);
171                return Ok(None);
172            }
173
174            let node_id = document.create_comment(contents.to_string());
175
176            if let Some(parent_id) = parent {
177                document.append_child(parent_id, node_id);
178            }
179
180            Some(node_id)
181        }
182
183        NodeData::Doctype { .. } | NodeData::ProcessingInstruction { .. } => {
184            // Skip doctype and processing instructions
185            *depth = depth.saturating_sub(1);
186            return Ok(None);
187        }
188    };
189
190    *depth = depth.saturating_sub(1);
191    Ok(result)
192}