scrape_core/parser/mod.rs
1//! HTML parsing implementations.
2//!
3//! This module provides different HTML parsing strategies:
4//!
5//! - **html5ever**: Spec-compliant HTML5 parser for correct parsing of all HTML
6//!
7//! # Architecture
8//!
9//! The parser module is responsible for converting raw HTML bytes into a DOM
10//! tree structure. It uses the `html5ever` crate for spec-compliant parsing.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use scrape_core::{Html5everParser, Parser, ParseConfig};
16//!
17//! let parser = Html5everParser;
18//! let config = ParseConfig::default();
19//! let document = parser.parse_with_config("<html><body>Hello</body></html>", &config)?;
20//! ```
21
22mod error;
23pub mod fragment;
24mod html5;
25#[cfg(test)]
26mod tests;
27
28pub use error::{ParseError, ParseResult};
29pub use html5::Html5everParser;
30
31use crate::dom::Document;
32
/// Private module backing the sealed-trait pattern used by [`Parser`].
///
/// The module is not exported, so the marker trait it contains cannot be
/// named from outside this crate.
mod private {
    /// Supertrait marker required by [`Parser`](super::Parser).
    pub trait Sealed {}
}
38
39/// A sealed trait for HTML parsers.
40///
41/// This trait is sealed and cannot be implemented outside of this crate.
42/// Use [`Html5everParser`] for spec-compliant HTML5 parsing.
43///
44/// # Example
45///
46/// ```rust,ignore
47/// use scrape_core::{Html5everParser, Parser, ParseConfig};
48///
49/// let parser = Html5everParser;
50/// let document = parser.parse("<html><body>Hello</body></html>")?;
51/// ```
52pub trait Parser: private::Sealed {
53 /// Parses HTML with default configuration.
54 ///
55 /// # Errors
56 ///
57 /// Returns [`ParseError::EmptyInput`] if the input is empty or whitespace-only.
58 fn parse(&self, html: &str) -> ParseResult<Document> {
59 self.parse_with_config(html, &ParseConfig::default())
60 }
61
62 /// Parses HTML with the given configuration.
63 ///
64 /// # Errors
65 ///
66 /// Returns [`ParseError`] if parsing fails:
67 /// - [`ParseError::EmptyInput`] if the input is empty or whitespace-only
68 /// - [`ParseError::MaxDepthExceeded`] if nesting exceeds `config.max_depth`
69 fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document>;
70}
71
/// Configuration for HTML parsing behavior.
///
/// All fields are public, so a configuration can be written as a struct
/// literal or built from [`ParseConfig::default`] with struct-update syntax.
/// Two configurations compare equal when all of their fields are equal.
///
/// # Example
///
/// ```rust
/// use scrape_core::ParseConfig;
///
/// let config = ParseConfig { max_depth: 256, preserve_whitespace: true, include_comments: false };
/// assert_eq!(config.max_depth, 256);
///
/// // Override a single field and keep the remaining defaults.
/// let shallow = ParseConfig { max_depth: 64, ..ParseConfig::default() };
/// assert!(!shallow.preserve_whitespace);
/// assert_ne!(shallow, ParseConfig::default());
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// Maximum nesting depth for the DOM tree.
    ///
    /// Parsing will return [`ParseError::MaxDepthExceeded`] if this limit is exceeded.
    /// Default: 512.
    pub max_depth: usize,

    /// Whether to preserve whitespace-only text nodes.
    ///
    /// When `false` (default), text nodes containing only whitespace are filtered out.
    pub preserve_whitespace: bool,

    /// Whether to include comment nodes in the parsed document.
    ///
    /// Default: `false`.
    pub include_comments: bool,
}
99
100impl Default for ParseConfig {
101 fn default() -> Self {
102 Self { max_depth: 512, preserve_whitespace: false, include_comments: false }
103 }
104}