scrape_core/parser/
mod.rs

1//! HTML parsing implementations.
2//!
3//! This module provides different HTML parsing strategies:
4//!
5//! - **html5ever**: Spec-compliant HTML5 parser for correct parsing of all HTML
6//!
7//! # Architecture
8//!
9//! The parser module is responsible for converting raw HTML bytes into a DOM
10//! tree structure. It uses the `html5ever` crate for spec-compliant parsing.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use scrape_core::{Html5everParser, Parser, ParseConfig};
16//!
17//! let parser = Html5everParser;
18//! let config = ParseConfig::default();
19//! let document = parser.parse_with_config("<html><body>Hello</body></html>", &config)?;
20//! ```
21
22mod error;
23mod html5;
24#[cfg(test)]
25mod tests;
26
27pub use error::{ParseError, ParseResult};
28pub use html5::Html5everParser;
29
30use crate::dom::Document;
31
/// Private home of the marker trait used to seal [`Parser`](super::Parser).
///
/// The module itself is not exported, so although `Sealed` is declared `pub`
/// (which lets it appear as a supertrait bound), code outside this crate has
/// no path through which to name or implement it.
mod private {
    /// Empty supertrait required by [`Parser`](super::Parser).
    ///
    /// A type must implement this marker before it can implement `Parser`.
    pub trait Sealed {}
}
37
38/// A sealed trait for HTML parsers.
39///
40/// This trait is sealed and cannot be implemented outside of this crate.
41/// Use [`Html5everParser`] for spec-compliant HTML5 parsing.
42///
43/// # Example
44///
45/// ```rust,ignore
46/// use scrape_core::{Html5everParser, Parser, ParseConfig};
47///
48/// let parser = Html5everParser;
49/// let document = parser.parse("<html><body>Hello</body></html>")?;
50/// ```
51pub trait Parser: private::Sealed {
52    /// Parses HTML with default configuration.
53    ///
54    /// # Errors
55    ///
56    /// Returns [`ParseError::EmptyInput`] if the input is empty or whitespace-only.
57    fn parse(&self, html: &str) -> ParseResult<Document> {
58        self.parse_with_config(html, &ParseConfig::default())
59    }
60
61    /// Parses HTML with the given configuration.
62    ///
63    /// # Errors
64    ///
65    /// Returns [`ParseError`] if parsing fails:
66    /// - [`ParseError::EmptyInput`] if the input is empty or whitespace-only
67    /// - [`ParseError::MaxDepthExceeded`] if nesting exceeds `config.max_depth`
68    fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document>;
69}
70
/// Configuration for HTML parsing behavior.
///
/// Every field is plain data, so configurations can be cloned and compared
/// by value (`==`, `assert_eq!`).
///
/// # Example
///
/// ```rust
/// use scrape_core::ParseConfig;
///
/// let config = ParseConfig { max_depth: 256, preserve_whitespace: true, include_comments: false };
/// assert_ne!(config, ParseConfig::default());
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// Maximum nesting depth for the DOM tree.
    ///
    /// Parsing will return [`ParseError::MaxDepthExceeded`] if this limit is exceeded.
    /// Default: 512.
    pub max_depth: usize,

    /// Whether to preserve whitespace-only text nodes.
    ///
    /// When `false` (default), text nodes containing only whitespace are filtered out.
    pub preserve_whitespace: bool,

    /// Whether to include comment nodes in the parsed document.
    ///
    /// Default: `false`.
    pub include_comments: bool,
}
98
99impl Default for ParseConfig {
100    fn default() -> Self {
101        Self { max_depth: 512, preserve_whitespace: false, include_comments: false }
102    }
103}