// scrape_core/parser/mod.rs

//! HTML parsing implementations.
//!
//! This module currently provides a single HTML parsing strategy:
//!
//! - **html5ever**: Spec-compliant HTML5 parser for correct parsing of all HTML
//!
//! # Architecture
//!
//! The parser module is responsible for converting raw HTML text into a DOM
//! tree structure. It uses the `html5ever` crate for spec-compliant parsing.
//!
//! # Example
//!
//! ```rust,ignore
//! use scrape_core::{Html5everParser, Parser, ParseConfig};
//!
//! let parser = Html5everParser;
//! let config = ParseConfig::default();
//! let document = parser.parse_with_config("<html><body>Hello</body></html>", &config)?;
//! ```

22mod error;
23pub mod fragment;
24mod html5;
25#[cfg(test)]
26mod tests;
27
28pub use error::{ParseError, ParseResult};
29pub use html5::Html5everParser;
30
31use crate::dom::Document;
32
/// Private module that seals [`Parser`](super::Parser) against external implementations.
///
/// The module is not `pub`, so code outside this module tree cannot name
/// `private::Sealed` and therefore cannot satisfy the supertrait bound that
/// `Parser` places on its implementors.
mod private {
    /// Marker trait for sealing [`Parser`](super::Parser).
    ///
    /// It has no methods; a type only needs an empty `impl` of this trait to
    /// become eligible to implement `Parser`.
    pub trait Sealed {}
}

39/// A sealed trait for HTML parsers.
40///
41/// This trait is sealed and cannot be implemented outside of this crate.
42/// Use [`Html5everParser`] for spec-compliant HTML5 parsing.
43///
44/// # Example
45///
46/// ```rust,ignore
47/// use scrape_core::{Html5everParser, Parser, ParseConfig};
48///
49/// let parser = Html5everParser;
50/// let document = parser.parse("<html><body>Hello</body></html>")?;
51/// ```
52pub trait Parser: private::Sealed {
53    /// Parses HTML with default configuration.
54    ///
55    /// # Errors
56    ///
57    /// Returns [`ParseError::EmptyInput`] if the input is empty or whitespace-only.
58    fn parse(&self, html: &str) -> ParseResult<Document> {
59        self.parse_with_config(html, &ParseConfig::default())
60    }
61
62    /// Parses HTML with the given configuration.
63    ///
64    /// # Errors
65    ///
66    /// Returns [`ParseError`] if parsing fails:
67    /// - [`ParseError::EmptyInput`] if the input is empty or whitespace-only
68    /// - [`ParseError::MaxDepthExceeded`] if nesting exceeds `config.max_depth`
69    fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document>;
70}
71
/// Configuration for HTML parsing behavior.
///
/// All fields are public, so a configuration can be written as a struct
/// literal or derived from [`ParseConfig::default()`] with struct-update
/// syntax. Configurations can be compared with `==`.
///
/// # Example
///
/// ```rust
/// use scrape_core::ParseConfig;
///
/// let config = ParseConfig { max_depth: 256, preserve_whitespace: true, include_comments: false };
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// Maximum nesting depth for the DOM tree.
    ///
    /// Parsing will return [`ParseError::MaxDepthExceeded`] if this limit is exceeded.
    /// Default: 512.
    pub max_depth: usize,

    /// Whether to preserve whitespace-only text nodes.
    ///
    /// When `false` (default), text nodes containing only whitespace are filtered out.
    pub preserve_whitespace: bool,

    /// Whether to include comment nodes in the parsed document.
    ///
    /// Default: `false`.
    pub include_comments: bool,
}

100impl Default for ParseConfig {
101    fn default() -> Self {
102        Self { max_depth: 512, preserve_whitespace: false, include_comments: false }
103    }
104}