Skip to main content

scrape_core/parser/
mod.rs

1//! HTML parsing implementations.
2//!
3//! This module provides different HTML parsing strategies:
4//!
5//! - **html5ever**: Spec-compliant HTML5 parser for correct parsing of all HTML
6//!
7//! # Architecture
8//!
9//! The parser module is responsible for converting raw HTML bytes into a DOM
10//! tree structure. It uses the `html5ever` crate for spec-compliant parsing.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use scrape_core::{Html5everParser, Parser, ParseConfig};
16//!
17//! let parser = Html5everParser;
18//! let config = ParseConfig::default();
19//! let document = parser.parse_with_config("<html><body>Hello</body></html>", &config)?;
20//! ```
21
22mod error;
23pub mod fragment;
24mod html5;
25#[cfg(test)]
26mod tests;
27pub mod warnings;
28
29pub use error::{ParseError, ParseResult};
30pub use html5::Html5everParser;
31pub use warnings::{ParseResultWithWarnings, ParseWarning, WarningSeverity};
32
33use crate::dom::Document;
34
/// Private home of the sealing marker for [`Parser`].
///
/// The module is not `pub`, so code outside this crate cannot name
/// `Sealed` and therefore cannot satisfy the supertrait bound on `Parser`.
mod private {
    /// Supertrait marker required by [`Parser`](super::Parser).
    ///
    /// It has no methods; it exists only to restrict who may implement `Parser`.
    pub trait Sealed {}
}
40
41/// A sealed trait for HTML parsers.
42///
43/// This trait is sealed and cannot be implemented outside of this crate.
44/// Use [`Html5everParser`] for spec-compliant HTML5 parsing.
45///
46/// # Example
47///
48/// ```rust,ignore
49/// use scrape_core::{Html5everParser, Parser, ParseConfig};
50///
51/// let parser = Html5everParser;
52/// let document = parser.parse("<html><body>Hello</body></html>")?;
53/// ```
54pub trait Parser: private::Sealed {
55    /// Parses HTML with default configuration.
56    ///
57    /// # Errors
58    ///
59    /// Returns [`ParseError::EmptyInput`] if the input is empty or whitespace-only.
60    fn parse(&self, html: &str) -> ParseResult<Document> {
61        self.parse_with_config(html, &ParseConfig::default())
62    }
63
64    /// Parses HTML with the given configuration.
65    ///
66    /// # Errors
67    ///
68    /// Returns [`ParseError`] if parsing fails:
69    /// - [`ParseError::EmptyInput`] if the input is empty or whitespace-only
70    /// - [`ParseError::MaxDepthExceeded`] if nesting exceeds `config.max_depth`
71    fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document>;
72}
73
/// Configuration for HTML parsing behavior.
///
/// Start from [`ParseConfig::default`] and override only the fields you care
/// about; struct-update syntax keeps call sites compiling if more options are
/// added later.
///
/// # Example
///
/// ```rust
/// use scrape_core::ParseConfig;
///
/// let config = ParseConfig { max_depth: 256, preserve_whitespace: true, ..ParseConfig::default() };
/// assert_eq!(config.max_depth, 256);
/// assert_ne!(config, ParseConfig::default());
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// Maximum nesting depth for the DOM tree.
    ///
    /// Parsing will return [`ParseError::MaxDepthExceeded`] if this limit is exceeded.
    /// Default: 512.
    pub max_depth: usize,

    /// Whether to preserve whitespace-only text nodes.
    ///
    /// When `false` (default), text nodes containing only whitespace are filtered out.
    pub preserve_whitespace: bool,

    /// Whether to include comment nodes in the parsed document.
    ///
    /// Default: `false`.
    pub include_comments: bool,
}
101
102impl Default for ParseConfig {
103    fn default() -> Self {
104        Self { max_depth: 512, preserve_whitespace: false, include_comments: false }
105    }
106}