scrape_core/parser/mod.rs
1//! HTML parsing implementations.
2//!
//! This module currently provides a single HTML parsing strategy:
4//!
5//! - **html5ever**: Spec-compliant HTML5 parser for correct parsing of all HTML
6//!
7//! # Architecture
8//!
//! The parser module is responsible for converting HTML source text into a DOM
//! tree structure. It uses the `html5ever` crate for spec-compliant parsing.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use scrape_core::{Html5everParser, Parser, ParseConfig};
16//!
17//! let parser = Html5everParser;
18//! let config = ParseConfig::default();
19//! let document = parser.parse_with_config("<html><body>Hello</body></html>", &config)?;
20//! ```
21
22mod error;
23pub mod fragment;
24mod html5;
25pub mod sink;
26#[cfg(test)]
27mod tests;
28pub mod warnings;
29
30pub use error::{ParseError, ParseResult};
31pub use html5::Html5everParser;
32pub use warnings::{ParseResultWithWarnings, ParseWarning, WarningSeverity};
33
34use crate::dom::Document;
35
/// Private home of the sealing marker trait.
///
/// The module is declared without `pub`, so the `Sealed` trait it contains is
/// only nameable from within this module tree. Any trait that lists `Sealed`
/// as a supertrait can therefore only be implemented by code that can reach
/// this module.
mod private {
    /// Supertrait marker required by [`Parser`](super::Parser).
    ///
    /// It has no methods; implementing it is purely a permission to implement
    /// `Parser` for the same type.
    pub trait Sealed {}
}
41
42/// A sealed trait for HTML parsers.
43///
44/// This trait is sealed and cannot be implemented outside of this crate.
45/// Use [`Html5everParser`] for spec-compliant HTML5 parsing.
46///
47/// # Example
48///
49/// ```rust,ignore
50/// use scrape_core::{Html5everParser, Parser, ParseConfig};
51///
52/// let parser = Html5everParser;
53/// let document = parser.parse("<html><body>Hello</body></html>")?;
54/// ```
55pub trait Parser: private::Sealed {
56 /// Parses HTML with default configuration.
57 ///
58 /// # Errors
59 ///
60 /// Returns [`ParseError::EmptyInput`] if the input is empty or whitespace-only.
61 fn parse(&self, html: &str) -> ParseResult<Document> {
62 self.parse_with_config(html, &ParseConfig::default())
63 }
64
65 /// Parses HTML with the given configuration.
66 ///
67 /// # Errors
68 ///
69 /// Returns [`ParseError`] if parsing fails:
70 /// - [`ParseError::EmptyInput`] if the input is empty or whitespace-only
71 /// - [`ParseError::MaxDepthExceeded`] if nesting exceeds `config.max_depth`
72 fn parse_with_config(&self, html: &str, config: &ParseConfig) -> ParseResult<Document>;
73}
74
/// Configuration for HTML parsing behavior.
///
/// All fields are public, so a value can be built with a struct literal or by
/// overriding selected fields of [`ParseConfig::default()`]. Configurations
/// compare by value, which makes them usable with `==` and `assert_eq!`.
///
/// # Example
///
/// ```rust
/// use scrape_core::ParseConfig;
///
/// let config = ParseConfig { max_depth: 256, preserve_whitespace: true, include_comments: false };
/// assert_eq!(config, ParseConfig { max_depth: 256, preserve_whitespace: true, include_comments: false });
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseConfig {
    /// Maximum nesting depth for the DOM tree.
    ///
    /// Parsing will return [`ParseError::MaxDepthExceeded`] if this limit is exceeded.
    /// Default: 512.
    pub max_depth: usize,

    /// Whether to preserve whitespace-only text nodes.
    ///
    /// When `false` (default), text nodes containing only whitespace are filtered out.
    pub preserve_whitespace: bool,

    /// Whether to include comment nodes in the parsed document.
    ///
    /// Default: `false`.
    pub include_comments: bool,
}
102
103impl Default for ParseConfig {
104 fn default() -> Self {
105 Self { max_depth: 512, preserve_whitespace: false, include_comments: false }
106 }
107}