Skip to main content

fast_html_parser/
lib.rs

1#![cfg_attr(docsrs, feature(doc_cfg))]
2//! # fast-html-parser — SIMD-Optimized HTML Parser
3//!
4//! A high-performance HTML parser designed for web scraping workloads.
5//! Uses SIMD instructions (SSE4.2, AVX2, NEON) for tokenization and builds
6//! a cache-line aligned arena-based DOM tree for fast traversal.
7//!
8//! ## Quick Start
9//!
10//! ```
11//! use fast_html_parser::HtmlParser;
12//!
13//! let doc = HtmlParser::parse("<div><p>Hello</p></div>").unwrap();
14//! assert_eq!(doc.root().text_content(), "Hello");
15//! ```
16//!
17//! ## Builder Pattern
18//!
19//! ```
20//! use fast_html_parser::HtmlParser;
21//!
22//! let doc = HtmlParser::builder()
23//!     .max_input_size(64 * 1024 * 1024) // 64 MiB
24//!     .build()
25//!     .parse_str("<div>Hello</div>")
26//!     .unwrap();
27//! ```
28//!
29//! ## CSS Selectors
30//!
31//! ```
32//! use fast_html_parser::prelude::*;
33//!
34//! let doc = HtmlParser::parse("<ul><li>one</li><li>two</li></ul>").unwrap();
35//! let items = doc.select("li").unwrap();
36//! assert_eq!(items.len(), 2);
37//! ```
38//!
39//! ## Streaming
40//!
41//! ```
42//! use fast_html_parser::streaming::parse_stream;
43//!
44//! let html = b"<div><p>Hello</p></div>";
45//! let doc = parse_stream(html.chunks(8)).unwrap();
46//! assert_eq!(doc.root().text_content(), "Hello");
47//! ```
48//!
49//! ## Feature Flags
50//!
51//! | Feature | Default | Description |
52//! |---|---|---|
53//! | `css-selector` | Yes | CSS selector engine |
54//! | `entity-decode` | Yes | HTML entity decoding |
55//! | `xpath` | No | XPath expression support |
56//! | `encoding` | No | Auto-detect encoding from raw bytes |
57//! | `async-tokio` | No | Async parsing via Tokio |
58
59// ---------------------------------------------------------------------------
60// Re-exports: core types
61// ---------------------------------------------------------------------------
62
63/// Core types: interned tags, entity table, error definitions.
64pub use fhp_core as core_types;
65
66/// Interned HTML tag enum.
67pub use fhp_core::tag::Tag;
68
69/// Tokenizer (low-level).
70pub use fhp_tokenizer as tokenizer;
71
72/// DOM tree types.
73pub use fhp_tree as tree;
74
75/// Parsed document and node reference.
76pub use fhp_tree::{Document, HtmlError, NodeRef};
77
78/// Node identity type.
79pub use fhp_tree::node::NodeId;
80
81/// Streaming and incremental parsing.
82pub mod streaming {
83    pub use fhp_tree::streaming::{EarlyStopParser, ParseStatus, StreamParser, parse_stream};
84}
85
86// ---------------------------------------------------------------------------
87// Conditional re-exports
88// ---------------------------------------------------------------------------
89
90/// CSS selector and XPath engine.
91#[cfg(any(feature = "css-selector", feature = "xpath"))]
92#[cfg_attr(docsrs, doc(cfg(any(feature = "css-selector", feature = "xpath"))))]
93pub use fhp_selector::{CompiledSelector, DocumentIndex, Selectable, Selection};
94
/// XPath support types.
///
/// Only compiled when the `xpath` feature is enabled; the contents are
/// re-exported from the selector crate.
#[cfg(feature = "xpath")]
#[cfg_attr(docsrs, doc(cfg(feature = "xpath")))]
pub mod xpath {
    /// Result type of an XPath evaluation, re-exported from `fhp_selector`.
    pub use fhp_selector::xpath::ast::XPathResult;
}
101
/// Character-encoding detection and conversion helpers.
///
/// Only compiled when the `encoding` feature is enabled; the contents are
/// re-exported unchanged from `fhp_encoding`.
#[cfg(feature = "encoding")]
#[cfg_attr(docsrs, doc(cfg(feature = "encoding")))]
pub mod encoding {
    pub use fhp_encoding::Encoding;
    pub use fhp_encoding::decode;
    pub use fhp_encoding::decode_or_detect;
    pub use fhp_encoding::detect;
}
108
/// Asynchronous parsing entry points.
///
/// Only compiled when the `async-tokio` feature is enabled; the contents are
/// re-exported unchanged from `fhp_tree::async_parser`.
#[cfg(feature = "async-tokio")]
#[cfg_attr(docsrs, doc(cfg(feature = "async-tokio")))]
pub mod async_parser {
    pub use fhp_tree::async_parser::AsyncParser;
    pub use fhp_tree::async_parser::parse_async;
}
115
116// ---------------------------------------------------------------------------
117// Prelude
118// ---------------------------------------------------------------------------
119
120/// Convenience prelude that imports the most commonly used types.
121///
122/// ```
123/// use fast_html_parser::prelude::*;
124/// ```
125pub mod prelude {
126    pub use fhp_tree::node::NodeId;
127    pub use fhp_tree::{Document, HtmlError, NodeRef};
128
129    #[cfg(any(feature = "css-selector", feature = "xpath"))]
130    #[cfg_attr(docsrs, doc(cfg(any(feature = "css-selector", feature = "xpath"))))]
131    pub use fhp_selector::{CompiledSelector, Selectable, Selection};
132
133    pub use crate::HtmlParser;
134}
135
136// ---------------------------------------------------------------------------
137// Builder + HtmlParser
138// ---------------------------------------------------------------------------
139
/// Input size limit used when the builder is not given one: 256 MiB
/// (256 × 2^20 bytes).
const DEFAULT_MAX_INPUT_SIZE: usize = 256 << 20;
142
/// Configuration builder for the HTML parser.
///
/// Obtain one from [`HtmlParser::builder()`] (or `ParserBuilder::default()`),
/// chain the setters, then call `build()` to get a configured
/// [`HtmlParser`].
///
/// # Example
///
/// ```
/// use fast_html_parser::HtmlParser;
///
/// let parser = HtmlParser::builder()
///     .max_input_size(128 * 1024 * 1024)
///     .fragment_mode(true)
///     .build();
///
/// let doc = parser.parse_str("<p>fragment</p>").unwrap();
/// assert_eq!(doc.root().text_content(), "fragment");
/// ```
#[derive(Debug, Clone)]
pub struct ParserBuilder {
    /// Upper bound, in bytes, on the input accepted by the built parser.
    max_input_size: usize,
    /// Whether fragment mode was requested.
    fragment_mode: bool,
}
162
163impl Default for ParserBuilder {
164    fn default() -> Self {
165        Self {
166            max_input_size: DEFAULT_MAX_INPUT_SIZE,
167            fragment_mode: false,
168        }
169    }
170}
171
172impl ParserBuilder {
173    /// Set the maximum input size in bytes.
174    ///
175    /// Inputs exceeding this limit will return [`HtmlError::InputTooLarge`].
176    /// Default: 256 MiB.
177    pub fn max_input_size(mut self, size: usize) -> Self {
178        self.max_input_size = size;
179        self
180    }
181
182    /// Enable fragment mode.
183    ///
184    /// In fragment mode the parser treats input as an HTML fragment rather
185    /// than a full document. Currently this behaves identically to normal
186    /// mode (the parser already handles fragments gracefully).
187    pub fn fragment_mode(mut self, enabled: bool) -> Self {
188        self.fragment_mode = enabled;
189        self
190    }
191
192    /// Consume the builder and create a configured [`HtmlParser`].
193    pub fn build(self) -> HtmlParser {
194        HtmlParser {
195            max_input_size: self.max_input_size,
196            _fragment_mode: self.fragment_mode,
197        }
198    }
199}
200
/// A configured HTML parser instance.
///
/// Create via [`HtmlParser::builder()`] for custom configuration, or use the
/// convenience methods [`HtmlParser::parse()`] and [`HtmlParser::parse_bytes()`]
/// for defaults.
///
/// The parser only holds configuration values, so it is cheap to clone and
/// can be reused for any number of inputs.
///
/// # Example
///
/// ```
/// use fast_html_parser::HtmlParser;
///
/// // One-shot convenience
/// let doc = HtmlParser::parse("<p>Hello</p>").unwrap();
///
/// // Builder pattern
/// let parser = HtmlParser::builder()
///     .max_input_size(1024 * 1024)
///     .build();
/// let doc = parser.parse_str("<p>World</p>").unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct HtmlParser {
    /// Upper bound, in bytes, on the input accepted by the instance methods.
    max_input_size: usize,
    /// Fragment-mode flag carried over from the builder. Stored but not
    /// read yet, hence the leading underscore.
    _fragment_mode: bool,
}
225
226impl HtmlParser {
227    /// Create a new [`ParserBuilder`].
228    pub fn builder() -> ParserBuilder {
229        ParserBuilder::default()
230    }
231
232    /// Parse an HTML string with default settings.
233    ///
234    /// This is a convenience wrapper around `fhp_tree::parse()`.
235    ///
236    /// # Errors
237    ///
238    /// Returns [`HtmlError::InputTooLarge`] if the input exceeds 256 MiB.
239    ///
240    /// # Example
241    ///
242    /// ```
243    /// use fast_html_parser::HtmlParser;
244    ///
245    /// let doc = HtmlParser::parse("<div><p>Hello</p></div>").unwrap();
246    /// assert_eq!(doc.root().text_content(), "Hello");
247    /// ```
248    pub fn parse(input: &str) -> Result<Document, HtmlError> {
249        fhp_tree::parse(input)
250    }
251
252    /// Parse an owned `String` with default settings, transferring the allocation.
253    ///
254    /// Avoids a memcpy of the source bytes when the caller already owns the
255    /// input (e.g., from an HTTP response body).
256    ///
257    /// # Errors
258    ///
259    /// Returns [`HtmlError::InputTooLarge`] if the input exceeds 256 MiB.
260    ///
261    /// # Example
262    ///
263    /// ```
264    /// use fast_html_parser::HtmlParser;
265    ///
266    /// let html = String::from("<div><p>Hello</p></div>");
267    /// let doc = HtmlParser::parse_owned(html).unwrap();
268    /// assert_eq!(doc.root().text_content(), "Hello");
269    /// ```
270    pub fn parse_owned(input: String) -> Result<Document, HtmlError> {
271        fhp_tree::parse_owned(input)
272    }
273
274    /// Parse raw bytes with default settings, auto-detecting encoding.
275    ///
276    /// # Errors
277    ///
278    /// Returns [`HtmlError::InputTooLarge`] or [`HtmlError::Encoding`] on
279    /// failure.
280    ///
281    /// # Example
282    ///
283    /// ```
284    /// use fast_html_parser::HtmlParser;
285    ///
286    /// let doc = HtmlParser::parse_bytes(b"<p>Hello</p>").unwrap();
287    /// assert_eq!(doc.root().text_content(), "Hello");
288    /// ```
289    pub fn parse_bytes(input: &[u8]) -> Result<Document, HtmlError> {
290        fhp_tree::parse_bytes(input)
291    }
292
293    /// Parse an HTML string with the current configuration.
294    ///
295    /// # Errors
296    ///
297    /// Returns [`HtmlError::InputTooLarge`] if the input exceeds the
298    /// configured limit.
299    pub fn parse_str(&self, input: &str) -> Result<Document, HtmlError> {
300        if input.len() > self.max_input_size {
301            return Err(HtmlError::InputTooLarge {
302                size: input.len(),
303                max: self.max_input_size,
304            });
305        }
306        fhp_tree::parse(input)
307    }
308
309    /// Parse an owned `String` with the current configuration.
310    ///
311    /// Avoids a memcpy of the source bytes when the caller already owns the
312    /// input (e.g., from an HTTP response body).
313    ///
314    /// # Errors
315    ///
316    /// Returns [`HtmlError::InputTooLarge`] if the input exceeds the
317    /// configured limit.
318    pub fn parse_str_owned(&self, input: String) -> Result<Document, HtmlError> {
319        if input.len() > self.max_input_size {
320            return Err(HtmlError::InputTooLarge {
321                size: input.len(),
322                max: self.max_input_size,
323            });
324        }
325        fhp_tree::parse_owned(input)
326    }
327
328    /// Parse raw bytes with the current configuration, auto-detecting encoding.
329    ///
330    /// # Errors
331    ///
332    /// Returns [`HtmlError::InputTooLarge`] or [`HtmlError::Encoding`] on
333    /// failure.
334    pub fn parse_raw(&self, input: &[u8]) -> Result<Document, HtmlError> {
335        if input.len() > self.max_input_size {
336            return Err(HtmlError::InputTooLarge {
337                size: input.len(),
338                max: self.max_input_size,
339            });
340        }
341        fhp_tree::parse_bytes(input)
342    }
343}
344
345/// Parse an HTML string with default settings (convenience alias).
346///
347/// # Example
348///
349/// ```
350/// let doc = fast_html_parser::parse("<p>Quick</p>").unwrap();
351/// assert_eq!(doc.root().text_content(), "Quick");
352/// ```
353pub fn parse(input: &str) -> Result<Document, HtmlError> {
354    HtmlParser::parse(input)
355}
356
357/// Parse an owned `String` with default settings, transferring the allocation.
358///
359/// # Example
360///
361/// ```
362/// let doc = fast_html_parser::parse_owned(String::from("<p>Quick</p>")).unwrap();
363/// assert_eq!(doc.root().text_content(), "Quick");
364/// ```
365pub fn parse_owned(input: String) -> Result<Document, HtmlError> {
366    HtmlParser::parse_owned(input)
367}
368
369/// Parse raw bytes with default settings, auto-detecting encoding.
370///
371/// # Example
372///
373/// ```
374/// let doc = fast_html_parser::parse_bytes(b"<p>Quick</p>").unwrap();
375/// assert_eq!(doc.root().text_content(), "Quick");
376/// ```
377pub fn parse_bytes(input: &[u8]) -> Result<Document, HtmlError> {
378    HtmlParser::parse_bytes(input)
379}
380
381// ---------------------------------------------------------------------------
382// Tests
383// ---------------------------------------------------------------------------
384
#[cfg(test)]
mod tests {
    //! Smoke tests for the facade: every public entry point and re-export is
    //! exercised once, and the size-limit checks are pinned to the exact
    //! error they must produce.

    use super::*;

    // -- Free-function shortcuts -------------------------------------------

    #[test]
    fn parse_convenience() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    #[test]
    fn parse_owned_convenience() {
        let doc = parse_owned(String::from("<div><p>Hello</p></div>")).unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    #[test]
    fn parse_bytes_convenience() {
        let doc = parse_bytes(b"<div><p>Hello</p></div>").unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    // -- Builder / configured parser ---------------------------------------

    #[test]
    fn builder_default() {
        let parser = HtmlParser::builder().build();
        let doc = parser.parse_str("<p>ok</p>").unwrap();
        assert_eq!(doc.root().text_content(), "ok");
    }

    #[test]
    fn builder_max_input_size() {
        let html = "<p>this is too long</p>";
        let parser = HtmlParser::builder().max_input_size(10).build();
        let result = parser.parse_str(html);
        // Pin the variant and its payload: a bare `is_err()` would also pass
        // on an unrelated failure.
        assert!(matches!(
            result,
            Err(HtmlError::InputTooLarge { size, max: 10 }) if size == html.len()
        ));
    }

    #[test]
    fn builder_accepts_input_at_limit() {
        // The limit is inclusive: only inputs strictly larger are rejected.
        let html = "<p>ok</p>";
        let parser = HtmlParser::builder().max_input_size(html.len()).build();
        let doc = parser.parse_str(html).unwrap();
        assert_eq!(doc.root().text_content(), "ok");
    }

    #[test]
    fn builder_fragment_mode() {
        let parser = HtmlParser::builder().fragment_mode(true).build();
        let doc = parser.parse_str("<li>item</li>").unwrap();
        assert_eq!(doc.root().text_content(), "item");
    }

    #[test]
    fn builder_parse_str_owned() {
        let parser = HtmlParser::builder().build();
        let doc = parser.parse_str_owned(String::from("<p>owned</p>")).unwrap();
        assert_eq!(doc.root().text_content(), "owned");
    }

    #[test]
    fn builder_parse_str_owned_too_large() {
        let html = String::from("<p>too large</p>");
        let len = html.len();
        let parser = HtmlParser::builder().max_input_size(5).build();
        let result = parser.parse_str_owned(html);
        assert!(matches!(
            result,
            Err(HtmlError::InputTooLarge { size, max: 5 }) if size == len
        ));
    }

    #[test]
    fn builder_parse_raw() {
        let parser = HtmlParser::builder().build();
        let doc = parser.parse_raw(b"<p>bytes</p>").unwrap();
        assert_eq!(doc.root().text_content(), "bytes");
    }

    #[test]
    fn builder_parse_raw_too_large() {
        let html: &[u8] = b"<p>too large</p>";
        let parser = HtmlParser::builder().max_input_size(5).build();
        let result = parser.parse_raw(html);
        assert!(matches!(
            result,
            Err(HtmlError::InputTooLarge { size, max: 5 }) if size == html.len()
        ));
    }

    // -- Associated-function shortcuts -------------------------------------

    #[test]
    fn static_parse_method() {
        let doc = HtmlParser::parse("<b>bold</b>").unwrap();
        assert_eq!(doc.root().text_content(), "bold");
    }

    #[test]
    fn static_parse_owned_method() {
        let doc = HtmlParser::parse_owned(String::from("<u>under</u>")).unwrap();
        assert_eq!(doc.root().text_content(), "under");
    }

    #[test]
    fn static_parse_bytes_method() {
        let doc = HtmlParser::parse_bytes(b"<i>italic</i>").unwrap();
        assert_eq!(doc.root().text_content(), "italic");
    }

    // -- Re-exports --------------------------------------------------------

    #[cfg(feature = "css-selector")]
    #[test]
    fn selector_reexport() {
        let doc = HtmlParser::parse("<div><p>Hello</p></div>").unwrap();
        let sel = doc.select("p").unwrap();
        assert_eq!(sel.len(), 1);
    }

    #[test]
    fn streaming_reexport() {
        let doc = streaming::parse_stream(b"<p>stream</p>".chunks(4)).unwrap();
        assert_eq!(doc.root().text_content(), "stream");
    }

    #[test]
    fn node_ref_access() {
        let doc = parse("<a href=\"url\">link</a>").unwrap();
        let root = doc.root();
        let a = root.first_child().unwrap();
        assert_eq!(a.tag(), Tag::A);
        assert_eq!(a.attr("href"), Some("url"));
    }

    #[test]
    fn prelude_works() {
        use crate::prelude::*;
        let doc = HtmlParser::parse("<p>prelude</p>").unwrap();
        let _root: NodeRef<'_> = doc.root();
    }
}