fast_html_parser/lib.rs
1#![cfg_attr(docsrs, feature(doc_cfg))]
2//! # fast-html-parser — SIMD-Optimized HTML Parser
3//!
4//! A high-performance HTML parser designed for web scraping workloads.
5//! Uses SIMD instructions (SSE4.2, AVX2, NEON) for tokenization and builds
6//! a cache-line aligned arena-based DOM tree for fast traversal.
7//!
8//! ## Quick Start
9//!
10//! ```
11//! use fast_html_parser::HtmlParser;
12//!
13//! let doc = HtmlParser::parse("<div><p>Hello</p></div>").unwrap();
14//! assert_eq!(doc.root().text_content(), "Hello");
15//! ```
16//!
17//! ## Builder Pattern
18//!
19//! ```
20//! use fast_html_parser::HtmlParser;
21//!
22//! let doc = HtmlParser::builder()
23//! .max_input_size(64 * 1024 * 1024) // 64 MiB
24//! .build()
25//! .parse_str("<div>Hello</div>")
26//! .unwrap();
27//! ```
28//!
29//! ## CSS Selectors
30//!
31//! ```
32//! use fast_html_parser::prelude::*;
33//!
34//! let doc = HtmlParser::parse("<ul><li>one</li><li>two</li></ul>").unwrap();
35//! let items = doc.select("li").unwrap();
36//! assert_eq!(items.len(), 2);
37//! ```
38//!
39//! ## Streaming
40//!
41//! ```
42//! use fast_html_parser::streaming::parse_stream;
43//!
44//! let html = b"<div><p>Hello</p></div>";
45//! let doc = parse_stream(html.chunks(8)).unwrap();
46//! assert_eq!(doc.root().text_content(), "Hello");
47//! ```
48//!
49//! ## Feature Flags
50//!
51//! | Feature | Default | Description |
52//! |---|---|---|
53//! | `css-selector` | Yes | CSS selector engine |
54//! | `entity-decode` | Yes | HTML entity decoding |
55//! | `xpath` | No | XPath expression support |
56//! | `encoding` | No | Auto-detect encoding from raw bytes |
57//! | `async-tokio` | No | Async parsing via Tokio |
58
59// ---------------------------------------------------------------------------
60// Re-exports: core types
61// ---------------------------------------------------------------------------
62
63/// Core types: interned tags, entity table, error definitions.
64pub use fhp_core as core_types;
65
66/// Interned HTML tag enum.
67pub use fhp_core::tag::Tag;
68
69/// Tokenizer (low-level).
70pub use fhp_tokenizer as tokenizer;
71
72/// DOM tree types.
73pub use fhp_tree as tree;
74
75/// Parsed document and node reference.
76pub use fhp_tree::{Document, HtmlError, NodeRef};
77
78/// Node identity type.
79pub use fhp_tree::node::NodeId;
80
81/// Streaming and incremental parsing.
82pub mod streaming {
83 pub use fhp_tree::streaming::{EarlyStopParser, ParseStatus, StreamParser, parse_stream};
84}
85
86// ---------------------------------------------------------------------------
87// Conditional re-exports
88// ---------------------------------------------------------------------------
89
90/// CSS selector and XPath engine.
91#[cfg(any(feature = "css-selector", feature = "xpath"))]
92#[cfg_attr(docsrs, doc(cfg(any(feature = "css-selector", feature = "xpath"))))]
93pub use fhp_selector::{CompiledSelector, DocumentIndex, Selectable, Selection};
94
95/// XPath types (re-exported from selector crate).
96#[cfg(feature = "xpath")]
97#[cfg_attr(docsrs, doc(cfg(feature = "xpath")))]
98pub mod xpath {
99 pub use fhp_selector::xpath::ast::XPathResult;
100}
101
102/// Encoding detection and conversion.
103#[cfg(feature = "encoding")]
104#[cfg_attr(docsrs, doc(cfg(feature = "encoding")))]
105pub mod encoding {
106 pub use fhp_encoding::{Encoding, decode, decode_or_detect, detect};
107}
108
109/// Async parser (requires `async-tokio` feature).
110#[cfg(feature = "async-tokio")]
111#[cfg_attr(docsrs, doc(cfg(feature = "async-tokio")))]
112pub mod async_parser {
113 pub use fhp_tree::async_parser::{AsyncParser, parse_async};
114}
115
116// ---------------------------------------------------------------------------
117// Prelude
118// ---------------------------------------------------------------------------
119
120/// Convenience prelude that imports the most commonly used types.
121///
122/// ```
123/// use fast_html_parser::prelude::*;
124/// ```
125pub mod prelude {
126 pub use fhp_tree::node::NodeId;
127 pub use fhp_tree::{Document, HtmlError, NodeRef};
128
129 #[cfg(any(feature = "css-selector", feature = "xpath"))]
130 #[cfg_attr(docsrs, doc(cfg(any(feature = "css-selector", feature = "xpath"))))]
131 pub use fhp_selector::{CompiledSelector, Selectable, Selection};
132
133 pub use crate::HtmlParser;
134}
135
136// ---------------------------------------------------------------------------
137// Builder + HtmlParser
138// ---------------------------------------------------------------------------
139
/// Input-size ceiling, in bytes, used when no explicit limit is configured:
/// 256 MiB (`256 << 20`).
const DEFAULT_MAX_INPUT_SIZE: usize = 256 << 20;
142
/// Configuration builder for the HTML parser.
///
/// Obtain one from [`HtmlParser::builder()`] (or [`Default::default()`]),
/// chain the setters, and finish with [`ParserBuilder::build()`]. Every setter
/// takes the builder by value and hands it back, so a builder that never
/// reaches `build()` has no effect — hence the `#[must_use]`.
///
/// # Example
///
/// ```
/// use fast_html_parser::HtmlParser;
///
/// let parser = HtmlParser::builder()
///     .max_input_size(128 * 1024 * 1024)
///     .fragment_mode(true)
///     .build();
///
/// let doc = parser.parse_str("<p>fragment</p>").unwrap();
/// assert_eq!(doc.root().text_content(), "fragment");
/// ```
#[derive(Debug, Clone)]
#[must_use = "a builder does nothing until `.build()` is called"]
pub struct ParserBuilder {
    /// Largest accepted input length, in bytes.
    max_input_size: usize,
    /// Whether fragment mode was requested.
    fragment_mode: bool,
}
162
163impl Default for ParserBuilder {
164 fn default() -> Self {
165 Self {
166 max_input_size: DEFAULT_MAX_INPUT_SIZE,
167 fragment_mode: false,
168 }
169 }
170}
171
172impl ParserBuilder {
173 /// Set the maximum input size in bytes.
174 ///
175 /// Inputs exceeding this limit will return [`HtmlError::InputTooLarge`].
176 /// Default: 256 MiB.
177 pub fn max_input_size(mut self, size: usize) -> Self {
178 self.max_input_size = size;
179 self
180 }
181
182 /// Enable fragment mode.
183 ///
184 /// In fragment mode the parser treats input as an HTML fragment rather
185 /// than a full document. Currently this behaves identically to normal
186 /// mode (the parser already handles fragments gracefully).
187 pub fn fragment_mode(mut self, enabled: bool) -> Self {
188 self.fragment_mode = enabled;
189 self
190 }
191
192 /// Consume the builder and create a configured [`HtmlParser`].
193 pub fn build(self) -> HtmlParser {
194 HtmlParser {
195 max_input_size: self.max_input_size,
196 _fragment_mode: self.fragment_mode,
197 }
198 }
199}
200
/// A configured HTML parser instance.
///
/// Use [`HtmlParser::builder()`] when you need custom settings. For the
/// defaults, the associated functions [`HtmlParser::parse()`] and
/// [`HtmlParser::parse_bytes()`] can be called without constructing a parser.
///
/// The type holds only plain configuration values, so it derives `Debug` and
/// `Clone` and one configured parser can be logged or copied freely.
///
/// # Example
///
/// ```
/// use fast_html_parser::HtmlParser;
///
/// // One-shot convenience
/// let doc = HtmlParser::parse("<p>Hello</p>").unwrap();
///
/// // Builder pattern
/// let parser = HtmlParser::builder()
///     .max_input_size(1024 * 1024)
///     .build();
/// let doc = parser.parse_str("<p>World</p>").unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct HtmlParser {
    /// Largest input length, in bytes, accepted by the `&self` parse methods.
    max_input_size: usize,
    /// Fragment-mode flag copied from the builder; stored but not read by
    /// any parse method yet (hence the leading underscore).
    _fragment_mode: bool,
}
225
226impl HtmlParser {
227 /// Create a new [`ParserBuilder`].
228 pub fn builder() -> ParserBuilder {
229 ParserBuilder::default()
230 }
231
232 /// Parse an HTML string with default settings.
233 ///
234 /// This is a convenience wrapper around `fhp_tree::parse()`.
235 ///
236 /// # Errors
237 ///
238 /// Returns [`HtmlError::InputTooLarge`] if the input exceeds 256 MiB.
239 ///
240 /// # Example
241 ///
242 /// ```
243 /// use fast_html_parser::HtmlParser;
244 ///
245 /// let doc = HtmlParser::parse("<div><p>Hello</p></div>").unwrap();
246 /// assert_eq!(doc.root().text_content(), "Hello");
247 /// ```
248 pub fn parse(input: &str) -> Result<Document, HtmlError> {
249 fhp_tree::parse(input)
250 }
251
252 /// Parse an owned `String` with default settings, transferring the allocation.
253 ///
254 /// Avoids a memcpy of the source bytes when the caller already owns the
255 /// input (e.g., from an HTTP response body).
256 ///
257 /// # Errors
258 ///
259 /// Returns [`HtmlError::InputTooLarge`] if the input exceeds 256 MiB.
260 ///
261 /// # Example
262 ///
263 /// ```
264 /// use fast_html_parser::HtmlParser;
265 ///
266 /// let html = String::from("<div><p>Hello</p></div>");
267 /// let doc = HtmlParser::parse_owned(html).unwrap();
268 /// assert_eq!(doc.root().text_content(), "Hello");
269 /// ```
270 pub fn parse_owned(input: String) -> Result<Document, HtmlError> {
271 fhp_tree::parse_owned(input)
272 }
273
274 /// Parse raw bytes with default settings, auto-detecting encoding.
275 ///
276 /// # Errors
277 ///
278 /// Returns [`HtmlError::InputTooLarge`] or [`HtmlError::Encoding`] on
279 /// failure.
280 ///
281 /// # Example
282 ///
283 /// ```
284 /// use fast_html_parser::HtmlParser;
285 ///
286 /// let doc = HtmlParser::parse_bytes(b"<p>Hello</p>").unwrap();
287 /// assert_eq!(doc.root().text_content(), "Hello");
288 /// ```
289 pub fn parse_bytes(input: &[u8]) -> Result<Document, HtmlError> {
290 fhp_tree::parse_bytes(input)
291 }
292
293 /// Parse an HTML string with the current configuration.
294 ///
295 /// # Errors
296 ///
297 /// Returns [`HtmlError::InputTooLarge`] if the input exceeds the
298 /// configured limit.
299 pub fn parse_str(&self, input: &str) -> Result<Document, HtmlError> {
300 if input.len() > self.max_input_size {
301 return Err(HtmlError::InputTooLarge {
302 size: input.len(),
303 max: self.max_input_size,
304 });
305 }
306 fhp_tree::parse(input)
307 }
308
309 /// Parse an owned `String` with the current configuration.
310 ///
311 /// Avoids a memcpy of the source bytes when the caller already owns the
312 /// input (e.g., from an HTTP response body).
313 ///
314 /// # Errors
315 ///
316 /// Returns [`HtmlError::InputTooLarge`] if the input exceeds the
317 /// configured limit.
318 pub fn parse_str_owned(&self, input: String) -> Result<Document, HtmlError> {
319 if input.len() > self.max_input_size {
320 return Err(HtmlError::InputTooLarge {
321 size: input.len(),
322 max: self.max_input_size,
323 });
324 }
325 fhp_tree::parse_owned(input)
326 }
327
328 /// Parse raw bytes with the current configuration, auto-detecting encoding.
329 ///
330 /// # Errors
331 ///
332 /// Returns [`HtmlError::InputTooLarge`] or [`HtmlError::Encoding`] on
333 /// failure.
334 pub fn parse_raw(&self, input: &[u8]) -> Result<Document, HtmlError> {
335 if input.len() > self.max_input_size {
336 return Err(HtmlError::InputTooLarge {
337 size: input.len(),
338 max: self.max_input_size,
339 });
340 }
341 fhp_tree::parse_bytes(input)
342 }
343}
344
345/// Parse an HTML string with default settings (convenience alias).
346///
347/// # Example
348///
349/// ```
350/// let doc = fast_html_parser::parse("<p>Quick</p>").unwrap();
351/// assert_eq!(doc.root().text_content(), "Quick");
352/// ```
353pub fn parse(input: &str) -> Result<Document, HtmlError> {
354 HtmlParser::parse(input)
355}
356
357/// Parse an owned `String` with default settings, transferring the allocation.
358///
359/// # Example
360///
361/// ```
362/// let doc = fast_html_parser::parse_owned(String::from("<p>Quick</p>")).unwrap();
363/// assert_eq!(doc.root().text_content(), "Quick");
364/// ```
365pub fn parse_owned(input: String) -> Result<Document, HtmlError> {
366 HtmlParser::parse_owned(input)
367}
368
369/// Parse raw bytes with default settings, auto-detecting encoding.
370///
371/// # Example
372///
373/// ```
374/// let doc = fast_html_parser::parse_bytes(b"<p>Quick</p>").unwrap();
375/// assert_eq!(doc.root().text_content(), "Quick");
376/// ```
377pub fn parse_bytes(input: &[u8]) -> Result<Document, HtmlError> {
378 HtmlParser::parse_bytes(input)
379}
380
381// ---------------------------------------------------------------------------
382// Tests
383// ---------------------------------------------------------------------------
384
#[cfg(test)]
mod tests {
    use super::*;

    /// Fails the test unless `result` is `InputTooLarge` carrying exactly
    /// the given `size` and `max`; a bare `is_err()` would also accept
    /// unrelated errors.
    fn assert_too_large(result: Result<Document, HtmlError>, size: usize, max: usize) {
        match result {
            Err(HtmlError::InputTooLarge {
                size: got_size,
                max: got_max,
                ..
            }) => assert_eq!((got_size, got_max), (size, max)),
            Err(_) => panic!("expected InputTooLarge, got a different error"),
            Ok(_) => panic!("expected InputTooLarge, got a parsed document"),
        }
    }

    // -- crate-level convenience functions ---------------------------------

    #[test]
    fn parse_convenience() {
        let doc = parse("<div><p>Hello</p></div>").unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    #[test]
    fn parse_owned_convenience() {
        let doc = parse_owned(String::from("<div><p>Hello</p></div>")).unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    #[test]
    fn parse_bytes_convenience() {
        let doc = parse_bytes(b"<div><p>Hello</p></div>").unwrap();
        assert_eq!(doc.root().text_content(), "Hello");
    }

    // -- builder-configured parser -----------------------------------------

    #[test]
    fn builder_default() {
        let parser = HtmlParser::builder().build();
        let doc = parser.parse_str("<p>ok</p>").unwrap();
        assert_eq!(doc.root().text_content(), "ok");
    }

    #[test]
    fn builder_max_input_size() {
        let input = "<p>this is too long</p>";
        let parser = HtmlParser::builder().max_input_size(10).build();
        assert_too_large(parser.parse_str(input), input.len(), 10);
    }

    #[test]
    fn builder_accepts_input_at_exact_limit() {
        // The guard is `len > max`, so an input of exactly `max` bytes passes.
        let input = "<p>ok</p>";
        let parser = HtmlParser::builder().max_input_size(input.len()).build();
        let doc = parser.parse_str(input).unwrap();
        assert_eq!(doc.root().text_content(), "ok");
    }

    #[test]
    fn builder_fragment_mode() {
        let parser = HtmlParser::builder().fragment_mode(true).build();
        let doc = parser.parse_str("<li>item</li>").unwrap();
        assert_eq!(doc.root().text_content(), "item");
    }

    #[test]
    fn builder_parse_str_owned() {
        let parser = HtmlParser::builder().build();
        let doc = parser.parse_str_owned(String::from("<p>owned</p>")).unwrap();
        assert_eq!(doc.root().text_content(), "owned");
    }

    #[test]
    fn builder_parse_str_owned_too_large() {
        let input = String::from("<p>too large</p>");
        let len = input.len();
        let parser = HtmlParser::builder().max_input_size(5).build();
        assert_too_large(parser.parse_str_owned(input), len, 5);
    }

    #[test]
    fn builder_parse_raw() {
        let parser = HtmlParser::builder().build();
        let doc = parser.parse_raw(b"<p>bytes</p>").unwrap();
        assert_eq!(doc.root().text_content(), "bytes");
    }

    #[test]
    fn builder_parse_raw_too_large() {
        let input: &[u8] = b"<p>too large</p>";
        let parser = HtmlParser::builder().max_input_size(5).build();
        assert_too_large(parser.parse_raw(input), input.len(), 5);
    }

    // -- associated (static) functions -------------------------------------

    #[test]
    fn static_parse_method() {
        let doc = HtmlParser::parse("<b>bold</b>").unwrap();
        assert_eq!(doc.root().text_content(), "bold");
    }

    #[test]
    fn static_parse_owned_method() {
        let doc = HtmlParser::parse_owned(String::from("<u>under</u>")).unwrap();
        assert_eq!(doc.root().text_content(), "under");
    }

    #[test]
    fn static_parse_bytes_method() {
        let doc = HtmlParser::parse_bytes(b"<i>italic</i>").unwrap();
        assert_eq!(doc.root().text_content(), "italic");
    }

    // -- re-exports ----------------------------------------------------------

    #[cfg(feature = "css-selector")]
    #[test]
    fn selector_reexport() {
        let doc = HtmlParser::parse("<div><p>Hello</p></div>").unwrap();
        let sel = doc.select("p").unwrap();
        assert_eq!(sel.len(), 1);
    }

    #[test]
    fn streaming_reexport() {
        let doc = streaming::parse_stream(b"<p>stream</p>".chunks(4)).unwrap();
        assert_eq!(doc.root().text_content(), "stream");
    }

    #[test]
    fn node_ref_access() {
        let doc = parse("<a href=\"url\">link</a>").unwrap();
        let root = doc.root();
        let a = root.first_child().unwrap();
        assert_eq!(a.tag(), Tag::A);
        assert_eq!(a.attr("href"), Some("url"));
    }

    #[test]
    fn prelude_works() {
        use crate::prelude::*;
        let doc = HtmlParser::parse("<p>prelude</p>").unwrap();
        let _root: NodeRef<'_> = doc.root();
    }
}
477}