// Copyright (c) 2026 vectorless developers
// SPDX-License-Identifier: Apache-2.0
//! HTML document parser.
//!
//! This module provides an HTML parser that extracts hierarchical structure
//! from HTML documents using heading tags (`<h1>`-`<h6>`) as section markers.
//!
//! # Features
//!
//! - Parses HTML5 documents using `scraper`
//! - Extracts heading hierarchy (`<h1>`-`<h6>`)
//! - Extracts content from paragraphs, lists, tables, etc.
//! - Preserves document structure
//!
//! # Example
//!
//! ```rust
//! use vectorless::parser::html::HtmlParser;
//! use vectorless::parser::DocumentParser;
//!
//! # #[tokio::main]
//! # async fn main() -> vectorless::Result<()> {
//! let parser = HtmlParser::new();
//! let html = r#"
//! <html>
//! <body>
//! <h1>Title</h1>
//! <p>Introduction paragraph.</p>
//! <h2>Section 1</h2>
//! <p>Content for section 1.</p>
//! </body>
//! </html>
//! "#;
//! let result = parser.parse(html).await?;
//! println!("Found {} nodes", result.node_count());
//! # Ok(())
//! # }
//! ```
// Publicly re-export `HtmlConfig` and `HtmlParser` from this module, so callers
// can name them through this module's path (as the module-level example does
// for `HtmlParser`).
// NOTE(review): both `use` paths are bare identifiers with no module qualifier,
// and no `mod` declarations are visible in this view — presumably the original
// paths were of the form `submodule::Item`; confirm against the real file.
pub use HtmlConfig;
pub use HtmlParser;