scah/lib.rs
1//! # scah - Streaming CSS-selector-driven HTML extraction
2//!
3//! **scah** (*scan HTML*) is a high-performance parsing library that bridges the gap
4//! between SAX/StAX streaming efficiency and DOM convenience. Instead of loading an
5//! entire document into memory or manually tracking parser state, you declare what
6//! you want with **CSS selectors**; the library handles the streaming complexity and
7//! builds a targeted [`Store`] containing only your selections.
8//!
9//! ## Highlights
10//!
11//! | Feature | Detail |
12//! |---------|--------|
13//! | **Streaming core** | Built on StAX: constant memory regardless of document size |
14//! | **Familiar API** | CSS selectors including `>` (child) and ` ` (descendant) combinators |
15//! | **Composable queries** | Chain selections with [`QueryBuilder::then`] for hierarchical data extraction |
16//! | **Zero-copy** | Element names, attributes, and inner HTML are `&str` slices into the source |
17//! | **Multi-language** | Rust core with Python and TypeScript/JavaScript bindings |
18//!
19//! ## Quick Start
20//!
21//! ```rust
22//! use scah::{Query, Save, parse};
23//!
24//! let html = r#"
25//! <main>
26//! <section>
27//! <a href="link1">Link 1</a>
28//! <a href="link2">Link 2</a>
29//! </section>
30//! </main>
31//! "#;
32//!
//! // Build a query: find every <a> element that has an href attribute and is a
//! // direct child of a <section> that is itself a direct child of <main>.
35//! let queries = &[
36//! Query::all("main > section > a[href]", Save::all())
37//! .expect("valid selector")
38//! .build()
39//! ];
40//!
41//! let store = parse(html, queries);
42//!
43//! // Iterate over matched elements
44//! for element in store.get("main > section > a[href]").unwrap() {
45//! println!("{}: {}", element.name, element.attribute(&store, "href").unwrap());
46//! }
47//! ```
48//!
49//! ## Structured Querying with `.then()`
50//!
51//! Instead of flat filtering, you can nest queries using closures.
52//! Child queries only run within the context of their parent match,
53//! making extraction of hierarchical relationships both efficient and ergonomic:
54//!
55//! ```rust
56//! use scah::{Query, Save, parse};
57//!
58//! # let html = "<main><section><a href='x'>Link</a></section></main>";
59//! let queries = &[Query::all("main > section", Save::all())
60//! .expect("valid selector")
61//! .then(|section| {
62//! Ok([
63//! section.all("> a[href]", Save::all())?,
64//! section.all("div a", Save::all())?,
65//! ])
66//! })
67//! .expect("valid child selectors")
68//! .build()];
69//!
70//! let store = parse(html, queries);
71//! ```
72//!
73//! ## Architecture
74//!
75//! Internally, scah is composed of the following layers:
76//!
77//! 1. **[`Reader`]**: A zero-copy byte-level cursor over the HTML source.
78//! 2. **CSS selector compiler**: Parses selector strings into a compact
79//! automaton of [`Query`] transitions.
80//! 3. **[`XHtmlParser`]**: A streaming StAX parser that emits open/close events.
81//! 4. **[`QueryMultiplexer`]**: Drives one or more query executors against
82//! the token stream simultaneously.
83//! 5. **[`Store`]**: An arena-based result set that collects matched
84//! [`Element`]s, their attributes, and (optionally) inner HTML / text content.
85//!
86//! ## Supported CSS Selector Syntax
87//!
88//! | Syntax | Example | Status |
89//! |--------|---------|--------|
90//! | **Tag name** | `a`, `div` | Working |
91//! | **ID** | `#my-id` | Working |
92//! | **Class** | `.my-class` | Working |
93//! | **Descendant combinator** | `main section a` | Working |
94//! | **Child combinator** | `main > section` | Working |
95//! | **Attribute presence** | `a[href]` | Working |
96//! | **Attribute exact match** | `a[href="url"]` | Working |
97//! | **Attribute prefix** | `a[href^="https"]` | Working |
98//! | **Attribute suffix** | `a[href$=".com"]` | Working |
99//! | **Attribute substring** | `a[href*="example"]` | Working |
100//! | **Adjacent sibling** | `h1 + p` | Coming soon |
101//! | **General sibling** | `h1 ~ p` | Coming soon |
102
103mod engine;
104mod html;
105mod store;
106mod support;
107
108pub use engine::multiplexer::QueryMultiplexer;
109pub use html::element::builder::XHtmlElement;
110pub use html::parser::XHtmlParser;
111pub use scah_macros::query;
112pub use scah_query_ir::lazy;
113pub use scah_query_ir::{
114 Attribute, AttributeSelection, AttributeSelectionKind, AttributeSelections, ClassSelections,
115 Combinator, ElementPredicate, IElement, Position, Query, QueryBuilder, QueryFactory,
116 QuerySection, QuerySpec, Save, SelectionKind, SelectorParseError, StaticQuery, Transition,
117};
118pub use scah_reader::Reader;
119pub use store::{Element, ElementId, Store};
120
121/// Parse an HTML string against one or more pre-built [`Query`] objects and
122/// return a [`Store`] containing all matched elements.
123///
124/// This is the main entry point of scah. It wires together the streaming
125/// [`XHtmlParser`], the [`QueryMultiplexer`], and the result [`Store`].
126///
127/// # Parameters
128///
129/// - `html`: The HTML source string. All returned string slices in the
130/// resulting [`Store`] borrow directly from this string (zero-copy).
131/// - `queries`: A slice of compiled [`Query`] objects. Each query is
132/// executed concurrently against the same token stream in a single pass.
133///
134/// # Returns
135///
136/// A [`Store`] containing all matched elements. Use [`Store::get`] with the
137/// original selector string to retrieve results for a specific query.
138///
139/// # Example
140///
141/// ```rust
142/// use scah::{Query, Save, parse};
143///
144/// let html = "<div><a href='link'>Hello</a></div>";
145/// let queries = &[Query::all("a", Save::all())
146/// .expect("valid selector")
147/// .build()];
148/// let store = parse(html, queries);
149///
150/// let links: Vec<_> = store.get("a").unwrap().collect();
151/// assert_eq!(links.len(), 1);
152/// assert_eq!(links[0].name, "a");
153/// ```
154pub fn parse<'a: 'query, 'html: 'query, 'query: 'html, Q>(
155 html: &'html str,
156 queries: &'a [Q],
157) -> Store<'html, 'query>
158where
159 Q: QuerySpec<'query>,
160{
161 let selectors = QueryMultiplexer::new(queries);
162
163 let no_extra_allocations = queries.iter().all(|q| q.exit_at_section_end().is_some());
164 let mut parser = if no_extra_allocations {
165 XHtmlParser::new(selectors)
166 } else {
167 XHtmlParser::with_capacity(selectors, html.len())
168 };
169
170 let mut reader = Reader::new(html);
171 while parser.next(&mut reader) {}
172
173 parser.matches()
174}