scrape_core/lib.rs
//! # scrape-core
//!
//! High-performance HTML parsing library with CSS selector support.
//!
//! This crate provides the core functionality for parsing HTML documents
//! and querying them using CSS selectors. It is designed to be fast,
//! memory-efficient, and spec-compliant.
//!
//! ## Quick Start
//!
//! ```rust
//! use scrape_core::{Html5everParser, Parser, Soup, SoupConfig};
//!
//! // Parse HTML using Soup (high-level API)
//! let html = "<html><body><div class=\"product\">Hello</div></body></html>";
//! let soup = Soup::parse(html);
//!
//! // Find elements using CSS selectors
//! if let Ok(Some(div)) = soup.find("div.product") {
//!     assert_eq!(div.text(), "Hello");
//! }
//!
//! // Or use the parser directly (low-level API)
//! let parser = Html5everParser;
//! let document = parser.parse(html).unwrap();
//! assert!(document.root().is_some());
//! ```
//!
//! ## Features
//!
//! - **Fast parsing**: Built on `html5ever` for spec-compliant HTML5 parsing
//! - **CSS selectors**: Full CSS selector support via the `selectors` crate
//! - **Memory efficient**: Arena-based allocation for DOM nodes
//! - **SIMD acceleration**: Optional SIMD support for faster byte scanning
//!
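//! The `parallel` and `simd` modules are gated behind Cargo features of the
//! same names, so they are absent from a default build. A minimal sketch of
//! consuming them conditionally (hypothetical downstream code; it assumes the
//! calling crate forwards features with the same names to this crate):
//!
//! ```rust
//! // Compiled only when the corresponding feature is enabled.
//! #[cfg(feature = "parallel")]
//! use scrape_core::parallel;
//!
//! #[cfg(feature = "simd")]
//! use scrape_core::simd;
//! ```
//!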
//! ## CSS Selector Support
//!
//! The query engine supports most CSS3 selectors:
//!
//! ```rust
//! use scrape_core::Soup;
//!
//! let html = r#"
//!     <div class="container">
//!         <ul id="list">
//!             <li class="item active">One</li>
//!             <li class="item">Two</li>
//!             <li class="item">Three</li>
//!         </ul>
//!     </div>
//! "#;
//! let soup = Soup::parse(html);
//!
//! // Type selector
//! let divs = soup.find_all("div").unwrap();
//!
//! // Class selector
//! let items = soup.find_all(".item").unwrap();
//!
//! // ID selector
//! let list = soup.find("#list").unwrap();
//!
//! // Compound selector
//! let active = soup.find("li.item.active").unwrap();
//!
//! // Descendant combinator
//! let nested = soup.find_all("div li").unwrap();
//!
//! // Child combinator
//! let direct = soup.find_all("ul > li").unwrap();
//!
//! // Attribute selectors
//! let with_id = soup.find_all("[id]").unwrap();
//! ```
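//!
//! Because `find` and `find_all` return a `Result`, a malformed selector can be
//! handled without panicking. A minimal sketch (it assumes an unparseable
//! selector surfaces as an `Err` and that the error type implements `Display`):
//!
//! ```rust
//! use scrape_core::Soup;
//!
//! let soup = Soup::parse("<p>Hi</p>");
//!
//! // "p[unclosed" is not valid CSS, so the query should fail cleanly.
//! match soup.find("p[unclosed") {
//!     Ok(Some(node)) => println!("matched: {}", node.text()),
//!     Ok(None) => println!("no match"),
//!     Err(e) => eprintln!("selector rejected: {e}"),
//! }
//! ```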

#![warn(missing_docs)]
#![warn(clippy::all)]
#![warn(clippy::pedantic)]

mod dom;
mod error;
#[cfg(feature = "parallel")]
pub mod parallel;
mod parser;
pub mod query;
pub mod serialize;
#[cfg(feature = "simd")]
pub mod simd;
mod soup;
mod tag;
pub mod utils;

// DOM types
pub use dom::{
    AncestorsIter, Building, ChildrenIter, CommentMarker, DescendantsIter, Document, DocumentImpl,
    DocumentIndex, DocumentState, ElementAncestorsIter, ElementChildrenIter,
    ElementDescendantsIter, ElementMarker, ElementNextSiblingsIter, ElementPrevSiblingsIter,
    ElementSiblingsIter, MutableState, NextSiblingsIter, Node, NodeId, NodeKind, NodeType,
    PrevSiblingsIter, Queryable, QueryableState, Sealed, SiblingsIter, TagId, TextMarker,
};
// Error types
pub use error::{Error, Result};
// Parser types
pub use parser::{Html5everParser, ParseConfig, ParseError, ParseResult, Parser};
// Query types
pub use query::{
    CompiledSelector, Filter, QueryError, QueryResult, TextNodesIter, compile_selector,
};
// Serialization utilities
pub use serialize::{HtmlSerializer, collect_text, serialize_inner_html, serialize_node};
// High-level API
pub use soup::{Soup, SoupConfig};
pub use tag::Tag;
// HTML utilities
pub use utils::{escape_attr, escape_text, is_void_element};