html_cleaning/lib.rs
1//! HTML cleaning, sanitization, and text processing utilities.
2//!
3//! This crate provides generic HTML cleaning operations useful for web scraping,
4//! content extraction, and HTML sanitization.
5//!
6//! # Quick Start
7//!
8//! ```
9//! use html_cleaning::{HtmlCleaner, CleaningOptions};
10//! use dom_query::Document;
11//!
12//! // Create a cleaner with custom options
13//! let options = CleaningOptions::builder()
14//! .remove_tags(&["script", "style"])
15//! .build();
16//! let cleaner = HtmlCleaner::with_options(options);
17//!
18//! let html = "<html><body><script>bad</script><p>Hello!</p></body></html>";
19//! let doc = Document::from(html);
20//!
21//! cleaner.clean(&doc);
22//! assert!(doc.select("script").is_empty());
23//! assert!(doc.select("p").exists());
24//! ```
25//!
26//! # Features
27//!
28//! - **HTML Cleaning**: Remove unwanted elements (scripts, styles, forms)
29//! - **Tag Stripping**: Remove tags while preserving text content
30//! - **Text Normalization**: Collapse whitespace, trim text
31//! - **Link Processing**: Make URLs absolute, filter links
32//! - **Content Deduplication**: LRU-based duplicate detection
33//! - **Presets**: Ready-to-use configurations for common scenarios
34//!
35//! # Feature Flags
36//!
37//! | Feature | Default | Description |
38//! |---------|---------|-------------|
39//! | `presets` | Yes | Include prebuilt cleaning configurations |
40//! | `regex` | No | Enable regex-based selectors |
//! | `url` | No | Use the `url` crate for more robust URL parsing (simple string-based fallbacks are used otherwise) |
42//! | `full` | No | Enable all features |
43//!
44//! # Modules
45//!
46//! - [`cleaner`] - Core `HtmlCleaner` and cleaning operations
47//! - [`text`] - Text processing utilities
48//! - [`tree`] - lxml-style text/tail tree manipulation
49//! - [`dom`] - DOM helper utilities
50//! - [`dedup`] - Content deduplication
51//! - [`presets`] - Ready-to-use cleaning configurations (feature: `presets`)
//! - [`links`] - URL and link processing (always available; the `url` feature switches it to `url`-crate parsing)
53
54#![forbid(unsafe_code)]
55#![warn(missing_docs)]
56
57// Core modules - always available
58pub mod cleaner;
59pub mod dedup;
60pub mod dom;
61pub mod error;
62pub mod options;
63pub mod text;
64pub mod tree;
65
66// Feature-gated modules
67#[cfg(feature = "presets")]
68pub mod presets;
69
70// Links module is always available - it provides basic URL utilities without dependencies.
71// When the `url` feature is enabled, it uses the `url` crate for more robust parsing.
72// When disabled, it uses simple string-based fallbacks.
73pub mod links;
74
75
76// Re-export core types
77pub use cleaner::HtmlCleaner;
78pub use error::{Error, Result};
79pub use options::{CleaningOptions, CleaningOptionsBuilder};
80
81// Re-export dom_query types for convenience
82pub use dom_query::{Document, Selection};
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Removing `script` tags from a small fragment must leave no `script`
    /// selection behind while the sibling paragraph is still selectable.
    #[test]
    fn test_basic_cleaning() {
        let html_cleaner = HtmlCleaner::new();
        let document = Document::from("<div><script>bad</script><p>Hello</p></div>");

        html_cleaner.remove_tags(&document, &["script"]);

        let scripts = document.select("script");
        assert!(scripts.is_empty());

        let paragraphs = document.select("p");
        assert!(paragraphs.exists());
    }

    /// Options assembled through the builder are the ones the cleaner
    /// reports back through `options()`.
    #[test]
    fn test_with_options() {
        let html_cleaner = HtmlCleaner::with_options(
            CleaningOptions::builder()
                .remove_tags(&["script", "style"])
                .prune_empty(true)
                .build(),
        );

        let configured = html_cleaner.options();
        assert!(configured.prune_empty);
    }

    /// The standard preset must configure at least one tag for removal.
    #[cfg(feature = "presets")]
    #[test]
    fn test_presets() {
        let preset_options = presets::standard();
        let html_cleaner = HtmlCleaner::with_options(preset_options);

        let removal_list_is_empty = html_cleaner.options().tags_to_remove.is_empty();
        assert!(!removal_list_is_empty);
    }
}
115}