Skip to main content

html_cleaning/
presets.rs

//! Prebuilt cleaning configurations.
//!
//! Ready-to-use presets for common cleaning scenarios.
4
5use crate::options::CleaningOptions;
6
7/// Minimal cleaning - just scripts and styles.
8///
9/// Removes:
10/// - `script`, `style`, `noscript`
11///
12/// Best for: Quick sanitization, preserving most structure.
13///
14/// # Example
15///
16/// ```
17/// use html_cleaning::{HtmlCleaner, presets};
18///
19/// let cleaner = HtmlCleaner::with_options(presets::minimal());
20/// ```
21#[must_use]
22pub fn minimal() -> CleaningOptions {
23    CleaningOptions {
24        tags_to_remove: vec![
25            "script".to_string(),
26            "style".to_string(),
27            "noscript".to_string(),
28        ],
29        prune_empty: false,
30        normalize_whitespace: false,
31        ..Default::default()
32    }
33}
34
35/// Standard cleaning for web scraping.
36///
37/// Removes:
38/// - `script`, `style`, `noscript`
39/// - `form`, `iframe`, `object`, `embed`
40/// - `svg`, `canvas`, `video`, `audio`
41///
42/// Enables:
43/// - `prune_empty`
44/// - `normalize_whitespace`
45///
46/// Best for: General web scraping, content display.
47///
48/// # Example
49///
50/// ```
51/// use html_cleaning::{HtmlCleaner, presets};
52///
53/// let cleaner = HtmlCleaner::with_options(presets::standard());
54/// ```
55#[must_use]
56pub fn standard() -> CleaningOptions {
57    CleaningOptions {
58        tags_to_remove: vec![
59            "script".to_string(),
60            "style".to_string(),
61            "noscript".to_string(),
62            "form".to_string(),
63            "iframe".to_string(),
64            "object".to_string(),
65            "embed".to_string(),
66            "svg".to_string(),
67            "canvas".to_string(),
68            "video".to_string(),
69            "audio".to_string(),
70        ],
71        prune_empty: true,
72        normalize_whitespace: true,
73        ..Default::default()
74    }
75}
76
77/// Aggressive cleaning for maximum content extraction.
78///
79/// Includes everything in `standard()` plus:
80/// - Removes: `nav`, `header`, `footer`, `aside`, `figure`, `figcaption`
81/// - Enables: `strip_attributes` (preserves `href`, `src`, `alt`)
82///
83/// Best for: Text extraction, removing all non-content elements.
84///
85/// # Example
86///
87/// ```
88/// use html_cleaning::{HtmlCleaner, presets};
89///
90/// let cleaner = HtmlCleaner::with_options(presets::aggressive());
91/// ```
92#[must_use]
93pub fn aggressive() -> CleaningOptions {
94    CleaningOptions {
95        tags_to_remove: vec![
96            // Standard tags
97            "script".to_string(),
98            "style".to_string(),
99            "noscript".to_string(),
100            "form".to_string(),
101            "iframe".to_string(),
102            "object".to_string(),
103            "embed".to_string(),
104            "svg".to_string(),
105            "canvas".to_string(),
106            "video".to_string(),
107            "audio".to_string(),
108            // Layout tags
109            "nav".to_string(),
110            "header".to_string(),
111            "footer".to_string(),
112            "aside".to_string(),
113            "figure".to_string(),
114            "figcaption".to_string(),
115        ],
116        prune_empty: true,
117        normalize_whitespace: true,
118        strip_attributes: true,
119        preserved_attributes: vec![
120            "href".to_string(),
121            "src".to_string(),
122            "alt".to_string(),
123        ],
124        ..Default::default()
125    }
126}
127
128/// Article extraction preset.
129///
130/// Optimized for extracting article content:
131/// - Removes navigation and layout elements
132/// - Strips wrapper tags (`div`, `span`) while preserving content
133/// - Removes common advertisement selectors
134///
135/// Best for: News articles, blog posts, content pages.
136///
137/// # Example
138///
139/// ```
140/// use html_cleaning::{HtmlCleaner, presets};
141///
142/// let cleaner = HtmlCleaner::with_options(presets::article_extraction());
143/// ```
144#[must_use]
145pub fn article_extraction() -> CleaningOptions {
146    CleaningOptions {
147        tags_to_remove: vec![
148            "script".to_string(),
149            "style".to_string(),
150            "noscript".to_string(),
151            "form".to_string(),
152            "iframe".to_string(),
153            "object".to_string(),
154            "embed".to_string(),
155            "svg".to_string(),
156            "canvas".to_string(),
157            "video".to_string(),
158            "audio".to_string(),
159            "nav".to_string(),
160            "footer".to_string(),
161        ],
162        tags_to_strip: vec!["span".to_string(), "div".to_string()],
163        selectors_to_remove: vec![
164            "[role='navigation']".to_string(),
165            "[role='banner']".to_string(),
166            "[role='complementary']".to_string(),
167            ".advertisement".to_string(),
168            ".ads".to_string(),
169            "#comments".to_string(),
170            ".comments".to_string(),
171        ],
172        prune_empty: true,
173        normalize_whitespace: true,
174        empty_tags: vec![
175            "div".to_string(),
176            "span".to_string(),
177            "p".to_string(),
178            "section".to_string(),
179        ],
180        ..Default::default()
181    }
182}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Reports whether `wanted` appears among `entries`.
    fn has_entry(entries: &[String], wanted: &str) -> bool {
        entries.iter().any(|entry| entry == wanted)
    }

    /// `minimal()` removes exactly three tags and leaves pruning off.
    #[test]
    fn test_minimal_preset() {
        let preset = minimal();
        let removed_count = preset.tags_to_remove.len();
        assert_eq!(removed_count, 3);
        assert!(!preset.prune_empty);
    }

    /// `standard()` removes more than five tags and enables both cleanup flags.
    #[test]
    fn test_standard_preset() {
        let preset = standard();
        let removed_count = preset.tags_to_remove.len();
        assert!(removed_count > 5);
        assert!(preset.prune_empty);
        assert!(preset.normalize_whitespace);
    }

    /// `aggressive()` removes `nav`, strips attributes and keeps `href`.
    #[test]
    fn test_aggressive_preset() {
        let preset = aggressive();
        assert!(has_entry(&preset.tags_to_remove, "nav"));
        assert!(preset.strip_attributes);
        assert!(has_entry(&preset.preserved_attributes, "href"));
    }

    /// `article_extraction()` sets removal selectors and strips `span`.
    #[test]
    fn test_article_extraction_preset() {
        let preset = article_extraction();
        assert!(!preset.selectors_to_remove.is_empty());
        assert!(has_entry(&preset.tags_to_strip, "span"));
    }
}
217}