html_cleaning/
presets.rs

//! Prebuilt cleaning configurations.
//!
//! Ready-to-use presets for common cleaning scenarios.
4
5use crate::options::CleaningOptions;
6
7/// Minimal cleaning - just scripts and styles.
8///
9/// Removes:
10/// - `script`, `style`, `noscript`
11///
12/// Best for: Quick sanitization, preserving most structure.
13///
14/// # Example
15///
16/// ```
17/// use html_cleaning::{HtmlCleaner, presets};
18///
19/// let cleaner = HtmlCleaner::with_options(presets::minimal());
20/// ```
21#[must_use]
22pub fn minimal() -> CleaningOptions {
23    CleaningOptions {
24        tags_to_remove: vec![
25            "script".to_string(),
26            "style".to_string(),
27            "noscript".to_string(),
28        ],
29        prune_empty: false,
30        normalize_whitespace: false,
31        ..Default::default()
32    }
33}
34
35/// Standard cleaning for web scraping.
36///
37/// Removes:
38/// - `script`, `style`, `noscript`
39/// - `form`, `iframe`, `object`, `embed`
40/// - `svg`, `canvas`, `video`, `audio`
41///
42/// Enables:
43/// - `prune_empty`
44/// - `normalize_whitespace`
45///
46/// Best for: General web scraping, content display.
47///
48/// # Example
49///
50/// ```
51/// use html_cleaning::{HtmlCleaner, presets};
52///
53/// let cleaner = HtmlCleaner::with_options(presets::standard());
54/// ```
55#[must_use]
56pub fn standard() -> CleaningOptions {
57    CleaningOptions {
58        tags_to_remove: vec![
59            "script".to_string(),
60            "style".to_string(),
61            "noscript".to_string(),
62            "form".to_string(),
63            "iframe".to_string(),
64            "object".to_string(),
65            "embed".to_string(),
66            "svg".to_string(),
67            "canvas".to_string(),
68            "video".to_string(),
69            "audio".to_string(),
70        ],
71        prune_empty: true,
72        normalize_whitespace: true,
73        ..Default::default()
74    }
75}
76
77/// Aggressive cleaning for maximum content extraction.
78///
79/// Includes everything in `standard()` plus:
80/// - Removes: `nav`, `header`, `footer`, `aside`, `figure`, `figcaption`
81/// - Enables: `strip_attributes` (preserves `href`, `src`, `alt`)
82///
83/// Best for: Text extraction, removing all non-content elements.
84///
85/// # Example
86///
87/// ```
88/// use html_cleaning::{HtmlCleaner, presets};
89///
90/// let cleaner = HtmlCleaner::with_options(presets::aggressive());
91/// ```
92#[must_use]
93pub fn aggressive() -> CleaningOptions {
94    CleaningOptions {
95        tags_to_remove: vec![
96            // Standard tags
97            "script".to_string(),
98            "style".to_string(),
99            "noscript".to_string(),
100            "form".to_string(),
101            "iframe".to_string(),
102            "object".to_string(),
103            "embed".to_string(),
104            "svg".to_string(),
105            "canvas".to_string(),
106            "video".to_string(),
107            "audio".to_string(),
108            // Layout tags
109            "nav".to_string(),
110            "header".to_string(),
111            "footer".to_string(),
112            "aside".to_string(),
113            "figure".to_string(),
114            "figcaption".to_string(),
115        ],
116        prune_empty: true,
117        normalize_whitespace: true,
118        strip_attributes: true,
119        preserved_attributes: vec![
120            "href".to_string(),
121            "src".to_string(),
122            "alt".to_string(),
123        ],
124        ..Default::default()
125    }
126}
127
128/// Article extraction preset.
129///
130/// Optimized for extracting article content:
131/// - Removes navigation and layout elements
132/// - Strips wrapper tags (`div`, `span`) while preserving content
133/// - Removes common advertisement selectors
134///
135/// Best for: News articles, blog posts, content pages.
136///
137/// # Example
138///
139/// ```
140/// use html_cleaning::{HtmlCleaner, presets};
141///
142/// let cleaner = HtmlCleaner::with_options(presets::article_extraction());
143/// ```
144#[must_use]
145pub fn article_extraction() -> CleaningOptions {
146    CleaningOptions {
147        tags_to_remove: vec![
148            "script".to_string(),
149            "style".to_string(),
150            "noscript".to_string(),
151            "form".to_string(),
152            "iframe".to_string(),
153            "object".to_string(),
154            "embed".to_string(),
155            "svg".to_string(),
156            "canvas".to_string(),
157            "video".to_string(),
158            "audio".to_string(),
159            "nav".to_string(),
160            "footer".to_string(),
161        ],
162        tags_to_strip: vec!["span".to_string(), "div".to_string()],
163        selectors_to_remove: vec![
164            "[role='navigation']".to_string(),
165            "[role='banner']".to_string(),
166            "[role='complementary']".to_string(),
167            ".advertisement".to_string(),
168            ".ads".to_string(),
169            "#comments".to_string(),
170            ".comments".to_string(),
171        ],
172        prune_empty: true,
173        normalize_whitespace: true,
174        empty_tags: vec![
175            "div".to_string(),
176            "span".to_string(),
177            "p".to_string(),
178            "section".to_string(),
179        ],
180        ..Default::default()
181    }
182}
183
184/// Trafilatura-compatible cleaning preset.
185///
186/// Matches the tag removal, stripping, and pruning behavior used by
187/// rs-trafilatura for web content extraction. Removes 50 non-content tags,
188/// strips 18 wrapper tags, and prunes 22 tag types when empty.
189///
190/// This preset handles pure HTML cleaning only — extraction-specific logic
191/// (link density analysis, boilerplate detection, content scoring) is NOT
192/// included and should be handled by the extraction pipeline.
193///
194/// Removes:
195/// - Script/style/noscript, forms, iframes, embeds, media elements
196/// - Navigation, footer, aside, menus
197/// - UI elements (buttons, inputs, selects, dialogs)
198/// - Non-content elements (applet, marquee, math, svg, canvas)
199///
200/// Strips (keeps children):
201/// - Wrapper/formatting tags: abbr, acronym, address, bdi, bdo, big, cite,
202///   data, dfn, font, hgroup, img, ins, mark, meta, ruby, small, template
203///
204/// Prunes empty:
205/// - p, div, span, h1-h6, blockquote, article, section, main, li, dd, dt,
206///   em, i, b, strong, pre, q
207///
208/// Also: removes HTML comments, normalizes whitespace.
209///
210/// # Example
211///
212/// ```
213/// use html_cleaning::{HtmlCleaner, presets};
214///
215/// let cleaner = HtmlCleaner::with_options(presets::trafilatura());
216/// ```
217#[must_use]
218pub fn trafilatura() -> CleaningOptions {
219    CleaningOptions {
220        tags_to_remove: vec![
221            // Important
222            "aside".into(), "embed".into(), "footer".into(), "form".into(),
223            "head".into(), "iframe".into(), "menu".into(), "object".into(),
224            "script".into(),
225            // Other content
226            "applet".into(), "audio".into(), "canvas".into(), "figure".into(),
227            "map".into(), "picture".into(), "svg".into(), "video".into(),
228            // Secondary
229            "area".into(), "blink".into(), "button".into(), "datalist".into(),
230            "dialog".into(), "frame".into(), "frameset".into(), "fieldset".into(),
231            "link".into(), "input".into(), "ins".into(), "label".into(),
232            "legend".into(), "marquee".into(), "math".into(), "menuitem".into(),
233            "nav".into(), "noscript".into(), "optgroup".into(), "option".into(),
234            "output".into(), "param".into(), "progress".into(), "rp".into(),
235            "rt".into(), "rtc".into(), "select".into(), "source".into(),
236            "style".into(), "track".into(), "textarea".into(), "time".into(),
237            "use".into(),
238        ],
239        tags_to_strip: vec![
240            "abbr".into(), "acronym".into(), "address".into(), "bdi".into(),
241            "bdo".into(), "big".into(), "cite".into(), "data".into(),
242            "dfn".into(), "font".into(), "hgroup".into(), "img".into(),
243            "ins".into(), "mark".into(), "meta".into(), "ruby".into(),
244            "small".into(), "template".into(),
245        ],
246        prune_empty: true,
247        empty_tags: vec![
248            "p".into(), "div".into(), "span".into(),
249            "h1".into(), "h2".into(), "h3".into(), "h4".into(), "h5".into(), "h6".into(),
250            "blockquote".into(), "article".into(), "section".into(), "main".into(),
251            "li".into(), "dd".into(), "dt".into(),
252            "em".into(), "i".into(), "b".into(), "strong".into(),
253            "pre".into(), "q".into(),
254        ],
255        normalize_whitespace: true,
256        remove_comments: true,
257        ..Default::default()
258    }
259}
260
#[cfg(test)]
mod tests {
    use super::*;

    // Membership check that compares against `&str` directly, so the
    // assertions do not allocate a `String` for every lookup.
    fn has(list: &[String], name: &str) -> bool {
        list.iter().any(|item| item.as_str() == name)
    }

    #[test]
    fn test_minimal_preset() {
        let opts = minimal();
        assert_eq!(opts.tags_to_remove.len(), 3);
        assert!(!opts.prune_empty);
    }

    #[test]
    fn test_standard_preset() {
        let opts = standard();
        assert!(opts.tags_to_remove.len() > 5);
        assert!(opts.prune_empty);
        assert!(opts.normalize_whitespace);
    }

    #[test]
    fn test_aggressive_preset() {
        let opts = aggressive();
        assert!(has(&opts.tags_to_remove, "nav"));
        assert!(opts.strip_attributes);
        assert!(has(&opts.preserved_attributes, "href"));
    }

    // Pins the relationship promised by the `aggressive()` docs:
    // it includes everything `standard()` removes.
    #[test]
    fn test_aggressive_includes_standard_tags() {
        let base = standard();
        let opts = aggressive();
        for tag in &base.tags_to_remove {
            assert!(
                has(&opts.tags_to_remove, tag),
                "aggressive() is missing standard tag `{}`",
                tag
            );
        }
    }

    #[test]
    fn test_article_extraction_preset() {
        let opts = article_extraction();
        assert!(!opts.selectors_to_remove.is_empty());
        assert!(has(&opts.tags_to_strip, "span"));
    }

    #[test]
    fn test_trafilatura_preset() {
        let opts = trafilatura();
        assert_eq!(opts.tags_to_remove.len(), 50);
        assert_eq!(opts.tags_to_strip.len(), 18);
        assert_eq!(opts.empty_tags.len(), 22);
        assert!(opts.prune_empty);
        assert!(opts.normalize_whitespace);
        assert!(opts.remove_comments);
        assert!(has(&opts.tags_to_remove, "script"));
        assert!(has(&opts.tags_to_remove, "nav"));
        assert!(has(&opts.tags_to_strip, "font"));
        assert!(has(&opts.empty_tags, "blockquote"));
    }
}
html_cleaning/presets.rs

html_cleaning/
presets.rs