html_to_markdown_rs/
sanitizer.rs

1//! HTML sanitization using ammonia.
2
3use ammonia::Builder;
4
5use crate::error::Result;
6use crate::options::{PreprocessingOptions, PreprocessingPreset};
7
8/// Sanitize HTML using ammonia.
9///
10/// This function cleans HTML by removing unwanted elements and attributes
11/// based on the preprocessing options.
12pub fn sanitize(html: &str, options: &PreprocessingOptions) -> Result<String> {
13    use std::collections::HashSet;
14
15    let mut builder = match options.preset {
16        PreprocessingPreset::Minimal => create_minimal_builder(),
17        PreprocessingPreset::Standard => create_standard_builder(),
18        PreprocessingPreset::Aggressive => create_aggressive_builder(),
19    };
20
21    let mut clean_content = HashSet::new();
22    let mut allowed_tags = builder.clone_tags();
23
24    clean_content.insert("script");
25    clean_content.insert("style");
26    allowed_tags.remove("script");
27    allowed_tags.remove("style");
28
29    if options.remove_navigation {
30        clean_content.insert("nav");
31        clean_content.insert("aside");
32        clean_content.insert("header");
33        clean_content.insert("footer");
34        allowed_tags.remove("nav");
35        allowed_tags.remove("aside");
36        allowed_tags.remove("header");
37        allowed_tags.remove("footer");
38    }
39
40    if options.remove_forms {
41        clean_content.insert("form");
42        clean_content.insert("input");
43        clean_content.insert("button");
44        clean_content.insert("select");
45        clean_content.insert("textarea");
46        clean_content.insert("label");
47        clean_content.insert("fieldset");
48        clean_content.insert("legend");
49        allowed_tags.remove("form");
50        allowed_tags.remove("input");
51        allowed_tags.remove("button");
52        allowed_tags.remove("select");
53        allowed_tags.remove("textarea");
54        allowed_tags.remove("label");
55        allowed_tags.remove("fieldset");
56        allowed_tags.remove("legend");
57    }
58
59    builder.tags(allowed_tags);
60    builder.clean_content_tags(clean_content);
61
62    Ok(builder.clean(html).to_string())
63}
64
65/// Create a minimal sanitization builder (keeps most elements).
66fn create_minimal_builder() -> Builder<'static> {
67    let mut builder = Builder::default();
68    builder.strip_comments(false);
69    builder
70}
71
72/// Create a standard sanitization builder (balanced cleaning).
73fn create_standard_builder() -> Builder<'static> {
74    let mut builder = Builder::default();
75    builder.strip_comments(true);
76    builder
77}
78
79/// Create an aggressive sanitization builder (heavy cleaning for web scraping).
80fn create_aggressive_builder() -> Builder<'static> {
81    let mut builder = Builder::default();
82    builder.strip_comments(true);
83    builder.link_rel(Some("nofollow noopener noreferrer"));
84    builder
85}