Skip to main content

html_cleaning/
options.rs

1//! Configuration options for HTML cleaning.
2
/// Options describing which HTML cleaning passes to run and what they target.
///
/// All fields are public, so a value can be written as a struct literal (using
/// `..Default::default()` for the remaining fields) or assembled step by step
/// through [`CleaningOptions::builder`].
///
/// The derived [`Default`] leaves every list empty and every flag `false`.
/// The type also derives `PartialEq`/`Eq`, so two configurations can be
/// compared directly (handy in tests).
///
/// # Example
///
/// ```
/// use html_cleaning::CleaningOptions;
///
/// let options = CleaningOptions {
///     tags_to_remove: vec!["script".into(), "style".into()],
///     prune_empty: true,
///     ..Default::default()
/// };
/// ```
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CleaningOptions {
    /// Tag names whose elements are removed entirely, children included.
    ///
    /// Example: `["script", "style", "noscript"]`
    pub tags_to_remove: Vec<String>,

    /// Tag names to strip: the tag itself is dropped, its children are kept.
    ///
    /// Example: `["span", "font"]`
    pub tags_to_strip: Vec<String>,

    /// CSS selectors matching elements to remove.
    ///
    /// Example: `[".advertisement", "#cookie-banner"]`
    pub selectors_to_remove: Vec<String>,

    /// Remove elements with no text content.
    pub prune_empty: bool,

    /// Tags considered "empty" candidates for pruning.
    ///
    /// Typical set: `["div", "span", "p", "section", "article"]`
    ///
    /// Note that `CleaningOptions::default()` leaves this list **empty**: the
    /// derived `Default` does not populate it.
    /// NOTE(review): whether the cleaner falls back to the typical set above
    /// when this list is empty is not visible from this file; confirm.
    pub empty_tags: Vec<String>,

    /// Normalize whitespace in text nodes.
    pub normalize_whitespace: bool,

    /// Remove HTML comment nodes (`<!-- ... -->`).
    pub remove_comments: bool,

    /// Remove all attributes from elements.
    pub strip_attributes: bool,

    /// Attributes to keep when `strip_attributes` is `true`.
    ///
    /// Example: `["href", "src", "alt"]`
    pub preserved_attributes: Vec<String>,
}
55
56impl CleaningOptions {
57    /// Create a new builder for `CleaningOptions`.
58    #[must_use]
59    pub fn builder() -> CleaningOptionsBuilder {
60        CleaningOptionsBuilder::default()
61    }
62}
63
64/// Builder for `CleaningOptions`.
65#[derive(Debug, Clone, Default)]
66pub struct CleaningOptionsBuilder {
67    options: CleaningOptions,
68}
69
70impl CleaningOptionsBuilder {
71    /// Add tags to remove (including children).
72    #[must_use]
73    pub fn remove_tags(mut self, tags: &[&str]) -> Self {
74        self.options
75            .tags_to_remove
76            .extend(tags.iter().map(|s| (*s).to_string()));
77        self
78    }
79
80    /// Add tags to strip (keep children).
81    #[must_use]
82    pub fn strip_tags(mut self, tags: &[&str]) -> Self {
83        self.options
84            .tags_to_strip
85            .extend(tags.iter().map(|s| (*s).to_string()));
86        self
87    }
88
89    /// Add CSS selectors to remove.
90    #[must_use]
91    pub fn remove_selectors(mut self, selectors: &[&str]) -> Self {
92        self.options
93            .selectors_to_remove
94            .extend(selectors.iter().map(|s| (*s).to_string()));
95        self
96    }
97
98    /// Enable empty element pruning.
99    #[must_use]
100    pub fn prune_empty(mut self, enabled: bool) -> Self {
101        self.options.prune_empty = enabled;
102        self
103    }
104
105    /// Set tags to consider as empty for pruning.
106    #[must_use]
107    pub fn empty_tags(mut self, tags: &[&str]) -> Self {
108        self.options.empty_tags = tags.iter().map(|s| (*s).to_string()).collect();
109        self
110    }
111
112    /// Enable whitespace normalization.
113    #[must_use]
114    pub fn normalize_whitespace(mut self, enabled: bool) -> Self {
115        self.options.normalize_whitespace = enabled;
116        self
117    }
118
119    /// Enable HTML comment removal.
120    #[must_use]
121    pub fn remove_comments(mut self, enabled: bool) -> Self {
122        self.options.remove_comments = enabled;
123        self
124    }
125
126    /// Enable attribute stripping.
127    #[must_use]
128    pub fn strip_attributes(mut self, enabled: bool) -> Self {
129        self.options.strip_attributes = enabled;
130        self
131    }
132
133    /// Set preserved attributes (when `strip_attributes` is enabled).
134    #[must_use]
135    pub fn preserve_attributes(mut self, attrs: &[&str]) -> Self {
136        self.options.preserved_attributes = attrs.iter().map(|s| (*s).to_string()).collect();
137        self
138    }
139
140    /// Build the `CleaningOptions`.
141    #[must_use]
142    pub fn build(self) -> CleaningOptions {
143        self.options
144    }
145}
146
#[cfg(test)]
mod tests {
    use super::*;

    /// Defaulted options queue no tags for removal and leave pruning off.
    #[test]
    fn test_default_options() {
        let defaults = CleaningOptions::default();

        assert!(defaults.tags_to_remove.is_empty());
        assert!(!defaults.prune_empty);
    }

    /// The builder records both list additions and flag changes.
    #[test]
    fn test_builder() {
        let with_tags = CleaningOptions::builder().remove_tags(&["script", "style"]);
        let built = with_tags.prune_empty(true).build();

        assert_eq!(built.tags_to_remove.len(), 2);
        assert!(built.prune_empty);
    }
}