Skip to main content

spider_agent_html/
cleaning.rs

//! HTML cleaning utilities for automation.
//!
//! Provides multiple cleaning levels for preparing HTML content
//! before sending it to LLM models.
5
6use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
7use spider_agent_types::{CleaningIntent, ContentAnalysis, HtmlCleaningProfile};
8
/// Raw passthrough: performs no cleaning at all.
///
/// Returns a newly allocated `String` holding exactly the bytes of `html`.
pub fn clean_html_raw(html: &str) -> String {
    String::from(html)
}
13
14/// Clean the HTML removing CSS and JS (base level).
15///
16/// Removes:
17/// - `<script>` tags
18/// - `<style>` tags
19/// - `<link>` tags
20/// - `<iframe>` tags
21/// - Elements with display:none
22/// - Ad and tracking elements
23/// - Non-essential meta tags
24pub fn clean_html_base(html: &str) -> String {
25    match rewrite_str(
26        html,
27        RewriteStrSettings {
28            element_content_handlers: vec![
29                element!("script", |el| {
30                    el.remove();
31                    Ok(())
32                }),
33                element!("style", |el| {
34                    el.remove();
35                    Ok(())
36                }),
37                element!("link", |el| {
38                    el.remove();
39                    Ok(())
40                }),
41                element!("iframe", |el| {
42                    el.remove();
43                    Ok(())
44                }),
45                element!("[style*='display:none']", |el| {
46                    el.remove();
47                    Ok(())
48                }),
49                element!("[id*='ad']", |el| {
50                    el.remove();
51                    Ok(())
52                }),
53                element!("[class*='ad']", |el| {
54                    el.remove();
55                    Ok(())
56                }),
57                element!("[id*='tracking']", |el| {
58                    el.remove();
59                    Ok(())
60                }),
61                element!("[class*='tracking']", |el| {
62                    el.remove();
63                    Ok(())
64                }),
65                element!("meta", |el| {
66                    if let Some(attribute) = el.get_attribute("name") {
67                        if attribute != "title" && attribute != "description" {
68                            el.remove();
69                        }
70                    } else {
71                        el.remove();
72                    }
73                    Ok(())
74                }),
75            ],
76            document_content_handlers: vec![doc_comments!(|c| {
77                c.remove();
78                Ok(())
79            })],
80            ..RewriteStrSettings::new()
81        },
82    ) {
83        Ok(r) => r,
84        _ => html.into(),
85    }
86}
87
88/// Slim HTML cleaning - removes heavy elements.
89///
90/// In addition to base cleaning, removes:
91/// - `<svg>` tags
92/// - `<noscript>` tags
93/// - `<canvas>` tags
94/// - `<video>` tags
95/// - Base64 images
96/// - Picture elements with data URIs
97pub fn clean_html_slim(html: &str) -> String {
98    match rewrite_str(
99        html,
100        RewriteStrSettings {
101            element_content_handlers: vec![
102                element!("script", |el| {
103                    el.remove();
104                    Ok(())
105                }),
106                element!("style", |el| {
107                    el.remove();
108                    Ok(())
109                }),
110                element!("svg", |el| {
111                    el.remove();
112                    Ok(())
113                }),
114                element!("noscript", |el| {
115                    el.remove();
116                    Ok(())
117                }),
118                element!("link", |el| {
119                    el.remove();
120                    Ok(())
121                }),
122                element!("iframe", |el| {
123                    el.remove();
124                    Ok(())
125                }),
126                element!("canvas", |el| {
127                    el.remove();
128                    Ok(())
129                }),
130                element!("video", |el| {
131                    el.remove();
132                    Ok(())
133                }),
134                element!("img", |el| {
135                    if let Some(src) = el.get_attribute("src") {
136                        if src.starts_with("data:image") {
137                            el.remove();
138                        }
139                    }
140                    Ok(())
141                }),
142                element!("picture", |el| {
143                    // Remove if it contains data URIs
144                    if let Some(src) = el.get_attribute("src") {
145                        if src.starts_with("data:") {
146                            el.remove();
147                        }
148                    }
149                    Ok(())
150                }),
151                element!("[style*='display:none']", |el| {
152                    el.remove();
153                    Ok(())
154                }),
155                element!("[id*='ad']", |el| {
156                    el.remove();
157                    Ok(())
158                }),
159                element!("[class*='ad']", |el| {
160                    el.remove();
161                    Ok(())
162                }),
163                element!("[id*='tracking']", |el| {
164                    el.remove();
165                    Ok(())
166                }),
167                element!("[class*='tracking']", |el| {
168                    el.remove();
169                    Ok(())
170                }),
171                element!("meta", |el| {
172                    if let Some(attribute) = el.get_attribute("name") {
173                        if attribute != "title" && attribute != "description" {
174                            el.remove();
175                        }
176                    } else {
177                        el.remove();
178                    }
179                    Ok(())
180                }),
181            ],
182            document_content_handlers: vec![doc_comments!(|c| {
183                c.remove();
184                Ok(())
185            })],
186            ..RewriteStrSettings::new()
187        },
188    ) {
189        Ok(r) => r,
190        _ => html.into(),
191    }
192}
193
194/// Full/aggressive HTML cleaning.
195///
196/// In addition to other cleaning levels, also removes:
197/// - `<nav>` tags
198/// - `<footer>` tags
199/// - Most attributes except id, class, and data-*
200pub fn clean_html_full(html: &str) -> String {
201    match rewrite_str(
202        html,
203        RewriteStrSettings {
204            element_content_handlers: vec![
205                element!("script", |el| {
206                    el.remove();
207                    Ok(())
208                }),
209                element!("style", |el| {
210                    el.remove();
211                    Ok(())
212                }),
213                element!("svg", |el| {
214                    el.remove();
215                    Ok(())
216                }),
217                element!("nav", |el| {
218                    el.remove();
219                    Ok(())
220                }),
221                element!("footer", |el| {
222                    el.remove();
223                    Ok(())
224                }),
225                element!("noscript", |el| {
226                    el.remove();
227                    Ok(())
228                }),
229                element!("link", |el| {
230                    el.remove();
231                    Ok(())
232                }),
233                element!("iframe", |el| {
234                    el.remove();
235                    Ok(())
236                }),
237                element!("canvas", |el| {
238                    el.remove();
239                    Ok(())
240                }),
241                element!("video", |el| {
242                    el.remove();
243                    Ok(())
244                }),
245                element!("meta", |el| {
246                    let name = el.get_attribute("name").map(|n| n.to_lowercase());
247                    if !matches!(name.as_deref(), Some("viewport") | Some("charset")) {
248                        el.remove();
249                    }
250                    Ok(())
251                }),
252                element!("*", |el| {
253                    // Keep only: id, class, data-*
254                    let mut to_remove: Vec<String> = Vec::new();
255                    for attr in el.attributes().iter() {
256                        let n = attr.name();
257                        let keep = n == "id" || n == "class" || n.starts_with("data-");
258                        if !keep {
259                            to_remove.push(n);
260                        }
261                    }
262                    for attr in to_remove {
263                        el.remove_attribute(&attr);
264                    }
265                    Ok(())
266                }),
267            ],
268            document_content_handlers: vec![doc_comments!(|c| {
269                c.remove();
270                Ok(())
271            })],
272            ..RewriteStrSettings::new()
273        },
274    ) {
275        Ok(r) => r,
276        _ => html.into(),
277    }
278}
279
280/// Default cleaner (base level).
281#[inline]
282pub fn clean_html(html: &str) -> String {
283    clean_html_base(html)
284}
285
286/// Clean HTML using a specific profile.
287pub fn clean_html_with_profile(html: &str, profile: HtmlCleaningProfile) -> String {
288    clean_html_with_profile_and_intent(html, profile, CleaningIntent::General)
289}
290
291/// Clean HTML with a specific profile and intent.
292///
293/// The intent helps Auto mode choose the right cleaning level:
294/// - `Extraction` - can be more aggressive, removes nav/footer
295/// - `Action` - preserves interactive elements
296/// - `General` - balanced approach
297pub fn clean_html_with_profile_and_intent(
298    html: &str,
299    profile: HtmlCleaningProfile,
300    intent: CleaningIntent,
301) -> String {
302    match profile {
303        HtmlCleaningProfile::Raw => clean_html_raw(html),
304        HtmlCleaningProfile::Default => clean_html(html),
305        HtmlCleaningProfile::Aggressive => clean_html_full(html),
306        HtmlCleaningProfile::Slim => clean_html_slim(html),
307        HtmlCleaningProfile::Minimal => clean_html_base(html),
308        HtmlCleaningProfile::Auto => {
309            // Analyze content and choose the best profile based on intent
310            let analysis = ContentAnalysis::analyze(html);
311            let auto_profile =
312                HtmlCleaningProfile::from_content_analysis_with_intent(&analysis, intent);
313            // Recursively call with determined profile (won't be Auto again)
314            clean_html_with_profile_and_intent(html, auto_profile, intent)
315        }
316    }
317}
318
319/// Smart HTML cleaner that automatically determines the best cleaning level.
320///
321/// This is the recommended function for cleaning HTML when you don't have
322/// a specific profile preference. It analyzes the content and chooses
323/// the optimal cleaning level based on:
324/// - Content size and text ratio
325/// - Presence of heavy elements (SVGs, canvas, video)
326/// - The intended use case (extraction vs action)
327pub fn smart_clean_html(html: &str, intent: CleaningIntent) -> String {
328    clean_html_with_profile_and_intent(html, HtmlCleaningProfile::Auto, intent)
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// The raw level must hand the markup back unchanged.
    #[test]
    fn test_clean_html_raw() {
        let html = "<script>alert(1)</script><p>Hello</p>";
        assert_eq!(clean_html_raw(html), html);
    }

    /// Base level strips scripts and styles but keeps ordinary content.
    #[test]
    fn test_clean_html_base() {
        let html = "<script>alert(1)</script><p>Hello</p><style>.x{}</style>";
        let cleaned = clean_html_base(html);
        assert!(!cleaned.contains("<script>"));
        assert!(!cleaned.contains("<style>"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Slim level additionally drops heavy elements such as svg and canvas.
    #[test]
    fn test_clean_html_slim() {
        let html = "<svg><path/></svg><p>Hello</p><canvas></canvas>";
        let cleaned = clean_html_slim(html);
        assert!(!cleaned.contains("<svg>"));
        assert!(!cleaned.contains("<canvas>"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Full level additionally drops page chrome (nav and footer).
    #[test]
    fn test_clean_html_full() {
        let html = "<nav>Menu</nav><p>Hello</p><footer>Footer</footer>";
        let cleaned = clean_html_full(html);
        assert!(!cleaned.contains("<nav>"));
        assert!(!cleaned.contains("<footer>"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Auto mode must not panic and must keep plain content intact.
    #[test]
    fn test_smart_clean_html() {
        let simple = "<html><body><p>Hello World!</p></body></html>";
        let cleaned = smart_clean_html(simple, CleaningIntent::General);
        // Whichever level Auto resolves to, none of the cleaners in this
        // module removes an attribute-less paragraph or its text.
        assert!(cleaned.contains("<p>Hello World!</p>"));
    }
}