1use lol_html::{doc_comments, element, rewrite_str, RewriteStrSettings};
7use spider_agent_types::{CleaningIntent, ContentAnalysis, HtmlCleaningProfile};
8
/// Pass-through "cleaning": hands back an owned copy of `html` with no
/// modification at all.
pub fn clean_html_raw(html: &str) -> String {
    String::from(html)
}
13
14pub fn clean_html_base(html: &str) -> String {
25 match rewrite_str(
26 html,
27 RewriteStrSettings {
28 element_content_handlers: vec![
29 element!("script", |el| {
30 el.remove();
31 Ok(())
32 }),
33 element!("style", |el| {
34 el.remove();
35 Ok(())
36 }),
37 element!("link", |el| {
38 el.remove();
39 Ok(())
40 }),
41 element!("iframe", |el| {
42 el.remove();
43 Ok(())
44 }),
45 element!("[style*='display:none']", |el| {
46 el.remove();
47 Ok(())
48 }),
49 element!("[id*='ad']", |el| {
50 el.remove();
51 Ok(())
52 }),
53 element!("[class*='ad']", |el| {
54 el.remove();
55 Ok(())
56 }),
57 element!("[id*='tracking']", |el| {
58 el.remove();
59 Ok(())
60 }),
61 element!("[class*='tracking']", |el| {
62 el.remove();
63 Ok(())
64 }),
65 element!("meta", |el| {
66 if let Some(attribute) = el.get_attribute("name") {
67 if attribute != "title" && attribute != "description" {
68 el.remove();
69 }
70 } else {
71 el.remove();
72 }
73 Ok(())
74 }),
75 ],
76 document_content_handlers: vec![doc_comments!(|c| {
77 c.remove();
78 Ok(())
79 })],
80 ..RewriteStrSettings::new()
81 },
82 ) {
83 Ok(r) => r,
84 _ => html.into(),
85 }
86}
87
88pub fn clean_html_slim(html: &str) -> String {
98 match rewrite_str(
99 html,
100 RewriteStrSettings {
101 element_content_handlers: vec![
102 element!("script", |el| {
103 el.remove();
104 Ok(())
105 }),
106 element!("style", |el| {
107 el.remove();
108 Ok(())
109 }),
110 element!("svg", |el| {
111 el.remove();
112 Ok(())
113 }),
114 element!("noscript", |el| {
115 el.remove();
116 Ok(())
117 }),
118 element!("link", |el| {
119 el.remove();
120 Ok(())
121 }),
122 element!("iframe", |el| {
123 el.remove();
124 Ok(())
125 }),
126 element!("canvas", |el| {
127 el.remove();
128 Ok(())
129 }),
130 element!("video", |el| {
131 el.remove();
132 Ok(())
133 }),
134 element!("img", |el| {
135 if let Some(src) = el.get_attribute("src") {
136 if src.starts_with("data:image") {
137 el.remove();
138 }
139 }
140 Ok(())
141 }),
142 element!("picture", |el| {
143 if let Some(src) = el.get_attribute("src") {
145 if src.starts_with("data:") {
146 el.remove();
147 }
148 }
149 Ok(())
150 }),
151 element!("[style*='display:none']", |el| {
152 el.remove();
153 Ok(())
154 }),
155 element!("[id*='ad']", |el| {
156 el.remove();
157 Ok(())
158 }),
159 element!("[class*='ad']", |el| {
160 el.remove();
161 Ok(())
162 }),
163 element!("[id*='tracking']", |el| {
164 el.remove();
165 Ok(())
166 }),
167 element!("[class*='tracking']", |el| {
168 el.remove();
169 Ok(())
170 }),
171 element!("meta", |el| {
172 if let Some(attribute) = el.get_attribute("name") {
173 if attribute != "title" && attribute != "description" {
174 el.remove();
175 }
176 } else {
177 el.remove();
178 }
179 Ok(())
180 }),
181 ],
182 document_content_handlers: vec![doc_comments!(|c| {
183 c.remove();
184 Ok(())
185 })],
186 ..RewriteStrSettings::new()
187 },
188 ) {
189 Ok(r) => r,
190 _ => html.into(),
191 }
192}
193
194pub fn clean_html_full(html: &str) -> String {
201 match rewrite_str(
202 html,
203 RewriteStrSettings {
204 element_content_handlers: vec![
205 element!("script", |el| {
206 el.remove();
207 Ok(())
208 }),
209 element!("style", |el| {
210 el.remove();
211 Ok(())
212 }),
213 element!("svg", |el| {
214 el.remove();
215 Ok(())
216 }),
217 element!("nav", |el| {
218 el.remove();
219 Ok(())
220 }),
221 element!("footer", |el| {
222 el.remove();
223 Ok(())
224 }),
225 element!("noscript", |el| {
226 el.remove();
227 Ok(())
228 }),
229 element!("link", |el| {
230 el.remove();
231 Ok(())
232 }),
233 element!("iframe", |el| {
234 el.remove();
235 Ok(())
236 }),
237 element!("canvas", |el| {
238 el.remove();
239 Ok(())
240 }),
241 element!("video", |el| {
242 el.remove();
243 Ok(())
244 }),
245 element!("meta", |el| {
246 let name = el.get_attribute("name").map(|n| n.to_lowercase());
247 if !matches!(name.as_deref(), Some("viewport") | Some("charset")) {
248 el.remove();
249 }
250 Ok(())
251 }),
252 element!("*", |el| {
253 let mut to_remove: Vec<String> = Vec::new();
255 for attr in el.attributes().iter() {
256 let n = attr.name();
257 let keep = n == "id" || n == "class" || n.starts_with("data-");
258 if !keep {
259 to_remove.push(n);
260 }
261 }
262 for attr in to_remove {
263 el.remove_attribute(&attr);
264 }
265 Ok(())
266 }),
267 ],
268 document_content_handlers: vec![doc_comments!(|c| {
269 c.remove();
270 Ok(())
271 })],
272 ..RewriteStrSettings::new()
273 },
274 ) {
275 Ok(r) => r,
276 _ => html.into(),
277 }
278}
279
280#[inline]
282pub fn clean_html(html: &str) -> String {
283 clean_html_base(html)
284}
285
286pub fn clean_html_with_profile(html: &str, profile: HtmlCleaningProfile) -> String {
288 clean_html_with_profile_and_intent(html, profile, CleaningIntent::General)
289}
290
291pub fn clean_html_with_profile_and_intent(
298 html: &str,
299 profile: HtmlCleaningProfile,
300 intent: CleaningIntent,
301) -> String {
302 match profile {
303 HtmlCleaningProfile::Raw => clean_html_raw(html),
304 HtmlCleaningProfile::Default => clean_html(html),
305 HtmlCleaningProfile::Aggressive => clean_html_full(html),
306 HtmlCleaningProfile::Slim => clean_html_slim(html),
307 HtmlCleaningProfile::Minimal => clean_html_base(html),
308 HtmlCleaningProfile::Auto => {
309 let analysis = ContentAnalysis::analyze(html);
311 let auto_profile =
312 HtmlCleaningProfile::from_content_analysis_with_intent(&analysis, intent);
313 clean_html_with_profile_and_intent(html, auto_profile, intent)
315 }
316 }
317}
318
319pub fn smart_clean_html(html: &str, intent: CleaningIntent) -> String {
328 clean_html_with_profile_and_intent(html, HtmlCleaningProfile::Auto, intent)
329}
330
#[cfg(test)]
mod tests {
    use super::*;

    /// Raw cleaning must hand the input back untouched.
    #[test]
    fn test_clean_html_raw() {
        let html = "<script>alert(1)</script><p>Hello</p>";
        assert_eq!(clean_html_raw(html), html);
    }

    /// Base cleaning removes scripts and styles, including their content.
    #[test]
    fn test_clean_html_base() {
        let html = "<script>alert(1)</script><p>Hello</p><style>.x{}</style>";
        let cleaned = clean_html_base(html);
        assert!(!cleaned.contains("<script>"));
        assert!(!cleaned.contains("alert(1)"));
        assert!(!cleaned.contains("<style>"));
        assert!(!cleaned.contains(".x{}"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Base cleaning strips HTML comments.
    #[test]
    fn test_clean_html_base_removes_comments() {
        let cleaned = clean_html_base("<!-- note --><p>Hello</p>");
        assert!(!cleaned.contains("<!--"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Slim cleaning additionally removes heavy media containers.
    #[test]
    fn test_clean_html_slim() {
        let html = "<svg><path/></svg><p>Hello</p><canvas></canvas>";
        let cleaned = clean_html_slim(html);
        assert!(!cleaned.contains("<svg>"));
        assert!(!cleaned.contains("<path"));
        assert!(!cleaned.contains("<canvas>"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Full cleaning additionally removes page chrome such as nav and footer.
    #[test]
    fn test_clean_html_full() {
        let html = "<nav>Menu</nav><p>Hello</p><footer>Footer</footer>";
        let cleaned = clean_html_full(html);
        assert!(!cleaned.contains("<nav>"));
        assert!(!cleaned.contains("Menu"));
        assert!(!cleaned.contains("<footer>"));
        assert!(!cleaned.contains("Footer"));
        assert!(cleaned.contains("<p>Hello</p>"));
    }

    /// Whichever concrete profile `Auto` resolves to, none of the cleaners
    /// removes plain paragraph text, so the content must survive.
    #[test]
    fn test_smart_clean_html() {
        let simple = "<html><body><p>Hello World!</p></body></html>";
        let cleaned = smart_clean_html(simple, CleaningIntent::General);
        assert!(cleaned.contains("Hello World!"));
    }
}