1#![doc = include_str!("../README.md")]
2use ahash::{HashMap, HashSet};
7use lol_html::{
8 errors::RewritingError,
9 html_content::{Comment, ContentType, DocumentEnd, Element, TextChunk},
10 DocumentContentHandlers, ElementContentHandlers, HandlerResult, HtmlRewriter, Selector,
11 Settings,
12};
13use once_cell::sync::Lazy;
14use slab::Slab;
15use std::{borrow::Cow, cell::RefCell, fmt::Write, iter, rc::Rc, str::FromStr};
16use thiserror::Error;
17
18pub use lol_html::MemorySettings;
19
20mod macros;
21
/// Process-wide sanitizer, built from [`BubbleBath::default`] on first access.
/// It backs the free-standing [`clean`] function.
static GLOBAL_BUBBLE_BATH: Lazy<BubbleBath<'static>> = Lazy::new(BubbleBath::default);
/// The universal selector (`*`), parsed once on first access and used to route every
/// element through the element handler. The `unwrap` is on a constant selector string.
static SELECT_ALL: Lazy<Selector> = Lazy::new(|| Selector::from_str("*").unwrap());
24
25#[inline]
35pub fn clean(content: &str) -> Result<String, Error> {
36 GLOBAL_BUBBLE_BATH.clean(content)
37}
38
/// Escapes the characters of `source` that carry syntactic meaning in HTML so the
/// result can be emitted as inert text.
///
/// The following characters are replaced by character references; everything else
/// is copied through unchanged:
///
/// | input | output      |
/// |-------|-------------|
/// | `<`   | `&lt;`      |
/// | `>`   | `&gt;`      |
/// | `"`   | `&quot;`    |
/// | `'`   | `&apos;`    |
/// | `` ` `` | `&grave;` |
/// | `/`   | `&#47;`     |
/// | `&`   | `&amp;`     |
/// | `=`   | `&#61;`     |
/// | NUL   | `&#65533;`  |
#[inline]
fn clean_text(source: &str) -> String {
    // At least as long as the input; grows only when something needs escaping.
    let mut acc = String::with_capacity(source.len());

    for chr in source.chars() {
        match chr {
            // Would open a tag.
            '<' => acc.push_str("&lt;"),
            // Would close a tag / end an unquoted attribute value.
            '>' => acc.push_str("&gt;"),
            // Would end a double-quoted attribute value.
            '\"' => acc.push_str("&quot;"),
            // Would end a single-quoted attribute value.
            '\'' => acc.push_str("&apos;"),
            // Treated as an attribute delimiter by some legacy parsers.
            '`' => acc.push_str("&grave;"),
            // Would end an unquoted attribute.
            '/' => acc.push_str("&#47;"),
            // Would start a character reference.
            '&' => acc.push_str("&amp;"),
            // Separates attribute name and value.
            '=' => acc.push_str("&#61;"),
            // NUL is replaced by a reference to U+FFFD REPLACEMENT CHARACTER.
            '\0' => acc.push_str("&#65533;"),
            _ => acc.push(chr),
        }
    }

    acc
}
64
/// Errors that can occur while sanitizing a document.
///
/// Marked `#[non_exhaustive]`, so downstream `match`es need a wildcard arm.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum Error {
    /// The underlying `lol_html` rewriter reported an error; its message and source
    /// are forwarded unchanged (`#[error(transparent)]`).
    #[error(transparent)]
    Rewriting(#[from] RewritingError),
}
73
/// HTML sanitizer configuration.
///
/// All string sets borrow their entries for `'a`; the [`Default`] implementation uses
/// `'static` literals.
pub struct BubbleBath<'a> {
    /// Attributes that are kept on every allowed tag, regardless of the tag name.
    pub allowed_generic_attributes: HashSet<&'a str>,

    /// Tags that are kept in the output. Elements with any other tag name are unwrapped
    /// (or escaped, see `preserve_escaped`) unless listed in `remove_content_tags`.
    pub allowed_tags: HashSet<&'a str>,

    /// Additional attributes that are kept on a specific tag (tag name -> attribute names).
    pub allowed_tag_attributes: HashMap<&'a str, HashSet<&'a str>>,

    /// URL schemes that are accepted in the attributes listed in `clean_url_attributes`.
    /// Attributes whose value does not carry one of these schemes are removed.
    pub allowed_url_schemes: HashSet<&'a str>,

    /// Attributes whose value is treated as a URL and checked against
    /// `allowed_url_schemes` (tag name -> attribute names).
    pub clean_url_attributes: HashMap<&'a str, HashSet<&'a str>>,

    /// Memory settings handed to the `lol_html` rewriter.
    pub memory_settings: MemorySettings,

    /// When `true`, the start and end tags of disallowed elements are re-emitted as
    /// text instead of being dropped; when `false`, the tags are removed and only the
    /// element's content is kept.
    pub preserve_escaped: bool,

    /// Tags that are removed together with their content.
    ///
    /// This check runs before the `allowed_tags` check, so a tag listed here is removed
    /// even if it is also listed as allowed.
    pub remove_content_tags: HashSet<&'a str>,

    /// Attributes that are set on a tag after its attributes were filtered
    /// (tag name -> attribute name -> value).
    pub set_tag_attributes: HashMap<&'a str, HashMap<&'a str, &'a str>>,
}
116
117impl BubbleBath<'_> {
118 #[inline]
119 fn clean_attributes(&self, element: &mut Element<'_, '_>, tag_name: &str) {
120 let allowed_attributes = self.allowed_tag_attributes.get(tag_name);
121
122 let mut remove_attributes = Vec::with_capacity(element.attributes().len());
123 for attribute in element.attributes() {
124 let attribute_name = attribute.name();
125
126 if self
127 .allowed_generic_attributes
128 .contains(attribute_name.as_str())
129 {
130 continue;
131 }
132
133 if let Some(allowed_attributes) = allowed_attributes {
134 if allowed_attributes.contains(attribute_name.as_str()) {
135 continue;
136 }
137 }
138
139 remove_attributes.push(attribute_name);
140 }
141
142 for attribute_name in remove_attributes {
143 element.remove_attribute(&attribute_name);
144 }
145 }
146
147 #[inline]
148 fn clean_link(&self, element: &mut Element<'_, '_>, attribute_name: &str) {
149 let Some(raw_url) = element.get_attribute(attribute_name) else {
150 return;
151 };
152
153 let Some((scheme, _rest)) = raw_url.split_once("://") else {
154 element.remove_attribute(attribute_name);
155 return;
156 };
157
158 if !self.allowed_url_schemes.contains(scheme) {
159 element.remove_attribute(attribute_name);
160 }
161 }
162
163 #[inline]
164 fn delete_element(&self, element: &mut Element<'_, '_>, tag_name: &str) {
165 if self.preserve_escaped {
166 let start_tag = element.start_tag();
167
168 let mut formatted = String::new();
169 let _ = write!(formatted, "<{tag_name}");
170
171 for attribute in start_tag.attributes() {
172 let _ = write!(formatted, " {}=\"{}\"", attribute.name(), attribute.value());
173 }
174
175 if start_tag.self_closing() {
176 formatted.push_str(" />");
177 } else {
178 formatted.push('>');
179 }
180
181 start_tag.replace(&formatted, ContentType::Text);
182
183 if let Some(handlers) = element.end_tag_handlers() {
184 handlers.push(Box::new(move |end_tag| {
185 let tag_name = end_tag.name();
186 let content = format!("</{tag_name}>");
187 end_tag.replace(&content, ContentType::Text);
188
189 Ok(())
190 }));
191 }
192 } else {
193 element.remove_and_keep_content();
194 }
195 }
196
197 #[inline]
198 fn element_handler(
199 &self,
200 element: &mut Element<'_, '_>,
201 unclosed_tags: Rc<RefCell<Slab<String>>>,
202 ) -> HandlerResult {
203 let tag_name = element.tag_name();
204
205 if self.remove_content_tags.contains(tag_name.as_str()) {
206 element.remove();
207 return Ok(());
208 }
209
210 if !self.allowed_tags.contains(tag_name.as_str()) {
211 self.delete_element(element, &tag_name);
212 return Ok(());
213 }
214
215 self.clean_attributes(element, &tag_name);
216
217 if let Some(set_attributes) = self.set_tag_attributes.get(tag_name.as_str()) {
218 for (name, value) in set_attributes {
219 element.set_attribute(name, value)?;
220 }
221 }
222
223 if let Some(attributes) = self.clean_url_attributes.get(tag_name.as_str()) {
224 for name in attributes {
225 self.clean_link(element, name);
226 }
227 }
228
229 if !element.is_self_closing() {
231 let unclosed_tag_idx = {
232 let mut unclosed_tags = unclosed_tags.borrow_mut();
233 unclosed_tags.insert(tag_name)
234 };
235
236 if let Some(end_tag_handlers) = element.end_tag_handlers() {
237 end_tag_handlers.push(Box::new(move |_end_tag| {
238 unclosed_tags.borrow_mut().remove(unclosed_tag_idx);
239 Ok(())
240 }));
241 }
242 }
243
244 Ok(())
245 }
246
247 #[inline]
248 fn count_unclosed_opening_tags<B>(counter: &mut usize, input: B)
249 where
250 B: AsRef<[u8]>,
251 {
252 let bytes = input.as_ref();
253
254 let opening_tags = bytecount::count(bytes, b'<');
255 let closing_tags = bytecount::count(bytes, b'>');
256
257 *counter = counter.saturating_add(opening_tags);
258 *counter = counter.saturating_sub(closing_tags);
259 }
260
261 #[inline]
262 fn subtract_opening_tags<B>(counter: &mut usize, input: B)
263 where
264 B: AsRef<[u8]>,
265 {
266 let mut tmp_counter = 0;
267 Self::count_unclosed_opening_tags(&mut tmp_counter, input);
268
269 *counter = counter.saturating_sub(tmp_counter);
270 }
271
272 #[inline]
273 fn comment_handler(comment: &mut Comment<'_>, opening_tags: &RefCell<usize>) {
274 Self::subtract_opening_tags(&mut opening_tags.borrow_mut(), comment.text());
275 comment.remove();
276 }
277
278 #[inline]
279 fn text_handler(chunk: &mut TextChunk<'_>, opening_tags: &RefCell<usize>) {
280 Self::subtract_opening_tags(&mut opening_tags.borrow_mut(), chunk.as_str());
281 *chunk.as_mut_str() = clean_text(chunk.as_str());
282 }
283
    /// Sanitizes HTML that arrives in chunks and streams the result into `sink`.
    ///
    /// Every chunk yielded by `input` is fed to an `lol_html` rewriter configured with
    /// this sanitizer's handlers; `sink` is handed to the rewriter as its output sink.
    ///
    /// Besides filtering elements, the function balances the markup:
    /// - a counter of `<` vs. `>` bytes is kept over the raw input (corrected for
    ///   brackets that turn out to live inside text or comments); once the input is
    ///   exhausted, that many `>` bytes are written to the rewriter,
    /// - at the end of the document a closing tag is appended for every allowed element
    ///   that is still recorded as unclosed.
    ///
    /// # Errors
    ///
    /// Returns [`Error::Rewriting`] if the rewriter fails while writing a chunk or while
    /// finishing the document.
    #[inline]
    pub fn clean_streaming<'a, I, S>(&self, input: I, sink: S) -> Result<(), Error>
    where
        I: Iterator<Item = &'a [u8]>,
        S: FnMut(&[u8]),
    {
        // Tags of allowed elements whose end tag has not been seen yet.
        let unclosed_tags = Rc::new(RefCell::new(Slab::new()));
        // Number of `<` bytes in the input that have no matching `>` so far.
        let opening_tags = RefCell::new(0);

        let comment_handler = |comment: &mut Comment<'_>| {
            Self::comment_handler(comment, &opening_tags);
            Ok(())
        };
        let document_end_handler = |document_end: &mut DocumentEnd<'_>| {
            // Close everything that is still open when the document ends.
            let unclosed_tags = unclosed_tags.borrow();
            for (_key, content) in unclosed_tags.iter() {
                let formatted = format!("</{content}>");
                document_end.append(&formatted, ContentType::Html);
            }

            Ok(())
        };
        let text_handler = |chunk: &mut TextChunk<'_>| {
            Self::text_handler(chunk, &opening_tags);
            Ok(())
        };

        let document_content_handlers = vec![DocumentContentHandlers::default()
            .comments(comment_handler)
            .text(text_handler)
            .end(document_end_handler)];

        // Identity function; its only effect is to constrain the closure passed in to
        // the `FnMut(&mut Element) -> HandlerResult` signature (presumably to guide
        // closure type inference).
        #[inline(always)]
        fn bounds_assertion<T>(uwu: T) -> T
        where
            T: FnMut(&mut Element<'_, '_>) -> HandlerResult,
        {
            uwu
        }

        // Route every element (`*`) through `element_handler`.
        let element_content_handlers = vec![(
            Cow::Borrowed(&*SELECT_ALL),
            ElementContentHandlers::default().element(bounds_assertion(|element| {
                self.element_handler(element, unclosed_tags.clone())
            })),
        )];

        let settings = Settings {
            document_content_handlers,
            element_content_handlers,
            // Rebuilt field by field from the configured memory settings.
            memory_settings: MemorySettings {
                preallocated_parsing_buffer_size: self
                    .memory_settings
                    .preallocated_parsing_buffer_size,
                max_allowed_memory_usage: self.memory_settings.max_allowed_memory_usage,
            },
            ..Settings::default()
        };

        let mut rewriter = HtmlRewriter::new(settings, sink);

        for chunk in input {
            // Count the raw brackets before the chunk is written; the text and comment
            // handlers invoked by the rewriter subtract the ones that are not tag syntax.
            Self::count_unclosed_opening_tags(&mut opening_tags.borrow_mut(), chunk);

            rewriter.write(chunk)?;
        }

        // Terminate dangling `<` so the rewriter does not end in the middle of a tag.
        let opening_tags = *opening_tags.borrow();
        for _ in 0..opening_tags {
            rewriter.write(&[b'>'])?;
        }

        rewriter.end()?;

        Ok(())
    }
369
370 #[inline]
378 pub fn clean(&self, content: &str) -> Result<String, Error> {
379 let mut acc = Vec::with_capacity(content.len());
380 self.clean_streaming(iter::once(content.as_bytes()), |out| {
381 acc.extend_from_slice(out);
382 })?;
383
384 #[allow(unsafe_code)]
393 Ok(unsafe { String::from_utf8_unchecked(acc) })
394 }
395}
396
397impl Default for BubbleBath<'static> {
398 #[allow(clippy::too_many_lines)]
399 fn default() -> Self {
400 #[rustfmt::skip]
402 let allowed_tags = hashset![
403 "a", "abbr", "acronym", "area", "article", "aside", "b", "bdi",
404 "bdo", "blockquote", "br", "caption", "center", "cite", "code",
405 "col", "colgroup", "data", "dd", "del", "details", "dfn", "div",
406 "dl", "dt", "em", "figcaption", "figure", "footer", "h1", "h2",
407 "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "i", "img",
408 "ins", "kbd", "li", "map", "mark", "nav", "ol", "p", "pre",
409 "q", "rp", "rt", "rtc", "ruby", "s", "samp", "small", "span",
410 "strike", "strong", "sub", "summary", "sup", "table", "tbody",
411 "td", "th", "thead", "time", "tr", "tt", "u", "ul", "var", "wbr",
412 ];
413 let allowed_generic_attributes = hashset!["lang", "title"];
414 let allowed_tag_attributes = hashmap![
415 "a" => hashset![
416 "href", "hreflang"
417 ],
418 "bdo" => hashset![
419 "dir"
420 ],
421 "blockquote" => hashset![
422 "cite"
423 ],
424 "col" => hashset![
425 "align", "char", "charoff", "span"
426 ],
427 "colgroup" => hashset![
428 "align", "char", "charoff", "span"
429 ],
430 "del" => hashset![
431 "cite", "datetime"
432 ],
433 "hr" => hashset![
434 "align", "size", "width"
435 ],
436 "img" => hashset![
437 "align", "alt", "height", "src", "width"
438 ],
439 "ins" => hashset![
440 "cite", "datetime"
441 ],
442 "ol" => hashset![
443 "start"
444 ],
445 "q" => hashset![
446 "cite"
447 ],
448 "table" => hashset![
449 "align", "char", "charoff", "summary"
450 ],
451 "tbody" => hashset![
452 "align", "char", "charoff"
453 ],
454 "td" => hashset![
455 "align", "char", "charoff", "colspan", "headers", "rowspan"
456 ],
457 "tfoot" => hashset![
458 "align", "char", "charoff"
459 ],
460 "th" => hashset![
461 "align", "char", "charoff", "colspan", "headers", "rowspan", "scope"
462 ],
463 "thead" => hashset![
464 "align", "char", "charoff"
465 ],
466 "tr" => hashset![
467 "align", "char", "charoff"
468 ],
469 ];
470 let allowed_url_schemes = hashset![
471 "bitcoin",
472 "ftp",
473 "ftps",
474 "geo",
475 "http",
476 "https",
477 "im",
478 "irc",
479 "ircs",
480 "magnet",
481 "mailto",
482 "mms",
483 "mx",
484 "news",
485 "nntp",
486 "openpgp4fpr",
487 "sip",
488 "sms",
489 "smsto",
490 "ssh",
491 "tel",
492 "url",
493 "webcal",
494 "wtai",
495 "xmpp",
496 ];
497 let clean_url_attributes = hashmap![
498 "a" => hashset!["href"],
499 "img" => hashset!["src"],
500 "link" => hashset!["href"],
501 ];
502 let remove_content_tags = hashset!["script", "style"];
503 let set_tag_attributes = hashmap![
504 "a" => hashmap![
505 "rel" => "noopener noreferrer",
506 ],
507 ];
508
509 Self {
510 allowed_tags,
511 allowed_generic_attributes,
512 allowed_tag_attributes,
513 allowed_url_schemes,
514 clean_url_attributes,
515 memory_settings: MemorySettings::default(),
516 preserve_escaped: false,
517 remove_content_tags,
518 set_tag_attributes,
519 }
520 }
521}