bubble_bath/
lib.rs

1#![doc = include_str!("../README.md")]
2//!
3//! For an entry point to the library, check the docs of [`BubbleBath`] or [`clean`]
4//!
5
6use ahash::{HashMap, HashSet};
7use lol_html::{
8    errors::RewritingError,
9    html_content::{Comment, ContentType, DocumentEnd, Element, TextChunk},
10    DocumentContentHandlers, ElementContentHandlers, HandlerResult, HtmlRewriter, Selector,
11    Settings,
12};
13use once_cell::sync::Lazy;
14use slab::Slab;
15use std::{borrow::Cow, cell::RefCell, fmt::Write, iter, rc::Rc, str::FromStr};
16use thiserror::Error;
17
18pub use lol_html::MemorySettings;
19
20mod macros;
21
/// Lazily constructed sanitizer shared by the free-standing [`clean`] function.
///
/// Built from [`BubbleBath::default`] on first access.
static GLOBAL_BUBBLE_BATH: Lazy<BubbleBath<'static>> = Lazy::new(BubbleBath::default);
/// Selector matching every element (`*`); used to route all elements through the element handler.
///
/// The `unwrap` cannot realistically fail: the selector string is a fixed literal.
static SELECT_ALL: Lazy<Selector> = Lazy::new(|| Selector::from_str("*").unwrap());
24
/// Clean the provided HTML with a global [`BubbleBath`] instance, constructed using [`BubbleBath::default`]
///
/// This is a convenience wrapper around [`BubbleBath::clean`] for callers that are happy with the
/// default allow-lists.
///
/// ## Important
///
/// The global instance does *not* limit memory usage by default. If you need to limit memory usage, build your own [`BubbleBath`] instance
///
/// # Errors
///
/// See [`BubbleBath::clean`] documentation
#[inline]
pub fn clean(content: &str) -> Result<String, Error> {
    GLOBAL_BUBBLE_BATH.clean(content)
}
38
/// Look up the entity that replaces `chr` in escaped text.
///
/// Returns `None` for characters that are emitted unchanged.
#[inline]
fn text_entity(chr: char) -> Option<&'static str> {
    let entity = match chr {
        '<' => "&lt;",
        '>' => "&gt;",
        '\"' => "&quot;",
        '\'' => "&apos;",
        '`' => "&grave;",
        '/' => "&#47;",
        '&' => "&amp;",
        '=' => "&#61;",
        // NUL is swapped for the Unicode replacement character (U+FFFD)
        '\0' => "&#65533;",
        _ => return None,
    };

    Some(entity)
}

/// Escape `source` so that it can be emitted as plain text.
///
/// Markup-significant characters (`<`, `>`, `"`, `'`, `` ` ``, `/`, `&`, `=`) and NUL are
/// replaced by character references; every other character is copied verbatim.
#[inline]
fn clean_text(source: &str) -> String {
    // The escaped text is at least as long as the input, so this is a lower bound.
    let mut escaped = String::with_capacity(source.len());

    for chr in source.chars() {
        match text_entity(chr) {
            Some(entity) => escaped.push_str(entity),
            None => escaped.push(chr),
        }
    }

    escaped
}
64
/// Errors that can occur while cleaning HTML
///
/// The enum is `#[non_exhaustive]`, so further variants may be added later.
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum Error {
    /// The rewriting of the HTML content failed
    ///
    /// Wraps the error reported by the underlying `lol_html` rewriter unchanged.
    #[error(transparent)]
    Rewriting(#[from] RewritingError),
}
73
/// HTML sanitizer
///
/// `bubble-bath` is allow-list based, meaning all tags are by default cleaned.
///
/// `BubbleBath::default` provides a safe default
///
/// ## Implementation details
///
/// - We use `lol_html` as our underlying HTML processor
/// - Only absolute URLs (i.e. URLs with a scheme) are allowed. Relative links are discarded
pub struct BubbleBath<'a> {
    /// Attributes you want to keep on all tags
    pub allowed_generic_attributes: HashSet<&'a str>,

    /// Tags you want to keep
    pub allowed_tags: HashSet<&'a str>,

    /// Attributes you want to keep on a per-tag basis
    ///
    /// Keyed by tag name; the value is the set of attribute names kept on that tag
    /// (in addition to the generic ones).
    pub allowed_tag_attributes: HashMap<&'a str, HashSet<&'a str>>,

    /// Schemes you want to allow on URLs
    ///
    /// Checked for every attribute listed in `clean_url_attributes`
    pub allowed_url_schemes: HashSet<&'a str>,

    /// Clean certain attributes on tags as if they are URLs
    ///
    /// Keyed by tag name; the value is the set of attribute names treated as URLs.
    /// An attribute whose URL scheme is not allowed is removed.
    pub clean_url_attributes: HashMap<&'a str, HashSet<&'a str>>,

    /// Memory settings for the underlying HTML transformer
    pub memory_settings: MemorySettings,

    /// Instead of removing tags (and potentially their content), escape the HTML instead and output them as raw text
    pub preserve_escaped: bool,

    /// Tags of which you want to remove the tag *and* the content of
    ///
    /// By default `bubble-bath` preserves the content of tags
    ///
    /// **Note**: Remember to put `<script>` and `<style>` tags in here (unless you 100% know what you are doing),
    /// since their content is dangerous when emitted as text
    pub remove_content_tags: HashSet<&'a str>,

    /// Attributes you want to set on a per-tag basis
    ///
    /// Keyed by tag name; the value maps attribute names to the value that is set on every
    /// allowed element with that tag.
    pub set_tag_attributes: HashMap<&'a str, HashMap<&'a str, &'a str>>,
}
116
117impl BubbleBath<'_> {
118    #[inline]
119    fn clean_attributes(&self, element: &mut Element<'_, '_>, tag_name: &str) {
120        let allowed_attributes = self.allowed_tag_attributes.get(tag_name);
121
122        let mut remove_attributes = Vec::with_capacity(element.attributes().len());
123        for attribute in element.attributes() {
124            let attribute_name = attribute.name();
125
126            if self
127                .allowed_generic_attributes
128                .contains(attribute_name.as_str())
129            {
130                continue;
131            }
132
133            if let Some(allowed_attributes) = allowed_attributes {
134                if allowed_attributes.contains(attribute_name.as_str()) {
135                    continue;
136                }
137            }
138
139            remove_attributes.push(attribute_name);
140        }
141
142        for attribute_name in remove_attributes {
143            element.remove_attribute(&attribute_name);
144        }
145    }
146
147    #[inline]
148    fn clean_link(&self, element: &mut Element<'_, '_>, attribute_name: &str) {
149        let Some(raw_url) = element.get_attribute(attribute_name) else {
150            return;
151        };
152
153        let Some((scheme, _rest)) = raw_url.split_once("://") else {
154            element.remove_attribute(attribute_name);
155            return;
156        };
157
158        if !self.allowed_url_schemes.contains(scheme) {
159            element.remove_attribute(attribute_name);
160        }
161    }
162
163    #[inline]
164    fn delete_element(&self, element: &mut Element<'_, '_>, tag_name: &str) {
165        if self.preserve_escaped {
166            let start_tag = element.start_tag();
167
168            let mut formatted = String::new();
169            let _ = write!(formatted, "<{tag_name}");
170
171            for attribute in start_tag.attributes() {
172                let _ = write!(formatted, " {}=\"{}\"", attribute.name(), attribute.value());
173            }
174
175            if start_tag.self_closing() {
176                formatted.push_str(" />");
177            } else {
178                formatted.push('>');
179            }
180
181            start_tag.replace(&formatted, ContentType::Text);
182
183            if let Some(handlers) = element.end_tag_handlers() {
184                handlers.push(Box::new(move |end_tag| {
185                    let tag_name = end_tag.name();
186                    let content = format!("</{tag_name}>");
187                    end_tag.replace(&content, ContentType::Text);
188
189                    Ok(())
190                }));
191            }
192        } else {
193            element.remove_and_keep_content();
194        }
195    }
196
197    #[inline]
198    fn element_handler(
199        &self,
200        element: &mut Element<'_, '_>,
201        unclosed_tags: Rc<RefCell<Slab<String>>>,
202    ) -> HandlerResult {
203        let tag_name = element.tag_name();
204
205        if self.remove_content_tags.contains(tag_name.as_str()) {
206            element.remove();
207            return Ok(());
208        }
209
210        if !self.allowed_tags.contains(tag_name.as_str()) {
211            self.delete_element(element, &tag_name);
212            return Ok(());
213        }
214
215        self.clean_attributes(element, &tag_name);
216
217        if let Some(set_attributes) = self.set_tag_attributes.get(tag_name.as_str()) {
218            for (name, value) in set_attributes {
219                element.set_attribute(name, value)?;
220            }
221        }
222
223        if let Some(attributes) = self.clean_url_attributes.get(tag_name.as_str()) {
224            for name in attributes {
225                self.clean_link(element, name);
226            }
227        }
228
229        // Manually balance the tags if they aren't self-closing
230        if !element.is_self_closing() {
231            let unclosed_tag_idx = {
232                let mut unclosed_tags = unclosed_tags.borrow_mut();
233                unclosed_tags.insert(tag_name)
234            };
235
236            if let Some(end_tag_handlers) = element.end_tag_handlers() {
237                end_tag_handlers.push(Box::new(move |_end_tag| {
238                    unclosed_tags.borrow_mut().remove(unclosed_tag_idx);
239                    Ok(())
240                }));
241            }
242        }
243
244        Ok(())
245    }
246
247    #[inline]
248    fn count_unclosed_opening_tags<B>(counter: &mut usize, input: B)
249    where
250        B: AsRef<[u8]>,
251    {
252        let bytes = input.as_ref();
253
254        let opening_tags = bytecount::count(bytes, b'<');
255        let closing_tags = bytecount::count(bytes, b'>');
256
257        *counter = counter.saturating_add(opening_tags);
258        *counter = counter.saturating_sub(closing_tags);
259    }
260
261    #[inline]
262    fn subtract_opening_tags<B>(counter: &mut usize, input: B)
263    where
264        B: AsRef<[u8]>,
265    {
266        let mut tmp_counter = 0;
267        Self::count_unclosed_opening_tags(&mut tmp_counter, input);
268
269        *counter = counter.saturating_sub(tmp_counter);
270    }
271
272    #[inline]
273    fn comment_handler(comment: &mut Comment<'_>, opening_tags: &RefCell<usize>) {
274        Self::subtract_opening_tags(&mut opening_tags.borrow_mut(), comment.text());
275        comment.remove();
276    }
277
278    #[inline]
279    fn text_handler(chunk: &mut TextChunk<'_>, opening_tags: &RefCell<usize>) {
280        Self::subtract_opening_tags(&mut opening_tags.borrow_mut(), chunk.as_str());
281        *chunk.as_mut_str() = clean_text(chunk.as_str());
282    }
283
284    /// Clean HTML in a streaming fashion
285    ///
286    /// # Errors
287    ///
288    /// - The HTML rewriter ran out of memory
289    /// - The HTML parser ran into an ambiguous state (in this case you should just discard the text instead of trying to fix it)
290    /// - The name of an attribute you put into the `set_tag_attributes` hashmap is invalid
291    #[inline]
292    pub fn clean_streaming<'a, I, S>(&self, input: I, sink: S) -> Result<(), Error>
293    where
294        I: Iterator<Item = &'a [u8]>,
295        S: FnMut(&[u8]),
296    {
297        let unclosed_tags = Rc::new(RefCell::new(Slab::new()));
298        let opening_tags = RefCell::new(0);
299
300        let comment_handler = |comment: &mut Comment<'_>| {
301            Self::comment_handler(comment, &opening_tags);
302            Ok(())
303        };
304        let document_end_handler = |document_end: &mut DocumentEnd<'_>| {
305            let unclosed_tags = unclosed_tags.borrow();
306            for (_key, content) in unclosed_tags.iter() {
307                let formatted = format!("</{content}>");
308                document_end.append(&formatted, ContentType::Html);
309            }
310
311            Ok(())
312        };
313        let text_handler = |chunk: &mut TextChunk<'_>| {
314            Self::text_handler(chunk, &opening_tags);
315            Ok(())
316        };
317
318        let document_content_handlers = vec![DocumentContentHandlers::default()
319            .comments(comment_handler)
320            .text(text_handler)
321            .end(document_end_handler)];
322
323        // Don't ask me why we need this. This is dumb and I don't like it.
324        // It's required so the compiler recognizes that our closure, indeed, implements the handler trait.
325        #[inline(always)]
326        fn bounds_assertion<T>(uwu: T) -> T
327        where
328            T: FnMut(&mut Element<'_, '_>) -> HandlerResult,
329        {
330            uwu
331        }
332
333        let element_content_handlers = vec![(
334            Cow::Borrowed(&*SELECT_ALL),
335            ElementContentHandlers::default().element(bounds_assertion(|element| {
336                self.element_handler(element, unclosed_tags.clone())
337            })),
338        )];
339
340        let settings = Settings {
341            document_content_handlers,
342            element_content_handlers,
343            memory_settings: MemorySettings {
344                preallocated_parsing_buffer_size: self
345                    .memory_settings
346                    .preallocated_parsing_buffer_size,
347                max_allowed_memory_usage: self.memory_settings.max_allowed_memory_usage,
348            },
349            ..Settings::default()
350        };
351
352        let mut rewriter = HtmlRewriter::new(settings, sink);
353
354        for chunk in input {
355            Self::count_unclosed_opening_tags(&mut opening_tags.borrow_mut(), chunk);
356
357            rewriter.write(chunk)?;
358        }
359
360        let opening_tags = *opening_tags.borrow();
361        for _ in 0..opening_tags {
362            rewriter.write(&[b'>'])?;
363        }
364
365        rewriter.end()?;
366
367        Ok(())
368    }
369
370    /// Clean the provided HTML content
371    ///
372    /// # Errors
373    ///
374    /// - The output of the HTML transformer was not valid UTF-8
375    ///
376    /// Check [`Self::clean_streaming`] for additional errors
377    #[inline]
378    pub fn clean(&self, content: &str) -> Result<String, Error> {
379        let mut acc = Vec::with_capacity(content.len());
380        self.clean_streaming(iter::once(content.as_bytes()), |out| {
381            acc.extend_from_slice(out);
382        })?;
383
384        // SAFETY: Since the input is a string slice, we can be confident that it is valid UTF-8.
385        // We also buffered the entirety of the output into the accumulator.
386        //
387        // According to [this comment](https://github.com/cloudflare/lol-html/issues/200#issuecomment-1829731640),
388        // `lol_html` always outputs the data in the same encoding it was supplied in.
389        //
390        // Meaning, since we have the entire output accumulated and the source encoding is valid UTF-8,
391        // this byte vector is, indeed, valid UTF-8.
392        #[allow(unsafe_code)]
393        Ok(unsafe { String::from_utf8_unchecked(acc) })
394    }
395}
396
397impl Default for BubbleBath<'static> {
398    #[allow(clippy::too_many_lines)]
399    fn default() -> Self {
400        // Safe defaults taken from ammonia
401        #[rustfmt::skip]
402        let allowed_tags = hashset![
403            "a", "abbr", "acronym", "area", "article", "aside", "b", "bdi",
404            "bdo", "blockquote", "br", "caption", "center", "cite", "code",
405            "col", "colgroup", "data", "dd", "del", "details", "dfn", "div",
406            "dl", "dt", "em", "figcaption", "figure", "footer", "h1", "h2",
407            "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "i", "img",
408            "ins", "kbd", "li", "map", "mark", "nav", "ol", "p", "pre",
409            "q", "rp", "rt", "rtc", "ruby", "s", "samp", "small", "span",
410            "strike", "strong", "sub", "summary", "sup", "table", "tbody",
411            "td", "th", "thead", "time", "tr", "tt", "u", "ul", "var", "wbr",
412        ];
413        let allowed_generic_attributes = hashset!["lang", "title"];
414        let allowed_tag_attributes = hashmap![
415            "a" => hashset![
416                "href", "hreflang"
417            ],
418            "bdo" => hashset![
419                "dir"
420            ],
421            "blockquote" => hashset![
422                "cite"
423            ],
424            "col" => hashset![
425                "align", "char", "charoff", "span"
426            ],
427            "colgroup" => hashset![
428                "align", "char", "charoff", "span"
429            ],
430            "del" => hashset![
431                "cite", "datetime"
432            ],
433            "hr" => hashset![
434                "align", "size", "width"
435            ],
436            "img" => hashset![
437                "align", "alt", "height", "src", "width"
438            ],
439            "ins" => hashset![
440                "cite", "datetime"
441            ],
442            "ol" => hashset![
443                "start"
444            ],
445            "q" => hashset![
446                "cite"
447            ],
448            "table" => hashset![
449                "align", "char", "charoff", "summary"
450            ],
451            "tbody" => hashset![
452                "align", "char", "charoff"
453            ],
454            "td" => hashset![
455                "align", "char", "charoff", "colspan", "headers", "rowspan"
456            ],
457            "tfoot" => hashset![
458                "align", "char", "charoff"
459            ],
460            "th" => hashset![
461                "align", "char", "charoff", "colspan", "headers", "rowspan", "scope"
462            ],
463            "thead" => hashset![
464                "align", "char", "charoff"
465            ],
466            "tr" => hashset![
467                "align", "char", "charoff"
468            ],
469        ];
470        let allowed_url_schemes = hashset![
471            "bitcoin",
472            "ftp",
473            "ftps",
474            "geo",
475            "http",
476            "https",
477            "im",
478            "irc",
479            "ircs",
480            "magnet",
481            "mailto",
482            "mms",
483            "mx",
484            "news",
485            "nntp",
486            "openpgp4fpr",
487            "sip",
488            "sms",
489            "smsto",
490            "ssh",
491            "tel",
492            "url",
493            "webcal",
494            "wtai",
495            "xmpp",
496        ];
497        let clean_url_attributes = hashmap![
498            "a" => hashset!["href"],
499            "img" => hashset!["src"],
500            "link" => hashset!["href"],
501        ];
502        let remove_content_tags = hashset!["script", "style"];
503        let set_tag_attributes = hashmap![
504            "a" => hashmap![
505                "rel" => "noopener noreferrer",
506            ],
507        ];
508
509        Self {
510            allowed_tags,
511            allowed_generic_attributes,
512            allowed_tag_attributes,
513            allowed_url_schemes,
514            clean_url_attributes,
515            memory_settings: MemorySettings::default(),
516            preserve_escaped: false,
517            remove_content_tags,
518            set_tag_attributes,
519        }
520    }
521}