feedparser_rs/util/
sanitize.rs

1//! HTML sanitization utilities
2//!
3//! This module provides functions for sanitizing HTML content to prevent XSS attacks
4//! while preserving safe formatting.
5
6use ammonia::Builder;
7use std::collections::HashSet;
8
9/// Sanitize HTML content, removing dangerous tags and attributes
10///
11/// This function uses ammonia to clean HTML content, allowing only safe tags
12/// and attributes. It's designed to match feedparser's sanitization behavior.
13///
14/// # Arguments
15///
16/// * `input` - HTML string to sanitize
17///
18/// # Returns
19///
20/// Sanitized HTML string with dangerous content removed
21///
22/// # Examples
23///
24/// ```
25/// use feedparser_rs::util::sanitize::sanitize_html;
26///
27/// let unsafe_html = r#"<p>Hello</p><script>alert('XSS')</script>"#;
28/// let safe_html = sanitize_html(unsafe_html);
29/// assert_eq!(safe_html, "<p>Hello</p>");
30/// ```
31pub fn sanitize_html(input: &str) -> String {
32    // NOTE: Inline HashSet construction is faster than LazyLock with .clone()
33    // because ammonia requires owned values. See benchmark results in .local/
34    let safe_tags: HashSet<_> = [
35        // Text formatting
36        "a",
37        "abbr",
38        "acronym",
39        "b",
40        "cite",
41        "code",
42        "em",
43        "i",
44        "kbd",
45        "mark",
46        "s",
47        "samp",
48        "small",
49        "strike",
50        "strong",
51        "sub",
52        "sup",
53        "u",
54        "var",
55        // Structural
56        "br",
57        "div",
58        "hr",
59        "p",
60        "span",
61        // Headings
62        "h1",
63        "h2",
64        "h3",
65        "h4",
66        "h5",
67        "h6",
68        // Lists
69        "dd",
70        "dl",
71        "dt",
72        "li",
73        "ol",
74        "ul",
75        // Tables
76        "caption",
77        "table",
78        "tbody",
79        "td",
80        "tfoot",
81        "th",
82        "thead",
83        "tr",
84        // Quotes
85        "blockquote",
86        "q",
87        // Pre-formatted
88        "pre",
89        // Media
90        "img",
91    ]
92    .into_iter()
93    .collect();
94
95    let safe_attrs: HashSet<_> = ["alt", "cite", "class", "href", "id", "src", "title"]
96        .into_iter()
97        .collect();
98
99    let safe_url_schemes: HashSet<_> = ["http", "https", "mailto"].into_iter().collect();
100
101    Builder::default()
102        .tags(safe_tags)
103        .generic_attributes(safe_attrs)
104        .link_rel(Some("nofollow noopener noreferrer"))
105        .url_schemes(safe_url_schemes)
106        .clean(input)
107        .to_string()
108}
109
110/// Decode HTML entities to Unicode characters
111///
112/// # Examples
113///
114/// ```
115/// use feedparser_rs::util::sanitize::decode_entities;
116///
117/// assert_eq!(decode_entities("&lt;p&gt;Hello&lt;/p&gt;"), "<p>Hello</p>");
118/// assert_eq!(decode_entities("&amp;amp;"), "&amp;");
119/// ```
120pub fn decode_entities(input: &str) -> String {
121    html_escape::decode_html_entities(input).to_string()
122}
123
124/// Sanitize and decode HTML content
125///
126/// This combines sanitization and entity decoding in the correct order:
127/// 1. Decode entities first
128/// 2. Then sanitize HTML
129///
130/// # Examples
131///
132/// ```
133/// use feedparser_rs::util::sanitize::sanitize_and_decode;
134///
135/// let input = "&lt;p&gt;Safe&lt;/p&gt;&lt;script&gt;alert('XSS')&lt;/script&gt;";
136/// let output = sanitize_and_decode(input);
137/// assert_eq!(output, "<p>Safe</p>");
138/// ```
139pub fn sanitize_and_decode(input: &str) -> String {
140    let decoded = decode_entities(input);
141    sanitize_html(&decoded)
142}
143
144/// Strip all HTML tags, leaving only text content
145///
146/// # Examples
147///
148/// ```
149/// use feedparser_rs::util::sanitize::strip_tags;
150///
151/// assert_eq!(strip_tags("<p>Hello <b>world</b></p>"), "Hello world");
152/// ```
153pub fn strip_tags(input: &str) -> String {
154    Builder::default()
155        .tags(HashSet::new())
156        .clean(input)
157        .to_string()
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// `<script>` elements are dropped while the surrounding text is kept.
    #[test]
    fn test_sanitize_removes_script() {
        let sanitized = sanitize_html(r"<p>Hello</p><script>alert('XSS')</script>");
        assert!(sanitized.contains("Hello"));
        assert!(!sanitized.contains("script"));
    }

    /// Allow-listed tags pass through the sanitizer.
    #[test]
    fn test_sanitize_allows_safe_tags() {
        let input = r#"<p>Hello <b>world</b> <a href="http://example.com">link</a></p>"#;
        let sanitized = sanitize_html(input);
        for expected in ["<p>", "<b>", "<a"] {
            assert!(sanitized.contains(expected));
        }
    }

    /// Inline event handlers are removed; the `href` attribute stays.
    #[test]
    fn test_sanitize_removes_onclick() {
        let sanitized = sanitize_html(r#"<a href="/" onclick="alert('XSS')">Click</a>"#);
        assert!(sanitized.contains("href"));
        assert!(!sanitized.contains("onclick"));
    }

    /// Common named entities and one decimal entity decode to their characters.
    #[test]
    fn test_decode_entities() {
        let cases = [
            ("&lt;p&gt;", "<p>"),
            ("&amp;", "&"),
            ("&quot;", "\""),
            ("&#39;", "'"),
        ];
        for (encoded, expected) in cases {
            assert_eq!(decode_entities(encoded), expected);
        }
    }

    /// Decimal and hexadecimal numeric entities both decode.
    #[test]
    fn test_decode_numeric_entities() {
        for encoded in ["&#60;", "&#x3C;"] {
            assert_eq!(decode_entities(encoded), "<");
        }
    }

    /// Entity-escaped markup is decoded first and then sanitized.
    #[test]
    fn test_sanitize_and_decode() {
        let escaped = "&lt;p&gt;Safe&lt;/p&gt;&lt;script&gt;Bad&lt;/script&gt;";
        let result = sanitize_and_decode(escaped);
        assert!(result.contains("<p>Safe</p>"));
        assert!(!result.contains("script"));
    }

    /// All tags are removed and the text is preserved.
    #[test]
    fn test_strip_tags() {
        assert_eq!(strip_tags("<p>Hello <b>world</b></p>"), "Hello world");
    }

    /// The `onerror` handler on an image is removed.
    #[test]
    fn test_xss_img_onerror() {
        let sanitized = sanitize_html(r#"<img src="x" onerror="alert('XSS')">"#);
        assert!(!sanitized.contains("onerror"));
    }

    /// `javascript:` URLs do not survive sanitization.
    #[test]
    fn test_xss_javascript_url() {
        let sanitized = sanitize_html(r#"<a href="javascript:alert('XSS')">Click</a>"#);
        assert!(!sanitized.contains("javascript:"));
    }

    /// `<iframe>` elements are removed.
    #[test]
    fn test_xss_iframe() {
        let sanitized = sanitize_html(r#"<iframe src="http://evil.com"></iframe>"#);
        assert!(!sanitized.contains("iframe"));
    }

    /// `data:` URLs do not survive sanitization.
    #[test]
    fn test_xss_data_url() {
        let input = r#"<a href="data:text/html,<script>alert('XSS')</script>">Click</a>"#;
        let sanitized = sanitize_html(input);
        assert!(!sanitized.contains("data:"));
    }

    /// Empty input yields empty output.
    #[test]
    fn test_sanitize_empty_string() {
        assert_eq!(sanitize_html(""), "");
    }

    /// Text without markup is returned unchanged.
    #[test]
    fn test_sanitize_plain_text() {
        let plain = "Plain text with no tags";
        assert_eq!(sanitize_html(plain), plain);
    }

    /// Text without entities is returned unchanged.
    #[test]
    fn test_decode_entities_no_entities() {
        let plain = "No entities here";
        assert_eq!(decode_entities(plain), plain);
    }

    /// Nested tags are all removed, leaving the inner text.
    #[test]
    fn test_strip_tags_nested() {
        let nested = "<div><p>Hello <span><b>world</b></span></p></div>";
        assert_eq!(strip_tags(nested), "Hello world");
    }

    /// Sanitized links carry every configured `rel` token.
    #[test]
    fn test_sanitize_link_rel_attribute() {
        let sanitized = sanitize_html(r#"<a href="http://example.com">Link</a>"#);
        for token in ["nofollow", "noopener", "noreferrer"] {
            assert!(sanitized.contains(token));
        }
    }
}