feedparser_rs/util/
sanitize.rs1use ammonia::Builder;
7use std::collections::HashSet;
8
9pub fn sanitize_html(input: &str) -> String {
32 let safe_tags: HashSet<_> = [
35 "a",
37 "abbr",
38 "acronym",
39 "b",
40 "cite",
41 "code",
42 "em",
43 "i",
44 "kbd",
45 "mark",
46 "s",
47 "samp",
48 "small",
49 "strike",
50 "strong",
51 "sub",
52 "sup",
53 "u",
54 "var",
55 "br",
57 "div",
58 "hr",
59 "p",
60 "span",
61 "h1",
63 "h2",
64 "h3",
65 "h4",
66 "h5",
67 "h6",
68 "dd",
70 "dl",
71 "dt",
72 "li",
73 "ol",
74 "ul",
75 "caption",
77 "table",
78 "tbody",
79 "td",
80 "tfoot",
81 "th",
82 "thead",
83 "tr",
84 "blockquote",
86 "q",
87 "pre",
89 "img",
91 ]
92 .into_iter()
93 .collect();
94
95 let safe_attrs: HashSet<_> = ["alt", "cite", "class", "href", "id", "src", "title"]
96 .into_iter()
97 .collect();
98
99 let safe_url_schemes: HashSet<_> = ["http", "https", "mailto"].into_iter().collect();
100
101 Builder::default()
102 .tags(safe_tags)
103 .generic_attributes(safe_attrs)
104 .link_rel(Some("nofollow noopener noreferrer"))
105 .url_schemes(safe_url_schemes)
106 .clean(input)
107 .to_string()
108}
109
110pub fn decode_entities(input: &str) -> String {
121 html_escape::decode_html_entities(input).to_string()
122}
123
124pub fn sanitize_and_decode(input: &str) -> String {
140 let decoded = decode_entities(input);
141 sanitize_html(&decoded)
142}
143
144pub fn strip_tags(input: &str) -> String {
154 Builder::default()
155 .tags(HashSet::new())
156 .clean(input)
157 .to_string()
158}
159
#[cfg(test)]
mod tests {
    //! Unit tests for the sanitizing, entity-decoding and tag-stripping helpers.

    use super::*;

    #[test]
    fn test_sanitize_removes_script() {
        let html = r"<p>Hello</p><script>alert('XSS')</script>";
        let clean = sanitize_html(html);
        assert!(!clean.contains("script"));
        assert!(clean.contains("Hello"));
    }

    #[test]
    fn test_sanitize_allows_safe_tags() {
        let html = r#"<p>Hello <b>world</b> <a href="http://example.com">link</a></p>"#;
        let clean = sanitize_html(html);
        assert!(clean.contains("<p>"));
        assert!(clean.contains("<b>"));
        assert!(clean.contains("<a"));
    }

    #[test]
    fn test_sanitize_removes_onclick() {
        let html = r#"<a href="/" onclick="alert('XSS')">Click</a>"#;
        let clean = sanitize_html(html);
        assert!(!clean.contains("onclick"));
        assert!(clean.contains("href"));
    }

    #[test]
    fn test_decode_entities() {
        // Inputs must be entity-encoded; passing already-decoded text would
        // make these assertions hold without exercising the decoder.
        assert_eq!(decode_entities("&lt;p&gt;"), "<p>");
        assert_eq!(decode_entities("&amp;"), "&");
        assert_eq!(decode_entities("&quot;"), "\"");
        assert_eq!(decode_entities("&#39;"), "'");
    }

    #[test]
    fn test_decode_numeric_entities() {
        // Decimal and hexadecimal character references for `<`.
        assert_eq!(decode_entities("&#60;"), "<");
        assert_eq!(decode_entities("&#x3C;"), "<");
    }

    #[test]
    fn test_sanitize_and_decode() {
        // Entity-escaped markup: the tags only exist after decoding, so this
        // checks that sanitizing really runs on the decoded text.
        let input = "&lt;p&gt;Safe&lt;/p&gt;&lt;script&gt;Bad&lt;/script&gt;";
        let output = sanitize_and_decode(input);
        assert!(output.contains("<p>Safe</p>"));
        assert!(!output.contains("script"));
    }

    #[test]
    fn test_strip_tags() {
        let html = "<p>Hello <b>world</b></p>";
        assert_eq!(strip_tags(html), "Hello world");
    }

    #[test]
    fn test_xss_img_onerror() {
        let html = r#"<img src="x" onerror="alert('XSS')">"#;
        let clean = sanitize_html(html);
        assert!(!clean.contains("onerror"));
    }

    #[test]
    fn test_xss_javascript_url() {
        let html = r#"<a href="javascript:alert('XSS')">Click</a>"#;
        let clean = sanitize_html(html);
        assert!(!clean.contains("javascript:"));
    }

    #[test]
    fn test_xss_iframe() {
        let html = r#"<iframe src="http://evil.com"></iframe>"#;
        let clean = sanitize_html(html);
        assert!(!clean.contains("iframe"));
    }

    #[test]
    fn test_xss_data_url() {
        let html = r#"<a href="data:text/html,<script>alert('XSS')</script>">Click</a>"#;
        let clean = sanitize_html(html);
        assert!(!clean.contains("data:"));
    }

    #[test]
    fn test_sanitize_empty_string() {
        assert_eq!(sanitize_html(""), "");
    }

    #[test]
    fn test_sanitize_plain_text() {
        let text = "Plain text with no tags";
        assert_eq!(sanitize_html(text), text);
    }

    #[test]
    fn test_decode_entities_no_entities() {
        let text = "No entities here";
        assert_eq!(decode_entities(text), text);
    }

    #[test]
    fn test_strip_tags_nested() {
        let html = "<div><p>Hello <span><b>world</b></span></p></div>";
        assert_eq!(strip_tags(html), "Hello world");
    }

    #[test]
    fn test_sanitize_link_rel_attribute() {
        let html = r#"<a href="http://example.com">Link</a>"#;
        let clean = sanitize_html(html);
        assert!(clean.contains("nofollow"));
        assert!(clean.contains("noopener"));
        assert!(clean.contains("noreferrer"));
    }
}