1use std::collections::HashSet;
2
/// Stateless namespace for lightweight, string-level HTML helpers.
///
/// None of the helpers build a DOM. They split the input into text and
/// `<...>` spans and work on those spans. A `>` inside a quoted attribute
/// value is therefore treated as the end of the tag. Tag and attribute names
/// are compared ASCII-case-insensitively.
#[derive(Debug, Clone, Copy, Default)]
pub struct HtmlUtil;

/// One lexical piece of an HTML string.
#[derive(Debug, Clone, Copy)]
enum HtmlSegment<'a> {
    /// Literal text. A `<` that never reaches a `>` is reported as text that
    /// starts with `<`.
    Text(&'a str),
    /// A complete `<...>` span, angle brackets included.
    Tag(&'a str),
}

/// The interesting parts of a `<...>` span.
#[derive(Debug, Clone, Copy)]
struct TagHead<'a> {
    /// Element name as written (empty for spans such as `<>` or `< p>`).
    name: &'a str,
    /// `true` for `</name ...>`.
    closing: bool,
    /// `true` when a non-closing span ends in `/>`.
    self_closing: bool,
}

impl HtmlUtil {
    /// Elements that never have content or a closing tag.
    const VOID_ELEMENTS: &'static [&'static str] = &[
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    /// Disallowed elements whose content is code rather than document text.
    const RAW_TEXT_ELEMENTS: &'static [&'static str] = &["script", "style"];

    /// Splits `html` into text and tag spans without copying.
    ///
    /// A `<` only starts a tag if a `>` follows before the next `<`;
    /// otherwise it is returned as text so that input like `1 < 2` survives.
    fn segments<'a>(html: &'a str) -> Vec<HtmlSegment<'a>> {
        let mut out = Vec::new();
        let mut rest = html;
        while let Some(lt) = rest.find('<') {
            if lt > 0 {
                out.push(HtmlSegment::Text(&rest[..lt]));
            }
            let candidate = &rest[lt..];
            // '<' and '>' are single-byte, so the byte arithmetic below always
            // lands on a char boundary.
            match candidate[1..].find(|c: char| c == '<' || c == '>') {
                Some(off) if candidate.as_bytes()[off + 1] == b'>' => {
                    let (tag, tail) = candidate.split_at(off + 2);
                    out.push(HtmlSegment::Tag(tag));
                    rest = tail;
                }
                Some(off) => {
                    // Another '<' came first: the current one was literal text.
                    let (literal, tail) = candidate.split_at(off + 1);
                    out.push(HtmlSegment::Text(literal));
                    rest = tail;
                }
                None => {
                    out.push(HtmlSegment::Text(candidate));
                    rest = "";
                }
            }
        }
        if !rest.is_empty() {
            out.push(HtmlSegment::Text(rest));
        }
        out
    }

    /// Parses the element name and open/close/self-closing flags of a
    /// `<...>` span produced by [`HtmlUtil::segments`].
    fn tag_head<'a>(raw: &'a str) -> TagHead<'a> {
        let inner = &raw[1..raw.len() - 1];
        let (closing, after_slash) = match inner.strip_prefix('/') {
            Some(stripped) => (true, stripped),
            None => (false, inner),
        };
        let name_len = after_slash
            .find(|c: char| c.is_whitespace() || c == '/')
            .unwrap_or(after_slash.len());
        TagHead {
            name: &after_slash[..name_len],
            closing,
            self_closing: !closing && inner.ends_with('/'),
        }
    }

    /// Splits the attribute at the start of `attr_start` into its name and the
    /// byte length of the whole attribute (name, `=`, and value if present).
    fn split_attr(attr_start: &str) -> (&str, usize) {
        // The first char always belongs to the name, which guarantees progress
        // even on malformed input such as a leading '='.
        let first_len = attr_start.chars().next().map_or(0, |c| c.len_utf8());
        let name_len = attr_start[first_len..]
            .find(|c: char| c.is_whitespace() || c == '=' || c == '/')
            .map_or(attr_start.len(), |i| i + first_len);
        let after_name = attr_start[name_len..].trim_start();
        if !after_name.starts_with('=') {
            // Boolean attribute: no value to consume.
            return (&attr_start[..name_len], name_len);
        }
        let value = after_name[1..].trim_start();
        let value_len = match value.chars().next() {
            Some(q @ '"') | Some(q @ '\'') => {
                value[1..].find(q).map_or(value.len(), |i| i + 2)
            }
            _ => value.find(char::is_whitespace).unwrap_or(value.len()),
        };
        (
            &attr_start[..name_len],
            attr_start.len() - value.len() + value_len,
        )
    }

    /// Appends `raw` to `out`, dropping every attribute for which `keep`
    /// returns `false`.
    ///
    /// A removed attribute takes its leading whitespace with it; if the last
    /// attribute is removed, whitespace before the closing `>` goes too.
    /// Closing tags, nameless spans, and `<!...>` / `<?...>` spans are copied
    /// verbatim.
    fn write_tag_keeping_attrs<F>(out: &mut String, raw: &str, keep: F)
    where
        F: Fn(&str) -> bool,
    {
        let head = Self::tag_head(raw);
        if head.closing
            || head.name.is_empty()
            || head.name.starts_with('!')
            || head.name.starts_with('?')
        {
            out.push_str(raw);
            return;
        }

        let inner = &raw[1..raw.len() - 1];
        out.push('<');
        out.push_str(head.name);

        let mut rest = &inner[head.name.len()..];
        let mut last_removed = false;
        loop {
            let trimmed = rest.trim_start();
            let gap = &rest[..rest.len() - trimmed.len()];
            if trimmed.is_empty() {
                if !last_removed {
                    out.push_str(gap);
                }
                break;
            }
            if trimmed.starts_with('/') {
                // Self-closing slash (or a stray one): not an attribute.
                out.push_str(gap);
                out.push('/');
                rest = &trimmed[1..];
                last_removed = false;
                continue;
            }
            let (name, attr_len) = Self::split_attr(trimmed);
            if keep(name) {
                out.push_str(gap);
                out.push_str(&trimmed[..attr_len]);
                last_removed = false;
            } else {
                last_removed = true;
            }
            rest = &trimmed[attr_len..];
        }
        out.push('>');
    }

    /// Removes every `tag` element from `html`, content included.
    ///
    /// Nested occurrences of the same element are handled by depth counting.
    /// Self-closing tags and HTML void elements (`img`, `br`, ...) remove only
    /// the tag itself. A non-void element that is never closed removes
    /// everything up to the end of the input. Stray closing tags of the target
    /// are dropped. An empty `tag` returns `html` unchanged.
    pub fn remove_html_tag(html: &str, tag: &str) -> String {
        if tag.is_empty() {
            return html.to_string();
        }
        let is_void = Self::VOID_ELEMENTS
            .iter()
            .any(|v| v.eq_ignore_ascii_case(tag));

        let mut result = String::with_capacity(html.len());
        let mut depth = 0usize;
        for segment in Self::segments(html) {
            match segment {
                HtmlSegment::Tag(raw) => {
                    let head = Self::tag_head(raw);
                    if head.name.eq_ignore_ascii_case(tag) {
                        // The target's own tags never reach the output.
                        if head.closing {
                            depth = depth.saturating_sub(1);
                        } else if !head.self_closing && !is_void {
                            depth += 1;
                        }
                    } else if depth == 0 {
                        result.push_str(raw);
                    }
                }
                HtmlSegment::Text(text) => {
                    if depth == 0 {
                        result.push_str(text);
                    }
                }
            }
        }
        result
    }

    /// Strips all markup, returning only the text between tags.
    ///
    /// Everything from a `<` up to and including the next `>` is discarded,
    /// and a `>` outside a tag is discarded as well, so the result never
    /// contains `<` or `>`. An unterminated `<` discards the rest of the
    /// input.
    pub fn clean_html_tag(html: &str) -> String {
        let mut inside_markup = false;
        html.chars()
            .filter(|&c| match c {
                '<' => {
                    inside_markup = true;
                    false
                }
                '>' => {
                    inside_markup = false;
                    false
                }
                _ => !inside_markup,
            })
            .collect()
    }

    /// Removes the opening, closing, and self-closing tags of `tag` while
    /// keeping their content and every other tag untouched.
    ///
    /// An empty `tag` returns `html` unchanged.
    pub fn unwrap_html_tag(html: &str, tag: &str) -> String {
        if tag.is_empty() {
            return html.to_string();
        }
        let mut result = String::with_capacity(html.len());
        for segment in Self::segments(html) {
            match segment {
                HtmlSegment::Tag(raw)
                    if Self::tag_head(raw).name.eq_ignore_ascii_case(tag) => {}
                HtmlSegment::Tag(raw) | HtmlSegment::Text(raw) => result.push_str(raw),
            }
        }
        result
    }

    /// Removes the attribute named `attr` (with or without a value) from every
    /// tag in `html`.
    ///
    /// Quoted (`"..."`, `'...'`) and unquoted values are recognised. Other
    /// attributes, text, and closing tags are copied verbatim. An empty `attr`
    /// returns `html` unchanged.
    pub fn remove_html_attr(html: &str, attr: &str) -> String {
        if attr.is_empty() {
            return html.to_string();
        }
        let mut result = String::with_capacity(html.len());
        for segment in Self::segments(html) {
            match segment {
                HtmlSegment::Text(text) => result.push_str(text),
                HtmlSegment::Tag(raw) => {
                    Self::write_tag_keeping_attrs(&mut result, raw, |name| {
                        !name.eq_ignore_ascii_case(attr)
                    });
                }
            }
        }
        result
    }

    /// Removes every attribute from each `tag` element in `html`.
    ///
    /// Only tags whose name is exactly `tag` are rewritten (`"b"` does not
    /// touch `<br>`). A self-closing slash is preserved. An empty `tag`
    /// returns `html` unchanged.
    pub fn remove_all_html_attr(html: &str, tag: &str) -> String {
        if tag.is_empty() {
            return html.to_string();
        }
        let mut result = String::with_capacity(html.len());
        for segment in Self::segments(html) {
            match segment {
                HtmlSegment::Text(text) => result.push_str(text),
                HtmlSegment::Tag(raw) => {
                    let head = Self::tag_head(raw);
                    if !head.closing && head.name.eq_ignore_ascii_case(tag) {
                        Self::write_tag_keeping_attrs(&mut result, raw, |_| false);
                    } else {
                        result.push_str(raw);
                    }
                }
            }
        }
        result
    }

    /// Keeps only allow-listed tags; every other tag is removed.
    ///
    /// Opening, closing, and self-closing forms of allowed elements are copied
    /// verbatim. The text inside a disallowed element is kept, except for
    /// `script` and `style`, whose content is dropped up to the matching
    /// closing tag (or the end of input). A literal `<` in text is written as
    /// `&lt;` so it cannot combine with later output into a tag.
    ///
    /// NOTE: attributes of allowed tags are passed through unchanged, so
    /// event-handler attributes and `javascript:` URLs survive. This function
    /// alone is not a complete XSS sanitizer.
    pub fn filter(html: &str) -> String {
        let allowed_tags: HashSet<&str> = [
            "a", "abbr", "acronym", "address", "area", "b", "big", "blockquote", "br", "button",
            "caption", "center", "cite", "code", "col", "colgroup", "dd", "del", "dfn", "dir",
            "div", "dl", "dt", "em", "fieldset", "font", "form", "h1", "h2", "h3", "h4", "h5", "h6",
            "hr", "i", "img", "input", "ins", "kbd", "label", "legend", "li", "map", "menu", "ol",
            "optgroup", "option", "p", "pre", "q", "s", "samp", "select", "small", "span", "strike",
            "strong", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "tr",
            "tt", "u", "ul", "var",
        ]
        .iter()
        .cloned()
        .collect();

        let mut result = String::with_capacity(html.len());
        // Name of the raw-text element whose content is currently discarded.
        let mut skipping: Option<&'static str> = None;

        for segment in Self::segments(html) {
            match segment {
                HtmlSegment::Tag(raw) => {
                    let head = Self::tag_head(raw);
                    let name = head.name.to_ascii_lowercase();
                    if let Some(open) = skipping {
                        if head.closing && name == open {
                            skipping = None;
                        }
                        continue;
                    }
                    if allowed_tags.contains(name.as_str()) {
                        result.push_str(raw);
                    } else if !head.closing {
                        skipping = Self::RAW_TEXT_ELEMENTS
                            .iter()
                            .cloned()
                            .find(|raw_text| *raw_text == name);
                    }
                }
                HtmlSegment::Text(text) => {
                    if skipping.is_none() {
                        for c in text.chars() {
                            if c == '<' {
                                result.push_str("&lt;");
                            } else {
                                result.push(c);
                            }
                        }
                    }
                }
            }
        }
        result
    }
}
370
#[cfg(test)]
mod tests {
    use super::*;

    /// Removing an element drops the tag together with its content, whether
    /// it is unclosed, closed, or nested inside itself.
    #[test]
    fn test_remove_html_tag() {
        let cases = [
            ("pre<img src=\"xxx/dfdsfds/test.jpg\">", "img"),
            ("pre<div>content</div>", "div"),
            ("pre<div><div>nested</div></div>", "div"),
        ];
        for &(input, tag) in cases.iter() {
            assert_eq!(HtmlUtil::remove_html_tag(input, tag), "pre");
        }
    }

    /// Cleaning strips every tag and keeps the text, whitespace included.
    #[test]
    fn test_clean_html_tag() {
        let input = "pre<div class=\"test_div\">\r\n\t\tdfdsfdsfdsf\r\n</div><div class=\"test_div\">BBBB</div>";
        let cleaned = HtmlUtil::clean_html_tag(input);
        assert_eq!(cleaned, "pre\r\n\t\tdfdsfdsfdsf\r\nBBBB");
    }

    /// Unwrapping removes the tag markers but keeps what they enclosed.
    #[test]
    fn test_unwrap_html_tag() {
        let cases = [
            ("pre<div class=\"test_div\">abc</div>", "div", "preabc"),
            ("pre<div>outer<div>inner</div></div>", "div", "preouterinner"),
            ("pre<img src=\"test.jpg\"/>", "img", "pre"),
        ];
        for &(input, tag, expected) in cases.iter() {
            assert_eq!(HtmlUtil::unwrap_html_tag(input, tag), expected);
        }
    }

    /// A named attribute disappears from every tag that carries it.
    #[test]
    fn test_remove_html_attr() {
        let input = "<div class=\"test_div\"></div><span class=\"test_div\"></span>";
        let stripped = HtmlUtil::remove_html_attr(input, "class");
        assert_eq!(stripped, "<div></div><span></span>");
    }

    /// All attributes of the chosen element are removed.
    #[test]
    fn test_remove_all_html_attr() {
        let input = "<div class=\"test_div\" width=\"120\"></div>";
        let stripped = HtmlUtil::remove_all_html_attr(input, "div");
        assert_eq!(stripped, "<div></div>");
    }

    /// Only allow-listed markup survives filtering.
    #[test]
    fn test_filter() {
        let input = "<alert></alert><script>malicious()</script><p>safe</p>";
        let filtered = HtmlUtil::filter(input);
        assert_eq!(filtered, "<p>safe</p>");
    }
}