next_web_utils/html/
html_util.rs

1use std::collections::HashSet;
2
/// Namespace type for stateless HTML string helpers.
///
/// The struct carries no data; every helper is an associated function that
/// takes the HTML text as a `&str` and returns a new `String`.
#[derive(Debug, Clone, Copy, Default)]
pub struct HtmlUtil;
4
5impl HtmlUtil {
6    /// 清除指定HTML标签和被标签包围的内容
7    pub fn remove_html_tag(html: &str, tag: &str) -> String {
8        let mut result = String::new();
9        let mut in_tag = false;
10        let mut buffer = String::new();
11        let tag_start = format!("<{}", tag);
12        let tag_end = format!("</{}>", tag);
13        let mut tag_depth = 0;
14
15        let mut chars = html.chars().peekable();
16        while let Some(c) = chars.next() {
17            if c == '<' {
18                if let Some(next) = chars.peek() {
19                    if *next == '/' {
20                        // 检查是否是结束标签
21                        let mut potential_end = String::from("</");
22                        let mut temp_chars = chars.clone();
23                        temp_chars.next(); // 跳过 '/'
24                        
25                        for _ in 0..tag.len() {
26                            if let Some(tc) = temp_chars.next() {
27                                potential_end.push(tc);
28                            }
29                        }
30                        
31                        if potential_end == tag_end {
32                            tag_depth -= 1;
33                            if tag_depth == 0 {
34                                in_tag = false;
35                                // 跳过整个结束标签
36                                for _ in 0..(tag.len() + 3) { // </tag>
37                                    chars.next();
38                                }
39                                buffer.clear();
40                                continue;
41                            }
42                        }
43                    } else {
44                        // 检查是否是开始标签
45                        let mut potential_start = String::from("<");
46                        let mut temp_chars = chars.clone();
47                        
48                        for _ in 0..tag.len() {
49                            if let Some(tc) = temp_chars.next() {
50                                potential_start.push(tc);
51                            }
52                        }
53                        
54                        if potential_start == tag_start {
55                            // 检查是否是自闭合标签
56                            let mut is_self_closing = false;
57                            let mut temp_chars = chars.clone();
58                            let mut tag_content = String::new();
59                            
60                            while let Some(tc) = temp_chars.next() {
61                                tag_content.push(tc);
62                                if tc == '>' {
63                                    break;
64                                }
65                            }
66                            
67                            if tag_content.ends_with("/>") {
68                                is_self_closing = true;
69                            }
70                            
71                            if is_self_closing {
72                                // 跳过自闭合标签
73                                for _ in 0..tag_content.len() {
74                                    chars.next();
75                                }
76                                continue;
77                            } else {
78                                tag_depth += 1;
79                                if tag_depth == 1 {
80                                    in_tag = true;
81                                    buffer.clear();
82                                }
83                            }
84                        }
85                    }
86                }
87            }
88
89            if !in_tag {
90                result.push(c);
91            } else {
92                buffer.push(c);
93            }
94        }
95
96        result
97    }
98
99    /// 清除所有HTML标签,但是保留标签内的内容
100    pub fn clean_html_tag(html: &str) -> String {
101        let mut result = String::new();
102        let mut in_tag = false;
103
104        for c in html.chars() {
105            if c == '<' {
106                in_tag = true;
107                continue;
108            }
109            if c == '>' {
110                in_tag = false;
111                continue;
112            }
113            if !in_tag {
114                result.push(c);
115            }
116        }
117
118        result
119    }
120
121    pub fn unwrap_html_tag(html: &str, tag: &str) -> String {
122        let mut result = String::new();
123        let mut buffer = String::new();
124        let mut in_tag = false;
125        let mut in_target_tag = false;
126        let mut tag_depth = 0;
127        
128        let open_tag = format!("<{}", tag);
129        let close_tag = format!("</{}>", tag);
130    
131        let mut chars = html.chars().peekable();
132        while let Some(c) = chars.next() {
133            if c == '<' {
134                if in_tag {
135                    buffer.push(c);
136                    continue;
137                }
138                
139                in_tag = true;
140                buffer.clear();
141                buffer.push(c);
142                continue;
143            }
144            
145            if in_tag {
146                buffer.push(c);
147                
148                if c == '>' {
149                    in_tag = false;
150                    
151                    if buffer.starts_with(&open_tag) {
152                        if buffer.ends_with("/>") {
153                            // 自闭合标签,直接跳过
154                            continue;
155                        } else {
156                            // 开始标签
157                            tag_depth += 1;
158                            in_target_tag = true;
159                        }
160                    } else if buffer.starts_with(&close_tag) {
161                        // 结束标签
162                        tag_depth -= 1;
163                        if tag_depth == 0 {
164                            in_target_tag = false;
165                        }
166                    } else if in_target_tag {
167                        // 在目标标签内的其他标签,保留内容
168                        result.push_str(&buffer);
169                    }
170                    
171                    buffer.clear();
172                    continue;
173                }
174            } else {
175                if in_target_tag || !in_tag {
176                    result.push(c);
177                }
178            }
179        }
180    
181        result
182    }
183
184    
185    /// 去除HTML标签中的指定属性
186    pub fn remove_html_attr(html: &str, attr: &str) -> String {
187        let mut result = String::new();
188        let mut in_tag = false;
189        let mut in_attr = false;
190        let mut current_attr = String::new();
191        let mut skip_until = None;
192
193        for c in html.chars() {
194            if skip_until == Some(c) {
195                skip_until = None;
196                continue;
197            }
198
199            if c == '<' && !in_tag {
200                in_tag = true;
201                result.push(c);
202                continue;
203            }
204
205            if c == '>' && in_tag {
206                in_tag = false;
207                result.push(c);
208                continue;
209            }
210
211            if in_tag {
212                if c == ' ' && !in_attr {
213                    result.push(c);
214                    continue;
215                }
216
217                if c == '=' && in_attr {
218                    if current_attr == attr {
219                        // 跳过属性值
220                        skip_until = Some(' ');
221                        in_attr = false;
222                        current_attr.clear();
223                        continue;
224                    } else {
225                        result.push_str(&current_attr);
226                        result.push('=');
227                        current_attr.clear();
228                        in_attr = false;
229                        continue;
230                    }
231                }
232
233                if in_attr {
234                    current_attr.push(c);
235                } else {
236                    if c == ' ' {
237                        if !current_attr.is_empty() {
238                            if current_attr != attr {
239                                result.push_str(&current_attr);
240                            }
241                            current_attr.clear();
242                        }
243                        result.push(' ');
244                    } else {
245                        in_attr = true;
246                        current_attr.push(c);
247                    }
248                }
249            } else {
250                result.push(c);
251            }
252        }
253
254        if !current_attr.is_empty() && current_attr != attr {
255            result.push_str(&current_attr);
256        }
257
258        result
259    }
260
261    /// 去除指定标签的所有属性
262    pub fn remove_all_html_attr(html: &str, tag: &str) -> String {
263        let mut result = String::new();
264        let tag_start = format!("<{}", tag);
265        let mut in_target_tag = false;
266        let mut in_tag = false;
267        let mut in_attr = false;
268
269        let mut chars = html.chars().peekable();
270        while let Some(c) = chars.next() {
271            if c == '<' {
272                // 检查是否是目标标签
273                let mut is_target = false;
274                let mut temp_chars = chars.clone();
275                let mut potential_tag = String::from("<");
276                
277                for _ in 0..tag.len() {
278                    if let Some(tc) = temp_chars.next() {
279                        potential_tag.push(tc);
280                    }
281                }
282                
283                if potential_tag == tag_start {
284                    is_target = true;
285                }
286                
287                in_tag = true;
288                in_target_tag = is_target;
289                result.push(c);
290                
291                if is_target {
292                    // 跳过标签名
293                    for _ in 0..tag.len() {
294                        if let Some(tc) = chars.next() {
295                            result.push(tc);
296                        }
297                    }
298                    
299                    // 跳过所有属性直到 '>'
300                    while let Some(tc) = chars.peek() {
301                        if *tc == '>' {
302                            break;
303                        }
304                        chars.next();
305                    }
306                }
307                continue;
308            }
309
310            if c == '>' && in_tag {
311                in_tag = false;
312                in_target_tag = false;
313                result.push(c);
314                continue;
315            }
316
317            if !in_target_tag {
318                result.push(c);
319            }
320        }
321
322        result
323    }
324
325    /// 过滤HTML文本,防止XSS攻击
326    pub fn filter(html: &str) -> String {
327        let allowed_tags: HashSet<&str> = [
328            "a", "abbr", "acronym", "address", "area", "b", "big", "blockquote", "br", "button",
329            "caption", "center", "cite", "code", "col", "colgroup", "dd", "del", "dfn", "dir",
330            "div", "dl", "dt", "em", "fieldset", "font", "form", "h1", "h2", "h3", "h4", "h5", "h6",
331            "hr", "i", "img", "input", "ins", "kbd", "label", "legend", "li", "map", "menu", "ol",
332            "optgroup", "option", "p", "pre", "q", "s", "samp", "select", "small", "span", "strike",
333            "strong", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "tr",
334            "tt", "u", "ul", "var",
335        ].iter().cloned().collect();
336
337        let mut result = String::new();
338        let mut current_tag = String::new();
339        let mut in_tag = false;
340
341        for c in html.chars() {
342            if c == '<' {
343                in_tag = true;
344                current_tag.clear();
345                continue;
346            }
347
348            if c == '>' && in_tag {
349                in_tag = false;
350                let tag_name = current_tag.split_whitespace().next().unwrap_or("");
351                if allowed_tags.contains(tag_name) {
352                    result.push('<');
353                    result.push_str(&current_tag);
354                    result.push('>');
355                }
356                current_tag.clear();
357                continue;
358            }
359
360            if in_tag {
361                current_tag.push(c);
362            } else {
363                result.push(c);
364            }
365        }
366
367        result
368    }
369}
370
#[cfg(test)]
mod tests {
    use super::*;

    /// Removing a tag leaves only the text in front of it, for a void tag,
    /// a plain element and a nested element alike.
    #[test]
    fn test_remove_html_tag() {
        let cases = [
            ("pre<img src=\"xxx/dfdsfds/test.jpg\">", "img"),
            ("pre<div>content</div>", "div"),
            ("pre<div><div>nested</div></div>", "div"),
        ];
        for (input, tag) in cases.iter() {
            assert_eq!(HtmlUtil::remove_html_tag(input, tag), "pre");
        }
    }

    /// Stripping all tags keeps the text, including its whitespace.
    #[test]
    fn test_clean_html_tag() {
        let input = "pre<div class=\"test_div\">\r\n\t\tdfdsfdsfdsf\r\n</div><div class=\"test_div\">BBBB</div>";
        let expected = "pre\r\n\t\tdfdsfdsfdsf\r\nBBBB";
        assert_eq!(HtmlUtil::clean_html_tag(input), expected);
    }

    /// Unwrapping drops the tag markup but keeps the enclosed content.
    #[test]
    fn test_unwrap_html_tag() {
        let cases = [
            ("pre<div class=\"test_div\">abc</div>", "div", "preabc"),
            ("pre<div>outer<div>inner</div></div>", "div", "preouterinner"),
            ("pre<img src=\"test.jpg\"/>", "img", "pre"),
        ];
        for (input, tag, expected) in cases.iter() {
            assert_eq!(HtmlUtil::unwrap_html_tag(input, tag), *expected);
        }
    }

    /// A named attribute is removed from every tag that carries it.
    #[test]
    fn test_remove_html_attr() {
        let input = "<div class=\"test_div\"></div><span class=\"test_div\"></span>";
        let stripped = HtmlUtil::remove_html_attr(input, "class");
        assert_eq!(stripped, "<div></div><span></span>");
    }

    /// All attributes of the named tag are removed.
    #[test]
    fn test_remove_all_html_attr() {
        let input = "<div class=\"test_div\" width=\"120\"></div>";
        let stripped = HtmlUtil::remove_all_html_attr(input, "div");
        assert_eq!(stripped, "<div></div>");
    }

    /// Unknown tags and script content are filtered out; allowed tags survive.
    #[test]
    fn test_filter() {
        let input = "<alert></alert><script>malicious()</script><p>safe</p>";
        let filtered = HtmlUtil::filter(input);
        assert_eq!(filtered, "<p>safe</p>");
    }
}