Skip to main content

crates_docs/tools/docs/
html.rs

1//! HTML processing utilities
2//!
3//! Provides HTML cleaning and conversion functions for documentation extraction.
4
/// Lowercase names of the tags whose content is discarded during HTML processing.
///
/// `clean_html` drops these tags together with everything between their
/// opening and closing tag, and `html_to_text` uses the same list to decide
/// which content to leave out of the plain-text output. Both functions
/// lowercase a tag name before looking it up here, so matching is
/// case-insensitive.
const SKIP_TAGS: [&str; 4] = ["script", "style", "noscript", "iframe"];
7
/// Named HTML entities recognised by `html_to_text`, as `(name, replacement)` pairs.
///
/// `name` is the text between `&` and `;` (for example `amp` for `&amp;`) and
/// `replacement` is the text written to the output in its place. Names are
/// compared exactly, so the lookup is case-sensitive.
const HTML_ENTITIES: [(&str, &str); 6] = [
    ("lt", "<"),
    ("gt", ">"),
    ("amp", "&"),
    ("quot", "\""),
    ("apos", "'"),
    ("nbsp", " "),
];
17
18/// Clean HTML by removing unwanted tags (script, style, noscript, iframe) and their content
19#[must_use]
20pub fn clean_html(html: &str) -> String {
21    let mut result = String::with_capacity(html.len());
22    let chars: Vec<char> = html.chars().collect();
23    let len = chars.len();
24    let mut i = 0;
25    let mut skip_depth = 0;
26
27    while i < len {
28        let c = chars[i];
29
30        if c == '<' {
31            let start = i;
32            let mut j = i + 1;
33
34            // Collect tag name
35            let tag_name = collect_tag_name(&chars, &mut j, len);
36            let tag_lower = tag_name.to_lowercase();
37            let pure_tag = tag_lower.trim_start_matches('/');
38
39            // Check if this is a skip tag
40            let is_skip_tag = SKIP_TAGS.contains(&pure_tag);
41
42            if is_skip_tag {
43                if tag_lower.starts_with('/') {
44                    // Closing tag
45                    if skip_depth > 0 {
46                        skip_depth -= 1;
47                    }
48                    skip_to_tag_end(&chars, &mut j, len);
49                    i = j;
50                    continue;
51                }
52
53                // Opening tag
54                skip_depth += 1;
55                skip_to_tag_end(&chars, &mut j, len);
56                i = j;
57                continue;
58            }
59
60            // Skip to end of tag
61            skip_to_tag_end(&chars, &mut j, len);
62
63            // Keep content if not inside a skip tag
64            if skip_depth == 0 {
65                result.extend(chars[start..j].iter().copied());
66            }
67
68            i = j;
69        } else {
70            if skip_depth == 0 {
71                result.push(c);
72            }
73            i += 1;
74        }
75    }
76
77    result
78}
79
80/// Convert HTML to plain text by removing all HTML tags
81#[must_use]
82pub fn html_to_text(html: &str) -> String {
83    let mut result = String::with_capacity(html.len());
84    let chars: Vec<char> = html.chars().collect();
85    let len = chars.len();
86    let mut i = 0;
87    let mut skip_content = false;
88
89    while i < len {
90        let c = chars[i];
91
92        match c {
93            '<' => {
94                let mut j = i + 1;
95                let tag_name = collect_tag_name(&chars, &mut j, len);
96                let tag_lower = tag_name.to_lowercase();
97                let is_closing = tag_lower.starts_with('/');
98                let pure_tag = tag_lower.trim_start_matches('/');
99
100                // Check if we should skip content
101                if !is_closing && !skip_content {
102                    skip_content = SKIP_TAGS.contains(&pure_tag);
103                } else if is_closing {
104                    skip_content = false;
105                }
106
107                skip_to_tag_end(&chars, &mut j, len);
108                i = j;
109
110                // Add space after block-level elements
111                if !skip_content {
112                    result.push(' ');
113                }
114            }
115            '&' => {
116                let mut j = i + 1;
117                let entity = collect_entity(&chars, &mut j, len);
118
119                // Look up entity replacement
120                let replacement = HTML_ENTITIES
121                    .iter()
122                    .find_map(
123                        |&(name, repl)| {
124                            if entity == name {
125                                Some(repl)
126                            } else {
127                                None
128                            }
129                        },
130                    )
131                    .unwrap_or("");
132
133                if !replacement.is_empty() {
134                    result.push_str(replacement);
135                }
136                i = j;
137            }
138            _ => {
139                if !skip_content {
140                    result.push(c);
141                }
142                i += 1;
143            }
144        }
145    }
146
147    clean_whitespace(&result)
148}
149
150/// Extract documentation from HTML by cleaning and converting to Markdown
151#[must_use]
152pub fn extract_documentation(html: &str) -> String {
153    let cleaned_html = clean_html(html);
154    html2md::parse_html(&cleaned_html)
155}
156
157/// Extract search results from HTML
158#[must_use]
159pub fn extract_search_results(html: &str, item_path: &str) -> String {
160    let cleaned_html = clean_html(html);
161    let markdown = html2md::parse_html(&cleaned_html);
162
163    if markdown.trim().is_empty() {
164        format!("未找到项目 '{item_path}' 的文档")
165    } else {
166        format!("## 搜索结果: {item_path}\n\n{markdown}")
167    }
168}
169
/// Collapse every run of whitespace in `text` into a single space and drop
/// leading and trailing whitespace.
fn clean_whitespace(text: &str) -> String {
    let mut collapsed = String::new();
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
174
/// Read a tag name starting at `chars[*j]`.
///
/// Characters are taken up to, but not including, the first whitespace
/// character or `>`, or until index `len` is reached. A leading `/` of a
/// closing tag is part of the returned name. On return `*j` is the index of
/// the character that stopped the scan.
fn collect_tag_name(chars: &[char], j: &mut usize, len: usize) -> String {
    let mut name = String::new();
    while *j < len {
        let ch = chars[*j];
        if ch == '>' || ch.is_whitespace() {
            break;
        }
        name.push(ch);
        *j += 1;
    }
    name
}
184
/// Advance `*j` to just past the next `>`.
///
/// If no `>` occurs before index `len`, `*j` ends up equal to `len`. A
/// cursor that is already at or beyond `len` is left unchanged.
fn skip_to_tag_end(chars: &[char], j: &mut usize, len: usize) {
    while *j < len {
        let closes_tag = chars[*j] == '>';
        // Step over the character either way; stop once it was the '>'.
        *j += 1;
        if closes_tag {
            return;
        }
    }
}
194
/// Collect the name of an HTML character reference.
///
/// `*j` is expected to be the index of the character right after the `&`.
/// A reference is accepted only when it is well formed: a non-empty run of
/// ASCII letters, digits or `#` (at most 32 characters) immediately followed
/// by `;`. In that case the name (without `&` and `;`) is returned and `*j`
/// is moved just past the `;`.
///
/// Otherwise an empty string is returned and `*j` is left untouched, so a
/// bare `&` (as in `Tom & Jerry`) never causes the text that follows it to
/// be consumed while searching for a distant `;`.
fn collect_entity(chars: &[char], j: &mut usize, len: usize) -> String {
    // Upper bound on the name length; keeps the scan short and local.
    const MAX_ENTITY_NAME_LEN: usize = 32;

    let mut entity = String::new();
    // Scan with a scratch cursor so `*j` only moves on success.
    let mut end = *j;
    while end < len && entity.len() < MAX_ENTITY_NAME_LEN {
        let ch = chars[end];
        if !(ch.is_ascii_alphanumeric() || ch == '#') {
            break;
        }
        entity.push(ch);
        end += 1;
    }

    let terminated = end < len && chars[end] == ';';
    if entity.is_empty() || !terminated {
        return String::new();
    }

    *j = end + 1; // Skip ';'
    entity
}
207
/// Unit tests for the HTML cleaning and conversion helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// `script` elements and their content are removed; other content stays.
    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    /// `style` elements and their content are removed; other content stays.
    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    /// No angle brackets survive the conversion, but the text does.
    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// Named entities are decoded and the surrounding text is left intact.
    #[test]
    fn test_html_to_text_handles_entities() {
        // Exact comparisons: a decoder that drops the entity or the text
        // after it must fail this test.
        assert_eq!(html_to_text("<p>Tom &amp; Jerry</p>"), "Tom & Jerry");
        assert_eq!(html_to_text("1 &lt; 2 &gt; 0"), "1 < 2 > 0");
    }

    /// Whitespace runs collapse to one space; leading/trailing runs vanish.
    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    /// Heading and paragraph text both appear in the extracted Markdown.
    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    /// A non-empty page yields the results heading, the item path and the content.
    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("搜索结果"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    /// An empty page yields the "not found" message naming the item path.
    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("未找到项目"));
        assert!(result.contains("nonexistent"));
    }
}