crates_docs/tools/docs/
/// Lower-case element names that the HTML routines in this file treat as
/// "skip" tags: `clean_html` and `html_to_text` look tag names up in this list.
html.rs1const SKIP_TAGS: [&str; 4] = ["script", "style", "noscript", "iframe"];
7
/// Named character references as `(name, replacement)` pairs, where `name` is
/// the text between `&` and `;`. `html_to_text` searches this table to decode
/// entities.
8const HTML_ENTITIES: [(&str, &str); 6] = [
10 ("lt", "<"),
11 ("gt", ">"),
12 ("amp", "&"),
13 ("quot", "\""),
14 ("apos", "'"),
15 ("nbsp", " "),
16];
17
18#[must_use]
20pub fn clean_html(html: &str) -> String {
21 let mut result = String::with_capacity(html.len());
22 let chars: Vec<char> = html.chars().collect();
23 let len = chars.len();
24 let mut i = 0;
25 let mut skip_depth = 0;
26
27 while i < len {
28 let c = chars[i];
29
30 if c == '<' {
31 let start = i;
32 let mut j = i + 1;
33
34 let tag_name = collect_tag_name(&chars, &mut j, len);
36 let tag_lower = tag_name.to_lowercase();
37 let pure_tag = tag_lower.trim_start_matches('/');
38
39 let is_skip_tag = SKIP_TAGS.contains(&pure_tag);
41
42 if is_skip_tag {
43 if tag_lower.starts_with('/') {
44 if skip_depth > 0 {
46 skip_depth -= 1;
47 }
48 skip_to_tag_end(&chars, &mut j, len);
49 i = j;
50 continue;
51 }
52
53 skip_depth += 1;
55 skip_to_tag_end(&chars, &mut j, len);
56 i = j;
57 continue;
58 }
59
60 skip_to_tag_end(&chars, &mut j, len);
62
63 if skip_depth == 0 {
65 result.extend(chars[start..j].iter().copied());
66 }
67
68 i = j;
69 } else {
70 if skip_depth == 0 {
71 result.push(c);
72 }
73 i += 1;
74 }
75 }
76
77 result
78}
79
80#[must_use]
82pub fn html_to_text(html: &str) -> String {
83 let mut result = String::with_capacity(html.len());
84 let chars: Vec<char> = html.chars().collect();
85 let len = chars.len();
86 let mut i = 0;
87 let mut skip_content = false;
88
89 while i < len {
90 let c = chars[i];
91
92 match c {
93 '<' => {
94 let mut j = i + 1;
95 let tag_name = collect_tag_name(&chars, &mut j, len);
96 let tag_lower = tag_name.to_lowercase();
97 let is_closing = tag_lower.starts_with('/');
98 let pure_tag = tag_lower.trim_start_matches('/');
99
100 if !is_closing && !skip_content {
102 skip_content = SKIP_TAGS.contains(&pure_tag);
103 } else if is_closing {
104 skip_content = false;
105 }
106
107 skip_to_tag_end(&chars, &mut j, len);
108 i = j;
109
110 if !skip_content {
112 result.push(' ');
113 }
114 }
115 '&' => {
116 let mut j = i + 1;
117 let entity = collect_entity(&chars, &mut j, len);
118
119 let replacement = HTML_ENTITIES
121 .iter()
122 .find_map(
123 |&(name, repl)| {
124 if entity == name {
125 Some(repl)
126 } else {
127 None
128 }
129 },
130 )
131 .unwrap_or("");
132
133 if !replacement.is_empty() {
134 result.push_str(replacement);
135 }
136 i = j;
137 }
138 _ => {
139 if !skip_content {
140 result.push(c);
141 }
142 i += 1;
143 }
144 }
145 }
146
147 clean_whitespace(&result)
148}
149
150#[must_use]
152pub fn extract_documentation(html: &str) -> String {
153 let cleaned_html = clean_html(html);
154 html2md::parse_html(&cleaned_html)
155}
156
157#[must_use]
159pub fn extract_search_results(html: &str, item_path: &str) -> String {
160 let cleaned_html = clean_html(html);
161 let markdown = html2md::parse_html(&cleaned_html);
162
163 if markdown.trim().is_empty() {
164 format!("未找到项目 '{item_path}' 的文档")
165 } else {
166 format!("## 搜索结果: {item_path}\n\n{markdown}")
167 }
168}
169
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
fn clean_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }

    collapsed
}
174
/// Reads a tag name from `chars`, starting at index `*j`.
///
/// Characters are collected until a `>`, a whitespace character, or index
/// `len` is reached. On return `*j` points at that delimiter (or equals
/// `len`). A leading `/` of a closing tag is part of the returned name.
fn collect_tag_name(chars: &[char], j: &mut usize, len: usize) -> String {
    let mut name = String::new();

    loop {
        if *j >= len {
            break;
        }
        let ch = chars[*j];
        if ch == '>' || ch.is_whitespace() {
            break;
        }
        name.push(ch);
        *j += 1;
    }

    name
}
184
/// Advances `*j` just past the next `>` found at or after `*j`.
///
/// If no `>` occurs before index `len`, `*j` ends up equal to `len`. Nothing
/// happens when `*j` is already at or beyond `len`.
fn skip_to_tag_end(chars: &[char], j: &mut usize, len: usize) {
    while *j < len {
        let closes_tag = chars[*j] == '>';
        // Step over the current character; the '>' itself is consumed too.
        *j += 1;
        if closes_tag {
            return;
        }
    }
}
194
/// Reads the name of a character reference from `chars`, starting at `*j`
/// (the index just after the `&`).
///
/// Only characters that can occur in a reference name are collected: ASCII
/// letters, ASCII digits and `#`. The scan stops at the first other character
/// or at index `len`. A `;` found at that position is consumed; any other
/// character is left for the caller.
///
/// Stopping early matters for a bare `&` in running text: scanning on to the
/// next `;` anywhere in the input would return arbitrary following text as
/// the "entity" and move `*j` past it.
fn collect_entity(chars: &[char], j: &mut usize, len: usize) -> String {
    let mut entity = String::new();

    while *j < len && (chars[*j].is_ascii_alphanumeric() || chars[*j] == '#') {
        entity.push(chars[*j]);
        *j += 1;
    }

    // Consume the terminator only when it is really there.
    if *j < len && chars[*j] == ';' {
        *j += 1;
    }

    entity
}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Script elements and their content must not survive cleaning.
    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    /// Style elements and their content must not survive cleaning.
    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    /// Plain-text conversion drops all markup but keeps the text.
    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// Named entities are decoded and the surrounding text is preserved.
    ///
    /// The expected output is pinned exactly: a disjunction of loose
    /// `contains` checks would also pass if decoding were broken.
    #[test]
    fn test_html_to_text_handles_entities() {
        assert_eq!(html_to_text("<p>Tom &amp; Jerry</p>"), "Tom & Jerry");
        assert_eq!(html_to_text("1 &lt; 2 &gt; 0"), "1 < 2 > 0");
    }

    /// Whitespace runs collapse to one space; the ends are trimmed.
    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    /// Headings and paragraphs end up in the generated Markdown.
    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    /// A non-empty page is returned under the search-results heading.
    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("搜索结果"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    /// An empty page yields the "not found" message naming the item.
    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("未找到项目"));
        assert!(result.contains("nonexistent"));
    }
}