1use regex::Regex;
7use scraper::{Html, Selector};
8use std::sync::LazyLock;
9
10const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];
12
13const NAV_TAGS: &[&str] = &["nav", "header", "footer", "aside"];
15
16const UI_TAGS: &[&str] = &["button", "summary"];
20
21static LINK_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<link[^>]*>").unwrap());
23
24static META_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<meta[^>]*>").unwrap());
25
26static COPY_PATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"Copy item path").unwrap());
28
29static ANCHOR_LINK_REGEX: LazyLock<Regex> =
31 LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
32
33static SOURCE_LINK_REGEX: LazyLock<Regex> =
35 LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").unwrap());
36
37static RELATIVE_LINK_REGEX: LazyLock<Regex> =
40 LazyLock::new(|| Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").unwrap());
41
42static SECTION_MARKER_REGEX: LazyLock<Regex> =
44 LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
45
46#[must_use]
51pub fn clean_html(html: &str) -> String {
52 let document = Html::parse_document(html);
53 remove_unwanted_elements(&document, html)
54}
55
56fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
58 let mut result = original_html.to_string();
59
60 for tag in SKIP_TAGS {
62 if let Ok(selector) = Selector::parse(tag) {
63 let elements: Vec<_> = document.select(&selector).collect();
64 for element in elements {
65 let element_html = element.html();
66 result = result.replace(&element_html, "");
67 }
68 }
69 }
70
71 let mut updated_doc = Html::parse_document(&result);
73
74 for tag in NAV_TAGS {
76 if let Ok(selector) = Selector::parse(tag) {
77 let elements: Vec<_> = updated_doc.select(&selector).collect();
78 for element in elements {
79 let element_html = element.html();
80 result = result.replace(&element_html, "");
81 }
82 }
83 }
84
85 updated_doc = Html::parse_document(&result);
87
88 for tag in UI_TAGS {
92 if let Ok(selector) = Selector::parse(tag) {
93 let elements: Vec<_> = updated_doc.select(&selector).collect();
94 for element in elements {
95 let element_html = element.html();
96 if tag == &"summary" {
97 let text_content: String = element.text().collect();
99 result = result.replace(&element_html, &text_content);
100 } else {
101 result = result.replace(&element_html, "");
103 }
104 }
105 }
106 }
107
108 result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
110 result = META_TAG_REGEX.replace_all(&result, "").to_string();
111
112 result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
114 result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();
115
116 result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
118 result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
119
120 result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
122
123 result
124}
125
126#[must_use]
130pub fn html_to_text(html: &str) -> String {
131 let document = Html::parse_document(html);
132
133 let mut text_parts = Vec::new();
135
136 let body_selector = Selector::parse("body").unwrap();
138
139 if let Some(body) = document.select(&body_selector).next() {
140 extract_text_excluding_skip_tags(&body, &mut text_parts);
141 } else {
142 let all_selector = Selector::parse("*").unwrap();
144 if let Some(root) = document.select(&all_selector).next() {
145 extract_text_excluding_skip_tags(&root, &mut text_parts);
146 }
147 }
148
149 clean_whitespace(&text_parts.join(" "))
150}
151
152fn extract_text_excluding_skip_tags(
154 element: &scraper::element_ref::ElementRef,
155 text_parts: &mut Vec<String>,
156) {
157 let tag_name = element.value().name().to_lowercase();
158
159 if SKIP_TAGS.contains(&tag_name.as_str()) {
161 return;
162 }
163
164 for text in element.text() {
166 let trimmed = text.trim();
167 if !trimmed.is_empty() {
168 text_parts.push(trimmed.to_string());
169 }
170 }
171}
172
/// Returns `true` when `tag` is exactly one of the block-level HTML element
/// names listed below.
///
/// The comparison is case-sensitive: `"div"` matches, `"DIV"` does not.
// Not referenced by the non-test code visible in this file, hence the lint allowance.
#[allow(dead_code)]
fn is_block_element(tag: &str) -> bool {
    matches!(
        tag,
        "address"
            | "article"
            | "aside"
            | "blockquote"
            | "body"
            | "canvas"
            | "dd"
            | "div"
            | "dl"
            | "dt"
            | "fieldset"
            | "figcaption"
            | "figure"
            | "footer"
            | "form"
            | "h1"
            | "h2"
            | "h3"
            | "h4"
            | "h5"
            | "h6"
            | "head"
            | "header"
            | "hgroup"
            | "hr"
            | "html"
            | "li"
            | "main"
            | "nav"
            | "noscript"
            | "ol"
            | "p"
            | "pre"
            | "section"
            | "table"
            | "tbody"
            | "td"
            | "tfoot"
            | "th"
            | "thead"
            | "tr"
            | "ul"
            | "video"
    )
}
223
224#[must_use]
229pub fn extract_documentation(html: &str) -> String {
230 let main_content = extract_main_content(html);
232 let cleaned_html = clean_html(&main_content);
233 let markdown = html2md::parse_html(&cleaned_html);
234
235 clean_markdown(&markdown)
237}
238
239fn clean_markdown(markdown: &str) -> String {
241 let result = markdown.to_string();
242
243 let result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
245
246 let result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
248
249 let result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
251
252 let result = result.replace("\n\n\n", "\n\n");
254
255 result.trim().to_string()
256}
257
258fn extract_main_content(html: &str) -> String {
263 let document = Html::parse_document(html);
264
265 if let Ok(selector) = Selector::parse("#main-content") {
267 if let Some(main_section) = document.select(&selector).next() {
268 return main_section.html();
269 }
270 }
271
272 if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
274 if let Some(wrapper) = document.select(&selector).next() {
275 return wrapper.html();
276 }
277 }
278
279 html.to_string()
281}
282
283#[must_use]
285pub fn extract_search_results(html: &str, item_path: &str) -> String {
286 let main_content = extract_main_content(html);
287 let cleaned_html = clean_html(&main_content);
288 let markdown = html2md::parse_html(&cleaned_html);
289 let cleaned_markdown = clean_markdown(&markdown);
290
291 if cleaned_markdown.trim().is_empty() {
292 format!("未找到项目 '{item_path}' 的文档")
293 } else {
294 format!("## 搜索结果: {item_path}\n\n{cleaned_markdown}")
295 }
296}
297
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
fn clean_whitespace(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
302
#[cfg(test)]
mod tests {
    use super::*;

    // `<script>` elements and their content are removed; body text survives.
    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    // `<style>` elements and their content are removed; body text survives.
    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    // Plain-text extraction keeps the text and drops all markup.
    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    // Character references are decoded and a bare ampersand is kept as-is.
    // Exact equality is asserted so a lost or mangled `&` fails the test.
    #[test]
    fn test_html_to_text_handles_entities() {
        assert_eq!(html_to_text("<p>Tom &amp; Jerry</p>"), "Tom & Jerry");
        assert_eq!(html_to_text("<p>Tom & Jerry</p>"), "Tom & Jerry");
    }

    // Whitespace runs collapse to one space; the ends are trimmed.
    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    // Headings and paragraphs make it through the full HTML -> Markdown pipeline.
    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    // Non-empty content is returned under the "search results" heading.
    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("搜索结果"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    // Empty content yields the "not found" message naming the item.
    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("未找到项目"));
        assert!(result.contains("nonexistent"));
    }

    // `<link>` tags in the head are stripped.
    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    // `<meta>` tags in the head are stripped.
    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }

    // Relative `.html` links match; in-page anchors and external URLs do not.
    #[test]
    fn test_relative_link_regex() {
        let re = &RELATIVE_LINK_REGEX;

        assert!(re.is_match("[module](module/index.html)"));
        assert!(re.is_match("[struct](struct.Struct.html)"));

        assert!(!re.is_match("[Section](#section)"));
        assert!(
            !re.is_match("[External](https://example.com)"),
            "Should not match external URLs"
        );
    }

    // Ordinary Markdown content, external links and anchors pass through untouched.
    #[test]
    fn test_clean_markdown_preserves_content() {
        let markdown = r"# Dioxus

## At a glance

Dioxus is a framework for building cross-platform apps.

## Quick start

To get started with Dioxus:

```
cargo install dioxus-cli
```

[External Link](https://dioxuslabs.com)

[Anchor](#quick-start)
";
        let cleaned = clean_markdown(markdown);

        assert!(cleaned.contains("Dioxus is a framework"));
        assert!(cleaned.contains("At a glance"));
        assert!(cleaned.contains("Quick start"));
        assert!(cleaned.contains("cargo install"));

        assert!(
            cleaned.contains("[External Link](https://dioxuslabs.com)"),
            "Should preserve external links"
        );
        assert!(
            cleaned.contains("[Anchor](#quick-start)"),
            "Should preserve anchor links"
        );
    }
}