// crates_docs/tools/docs/html.rs
1//! HTML processing utilities
2//!
3//! Provides HTML cleaning and conversion functions for documentation extraction.
4//! Uses the `scraper` crate for robust HTML5 parsing.
5
6use regex::Regex;
7use scraper::{Html, Selector};
8use std::sync::LazyLock;
9
/// Tags whose content should be completely removed during HTML cleaning
const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];

/// Tags that represent navigation/structure elements to remove
const NAV_TAGS: &[&str] = &["nav", "header", "footer", "aside"];

/// UI elements that don't contribute to documentation content
/// Note: We don't include "details" here because docs.rs uses <details class="toggle top-doc">
/// to wrap the main documentation content. We only remove "summary" tags but keep their content.
const UI_TAGS: &[&str] = &["button", "summary"];

/// Regex patterns for self-closing/void tags to remove
static LINK_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<link[^>]*>").unwrap());

static META_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<meta[^>]*>").unwrap());

/// Regex to remove "Copy item path" and similar UI text
static COPY_PATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"Copy item path").unwrap());

/// Regex to remove anchor links like [§](#xxx)
static ANCHOR_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());

/// Regex to remove relative source links like [Source](../src/...)
static SOURCE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").unwrap());

/// Regex to remove relative documentation links like [de](de/index.html) or [forward\_to\_deserialize\_any](macro.xxx.html)
/// Matches: [text](relative_path.html) where `relative_path` starts with letter and ends with .html
///
/// NOTE(review): the pattern only requires that the target start with a letter,
/// so absolute URLs ending in `.html` (e.g. `[x](https://site/page.html)`) also
/// match and are removed — confirm that is intended. The unit test only checks
/// a non-`.html` external URL.
static RELATIVE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").unwrap());

/// Regex to clean up section markers like [§](#xxx) that may remain in headings
///
/// NOTE(review): this pattern is byte-identical to `ANCHOR_LINK_REGEX`; the two
/// statics could be unified, but both are kept so existing call sites compile.
static SECTION_MARKER_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
45
46/// Clean HTML by removing unwanted tags and their content
47///
48/// Uses the `scraper` crate for robust HTML5 parsing, which handles
49/// malformed HTML better than manual parsing.
50#[must_use]
51pub fn clean_html(html: &str) -> String {
52    let document = Html::parse_document(html);
53    remove_unwanted_elements(&document, html)
54}
55
56/// Remove unwanted elements from HTML using scraper for parsing
57fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
58    let mut result = original_html.to_string();
59
60    // Remove skip tags with their content using scraper
61    for tag in SKIP_TAGS {
62        if let Ok(selector) = Selector::parse(tag) {
63            let elements: Vec<_> = document.select(&selector).collect();
64            for element in elements {
65                let element_html = element.html();
66                result = result.replace(&element_html, "");
67            }
68        }
69    }
70
71    // Re-parse after removing skip tags
72    let mut updated_doc = Html::parse_document(&result);
73
74    // Remove navigation/structure elements
75    for tag in NAV_TAGS {
76        if let Ok(selector) = Selector::parse(tag) {
77            let elements: Vec<_> = updated_doc.select(&selector).collect();
78            for element in elements {
79                let element_html = element.html();
80                result = result.replace(&element_html, "");
81            }
82        }
83    }
84
85    // Re-parse after removing nav tags
86    updated_doc = Html::parse_document(&result);
87
88    // Remove UI elements (buttons, summary)
89    // For buttons: remove completely
90    // For summary: remove the tag but keep the text content
91    for tag in UI_TAGS {
92        if let Ok(selector) = Selector::parse(tag) {
93            let elements: Vec<_> = updated_doc.select(&selector).collect();
94            for element in elements {
95                let element_html = element.html();
96                if tag == &"summary" {
97                    // For summary tags, extract and keep the text content
98                    let text_content: String = element.text().collect();
99                    result = result.replace(&element_html, &text_content);
100                } else {
101                    // For other UI tags (like button), remove completely
102                    result = result.replace(&element_html, "");
103                }
104            }
105        }
106    }
107
108    // Use regex to remove self-closing tags (link, meta)
109    result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
110    result = META_TAG_REGEX.replace_all(&result, "").to_string();
111
112    // Remove UI text and anchor links
113    result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
114    result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();
115
116    // Remove relative source and documentation links
117    result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
118    result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
119
120    // Clean up any remaining section markers
121    result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
122
123    result
124}
125
126/// Convert HTML to plain text by removing all HTML tags
127///
128/// Uses the `scraper` crate for robust HTML5 parsing.
129#[must_use]
130pub fn html_to_text(html: &str) -> String {
131    let document = Html::parse_document(html);
132
133    // Build selectors for skip tags
134    let mut text_parts = Vec::new();
135
136    // Select the root and extract text, handling skip tags
137    let body_selector = Selector::parse("body").unwrap();
138
139    if let Some(body) = document.select(&body_selector).next() {
140        extract_text_excluding_skip_tags(&body, &mut text_parts);
141    } else {
142        // No body tag, extract from entire document
143        let all_selector = Selector::parse("*").unwrap();
144        if let Some(root) = document.select(&all_selector).next() {
145            extract_text_excluding_skip_tags(&root, &mut text_parts);
146        }
147    }
148
149    clean_whitespace(&text_parts.join(" "))
150}
151
152/// Extract text from an element, excluding content in skip tags
153fn extract_text_excluding_skip_tags(
154    element: &scraper::element_ref::ElementRef,
155    text_parts: &mut Vec<String>,
156) {
157    let tag_name = element.value().name().to_lowercase();
158
159    // Skip unwanted tags entirely
160    if SKIP_TAGS.contains(&tag_name.as_str()) {
161        return;
162    }
163
164    // Get direct text content
165    for text in element.text() {
166        let trimmed = text.trim();
167        if !trimmed.is_empty() {
168            text_parts.push(trimmed.to_string());
169        }
170    }
171}
172
/// Check if a (lowercase) tag name denotes a block-level element.
///
/// Currently unused; kept for future block-aware text extraction.
#[allow(dead_code)]
fn is_block_element(tag: &str) -> bool {
    matches!(
        tag,
        "address" | "article" | "aside" | "blockquote" | "body" | "canvas"
            | "dd" | "div" | "dl" | "dt" | "fieldset" | "figcaption"
            | "figure" | "footer" | "form" | "h1" | "h2" | "h3" | "h4"
            | "h5" | "h6" | "head" | "header" | "hgroup" | "hr" | "html"
            | "li" | "main" | "nav" | "noscript" | "ol" | "p" | "pre"
            | "section" | "table" | "tbody" | "td" | "tfoot" | "th"
            | "thead" | "tr" | "ul" | "video"
    )
}
223
224/// Extract documentation from HTML by cleaning and converting to Markdown
225///
226/// For docs.rs pages, extracts only the main content area to avoid
227/// navigation elements, footers, and other non-documentation content.
228#[must_use]
229pub fn extract_documentation(html: &str) -> String {
230    // Try to extract main content area from docs.rs pages
231    let main_content = extract_main_content(html);
232    let cleaned_html = clean_html(&main_content);
233    let markdown = html2md::parse_html(&cleaned_html);
234
235    // Post-process markdown to remove unwanted links
236    clean_markdown(&markdown)
237}
238
239/// Clean markdown output by removing relative links and UI artifacts
240fn clean_markdown(markdown: &str) -> String {
241    let result = markdown.to_string();
242
243    // Remove source links like [Source](../src/...)
244    let result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
245
246    // Remove relative documentation links like [de](de/index.html)
247    let result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
248
249    // Remove section markers like [§](#xxx)
250    let result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
251
252    // Clean up multiple blank lines
253    let result = result.replace("\n\n\n", "\n\n");
254
255    result.trim().to_string()
256}
257
258/// Extract main content from docs.rs HTML
259///
260/// Looks for `<section id="main-content">` which contains the actual documentation.
261/// Falls back to full HTML if main content section is not found.
262fn extract_main_content(html: &str) -> String {
263    let document = Html::parse_document(html);
264
265    // Try to find main-content section (docs.rs structure)
266    if let Ok(selector) = Selector::parse("#main-content") {
267        if let Some(main_section) = document.select(&selector).next() {
268            return main_section.html();
269        }
270    }
271
272    // Fallback: try rustdoc_body_wrapper
273    if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
274        if let Some(wrapper) = document.select(&selector).next() {
275            return wrapper.html();
276        }
277    }
278
279    // Last resort: return original HTML
280    html.to_string()
281}
282
283/// Extract search results from HTML
284#[must_use]
285pub fn extract_search_results(html: &str, item_path: &str) -> String {
286    let main_content = extract_main_content(html);
287    let cleaned_html = clean_html(&main_content);
288    let markdown = html2md::parse_html(&cleaned_html);
289    let cleaned_markdown = clean_markdown(&markdown);
290
291    if cleaned_markdown.trim().is_empty() {
292        format!("未找到项目 '{item_path}' 的文档")
293    } else {
294        format!("## 搜索结果: {item_path}\n\n{cleaned_markdown}")
295    }
296}
297
/// Collapse every run of whitespace in `text` into a single space and trim
/// leading/trailing whitespace.
fn clean_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
302
#[cfg(test)]
mod tests {
    use super::*;

    // clean_html: <script> elements (tag and content) must be removed,
    // ordinary body content must survive.
    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    // clean_html: <style> elements and their CSS rules must be removed.
    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    // html_to_text: markup is stripped, text content is kept.
    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn test_html_to_text_handles_entities() {
        // Test that HTML entities are converted to their character equivalents
        // amp entity should be decoded to &
        // NOTE(review): the input contains a raw `&`, not `&amp;`, and the
        // `||` chain below is satisfied by "Tom" alone — this assertion can
        // never fail and should be tightened.
        let html = r"<p>Tom & Jerry</p>";
        let text = html_to_text(html);
        // The function should decode amp entity
        assert!(text.contains('&') || text.contains("Tom") || text.contains("Jerry"));
    }

    // clean_whitespace: runs of spaces/tabs/newlines collapse to single spaces.
    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    // extract_documentation: heading and paragraph content survive the full
    // HTML -> Markdown pipeline.
    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    // extract_search_results: non-empty content is wrapped in the
    // "搜索结果" (search results) heading with the item path.
    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("搜索结果"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    // extract_search_results: empty content yields the
    // "未找到项目" (item not found) message.
    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("未找到项目"));
        assert!(result.contains("nonexistent"));
    }

    // clean_html: void <link> tags are stripped by LINK_TAG_REGEX.
    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    // clean_html: void <meta> tags are stripped by META_TAG_REGEX.
    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }

    #[test]
    fn test_relative_link_regex() {
        // Test that RELATIVE_LINK_REGEX only matches relative .html links
        let re = &RELATIVE_LINK_REGEX;

        // Should match - relative .html links
        assert!(re.is_match("[module](module/index.html)"));
        assert!(re.is_match("[struct](struct.Struct.html)"));

        // Should NOT match
        // NOTE(review): no case here covers an external URL ending in .html,
        // which the current pattern WOULD match.
        assert!(!re.is_match("[Section](#section)")); // Anchor link
        assert!(
            !re.is_match("[External](https://example.com)"),
            "Should not match external URLs"
        ); // External URL
    }

    #[test]
    fn test_clean_markdown_preserves_content() {
        // Test that clean_markdown doesn't remove too much content
        let markdown = r"# Dioxus

## At a glance

Dioxus is a framework for building cross-platform apps.

## Quick start

To get started with Dioxus:

```
cargo install dioxus-cli
```

[External Link](https://dioxuslabs.com)

[Anchor](#quick-start)
";
        let cleaned = clean_markdown(markdown);

        // Should preserve main content
        assert!(cleaned.contains("Dioxus is a framework"));
        assert!(cleaned.contains("At a glance"));
        assert!(cleaned.contains("Quick start"));
        assert!(cleaned.contains("cargo install"));

        // Should preserve external links and anchor links
        assert!(
            cleaned.contains("[External Link](https://dioxuslabs.com)"),
            "Should preserve external links"
        );
        assert!(
            cleaned.contains("[Anchor](#quick-start)"),
            "Should preserve anchor links"
        );
    }
}