// crates_docs/tools/docs/html.rs
1//! HTML processing utilities
2//!
3//! Provides HTML cleaning and conversion functions for documentation extraction.
4//! Uses the `scraper` crate for robust HTML5 parsing.
5
6use regex::Regex;
7use scraper::{Html, Selector};
8use std::sync::LazyLock;
9
/// Tags whose content should be completely removed during HTML cleaning
/// (non-content payloads: scripts, styles, embedded frames).
const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];

/// Tags that represent navigation/structure elements to remove
/// (page chrome, not documentation content).
const NAV_TAGS: &[&str] = &["nav", "header", "footer", "aside"];

/// UI elements that don't contribute to documentation content
/// (rustdoc collapse toggles and copy buttons).
const UI_TAGS: &[&str] = &["button", "details", "summary"];

/// Matches `<link ...>` void tags (with all attributes) so they can be
/// stripped by regex; void tags have no closing tag for scraper to select.
static LINK_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<link[^>]*>").unwrap());

/// Matches `<meta ...>` void tags, removed for the same reason as `<link>`.
static META_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<meta[^>]*>").unwrap());

/// Regex to remove "Copy item path" and similar UI text
static COPY_PATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"Copy item path").unwrap());

/// Regex to remove anchor links like [§](#xxx)
static ANCHOR_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());

/// Regex to remove relative source links like [Source](../src/...)
static SOURCE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").unwrap());

/// Regex to remove relative documentation links like [de](de/index.html) or [forward\_to\_deserialize\_any](macro.xxx.html)
/// Matches: [text](relative_path.html) where `relative_path` starts with letter and ends with .html
static RELATIVE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").unwrap());

/// Regex to clean up section markers like [§](#xxx) that may remain in headings
/// NOTE(review): this pattern is byte-identical to `ANCHOR_LINK_REGEX` above;
/// the second pass in `remove_unwanted_elements`/`clean_markdown` is redundant.
/// Consider consolidating the two statics — confirm no caller depends on both.
static SECTION_MARKER_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
43
44/// Clean HTML by removing unwanted tags and their content
45///
46/// Uses the `scraper` crate for robust HTML5 parsing, which handles
47/// malformed HTML better than manual parsing.
48#[must_use]
49pub fn clean_html(html: &str) -> String {
50    let document = Html::parse_document(html);
51    remove_unwanted_elements(&document, html)
52}
53
54/// Remove unwanted elements from HTML using scraper for parsing
55fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
56    let mut result = original_html.to_string();
57
58    // Remove skip tags with their content using scraper
59    for tag in SKIP_TAGS {
60        if let Ok(selector) = Selector::parse(tag) {
61            let elements: Vec<_> = document.select(&selector).collect();
62            for element in elements {
63                let element_html = element.html();
64                result = result.replace(&element_html, "");
65            }
66        }
67    }
68
69    // Re-parse after removing skip tags
70    let mut updated_doc = Html::parse_document(&result);
71
72    // Remove navigation/structure elements
73    for tag in NAV_TAGS {
74        if let Ok(selector) = Selector::parse(tag) {
75            let elements: Vec<_> = updated_doc.select(&selector).collect();
76            for element in elements {
77                let element_html = element.html();
78                result = result.replace(&element_html, "");
79            }
80        }
81    }
82
83    // Re-parse after removing nav tags
84    updated_doc = Html::parse_document(&result);
85
86    // Remove UI elements (buttons, details, summary)
87    for tag in UI_TAGS {
88        if let Ok(selector) = Selector::parse(tag) {
89            let elements: Vec<_> = updated_doc.select(&selector).collect();
90            for element in elements {
91                let element_html = element.html();
92                result = result.replace(&element_html, "");
93            }
94        }
95    }
96
97    // Use regex to remove self-closing tags (link, meta)
98    result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
99    result = META_TAG_REGEX.replace_all(&result, "").to_string();
100
101    // Remove UI text and anchor links
102    result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
103    result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();
104
105    // Remove relative source and documentation links
106    result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
107    result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
108
109    // Clean up any remaining section markers
110    result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
111
112    result
113}
114
115/// Convert HTML to plain text by removing all HTML tags
116///
117/// Uses the `scraper` crate for robust HTML5 parsing.
118#[must_use]
119pub fn html_to_text(html: &str) -> String {
120    let document = Html::parse_document(html);
121
122    // Build selectors for skip tags
123    let mut text_parts = Vec::new();
124
125    // Select the root and extract text, handling skip tags
126    let body_selector = Selector::parse("body").unwrap();
127
128    if let Some(body) = document.select(&body_selector).next() {
129        extract_text_excluding_skip_tags(&body, &mut text_parts);
130    } else {
131        // No body tag, extract from entire document
132        let all_selector = Selector::parse("*").unwrap();
133        if let Some(root) = document.select(&all_selector).next() {
134            extract_text_excluding_skip_tags(&root, &mut text_parts);
135        }
136    }
137
138    clean_whitespace(&text_parts.join(" "))
139}
140
141/// Extract text from an element, excluding content in skip tags
142fn extract_text_excluding_skip_tags(
143    element: &scraper::element_ref::ElementRef,
144    text_parts: &mut Vec<String>,
145) {
146    let tag_name = element.value().name().to_lowercase();
147
148    // Skip unwanted tags entirely
149    if SKIP_TAGS.contains(&tag_name.as_str()) {
150        return;
151    }
152
153    // Get direct text content
154    for text in element.text() {
155        let trimmed = text.trim();
156        if !trimmed.is_empty() {
157            text_parts.push(trimmed.to_string());
158        }
159    }
160}
161
/// Check if an element is a block-level element.
///
/// Currently unused; kept for future block-aware text extraction.
#[allow(dead_code)]
fn is_block_element(tag: &str) -> bool {
    matches!(
        tag,
        "address"
            | "article"
            | "aside"
            | "blockquote"
            | "body"
            | "canvas"
            | "dd"
            | "div"
            | "dl"
            | "dt"
            | "fieldset"
            | "figcaption"
            | "figure"
            | "footer"
            | "form"
            | "h1"
            | "h2"
            | "h3"
            | "h4"
            | "h5"
            | "h6"
            | "head"
            | "header"
            | "hgroup"
            | "hr"
            | "html"
            | "li"
            | "main"
            | "nav"
            | "noscript"
            | "ol"
            | "p"
            | "pre"
            | "section"
            | "table"
            | "tbody"
            | "td"
            | "tfoot"
            | "th"
            | "thead"
            | "tr"
            | "ul"
            | "video"
    )
}
212
213/// Extract documentation from HTML by cleaning and converting to Markdown
214///
215/// For docs.rs pages, extracts only the main content area to avoid
216/// navigation elements, footers, and other non-documentation content.
217#[must_use]
218pub fn extract_documentation(html: &str) -> String {
219    // Try to extract main content area from docs.rs pages
220    let main_content = extract_main_content(html);
221    let cleaned_html = clean_html(&main_content);
222    let markdown = html2md::parse_html(&cleaned_html);
223
224    // Post-process markdown to remove unwanted links
225    clean_markdown(&markdown)
226}
227
228/// Clean markdown output by removing relative links and UI artifacts
229fn clean_markdown(markdown: &str) -> String {
230    let result = markdown.to_string();
231
232    // Remove source links like [Source](../src/...)
233    let result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
234
235    // Remove relative documentation links like [de](de/index.html)
236    let result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
237
238    // Remove section markers like [§](#xxx)
239    let result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
240
241    // Clean up multiple blank lines
242    let result = result.replace("\n\n\n", "\n\n");
243
244    result.trim().to_string()
245}
246
247/// Extract main content from docs.rs HTML
248///
249/// Looks for `<section id="main-content">` which contains the actual documentation.
250/// Falls back to full HTML if main content section is not found.
251fn extract_main_content(html: &str) -> String {
252    let document = Html::parse_document(html);
253
254    // Try to find main-content section (docs.rs structure)
255    if let Ok(selector) = Selector::parse("#main-content") {
256        if let Some(main_section) = document.select(&selector).next() {
257            return main_section.html();
258        }
259    }
260
261    // Fallback: try rustdoc_body_wrapper
262    if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
263        if let Some(wrapper) = document.select(&selector).next() {
264            return wrapper.html();
265        }
266    }
267
268    // Last resort: return original HTML
269    html.to_string()
270}
271
272/// Extract search results from HTML
273#[must_use]
274pub fn extract_search_results(html: &str, item_path: &str) -> String {
275    let main_content = extract_main_content(html);
276    let cleaned_html = clean_html(&main_content);
277    let markdown = html2md::parse_html(&cleaned_html);
278    let cleaned_markdown = clean_markdown(&markdown);
279
280    if cleaned_markdown.trim().is_empty() {
281        format!("未找到项目 '{item_path}' 的文档")
282    } else {
283        format!("## 搜索结果: {item_path}\n\n{cleaned_markdown}")
284    }
285}
286
/// Normalize whitespace: split on any Unicode whitespace runs and rejoin
/// the words with single spaces (also trims leading/trailing whitespace).
fn clean_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
291
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn test_html_to_text_handles_entities() {
        // Bug fix in test: the previous version passed a raw `&` (no entity
        // at all) and used an `||`-chained assertion that could never fail.
        // Use a real `&amp;` entity and pin the exact decoded output.
        let html = "<p>Tom &amp; Jerry</p>";
        let text = html_to_text(html);
        assert_eq!(text, "Tom & Jerry");
    }

    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("搜索结果"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("未找到项目"));
        assert!(result.contains("nonexistent"));
    }

    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }
}