Skip to main content

crates_docs/tools/docs/
html.rs

1//! HTML processing utilities
2//!
3//! Provides HTML cleaning and conversion functions for documentation extraction.
4//! Uses the `scraper` crate for robust HTML5 parsing.
5
6use regex::Regex;
7use scraper::{Html, Selector};
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Tags whose content should be completely removed during HTML cleaning
/// (scripts, styles, and embedded frames contain no documentation text).
const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];

/// Tags that represent navigation/structure elements to remove
/// (page chrome surrounding the documentation body).
const NAV_TAGS: &[&str] = &["nav", "header", "footer", "aside"];

/// UI elements that don't contribute to documentation content
/// Note: We don't include "details" here because docs.rs uses <details class="toggle top-doc">
/// to wrap the main documentation content. We only remove "summary" tags but keep their content.
const UI_TAGS: &[&str] = &["button", "summary"];
21
/// Regex patterns for self-closing/void tags to remove.
/// `<link>` is a void tag with no element content, so it is stripped with a
/// regex rather than via the DOM-removal passes in `remove_unwanted_elements`.
static LINK_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<link[^>]*>").unwrap());

/// Matches `<meta ...>` void tags (same rationale as `LINK_TAG_REGEX`).
static META_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<meta[^>]*>").unwrap());

/// Regex to remove "Copy item path" and similar UI text.
/// NOTE(review): the pattern is a plain literal, so `str::replace` would
/// suffice; kept as a regex for consistency with the other cleanup passes.
static COPY_PATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"Copy item path").unwrap());

/// Regex to remove anchor links like [§](#xxx)
static ANCHOR_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());

/// Regex to remove relative source links like [Source](../src/...)
static SOURCE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").unwrap());

/// Regex to remove relative documentation links like [de](de/index.html) or [forward\_to\_deserialize\_any](macro.xxx.html)
/// Matches: [text](relative_path.html) where `relative_path` starts with letter and ends with .html
static RELATIVE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").unwrap());

/// Regex to clean up section markers like [§](#xxx) that may remain in headings.
/// NOTE(review): this pattern is byte-identical to `ANCHOR_LINK_REGEX`; the
/// two statics could be unified — kept separate here only to preserve the
/// distinct intent at each call site.
static SECTION_MARKER_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
46
47/// Clean HTML by removing unwanted tags and their content
48///
49/// Uses the `scraper` crate for robust HTML5 parsing, which handles
50/// malformed HTML better than manual parsing.
51#[must_use]
52pub fn clean_html(html: &str) -> String {
53    let document = Html::parse_document(html);
54    remove_unwanted_elements(&document, html)
55}
56
/// Remove unwanted elements from HTML using scraper for parsing.
///
/// Strategy: the DOM built by `scraper` is read-only, so rather than mutating
/// the tree, this walks it to find offending elements and deletes each
/// element's serialized HTML from a mutable copy of `original_html` by
/// substring replacement. The string is re-parsed between passes so later
/// passes match against the already-cleaned text.
///
/// NOTE(review): `element.html()` re-serializes the parsed node, which may
/// not byte-match the source for malformed input — such an element would be
/// left in place. Also, `String::replace` removes *all* identical
/// occurrences, not just the matched element. Worth confirming against real
/// docs.rs pages.
#[inline]
fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
    let mut result = original_html.to_string();

    // Pass 1: remove skip tags (script/style/noscript/iframe) with their content.
    for tag in SKIP_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = document.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                result = result.replace(&element_html, "");
            }
        }
    }

    // Re-parse after removing skip tags
    let mut updated_doc = Html::parse_document(&result);

    // Pass 2: remove navigation/structure elements (nav, header, footer, aside).
    for tag in NAV_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = updated_doc.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                result = result.replace(&element_html, "");
            }
        }
    }

    // Re-parse after removing nav tags
    updated_doc = Html::parse_document(&result);

    // Pass 3: remove UI elements (buttons, summary)
    // For buttons: remove completely
    // For summary: remove the tag but keep the text content, because docs.rs
    // wraps the main documentation in <details><summary>…</summary> blocks.
    for tag in UI_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = updated_doc.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                if tag == &"summary" {
                    // For summary tags, extract and keep the text content
                    let text_content: String = element.text().collect();
                    result = result.replace(&element_html, &text_content);
                } else {
                    // For other UI tags (like button), remove completely
                    result = result.replace(&element_html, "");
                }
            }
        }
    }

    // Void tags (<link>, <meta>) have no inner content to select, so strip
    // them with regexes instead of DOM passes.
    result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
    result = META_TAG_REGEX.replace_all(&result, "").to_string();

    // Remove UI text and anchor links
    result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
    result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();

    // Remove relative source and documentation links
    result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
    result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();

    // Clean up any remaining section markers
    result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();

    result
}
127
128/// Convert HTML to plain text by removing all HTML tags
129///
130/// Uses the `scraper` crate for robust HTML5 parsing.
131#[must_use]
132pub fn html_to_text(html: &str) -> String {
133    let document = Html::parse_document(html);
134
135    // Build selectors for skip tags
136    let mut text_parts = Vec::new();
137
138    // Select the root and extract text, handling skip tags
139    let body_selector = Selector::parse("body").unwrap();
140
141    if let Some(body) = document.select(&body_selector).next() {
142        extract_text_excluding_skip_tags(&body, &mut text_parts);
143    } else {
144        // No body tag, extract from entire document
145        let all_selector = Selector::parse("*").unwrap();
146        if let Some(root) = document.select(&all_selector).next() {
147            extract_text_excluding_skip_tags(&root, &mut text_parts);
148        }
149    }
150
151    clean_whitespace(&text_parts.join(" "))
152}
153
154#[inline]
155fn extract_text_excluding_skip_tags(
156    element: &scraper::element_ref::ElementRef,
157    text_parts: &mut Vec<String>,
158) {
159    let tag_name = element.value().name().to_lowercase();
160
161    if SKIP_TAGS.contains(&tag_name.as_str()) {
162        return;
163    }
164
165    for text in element.text() {
166        let trimmed = text.trim();
167        if !trimmed.is_empty() {
168            text_parts.push(trimmed.to_string());
169        }
170    }
171}
172
/// Returns `true` when `tag` (lowercase tag name) is an HTML block-level
/// element. Currently unused, retained for future block-aware formatting.
#[inline]
#[allow(dead_code)]
fn is_block_element(tag: &str) -> bool {
    matches!(
        tag,
        "address" | "article" | "aside" | "blockquote" | "body" | "canvas"
            | "dd" | "div" | "dl" | "dt" | "fieldset" | "figcaption"
            | "figure" | "footer" | "form" | "h1" | "h2" | "h3" | "h4"
            | "h5" | "h6" | "head" | "header" | "hgroup" | "hr" | "html"
            | "li" | "main" | "nav" | "noscript" | "ol" | "p" | "pre"
            | "section" | "table" | "tbody" | "td" | "tfoot" | "th"
            | "thead" | "tr" | "ul" | "video"
    )
}
223
224/// Extract documentation from HTML by cleaning and converting to Markdown
225///
226/// For docs.rs pages, extracts only the main content area to avoid
227/// navigation elements, footers, and other non-documentation content.
228#[must_use]
229pub fn extract_documentation(html: &str) -> String {
230    // Try to extract main content area from docs.rs pages
231    let main_content = extract_main_content(html);
232    let cleaned_html = clean_html(&main_content);
233    let markdown = html2md::parse_html(&cleaned_html);
234
235    // Post-process markdown to remove unwanted links
236    clean_markdown(&markdown)
237}
238
239/// Clean markdown output by removing relative links and UI artifacts
240#[inline]
241fn clean_markdown(markdown: &str) -> String {
242    let result = SOURCE_LINK_REGEX.replace_all(markdown, Cow::Borrowed(""));
243    let result = RELATIVE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
244    let result = SECTION_MARKER_REGEX.replace_all(&result, Cow::Borrowed(""));
245    let result = result.replace("\n\n\n", "\n\n");
246    result.trim().to_string()
247}
248
249/// Extract main content from docs.rs HTML
250///
251/// Looks for `<section id="main-content">` which contains the actual documentation.
252/// Falls back to full HTML if main content section is not found.
253#[inline]
254fn extract_main_content(html: &str) -> String {
255    let document = Html::parse_document(html);
256
257    // Try to find main-content section (docs.rs structure)
258    if let Ok(selector) = Selector::parse("#main-content") {
259        if let Some(main_section) = document.select(&selector).next() {
260            return main_section.html();
261        }
262    }
263
264    // Fallback: try rustdoc_body_wrapper
265    if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
266        if let Some(wrapper) = document.select(&selector).next() {
267            return wrapper.html();
268        }
269    }
270
271    // Last resort: return original HTML
272    html.to_string()
273}
274
275/// Extract search results from HTML
276#[must_use]
277pub fn extract_search_results(html: &str, item_path: &str) -> String {
278    let main_content = extract_main_content(html);
279    let cleaned_html = clean_html(&main_content);
280    let markdown = html2md::parse_html(&cleaned_html);
281    let cleaned_markdown = clean_markdown(&markdown);
282
283    if cleaned_markdown.trim().is_empty() {
284        format!("Documentation for '{item_path}' not found")
285    } else {
286        format!("## Search Results: {item_path}\n\n{cleaned_markdown}")
287    }
288}
289
/// Collapse every run of whitespace in `text` to a single space and trim
/// leading/trailing whitespace.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
294
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn test_html_to_text_handles_entities() {
        // The `&amp;` entity must be decoded to a literal `&` by the parser.
        // Fix: the previous version fed a raw `&` (no entity at all) and
        // asserted an always-true disjunction, so it could never fail.
        let html = r"<p>Tom &amp; Jerry</p>";
        let text = html_to_text(html);
        assert!(
            text.contains("Tom & Jerry"),
            "amp entity should decode to '&', got: {text}"
        );
    }

    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("Search Results"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("not found"));
        assert!(result.contains("nonexistent"));
    }

    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }

    #[test]
    fn test_relative_link_regex() {
        // RELATIVE_LINK_REGEX must only match relative .html links.
        let re = &RELATIVE_LINK_REGEX;

        // Should match - relative .html links
        assert!(re.is_match("[module](module/index.html)"));
        assert!(re.is_match("[struct](struct.Struct.html)"));

        // Should NOT match
        assert!(!re.is_match("[Section](#section)")); // Anchor link
        assert!(
            !re.is_match("[External](https://example.com)"),
            "Should not match external URLs"
        ); // External URL
    }

    #[test]
    fn test_clean_markdown_preserves_content() {
        // clean_markdown must strip artifacts without removing real content.
        let markdown = r"# Dioxus

## At a glance

Dioxus is a framework for building cross-platform apps.

## Quick start

To get started with Dioxus:

```
cargo install dioxus-cli
```

[External Link](https://dioxuslabs.com)

[Anchor](#quick-start)
";
        let cleaned = clean_markdown(markdown);

        // Should preserve main content
        assert!(cleaned.contains("Dioxus is a framework"));
        assert!(cleaned.contains("At a glance"));
        assert!(cleaned.contains("Quick start"));
        assert!(cleaned.contains("cargo install"));

        // Should preserve external links and anchor links
        assert!(
            cleaned.contains("[External Link](https://dioxuslabs.com)"),
            "Should preserve external links"
        );
        assert!(
            cleaned.contains("[Anchor](#quick-start)"),
            "Should preserve anchor links"
        );
    }
}