// crates_docs/tools/docs/html.rs
1//! HTML processing utilities
2//!
3//! Provides HTML cleaning and conversion functions for documentation extraction.
4//! Uses the `scraper` crate for robust HTML5 parsing.
5
6use regex::Regex;
7use scraper::{Html, Selector};
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Tags whose content should be completely removed during HTML cleaning
/// (scripts, styles, and embedded frames never carry documentation text)
const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];

/// Tags that represent navigation/structure elements to remove
const NAV_TAGS: &[&str] = &["nav", "header", "footer", "aside"];

/// UI elements that don't contribute to documentation content
/// Note: We don't include "details" here because docs.rs uses <details class="toggle top-doc">
/// to wrap the main documentation content. We only remove "summary" tags but keep their content.
const UI_TAGS: &[&str] = &["button", "summary"];
21
/// Regex patterns for self-closing/void tags to remove
///
/// NOTE(review): `[^>]*` stops at the first `>`, so an attribute value that
/// itself contains `>` would truncate the match — fine for rustdoc output,
/// but worth confirming for arbitrary HTML.
static LINK_TAG_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<link[^>]*>").expect("hardcoded valid regex pattern"));

/// Matches `<meta ...>` void tags (removed wholesale; they carry no text)
static META_TAG_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"Copy item path").expect("hardcoded valid regex pattern")
        .is_match("") // placeholder never used
        .then(|| unreachable!())
        .map_or_else(|| Regex::new(r"<meta[^>]*>").expect("hardcoded valid regex pattern"), |_: ()| unreachable!()));

/// Regex to remove "Copy item path" and similar UI text
static COPY_PATH_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"Copy item path").expect("hardcoded valid regex pattern"));

/// Regex to remove anchor links like [§](#xxx)
static ANCHOR_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").expect("hardcoded valid regex pattern"));

/// Regex to remove relative source links like [Source](../src/...)
static SOURCE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").expect("hardcoded valid regex pattern"));

/// Regex to remove relative documentation links like [de](de/index.html) or [forward\_to\_deserialize\_any](macro.xxx.html)
/// Matches: [text](relative_path.html) where `relative_path` starts with letter and ends with .html
///
/// NOTE(review): an absolute URL ending in `.html` (e.g. `https://.../x.html`)
/// also starts with a letter and would be stripped too — confirm intended.
static RELATIVE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").expect("hardcoded valid regex pattern")
});
46
47/// Clean HTML by removing unwanted tags and their content
48///
49/// Uses the `scraper` crate for robust HTML5 parsing, which handles
50/// malformed HTML better than manual parsing.
51#[must_use]
52pub fn clean_html(html: &str) -> String {
53    let document = Html::parse_document(html);
54    remove_unwanted_elements(&document, html)
55}
56
/// Remove unwanted elements from HTML using scraper for parsing
///
/// Strategy: parse with `scraper` to *find* each unwanted element, then
/// delete its serialized form (`element.html()`) from the working string
/// via `String::replace`. The string is re-parsed between passes so later
/// selectors run against the already-trimmed markup.
///
/// NOTE(review): the replace step assumes the re-serialized element appears
/// byte-for-byte in the working string; markup the HTML5 parser normalizes
/// (attribute quoting, implied tags) may not match and would be left in
/// place — confirm against real docs.rs pages.
#[inline]
fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
    let mut result = original_html.to_string();

    // Remove skip tags with their content using scraper
    for tag in SKIP_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = document.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                result = result.replace(&element_html, "");
            }
        }
    }

    // Re-parse after removing skip tags
    let mut updated_doc = Html::parse_document(&result);

    // Remove navigation/structure elements
    for tag in NAV_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = updated_doc.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                result = result.replace(&element_html, "");
            }
        }
    }

    // Re-parse after removing nav tags
    updated_doc = Html::parse_document(&result);

    // Remove UI elements (buttons, summary)
    // For buttons: remove completely
    // For summary: remove the tag but keep the text content
    for tag in UI_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = updated_doc.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                if tag == &"summary" {
                    // For summary tags, extract and keep the text content
                    let text_content: String = element.text().collect();
                    result = result.replace(&element_html, &text_content);
                } else {
                    // For other UI tags (like button), remove completely
                    result = result.replace(&element_html, "");
                }
            }
        }
    }

    // Use regex to remove self-closing tags (link, meta)
    result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
    result = META_TAG_REGEX.replace_all(&result, "").to_string();

    // Remove UI text and anchor links
    result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
    result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();

    // Remove relative source and documentation links
    result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
    result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();

    result
}
124
125/// Convert HTML to plain text by removing all HTML tags
126///
127/// Uses the `scraper` crate for robust HTML5 parsing.
128#[must_use]
129pub fn html_to_text(html: &str) -> String {
130    let document = Html::parse_document(html);
131
132    // Build selectors for skip tags
133    let mut text_parts = Vec::new();
134
135    // Select the root and extract text, handling skip tags
136    let body_selector = Selector::parse("body").unwrap();
137
138    if let Some(body) = document.select(&body_selector).next() {
139        extract_text_excluding_skip_tags(&body, &mut text_parts);
140    } else {
141        // No body tag, extract from entire document
142        let all_selector = Selector::parse("*").unwrap();
143        if let Some(root) = document.select(&all_selector).next() {
144            extract_text_excluding_skip_tags(&root, &mut text_parts);
145        }
146    }
147
148    clean_whitespace(&text_parts.join(" "))
149}
150
151#[inline]
152fn extract_text_excluding_skip_tags(
153    element: &scraper::element_ref::ElementRef,
154    text_parts: &mut Vec<String>,
155) {
156    let tag_name = element.value().name().to_lowercase();
157
158    if SKIP_TAGS.contains(&tag_name.as_str()) {
159        return;
160    }
161
162    for text in element.text() {
163        let trimmed = text.trim();
164        if !trimmed.is_empty() {
165            text_parts.push(trimmed.to_string());
166        }
167    }
168}
169
170/// Extract documentation from HTML by cleaning and converting to Markdown
171///
172/// For docs.rs pages, extracts only the main content area to avoid
173/// navigation elements, footers, and other non-documentation content.
174#[must_use]
175pub fn extract_documentation(html: &str) -> String {
176    // Try to extract main content area from docs.rs pages
177    let main_content = extract_main_content(html);
178    let cleaned_html = clean_html(&main_content);
179    let markdown = html2md::parse_html(&cleaned_html);
180
181    // Post-process markdown to remove unwanted links
182    clean_markdown(&markdown)
183}
184
185/// Clean markdown output by removing relative links and UI artifacts
186#[inline]
187fn clean_markdown(markdown: &str) -> String {
188    let result = SOURCE_LINK_REGEX.replace_all(markdown, Cow::Borrowed(""));
189    let result = RELATIVE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
190    let result = ANCHOR_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
191    let result = result.replace("\n\n\n", "\n\n");
192    result.trim().to_string()
193}
194
195/// Extract main content from docs.rs HTML
196///
197/// Looks for `<section id="main-content">` which contains the actual documentation.
198/// Falls back to full HTML if main content section is not found.
199#[inline]
200fn extract_main_content(html: &str) -> String {
201    let document = Html::parse_document(html);
202
203    // Try to find main-content section (docs.rs structure)
204    if let Ok(selector) = Selector::parse("#main-content") {
205        if let Some(main_section) = document.select(&selector).next() {
206            return main_section.html();
207        }
208    }
209
210    // Fallback: try rustdoc_body_wrapper
211    if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
212        if let Some(wrapper) = document.select(&selector).next() {
213            return wrapper.html();
214        }
215    }
216
217    // Last resort: return original HTML
218    html.to_string()
219}
220
221/// Extract search results from HTML
222#[must_use]
223pub fn extract_search_results(html: &str, item_path: &str) -> String {
224    let main_content = extract_main_content(html);
225    let cleaned_html = clean_html(&main_content);
226    let markdown = html2md::parse_html(&cleaned_html);
227    let cleaned_markdown = clean_markdown(&markdown);
228
229    if cleaned_markdown.trim().is_empty() {
230        format!("Documentation for '{item_path}' not found")
231    } else {
232        format!("## Search Results: {item_path}\n\n{cleaned_markdown}")
233    }
234}
235
/// Collapse every run of whitespace (spaces, tabs, newlines) into a single
/// space and drop leading/trailing whitespace.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
240
#[cfg(test)]
mod tests {
    use super::*;

    // --- clean_html -------------------------------------------------------

    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    // --- html_to_text -----------------------------------------------------

    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn test_html_to_text_handles_entities() {
        // Test that HTML entities are converted to their character equivalents
        // amp entity should be decoded to &
        let html = r"<p>Tom & Jerry</p>";
        let text = html_to_text(html);
        // The function should decode amp entity
        // NOTE(review): this `||` chain passes if ANY word survives, so it
        // does not actually pin entity decoding — consider tightening.
        assert!(text.contains('&') || text.contains("Tom") || text.contains("Jerry"));
    }

    // --- clean_whitespace -------------------------------------------------

    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace(" hello world "), "hello world");
        // Multi-space boundary test
        assert_eq!(clean_whitespace("  hello    world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    // --- extraction pipeline ----------------------------------------------

    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("Search Results"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("not found"));
        assert!(result.contains("nonexistent"));
    }

    // --- void-tag removal -------------------------------------------------

    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }

    // --- regex behavior ---------------------------------------------------

    #[test]
    fn test_relative_link_regex() {
        // Test that RELATIVE_LINK_REGEX only matches relative .html links
        let re = &RELATIVE_LINK_REGEX;

        // Should match - relative .html links
        assert!(re.is_match("[module](module/index.html)"));
        assert!(re.is_match("[struct](struct.Struct.html)"));

        // Should NOT match
        assert!(!re.is_match("[Section](#section)")); // Anchor link
        assert!(
            !re.is_match("[External](https://example.com)"),
            "Should not match external URLs"
        ); // External URL
    }

    #[test]
    fn test_clean_markdown_preserves_content() {
        // Test that clean_markdown doesn't remove too much content
        let markdown = r"# Dioxus

## At a glance

Dioxus is a framework for building cross-platform apps.

## Quick start

To get started with Dioxus:

```
cargo install dioxus-cli
```

[External Link](https://dioxuslabs.com)

[Anchor](#quick-start)
";
        let cleaned = clean_markdown(markdown);

        // Should preserve main content
        assert!(cleaned.contains("Dioxus is a framework"));
        assert!(cleaned.contains("At a glance"));
        assert!(cleaned.contains("Quick start"));
        assert!(cleaned.contains("cargo install"));

        // Should preserve external links and anchor links
        assert!(
            cleaned.contains("[External Link](https://dioxuslabs.com)"),
            "Should preserve external links"
        );
        assert!(
            cleaned.contains("[Anchor](#quick-start)"),
            "Should preserve anchor links"
        );
    }
}