// crates_docs/tools/docs/html.rs
use regex::Regex;
use scraper::{Html, Selector};
use std::borrow::Cow;
use std::sync::LazyLock;
10
/// Tags whose entire subtree (markup and text) is removed from documents.
const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];

/// Navigation/page-chrome tags removed when cleaning HTML.
const NAV_TAGS: &[&str] = &["nav", "header", "footer", "aside"];

/// Interactive UI tags removed; `summary` is special-cased during cleaning
/// (its text content is kept, only the markup is dropped).
const UI_TAGS: &[&str] = &["button", "summary"];
21
/// Matches `<link ...>` tags (stylesheets etc.) left in the raw HTML.
static LINK_TAG_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<link[^>]*>").expect("hardcoded valid regex pattern"));

/// Matches `<meta ...>` tags.
static META_TAG_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<meta[^>]*>").expect("hardcoded valid regex pattern"));

/// Matches rustdoc's "Copy item path" button label text.
static COPY_PATH_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"Copy item path").expect("hardcoded valid regex pattern"));

/// Matches markdown links rendered from rustdoc's `§` section anchors.
static ANCHOR_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").expect("hardcoded valid regex pattern"));

/// Matches markdown `[Source](...)` links to rustdoc source pages.
static SOURCE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").expect("hardcoded valid regex pattern"));

/// Matches markdown links whose target starts with a letter and ends in
/// `.html` (relative rustdoc page links). Anchor links (`#...`) and plain
/// external URLs without an `.html` suffix are not matched.
static RELATIVE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").expect("hardcoded valid regex pattern")
});
46
47#[must_use]
52pub fn clean_html(html: &str) -> String {
53 let document = Html::parse_document(html);
54 remove_unwanted_elements(&document, html)
55}
56
/// Strips unwanted elements from `original_html` by serializing each matched
/// element back to HTML and deleting that exact substring from the raw text.
///
/// NOTE(review): this relies on `element.html()` reproducing the original
/// source bytes; where the parser normalizes markup the `replace` is a
/// silent no-op for that element — apparently best-effort by design. Confirm.
#[inline]
fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
    let mut result = original_html.to_string();

    // Pass 1: drop script/style/noscript/iframe subtrees entirely.
    for tag in SKIP_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = document.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                result = result.replace(&element_html, "");
            }
        }
    }

    // Re-parse so the next pass selects against the already-stripped text.
    let mut updated_doc = Html::parse_document(&result);

    // Pass 2: drop navigation chrome (nav/header/footer/aside).
    for tag in NAV_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = updated_doc.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                result = result.replace(&element_html, "");
            }
        }
    }

    updated_doc = Html::parse_document(&result);

    // Pass 3: drop UI widgets; `summary` is unwrapped to its visible text.
    for tag in UI_TAGS {
        if let Ok(selector) = Selector::parse(tag) {
            let elements: Vec<_> = updated_doc.select(&selector).collect();
            for element in elements {
                let element_html = element.html();
                if tag == &"summary" {
                    // Keep the text, lose the tag markup.
                    let text_content: String = element.text().collect();
                    result = result.replace(&element_html, &text_content);
                } else {
                    result = result.replace(&element_html, "");
                }
            }
        }
    }

    // Final regex sweep: head-only tags and rustdoc link/button artifacts.
    result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
    result = META_TAG_REGEX.replace_all(&result, "").to_string();

    result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
    result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();

    result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
    result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();

    result
}
124
125#[must_use]
129pub fn html_to_text(html: &str) -> String {
130 let document = Html::parse_document(html);
131
132 let mut text_parts = Vec::new();
134
135 let body_selector = Selector::parse("body").unwrap();
137
138 if let Some(body) = document.select(&body_selector).next() {
139 extract_text_excluding_skip_tags(&body, &mut text_parts);
140 } else {
141 let all_selector = Selector::parse("*").unwrap();
143 if let Some(root) = document.select(&all_selector).next() {
144 extract_text_excluding_skip_tags(&root, &mut text_parts);
145 }
146 }
147
148 clean_whitespace(&text_parts.join(" "))
149}
150
151#[inline]
152fn extract_text_excluding_skip_tags(
153 element: &scraper::element_ref::ElementRef,
154 text_parts: &mut Vec<String>,
155) {
156 let tag_name = element.value().name().to_lowercase();
157
158 if SKIP_TAGS.contains(&tag_name.as_str()) {
159 return;
160 }
161
162 for text in element.text() {
163 let trimmed = text.trim();
164 if !trimmed.is_empty() {
165 text_parts.push(trimmed.to_string());
166 }
167 }
168}
169
170#[must_use]
175pub fn extract_documentation(html: &str) -> String {
176 let main_content = extract_main_content(html);
178 let cleaned_html = clean_html(&main_content);
179 let markdown = html2md::parse_html(&cleaned_html);
180
181 clean_markdown(&markdown)
183}
184
185#[inline]
187fn clean_markdown(markdown: &str) -> String {
188 let result = SOURCE_LINK_REGEX.replace_all(markdown, Cow::Borrowed(""));
189 let result = RELATIVE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
190 let result = ANCHOR_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
191 let result = result.replace("\n\n\n", "\n\n");
192 result.trim().to_string()
193}
194
195#[inline]
200fn extract_main_content(html: &str) -> String {
201 let document = Html::parse_document(html);
202
203 if let Ok(selector) = Selector::parse("#main-content") {
205 if let Some(main_section) = document.select(&selector).next() {
206 return main_section.html();
207 }
208 }
209
210 if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
212 if let Some(wrapper) = document.select(&selector).next() {
213 return wrapper.html();
214 }
215 }
216
217 html.to_string()
219}
220
221#[must_use]
223pub fn extract_search_results(html: &str, item_path: &str) -> String {
224 let main_content = extract_main_content(html);
225 let cleaned_html = clean_html(&main_content);
226 let markdown = html2md::parse_html(&cleaned_html);
227 let cleaned_markdown = clean_markdown(&markdown);
228
229 if cleaned_markdown.trim().is_empty() {
230 format!("Documentation for '{item_path}' not found")
231 } else {
232 format!("## Search Results: {item_path}\n\n{cleaned_markdown}")
233 }
234}
235
/// Collapses every run of whitespace (spaces, tabs, newlines) into a single
/// space and trims leading/trailing whitespace.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
240
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn test_html_to_text_handles_entities() {
        let html = r"<p>Tom & Jerry</p>";
        let text = html_to_text(html);
        // Deliberately loose: any of the fragments is accepted, since
        // entity handling may vary between parser versions.
        assert!(text.contains('&') || text.contains("Tom") || text.contains("Jerry"));
    }

    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace(" hello world "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("Search Results"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("not found"));
        assert!(result.contains("nonexistent"));
    }

    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }

    #[test]
    fn test_relative_link_regex() {
        let re = &RELATIVE_LINK_REGEX;

        // Relative rustdoc page links are matched.
        assert!(re.is_match("[module](module/index.html)"));
        assert!(re.is_match("[struct](struct.Struct.html)"));

        // Anchors and non-.html external URLs are not.
        assert!(!re.is_match("[Section](#section)"));
        assert!(
            !re.is_match("[External](https://example.com)"),
            "Should not match external URLs"
        );
    }

    #[test]
    fn test_clean_markdown_preserves_content() {
        let markdown = r"# Dioxus

## At a glance

Dioxus is a framework for building cross-platform apps.

## Quick start

To get started with Dioxus:

```
cargo install dioxus-cli
```

[External Link](https://dioxuslabs.com)

[Anchor](#quick-start)
";
        let cleaned = clean_markdown(markdown);

        assert!(cleaned.contains("Dioxus is a framework"));
        assert!(cleaned.contains("At a glance"));
        assert!(cleaned.contains("Quick start"));
        assert!(cleaned.contains("cargo install"));

        // Only relative .html / [Source] / [§] links are stripped.
        assert!(
            cleaned.contains("[External Link](https://dioxuslabs.com)"),
            "Should preserve external links"
        );
        assert!(
            cleaned.contains("[Anchor](#quick-start)"),
            "Should preserve anchor links"
        );
    }
}