1use regex::Regex;
7use scraper::{Html, Selector};
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Elements that are dropped together with their content: `clean_html`
/// removes them from the markup and the plain-text extractor refuses to
/// descend from them.
const SKIP_TAGS: &[&str] = &[
    "script",
    "style",
    "noscript",
    "iframe",
];

/// Page-chrome elements removed (with their content) in the second
/// clean-up pass of `remove_unwanted_elements`.
const NAV_TAGS: &[&str] = &[
    "nav",
    "header",
    "footer",
    "aside",
];

/// Interactive elements handled in the third clean-up pass: `button` is
/// removed outright, `summary` is replaced by its text content.
const UI_TAGS: &[&str] = &[
    "button",
    "summary",
];
21
22static LINK_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<link[^>]*>").unwrap());
24
25static META_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<meta[^>]*>").unwrap());
26
27static COPY_PATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"Copy item path").unwrap());
29
30static ANCHOR_LINK_REGEX: LazyLock<Regex> =
32 LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
33
34static SOURCE_LINK_REGEX: LazyLock<Regex> =
36 LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").unwrap());
37
38static RELATIVE_LINK_REGEX: LazyLock<Regex> =
41 LazyLock::new(|| Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").unwrap());
42
43static SECTION_MARKER_REGEX: LazyLock<Regex> =
45 LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
46
47#[must_use]
52pub fn clean_html(html: &str) -> String {
53 let document = Html::parse_document(html);
54 remove_unwanted_elements(&document, html)
55}
56
57#[inline]
59fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
60 let mut result = original_html.to_string();
61
62 for tag in SKIP_TAGS {
64 if let Ok(selector) = Selector::parse(tag) {
65 let elements: Vec<_> = document.select(&selector).collect();
66 for element in elements {
67 let element_html = element.html();
68 result = result.replace(&element_html, "");
69 }
70 }
71 }
72
73 let mut updated_doc = Html::parse_document(&result);
75
76 for tag in NAV_TAGS {
78 if let Ok(selector) = Selector::parse(tag) {
79 let elements: Vec<_> = updated_doc.select(&selector).collect();
80 for element in elements {
81 let element_html = element.html();
82 result = result.replace(&element_html, "");
83 }
84 }
85 }
86
87 updated_doc = Html::parse_document(&result);
89
90 for tag in UI_TAGS {
94 if let Ok(selector) = Selector::parse(tag) {
95 let elements: Vec<_> = updated_doc.select(&selector).collect();
96 for element in elements {
97 let element_html = element.html();
98 if tag == &"summary" {
99 let text_content: String = element.text().collect();
101 result = result.replace(&element_html, &text_content);
102 } else {
103 result = result.replace(&element_html, "");
105 }
106 }
107 }
108 }
109
110 result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
112 result = META_TAG_REGEX.replace_all(&result, "").to_string();
113
114 result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
116 result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();
117
118 result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
120 result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
121
122 result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
124
125 result
126}
127
128#[must_use]
132pub fn html_to_text(html: &str) -> String {
133 let document = Html::parse_document(html);
134
135 let mut text_parts = Vec::new();
137
138 let body_selector = Selector::parse("body").unwrap();
140
141 if let Some(body) = document.select(&body_selector).next() {
142 extract_text_excluding_skip_tags(&body, &mut text_parts);
143 } else {
144 let all_selector = Selector::parse("*").unwrap();
146 if let Some(root) = document.select(&all_selector).next() {
147 extract_text_excluding_skip_tags(&root, &mut text_parts);
148 }
149 }
150
151 clean_whitespace(&text_parts.join(" "))
152}
153
154#[inline]
155fn extract_text_excluding_skip_tags(
156 element: &scraper::element_ref::ElementRef,
157 text_parts: &mut Vec<String>,
158) {
159 let tag_name = element.value().name().to_lowercase();
160
161 if SKIP_TAGS.contains(&tag_name.as_str()) {
162 return;
163 }
164
165 for text in element.text() {
166 let trimmed = text.trim();
167 if !trimmed.is_empty() {
168 text_parts.push(trimmed.to_string());
169 }
170 }
171}
172
/// Returns `true` when `tag` is one of the block-level element names listed
/// below. The comparison is exact (case-sensitive, lowercase names).
#[inline]
#[allow(dead_code)]
fn is_block_element(tag: &str) -> bool {
    matches!(
        tag,
        "address"
            | "article"
            | "aside"
            | "blockquote"
            | "body"
            | "canvas"
            | "dd"
            | "div"
            | "dl"
            | "dt"
            | "fieldset"
            | "figcaption"
            | "figure"
            | "footer"
            | "form"
            | "h1"
            | "h2"
            | "h3"
            | "h4"
            | "h5"
            | "h6"
            | "head"
            | "header"
            | "hgroup"
            | "hr"
            | "html"
            | "li"
            | "main"
            | "nav"
            | "noscript"
            | "ol"
            | "p"
            | "pre"
            | "section"
            | "table"
            | "tbody"
            | "td"
            | "tfoot"
            | "th"
            | "thead"
            | "tr"
            | "ul"
            | "video"
    )
}
223
224#[must_use]
229pub fn extract_documentation(html: &str) -> String {
230 let main_content = extract_main_content(html);
232 let cleaned_html = clean_html(&main_content);
233 let markdown = html2md::parse_html(&cleaned_html);
234
235 clean_markdown(&markdown)
237}
238
239#[inline]
241fn clean_markdown(markdown: &str) -> String {
242 let result = SOURCE_LINK_REGEX.replace_all(markdown, Cow::Borrowed(""));
243 let result = RELATIVE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
244 let result = SECTION_MARKER_REGEX.replace_all(&result, Cow::Borrowed(""));
245 let result = result.replace("\n\n\n", "\n\n");
246 result.trim().to_string()
247}
248
249#[inline]
254fn extract_main_content(html: &str) -> String {
255 let document = Html::parse_document(html);
256
257 if let Ok(selector) = Selector::parse("#main-content") {
259 if let Some(main_section) = document.select(&selector).next() {
260 return main_section.html();
261 }
262 }
263
264 if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
266 if let Some(wrapper) = document.select(&selector).next() {
267 return wrapper.html();
268 }
269 }
270
271 html.to_string()
273}
274
275#[must_use]
277pub fn extract_search_results(html: &str, item_path: &str) -> String {
278 let main_content = extract_main_content(html);
279 let cleaned_html = clean_html(&main_content);
280 let markdown = html2md::parse_html(&cleaned_html);
281 let cleaned_markdown = clean_markdown(&markdown);
282
283 if cleaned_markdown.trim().is_empty() {
284 format!("Documentation for '{item_path}' not found")
285 } else {
286 format!("## Search Results: {item_path}\n\n{cleaned_markdown}")
287 }
288}
289
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
294
#[cfg(test)]
mod tests {
    use super::*;

    /// `<script>` elements and their content are stripped by `clean_html`.
    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    /// `<style>` elements and their content are stripped by `clean_html`.
    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    /// Markup is dropped and only the text survives.
    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// Character references are decoded in the extracted text.
    #[test]
    fn test_html_to_text_handles_entities() {
        // The input must contain a real entity, and the result is pinned
        // exactly: an `a || b || c` assertion passes for almost any output.
        let html = "<p>Tom &amp; Jerry</p>";
        let text = html_to_text(html);
        assert_eq!(text, "Tom & Jerry");
    }

    /// Runs of mixed whitespace collapse to single spaces.
    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello   world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    /// Headings and paragraphs survive the full extraction pipeline.
    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    /// A page with content is wrapped in the search-result heading.
    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("Search Results"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    /// An empty page yields the "not found" message.
    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("not found"));
        assert!(result.contains("nonexistent"));
    }

    /// `<link>` tags are removed while body content is kept.
    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    /// `<meta>` tags are removed while body content is kept.
    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }

    /// The relative-link pattern matches `.html` targets only.
    #[test]
    fn test_relative_link_regex() {
        let re = &RELATIVE_LINK_REGEX;

        // Relative rustdoc links are matched.
        assert!(re.is_match("[module](module/index.html)"));
        assert!(re.is_match("[struct](struct.Struct.html)"));

        // In-page anchors and external URLs are left alone.
        assert!(!re.is_match("[Section](#section)"));
        assert!(
            !re.is_match("[External](https://example.com)"),
            "Should not match external URLs"
        );
    }

    /// Ordinary prose, code blocks, external links and anchors are untouched.
    #[test]
    fn test_clean_markdown_preserves_content() {
        let markdown = r"# Dioxus

## At a glance

Dioxus is a framework for building cross-platform apps.

## Quick start

To get started with Dioxus:

```
cargo install dioxus-cli
```

[External Link](https://dioxuslabs.com)

[Anchor](#quick-start)
";
        let cleaned = clean_markdown(markdown);

        assert!(cleaned.contains("Dioxus is a framework"));
        assert!(cleaned.contains("At a glance"));
        assert!(cleaned.contains("Quick start"));
        assert!(cleaned.contains("cargo install"));

        assert!(
            cleaned.contains("[External Link](https://dioxuslabs.com)"),
            "Should preserve external links"
        );
        assert!(
            cleaned.contains("[Anchor](#quick-start)"),
            "Should preserve anchor links"
        );
    }
}