1use regex::Regex;
7use scraper::{Html, Selector};
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Tag names whose text is excluded when extracting plain text from HTML.
const SKIP_TAGS: &[&str] = &[
    "script",
    "style",
    "noscript",
    "iframe",
];
13
14static ANCHOR_LINK_REGEX: LazyLock<Regex> =
16 LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").expect("hardcoded valid regex pattern"));
17
18static SOURCE_LINK_REGEX: LazyLock<Regex> =
20 LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").expect("hardcoded valid regex pattern"));
21
22static RELATIVE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
25 Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").expect("hardcoded valid regex pattern")
26});
27
28static MULTIPLE_NEWLINES_REGEX: LazyLock<Regex> =
30 LazyLock::new(|| Regex::new(r"\n\n\n+").expect("hardcoded valid regex pattern"));
31
32static BODY_SELECTOR: LazyLock<Selector> =
34 LazyLock::new(|| Selector::parse("body").expect("hardcoded valid selector"));
35
36static ALL_SELECTOR: LazyLock<Selector> =
38 LazyLock::new(|| Selector::parse("*").expect("hardcoded valid selector"));
39
40static SCRIPT_SELECTOR: LazyLock<Selector> =
42 LazyLock::new(|| Selector::parse("script").expect("hardcoded valid selector"));
43static STYLE_SELECTOR: LazyLock<Selector> =
44 LazyLock::new(|| Selector::parse("style").expect("hardcoded valid selector"));
45static NOSCRIPT_SELECTOR: LazyLock<Selector> =
46 LazyLock::new(|| Selector::parse("noscript").expect("hardcoded valid selector"));
47static IFRAME_SELECTOR: LazyLock<Selector> =
48 LazyLock::new(|| Selector::parse("iframe").expect("hardcoded valid selector"));
49
50static NAV_SELECTOR: LazyLock<Selector> =
52 LazyLock::new(|| Selector::parse("nav").expect("hardcoded valid selector"));
53static HEADER_SELECTOR: LazyLock<Selector> =
54 LazyLock::new(|| Selector::parse("header").expect("hardcoded valid selector"));
55static FOOTER_SELECTOR: LazyLock<Selector> =
56 LazyLock::new(|| Selector::parse("footer").expect("hardcoded valid selector"));
57static ASIDE_SELECTOR: LazyLock<Selector> =
58 LazyLock::new(|| Selector::parse("aside").expect("hardcoded valid selector"));
59
60static BUTTON_SELECTOR: LazyLock<Selector> =
62 LazyLock::new(|| Selector::parse("button").expect("hardcoded valid selector"));
63static SUMMARY_SELECTOR: LazyLock<Selector> =
64 LazyLock::new(|| Selector::parse("summary").expect("hardcoded valid selector"));
65
66static MAIN_CONTENT_SELECTOR: LazyLock<Selector> =
68 LazyLock::new(|| Selector::parse("#main-content").expect("hardcoded valid selector"));
69static RUSTDOC_BODY_WRAPPER_SELECTOR: LazyLock<Selector> =
70 LazyLock::new(|| Selector::parse("#rustdoc_body_wrapper").expect("hardcoded valid selector"));
71
72#[must_use]
80pub fn clean_html(html: &str) -> String {
81 let document = Html::parse_document(html);
82 remove_unwanted_elements(&document, html)
83}
84
85#[inline]
93fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
94 let mut replacements: Vec<(String, Option<String>)> = Vec::new();
96
97 for element in document.select(&SCRIPT_SELECTOR) {
99 replacements.push((element.html(), None));
100 }
101 for element in document.select(&STYLE_SELECTOR) {
102 replacements.push((element.html(), None));
103 }
104 for element in document.select(&NOSCRIPT_SELECTOR) {
105 replacements.push((element.html(), None));
106 }
107 for element in document.select(&IFRAME_SELECTOR) {
108 replacements.push((element.html(), None));
109 }
110
111 for element in document.select(&NAV_SELECTOR) {
113 replacements.push((element.html(), None));
114 }
115 for element in document.select(&HEADER_SELECTOR) {
116 replacements.push((element.html(), None));
117 }
118 for element in document.select(&FOOTER_SELECTOR) {
119 replacements.push((element.html(), None));
120 }
121 for element in document.select(&ASIDE_SELECTOR) {
122 replacements.push((element.html(), None));
123 }
124
125 for element in document.select(&BUTTON_SELECTOR) {
127 replacements.push((element.html(), None));
128 }
129 for element in document.select(&SUMMARY_SELECTOR) {
130 let element_html = element.html();
131 let text_content: String = element.text().collect();
133 replacements.push((element_html, Some(text_content)));
134 }
135
136 if replacements.is_empty() {
138 return apply_regex_patterns(original_html);
139 }
140
141 replacements.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
144
145 let mut result = original_html.to_string();
147 for (element_html, replacement) in replacements {
148 result = if let Some(text) = replacement {
151 result.replace(&element_html, &text)
152 } else {
153 result.replace(&element_html, "")
154 };
155 }
156
157 apply_regex_patterns(&result)
158}
159
160static COMBINED_CLEANUP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
174 Regex::new(
175 r"(?:<link[^>]*>|<meta[^>]*>|Copy item path|\[§\]\([^)]*\)|\[Source\]\([^)]*\)|\[[^\]]*\]\([a-zA-Z][^)]*\.html\))",
176 )
177 .expect("hardcoded valid regex pattern")
178});
179
180#[inline]
197fn apply_regex_patterns(html: &str) -> String {
198 COMBINED_CLEANUP_REGEX.replace_all(html, "").into_owned()
200}
201
202#[must_use]
206pub fn html_to_text(html: &str) -> String {
207 let document = Html::parse_document(html);
208
209 let mut text_parts = Vec::new();
211
212 if let Some(body) = document.select(&BODY_SELECTOR).next() {
214 extract_text_excluding_skip_tags(&body, &mut text_parts);
215 } else {
216 if let Some(root) = document.select(&ALL_SELECTOR).next() {
218 extract_text_excluding_skip_tags(&root, &mut text_parts);
219 }
220 }
221
222 clean_whitespace(&text_parts.join(" "))
223}
224
225#[inline]
226fn extract_text_excluding_skip_tags(
227 element: &scraper::element_ref::ElementRef,
228 text_parts: &mut Vec<String>,
229) {
230 let tag_name = element.value().name().to_lowercase();
231
232 if SKIP_TAGS.contains(&tag_name.as_str()) {
233 return;
234 }
235
236 for text in element.text() {
237 let trimmed = text.trim();
238 if !trimmed.is_empty() {
239 text_parts.push(trimmed.to_string());
240 }
241 }
242}
243
244#[must_use]
249pub fn extract_documentation(html: &str) -> String {
250 let main_content = extract_main_content(html);
252 let cleaned_html = clean_html(&main_content);
253 let markdown = html2md::parse_html(&cleaned_html);
254
255 clean_markdown(&markdown)
257}
258
259#[inline]
261fn clean_markdown(markdown: &str) -> String {
262 let result = SOURCE_LINK_REGEX.replace_all(markdown, Cow::Borrowed(""));
265 let result = RELATIVE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
266 let result = ANCHOR_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
267 let result = MULTIPLE_NEWLINES_REGEX.replace_all(&result, Cow::Borrowed("\n\n"));
268 result.trim().to_string()
269}
270
271#[inline]
276fn extract_main_content(html: &str) -> String {
277 let document = Html::parse_document(html);
278
279 if let Some(main_section) = document.select(&MAIN_CONTENT_SELECTOR).next() {
281 return main_section.html();
282 }
283
284 if let Some(wrapper) = document.select(&RUSTDOC_BODY_WRAPPER_SELECTOR).next() {
286 return wrapper.html();
287 }
288
289 html.to_string()
291}
292
293#[must_use]
295pub fn extract_search_results(html: &str, item_path: &str) -> String {
296 let main_content = extract_main_content(html);
297 let cleaned_html = clean_html(&main_content);
298 let markdown = html2md::parse_html(&cleaned_html);
299 let cleaned_markdown = clean_markdown(&markdown);
300
301 if cleaned_markdown.trim().is_empty() {
302 format!("Documentation for '{item_path}' not found")
303 } else {
304 format!("## Search Results: {item_path}\n\n{cleaned_markdown}")
305 }
306}
307
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
312
#[cfg(test)]
mod tests {
    use super::*;

    /// `<script>` elements and their contents are stripped.
    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    /// `<style>` elements and their contents are stripped.
    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    /// Plain-text extraction leaves no markup behind.
    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    /// Character references are decoded to the characters they stand for.
    #[test]
    fn test_html_to_text_handles_entities() {
        let html = "<p>Tom &amp; Jerry</p>";
        let text = html_to_text(html);
        // Pin the exact output: a disjunction of `contains` checks would pass
        // even if the entity were dropped or left undecoded.
        assert_eq!(text, "Tom & Jerry");
    }

    /// Leading, trailing and interior whitespace runs are all normalized.
    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello  world  "), "hello world");
        assert_eq!(clean_whitespace("   hello     world   "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    /// Headings and paragraphs survive the HTML → Markdown pipeline.
    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    /// Non-empty documentation is wrapped in a search-results heading.
    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("Search Results"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    /// An empty page yields the "not found" message.
    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("not found"));
        assert!(result.contains("nonexistent"));
    }

    /// `<link>` tags are removed by the regex cleanup.
    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    /// `<meta>` tags are removed by the regex cleanup.
    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }

    /// The relative-link pattern matches `.html` targets only.
    #[test]
    fn test_relative_link_regex() {
        let re = &RELATIVE_LINK_REGEX;

        assert!(re.is_match("[module](module/index.html)"));
        assert!(re.is_match("[struct](struct.Struct.html)"));

        assert!(!re.is_match("[Section](#section)"));
        assert!(
            !re.is_match("[External](https://example.com)"),
            "Should not match external URLs"
        );
    }

    /// Ordinary content, external links and in-page anchors are preserved.
    #[test]
    fn test_clean_markdown_preserves_content() {
        let markdown = r"# Dioxus

## At a glance

Dioxus is a framework for building cross-platform apps.

## Quick start

To get started with Dioxus:

```
cargo install dioxus-cli
```

[External Link](https://dioxuslabs.com)

[Anchor](#quick-start)
";
        let cleaned = clean_markdown(markdown);

        assert!(cleaned.contains("Dioxus is a framework"));
        assert!(cleaned.contains("At a glance"));
        assert!(cleaned.contains("Quick start"));
        assert!(cleaned.contains("cargo install"));

        assert!(
            cleaned.contains("[External Link](https://dioxuslabs.com)"),
            "Should preserve external links"
        );
        assert!(
            cleaned.contains("[Anchor](#quick-start)"),
            "Should preserve anchor links"
        );
    }

    /// Only the `#main-content` section is kept, minus its scripts.
    #[test]
    fn test_extract_documentation_single_pass_optimization() {
        let html = r#"
<!DOCTYPE html>
<html>
<head><title>Test Crate</title></head>
<body>
 <nav>Navigation content</nav>
 <section id="main-content">
 <h1>Test Crate</h1>
 <p>This is the main documentation.</p>
 <script>console.log('test');</script>
 <div class="docblock">
 <p>Docblock content here.</p>
 </div>
 </section>
 <footer>Footer content</footer>
</body>
</html>
"#;
        let docs = extract_documentation(html);

        assert!(docs.contains("Test Crate"), "Should contain title");
        assert!(
            docs.contains("main documentation"),
            "Should contain main content"
        );
        assert!(
            docs.contains("Docblock content"),
            "Should preserve docblock"
        );

        assert!(!docs.contains("Navigation content"), "Should remove nav");
        assert!(!docs.contains("Footer content"), "Should remove footer");
        assert!(!docs.contains("console.log"), "Should remove script");
    }

    /// Search results are built from the main content section only.
    #[test]
    fn test_extract_search_results_single_pass_optimization() {
        let html = r#"
<!DOCTYPE html>
<html>
<body>
 <section id="main-content">
 <h1>serde::Serialize</h1>
 <pre><code>pub trait Serialize { }</code></pre>
 <p>Serialize trait documentation.</p>
 </section>
 <nav>Sidebar</nav>
</body>
</html>
"#;
        let result = extract_search_results(html, "serde::Serialize");

        assert!(result.contains("Search Results"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Serialize trait"));

        assert!(!result.contains("Sidebar"));
    }

    /// Several kinds of unwanted elements are removed in one call.
    #[test]
    fn test_clean_html_multiple_skip_tags() {
        let html = r"
<html>
<head>
 <style>.test { color: red; }</style>
 <script>var x = 1;</script>
</head>
<body>
 <nav>Navigation</nav>
 <article>
 <h1>Title</h1>
 <p>Content with <script>inline script</script> removed.</p>
 <footer>Article footer</footer>
 </article>
 <footer>Page footer</footer>
</body>
</html>
";
        let cleaned = clean_html(html);

        assert!(cleaned.contains("Title"));
        assert!(cleaned.contains("Content"));

        assert!(!cleaned.contains("style"), "Should remove style tags");
        assert!(!cleaned.contains("script"), "Should remove script tags");
        assert!(!cleaned.contains("Navigation"), "Should remove nav");
        assert!(!cleaned.contains("footer"), "Should remove footer");
        assert!(!cleaned.contains(".test"), "Should remove CSS content");
        assert!(!cleaned.contains("var x"), "Should remove JS content");
    }

    /// Every element kind that is dropped outright is exercised once.
    #[test]
    fn test_cached_selectors_all_tag_types() {
        let test_cases = [
            (
                "<script>alert('test')</script><p>Content</p>",
                "script",
                "Content",
            ),
            ("<style>.x{}</style><p>Content</p>", "style", "Content"),
            (
                "<noscript>Enable JS</noscript><p>Content</p>",
                "noscript",
                "Content",
            ),
            (
                "<iframe src=\"x\"></iframe><p>Content</p>",
                "iframe",
                "Content",
            ),
            ("<nav><a>Link</a></nav><p>Content</p>", "nav", "Content"),
            ("<header>Head</header><p>Content</p>", "header", "Content"),
            ("<footer>Foot</footer><p>Content</p>", "footer", "Content"),
            ("<aside>Sidebar</aside><p>Content</p>", "aside", "Content"),
            ("<button>Click</button><p>Content</p>", "button", "Content"),
        ];

        for (html, tag_to_remove, expected_content) in test_cases {
            let cleaned = clean_html(html);
            assert!(
                !cleaned.contains(tag_to_remove),
                "Should remove {tag_to_remove} tag"
            );
            assert!(
                cleaned.contains(expected_content),
                "Should preserve {expected_content}"
            );
        }
    }

    /// `<summary>` is the one element replaced by its text instead of dropped.
    #[test]
    fn test_clean_html_replaces_summary_with_text() {
        let html = "<details><summary>More</summary><p>Content</p></details>";
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("summary"),
            "summary tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("More"),
            "summary text should remain, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "sibling content should remain, got: {cleaned}"
        );
    }
}