use regex::Regex;
use scraper::{Html, Selector};
use std::borrow::Cow;
use std::sync::LazyLock;
/// Tag names that `extract_text_excluding_skip_tags` compares an element's
/// (lowercased) name against; an element whose name is listed here contributes
/// no text to the plain-text output.
const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];
/// Matches a Markdown link whose label is exactly `§`, e.g. `[§](#method.foo)`.
static ANCHOR_LINK_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").expect("hardcoded valid regex pattern"));
/// Matches a Markdown link whose label is exactly `Source`, e.g. `[Source](../src/lib.rs.html)`.
static SOURCE_LINK_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").expect("hardcoded valid regex pattern"));
/// Matches a Markdown link whose target starts with an ASCII letter and ends in
/// `.html`, e.g. `[module](module/index.html)`.
///
/// NOTE(review): the pattern does not inspect the URL scheme, so an absolute
/// target such as `https://host/page.html` also matches — confirm that is intended.
static RELATIVE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").expect("hardcoded valid regex pattern")
});
/// Matches a run of three or more consecutive newline characters.
static MULTIPLE_NEWLINES_REGEX: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\n\n\n+").expect("hardcoded valid regex pattern"));
// Selectors are parsed once on first use and shared by every call.

/// Selects `<body>`; the preferred root for plain-text extraction in `html_to_text`.
static BODY_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("body").expect("hardcoded valid selector"));
/// Selects every element; `html_to_text` uses its first match as a fallback root.
static ALL_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("*").expect("hardcoded valid selector"));
/// Selects `<script>` elements, which `remove_unwanted_elements` strips.
static SCRIPT_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("script").expect("hardcoded valid selector"));
/// Selects `<style>` elements, which `remove_unwanted_elements` strips.
static STYLE_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("style").expect("hardcoded valid selector"));
/// Selects `<noscript>` elements, which `remove_unwanted_elements` strips.
static NOSCRIPT_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("noscript").expect("hardcoded valid selector"));
/// Selects `<iframe>` elements, which `remove_unwanted_elements` strips.
static IFRAME_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("iframe").expect("hardcoded valid selector"));
/// Selects `<nav>` elements, which `remove_unwanted_elements` strips.
static NAV_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("nav").expect("hardcoded valid selector"));
/// Selects `<header>` elements, which `remove_unwanted_elements` strips.
static HEADER_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("header").expect("hardcoded valid selector"));
/// Selects `<footer>` elements, which `remove_unwanted_elements` strips.
static FOOTER_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("footer").expect("hardcoded valid selector"));
/// Selects `<aside>` elements, which `remove_unwanted_elements` strips.
static ASIDE_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("aside").expect("hardcoded valid selector"));
/// Selects `<button>` elements, which `remove_unwanted_elements` strips.
static BUTTON_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("button").expect("hardcoded valid selector"));
/// Selects `<summary>` elements, which `remove_unwanted_elements` replaces with
/// their text content rather than deleting.
static SUMMARY_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("summary").expect("hardcoded valid selector"));
/// Selects the element with id `main-content`; first choice in `extract_main_content`.
static MAIN_CONTENT_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("#main-content").expect("hardcoded valid selector"));
/// Selects the element with id `rustdoc_body_wrapper`; second choice in
/// `extract_main_content`.
static RUSTDOC_BODY_WRAPPER_SELECTOR: LazyLock<Selector> =
LazyLock::new(|| Selector::parse("#rustdoc_body_wrapper").expect("hardcoded valid selector"));
/// Returns `html` with unwanted elements and boilerplate patterns removed.
///
/// The input is parsed as a complete HTML document only to locate the elements
/// to drop; the removal itself is carried out on the original string by
/// `remove_unwanted_elements`, so the result is still HTML text.
#[must_use]
pub fn clean_html(html: &str) -> String {
    remove_unwanted_elements(&Html::parse_document(html), html)
}
#[inline]
fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
let mut replacements: Vec<(String, Option<String>)> = Vec::new();
for element in document.select(&SCRIPT_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&STYLE_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&NOSCRIPT_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&IFRAME_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&NAV_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&HEADER_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&FOOTER_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&ASIDE_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&BUTTON_SELECTOR) {
replacements.push((element.html(), None));
}
for element in document.select(&SUMMARY_SELECTOR) {
let element_html = element.html();
let text_content: String = element.text().collect();
replacements.push((element_html, Some(text_content)));
}
if replacements.is_empty() {
return apply_regex_patterns(original_html);
}
replacements.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
let mut result = original_html.to_string();
for (element_html, replacement) in replacements {
result = if let Some(text) = replacement {
result.replace(&element_html, &text)
} else {
result.replace(&element_html, "")
};
}
apply_regex_patterns(&result)
}
/// Single alternation used by `apply_regex_patterns`. It matches any of:
/// a `<link …>` tag, a `<meta …>` tag, the literal text `Copy item path`,
/// a `[§](…)` link, a `[Source](…)` link, or a Markdown link whose target
/// starts with an ASCII letter and ends in `.html`.
static COMBINED_CLEANUP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?:<link[^>]*>|<meta[^>]*>|Copy item path|\[§\]\([^)]*\)|\[Source\]\([^)]*\)|\[[^\]]*\]\([a-zA-Z][^)]*\.html\))",
)
.expect("hardcoded valid regex pattern")
});
/// Deletes every match of `COMBINED_CLEANUP_REGEX` from `html` and returns the
/// result as an owned string.
#[inline]
fn apply_regex_patterns(html: &str) -> String {
    let scrubbed = COMBINED_CLEANUP_REGEX.replace_all(html, "");
    scrubbed.into_owned()
}
/// Converts `html` to plain text with whitespace collapsed to single spaces.
///
/// The input is parsed as a full document. Text is gathered by
/// `extract_text_excluding_skip_tags`, starting from `<body>` when one is found
/// and otherwise from the first element of the document. The fragments are
/// joined with spaces and normalized by `clean_whitespace`.
#[must_use]
pub fn html_to_text(html: &str) -> String {
    let document = Html::parse_document(html);
    let root = document
        .select(&BODY_SELECTOR)
        .next()
        .or_else(|| document.select(&ALL_SELECTOR).next());

    let mut fragments: Vec<String> = Vec::new();
    if let Some(start) = root {
        extract_text_excluding_skip_tags(&start, &mut fragments);
    }
    clean_whitespace(&fragments.join(" "))
}
/// Appends the trimmed, non-empty text nodes found beneath `element` to
/// `text_parts`, in document order.
///
/// Any element whose tag name is listed in `SKIP_TAGS` (compared ASCII
/// case-insensitively) is pruned together with its whole subtree. This applies
/// to `element` itself as well as to every descendant, so text inside a nested
/// `<script>` or `<style>` is never collected.
#[inline]
fn extract_text_excluding_skip_tags(
    element: &scraper::element_ref::ElementRef,
    text_parts: &mut Vec<String>,
) {
    // Depth-first walk with an explicit stack. Children are pushed in reverse
    // so that they are popped, and therefore visited, in document order.
    let mut pending = vec![**element];
    while let Some(node) = pending.pop() {
        let value = node.value();

        if let Some(text) = value.as_text() {
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                text_parts.push(trimmed.to_string());
            }
            continue;
        }

        let skipped = value.as_element().is_some_and(|el| {
            SKIP_TAGS
                .iter()
                .any(|tag| tag.eq_ignore_ascii_case(el.name()))
        });
        if !skipped {
            pending.extend(node.children().rev());
        }
    }
}
/// Extracts documentation from an HTML page as cleaned Markdown.
///
/// Pipeline: pick the main content region (`extract_main_content`), strip
/// unwanted markup (`clean_html`), convert to Markdown with
/// `html2md::parse_html`, then tidy the Markdown (`clean_markdown`).
#[must_use]
pub fn extract_documentation(html: &str) -> String {
    let cleaned_html = clean_html(&extract_main_content(html));
    clean_markdown(&html2md::parse_html(&cleaned_html))
}
#[inline]
fn clean_markdown(markdown: &str) -> String {
let result = SOURCE_LINK_REGEX.replace_all(markdown, Cow::Borrowed(""));
let result = RELATIVE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
let result = ANCHOR_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
let result = MULTIPLE_NEWLINES_REGEX.replace_all(&result, Cow::Borrowed("\n\n"));
result.trim().to_string()
}
/// Returns the serialized HTML of the page's main content region.
///
/// Preference order: the element with id `main-content`, then the element with
/// id `rustdoc_body_wrapper`. When neither is present the input is returned
/// unchanged.
#[inline]
fn extract_main_content(html: &str) -> String {
    let document = Html::parse_document(html);
    document
        .select(&MAIN_CONTENT_SELECTOR)
        .next()
        .or_else(|| document.select(&RUSTDOC_BODY_WRAPPER_SELECTOR).next())
        .map_or_else(|| html.to_string(), |region| region.html())
}
#[must_use]
pub fn extract_search_results(html: &str, item_path: &str) -> String {
let main_content = extract_main_content(html);
let cleaned_html = clean_html(&main_content);
let markdown = html2md::parse_html(&cleaned_html);
let cleaned_markdown = clean_markdown(&markdown);
if cleaned_markdown.trim().is_empty() {
format!("Documentation for '{item_path}' not found")
} else {
format!("## Search Results: {item_path}\n\n{cleaned_markdown}")
}
}
/// Collapses every run of whitespace in `text` into a single space and drops
/// leading and trailing whitespace.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
// Unit tests for the HTML cleaning, text extraction and Markdown helpers above.
#[cfg(test)]
mod tests {
use super::*;
// A <script> element and its body are removed; body text survives.
#[test]
fn test_clean_html_removes_script() {
let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
let cleaned = clean_html(html);
assert!(!cleaned.contains("script"));
assert!(!cleaned.contains("var x"));
assert!(cleaned.contains("Hello"));
}
// A <style> element and its CSS are removed; body text survives.
#[test]
fn test_clean_html_removes_style() {
let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
let cleaned = clean_html(html);
assert!(!cleaned.contains("style"));
assert!(!cleaned.contains(".foo"));
assert!(cleaned.contains("Content"));
}
// Plain-text output contains no angle brackets but keeps the words.
#[test]
fn test_html_to_text_removes_tags() {
let html = "<p>Hello <strong>World</strong>!</p>";
let text = html_to_text(html);
assert!(!text.contains('<'));
assert!(!text.contains('>'));
assert!(text.contains("Hello"));
assert!(text.contains("World"));
}
// NOTE(review): the input contains a bare `&` rather than an entity such as
// `&amp;`, and the assertion passes if any one of the three substrings is
// present, so this check is very permissive.
#[test]
fn test_html_to_text_handles_entities() {
let html = r"<p>Tom & Jerry</p>";
let text = html_to_text(html);
assert!(text.contains('&') || text.contains("Tom") || text.contains("Jerry"));
}
// Leading/trailing whitespace is dropped and inner runs collapse to one space.
#[test]
fn test_clean_whitespace() {
assert_eq!(clean_whitespace(" hello world "), "hello world");
// NOTE(review): as it appears here this line is identical to the one above.
assert_eq!(clean_whitespace(" hello world "), "hello world");
assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
}
// With no dedicated content container, the whole page is converted.
#[test]
fn test_extract_documentation() {
let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
let docs = extract_documentation(html);
assert!(docs.contains("Title"));
assert!(docs.contains("Content"));
}
// Non-empty documentation is wrapped in the "Search Results" heading.
#[test]
fn test_extract_search_results_found() {
let html = "<html><body><h1>Result</h1></body></html>";
let result = extract_search_results(html, "serde::Serialize");
assert!(result.contains("Search Results"));
assert!(result.contains("serde::Serialize"));
assert!(result.contains("Result"));
}
// An empty body yields the "not found" message naming the item.
#[test]
fn test_extract_search_results_not_found() {
let html = "<html><body></body></html>";
let result = extract_search_results(html, "nonexistent");
assert!(result.contains("not found"));
assert!(result.contains("nonexistent"));
}
// <link> tags are stripped by the regex cleanup pass.
#[test]
fn test_clean_html_removes_link_tags() {
let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
let cleaned = clean_html(html);
assert!(
!cleaned.contains("link"),
"link tag should be removed, got: {cleaned}"
);
assert!(
!cleaned.contains("stylesheet"),
"stylesheet should be removed, got: {cleaned}"
);
assert!(
cleaned.contains("Hello"),
"Body content should remain, got: {cleaned}"
);
}
// <meta> tags are stripped by the regex cleanup pass.
#[test]
fn test_clean_html_removes_meta_tags() {
let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
let cleaned = clean_html(html);
assert!(
!cleaned.contains("meta"),
"meta tag should be removed, got: {cleaned}"
);
assert!(
cleaned.contains("Content"),
"Body content should remain, got: {cleaned}"
);
}
// Links to `.html` targets match; `#anchor` targets and a URL that does not
// end in `.html` do not.
#[test]
fn test_relative_link_regex() {
let re = &RELATIVE_LINK_REGEX;
assert!(re.is_match("[module](module/index.html)"));
assert!(re.is_match("[struct](struct.Struct.html)"));
assert!(!re.is_match("[Section](#section)")); assert!(
!re.is_match("[External](https://example.com)"),
"Should not match external URLs"
); }
// Headings, prose, code fences, external links and in-page anchors all
// survive `clean_markdown`.
#[test]
fn test_clean_markdown_preserves_content() {
let markdown = r"# Dioxus
## At a glance
Dioxus is a framework for building cross-platform apps.
## Quick start
To get started with Dioxus:
```
cargo install dioxus-cli
```
[External Link](https://dioxuslabs.com)
[Anchor](#quick-start)
";
let cleaned = clean_markdown(markdown);
assert!(cleaned.contains("Dioxus is a framework"));
assert!(cleaned.contains("At a glance"));
assert!(cleaned.contains("Quick start"));
assert!(cleaned.contains("cargo install"));
assert!(
cleaned.contains("[External Link](https://dioxuslabs.com)"),
"Should preserve external links"
);
assert!(
cleaned.contains("[Anchor](#quick-start)"),
"Should preserve anchor links"
);
}
// End-to-end: only the `#main-content` section is kept, and the script
// nested inside it is removed.
#[test]
fn test_extract_documentation_single_pass_optimization() {
let html = r#"
<!DOCTYPE html>
<html>
<head><title>Test Crate</title></head>
<body>
<nav>Navigation content</nav>
<section id="main-content">
<h1>Test Crate</h1>
<p>This is the main documentation.</p>
<script>console.log('test');</script>
<div class="docblock">
<p>Docblock content here.</p>
</div>
</section>
<footer>Footer content</footer>
</body>
</html>
"#;
let docs = extract_documentation(html);
assert!(docs.contains("Test Crate"), "Should contain title");
assert!(
docs.contains("main documentation"),
"Should contain main content"
);
assert!(
docs.contains("Docblock content"),
"Should preserve docblock"
);
assert!(!docs.contains("Navigation content"), "Should remove nav");
assert!(!docs.contains("Footer content"), "Should remove footer");
assert!(!docs.contains("console.log"), "Should remove script");
}
// End-to-end: search results contain the main section but not the <nav>
// that sits outside it.
#[test]
fn test_extract_search_results_single_pass_optimization() {
let html = r#"
<!DOCTYPE html>
<html>
<body>
<section id="main-content">
<h1>serde::Serialize</h1>
<pre><code>pub trait Serialize { }</code></pre>
<p>Serialize trait documentation.</p>
</section>
<nav>Sidebar</nav>
</body>
</html>
"#;
let result = extract_search_results(html, "serde::Serialize");
assert!(result.contains("Search Results"));
assert!(result.contains("serde::Serialize"));
assert!(result.contains("Serialize trait"));
assert!(!result.contains("Sidebar"));
}
// Several kinds of unwanted element in one document, including an inline
// <script> inside a paragraph and a <footer> nested in an <article>.
#[test]
fn test_clean_html_multiple_skip_tags() {
let html = r"
<html>
<head>
<style>.test { color: red; }</style>
<script>var x = 1;</script>
</head>
<body>
<nav>Navigation</nav>
<article>
<h1>Title</h1>
<p>Content with <script>inline script</script> removed.</p>
<footer>Article footer</footer>
</article>
<footer>Page footer</footer>
</body>
</html>
";
let cleaned = clean_html(html);
assert!(cleaned.contains("Title"));
assert!(cleaned.contains("Content"));
assert!(!cleaned.contains("style"), "Should remove style tags");
assert!(!cleaned.contains("script"), "Should remove script tags");
assert!(!cleaned.contains("Navigation"), "Should remove nav");
assert!(!cleaned.contains("footer"), "Should remove footer");
assert!(!cleaned.contains(".test"), "Should remove CSS content");
assert!(!cleaned.contains("var x"), "Should remove JS content");
}
// Table-driven check: for each removable tag, the tag name no longer appears
// in the output and the sibling paragraph text is kept.
#[test]
fn test_cached_selectors_all_tag_types() {
let test_cases = [
(
"<script>alert('test')</script><p>Content</p>",
"script",
"Content",
),
("<style>.x{}</style><p>Content</p>", "style", "Content"),
(
"<noscript>Enable JS</noscript><p>Content</p>",
"noscript",
"Content",
),
(
"<iframe src=\"x\"></iframe><p>Content</p>",
"iframe",
"Content",
),
("<nav><a>Link</a></nav><p>Content</p>", "nav", "Content"),
("<header>Head</header><p>Content</p>", "header", "Content"),
("<footer>Foot</footer><p>Content</p>", "footer", "Content"),
("<aside>Sidebar</aside><p>Content</p>", "aside", "Content"),
("<button>Click</button><p>Content</p>", "button", "Content"),
];
for (html, tag_to_remove, expected_content) in test_cases {
let cleaned = clean_html(html);
assert!(
!cleaned.contains(tag_to_remove),
"Should remove {tag_to_remove} tag"
);
assert!(
cleaned.contains(expected_content),
"Should preserve {expected_content}"
);
}
}
}