use crate::error::Result;
use regex::Regex;
use scraper::{Html, Selector};
use std::sync::LazyLock;
use url::Url;
/// Declares a lazily compiled, file-level [`Regex`] static.
///
/// `cached_regex!(NAME, pattern)` expands to `static NAME: LazyLock<Regex>`.
/// The pattern is compiled on first access; an invalid pattern panics at that
/// point via `unwrap`.
macro_rules! cached_regex {
    ($ident:ident, $re:expr) => {
        static $ident: LazyLock<Regex> = LazyLock::new(|| {
            let compiled = Regex::new($re);
            compiled.unwrap()
        });
    };
}
// --- Whole-element strippers (case-insensitive, `.` matches newlines). ---
// Each matches an opening tag, its content (non-greedy) and the closing tag.
cached_regex!(RE_SCRIPT, r"(?is)<script[^>]*?>.*?</script>");
cached_regex!(RE_STYLE, r"(?is)<style[^>]*?>.*?</style>");
cached_regex!(RE_NOSCRIPT, r"(?is)<noscript[^>]*?>.*?</noscript>");
cached_regex!(RE_SVG, r"(?is)<svg[^>]*?>.*?</svg>");
cached_regex!(RE_HEAD, r"(?is)<head[^>]*?>.*?</head>");
// HTML comments, including multi-line ones.
cached_regex!(RE_COMMENT, r"(?is)<!--.*?-->");
// --- Markdown clean-up patterns. ---
// Setext headings: a text line followed by a line of three or more `=` (H1) or `-` (H2).
cached_regex!(RE_SETEXT_H1, r"(?m)^[ \t]*(.+)\n[ \t]*={3,}\s*$");
cached_regex!(RE_SETEXT_H2, r"(?m)^[ \t]*(.+)\n[ \t]*-{3,}\s*$");
// A backslash-escaped tag such as `\<div\>` on a single line.
cached_regex!(RE_ESCAPED_TAG, r"\\</?[a-zA-Z!][^\n>]*?\\?>");
// A `:root{--...}` CSS custom-property block.
cached_regex!(RE_CSS_ROOT, r":root\{--[^}]+\}");
// Markdown image whose URL is an inline base64 `data:image/...` payload; group 1 is the `![alt]` part.
cached_regex!(RE_BASE64_IMG, r"(!\[[^\]]*\])\(data:image/[^;]+;base64,[^)]*\)");
// Markdown link with an empty (or whitespace-only) destination.
cached_regex!(RE_EMPTY_LINK, r"\[([^\]]*)\]\(\s*\)");
// Three or more consecutive `*` list lines with no item text.
cached_regex!(RE_EMPTY_LIST_RUN, r"(?m)(?:^\s*\*\s*\n){3,}");
// Two or more blank (whitespace-only) lines in a row.
cached_regex!(RE_COLLAPSE_NEWLINES, r"\n\s*\n\s*\n+");
// --- Residual HTML inside Markdown. ---
// `<img ...>` tag plus extractors for its quoted `src` and `alt` attributes.
cached_regex!(RE_IMG_TAG, r#"<img\s[^>]*?>"#);
cached_regex!(RE_IMG_SRC, r#"src\s*=\s*["']([^"']+)["']"#);
cached_regex!(RE_IMG_ALT, r#"alt\s*=\s*["']([^"']*?)["']"#);
// Any opening or closing tag, with or without attributes.
cached_regex!(RE_CATCHALL_TAG, r"</?[a-zA-Z][a-zA-Z0-9]*(?:\s[^>]*)?>");
// Three or more consecutive newline characters.
cached_regex!(RE_MULTI_NEWLINE, r"\n{3,}");
// `<a ... href="...">text</a>`; group 1 is the href, group 2 the inner content.
cached_regex!(RE_ANCHOR_TAG, r#"(?is)<a\s[^>]*?href\s*=\s*["']([^"']*)["'][^>]*?>(.*?)</a>"#);
// `<pre><code ...>body</code></pre>`; group 1 is the `<code>` attribute text, group 2 the body.
cached_regex!(RE_PRE_CODE, r#"(?is)<pre[^>]*?>\s*<code([^>]*)>(.*?)</code>\s*</pre>"#);
// `<pre>body</pre>` without requiring an inner `<code>`.
cached_regex!(RE_PRE_BARE, r#"(?is)<pre[^>]*?>(.*?)</pre>"#);
// `language-xyz` / `lang-xyz` / `highlight-xyz` class token; group 1 is the language name.
cached_regex!(RE_LANG_CLASS, r#"(?i)\b(?:language|lang|highlight)-([a-zA-Z0-9_+-]+)"#);
// Markdown link whose text is an ATX heading: `[## Title](url)`.
cached_regex!(RE_HEADING_IN_LINK, r"\[\s*(#{1,6})\s+(.+?)\s*#{0,6}\s*\]\(([^)]+)\)");
// Link with empty text `[](url)` that is not an image; group 1 is the preceding character (or start).
cached_regex!(RE_EMPTY_TEXT_LINK, r"(^|[^!])\[\]\([^)]+\)");
// Markdown image whose URL contains a well-known tracking/spacer GIF file name.
cached_regex!(RE_TRACKING_PIXEL, r"!\[[^\]]*\]\([^)]*(?:s_1x2\.gif|pixel\.gif|spacer\.gif|blank\.gif|clear\.gif)[^)]*\)");
pub fn html_to_markdown(html: &str, base_url: &str, only_main: bool) -> Result<String> {
let trimmed = html.trim();
if (trimmed.starts_with('{') && trimmed.ends_with('}'))
|| (trimmed.starts_with('[') && trimmed.ends_with(']'))
{
if serde_json::from_str::<serde_json::Value>(trimmed).is_ok() {
return Ok(format!("# JSON Response\n\n```json\n{}\n```", trimmed));
}
}
if !trimmed.is_empty() && !trimmed.contains('<') {
return Ok(trimmed.to_string());
}
let content = if only_main {
extract_main_content_html(html)?
} else {
html.to_string()
};
let content = crate::format::image_processing::rescue_noscript_images(&content);
let content = strip_non_content_tags(&content);
let content = crate::format::image_processing::resolve_picture_elements(&content);
let content = preprocess_html_for_conversion(&content, base_url);
let content = crate::format::image_processing::resolve_srcsets(&content);
let content = crate::format::image_processing::resolve_lazy_images(&content);
let content = crate::format::image_processing::resolve_video_posters(&content);
let content = strip_layout_tables(&content);
let content = strip_excessive_tables(&content);
let markdown = safe_parse_html(&content);
let markdown = if markdown.len() > content.len() * 3 && content.len() > 10000 {
static RE_INDIVIDUAL_TABLE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?is)<table[^>]*>(.*?)</table>").unwrap()
});
let selective = RE_INDIVIDUAL_TABLE.replace_all(&content, |caps: ®ex::Captures| {
let full_match = &caps[0];
let table_attrs = full_match.split('>').next().unwrap_or("");
let is_important = table_attrs.contains("infobox")
|| table_attrs.contains("wikitable")
|| table_attrs.contains("data-table");
let row_count = full_match.matches("<tr").count();
if is_important || row_count <= 30 {
full_match.to_string()
} else {
"\n".to_string()
}
}).to_string();
safe_parse_html(&selective)
} else {
markdown
};
let markdown = compress_markdown_tables(&markdown);
const MAX_MARKDOWN_BYTES: usize = 500_000;
let markdown = if markdown.len() > MAX_MARKDOWN_BYTES {
let truncated = &markdown[..MAX_MARKDOWN_BYTES];
let cutoff = truncated.rfind('\n').unwrap_or(MAX_MARKDOWN_BYTES);
format!(
"{}\n\n[Content truncated: {} chars total]",
&markdown[..cutoff],
markdown.len()
)
} else {
markdown
};
let cleaned = clean_markdown(&markdown);
let escaped = escape_multiline_links(&cleaned);
let no_skip_links = remove_accessibility_links(&escaped);
let collapsed = RE_COLLAPSE_NEWLINES
.replace_all(&no_skip_links, "\n\n")
.to_string();
let collapsed = if !collapsed.lines().any(|l| l.trim_start().starts_with('#')) {
if let Some(title) = extract_title_from_html(html) {
if !title.is_empty() {
format!("# {}\n\n{}", title.trim(), collapsed)
} else {
collapsed
}
} else {
collapsed
}
} else {
collapsed
};
if only_main && collapsed.trim().len() < 50 {
return html_to_markdown(html, base_url, false);
}
let portable = convert_urls_to_absolute(&collapsed, base_url)?;
Ok(portable)
}
/// Returns the trimmed text of the first `<title>` element, with a handful of
/// common HTML entities decoded, or `None` when no `<title>` is present.
///
/// The returned string may be empty (for `<title></title>`).
fn extract_title_from_html(html: &str) -> Option<String> {
    static RE_TITLE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?is)<title[^>]*>(.*?)</title>").unwrap()
    });
    RE_TITLE.captures(html).map(|caps| {
        caps[1]
            .trim()
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", "\"")
            .replace("&#39;", "'")
            .replace("&nbsp;", " ")
            // `&amp;` is decoded last so that an escaped entity such as
            // `&amp;lt;` yields the literal text `&lt;` rather than `<`.
            .replace("&amp;", "&")
    })
}
/// Extracts the main-content portion of an HTML page.
///
/// Strategy:
/// 1. Try a prioritised list of "main content" CSS selectors. The first
///    selector whose first match serialises to at least 10% of the input
///    length wins; nested navigation is removed from it and it is returned.
/// 2. Otherwise collect the serialised HTML of every element matching a list
///    of boilerplate selectors and delete each non-blank line of that text
///    from the input (plain substring removal).
///
/// If the removal leaves nothing but whitespace, the original HTML is
/// returned. The visible code path never produces an `Err`.
pub fn extract_main_content_html(html: &str) -> Result<String> {
    static REMOVE_SELECTORS_PARSED: LazyLock<Vec<Selector>> = LazyLock::new(|| {
        [
            ".Layout-sidebar", ".file-navigation", ".BorderGrid",
            ".Layout-sidebar-left", ".Layout-sidebar-right", ".repository-content",
            ".file-tree", ".js-file-line-container", ".blob-wrapper",
            ".contributors-wrapper", ".discussion-sidebar",
            "nav", "header", "footer", "aside",
            ".navigation", ".sidebar", ".menu", ".header", ".footer",
            "#header", "#footer", "#navigation",
            ".docs-sidebar", ".doc-sidebar", ".sidebar-nav",
            ".toc-sidebar", ".page-sidebar", ".left-sidebar",
            ".side-nav", ".sidenav",
            "#sidebar", "#toc",
            "[role='navigation']", "[role='complementary']",
            ".toc", ".table-of-contents",
            ".skip-link", ".skip-to-content",
            ".mw-editsection", "#mw-panel", "#mw-head", ".navbox", ".catlinks",
            ".mw-indicators", ".sistersitebox", "#p-lang-btn",
            ".vector-page-toolbar", ".vector-column-start",
            ".cookie-banner", ".cookie-consent", ".cookie-notice",
            "#cookie-banner", "#cookie-consent",
            ".share-buttons", ".social-share", ".social-links",
            ".ad", ".advertisement", ".ads",
            ".breadcrumb", ".breadcrumbs",
            ".search-form", ".search-box",
            ".modal", ".popup", "#modal", ".overlay",
            ".widget", "#widget",
            ".lang-selector", ".language", "#language-selector",
            ".top-bar", ".bottom-bar",
            ".gh-header", "#gh-header",
            "script", "style", "noscript", "svg",
        ]
        .iter()
        .filter_map(|s| Selector::parse(s).ok())
        .collect()
    });

    // Ordered from most specific to most generic; the first acceptable hit wins.
    let main_selectors = [
        "#readme", ".markdown-body", "#mw-content-text",
        ".mw-parser-output",
        "main article",
        "[role='main'] article",
        ".docs-content",
        ".doc-content",
        "[data-docs-content]",
        ".prose", ".article-body",
        "main",
        "article",
        "[role='main']",
        ".main-content",
        "#main-content",
        ".content",
        "#content",
        ".post-content",
        ".entry-content",
        ".article-content",
        ".page-content",
        ".body-content",
        "#inside",
        ".stories",
        ".itemlist",
    ];
    // Substring-matching attribute selectors for common boilerplate widgets.
    let attr_selectors = [
        "[class*='cookie']", "[aria-label='breadcrumb']",
        "[class*='cart']", "[class*='wishlist']", "[class*='account-']",
        "[class*='sponsored']", "[class*='banner']",
        "[class*='notification']", "[class*='alert']",
    ];

    // Parsed once and shared by both phases below.
    let document = Html::parse_document(html);

    // A candidate must hold at least a tenth of the page to count as "main".
    let min_size = html.len() / 10;
    for selector_str in main_selectors.iter() {
        let Ok(selector) = Selector::parse(selector_str) else {
            continue;
        };
        let Some(element) = document.select(&selector).next() else {
            continue;
        };
        let content = element.html();
        if content.len() >= min_size {
            return Ok(remove_nested_nav(&content));
        }
    }

    // Fallback: gather the serialised markup of every boilerplate element.
    let mut to_remove = String::new();
    for selector in REMOVE_SELECTORS_PARSED.iter() {
        for element in document.select(selector) {
            to_remove.push_str(&element.html());
        }
    }
    for selector_str in attr_selectors.iter() {
        if let Ok(selector) = Selector::parse(selector_str) {
            for element in document.select(&selector) {
                to_remove.push_str(&element.html());
            }
        }
    }

    // Remove every occurrence of each non-blank boilerplate line from the page.
    let mut cleaned_html = html.to_string();
    for line in to_remove.lines() {
        if !line.trim().is_empty() {
            cleaned_html = cleaned_html.replace(line, "");
        }
    }

    Ok(if cleaned_html.trim().is_empty() {
        html.to_string()
    } else {
        cleaned_html
    })
}
/// Deletes navigation-like elements from an HTML fragment.
///
/// Applied in order: `<nav>` blocks, `<aside>` blocks, `<div>`s whose class
/// names a sidebar variant, and any element with a `navigation` or
/// `complementary` role. Matching is regex-based and non-greedy, so each
/// pattern ends at the first closing tag it accepts.
fn remove_nested_nav(html: &str) -> String {
    static NESTED_NAV_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
        [
            r"(?is)<nav[^>]*>.*?</nav>",
            r"(?is)<aside[^>]*>.*?</aside>",
            r#"(?is)<div[^>]*class\s*=\s*["'][^"']*\b(?:sidebar|side-nav|sidenav|sidebar-nav|toc-sidebar|page-sidebar)\b[^"']*["'][^>]*>.*?</div>"#,
            r#"(?is)<\w+[^>]*role\s*=\s*["'](?:navigation|complementary)["'][^>]*>.*?</\w+>"#,
        ]
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
    });

    NESTED_NAV_PATTERNS
        .iter()
        .fold(html.to_string(), |acc, pattern| {
            pattern.replace_all(&acc, "").into_owned()
        })
}
/// Runs `html2md::parse_html`, moving large inputs onto a dedicated thread.
///
/// Inputs under 500 000 bytes are converted inline. Larger inputs are
/// converted on a thread named `html2md-parser` with a 32 MiB stack. If that
/// thread cannot be spawned or panics, a warning is logged and the plain text
/// content of the parsed document (text nodes joined by spaces) is returned.
fn safe_parse_html(html: &str) -> String {
    const INLINE_PARSE_LIMIT: usize = 500_000;
    const PARSER_STACK_BYTES: usize = 32 * 1024 * 1024;

    if html.len() < INLINE_PARSE_LIMIT {
        return html2md::parse_html(html);
    }

    let owned_html = html.to_string();
    let spawned = std::thread::Builder::new()
        .name("html2md-parser".to_string())
        .stack_size(PARSER_STACK_BYTES)
        .spawn(move || html2md::parse_html(&owned_html));

    let outcome = match spawned {
        Ok(handle) => handle
            .join()
            .map_err(|_| std::io::Error::other("html2md thread panicked")),
        Err(spawn_error) => Err(spawn_error),
    };

    match outcome {
        Ok(markdown) => markdown,
        Err(e) => {
            tracing::warn!("html2md failed with large HTML ({}KB): {}", html.len() / 1024, e);
            let document = Html::parse_document(html);
            let pieces: Vec<_> = document.root_element().text().collect();
            pieces.join(" ")
        }
    }
}
fn preprocess_html_for_conversion(html: &str, base_url: &str) -> String {
let base = match Url::parse(base_url) {
Ok(u) => u,
Err(_) => return html.to_string(),
};
static RE_HREF: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"(<a\s[^>]*?href\s*=\s*["'])([^"']+)(["'])"#).unwrap()
});
let result = RE_HREF.replace_all(html, |caps: ®ex::Captures| {
let prefix = &caps[1];
let href = &caps[2];
let suffix = &caps[3];
if href.starts_with("http://") || href.starts_with("https://") || href.starts_with("data:") || href.starts_with("javascript:") {
caps[0].to_string()
} else if href.starts_with('#') {
let base_str = base.as_str().split('#').next().unwrap_or(base.as_str());
format!("{}{}{}{}", prefix, base_str, href, suffix)
} else if href.starts_with("//") {
format!("{}https:{}{}", prefix, href, suffix)
} else {
match base.join(href) {
Ok(abs) => format!("{}{}{}", prefix, abs, suffix),
Err(_) => caps[0].to_string(),
}
}
}).to_string();
static RE_IMG_SRC_ATTR: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"(<img\s[^>]*?src\s*=\s*["'])([^"']+)(["'])"#).unwrap()
});
let result = RE_IMG_SRC_ATTR.replace_all(&result, |caps: ®ex::Captures| {
let prefix = &caps[1];
let src = &caps[2];
let suffix = &caps[3];
if src.starts_with("http://") || src.starts_with("https://") || src.starts_with("data:") {
caps[0].to_string()
} else if src.starts_with("//") {
format!("{}https:{}{}", prefix, src, suffix)
} else {
match base.join(src) {
Ok(abs) => format!("{}{}{}", prefix, abs, suffix),
Err(_) => caps[0].to_string(),
}
}
}).to_string();
static RE_GUTTER: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"(?is)<(?:td|span|div)[^>]*class\s*=\s*["'][^"']*(?:gutter|line-number|linenumber|hljs-ln-numbers|blob-num)[^"']*["'][^>]*>.*?</(?:td|span|div)>"#).unwrap()
});
let result = RE_GUTTER.replace_all(&result, "").to_string();
result
}
/// Removes `<script>`, `<style>`, `<noscript>`, `<svg>` and `<head>` elements
/// (with their content) and HTML comments, in that order.
fn strip_non_content_tags(html: &str) -> String {
    let strippers: [&Regex; 6] = [
        &RE_SCRIPT,
        &RE_STYLE,
        &RE_NOSCRIPT,
        &RE_SVG,
        &RE_HEAD,
        &RE_COMMENT,
    ];
    strippers.iter().fold(html.to_string(), |acc, pattern| {
        pattern.replace_all(&acc, "").into_owned()
    })
}
/// Replaces presentational tables with a `<div>` holding only their text.
///
/// A table counts as "layout" when its opening tag carries `cellpadding`,
/// `cellspacing` or `border=0` and its markup contains no `<th`. Each such
/// table is swapped for
/// `<div class="extracted-from-layout-table">` containing its text nodes
/// joined by newlines. The scan repeats until a pass finds nothing to replace.
fn strip_layout_tables(html: &str) -> String {
    static RE_LAYOUT_TBL: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r#"(?s)<table[^>]*(cellpadding|cellspacing|border=["']?0["']?)[^>]*>.*?</table>"#).unwrap()
    });

    let mut current = html.to_string();
    loop {
        let swaps: Vec<(String, String)> = RE_LAYOUT_TBL
            .find_iter(&current)
            .map(|found| found.as_str())
            // Tables with header cells are treated as real data tables.
            .filter(|table_html| !table_html.contains("<th"))
            .map(|table_html| {
                let fragment = Html::parse_fragment(table_html);
                let text_nodes: Vec<_> = fragment.root_element().text().collect();
                let replacement = format!(
                    "<div class=\"extracted-from-layout-table\">\n{}\n</div>",
                    text_nodes.join("\n")
                );
                (table_html.to_string(), replacement)
            })
            .collect();

        if swaps.is_empty() {
            return current;
        }
        for (old, new) in swaps {
            current = current.replace(&old, &new);
        }
    }
}
fn strip_excessive_tables(html: &str) -> String {
if html.len() < 50_000 {
return html.to_string();
}
static RE_TABLE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?is)<table[^>]*>.*?</table>").unwrap()
});
let total_table_bytes: usize = RE_TABLE_BLOCK
.find_iter(html)
.map(|m| m.as_str().len())
.sum();
if total_table_bytes < html.len() / 2 {
return html.to_string();
}
RE_TABLE_BLOCK
.replace_all(html, |caps: ®ex::Captures| {
let table_html = &caps[0];
let table_attrs = table_html.split('>').next().unwrap_or("");
let is_data = table_attrs.contains("infobox")
|| table_attrs.contains("wikitable")
|| table_attrs.contains("data-table")
|| table_attrs.contains("sortable");
let has_headers = table_html.contains("<th") || table_html.contains("<th>");
let row_count = table_html.matches("<tr").count();
if is_data || (has_headers && row_count <= 50) || row_count <= 15 {
table_html.to_string()
} else {
let doc = Html::parse_fragment(table_html);
let text: String = doc
.root_element()
.text()
.collect::<Vec<_>>()
.join(" ");
let trimmed = text.trim();
if trimmed.is_empty() {
"\n".to_string()
} else {
format!("\n{}\n", trimmed)
}
}
})
.to_string()
}
/// Normalises the padding of Markdown pipe-table rows.
///
/// A line is a table row when, after trimming, it starts and ends with `|`.
/// Each cell of such a row is trimmed and re-emitted with one space on either
/// side (empty cells stay empty). In a delimiter row, where every non-empty
/// cell consists only of `-`, `:` and spaces and contains at least one `-`,
/// each cell is reduced to `---`, `:---`, `---:` or `:---:` according to its
/// alignment colons.
///
/// A dash-only cell in an ordinary data row (for example a `-` placeholder) is
/// kept as written; only whole delimiter rows are rewritten. Lines that are
/// not table rows pass through untouched, and the result ends with a newline
/// only if the input did.
fn compress_markdown_tables(markdown: &str) -> String {
    /// True for cells such as `---`, `:--`, `--:` or `: - :`.
    fn is_delimiter_cell(cell: &str) -> bool {
        cell.contains('-') && cell.chars().all(|c| matches!(c, '-' | ':' | ' '))
    }

    /// Canonical delimiter for a cell, based on its leading/trailing colon.
    fn canonical_delimiter(cell: &str) -> &'static str {
        match (cell.starts_with(':'), cell.ends_with(':')) {
            (true, true) => " :---: ",
            (false, true) => " ---: ",
            (true, false) => " :--- ",
            (false, false) => " --- ",
        }
    }

    let mut result = String::with_capacity(markdown.len());
    for line in markdown.lines() {
        let trimmed = line.trim();
        if trimmed.starts_with('|') && trimmed.ends_with('|') {
            let cells: Vec<&str> = trimmed.split('|').map(str::trim).collect();
            let is_delimiter_row = cells.iter().any(|cell| !cell.is_empty())
                && cells
                    .iter()
                    .filter(|cell| !cell.is_empty())
                    .all(|cell| is_delimiter_cell(cell));

            for (index, cell) in cells.iter().enumerate() {
                if index > 0 {
                    result.push('|');
                }
                if cell.is_empty() {
                    continue;
                }
                if is_delimiter_row {
                    result.push_str(canonical_delimiter(cell));
                } else {
                    result.push(' ');
                    result.push_str(cell);
                    result.push(' ');
                }
            }
        } else {
            result.push_str(line);
        }
        result.push('\n');
    }
    // `lines()` drops the final newline; do not invent one the input lacked.
    if result.ends_with('\n') && !markdown.ends_with('\n') {
        result.pop();
    }
    result
}
fn clean_markdown(markdown: &str) -> String {
let cleaned = clean_html_from_markdown(markdown);
let cleaned = strip_invisible_unicode(&cleaned);
let cleaned = RE_SETEXT_H1.replace_all(&cleaned, "# $1").to_string();
let cleaned = RE_SETEXT_H2.replace_all(&cleaned, "## $1").to_string();
static RE_TRAILING_HASHES: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?m)^(#{1,6}\s+.+?)\s+#+\s*$").unwrap());
let cleaned = RE_TRAILING_HASHES.replace_all(&cleaned, "$1").to_string();
let lines: Vec<String> = cleaned.lines().map(|l| l.trim_end().to_string()).collect();
let mut result = Vec::new();
let mut blank_count = 0;
for line in lines.iter() {
if line.trim().is_empty() {
blank_count += 1;
if blank_count <= 2 {
result.push(line.clone());
}
} else {
blank_count = 0;
result.push(line.clone());
}
}
let joined = result.join("\n").trim().to_string();
let joined = RE_ESCAPED_TAG.replace_all(&joined, "").to_string();
let joined = joined.replace("\\ ", " ").replace("\\\\", "");
let joined = RE_CSS_ROOT.replace_all(&joined, "").to_string();
let joined = RE_BASE64_IMG.replace_all(&joined, "$1(data:image-removed)").to_string();
let joined = RE_EMPTY_LINK.replace_all(&joined, "").to_string();
let joined = RE_EMPTY_LIST_RUN.replace_all(&joined, "\n").to_string();
static UI_NOISE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(concat!(
r"(?m)^\s*(?:",
r"Ask about this section|Copy for LLM|View as Markdown|Copy as Markdown",
r"|Open (?:Markdown|in Claude)(?:\s*Ask Docs AI)?(?:\s*Open in Claude)?",
r"|Ask Docs AI\s*Open in Claude",
r"|Was this (?:section |page )?helpful\s*(?:to you)?\??",
r"|(?:Share|Tweet|Pin it|Email)",
r"|(?:Table of [Cc]ontents|In this article|On this page)",
r"|Show more|Read more|Load more|See all|Expand all|Collapse all",
r"|Scroll to top|Back to top",
r"|Primary navigation",
r"|Loading\.\.\.",
r"|Sponsored",
r"|Notifications",
r"|Expand (?:Cart|Watch List|My eBay)",
r"|Shop by category",
r"|All Categories",
r"|Toggle the table of contents",
r"|move to sidebar\s*hide",
r"|\d+\s+languages?",
r"|Edit links?",
r"|Edit this page on GitHub\s*",
r"|Was this page helpful\s*(?:to you)?\??\s*(?:Yes|No)?",
r"|Suggest (?:changes|edits?)",
r"|Report (?:an? )?(?:issue|bug)",
r")\s*$"
)).unwrap()
});
let cleaned = UI_NOISE.replace_all(&joined, "").to_string();
static RE_EDIT_LINKS: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\s*\[\[edit\]\([^)]*\)\]").unwrap()
});
let cleaned = RE_EDIT_LINKS.replace_all(&cleaned, "").to_string();
let cleaned = RE_HEADING_IN_LINK
.replace_all(&cleaned, "$1 [$2]($3)")
.to_string();
let cleaned = RE_EMPTY_TEXT_LINK.replace_all(&cleaned, "$1").to_string();
let cleaned = RE_TRACKING_PIXEL.replace_all(&cleaned, "").to_string();
static RE_LEAKED_JS: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?m)^\s*(?:var|let|const)\s+\w+(?:\\?\w+)*\s*=.*$").unwrap()
});
let cleaned = RE_LEAKED_JS.replace_all(&cleaned, "").to_string();
static RE_JS_PROP_ASSIGN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r#"(?m)^\s*\w+(?:\\?\w+)*(?:\.\w+(?:\\?\w+)*|\[['"][^'"]*['"]\])\s*=\s*.*;\s*$"#).unwrap()
});
let cleaned = RE_JS_PROP_ASSIGN.replace_all(&cleaned, "").to_string();
static RE_JS_FUNC_CALL: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?m)^\s*\w+(?:\\?\w+)*(?:\.\w+(?:\\?\w+)*)*\([^)]*\)\s*;\s*$").unwrap()
});
let cleaned = RE_JS_FUNC_CALL.replace_all(&cleaned, "").to_string();
static RE_COPYRIGHT: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?m)^\s*Copyright\s+©.*$").unwrap()
});
let cleaned = RE_COPYRIGHT.replace_all(&cleaned, "").to_string();
static RE_LINK_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\[\s{2,}([^\]]*?)\s{2,}\]\(").unwrap()
});
let cleaned = RE_LINK_WHITESPACE.replace_all(&cleaned, "[$1](").to_string();
static RE_LINK_INNER_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\[([^\]]*?)\s{2,}([^\]]*?)\]\(").unwrap()
});
let mut cleaned = cleaned;
for _ in 0..3 {
cleaned = RE_LINK_INNER_WHITESPACE.replace_all(&cleaned, "[$1 $2](").to_string();
}
static RE_LINK_TEXT: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\[([^\]]+)\]\(").unwrap()
});
let cleaned = RE_LINK_TEXT.replace_all(&cleaned, |caps: ®ex::Captures| {
let text = caps[1].trim();
let words: Vec<&str> = text.split_whitespace().collect();
let len = words.len();
if len >= 2 && len.is_multiple_of(2) {
let half = len / 2;
if words[..half] == words[half..] {
return format!("[{}](", words[..half].join(" "));
}
}
format!("[{}](", text)
}).to_string();
let cleaned = {
let lines: Vec<&str> = cleaned.lines().collect();
let mut result_lines: Vec<&str> = Vec::with_capacity(lines.len());
let mut prev_item: Option<&str> = None;
let mut repeat_count = 0u32;
for line in &lines {
let trimmed = line.trim();
if trimmed.starts_with("* ") || trimmed.starts_with("- ") {
if Some(trimmed) == prev_item {
repeat_count += 1;
if repeat_count < 2 {
result_lines.push(line);
}
} else {
prev_item = Some(trimmed);
repeat_count = 0;
result_lines.push(line);
}
} else {
if !trimmed.is_empty() {
prev_item = None;
repeat_count = 0;
}
result_lines.push(line);
}
}
result_lines.join("\n")
};
RE_COLLAPSE_NEWLINES.replace_all(&cleaned, "\n\n").to_string()
}
/// Returns `text` without the code points U+200B, U+200C, U+200D, U+2060,
/// U+FEFF and U+FFFE.
fn strip_invisible_unicode(text: &str) -> String {
    text.chars()
        .filter(|ch| {
            !matches!(
                ch,
                '\u{200B}' | '\u{FEFF}' | '\u{200C}' | '\u{200D}' | '\u{2060}' | '\u{FFFE}'
            )
        })
        .collect()
}
/// Decodes a fixed set of common HTML entities in a single left-to-right pass.
///
/// Each `&` is checked against the table below; a recognised entity is
/// replaced and scanning resumes after it, so the output is never re-scanned.
/// This means an escaped entity such as `&amp;lt;` decodes to the literal
/// text `&lt;`, not to `<`. Unrecognised `&` sequences are copied verbatim.
fn decode_html_entities(text: &str) -> String {
    const ENTITIES: &[(&str, &str)] = &[
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&apos;", "'"),
        ("&nbsp;", " "),
        ("&ndash;", "\u{2013}"),
        ("&mdash;", "\u{2014}"),
        ("&hellip;", "\u{2026}"),
        ("&lsquo;", "\u{2018}"),
        ("&rsquo;", "\u{2019}"),
        ("&ldquo;", "\u{201C}"),
        ("&rdquo;", "\u{201D}"),
        ("&bull;", "\u{2022}"),
        ("&middot;", "\u{00B7}"),
        ("&copy;", "\u{00A9}"),
        ("&reg;", "\u{00AE}"),
        ("&trade;", "\u{2122}"),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match ENTITIES.iter().find(|(entity, _)| tail.starts_with(entity)) {
            Some((entity, replacement)) => {
                decoded.push_str(replacement);
                rest = &tail[entity.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand and move on.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
fn clean_html_from_markdown(text: &str) -> String {
static RE_CODE_FENCE_LOCAL: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?s)```[^\n]*\n.*?```").unwrap());
static RE_INLINE_CODE_SPAN: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"`[^`\n]+?`").unwrap());
let mut code_blocks: Vec<String> = Vec::new();
let text = RE_CODE_FENCE_LOCAL
.replace_all(text, |caps: ®ex::Captures| {
let placeholder = format!("\x00CODE_FENCE_{}\x00", code_blocks.len());
code_blocks.push(caps[0].to_string());
placeholder
})
.to_string();
let mut inline_code_spans: Vec<String> = Vec::new();
let text = RE_INLINE_CODE_SPAN
.replace_all(&text, |caps: ®ex::Captures| {
let placeholder = format!("\x00INLINE_CODE_{}\x00", inline_code_spans.len());
inline_code_spans.push(caps[0].to_string());
placeholder
})
.to_string();
let mut result = RE_PRE_CODE
.replace_all(&text, |caps: ®ex::Captures| {
let attrs = caps.get(1).map_or("", |m| m.as_str());
let lang = RE_LANG_CLASS
.captures(attrs)
.and_then(|c| c.get(1))
.map_or("", |m| m.as_str());
let code_content = decode_html_entities(caps.get(2).map_or("", |m| m.as_str()));
let trimmed = code_content.trim();
if trimmed.is_empty() {
String::new()
} else {
format!("\n```{}\n{}\n```\n", lang, trimmed)
}
})
.to_string();
result = RE_PRE_BARE
.replace_all(&result, |caps: ®ex::Captures| {
let content = caps.get(1).map_or("", |m| m.as_str()).trim();
if content.is_empty() {
String::new()
} else {
format!("\n```\n{}\n```\n", decode_html_entities(content))
}
})
.to_string();
result = RE_ANCHOR_TAG
.replace_all(&result, |caps: ®ex::Captures| {
let href = &caps[1];
let link_text = caps[2].trim();
if link_text.is_empty() || href.is_empty() || href.starts_with("javascript:") {
link_text.to_string()
} else {
format!("[{}]({})", link_text, href)
}
})
.to_string();
result = RE_IMG_TAG
.replace_all(&result, |caps: ®ex::Captures| {
let img_tag = &caps[0];
let src = RE_IMG_SRC
.captures(img_tag)
.and_then(|c| c.get(1))
.map(|m| m.as_str())
.unwrap_or("");
let alt = RE_IMG_ALT
.captures(img_tag)
.and_then(|c| c.get(1))
.map(|m| m.as_str())
.unwrap_or("");
if !src.is_empty() {
format!("", alt, src)
} else {
String::new()
}
})
.to_string();
static RE_MD_LINK_WITH_TAG: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\[([^\]]*<[a-zA-Z][a-zA-Z0-9]*[^]]*)\]\(([^)]+)\)").unwrap()
});
static RE_BARE_HTML_TAG_NAME: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"<(/?)([a-zA-Z][a-zA-Z0-9]*)>").unwrap()
});
result = RE_MD_LINK_WITH_TAG
.replace_all(&result, |caps: ®ex::Captures| {
let link_text = &caps[1];
let url = &caps[2];
let protected = RE_BARE_HTML_TAG_NAME.replace_all(link_text, "`<$1$2>`");
format!("[{}]({})", protected, url)
})
.to_string();
static RE_INLINE_TAG_REF: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"([a-zA-Z.,;:!?\s`])(</?[a-zA-Z][a-zA-Z0-9]*>)([a-zA-Z.,;:!?\s`])").unwrap()
});
static FORMATTING_TAGS: &[&str] = &[
"em", "strong", "b", "i", "u", "s", "code", "kbd", "samp", "var",
"mark", "small", "sup", "sub", "abbr", "cite", "dfn", "time", "data",
"del", "ins", "q",
];
for _ in 0..2 {
result = RE_INLINE_TAG_REF
.replace_all(&result, |caps: ®ex::Captures| {
let pre = &caps[1];
let tag = &caps[2];
let post = &caps[3];
let tag_name = tag.trim_start_matches('<')
.trim_start_matches('/')
.trim_end_matches('>')
.to_lowercase();
if FORMATTING_TAGS.contains(&tag_name.as_str()) {
format!("{}{}{}", pre, tag, post)
} else {
format!("{}`{}`{}", pre, tag, post)
}
})
.to_string();
}
static TAG_PATTERNS: LazyLock<Vec<(Regex, &str)>> = LazyLock::new(|| {
vec![
(Regex::new(r"</?div[^>]*?>").unwrap(), ""),
(Regex::new(r"</?span[^>]*?>").unwrap(), ""),
(Regex::new(r"</?p[^>]*?>").unwrap(), "\n"),
(Regex::new(r"<br\s*/?>\s*").unwrap(), "\n"),
(Regex::new(r"</?section[^>]*?>").unwrap(), ""),
(Regex::new(r"</?article[^>]*?>").unwrap(), ""),
(Regex::new(r"</?header[^>]*?>").unwrap(), ""),
(Regex::new(r"</?footer[^>]*?>").unwrap(), ""),
(Regex::new(r"</?nav[^>]*?>").unwrap(), ""),
(Regex::new(r"</?aside[^>]*?>").unwrap(), ""),
(Regex::new(r"</?main[^>]*?>").unwrap(), ""),
(Regex::new(r"</?button[^>]*?>").unwrap(), ""),
(Regex::new(r"</?form[^>]*?>").unwrap(), ""),
(Regex::new(r"<input[^>]*?>").unwrap(), ""),
(Regex::new(r"</?select[^>]*?>").unwrap(), ""),
(Regex::new(r"</?option[^>]*?>").unwrap(), ""),
(Regex::new(r"</?textarea[^>]*?>").unwrap(), ""),
(Regex::new(r"</?label[^>]*?>").unwrap(), ""),
(Regex::new(r"</?fieldset[^>]*?>").unwrap(), ""),
(Regex::new(r"</?legend[^>]*?>").unwrap(), ""),
(Regex::new(r"</?sup[^>]*?>").unwrap(), ""),
(Regex::new(r"</?sub[^>]*?>").unwrap(), ""),
(Regex::new(r"</?small[^>]*?>").unwrap(), ""),
(Regex::new(r"</?mark[^>]*?>").unwrap(), ""),
(Regex::new(r"<em[^>]*?>").unwrap(), "_"),
(Regex::new(r"</em>").unwrap(), "_"),
(Regex::new(r"<strong[^>]*?>").unwrap(), "**"),
(Regex::new(r"</strong>").unwrap(), "**"),
(Regex::new(r"<b[^>]*?>").unwrap(), "**"),
(Regex::new(r"</b>").unwrap(), "**"),
(Regex::new(r"<i[^>]*?>").unwrap(), "_"),
(Regex::new(r"</i>").unwrap(), "_"),
(Regex::new(r"</?u[^>]*?>").unwrap(), ""),
(Regex::new(r"</?s(?:\s[^>]*?)?>").unwrap(), ""),
(Regex::new(r"<code[^>]*?>").unwrap(), "`"),
(Regex::new(r"</code>").unwrap(), "`"),
(Regex::new(r"</?kbd[^>]*?>").unwrap(), ""),
(Regex::new(r"</?samp[^>]*?>").unwrap(), ""),
(Regex::new(r"</?var[^>]*?>").unwrap(), ""),
(Regex::new(r"</?abbr[^>]*?>").unwrap(), ""),
(Regex::new(r"</?cite[^>]*?>").unwrap(), ""),
(Regex::new(r"</?dfn[^>]*?>").unwrap(), ""),
(Regex::new(r"</?time[^>]*?>").unwrap(), ""),
(Regex::new(r"</?data[^>]*?>").unwrap(), ""),
(Regex::new(r"</?h[1-6][^>]*?>").unwrap(), ""),
(Regex::new(r"</?ul[^>]*?>").unwrap(), "\n"),
(Regex::new(r"</?ol[^>]*?>").unwrap(), "\n"),
(Regex::new(r"<li[^>]*?>").unwrap(), "- "),
(Regex::new(r"</li>").unwrap(), "\n"),
(Regex::new(r"</?table[^>]*?>").unwrap(), "\n"),
(Regex::new(r"</?thead[^>]*?>").unwrap(), ""),
(Regex::new(r"</?tbody[^>]*?>").unwrap(), ""),
(Regex::new(r"</?tfoot[^>]*?>").unwrap(), ""),
(Regex::new(r"</?tr[^>]*?>").unwrap(), "\n"),
(Regex::new(r"</?th[^>]*?>").unwrap(), " | "),
(Regex::new(r"</?td[^>]*?>").unwrap(), " "),
(Regex::new(r"</?caption[^>]*?>").unwrap(), "\n"),
(Regex::new(r"</?colgroup[^>]*?>").unwrap(), ""),
(Regex::new(r"</?col[^>]*?>").unwrap(), ""),
(Regex::new(r"<!DOCTYPE[^>]*?>").unwrap(), ""),
(Regex::new(r"</?meta[^>]*?>").unwrap(), ""),
(Regex::new(r"</?link[^>]*?>").unwrap(), ""),
(Regex::new(r"</?title[^>]*?>").unwrap(), ""),
(Regex::new(r"</?base[^>]*?>").unwrap(), ""),
(Regex::new(r"</?head[^>]*?>").unwrap(), ""),
(Regex::new(r"</?body[^>]*?>").unwrap(), ""),
(Regex::new(r"</?html[^>]*?>").unwrap(), ""),
(Regex::new(r"</?blockquote[^>]*?>").unwrap(), "\n"),
(Regex::new(r"</?pre[^>]*?>").unwrap(), "\n"),
(Regex::new(r"<hr[^>]*?>").unwrap(), "\n---\n"),
(Regex::new(r"</?dl[^>]*?>").unwrap(), "\n"),
(Regex::new(r"</?dt[^>]*?>").unwrap(), "\n"),
(Regex::new(r"</?dd[^>]*?>").unwrap(), " "),
(Regex::new(r"</?picture[^>]*?>").unwrap(), ""),
(Regex::new(r"</?video[^>]*?>").unwrap(), ""),
(Regex::new(r"</?audio[^>]*?>").unwrap(), ""),
(Regex::new(r"</?source[^>]*?>").unwrap(), ""),
(Regex::new(r"</?track[^>]*?>").unwrap(), ""),
(Regex::new(r"</?canvas[^>]*?>").unwrap(), ""),
(Regex::new(r"</?figure[^>]*?>").unwrap(), ""),
(Regex::new(r"</?figcaption[^>]*?>").unwrap(), ""),
(Regex::new(r"</?details[^>]*?>").unwrap(), ""),
(Regex::new(r"</?summary[^>]*?>").unwrap(), ""),
(Regex::new(r"</?dialog[^>]*?>").unwrap(), ""),
(Regex::new(r"(?is)<script[^>]*?>.*?</script>").unwrap(), ""),
(Regex::new(r"(?is)<style[^>]*?>.*?</style>").unwrap(), ""),
(Regex::new(r"(?is)<noscript[^>]*?>.*?</noscript>").unwrap(), ""),
(Regex::new(r"(?is)<!--.*?-->").unwrap(), ""),
(Regex::new(r"(?is)<!\[CDATA\[.*?\]\]>").unwrap(), ""),
(Regex::new(r"(?is)<\?xml[^>]*?\?>").unwrap(), ""),
(Regex::new(r"</?address[^>]*?>").unwrap(), ""),
(Regex::new(r"</?ins[^>]*?>").unwrap(), ""),
(Regex::new(r"</?del[^>]*?>").unwrap(), ""),
(Regex::new(r"</?q[^>]*?>").unwrap(), ""),
(Regex::new(r"</?wbr[^>]*?/?>").unwrap(), ""),
(Regex::new(r"</?ruby[^>]*?>").unwrap(), ""),
(Regex::new(r"</?rt[^>]*?>").unwrap(), ""),
(Regex::new(r"</?rp[^>]*?>").unwrap(), ""),
(Regex::new(r"</?bdi[^>]*?>").unwrap(), ""),
(Regex::new(r"</?bdo[^>]*?>").unwrap(), ""),
(Regex::new(r"(?is)<iframe[^>]*?>.*?</iframe>").unwrap(), ""),
(Regex::new(r"<iframe[^>]*?/?>").unwrap(), ""),
(Regex::new(r"(?is)<object[^>]*?>.*?</object>").unwrap(), ""),
(Regex::new(r"<embed[^>]*?/?>").unwrap(), ""),
(Regex::new(r"</?param[^>]*?>").unwrap(), ""),
(Regex::new(r"(?is)<template[^>]*?>.*?</template>").unwrap(), ""),
(Regex::new(r"</?slot[^>]*?>").unwrap(), ""),
]
});
for (regex, replacement) in TAG_PATTERNS.iter() {
result = regex.replace_all(&result, *replacement).to_string();
}
static RE_BARE_ELEMENT: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(</?[a-zA-Z][a-zA-Z0-9]*>)").unwrap()
});
result = RE_BARE_ELEMENT.replace_all(&result, "`$1`").to_string();
result = RE_CATCHALL_TAG.replace_all(&result, "").to_string();
result = RE_MULTI_NEWLINE.replace_all(&result, "\n\n").to_string();
result = decode_html_entities(&result);
result = RE_CATCHALL_TAG.replace_all(&result, "").to_string();
for (i, span) in inline_code_spans.iter().enumerate() {
let placeholder = format!("\x00INLINE_CODE_{}\x00", i);
result = result.replace(&placeholder, span);
}
for (i, block) in code_blocks.iter().enumerate() {
let placeholder = format!("\x00CODE_FENCE_{}\x00", i);
result = result.replace(&placeholder, block);
}
result
}
/// Joins link text that spans several lines onto one line.
///
/// Every newline inside a balanced `[` ... `]` pair (nesting allowed) is
/// replaced by a single space. A `[` that is never closed is ignored, so a
/// stray bracket cannot flatten the rest of the document.
fn escape_multiline_links(markdown: &str) -> String {
    // Pass 1: find the byte offsets of `[` characters that never get a
    // matching `]`. Pushes happen in ascending order and pops come off the
    // top, so what remains is sorted ascending.
    let mut open_offsets: Vec<usize> = Vec::new();
    for (offset, ch) in markdown.char_indices() {
        match ch {
            '[' => open_offsets.push(offset),
            ']' => {
                open_offsets.pop();
            }
            _ => {}
        }
    }
    let mut unmatched = open_offsets.into_iter().peekable();

    // Pass 2: rewrite newlines while inside at least one matched bracket pair.
    let mut result = String::with_capacity(markdown.len());
    let mut depth = 0usize;
    for (offset, ch) in markdown.char_indices() {
        match ch {
            '[' => {
                if unmatched.peek() == Some(&offset) {
                    unmatched.next();
                } else {
                    depth += 1;
                }
                result.push(ch);
            }
            ']' => {
                // A `]` with nothing open is plain text.
                depth = depth.saturating_sub(1);
                result.push(ch);
            }
            '\n' if depth > 0 => result.push(' '),
            _ => result.push(ch),
        }
    }
    result
}
/// Strips accessibility helper links that start a line.
///
/// Removes `[Skip to ...]`, `[Jump to ...]`, `[Go to ...]`, `[Skip navigation]`
/// style and `[Back to ...]` links (case-insensitive) together with the
/// whitespace around them. The patterns are re-applied until a full round
/// changes nothing, then `[Screen reader only...]` links are removed once.
fn remove_accessibility_links(markdown: &str) -> String {
    static SKIP_LINKS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
        [
            r"(?mi)^\s*\[Skip to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*",
            r"(?mi)^\s*\[Jump to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*",
            r"(?mi)^\s*\[Go to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*",
            r"(?mi)^\s*\[Skip (navigation|nav|to main content|to content)\]\([^)]*\)\s*",
            r"(?mi)^\s*\[Back to (Top|Main|Content)\]\([^)]*\)\s*",
        ]
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
    });
    static SCREEN_READER: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?mi)^\s*\[Screen reader only:?[^\]]*\]\([^)]*\)\s*").unwrap()
    });

    let mut current = markdown.to_string();
    loop {
        let mut any_removed = false;
        for pattern in SKIP_LINKS.iter() {
            let stripped = pattern.replace_all(&current, "").into_owned();
            if stripped != current {
                any_removed = true;
                current = stripped;
            }
        }
        if !any_removed {
            break;
        }
    }

    SCREEN_READER.replace_all(&current, "").to_string()
}
fn convert_urls_to_absolute(markdown: &str, base_url: &str) -> Result<String> {
use crate::error::ScrapeError;
let base = Url::parse(base_url)
.map_err(|e| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", e)))?;
static RE_IMG_URL: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
let img_regex = &*RE_IMG_URL;
let mut result = img_regex
.replace_all(markdown, |caps: ®ex::Captures| {
let alt = &caps[1];
let url = &caps[2];
if url.starts_with("http://")
|| url.starts_with("https://")
|| url.starts_with("data:")
{
return caps[0].to_string();
}
if url.starts_with("//") {
return format!("", alt, url);
}
match base.join(url) {
Ok(absolute) => format!("", alt, absolute),
Err(_) => caps[0].to_string(), }
})
.to_string();
static RE_LINK_URL: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
let link_regex = &*RE_LINK_URL;
result = link_regex
.replace_all(&result, |caps: ®ex::Captures| {
let text = &caps[1];
let url = &caps[2];
if url.starts_with("http://") || url.starts_with("https://") || url.starts_with("#") {
return caps[0].to_string();
}
if url.starts_with("//") {
return format!("[{}](https:{})", text, url);
}
match base.join(url) {
Ok(absolute) => format!("[{}]({})", text, absolute),
Err(_) => caps[0].to_string(),
}
})
.to_string();
Ok(result)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_html_to_markdown_simple() {
let html = "<h1>Hello</h1><p>World</p>";
let result = html_to_markdown(html, "https://example.com", false);
assert!(result.is_ok());
let md = result.unwrap();
assert!(md.contains("Hello"));
assert!(md.contains("World"));
}
#[test]
fn test_extract_main_content() {
let html = r#"
<html>
<body>
<nav>Navigation</nav>
<main>
<h1>Main Content</h1>
<p>This is the main content.</p>
</main>
<footer>Footer</footer>
</body>
</html>
"#;
let result = extract_main_content_html(html);
assert!(result.is_ok());
let content = result.unwrap();
assert!(content.contains("Main Content"));
assert!(!content.contains("Navigation"));
}
#[test]
fn test_clean_markdown() {
let markdown = "# Hello\n\n\n\n\nWorld\n\n\n";
let cleaned = clean_markdown(markdown);
assert_eq!(cleaned, "# Hello\n\nWorld");
}
#[test]
fn test_clean_html_from_markdown_images() {
let input =
r#"Some text <img src="https://example.com/logo.png" alt="Company Logo"> more text"#;
let result = clean_html_from_markdown(input);
assert!(result.contains(""));
assert!(!result.contains("<img"));
let input = r#"<img alt="Logo" src="logo.png">"#;
let result = clean_html_from_markdown(input);
assert!(result.contains(""));
let input = r#"<img src="image.jpg">"#;
let result = clean_html_from_markdown(input);
assert!(result.contains(""));
}
#[test]
fn test_clean_html_from_markdown_images_with_attributes() {
let input = r#"<img src="/path/image.jpg" alt="Local Image" title="A title" width="300" height="200">"#;
let result = clean_html_from_markdown(input);
assert!(result.contains(""));
assert!(!result.contains("width"));
assert!(!result.contains("height"));
assert!(!result.contains("title"));
}
#[test]
fn test_clean_html_from_markdown_multiple_images() {
let input = r#"
<h1>Gallery</h1>
<img src="photo1.jpg" alt="Photo One">
<img src="photo2.jpg" alt="Photo Two">
<img src="photo3.jpg">
"#;
let result = clean_html_from_markdown(input);
assert!(result.contains(""));
assert!(result.contains(""));
assert!(result.contains(""));
}
#[test]
fn test_clean_html_from_markdown_remove_tags() {
let input = r#"<div class="container"><span>Hello</span> <p>World</p></div>"#;
let result = clean_html_from_markdown(input);
assert!(!result.contains("<div"));
assert!(!result.contains("<span"));
assert!(!result.contains("<p>"));
assert!(result.contains("Hello"));
assert!(result.contains("World"));
}
#[test]
fn test_clean_html_from_markdown_br_tags() {
let input = "Line 1<br>Line 2<br />Line 3";
let result = clean_html_from_markdown(input);
assert!(!result.contains("<br"));
assert!(result.contains("Line 1"));
assert!(result.contains("Line 2"));
assert!(result.contains("Line 3"));
}
#[test]
fn test_clean_html_from_markdown_form_elements() {
let input = r#"<form><input type="text" name="email"><button>Submit</button></form>"#;
let result = clean_html_from_markdown(input);
assert!(!result.contains("<form"));
assert!(!result.contains("<input"));
assert!(!result.contains("<button"));
}
#[test]
fn test_clean_html_from_markdown_removes_multiline_script_blocks() {
let input = r#"
Before
<script>
var d = data[i].join(" ");
console.log("template");
</script>
After
"#;
let result = clean_html_from_markdown(input);
assert!(result.contains("Before"));
assert!(result.contains("After"));
assert!(!result.contains("var d = data"));
assert!(!result.contains("console.log"));
assert!(!result.contains("<script"));
}
#[test]
fn test_clean_html_from_markdown_removes_multiline_noscript_and_comments() {
let input = r#"
Keep this
<!--
multi-line comment
-->
<noscript>
fallback
content
</noscript>
<![CDATA[
hidden payload
]]>
Done
"#;
let result = clean_html_from_markdown(input);
assert!(result.contains("Keep this"));
assert!(result.contains("Done"));
assert!(!result.contains("multi-line comment"));
assert!(!result.contains("fallback"));
assert!(!result.contains("hidden payload"));
}
#[test]
fn test_strip_non_content_tags_removes_scripts() {
let html = r#"<html><body>
<p>Real content</p>
<script>var x = "malicious"; console.log(x);</script>
<p>More content</p>
</body></html>"#;
let result = strip_non_content_tags(html);
assert!(result.contains("Real content"));
assert!(result.contains("More content"));
assert!(!result.contains("malicious"));
assert!(!result.contains("console.log"));
}
#[test]
fn test_strip_non_content_tags_removes_style_and_svg() {
let html = r#"<html><body>
<p>Content</p>
<style>.foo { color: red; } :root { --bg: #000; }</style>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><circle cx="50" cy="50" r="40"/></svg>
<p>End</p>
</body></html>"#;
let result = strip_non_content_tags(html);
assert!(result.contains("Content"));
assert!(result.contains("End"));
assert!(!result.contains("color: red"));
assert!(!result.contains("circle"));
assert!(!result.contains("<svg"));
}
#[test]
fn test_strip_non_content_tags_removes_head() {
let html = r#"<html>
<head><title>Page</title><meta charset="utf-8"><link rel="stylesheet" href="x.css"></head>
<body><p>Body content</p></body>
</html>"#;
let result = strip_non_content_tags(html);
assert!(result.contains("Body content"));
assert!(!result.contains("x.css"));
}
#[test]
fn test_strip_non_content_tags_removes_html_comments() {
let html = r#"<p>Before</p>
<!-- This is a long multi-line
HTML comment that should be removed -->
<p>After</p>"#;
let result = strip_non_content_tags(html);
assert!(result.contains("Before"));
assert!(result.contains("After"));
assert!(!result.contains("long multi-line"));
}
#[test]
fn test_full_pipeline_strips_inline_javascript() {
let html = r#"<html>
<head><script>var tracking = "analytics_data"; function init(){}</script></head>
<body>
<script>$ssgST=new Date().getTime(); var config = {key: "val"};</script>
<h1>Product Listing</h1>
<p>Buy the best laptops here.</p>
<script type="application/json">{"@context":"schema.org"}</script>
</body></html>"#;
let result = html_to_markdown(html, "https://example.com", false).unwrap();
assert!(result.contains("Product Listing"));
assert!(result.contains("Buy the best laptops"));
assert!(!result.contains("$ssgST"), "Inline JS should be stripped before markdown conversion");
assert!(!result.contains("analytics_data"), "Head scripts should be stripped");
assert!(!result.contains("function init"), "Script functions should not appear in output");
}
#[test]
fn test_full_pipeline_preserves_content_around_scripts() {
let html = r#"<html><body>
<h1>Title</h1>
<script>alert('bad');</script>
<p>Paragraph one.</p>
<style>body { margin: 0; }</style>
<p>Paragraph two.</p>
<noscript><p>Please enable JavaScript</p></noscript>
<p>Final paragraph.</p>
</body></html>"#;
let result = html_to_markdown(html, "https://example.com", false).unwrap();
assert!(result.contains("Title"));
assert!(result.contains("Paragraph one"));
assert!(result.contains("Paragraph two"));
assert!(result.contains("Final paragraph"));
assert!(!result.contains("alert"));
assert!(!result.contains("margin: 0"));
assert!(!result.contains("enable JavaScript"));
}
#[test]
fn test_html_to_markdown_with_images() {
let html = r#"
<html>
<body>
<h1>Test Page</h1>
<p>Welcome to the test page.</p>
<img src="logo.png" alt="Site Logo">
<p>More content here.</p>
</body>
</html>
"#;
let result = html_to_markdown(html, "https://example.com", false);
assert!(result.is_ok());
let md = result.unwrap();
assert!(md.contains(""));
assert!(!md.contains("<img"));
assert!(!md.contains(""));
assert!(md.contains("Test Page"));
assert!(md.contains("Welcome"));
}
#[test]
fn test_token_reduction_with_image_cleaning() {
let html_version =
r#"<img src="https://example.com/very-long-url-path/image.png" alt="Description">"#;
let cleaned = clean_html_from_markdown(html_version);
assert!(
cleaned.contains("")
);
assert!(!cleaned.contains("<img"));
assert!(!cleaned.contains("src="));
assert!(!cleaned.contains("alt="));
}
#[test]
fn test_github_content_extraction() {
let github_html = r#"
<html>
<body>
<div class="Layout-sidebar">
<div class="file-navigation">File Tree Noise</div>
<div class="contributors-wrapper">Contributors Widget</div>
</div>
<div id="readme">
<h1>Project README</h1>
<p>This is the actual content we want.</p>
</div>
<div class="BorderGrid">
<div>Sidebar noise</div>
</div>
</body>
</html>
"#;
let result = extract_main_content_html(github_html).unwrap();
assert!(result.contains("Project README"));
assert!(result.contains("actual content we want"));
assert!(!result.contains("File Tree Noise"));
assert!(!result.contains("Contributors Widget"));
assert!(!result.contains("Sidebar noise"));
}
#[test]
fn test_github_markdown_body_extraction() {
let github_html = r#"
<html>
<body>
<nav>Navigation Bar</nav>
<div class="markdown-body">
<h1>Documentation</h1>
<p>Main documentation content here.</p>
</div>
<aside class="Layout-sidebar">Sidebar content</aside>
</body>
</html>
"#;
let result = extract_main_content_html(github_html).unwrap();
assert!(result.contains("Documentation"));
assert!(result.contains("Main documentation content"));
assert!(!result.contains("Navigation Bar"));
assert!(!result.contains("Sidebar content"));
}
#[test]
fn test_html_entity_decoding() {
let text_with_entities =
"Copyright © 2024 & Company™. Click "here" for more info.";
let decoded = decode_html_entities(text_with_entities);
assert_eq!(
decoded,
"Copyright © 2024 & Company™. Click \"here\" for more info."
);
assert!(!decoded.contains("&"));
assert!(!decoded.contains("©"));
assert!(!decoded.contains("™"));
assert!(!decoded.contains("""));
}
#[test]
fn test_html_entity_in_urls() {
let html = "Text with & entity "quoted" content";
let cleaned = clean_html_from_markdown(html);
assert!(cleaned.contains("Text with & entity"));
assert!(cleaned.contains("\"quoted\""));
assert!(!cleaned.contains("&"));
assert!(!cleaned.contains("""));
}
#[test]
fn test_html_entity_common_cases() {
let input = "Less than < greater than > and nbsp space";
let decoded = decode_html_entities(input);
assert_eq!(decoded, "Less than < greater than > and nbsp space");
}
#[test]
fn test_strip_invisible_unicode() {
let text_with_zwsp = "Hello\u{200B}World";
let cleaned = strip_invisible_unicode(text_with_zwsp);
assert_eq!(cleaned, "HelloWorld");
let text_with_bom = "\u{FEFF}Content";
let cleaned = strip_invisible_unicode(text_with_bom);
assert_eq!(cleaned, "Content");
let text_with_multiple = "A\u{200B}\u{200C}\u{200D}B\u{2060}C";
let cleaned = strip_invisible_unicode(text_with_multiple);
assert_eq!(cleaned, "ABC");
}
#[test]
fn test_invisible_unicode_in_anchor_links() {
let markdown = "[\u{200B}\u{200B}\n\n](#heading)";
let cleaned = clean_markdown(markdown);
assert!(!cleaned.contains('\u{200B}'));
assert!(!cleaned.contains("\n\n\n"));
}
#[test]
fn test_full_pipeline_with_all_fixes() {
let html = r#"
<html>
<body>
<div class="Layout-sidebar">Sidebar noise</div>
<div id="readme">
<h1>Test & Demo</h1>
<p>Content with entities here "quoted".</p>
<a href="page?a=1&b=2">Link</a>
<p>Invisible\u{200B}chars\u{200C}removed</p>
</div>
</body>
</html>
"#;
let result = html_to_markdown(html, "https://example.com", true).unwrap();
assert!(result.contains("Test & Demo"));
assert!(!result.contains("Sidebar noise"));
assert!(result.contains("&"));
assert!(result.contains("\"quoted\""));
assert!(result.contains("a=1&b=2"));
assert!(!result.contains("&"));
assert!(!result.contains("""));
assert!(!result.contains(" "));
assert!(!result.contains('\u{200B}'));
assert!(!result.contains('\u{200C}'));
}
#[test]
fn test_complex_image_tag_with_attributes_before_src() {
let input = r#"<img width="50" height="50" src="https://example.com/logo.png" class="thumbnail" alt="Logo" decoding="async" />"#;
let result = clean_html_from_markdown(input);
assert!(result.contains(""));
assert!(!result.contains("<img"));
assert!(!result.contains("width="));
assert!(!result.contains("class="));
}
#[test]
fn test_doctype_and_document_declarations() {
let input = r#"
Example API response:
```json
{
"html": "<!DOCTYPE html><body class=\"main\">content</body>",
"data": "<![CDATA[some data]]>"
}
```
Also test standalone: <!DOCTYPE html> and <?xml version="1.0"?>
"#;
let result = clean_html_from_markdown(input);
assert!(!result.contains("Also test standalone: <!DOCTYPE html>"), "Standalone DOCTYPE should be removed");
assert!(!result.contains("<?xml"), "Standalone XML declaration should be removed");
assert!(result.contains("<!DOCTYPE html>"), "DOCTYPE inside code fence should be preserved");
assert!(result.contains("<![CDATA["), "CDATA inside code fence should be preserved");
assert!(result.contains("Example API response"));
}
#[test]
fn test_picture_and_svg_elements() {
let input = r#"
<picture>
<source srcset="image.webp" type="image/webp">
<img src="image.png" alt="Test">
</picture>
<svg><path d="M10 10"/><circle cx="5" cy="5" r="3"/></svg>
"#;
let result = clean_html_from_markdown(input);
assert!(!result.contains("<picture"));
assert!(!result.contains("<source"));
assert!(!result.contains("<svg"));
assert!(!result.contains("<path"));
assert!(!result.contains("<circle"));
assert!(result.contains(""));
}
#[test]
fn test_multiple_complex_images() {
let input = r#"
<img width="100" src="image1.jpg" alt="First">
<img alt="Second" height="50" src="image2.png" class="thumb">
<img src="image3.gif">
"#;
let result = clean_html_from_markdown(input);
assert!(result.contains(""));
assert!(result.contains(""));
assert!(result.contains(""));
assert!(!result.contains("<img"));
}
#[test]
fn test_apple_footnote_cleaning() {
let html = "iPhone 17<sup class=\"footnote\"><a aria-label=\"footnote 1\" href=\"#footnote-1\">1</a></sup> features";
let result = clean_html_from_markdown(html);
assert!(!result.contains("<sup"));
assert!(!result.contains("<a"));
assert!(!result.contains("</a>"));
assert!(!result.contains("</sup>"));
assert!(result.contains("iPhone 17"));
assert!(result.contains("features"));
}
#[test]
fn test_semantic_html_tag_conversion() {
let html = r#"<strong>Bold</strong> <em>italic</em> <mark>highlight</mark> <code>code</code> text"#;
let result = clean_html_from_markdown(html);
assert!(!result.contains("<strong"));
assert!(!result.contains("<em"));
assert!(!result.contains("<mark"));
assert!(!result.contains("<code"));
assert!(result.contains("**Bold**"), "Expected **Bold**, got: {}", result);
assert!(result.contains("_italic_"), "Expected _italic_, got: {}", result);
assert!(result.contains("highlight"));
assert!(result.contains("`code`"), "Expected `code`, got: {}", result);
}
#[test]
fn test_heading_tag_removal() {
let html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3><h4>Subsection</h4><h5>Minor</h5><h6>Smallest</h6>";
let result = clean_html_from_markdown(html);
assert!(!result.contains("<h1"));
assert!(!result.contains("<h2"));
assert!(!result.contains("<h3"));
assert!(!result.contains("<h4"));
assert!(!result.contains("<h5"));
assert!(!result.contains("<h6"));
assert!(result.contains("Title"));
assert!(result.contains("Subtitle"));
assert!(result.contains("Section"));
}
#[test]
fn test_list_tag_removal() {
let html = "<ul><li>Item 1</li><li>Item 2</li></ul><ol><li>First</li><li>Second</li></ol>";
let result = clean_html_from_markdown(html);
assert!(!result.contains("<ul"));
assert!(!result.contains("<ol"));
assert!(!result.contains("<li"));
assert!(result.contains("Item 1"));
assert!(result.contains("Item 2"));
assert!(result.contains("First"));
assert!(result.contains("Second"));
}
#[test]
fn test_table_tag_removal() {
let html = "<table><thead><tr><th>Header 1</th><th>Header 2</th></tr></thead><tbody><tr><td>Cell 1</td><td>Cell 2</td></tr></tbody></table>";
let result = clean_html_from_markdown(html);
assert!(!result.contains("<table"));
assert!(!result.contains("<thead"));
assert!(!result.contains("<tbody"));
assert!(!result.contains("<tr"));
assert!(!result.contains("<th"));
assert!(!result.contains("<td"));
assert!(result.contains("Header 1"));
assert!(result.contains("Header 2"));
assert!(result.contains("Cell 1"));
assert!(result.contains("Cell 2"));
}
#[test]
fn test_metadata_tag_removal() {
let html = r#"<head><meta charset="utf-8"><link rel="stylesheet" href="style.css"><title>Page Title</title></head><body>Content</body>"#;
let result = clean_html_from_markdown(html);
assert!(!result.contains("<head"));
assert!(!result.contains("<meta"));
assert!(!result.contains("<link"));
assert!(!result.contains("<title"));
assert!(!result.contains("<body"));
assert!(!result.contains("<html"));
assert!(result.contains("Content"));
}
#[test]
fn test_semantic_block_tag_removal() {
let html = r#"<blockquote>Quote</blockquote><pre>Code block</pre><hr>After line"#;
let result = clean_html_from_markdown(html);
assert!(!result.contains("<blockquote"));
assert!(!result.contains("<pre"));
assert!(!result.contains("<hr"));
assert!(result.contains("Quote"));
assert!(result.contains("Code block"));
assert!(result.contains("After line"));
}
#[test]
fn test_definition_list_tag_removal() {
let html =
"<dl><dt>Term 1</dt><dd>Definition 1</dd><dt>Term 2</dt><dd>Definition 2</dd></dl>";
let result = clean_html_from_markdown(html);
assert!(!result.contains("<dl"));
assert!(!result.contains("<dt"));
assert!(!result.contains("<dd"));
assert!(result.contains("Term 1"));
assert!(result.contains("Definition 1"));
assert!(result.contains("Term 2"));
}
#[test]
fn test_media_tag_removal() {
let html = r#"<video src="video.mp4"></video><audio src="audio.mp3"></audio><canvas></canvas><svg><path d="M0,0"/></svg>"#;
let result = clean_html_from_markdown(html);
assert!(!result.contains("<video"));
assert!(!result.contains("<audio"));
assert!(!result.contains("<canvas"));
assert!(!result.contains("<svg"));
assert!(!result.contains("<path"));
}
#[test]
fn test_container_tag_removal() {
let html = r#"<figure><figcaption>Caption</figcaption><img src="img.jpg"></figure><details><summary>Summary</summary>Content</details>"#;
let result = clean_html_from_markdown(html);
assert!(!result.contains("<figure"));
assert!(!result.contains("<figcaption"));
assert!(!result.contains("<details"));
assert!(!result.contains("<summary"));
assert!(result.contains("Caption"));
assert!(result.contains("Summary"));
assert!(result.contains("Content"));
}
#[test]
fn test_html_comment_removal() {
let html = r#"<!-- This is a comment -->Content<!-- Another comment -->"#;
let result = clean_html_from_markdown(html);
assert!(!result.contains("<!--"));
assert!(!result.contains("-->"));
assert!(result.contains("Content"));
assert!(!result.contains("This is a comment"));
assert!(!result.contains("Another comment"));
}
#[test]
fn test_comprehensive_tag_cleanup() {
let html = r#"
<html>
<head><title>Test</title><meta charset="utf-8"></head>
<body>
<!-- Comment -->
<h1>Heading</h1>
<ul><li>List item</li></ul>
<table><tr><td>Table cell</td></tr></table>
<video src="v.mp4"></video>
<figure><figcaption>Fig</figcaption></figure>
</body>
</html>
"#;
let result = clean_html_from_markdown(html);
assert!(!result.contains("<html"));
assert!(!result.contains("<head"));
assert!(!result.contains("<title"));
assert!(!result.contains("<meta"));
assert!(!result.contains("<body"));
assert!(!result.contains("<h1"));
assert!(!result.contains("<ul"));
assert!(!result.contains("<li"));
assert!(!result.contains("<table"));
assert!(!result.contains("<tr"));
assert!(!result.contains("<td"));
assert!(!result.contains("<video"));
assert!(!result.contains("<figure"));
assert!(!result.contains("<figcaption"));
assert!(!result.contains("<!--"));
assert!(result.contains("Heading"));
assert!(result.contains("List item"));
assert!(result.contains("Table cell"));
assert!(result.contains("Fig"));
}
#[test]
fn test_github_token_reduction() {
let github_with_noise = r#"
<html>
<body>
<div class="file-navigation">
<div>src/</div><div>lib/</div><div>tests/</div><div>docs/</div>
<div>Very long file tree that goes on and on...</div>
</div>
<div class="Layout-sidebar">
<div class="contributors-wrapper">
<img src="avatar1.png"><img src="avatar2.png">
<div>Contributor 1</div><div>Contributor 2</div>
</div>
</div>
<div id="readme">
<h1>Project</h1>
<p>Short README content.</p>
</div>
</body>
</html>
"#;
let extracted = extract_main_content_html(github_with_noise).unwrap();
assert!(extracted.len() < github_with_noise.len() / 2);
assert!(extracted.contains("Project"));
assert!(extracted.contains("Short README"));
assert!(!extracted.contains("file-navigation"));
assert!(!extracted.contains("contributors-wrapper"));
assert!(!extracted.contains("Contributor 1"));
}
#[test]
fn test_strip_layout_tables_hacker_news_pattern() {
let hn_html = r#"
<table border="0" cellpadding="0" cellspacing="0">
<tr>
<td>
<table border="0">
<tr><td>Story 1</td></tr>
<tr><td>Story 2</td></tr>
</table>
</td>
</tr>
</table>
"#;
let result = strip_layout_tables(hn_html);
assert!(!result.contains("<table"));
assert!(!result.contains("cellpadding"));
assert!(result.contains("Story 1"));
assert!(result.contains("Story 2"));
}
#[test]
fn test_strip_layout_tables_preserves_data_tables() {
let data_table_html = r#"
<table>
<tr><th>Name</th><th>Value</th></tr>
<tr><td>Item 1</td><td>100</td></tr>
<tr><td>Item 2</td><td>200</td></tr>
</table>
"#;
let result = strip_layout_tables(data_table_html);
assert!(result.contains("<table"));
assert!(result.contains("<th>"));
assert!(result.contains("Name"));
assert!(result.contains("Value"));
assert!(result.contains("Item 1"));
}
#[test]
fn test_layout_table_with_cellpadding_stripped() {
let layout_html =
r#"<table cellpadding="5" cellspacing="0"><tr><td>Content</td></tr></table>"#;
let result = strip_layout_tables(layout_html);
assert!(!result.contains("<table"));
assert!(result.contains("Content"));
}
#[test]
fn test_simple_table_without_headers_stripped() {
let layout_html =
r#"<table border="0"><tr><td>Nav Item 1</td><td>Nav Item 2</td></tr></table>"#;
let result = strip_layout_tables(layout_html);
assert!(!result.contains("<table"));
assert!(result.contains("Nav Item 1"));
assert!(result.contains("Nav Item 2"));
}
#[test]
fn test_hacker_news_markdown_bloat_fix() {
let hn_html = r#"
<html>
<body>
<table border="0" cellpadding="0" cellspacing="0" width="85%">
<tr>
<td>
<table border="0">
<tr><td class="title">Article Title 1</td></tr>
<tr><td class="subtext">100 points by user1</td></tr>
</table>
</td>
</tr>
<tr>
<td>
<table border="0">
<tr><td class="title">Article Title 2</td></tr>
<tr><td class="subtext">200 points by user2</td></tr>
</table>
</td>
</tr>
</table>
</body>
</html>
"#;
let result = html_to_markdown(hn_html, "https://news.ycombinator.com", false).unwrap();
assert!(
result.len() < 1000,
"Markdown output too large: {} bytes",
result.len()
);
assert!(result.contains("Article Title 1"));
assert!(result.contains("Article Title 2"));
let pipe_count = result.chars().filter(|&c| c == '|').count();
assert!(pipe_count < 10, "Too many table delimiters: {}", pipe_count);
}
#[test]
fn test_convert_relative_image_to_absolute() {
let md = "";
let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
assert_eq!(result, "");
}
#[test]
fn test_convert_relative_link_to_absolute() {
let md = "[Home](../index.html)";
let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
assert_eq!(result, "[Home](https://example.com/index.html)");
}
#[test]
fn test_keep_absolute_urls_unchanged() {
let md = "";
let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
assert_eq!(result, "");
}
#[test]
fn test_keep_data_uris_unchanged() {
let md = "";
let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
assert_eq!(result, "");
}
#[test]
fn test_keep_anchors_unchanged() {
let md = "[Section](#heading)";
let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
assert_eq!(result, "[Section](#heading)");
}
#[test]
fn test_complex_relative_paths() {
let md = "";
let result =
convert_urls_to_absolute(md, "https://example.com/a/b/c/page.html").unwrap();
assert_eq!(result, "");
}
#[test]
fn test_root_relative_urls() {
let md = "";
let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
assert_eq!(result, "");
}
#[test]
fn test_protocol_relative_urls() {
let md = "";
let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
assert_eq!(result, "");
}
#[test]
fn test_urls_with_query_params() {
let md = "[API](../api/v1?foo=bar&baz=qux)";
let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
assert_eq!(
result,
"[API](https://example.com/api/v1?foo=bar&baz=qux)"
);
}
#[test]
fn test_urls_with_fragments() {
let md = "[Section](../page.html#section)";
let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
assert_eq!(result, "[Section](https://example.com/page.html#section)");
}
#[test]
fn test_multiple_images_and_links() {
let md = r#"

[Home](../index.html)

[Absolute](https://other.com/page)
"#;
let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
assert!(result.contains(""));
assert!(result.contains("[Home](https://example.com/index.html)"));
assert!(result.contains(""));
assert!(result.contains("[Absolute](https://other.com/page)"));
}
#[test]
fn test_full_pipeline_with_url_conversion() {
let html = r#"
<html>
<body>
<img src="../images/logo.png" alt="Logo">
<a href="../about.html">About</a>
<img src="https://cdn.example.com/banner.jpg" alt="Banner">
</body>
</html>
"#;
let result = html_to_markdown(html, "https://example.com/docs/page.html", false).unwrap();
assert!(result.contains(""));
assert!(result.contains(""));
assert!(!result.contains("../images/logo.png"));
}
#[test]
fn test_edge_case_empty_alt_text() {
let md = "";
let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
assert_eq!(result, "");
}
#[test]
fn test_edge_case_special_chars_in_url() {
let md = "[Link](path%20with%20spaces.html)";
let result = convert_urls_to_absolute(md, "https://example.com/").unwrap();
assert_eq!(result, "[Link](https://example.com/path%20with%20spaces.html)");
}
#[test]
fn test_escape_multiline_links_simple() {
let input = "[This is a\nlink](#heading)";
let result = escape_multiline_links(input);
assert_eq!(result, "[This is a link](#heading)");
}
#[test]
fn test_escape_multiline_links_multiple() {
let input = "[Line 1\nLine 2\nLine 3](url)";
let result = escape_multiline_links(input);
assert_eq!(result, "[Line 1 Line 2 Line 3](url)");
}
#[test]
fn test_escape_multiline_links_nested_brackets() {
let input = "[[inner\nlink]](url)";
let result = escape_multiline_links(input);
assert_eq!(result, "[[inner link]](url)");
}
#[test]
fn test_escape_multiline_links_no_newlines() {
let input = "[Normal link](url)";
let result = escape_multiline_links(input);
assert_eq!(result, "[Normal link](url)");
}
#[test]
fn test_escape_multiline_links_outside_links() {
let input = "Text before\n[link](url)\nText after";
let result = escape_multiline_links(input);
assert_eq!(result, "Text before\n[link](url)\nText after");
}
#[test]
fn test_escape_multiline_links_multiple_links() {
let input = "[First\nlink](url1) and [second\nlink](url2)";
let result = escape_multiline_links(input);
assert_eq!(result, "[First link](url1) and [second link](url2)");
}
#[test]
fn test_escape_multiline_links_empty_link() {
let input = "[](url)";
let result = escape_multiline_links(input);
assert_eq!(result, "[](url)");
}
#[test]
fn test_escape_multiline_links_image_syntax() {
let input = "";
let result = escape_multiline_links(input);
assert_eq!(result, "");
}
#[test]
fn test_escape_multiline_links_unmatched_bracket() {
let input = "[unclosed link\nwithout closing bracket";
let result = escape_multiline_links(input);
assert_eq!(result, "[unclosed link without closing bracket");
}
#[test]
fn test_remove_skip_to_content() {
let input = "[Skip to Content](#main)\n\n# Welcome\n\nContent here.";
let expected = "# Welcome\n\nContent here.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_remove_skip_to_main() {
let input = "[Skip to Main](#main-content)\n\nActual content.";
let expected = "Actual content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_remove_skip_to_navigation() {
let input = "[Skip to Navigation](#nav)\n\nPage content.";
let expected = "Page content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_remove_jump_to_content() {
let input = "[Jump to Content](#content)\n\nMain text.";
let expected = "Main text.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_remove_multiple_skip_links() {
let input = "[Skip to Content](#main)\n[Skip to Navigation](#nav)\n\nContent.";
let expected = "Content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_preserve_regular_links() {
let input = "[Regular Link](https://example.com)\n\nContent.";
let expected = "[Regular Link](https://example.com)\n\nContent.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_case_insensitive_skip_links() {
let input = "[SKIP TO CONTENT](#main)\n\nContent.";
let expected = "Content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_screen_reader_text() {
let input = "[Screen reader only: Navigation menu](#nav)\n\nContent.";
let expected = "Content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_no_removal_in_middle_of_text() {
let input = "Some text [Skip to Content](#main) more text.";
assert!(remove_accessibility_links(input).contains("Skip to Content"));
}
#[test]
fn test_back_to_top_removal() {
let input = "Content here\n[Back to Top](#top)";
let expected = "Content here\n";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_go_to_content_removal() {
let input = "[Go to Main](#main)\n\nPage content.";
let expected = "Page content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_skip_navigation_lowercase() {
let input = "[Skip navigation](#nav)\n\nContent.";
let expected = "Content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_multiple_accessibility_variants() {
let input = "[Skip to Content](#main)\n[Jump to Navigation](#nav)\n[Back to Top](#top)\n\nActual content.";
let expected = "Actual content.";
assert_eq!(remove_accessibility_links(input), expected);
}
#[test]
fn test_debug_regex_pattern() {
let input = "[Skip to Main](#main)\n\nArticle Title\n==========";
let result = remove_accessibility_links(input);
assert!(!result.contains("Skip to Main"));
assert_eq!(result, "Article Title\n==========");
}
/// End-to-end check of `html_to_markdown` on a page that combines skip links,
/// a `srcset`-only image, a multi-line link and a "Back to Top" footer link.
///
/// Asserts that the accessibility links and the smaller `srcset` candidates are
/// absent, and that the heading, body text and largest image candidate remain.
#[test]
fn test_full_pipeline_with_all_improvements() {
    let html = r##"
<html>
<body>
<nav><a href="#main">Skip to content</a></nav>
<main>
<h1>Test Page</h1>
<p>Regular content here.</p>
<img srcset="small.jpg 300w, medium.jpg 600w, large.jpg 1200w" alt="Test Image">
<p>More content with <a href="#section">multi-line
link text</a>.</p>
</main>
<footer><a href="#top">Back to Top</a></footer>
</body>
</html>
"##;
    let result = html_to_markdown(html, "https://example.com", false).unwrap();
    // `str::contains("")` is true for every string, so assert on a concrete
    // substring: the widest srcset candidate is the one expected to survive.
    assert!(
        result.contains("large.jpg"),
        "Expected the largest srcset candidate (large.jpg) in output, got: {}",
        result
    );
    assert!(!result.contains("Skip to content"));
    assert!(!result.contains("Back to Top"));
    assert!(!result.contains("small.jpg"));
    assert!(!result.contains("medium.jpg"));
    assert!(result.contains("Test Page"));
    assert!(result.contains("Regular content"));
}
/// Integration check that `html_to_markdown` emits the highest-resolution
/// `srcset` candidate for width (`w`) and density (`x`) descriptors, and still
/// handles a plain `src` image.
#[test]
fn test_srcset_resolution_integration() {
    let html = r#"
<img srcset="img-400.jpg 400w, img-800.jpg 800w, img-1600.jpg 1600w" alt="Responsive">
<img srcset="icon@1x.png 1x, icon@2x.png 2x, icon@3x.png 3x" alt="Retina">
<img src="regular.jpg" alt="Normal">
"#;
    let result = html_to_markdown(html, "https://cdn.example.com", false).unwrap();
    eprintln!("Result markdown:\n{}", result);
    // Each check must name a concrete file: an `|| contains("")` alternative is
    // always true and would make the assertion impossible to fail.
    assert!(result.contains("img-1600.jpg"), "Expected to find img-1600.jpg in output");
    assert!(result.contains("icon@3x.png"), "Expected to find icon@3x.png in output");
    assert!(result.contains("regular.jpg"), "Expected to find regular.jpg in output");
}
/// Integration check that an anchor whose text spans several lines still
/// produces bracketed link text and the expected target in the Markdown output.
#[test]
fn test_multiline_link_escaping_integration() {
    let markup = r#"
<a href="https://example.com">This is a
multi-line
link</a>
"#;
    let markdown = html_to_markdown(markup, "https://example.com", false).unwrap();
    eprintln!("Multiline link result:\n{}", markdown);
    assert!(markdown.contains("["));
    assert!(markdown.contains("]"));
    assert!(markdown.contains("(https://example.com)"));
}
/// Integration check that skip links in `<nav>` and a "Back to Top" link in
/// `<footer>` are absent from the converted output while the article heading
/// and body text are present.
#[test]
fn test_accessibility_link_removal_integration() {
    let page = r##"
<nav>
<a href="#content">Skip to Content</a>
<a href="#main">Skip to Main</a>
</nav>
<main id="content">
<h1>Article Title</h1>
<p>Article content.</p>
<a href="https://example.com">Normal Link</a>
</main>
<footer>
<a href="#top">Back to Top</a>
</footer>
"##;
    let markdown = html_to_markdown(page, "https://example.com", false).unwrap();

    // Accessibility helpers must be gone.
    assert!(!markdown.contains("Skip to Content"));
    assert!(!markdown.contains("Skip to Main"));
    assert!(!markdown.contains("Back to Top"));

    // Real content must remain.
    assert!(markdown.contains("Article Title"));
    assert!(markdown.contains("Article content"));
}
/// Checks that a setext heading underlined with `=` comes out as an ATX `# ` heading.
#[test]
fn test_setext_h1_to_atx() {
    let output = clean_markdown("Title\n=====\n\nContent");
    assert!(output.contains("# Title"), "Expected ATX h1, got: {}", output);
    assert!(!output.contains("====="));
}
/// Checks that a setext heading underlined with `-` comes out as an ATX `## ` heading.
#[test]
fn test_setext_h2_to_atx() {
    let output = clean_markdown("Subtitle\n--------\n\nContent");
    assert!(output.contains("## Subtitle"), "Expected ATX h2, got: {}", output);
    assert!(!output.contains("--------"));
}
/// Checks that a heading already written in ATX form is still present after cleaning.
#[test]
fn test_setext_preserves_existing_atx() {
    let output = clean_markdown("# Already ATX\n\nContent");
    assert!(output.contains("# Already ATX"));
}
/// Checks that several setext headings of mixed levels in one document are all converted.
#[test]
fn test_setext_multiple_headings() {
    let document = "First\n=====\n\nSecond\n------\n\nThird\n=====";
    let output = clean_markdown(document);
    assert!(output.contains("# First"));
    assert!(output.contains("## Second"));
    assert!(output.contains("# Third"));
}
/// Checks that an inline base64 data-URI image does not survive `clean_markdown`.
///
/// The input must actually contain a data-URI image; an empty input would only
/// prove that empty Markdown stays empty.
#[test]
fn test_base64_image_replacement() {
    let md = "![logo](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJ)";
    let cleaned = clean_markdown(md);
    assert!(!cleaned.contains("base64"), "data URI marker should be removed, got: {}", cleaned);
    assert!(
        !cleaned.contains("iVBORw0KGgo"),
        "base64 payload should be removed, got: {}",
        cleaned
    );
}
/// Checks that an image pointing at an ordinary URL is not affected by the
/// base64 image clean-up.
///
/// The assertion names the concrete URL because `str::contains("")` is true
/// for every string and would never fail.
#[test]
fn test_base64_image_preserves_normal_images() {
    let md = "![photo](https://example.com/photo.jpg)";
    let cleaned = clean_markdown(md);
    assert!(
        cleaned.contains("https://example.com/photo.jpg"),
        "Normal image URL should be preserved, got: {}",
        cleaned
    );
}
/// Checks a line containing both a base64 data-URI image and a normal image:
/// the data URI must be gone and the normal image URL must remain.
///
/// Both images are spelled out in the input so that the assertions exercise the
/// clean-up; `contains("")` checks would pass regardless of the output.
#[test]
fn test_base64_image_mixed() {
    let md = "![inline](data:image/png;base64,iVBORw0KGgoAAAANSUhEUg) and ![remote](https://example.com/remote.png)";
    let cleaned = clean_markdown(md);
    assert!(
        cleaned.contains("https://example.com/remote.png"),
        "Normal image URL should be preserved, got: {}",
        cleaned
    );
    assert!(
        !cleaned.contains("iVBORw0KGgo"),
        "base64 payload should be removed, got: {}",
        cleaned
    );
    assert!(!cleaned.contains("base64"));
}
/// Checks that a JSON object passed to `html_to_markdown` is rendered under a
/// "JSON Response" heading inside a fenced `json` code block.
#[test]
fn test_json_object_detection() {
    let payload = r#"{"name": "test", "value": 42}"#;
    let rendered = html_to_markdown(payload, "https://example.com", false).unwrap();
    assert!(rendered.contains("# JSON Response"), "Expected heading, got: {}", rendered);
    assert!(rendered.contains("```json\n"));
    assert!(rendered.ends_with("\n```"));
    assert!(rendered.contains(r#""name": "test""#));
}
/// Checks that a top-level JSON array is also rendered inside a fenced `json` block.
#[test]
fn test_json_array_detection() {
    let payload = r#"[{"id": 1}, {"id": 2}]"#;
    let rendered = html_to_markdown(payload, "https://example.com", false).unwrap();
    assert!(rendered.contains("```json\n"));
    assert!(rendered.contains(r#""id": 1"#));
}
/// Checks that ordinary HTML is not treated as JSON: the output must not start
/// with a `json` fence and must contain the paragraph text.
#[test]
fn test_html_not_detected_as_json() {
    let rendered = html_to_markdown(
        "<html><body><p>Hello</p></body></html>",
        "https://example.com",
        false,
    )
    .unwrap();
    assert!(!rendered.starts_with("```json"));
    assert!(rendered.contains("Hello"));
}
/// Checks that with `only_main = true` and an effectively empty `<main>`, the
/// text that lives outside `<main>` still appears in the output.
#[test]
fn test_empty_result_fallback() {
    let page = r#"
<html>
<body>
<main><span></span></main>
<div>Actual content is here with enough text to be useful for AI agents.</div>
</body>
</html>
"#;
    let markdown = html_to_markdown(page, "https://example.com", true).unwrap();
    assert!(markdown.contains("Actual content is here"));
}
/// Checks that `extract_main_content_html` drops the contents of `.modal` and
/// `.overlay` containers while keeping the page heading and paragraphs.
#[test]
fn test_modal_noise_selectors_present() {
    let page = r#"
<html>
<body>
<div class="modal">
<h2>Sign up now!</h2>
<form>
<input type="email" placeholder="Email">
<button>Subscribe</button>
</form>
</div>
<div class="overlay" style="position:fixed">
<p>Overlay content</p>
</div>
<h1>Main Page Title</h1>
<p>This is the main content of the page that should be preserved.</p>
<p>More content paragraphs here.</p>
</body>
</html>
"#;
    let extracted = extract_main_content_html(page).unwrap();

    // Noise containers are removed.
    assert!(!extracted.contains("Sign up now"), "Modal content should be removed");
    assert!(!extracted.contains("Overlay content"), "Overlay content should be removed");

    // Real page content is retained.
    assert!(extracted.contains("Main Page Title"));
    assert!(extracted.contains("main content of the page"));
}
/// Checks that backslash-escaped HTML tags are removed from Markdown while the
/// surrounding words are kept.
#[test]
fn test_escaped_html_tag_removal() {
    let output = clean_markdown(r#"Content \<style\> \</style\> more text \</a\> end"#);
    assert!(!output.contains(r"\<style\>"), "Escaped style tag should be removed");
    assert!(!output.contains(r"\</a\>"), "Escaped closing tag should be removed");
    assert!(output.contains("Content"));
    assert!(output.contains("more text"));
    assert!(output.contains("end"));
}
/// Checks that a backslash-escaped HTML comment is removed and the text on
/// either side of it is kept.
#[test]
fn test_escaped_html_comment_removal() {
    let output = clean_markdown(r#"Before \<!-- comment --\> After"#);
    assert!(!output.contains("comment"), "Escaped comment should be removed");
    assert!(output.contains("Before"));
    assert!(output.contains("After"));
}
/// Checks that plain comparison operators in prose are not mistaken for tags.
#[test]
fn test_escaped_tags_preserve_normal_content() {
    let output = clean_markdown("The value is a < b and 5 > 3");
    assert!(output.contains("a < b"), "Normal < should be preserved: {}", output);
}
/// Checks that HTML inside a fenced code block is left intact by
/// `clean_html_from_markdown`, and that the text around the fence is kept.
#[test]
fn test_code_fence_protection() {
    let document = "Some text\n\n```html\n<div class=\"container\">\n <p>Hello</p>\n</div>\n```\n\nMore text";
    let output = clean_html_from_markdown(document);
    assert!(output.contains("<div class=\"container\">"), "HTML in code fence should be preserved: {}", output);
    assert!(output.contains("<p>Hello</p>"), "HTML tags in code fence should be preserved: {}", output);
    assert!(output.contains("Some text"));
    assert!(output.contains("More text"));
}
/// Checks that with two fenced blocks and HTML between them, the fenced HTML is
/// kept verbatim while the tags outside the fences are stripped (their text remains).
#[test]
fn test_code_fence_protection_multiple_blocks() {
    let document = "Text\n\n```html\n<strong>bold</strong>\n```\n\nMiddle <div>removed</div>\n\n```js\nconst x = '<span>test</span>';\n```\n\nEnd";
    let output = clean_html_from_markdown(document);
    assert!(output.contains("<strong>bold</strong>"), "HTML in first code fence preserved");
    assert!(!output.contains("<div>removed</div>"), "HTML outside code fence should be stripped");
    assert!(output.contains("removed"));
    assert!(output.contains("<span>test</span>"), "HTML in second code fence preserved");
}
/// Checks that both `<strong>` and `<b>` become `**…**`.
#[test]
fn test_inline_bold_conversion() {
    let output = clean_html_from_markdown("<strong>important</strong> text <b>also bold</b>");
    assert!(output.contains("**important**"), "Expected **important**, got: {}", output);
    assert!(output.contains("**also bold**"), "Expected **also bold**, got: {}", output);
}
/// Checks that both `<em>` and `<i>` become `_…_`.
#[test]
fn test_inline_italic_conversion() {
    let output = clean_html_from_markdown("<em>emphasized</em> text <i>also italic</i>");
    assert!(output.contains("_emphasized_"), "Expected _emphasized_, got: {}", output);
    assert!(output.contains("_also italic_"), "Expected _also italic_, got: {}", output);
}
/// Checks that an inline `<code>` element becomes a backtick code span.
#[test]
fn test_inline_code_conversion() {
    let output = clean_html_from_markdown("Use <code>console.log()</code> for debugging");
    assert!(output.contains("`console.log()`"), "Expected `console.log()`, got: {}", output);
}
/// Checks that `<pre><code class="language-…">` yields a fence tagged with that
/// language and keeps the code text.
#[test]
fn test_pre_code_language_detection() {
    let snippet = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;
    let output = clean_html_from_markdown(snippet);
    assert!(output.contains("```rust"), "Expected ```rust, got: {}", output);
    assert!(output.contains("fn main() {}"), "Expected code content");
}
/// Checks that the `lang-` class prefix is accepted for fence language detection.
#[test]
fn test_pre_code_lang_prefix() {
    let snippet = r#"<pre><code class="lang-python">print("hello")</code></pre>"#;
    let output = clean_html_from_markdown(snippet);
    assert!(output.contains("```python"), "Expected ```python, got: {}", output);
}
/// Checks that the `highlight-` class prefix is accepted for fence language detection.
#[test]
fn test_pre_code_highlight_prefix() {
    let snippet = r#"<pre><code class="highlight-javascript">const x = 1;</code></pre>"#;
    let output = clean_html_from_markdown(snippet);
    assert!(output.contains("```javascript"), "Expected ```javascript, got: {}", output);
}
/// Checks that a leftover `<a href>` element is rewritten as a Markdown link.
#[test]
fn test_anchor_to_markdown_link() {
    let snippet = r#"Visit <a href="https://example.com">Example</a> for details."#;
    let output = clean_html_from_markdown(snippet);
    assert!(output.contains("[Example](https://example.com)"), "Expected markdown link, got: {}", output);
}
/// Checks that an anchor with a `javascript:` href keeps its text but loses the href.
#[test]
fn test_anchor_javascript_href_stripped() {
    let output = clean_html_from_markdown(r#"<a href="javascript:void(0)">Click me</a>"#);
    assert!(output.contains("Click me"));
    assert!(!output.contains("javascript:"));
}
/// Checks that root-relative `href` and `src` values are resolved against the base URL.
#[test]
fn test_preprocess_resolves_relative_urls() {
    let rewritten = preprocess_html_for_conversion(
        r#"<a href="/about">About</a> <img src="/logo.png">"#,
        "https://example.com",
    );
    assert!(rewritten.contains("https://example.com/about"), "Expected absolute href, got: {}", rewritten);
    assert!(rewritten.contains("https://example.com/logo.png"), "Expected absolute src, got: {}", rewritten);
}
/// Checks that an already-absolute link to another host is still present after preprocessing.
#[test]
fn test_preprocess_preserves_absolute_urls() {
    let rewritten = preprocess_html_for_conversion(
        r#"<a href="https://other.com/page">Link</a>"#,
        "https://example.com",
    );
    assert!(rewritten.contains("https://other.com/page"));
}
/// Checks that a line-number gutter cell inside `<pre>` is removed while the code cell is kept.
#[test]
fn test_preprocess_strips_gutter_elements() {
    let snippet = r#"<pre><td class="gutter"><span>1</span></td><td class="code">let x = 1;</td></pre>"#;
    let rewritten = preprocess_html_for_conversion(snippet, "https://example.com");
    assert!(!rewritten.contains("gutter"), "Gutter should be stripped, got: {}", rewritten);
    assert!(rewritten.contains("let x = 1;"));
}
/// Checks that stand-alone UI noise lines ("Loading...", "Sponsored",
/// "Notifications") are removed while the heading and product text are kept.
#[test]
fn test_ui_noise_loading_sponsored() {
    let document = "# Products\n\nLoading...\n\nSponsored\n\nSome product here\n\nNotifications";
    let output = clean_markdown(document);

    // Noise lines are gone.
    assert!(!output.contains("Loading..."));
    assert!(!output.contains("Sponsored"));
    assert!(!output.contains("Notifications"));

    // Content is retained.
    assert!(output.contains("# Products"));
    assert!(output.contains("Some product here"));
}
/// Checks that a copyright footer line is removed without disturbing the content around it.
#[test]
fn test_copyright_footer_removal() {
    let document = "# Page\n\nContent here\n\nCopyright © 2024 Acme Inc. All Rights Reserved.\n\nMore content";
    let output = clean_markdown(document);
    assert!(!output.contains("Copyright ©"));
    assert!(output.contains("Content here"));
    assert!(output.contains("More content"));
}
/// Checks that padding spaces inside link text brackets are trimmed.
#[test]
fn test_link_whitespace_normalization() {
    let output = clean_markdown("[ Apple ](https://example.com)");
    assert!(output.contains("[Apple](https://example.com)"), "Got: {}", output);
}
/// Checks that link text consisting of one word repeated twice is collapsed to a single word.
#[test]
fn test_link_text_deduplication() {
    let output = clean_markdown("[Apple Apple](https://example.com)");
    assert!(output.contains("[Apple](https://example.com)"), "Got: {}", output);
}
/// Checks that a repeated multi-word phrase in link text is collapsed to one occurrence.
#[test]
fn test_link_text_dedup_multiword() {
    let output = clean_markdown("[New York New York](https://example.com)");
    assert!(output.contains("[New York](https://example.com)"), "Got: {}", output);
}
/// Checks that link text made of two different words is left unchanged.
#[test]
fn test_link_text_no_false_dedup() {
    let output = clean_markdown("[Apple Samsung](https://example.com)");
    assert!(output.contains("[Apple Samsung](https://example.com)"), "Got: {}", output);
}
/// Checks that four identical list items in a row are reduced to at most two occurrences.
#[test]
fn test_repeated_list_items_collapsed() {
    let document = "* Product info page\n\n* Product info page\n\n* Product info page\n\n* Product info page\n\nOther content";
    let output = clean_markdown(document);
    let occurrences = output.matches("Product info page").count();
    assert!(occurrences <= 2, "Expected <= 2 occurrences but got {}: {}", occurrences, output);
}
}