1use crate::error::Result;
2use regex::Regex;
3use scraper::{Html, Selector};
4use std::sync::LazyLock;
5use url::Url;
6
/// Declares a lazily compiled, process-wide [`Regex`] static.
///
/// `cached_regex!(NAME, pattern)` expands to `static NAME: LazyLock<Regex>`, so the
/// pattern is compiled on first use and the compiled regex is reused afterwards.
/// A trailing comma after the pattern is accepted.
///
/// # Panics
///
/// The first access to the static panics if `pattern` is not a valid regular
/// expression. The panic message names the offending static so a bad pattern is
/// easy to locate.
macro_rules! cached_regex {
    ($name:ident, $pattern:expr $(,)?) => {
        static $name: LazyLock<Regex> = LazyLock::new(|| {
            Regex::new($pattern)
                .expect(concat!("invalid regex pattern for ", stringify!($name)))
        });
    };
}
12
// --- Whole-element strippers -------------------------------------------------
// Each pattern matches an opening tag through the *first* matching closing tag
// (non-greedy, case-insensitive, `.` also matches newlines).
cached_regex!(RE_SCRIPT, r"(?is)<script[^>]*?>.*?</script>");
cached_regex!(RE_STYLE, r"(?is)<style[^>]*?>.*?</style>");
cached_regex!(RE_NOSCRIPT, r"(?is)<noscript[^>]*?>.*?</noscript>");
cached_regex!(RE_SVG, r"(?is)<svg[^>]*?>.*?</svg>");
// NOTE(review): `<head[^>]*?>` also matches an opening `<header ...>` tag.
cached_regex!(RE_HEAD, r"(?is)<head[^>]*?>.*?</head>");
// HTML comments: `<!-- ... -->`.
cached_regex!(RE_COMMENT, r"(?is)<!--.*?-->");

// --- Markdown clean-up patterns ------------------------------------------------
// Setext headings: a text line (captured as group 1) followed by a line of at
// least three `=` (level 1) or `-` (level 2).
cached_regex!(RE_SETEXT_H1, r"(?m)^[ \t]*(.+)\n[ \t]*={3,}\s*$");
cached_regex!(RE_SETEXT_H2, r"(?m)^[ \t]*(.+)\n[ \t]*-{3,}\s*$");
// Backslash-escaped tags such as `\<div\>` or `\</span>`, confined to one line.
cached_regex!(RE_ESCAPED_TAG, r"\\</?[a-zA-Z!][^\n>]*?\\?>");
// A CSS `:root{--...}` custom-property block.
cached_regex!(RE_CSS_ROOT, r":root\{--[^}]+\}");
// Markdown image whose target is an inline base64 `data:image/...` URI; group 1
// is the `![alt]` part.
cached_regex!(RE_BASE64_IMG, r"(!\[[^\]]*\])\(data:image/[^;]+;base64,[^)]*\)");
// Markdown link with an empty (or whitespace-only) target: `[text]()`.
cached_regex!(RE_EMPTY_LINK, r"\[([^\]]*)\]\(\s*\)");
// Three or more consecutive lines that consist only of a `*` bullet.
cached_regex!(RE_EMPTY_LIST_RUN, r"(?m)(?:^\s*\*\s*\n){3,}");
// Two or more blank (or whitespace-only) lines in a row.
cached_regex!(RE_COLLAPSE_NEWLINES, r"\n\s*\n\s*\n+");

// --- Leftover HTML inside Markdown -----------------------------------------------
// An `<img ...>` tag with at least one attribute, plus extractors for its quoted
// `src` (group 1, non-empty) and `alt` (group 1, may be empty) values. These
// three patterns are case-sensitive.
cached_regex!(RE_IMG_TAG, r#"<img\s[^>]*?>"#);
cached_regex!(RE_IMG_SRC, r#"src\s*=\s*["']([^"']+)["']"#);
cached_regex!(RE_IMG_ALT, r#"alt\s*=\s*["']([^"']*?)["']"#);
// Any opening or closing tag, with or without attributes.
cached_regex!(RE_CATCHALL_TAG, r"</?[a-zA-Z][a-zA-Z0-9]*(?:\s[^>]*)?>");
// Three or more consecutive newline characters.
cached_regex!(RE_MULTI_NEWLINE, r"\n{3,}");

// `<a ... href="...">text</a>`: group 1 is the href value, group 2 the inner content.
cached_regex!(RE_ANCHOR_TAG, r#"(?is)<a\s[^>]*?href\s*=\s*["']([^"']*)["'][^>]*?>(.*?)</a>"#);

// `<pre><code ATTRS>BODY</code></pre>`: group 1 is the `<code>` attribute text,
// group 2 the code body.
cached_regex!(RE_PRE_CODE, r#"(?is)<pre[^>]*?>\s*<code([^>]*)>(.*?)</code>\s*</pre>"#);
// A `<pre>` block without the inner `<code>` requirement; group 1 is the body.
cached_regex!(RE_PRE_BARE, r#"(?is)<pre[^>]*?>(.*?)</pre>"#);
// Language hint in a class name, e.g. `language-rust`, `lang-js`, `highlight-py`;
// group 1 is the language token.
cached_regex!(RE_LANG_CLASS, r#"(?i)\b(?:language|lang|highlight)-([a-zA-Z0-9_+-]+)"#);

// A link whose text is itself an ATX heading: `[## Title](url)`.
// Groups: 1 = the `#` run, 2 = the heading text, 3 = the link target.
cached_regex!(RE_HEADING_IN_LINK, r"\[\s*(#{1,6})\s+(.+?)\s*#{0,6}\s*\]\(([^)]+)\)");
// A link with empty text, `[](url)`, that is not an image (not preceded by `!`);
// group 1 is the preceding character (or empty at the start of the text).
cached_regex!(RE_EMPTY_TEXT_LINK, r"(^|[^!])\[\]\([^)]+\)");
// Markdown images whose URL contains one of a fixed set of GIF file names
// (`s_1x2.gif`, `pixel.gif`, `spacer.gif`, `blank.gif`, `clear.gif`).
cached_regex!(RE_TRACKING_PIXEL, r"!\[[^\]]*\]\([^)]*(?:s_1x2\.gif|pixel\.gif|spacer\.gif|blank\.gif|clear\.gif)[^)]*\)");
57
58pub fn html_to_markdown(html: &str, base_url: &str, only_main: bool) -> Result<String> {
62 let trimmed = html.trim();
64 if (trimmed.starts_with('{') && trimmed.ends_with('}'))
65 || (trimmed.starts_with('[') && trimmed.ends_with(']'))
66 {
67 if serde_json::from_str::<serde_json::Value>(trimmed).is_ok() {
69 return Ok(format!("# JSON Response\n\n```json\n{}\n```", trimmed));
70 }
71 }
72
73 if !trimmed.is_empty() && !trimmed.contains('<') {
76 return Ok(trimmed.to_string());
77 }
78
79 let content = if only_main {
80 extract_main_content_html(html)?
81 } else {
82 html.to_string()
83 };
84
85 let content = crate::format::image_processing::rescue_noscript_images(&content);
87
88 let content = strip_non_content_tags(&content);
90
91 let content = crate::format::image_processing::resolve_picture_elements(&content);
93
94 let content = preprocess_html_for_conversion(&content, base_url);
96
97 let content = crate::format::image_processing::resolve_srcsets(&content);
99
100 let content = crate::format::image_processing::resolve_lazy_images(&content);
102
103 let content = crate::format::image_processing::resolve_video_posters(&content);
105
106 let content = strip_layout_tables(&content);
108
109 let content = strip_excessive_tables(&content);
111
112 let markdown = safe_parse_html(&content);
114
115 let markdown = if markdown.len() > content.len() * 3 && content.len() > 10000 {
119 static RE_INDIVIDUAL_TABLE: LazyLock<Regex> = LazyLock::new(|| {
120 Regex::new(r"(?is)<table[^>]*>(.*?)</table>").unwrap()
121 });
122 let selective = RE_INDIVIDUAL_TABLE.replace_all(&content, |caps: ®ex::Captures| {
123 let full_match = &caps[0];
124 let table_attrs = full_match.split('>').next().unwrap_or("");
125 let is_important = table_attrs.contains("infobox")
127 || table_attrs.contains("wikitable")
128 || table_attrs.contains("data-table");
129 let row_count = full_match.matches("<tr").count();
130 if is_important || row_count <= 30 {
131 full_match.to_string()
132 } else {
133 "\n".to_string()
134 }
135 }).to_string();
136 safe_parse_html(&selective)
137 } else {
138 markdown
139 };
140
141 let markdown = compress_markdown_tables(&markdown);
143
144 const MAX_MARKDOWN_BYTES: usize = 500_000;
146 let markdown = if markdown.len() > MAX_MARKDOWN_BYTES {
147 let truncated = &markdown[..MAX_MARKDOWN_BYTES];
148 let cutoff = truncated.rfind('\n').unwrap_or(MAX_MARKDOWN_BYTES);
149 format!(
150 "{}\n\n[Content truncated: {} chars total]",
151 &markdown[..cutoff],
152 markdown.len()
153 )
154 } else {
155 markdown
156 };
157
158 let cleaned = clean_markdown(&markdown);
160
161 let escaped = escape_multiline_links(&cleaned);
163
164 let no_skip_links = remove_accessibility_links(&escaped);
166
167 let collapsed = RE_COLLAPSE_NEWLINES
168 .replace_all(&no_skip_links, "\n\n")
169 .to_string();
170
171 let collapsed = if !collapsed.lines().any(|l| l.trim_start().starts_with('#')) {
173 if let Some(title) = extract_title_from_html(html) {
174 if !title.is_empty() {
175 format!("# {}\n\n{}", title.trim(), collapsed)
176 } else {
177 collapsed
178 }
179 } else {
180 collapsed
181 }
182 } else {
183 collapsed
184 };
185
186 if only_main && collapsed.trim().len() < 50 {
188 return html_to_markdown(html, base_url, false);
189 }
190
191 let portable = convert_urls_to_absolute(&collapsed, base_url)?;
193
194 Ok(portable)
195}
196
197fn extract_title_from_html(html: &str) -> Option<String> {
200 static RE_TITLE: LazyLock<Regex> = LazyLock::new(|| {
201 Regex::new(r"(?is)<title[^>]*>(.*?)</title>").unwrap()
202 });
203 RE_TITLE.captures(html).map(|caps| {
204 let raw = caps[1].trim().to_string();
205 raw.replace("&", "&")
207 .replace("<", "<")
208 .replace(">", ">")
209 .replace(""", "\"")
210 .replace("'", "'")
211 .replace(" ", " ")
212 })
213}
214
215pub fn extract_main_content_html(html: &str) -> Result<String> {
217 let document = Html::parse_document(html);
218
219 let main_selectors = [
222 "#readme", ".markdown-body", "#mw-content-text",
226 ".mw-parser-output",
227 "main article",
229 "[role='main'] article",
230 ".docs-content",
232 ".doc-content",
233 "[data-docs-content]",
234 ".prose", ".article-body",
236 "main",
238 "article",
239 "[role='main']",
240 ".main-content",
241 "#main-content",
242 ".content",
243 "#content",
244 ".post-content",
245 ".entry-content",
246 ".article-content",
247 ".page-content",
248 ".body-content",
249 "#inside",
251 ".stories",
252 ".itemlist",
253 ];
254
255 let html_len = html.len();
256 for selector_str in &main_selectors {
257 if let Ok(selector) = Selector::parse(selector_str) {
258 if let Some(element) = document.select(&selector).next() {
259 let content = element.html();
260 let min_size = html_len / 10;
265 if content.len() >= min_size {
266 return Ok(remove_nested_nav(&content));
268 }
269 }
271 }
272 }
273
274 let mut cleaned_html = html.to_string();
276
277 static REMOVE_SELECTORS_PARSED: LazyLock<Vec<Selector>> = LazyLock::new(|| {
278 [
279 ".Layout-sidebar", ".file-navigation", ".BorderGrid",
281 ".Layout-sidebar-left", ".Layout-sidebar-right", ".repository-content",
282 ".file-tree", ".js-file-line-container", ".blob-wrapper",
283 ".contributors-wrapper", ".discussion-sidebar",
284 "nav", "header", "footer", "aside",
286 ".navigation", ".sidebar", ".menu", ".header", ".footer",
287 "#header", "#footer", "#navigation",
288 ".docs-sidebar", ".doc-sidebar", ".sidebar-nav",
290 ".toc-sidebar", ".page-sidebar", ".left-sidebar",
291 ".side-nav", ".sidenav",
292 "#sidebar", "#toc",
293 "[role='navigation']", "[role='complementary']",
295 ".toc", ".table-of-contents",
297 ".skip-link", ".skip-to-content",
299 ".mw-editsection", "#mw-panel", "#mw-head", ".navbox", ".catlinks", ".mw-indicators", ".sistersitebox", "#p-lang-btn", ".vector-page-toolbar", ".vector-column-start", ".cookie-banner", ".cookie-consent", ".cookie-notice",
312 "#cookie-banner", "#cookie-consent",
313 ".share-buttons", ".social-share", ".social-links",
315 ".ad", ".advertisement", ".ads",
317 ".breadcrumb", ".breadcrumbs",
319 ".search-form", ".search-box",
320 ".modal", ".popup", "#modal", ".overlay",
322 ".widget", "#widget",
324 ".lang-selector", ".language", "#language-selector",
326 ".top-bar", ".bottom-bar",
328 ".gh-header", "#gh-header",
329 "script", "style", "noscript", "svg",
331 ]
332 .iter()
333 .filter_map(|s| Selector::parse(s).ok())
334 .collect()
335 });
336
337 let doc = Html::parse_document(&cleaned_html);
338 let mut to_remove = String::new();
339
340 for selector in REMOVE_SELECTORS_PARSED.iter() {
341 for element in doc.select(selector) {
342 to_remove.push_str(&element.html());
343 }
344 }
345
346 let attr_selectors = [
348 "[class*='cookie']", "[aria-label='breadcrumb']",
349 "[class*='cart']", "[class*='wishlist']", "[class*='account-']",
350 "[class*='sponsored']", "[class*='banner']",
351 "[class*='notification']", "[class*='alert']",
352 ];
353 for selector_str in &attr_selectors {
354 if let Ok(selector) = Selector::parse(selector_str) {
355 for element in doc.select(&selector) {
356 to_remove.push_str(&element.html());
357 }
358 }
359 }
360
361 for line in to_remove.lines() {
363 if !line.trim().is_empty() {
364 cleaned_html = cleaned_html.replace(line, "");
365 }
366 }
367
368 Ok(if cleaned_html.trim().is_empty() {
369 html.to_string()
370 } else {
371 cleaned_html
372 })
373}
374
375fn remove_nested_nav(html: &str) -> String {
379 static NESTED_NAV_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
380 vec![
381 Regex::new(r"(?is)<nav[^>]*>.*?</nav>").unwrap(),
383 Regex::new(r"(?is)<aside[^>]*>.*?</aside>").unwrap(),
385 Regex::new(r#"(?is)<div[^>]*class\s*=\s*["'][^"']*\b(?:sidebar|side-nav|sidenav|sidebar-nav|toc-sidebar|page-sidebar)\b[^"']*["'][^>]*>.*?</div>"#).unwrap(),
387 Regex::new(r#"(?is)<\w+[^>]*role\s*=\s*["'](?:navigation|complementary)["'][^>]*>.*?</\w+>"#).unwrap(),
389 ]
390 });
391
392 let mut result = html.to_string();
393 for re in NESTED_NAV_PATTERNS.iter() {
394 result = re.replace_all(&result, "").to_string();
395 }
396 result
397}
398
399fn safe_parse_html(html: &str) -> String {
408 if html.len() < 500_000 {
410 return html2md::parse_html(html);
411 }
412
413 let html_owned = html.to_string();
415 let result = std::thread::Builder::new()
416 .name("html2md-parser".to_string())
417 .stack_size(32 * 1024 * 1024) .spawn(move || html2md::parse_html(&html_owned))
419 .and_then(|handle| {
420 handle.join().map_err(|_| {
421 std::io::Error::other("html2md thread panicked")
422 })
423 });
424
425 match result {
426 Ok(markdown) => markdown,
427 Err(e) => {
428 tracing::warn!("html2md failed with large HTML ({}KB): {}", html.len() / 1024, e);
429 let doc = Html::parse_document(html);
431 doc.root_element()
432 .text()
433 .collect::<Vec<_>>()
434 .join(" ")
435 }
436 }
437}
438
439fn preprocess_html_for_conversion(html: &str, base_url: &str) -> String {
449 let base = match Url::parse(base_url) {
450 Ok(u) => u,
451 Err(_) => return html.to_string(),
452 };
453
454 static RE_HREF: LazyLock<Regex> = LazyLock::new(|| {
456 Regex::new(r#"(<a\s[^>]*?href\s*=\s*["'])([^"']+)(["'])"#).unwrap()
457 });
458 let result = RE_HREF.replace_all(html, |caps: ®ex::Captures| {
459 let prefix = &caps[1];
460 let href = &caps[2];
461 let suffix = &caps[3];
462 if href.starts_with("http://") || href.starts_with("https://") || href.starts_with("data:") || href.starts_with("javascript:") {
463 caps[0].to_string()
464 } else if href.starts_with('#') {
465 let base_str = base.as_str().split('#').next().unwrap_or(base.as_str());
467 format!("{}{}{}{}", prefix, base_str, href, suffix)
468 } else if href.starts_with("//") {
469 format!("{}https:{}{}", prefix, href, suffix)
470 } else {
471 match base.join(href) {
472 Ok(abs) => format!("{}{}{}", prefix, abs, suffix),
473 Err(_) => caps[0].to_string(),
474 }
475 }
476 }).to_string();
477
478 static RE_IMG_SRC_ATTR: LazyLock<Regex> = LazyLock::new(|| {
480 Regex::new(r#"(<img\s[^>]*?src\s*=\s*["'])([^"']+)(["'])"#).unwrap()
481 });
482 let result = RE_IMG_SRC_ATTR.replace_all(&result, |caps: ®ex::Captures| {
483 let prefix = &caps[1];
484 let src = &caps[2];
485 let suffix = &caps[3];
486 if src.starts_with("http://") || src.starts_with("https://") || src.starts_with("data:") {
487 caps[0].to_string()
488 } else if src.starts_with("//") {
489 format!("{}https:{}{}", prefix, src, suffix)
490 } else {
491 match base.join(src) {
492 Ok(abs) => format!("{}{}{}", prefix, abs, suffix),
493 Err(_) => caps[0].to_string(),
494 }
495 }
496 }).to_string();
497
498 static RE_GUTTER: LazyLock<Regex> = LazyLock::new(|| {
500 Regex::new(r#"(?is)<(?:td|span|div)[^>]*class\s*=\s*["'][^"']*(?:gutter|line-number|linenumber|hljs-ln-numbers|blob-num)[^"']*["'][^>]*>.*?</(?:td|span|div)>"#).unwrap()
501 });
502 let result = RE_GUTTER.replace_all(&result, "").to_string();
503
504 result
505}
506
507fn strip_non_content_tags(html: &str) -> String {
511 let regexes: &[&Regex] = &[&RE_SCRIPT, &RE_STYLE, &RE_NOSCRIPT, &RE_SVG, &RE_HEAD, &RE_COMMENT];
512 let mut result = html.to_string();
513 for re in regexes {
514 result = re.replace_all(&result, "").to_string();
515 }
516 result
517}
518
519fn strip_layout_tables(html: &str) -> String {
520 let mut result = html.to_string();
524
525 static RE_LAYOUT_TBL: LazyLock<Regex> = LazyLock::new(|| {
529 Regex::new(r#"(?s)<table[^>]*(cellpadding|cellspacing|border=["']?0["']?)[^>]*>.*?</table>"#).unwrap()
530 });
531 let layout_table_regex = &*RE_LAYOUT_TBL;
532
533 loop {
536 let mut replacements = Vec::new();
537
538 for cap in layout_table_regex.find_iter(&result) {
539 let table_html = cap.as_str();
540
541 if table_html.contains("<th") || table_html.contains("<th>") {
543 continue;
544 }
545
546 let table_doc = Html::parse_fragment(table_html);
548 let text_content = table_doc
549 .root_element()
550 .text()
551 .collect::<Vec<_>>()
552 .join("\n");
553
554 replacements.push((
555 table_html.to_string(),
556 format!(
557 "<div class=\"extracted-from-layout-table\">\n{}\n</div>",
558 text_content
559 ),
560 ));
561 }
562
563 if replacements.is_empty() {
565 break;
566 }
567
568 for (old, new) in replacements {
570 result = result.replace(&old, &new);
571 }
572 }
573
574 result
575}
576
577fn strip_excessive_tables(html: &str) -> String {
582 if html.len() < 50_000 {
584 return html.to_string();
585 }
586
587 static RE_TABLE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
589 Regex::new(r"(?is)<table[^>]*>.*?</table>").unwrap()
590 });
591
592 let total_table_bytes: usize = RE_TABLE_BLOCK
593 .find_iter(html)
594 .map(|m| m.as_str().len())
595 .sum();
596
597 if total_table_bytes < html.len() / 2 {
599 return html.to_string();
600 }
601
602 RE_TABLE_BLOCK
604 .replace_all(html, |caps: ®ex::Captures| {
605 let table_html = &caps[0];
606 let table_attrs = table_html.split('>').next().unwrap_or("");
607
608 let is_data = table_attrs.contains("infobox")
610 || table_attrs.contains("wikitable")
611 || table_attrs.contains("data-table")
612 || table_attrs.contains("sortable");
613
614 let has_headers = table_html.contains("<th") || table_html.contains("<th>");
616
617 let row_count = table_html.matches("<tr").count();
618
619 if is_data || (has_headers && row_count <= 50) || row_count <= 15 {
620 table_html.to_string()
622 } else {
623 let doc = Html::parse_fragment(table_html);
625 let text: String = doc
626 .root_element()
627 .text()
628 .collect::<Vec<_>>()
629 .join(" ");
630 let trimmed = text.trim();
631 if trimmed.is_empty() {
632 "\n".to_string()
633 } else {
634 format!("\n{}\n", trimmed)
635 }
636 }
637 })
638 .to_string()
639}
640
/// Normalizes the padding of Markdown table rows.
///
/// Any line that, once trimmed, starts and ends with `|` is treated as a table
/// row:
/// * in a delimiter row (every non-empty cell consists solely of `-`, `:` and
///   spaces and contains at least one `-`) each cell is shortened to ` --- `,
///   ` :--- `, ` ---: ` or ` :---: ` according to its alignment colons;
/// * in every other row each non-empty cell is trimmed and padded with a single
///   space on each side, so a data cell holding just `-` or `:` keeps its
///   content instead of being turned into a delimiter;
/// * empty cells are emitted with no padding.
///
/// Other lines are copied unchanged. Lines are joined with `\n`, and a trailing
/// newline is present in the output only if the input had one.
///
/// A row made up entirely of dash cells is always treated as a delimiter row,
/// wherever it appears in the table.
fn compress_markdown_tables(markdown: &str) -> String {
    /// True for a delimiter cell such as `---`, `:--`, `--:` or `:-:`.
    fn is_delimiter_cell(cell: &str) -> bool {
        cell.contains('-') && cell.chars().all(|c| matches!(c, '-' | ':' | ' '))
    }

    /// Shortest delimiter that preserves the cell's alignment markers.
    fn compact_delimiter(cell: &str) -> &'static str {
        match (cell.starts_with(':'), cell.ends_with(':')) {
            (true, true) => " :---: ",
            (false, true) => " ---: ",
            (true, false) => " :--- ",
            (false, false) => " --- ",
        }
    }

    let mut result = String::with_capacity(markdown.len());
    for line in markdown.lines() {
        let trimmed = line.trim();
        if trimmed.starts_with('|') && trimmed.ends_with('|') {
            let cells: Vec<&str> = trimmed.split('|').map(str::trim).collect();
            // Decide per row, not per cell, whether this is the delimiter row.
            let is_delimiter_row = cells.iter().any(|cell| !cell.is_empty())
                && cells
                    .iter()
                    .filter(|cell| !cell.is_empty())
                    .all(|cell| is_delimiter_cell(cell));

            for (index, cell) in cells.iter().enumerate() {
                if index > 0 {
                    result.push('|');
                }
                if cell.is_empty() {
                    continue;
                }
                if is_delimiter_row {
                    result.push_str(compact_delimiter(cell));
                } else {
                    result.push(' ');
                    result.push_str(cell);
                    result.push(' ');
                }
            }
        } else {
            result.push_str(line);
        }
        result.push('\n');
    }

    // `lines()` drops the final newline; only keep one if the input had it.
    if !markdown.ends_with('\n') && result.ends_with('\n') {
        result.pop();
    }
    result
}
686
687fn clean_markdown(markdown: &str) -> String {
689 let cleaned = clean_html_from_markdown(markdown);
691
692 let cleaned = strip_invisible_unicode(&cleaned);
694
695 let cleaned = RE_SETEXT_H1.replace_all(&cleaned, "# $1").to_string();
697 let cleaned = RE_SETEXT_H2.replace_all(&cleaned, "## $1").to_string();
698
699 static RE_TRAILING_HASHES: LazyLock<Regex> =
701 LazyLock::new(|| Regex::new(r"(?m)^(#{1,6}\s+.+?)\s+#+\s*$").unwrap());
702 let cleaned = RE_TRAILING_HASHES.replace_all(&cleaned, "$1").to_string();
703
704 let lines: Vec<String> = cleaned.lines().map(|l| l.trim_end().to_string()).collect();
705
706 let mut result = Vec::new();
708 let mut blank_count = 0;
709
710 for line in lines.iter() {
711 if line.trim().is_empty() {
712 blank_count += 1;
713 if blank_count <= 2 {
714 result.push(line.clone());
715 }
716 } else {
717 blank_count = 0;
718 result.push(line.clone());
719 }
720 }
721
722 let joined = result.join("\n").trim().to_string();
724
725 let joined = RE_ESCAPED_TAG.replace_all(&joined, "").to_string();
726 let joined = joined.replace("\\ ", " ").replace("\\\\", "");
727 let joined = RE_CSS_ROOT.replace_all(&joined, "").to_string();
728 let joined = RE_BASE64_IMG.replace_all(&joined, "$1(data:image-removed)").to_string();
729 let joined = RE_EMPTY_LINK.replace_all(&joined, "").to_string();
730
731 let joined = RE_EMPTY_LIST_RUN.replace_all(&joined, "\n").to_string();
733
734 static UI_NOISE: LazyLock<Regex> = LazyLock::new(|| {
736 Regex::new(concat!(
737 r"(?m)^\s*(?:",
738 r"Ask about this section|Copy for LLM|View as Markdown|Copy as Markdown",
739 r"|Open (?:Markdown|in Claude)(?:\s*Ask Docs AI)?(?:\s*Open in Claude)?",
740 r"|Ask Docs AI\s*Open in Claude",
741 r"|Was this (?:section |page )?helpful\s*(?:to you)?\??",
742 r"|(?:Share|Tweet|Pin it|Email)",
743 r"|(?:Table of [Cc]ontents|In this article|On this page)",
744 r"|Show more|Read more|Load more|See all|Expand all|Collapse all",
745 r"|Scroll to top|Back to top",
746 r"|Primary navigation",
747 r"|Loading\.\.\.",
749 r"|Sponsored",
750 r"|Notifications",
751 r"|Expand (?:Cart|Watch List|My eBay)",
752 r"|Shop by category",
753 r"|All Categories",
754 r"|Toggle the table of contents",
756 r"|move to sidebar\s*hide",
757 r"|\d+\s+languages?",
758 r"|Edit links?",
759 r"|Edit this page on GitHub\s*",
761 r"|Was this page helpful\s*(?:to you)?\??\s*(?:Yes|No)?",
762 r"|Suggest (?:changes|edits?)",
763 r"|Report (?:an? )?(?:issue|bug)",
764 r")\s*$"
765 )).unwrap()
766 });
767 let cleaned = UI_NOISE.replace_all(&joined, "").to_string();
768
769 static RE_EDIT_LINKS: LazyLock<Regex> = LazyLock::new(|| {
771 Regex::new(r"\s*\[\[edit\]\([^)]*\)\]").unwrap()
772 });
773 let cleaned = RE_EDIT_LINKS.replace_all(&cleaned, "").to_string();
774
775 let cleaned = RE_HEADING_IN_LINK
777 .replace_all(&cleaned, "$1 [$2]($3)")
778 .to_string();
779
780 let cleaned = RE_EMPTY_TEXT_LINK.replace_all(&cleaned, "$1").to_string();
782
783 let cleaned = RE_TRACKING_PIXEL.replace_all(&cleaned, "").to_string();
785
786 static RE_LEAKED_JS: LazyLock<Regex> = LazyLock::new(|| {
790 Regex::new(r"(?m)^\s*(?:var|let|const)\s+\w+(?:\\?\w+)*\s*=.*$").unwrap()
791 });
792 let cleaned = RE_LEAKED_JS.replace_all(&cleaned, "").to_string();
793
794 static RE_JS_PROP_ASSIGN: LazyLock<Regex> = LazyLock::new(|| {
797 Regex::new(r#"(?m)^\s*\w+(?:\\?\w+)*(?:\.\w+(?:\\?\w+)*|\[['"][^'"]*['"]\])\s*=\s*.*;\s*$"#).unwrap()
798 });
799 let cleaned = RE_JS_PROP_ASSIGN.replace_all(&cleaned, "").to_string();
800
801 static RE_JS_FUNC_CALL: LazyLock<Regex> = LazyLock::new(|| {
803 Regex::new(r"(?m)^\s*\w+(?:\\?\w+)*(?:\.\w+(?:\\?\w+)*)*\([^)]*\)\s*;\s*$").unwrap()
804 });
805 let cleaned = RE_JS_FUNC_CALL.replace_all(&cleaned, "").to_string();
806
807 static RE_COPYRIGHT: LazyLock<Regex> = LazyLock::new(|| {
809 Regex::new(r"(?m)^\s*Copyright\s+©.*$").unwrap()
810 });
811 let cleaned = RE_COPYRIGHT.replace_all(&cleaned, "").to_string();
812
813 static RE_LINK_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
815 Regex::new(r"\[\s{2,}([^\]]*?)\s{2,}\]\(").unwrap()
816 });
817 let cleaned = RE_LINK_WHITESPACE.replace_all(&cleaned, "[$1](").to_string();
818
819 static RE_LINK_INNER_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
821 Regex::new(r"\[([^\]]*?)\s{2,}([^\]]*?)\]\(").unwrap()
822 });
823 let mut cleaned = cleaned;
825 for _ in 0..3 {
826 cleaned = RE_LINK_INNER_WHITESPACE.replace_all(&cleaned, "[$1 $2](").to_string();
827 }
828
829 static RE_LINK_TEXT: LazyLock<Regex> = LazyLock::new(|| {
831 Regex::new(r"\[([^\]]+)\]\(").unwrap()
832 });
833 let cleaned = RE_LINK_TEXT.replace_all(&cleaned, |caps: ®ex::Captures| {
834 let text = caps[1].trim();
835 let words: Vec<&str> = text.split_whitespace().collect();
836 let len = words.len();
837 if len >= 2 && len.is_multiple_of(2) {
839 let half = len / 2;
840 if words[..half] == words[half..] {
841 return format!("[{}](", words[..half].join(" "));
842 }
843 }
844 format!("[{}](", text)
845 }).to_string();
846
847 let cleaned = {
850 let lines: Vec<&str> = cleaned.lines().collect();
851 let mut result_lines: Vec<&str> = Vec::with_capacity(lines.len());
852 let mut prev_item: Option<&str> = None;
853 let mut repeat_count = 0u32;
854 for line in &lines {
855 let trimmed = line.trim();
856 if trimmed.starts_with("* ") || trimmed.starts_with("- ") {
857 if Some(trimmed) == prev_item {
858 repeat_count += 1;
859 if repeat_count < 2 {
860 result_lines.push(line);
861 }
862 } else {
864 prev_item = Some(trimmed);
865 repeat_count = 0;
866 result_lines.push(line);
867 }
868 } else {
869 if !trimmed.is_empty() {
870 prev_item = None;
871 repeat_count = 0;
872 }
873 result_lines.push(line);
874 }
875 }
876 result_lines.join("\n")
877 };
878
879 RE_COLLAPSE_NEWLINES.replace_all(&cleaned, "\n\n").to_string()
880}
881
/// Returns `text` with six invisible code points removed: U+200B, U+FEFF,
/// U+200C, U+200D, U+2060 and U+FFFE. All other characters are kept in order.
fn strip_invisible_unicode(text: &str) -> String {
    const INVISIBLE: [char; 6] = [
        '\u{200B}', '\u{FEFF}', '\u{200C}', '\u{200D}', '\u{2060}', '\u{FFFE}',
    ];
    text.chars()
        .filter(|ch| !INVISIBLE.contains(ch))
        .collect()
}
886
/// Decodes a fixed set of HTML entities in `text`.
///
/// Supported: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&#x27;`, `&apos;`,
/// `&nbsp;` (decoded to a regular space), `&ndash;`, `&mdash;`, `&hellip;`,
/// `&lsquo;`, `&rsquo;`, `&ldquo;`, `&rdquo;`, `&bull;`, `&middot;`, `&copy;`,
/// `&reg;` and `&trade;`. Names are matched case-sensitively.
///
/// The input is scanned once from left to right and decoded text is never
/// re-scanned, so `&amp;lt;` becomes the literal text `&lt;` rather than `<`.
/// Unknown entities and stray `&` characters are copied through unchanged.
fn decode_html_entities(text: &str) -> String {
    const ENTITIES: &[(&str, &str)] = &[
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&apos;", "'"),
        ("&nbsp;", " "),
        ("&ndash;", "\u{2013}"),
        ("&mdash;", "\u{2014}"),
        ("&hellip;", "\u{2026}"),
        ("&lsquo;", "\u{2018}"),
        ("&rsquo;", "\u{2019}"),
        ("&ldquo;", "\u{201C}"),
        ("&rdquo;", "\u{201D}"),
        ("&bull;", "\u{2022}"),
        ("&middot;", "\u{00B7}"),
        ("&copy;", "\u{00A9}"),
        ("&reg;", "\u{00AE}"),
        ("&trade;", "\u{2122}"),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp_at) = rest.find('&') {
        decoded.push_str(&rest[..amp_at]);
        let tail = &rest[amp_at..];
        match ENTITIES.iter().find(|(entity, _)| tail.starts_with(entity)) {
            Some((entity, replacement)) => {
                decoded.push_str(replacement);
                rest = &tail[entity.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand and move past it
                // ('&' is a single byte, so the slice stays on a boundary).
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
909
910
911fn clean_html_from_markdown(text: &str) -> String {
914 static RE_CODE_FENCE_LOCAL: LazyLock<Regex> =
917 LazyLock::new(|| Regex::new(r"(?s)```[^\n]*\n.*?```").unwrap());
918 static RE_INLINE_CODE_SPAN: LazyLock<Regex> =
920 LazyLock::new(|| Regex::new(r"`[^`\n]+?`").unwrap());
921 let mut code_blocks: Vec<String> = Vec::new();
922 let text = RE_CODE_FENCE_LOCAL
923 .replace_all(text, |caps: ®ex::Captures| {
924 let placeholder = format!("\x00CODE_FENCE_{}\x00", code_blocks.len());
925 code_blocks.push(caps[0].to_string());
926 placeholder
927 })
928 .to_string();
929 let mut inline_code_spans: Vec<String> = Vec::new();
931 let text = RE_INLINE_CODE_SPAN
932 .replace_all(&text, |caps: ®ex::Captures| {
933 let placeholder = format!("\x00INLINE_CODE_{}\x00", inline_code_spans.len());
934 inline_code_spans.push(caps[0].to_string());
935 placeholder
936 })
937 .to_string();
938
939 let mut result = RE_PRE_CODE
942 .replace_all(&text, |caps: ®ex::Captures| {
943 let attrs = caps.get(1).map_or("", |m| m.as_str());
944 let lang = RE_LANG_CLASS
945 .captures(attrs)
946 .and_then(|c| c.get(1))
947 .map_or("", |m| m.as_str());
948 let code_content = decode_html_entities(caps.get(2).map_or("", |m| m.as_str()));
949 let trimmed = code_content.trim();
950 if trimmed.is_empty() {
951 String::new()
952 } else {
953 format!("\n```{}\n{}\n```\n", lang, trimmed)
954 }
955 })
956 .to_string();
957
958 result = RE_PRE_BARE
960 .replace_all(&result, |caps: ®ex::Captures| {
961 let content = caps.get(1).map_or("", |m| m.as_str()).trim();
962 if content.is_empty() {
963 String::new()
964 } else {
965 format!("\n```\n{}\n```\n", decode_html_entities(content))
966 }
967 })
968 .to_string();
969
970 result = RE_ANCHOR_TAG
972 .replace_all(&result, |caps: ®ex::Captures| {
973 let href = &caps[1];
974 let link_text = caps[2].trim();
975 if link_text.is_empty() || href.is_empty() || href.starts_with("javascript:") {
976 link_text.to_string()
977 } else {
978 format!("[{}]({})", link_text, href)
979 }
980 })
981 .to_string();
982
983 result = RE_IMG_TAG
985 .replace_all(&result, |caps: ®ex::Captures| {
986 let img_tag = &caps[0];
987 let src = RE_IMG_SRC
988 .captures(img_tag)
989 .and_then(|c| c.get(1))
990 .map(|m| m.as_str())
991 .unwrap_or("");
992 let alt = RE_IMG_ALT
993 .captures(img_tag)
994 .and_then(|c| c.get(1))
995 .map(|m| m.as_str())
996 .unwrap_or("");
997 if !src.is_empty() {
998 format!("", alt, src)
999 } else {
1000 String::new()
1001 }
1002 })
1003 .to_string();
1004
1005 static RE_MD_LINK_WITH_TAG: LazyLock<Regex> = LazyLock::new(|| {
1009 Regex::new(r"\[([^\]]*<[a-zA-Z][a-zA-Z0-9]*[^]]*)\]\(([^)]+)\)").unwrap()
1010 });
1011 static RE_BARE_HTML_TAG_NAME: LazyLock<Regex> = LazyLock::new(|| {
1012 Regex::new(r"<(/?)([a-zA-Z][a-zA-Z0-9]*)>").unwrap()
1013 });
1014 result = RE_MD_LINK_WITH_TAG
1015 .replace_all(&result, |caps: ®ex::Captures| {
1016 let link_text = &caps[1];
1017 let url = &caps[2];
1018 let protected = RE_BARE_HTML_TAG_NAME.replace_all(link_text, "`<$1$2>`");
1019 format!("[{}]({})", protected, url)
1020 })
1021 .to_string();
1022
1023 static RE_INLINE_TAG_REF: LazyLock<Regex> = LazyLock::new(|| {
1029 Regex::new(r"([a-zA-Z.,;:!?\s`])(</?[a-zA-Z][a-zA-Z0-9]*>)([a-zA-Z.,;:!?\s`])").unwrap()
1030 });
1031 static FORMATTING_TAGS: &[&str] = &[
1033 "em", "strong", "b", "i", "u", "s", "code", "kbd", "samp", "var",
1034 "mark", "small", "sup", "sub", "abbr", "cite", "dfn", "time", "data",
1035 "del", "ins", "q",
1036 ];
1037 for _ in 0..2 {
1039 result = RE_INLINE_TAG_REF
1040 .replace_all(&result, |caps: ®ex::Captures| {
1041 let pre = &caps[1];
1042 let tag = &caps[2];
1043 let post = &caps[3];
1044 let tag_name = tag.trim_start_matches('<')
1046 .trim_start_matches('/')
1047 .trim_end_matches('>')
1048 .to_lowercase();
1049 if FORMATTING_TAGS.contains(&tag_name.as_str()) {
1050 format!("{}{}{}", pre, tag, post)
1052 } else {
1053 format!("{}`{}`{}", pre, tag, post)
1054 }
1055 })
1056 .to_string();
1057 }
1058
1059 static TAG_PATTERNS: LazyLock<Vec<(Regex, &str)>> = LazyLock::new(|| {
1061 vec![
1062 (Regex::new(r"</?div[^>]*?>").unwrap(), ""),
1063 (Regex::new(r"</?span[^>]*?>").unwrap(), ""),
1064 (Regex::new(r"</?p[^>]*?>").unwrap(), "\n"),
1065 (Regex::new(r"<br\s*/?>\s*").unwrap(), "\n"),
1066 (Regex::new(r"</?section[^>]*?>").unwrap(), ""),
1067 (Regex::new(r"</?article[^>]*?>").unwrap(), ""),
1068 (Regex::new(r"</?header[^>]*?>").unwrap(), ""),
1069 (Regex::new(r"</?footer[^>]*?>").unwrap(), ""),
1070 (Regex::new(r"</?nav[^>]*?>").unwrap(), ""),
1071 (Regex::new(r"</?aside[^>]*?>").unwrap(), ""),
1072 (Regex::new(r"</?main[^>]*?>").unwrap(), ""),
1073 (Regex::new(r"</?button[^>]*?>").unwrap(), ""),
1074 (Regex::new(r"</?form[^>]*?>").unwrap(), ""),
1075 (Regex::new(r"<input[^>]*?>").unwrap(), ""),
1076 (Regex::new(r"</?select[^>]*?>").unwrap(), ""),
1077 (Regex::new(r"</?option[^>]*?>").unwrap(), ""),
1078 (Regex::new(r"</?textarea[^>]*?>").unwrap(), ""),
1079 (Regex::new(r"</?label[^>]*?>").unwrap(), ""),
1080 (Regex::new(r"</?fieldset[^>]*?>").unwrap(), ""),
1081 (Regex::new(r"</?legend[^>]*?>").unwrap(), ""),
1082 (Regex::new(r"</?sup[^>]*?>").unwrap(), ""),
1083 (Regex::new(r"</?sub[^>]*?>").unwrap(), ""),
1084 (Regex::new(r"</?small[^>]*?>").unwrap(), ""),
1085 (Regex::new(r"</?mark[^>]*?>").unwrap(), ""),
1086 (Regex::new(r"<em[^>]*?>").unwrap(), "_"),
1087 (Regex::new(r"</em>").unwrap(), "_"),
1088 (Regex::new(r"<strong[^>]*?>").unwrap(), "**"),
1089 (Regex::new(r"</strong>").unwrap(), "**"),
1090 (Regex::new(r"<b[^>]*?>").unwrap(), "**"),
1091 (Regex::new(r"</b>").unwrap(), "**"),
1092 (Regex::new(r"<i[^>]*?>").unwrap(), "_"),
1093 (Regex::new(r"</i>").unwrap(), "_"),
1094 (Regex::new(r"</?u[^>]*?>").unwrap(), ""),
1095 (Regex::new(r"</?s(?:\s[^>]*?)?>").unwrap(), ""),
1096 (Regex::new(r"<code[^>]*?>").unwrap(), "`"),
1097 (Regex::new(r"</code>").unwrap(), "`"),
1098 (Regex::new(r"</?kbd[^>]*?>").unwrap(), ""),
1099 (Regex::new(r"</?samp[^>]*?>").unwrap(), ""),
1100 (Regex::new(r"</?var[^>]*?>").unwrap(), ""),
1101 (Regex::new(r"</?abbr[^>]*?>").unwrap(), ""),
1102 (Regex::new(r"</?cite[^>]*?>").unwrap(), ""),
1103 (Regex::new(r"</?dfn[^>]*?>").unwrap(), ""),
1104 (Regex::new(r"</?time[^>]*?>").unwrap(), ""),
1105 (Regex::new(r"</?data[^>]*?>").unwrap(), ""),
1106 (Regex::new(r"</?h[1-6][^>]*?>").unwrap(), ""),
1107 (Regex::new(r"</?ul[^>]*?>").unwrap(), "\n"),
1108 (Regex::new(r"</?ol[^>]*?>").unwrap(), "\n"),
1109 (Regex::new(r"<li[^>]*?>").unwrap(), "- "),
1110 (Regex::new(r"</li>").unwrap(), "\n"),
1111 (Regex::new(r"</?table[^>]*?>").unwrap(), "\n"),
1112 (Regex::new(r"</?thead[^>]*?>").unwrap(), ""),
1113 (Regex::new(r"</?tbody[^>]*?>").unwrap(), ""),
1114 (Regex::new(r"</?tfoot[^>]*?>").unwrap(), ""),
1115 (Regex::new(r"</?tr[^>]*?>").unwrap(), "\n"),
1116 (Regex::new(r"</?th[^>]*?>").unwrap(), " | "),
1117 (Regex::new(r"</?td[^>]*?>").unwrap(), " "),
1118 (Regex::new(r"</?caption[^>]*?>").unwrap(), "\n"),
1119 (Regex::new(r"</?colgroup[^>]*?>").unwrap(), ""),
1120 (Regex::new(r"</?col[^>]*?>").unwrap(), ""),
1121 (Regex::new(r"<!DOCTYPE[^>]*?>").unwrap(), ""),
1122 (Regex::new(r"</?meta[^>]*?>").unwrap(), ""),
1123 (Regex::new(r"</?link[^>]*?>").unwrap(), ""),
1124 (Regex::new(r"</?title[^>]*?>").unwrap(), ""),
1125 (Regex::new(r"</?base[^>]*?>").unwrap(), ""),
1126 (Regex::new(r"</?head[^>]*?>").unwrap(), ""),
1127 (Regex::new(r"</?body[^>]*?>").unwrap(), ""),
1128 (Regex::new(r"</?html[^>]*?>").unwrap(), ""),
1129 (Regex::new(r"</?blockquote[^>]*?>").unwrap(), "\n"),
1130 (Regex::new(r"</?pre[^>]*?>").unwrap(), "\n"),
1131 (Regex::new(r"<hr[^>]*?>").unwrap(), "\n---\n"),
1132 (Regex::new(r"</?dl[^>]*?>").unwrap(), "\n"),
1133 (Regex::new(r"</?dt[^>]*?>").unwrap(), "\n"),
1134 (Regex::new(r"</?dd[^>]*?>").unwrap(), " "),
1135 (Regex::new(r"</?picture[^>]*?>").unwrap(), ""),
1136 (Regex::new(r"</?video[^>]*?>").unwrap(), ""),
1137 (Regex::new(r"</?audio[^>]*?>").unwrap(), ""),
1138 (Regex::new(r"</?source[^>]*?>").unwrap(), ""),
1139 (Regex::new(r"</?track[^>]*?>").unwrap(), ""),
1140 (Regex::new(r"</?canvas[^>]*?>").unwrap(), ""),
1141 (Regex::new(r"</?figure[^>]*?>").unwrap(), ""),
1142 (Regex::new(r"</?figcaption[^>]*?>").unwrap(), ""),
1143 (Regex::new(r"</?details[^>]*?>").unwrap(), ""),
1144 (Regex::new(r"</?summary[^>]*?>").unwrap(), ""),
1145 (Regex::new(r"</?dialog[^>]*?>").unwrap(), ""),
1146 (Regex::new(r"(?is)<script[^>]*?>.*?</script>").unwrap(), ""),
1147 (Regex::new(r"(?is)<style[^>]*?>.*?</style>").unwrap(), ""),
1148 (Regex::new(r"(?is)<noscript[^>]*?>.*?</noscript>").unwrap(), ""),
1149 (Regex::new(r"(?is)<!--.*?-->").unwrap(), ""),
1150 (Regex::new(r"(?is)<!\[CDATA\[.*?\]\]>").unwrap(), ""),
1151 (Regex::new(r"(?is)<\?xml[^>]*?\?>").unwrap(), ""),
1152 (Regex::new(r"</?address[^>]*?>").unwrap(), ""),
1153 (Regex::new(r"</?ins[^>]*?>").unwrap(), ""),
1154 (Regex::new(r"</?del[^>]*?>").unwrap(), ""),
1155 (Regex::new(r"</?q[^>]*?>").unwrap(), ""),
1156 (Regex::new(r"</?wbr[^>]*?/?>").unwrap(), ""),
1157 (Regex::new(r"</?ruby[^>]*?>").unwrap(), ""),
1158 (Regex::new(r"</?rt[^>]*?>").unwrap(), ""),
1159 (Regex::new(r"</?rp[^>]*?>").unwrap(), ""),
1160 (Regex::new(r"</?bdi[^>]*?>").unwrap(), ""),
1161 (Regex::new(r"</?bdo[^>]*?>").unwrap(), ""),
1162 (Regex::new(r"(?is)<iframe[^>]*?>.*?</iframe>").unwrap(), ""),
1163 (Regex::new(r"<iframe[^>]*?/?>").unwrap(), ""),
1164 (Regex::new(r"(?is)<object[^>]*?>.*?</object>").unwrap(), ""),
1165 (Regex::new(r"<embed[^>]*?/?>").unwrap(), ""),
1166 (Regex::new(r"</?param[^>]*?>").unwrap(), ""),
1167 (Regex::new(r"(?is)<template[^>]*?>.*?</template>").unwrap(), ""),
1168 (Regex::new(r"</?slot[^>]*?>").unwrap(), ""),
1169 ]
1170 });
1171
1172 for (regex, replacement) in TAG_PATTERNS.iter() {
1173 result = regex.replace_all(&result, *replacement).to_string();
1174 }
1175
1176 static RE_BARE_ELEMENT: LazyLock<Regex> = LazyLock::new(|| {
1179 Regex::new(r"(</?[a-zA-Z][a-zA-Z0-9]*>)").unwrap()
1180 });
1181 result = RE_BARE_ELEMENT.replace_all(&result, "`$1`").to_string();
1182
1183 result = RE_CATCHALL_TAG.replace_all(&result, "").to_string();
1185 result = RE_MULTI_NEWLINE.replace_all(&result, "\n\n").to_string();
1186
1187 result = decode_html_entities(&result);
1188
1189 result = RE_CATCHALL_TAG.replace_all(&result, "").to_string();
1191
1192 for (i, span) in inline_code_spans.iter().enumerate() {
1194 let placeholder = format!("\x00INLINE_CODE_{}\x00", i);
1195 result = result.replace(&placeholder, span);
1196 }
1197
1198 for (i, block) in code_blocks.iter().enumerate() {
1200 let placeholder = format!("\x00CODE_FENCE_{}\x00", i);
1201 result = result.replace(&placeholder, block);
1202 }
1203
1204 result
1205}
1206
/// Flattens line breaks that occur inside square brackets.
///
/// Walks `markdown` character by character while counting unclosed `[`
/// brackets. Any `\n` seen while at least one bracket is open is emitted as a
/// single space, so bracketed link text ends up on one line. Every other
/// character (including a `]` with no matching `[`) is copied through
/// unchanged.
fn escape_multiline_links(markdown: &str) -> String {
    // Number of `[` seen so far that have not yet been closed by a `]`.
    let mut open_brackets: i32 = 0;

    let mut flattened = String::with_capacity(markdown.len());
    flattened.extend(markdown.chars().map(|c| {
        if c == '[' {
            open_brackets += 1;
        } else if c == ']' && open_brackets > 0 {
            open_brackets -= 1;
        } else if c == '\n' && open_brackets > 0 {
            // Still inside bracketed text: swap the line break for a space.
            return ' ';
        }
        c
    }));
    flattened
}
1239
1240fn remove_accessibility_links(markdown: &str) -> String {
1243 static SKIP_LINKS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
1244 vec![
1245 Regex::new(r"(?mi)^\s*\[Skip to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*").unwrap(),
1246 Regex::new(r"(?mi)^\s*\[Jump to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*").unwrap(),
1247 Regex::new(r"(?mi)^\s*\[Go to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*").unwrap(),
1248 Regex::new(r"(?mi)^\s*\[Skip (navigation|nav|to main content|to content)\]\([^)]*\)\s*").unwrap(),
1249 Regex::new(r"(?mi)^\s*\[Back to (Top|Main|Content)\]\([^)]*\)\s*").unwrap(),
1250 ]
1251 });
1252 static SCREEN_READER: LazyLock<Regex> = LazyLock::new(|| {
1253 Regex::new(r"(?mi)^\s*\[Screen reader only:?[^\]]*\]\([^)]*\)\s*").unwrap()
1254 });
1255
1256 let mut result = markdown.to_string();
1257 let mut changed = true;
1258 while changed {
1259 changed = false;
1260 for regex in SKIP_LINKS.iter() {
1261 let new_result = regex.replace_all(&result, "").to_string();
1262 if new_result != result {
1263 changed = true;
1264 result = new_result;
1265 }
1266 }
1267 }
1268
1269 SCREEN_READER.replace_all(&result, "").to_string()
1270}
1271
1272fn convert_urls_to_absolute(markdown: &str, base_url: &str) -> Result<String> {
1281 use crate::error::ScrapeError;
1282
1283 let base = Url::parse(base_url)
1284 .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", e)))?;
1285
1286 static RE_IMG_URL: LazyLock<Regex> =
1287 LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
1288 let img_regex = &*RE_IMG_URL;
1289 let mut result = img_regex
1290 .replace_all(markdown, |caps: ®ex::Captures| {
1291 let alt = &caps[1];
1292 let url = &caps[2];
1293
1294 if url.starts_with("http://")
1296 || url.starts_with("https://")
1297 || url.starts_with("data:")
1298 {
1299 return caps[0].to_string();
1300 }
1301
1302 if url.starts_with("//") {
1304 return format!("", alt, url);
1305 }
1306
1307 match base.join(url) {
1309 Ok(absolute) => format!("", alt, absolute),
1310 Err(_) => caps[0].to_string(), }
1312 })
1313 .to_string();
1314
1315 static RE_LINK_URL: LazyLock<Regex> =
1316 LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
1317 let link_regex = &*RE_LINK_URL;
1318 result = link_regex
1319 .replace_all(&result, |caps: ®ex::Captures| {
1320 let text = &caps[1];
1321 let url = &caps[2];
1322
1323 if url.starts_with("http://") || url.starts_with("https://") || url.starts_with("#") {
1325 return caps[0].to_string();
1326 }
1327
1328 if url.starts_with("//") {
1330 return format!("[{}](https:{})", text, url);
1331 }
1332
1333 match base.join(url) {
1335 Ok(absolute) => format!("[{}]({})", text, absolute),
1336 Err(_) => caps[0].to_string(),
1337 }
1338 })
1339 .to_string();
1340
1341 Ok(result)
1342}
1343
1344#[cfg(test)]
1345mod tests {
1346 use super::*;
1347
1348 #[test]
1349 fn test_html_to_markdown_simple() {
1350 let html = "<h1>Hello</h1><p>World</p>";
1351 let result = html_to_markdown(html, "https://example.com", false);
1352 assert!(result.is_ok());
1353 let md = result.unwrap();
1354 assert!(md.contains("Hello"));
1355 assert!(md.contains("World"));
1356 }
1357
1358 #[test]
1359 fn test_extract_main_content() {
1360 let html = r#"
1361 <html>
1362 <body>
1363 <nav>Navigation</nav>
1364 <main>
1365 <h1>Main Content</h1>
1366 <p>This is the main content.</p>
1367 </main>
1368 <footer>Footer</footer>
1369 </body>
1370 </html>
1371 "#;
1372 let result = extract_main_content_html(html);
1373 assert!(result.is_ok());
1374 let content = result.unwrap();
1375 assert!(content.contains("Main Content"));
1376 assert!(!content.contains("Navigation"));
1377 }
1378
1379 #[test]
1380 fn test_clean_markdown() {
1381 let markdown = "# Hello\n\n\n\n\nWorld\n\n\n";
1382 let cleaned = clean_markdown(markdown);
1383 assert_eq!(cleaned, "# Hello\n\nWorld");
1385 }
1386
1387 #[test]
1388 fn test_clean_html_from_markdown_images() {
1389 let input =
1391 r#"Some text <img src="https://example.com/logo.png" alt="Company Logo"> more text"#;
1392 let result = clean_html_from_markdown(input);
1393 assert!(result.contains(""));
1394 assert!(!result.contains("<img"));
1395
1396 let input = r#"<img alt="Logo" src="logo.png">"#;
1398 let result = clean_html_from_markdown(input);
1399 assert!(result.contains(""));
1400
1401 let input = r#"<img src="image.jpg">"#;
1403 let result = clean_html_from_markdown(input);
1404 assert!(result.contains(""));
1405 }
1406
1407 #[test]
1408 fn test_clean_html_from_markdown_images_with_attributes() {
1409 let input = r#"<img src="/path/image.jpg" alt="Local Image" title="A title" width="300" height="200">"#;
1411 let result = clean_html_from_markdown(input);
1412 assert!(result.contains(""));
1413 assert!(!result.contains("width"));
1414 assert!(!result.contains("height"));
1415 assert!(!result.contains("title"));
1416 }
1417
1418 #[test]
1419 fn test_clean_html_from_markdown_multiple_images() {
1420 let input = r#"
1421 <h1>Gallery</h1>
1422 <img src="photo1.jpg" alt="Photo One">
1423 <img src="photo2.jpg" alt="Photo Two">
1424 <img src="photo3.jpg">
1425 "#;
1426 let result = clean_html_from_markdown(input);
1427 assert!(result.contains(""));
1428 assert!(result.contains(""));
1429 assert!(result.contains(""));
1430 }
1431
1432 #[test]
1433 fn test_clean_html_from_markdown_remove_tags() {
1434 let input = r#"<div class="container"><span>Hello</span> <p>World</p></div>"#;
1436 let result = clean_html_from_markdown(input);
1437 assert!(!result.contains("<div"));
1438 assert!(!result.contains("<span"));
1439 assert!(!result.contains("<p>"));
1440 assert!(result.contains("Hello"));
1441 assert!(result.contains("World"));
1442 }
1443
1444 #[test]
1445 fn test_clean_html_from_markdown_br_tags() {
1446 let input = "Line 1<br>Line 2<br />Line 3";
1447 let result = clean_html_from_markdown(input);
1448 assert!(!result.contains("<br"));
1449 assert!(result.contains("Line 1"));
1450 assert!(result.contains("Line 2"));
1451 assert!(result.contains("Line 3"));
1452 }
1453
1454 #[test]
1455 fn test_clean_html_from_markdown_form_elements() {
1456 let input = r#"<form><input type="text" name="email"><button>Submit</button></form>"#;
1457 let result = clean_html_from_markdown(input);
1458 assert!(!result.contains("<form"));
1459 assert!(!result.contains("<input"));
1460 assert!(!result.contains("<button"));
1461 }
1462
1463 #[test]
1464 fn test_clean_html_from_markdown_removes_multiline_script_blocks() {
1465 let input = r#"
1466 Before
1467 <script>
1468 var d = data[i].join(" ");
1469 console.log("template");
1470 </script>
1471 After
1472 "#;
1473 let result = clean_html_from_markdown(input);
1474
1475 assert!(result.contains("Before"));
1476 assert!(result.contains("After"));
1477 assert!(!result.contains("var d = data"));
1478 assert!(!result.contains("console.log"));
1479 assert!(!result.contains("<script"));
1480 }
1481
1482 #[test]
1483 fn test_clean_html_from_markdown_removes_multiline_noscript_and_comments() {
1484 let input = r#"
1485 Keep this
1486 <!--
1487 multi-line comment
1488 -->
1489 <noscript>
1490 fallback
1491 content
1492 </noscript>
1493 <![CDATA[
1494 hidden payload
1495 ]]>
1496 Done
1497 "#;
1498 let result = clean_html_from_markdown(input);
1499
1500 assert!(result.contains("Keep this"));
1501 assert!(result.contains("Done"));
1502 assert!(!result.contains("multi-line comment"));
1503 assert!(!result.contains("fallback"));
1504 assert!(!result.contains("hidden payload"));
1505 }
1506
1507 #[test]
1508 fn test_strip_non_content_tags_removes_scripts() {
1509 let html = r#"<html><body>
1510 <p>Real content</p>
1511 <script>var x = "malicious"; console.log(x);</script>
1512 <p>More content</p>
1513 </body></html>"#;
1514 let result = strip_non_content_tags(html);
1515 assert!(result.contains("Real content"));
1516 assert!(result.contains("More content"));
1517 assert!(!result.contains("malicious"));
1518 assert!(!result.contains("console.log"));
1519 }
1520
1521 #[test]
1522 fn test_strip_non_content_tags_removes_style_and_svg() {
1523 let html = r#"<html><body>
1524 <p>Content</p>
1525 <style>.foo { color: red; } :root { --bg: #000; }</style>
1526 <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><circle cx="50" cy="50" r="40"/></svg>
1527 <p>End</p>
1528 </body></html>"#;
1529 let result = strip_non_content_tags(html);
1530 assert!(result.contains("Content"));
1531 assert!(result.contains("End"));
1532 assert!(!result.contains("color: red"));
1533 assert!(!result.contains("circle"));
1534 assert!(!result.contains("<svg"));
1535 }
1536
1537 #[test]
1538 fn test_strip_non_content_tags_removes_head() {
1539 let html = r#"<html>
1540 <head><title>Page</title><meta charset="utf-8"><link rel="stylesheet" href="x.css"></head>
1541 <body><p>Body content</p></body>
1542 </html>"#;
1543 let result = strip_non_content_tags(html);
1544 assert!(result.contains("Body content"));
1545 assert!(!result.contains("x.css"));
1546 }
1547
1548 #[test]
1549 fn test_strip_non_content_tags_removes_html_comments() {
1550 let html = r#"<p>Before</p>
1551 <!-- This is a long multi-line
1552 HTML comment that should be removed -->
1553 <p>After</p>"#;
1554 let result = strip_non_content_tags(html);
1555 assert!(result.contains("Before"));
1556 assert!(result.contains("After"));
1557 assert!(!result.contains("long multi-line"));
1558 }
1559
1560 #[test]
1561 fn test_full_pipeline_strips_inline_javascript() {
1562 let html = r#"<html>
1565 <head><script>var tracking = "analytics_data"; function init(){}</script></head>
1566 <body>
1567 <script>$ssgST=new Date().getTime(); var config = {key: "val"};</script>
1568 <h1>Product Listing</h1>
1569 <p>Buy the best laptops here.</p>
1570 <script type="application/json">{"@context":"schema.org"}</script>
1571 </body></html>"#;
1572 let result = html_to_markdown(html, "https://example.com", false).unwrap();
1573 assert!(result.contains("Product Listing"));
1574 assert!(result.contains("Buy the best laptops"));
1575 assert!(!result.contains("$ssgST"), "Inline JS should be stripped before markdown conversion");
1576 assert!(!result.contains("analytics_data"), "Head scripts should be stripped");
1577 assert!(!result.contains("function init"), "Script functions should not appear in output");
1578 }
1579
1580 #[test]
1581 fn test_full_pipeline_preserves_content_around_scripts() {
1582 let html = r#"<html><body>
1583 <h1>Title</h1>
1584 <script>alert('bad');</script>
1585 <p>Paragraph one.</p>
1586 <style>body { margin: 0; }</style>
1587 <p>Paragraph two.</p>
1588 <noscript><p>Please enable JavaScript</p></noscript>
1589 <p>Final paragraph.</p>
1590 </body></html>"#;
1591 let result = html_to_markdown(html, "https://example.com", false).unwrap();
1592 assert!(result.contains("Title"));
1593 assert!(result.contains("Paragraph one"));
1594 assert!(result.contains("Paragraph two"));
1595 assert!(result.contains("Final paragraph"));
1596 assert!(!result.contains("alert"));
1597 assert!(!result.contains("margin: 0"));
1598 assert!(!result.contains("enable JavaScript"));
1599 }
1600
1601 #[test]
1602 fn test_html_to_markdown_with_images() {
1603 let html = r#"
1604 <html>
1605 <body>
1606 <h1>Test Page</h1>
1607 <p>Welcome to the test page.</p>
1608 <img src="logo.png" alt="Site Logo">
1609 <p>More content here.</p>
1610 </body>
1611 </html>
1612 "#;
1613 let result = html_to_markdown(html, "https://example.com", false);
1614 assert!(result.is_ok());
1615 let md = result.unwrap();
1616
1617 assert!(md.contains(""));
1619
1620 assert!(!md.contains("<img"));
1622
1623 assert!(!md.contains(""));
1625
1626 assert!(md.contains("Test Page"));
1628 assert!(md.contains("Welcome"));
1629 }
1630
1631 #[test]
1632 fn test_token_reduction_with_image_cleaning() {
1633 let html_version =
1635 r#"<img src="https://example.com/very-long-url-path/image.png" alt="Description">"#;
1636 let cleaned = clean_html_from_markdown(html_version);
1637
1638 assert!(
1640 cleaned.contains("")
1641 );
1642 assert!(!cleaned.contains("<img"));
1643
1644 assert!(!cleaned.contains("src="));
1646 assert!(!cleaned.contains("alt="));
1647 }
1648
1649 #[test]
1651 fn test_github_content_extraction() {
1652 let github_html = r#"
1653 <html>
1654 <body>
1655 <div class="Layout-sidebar">
1656 <div class="file-navigation">File Tree Noise</div>
1657 <div class="contributors-wrapper">Contributors Widget</div>
1658 </div>
1659 <div id="readme">
1660 <h1>Project README</h1>
1661 <p>This is the actual content we want.</p>
1662 </div>
1663 <div class="BorderGrid">
1664 <div>Sidebar noise</div>
1665 </div>
1666 </body>
1667 </html>
1668 "#;
1669
1670 let result = extract_main_content_html(github_html).unwrap();
1671
1672 assert!(result.contains("Project README"));
1674 assert!(result.contains("actual content we want"));
1675
1676 assert!(!result.contains("File Tree Noise"));
1678 assert!(!result.contains("Contributors Widget"));
1679 assert!(!result.contains("Sidebar noise"));
1680 }
1681
1682 #[test]
1683 fn test_github_markdown_body_extraction() {
1684 let github_html = r#"
1685 <html>
1686 <body>
1687 <nav>Navigation Bar</nav>
1688 <div class="markdown-body">
1689 <h1>Documentation</h1>
1690 <p>Main documentation content here.</p>
1691 </div>
1692 <aside class="Layout-sidebar">Sidebar content</aside>
1693 </body>
1694 </html>
1695 "#;
1696
1697 let result = extract_main_content_html(github_html).unwrap();
1698
1699 assert!(result.contains("Documentation"));
1701 assert!(result.contains("Main documentation content"));
1702
1703 assert!(!result.contains("Navigation Bar"));
1705 assert!(!result.contains("Sidebar content"));
1706 }
1707
1708 #[test]
1710 fn test_html_entity_decoding() {
1711 let text_with_entities =
1712 "Copyright © 2024 & Company™. Click "here" for more info.";
1713 let decoded = decode_html_entities(text_with_entities);
1714
1715 assert_eq!(
1716 decoded,
1717 "Copyright © 2024 & Company™. Click \"here\" for more info."
1718 );
1719 assert!(!decoded.contains("&"));
1720 assert!(!decoded.contains("©"));
1721 assert!(!decoded.contains("™"));
1722 assert!(!decoded.contains("""));
1723 }
1724
1725 #[test]
1726 fn test_html_entity_in_urls() {
1727 let html = "Text with & entity "quoted" content";
1729 let cleaned = clean_html_from_markdown(html);
1730
1731 assert!(cleaned.contains("Text with & entity"));
1733 assert!(cleaned.contains("\"quoted\""));
1734 assert!(!cleaned.contains("&"));
1735 assert!(!cleaned.contains("""));
1736 }
1737
1738 #[test]
1739 fn test_html_entity_common_cases() {
1740 let input = "Less than < greater than > and nbsp space";
1741 let decoded = decode_html_entities(input);
1742
1743 assert_eq!(decoded, "Less than < greater than > and nbsp space");
1744 }
1745
1746 #[test]
1748 fn test_strip_invisible_unicode() {
1749 let text_with_zwsp = "Hello\u{200B}World";
1751 let cleaned = strip_invisible_unicode(text_with_zwsp);
1752 assert_eq!(cleaned, "HelloWorld");
1753
1754 let text_with_bom = "\u{FEFF}Content";
1756 let cleaned = strip_invisible_unicode(text_with_bom);
1757 assert_eq!(cleaned, "Content");
1758
1759 let text_with_multiple = "A\u{200B}\u{200C}\u{200D}B\u{2060}C";
1761 let cleaned = strip_invisible_unicode(text_with_multiple);
1762 assert_eq!(cleaned, "ABC");
1763 }
1764
1765 #[test]
1766 fn test_invisible_unicode_in_anchor_links() {
1767 let markdown = "[\u{200B}\u{200B}\n\n](#heading)";
1769 let cleaned = clean_markdown(markdown);
1770
1771 assert!(!cleaned.contains('\u{200B}'));
1773 assert!(!cleaned.contains("\n\n\n"));
1774 }
1775
1776 #[test]
1777 fn test_full_pipeline_with_all_fixes() {
1778 let html = r#"
1780 <html>
1781 <body>
1782 <div class="Layout-sidebar">Sidebar noise</div>
1783 <div id="readme">
1784 <h1>Test & Demo</h1>
1785 <p>Content with entities here "quoted".</p>
1786 <a href="page?a=1&b=2">Link</a>
1787 <p>Invisible\u{200B}chars\u{200C}removed</p>
1788 </div>
1789 </body>
1790 </html>
1791 "#;
1792
1793 let result = html_to_markdown(html, "https://example.com", true).unwrap();
1794
1795 assert!(result.contains("Test & Demo"));
1797 assert!(!result.contains("Sidebar noise"));
1798
1799 assert!(result.contains("&"));
1801 assert!(result.contains("\"quoted\""));
1802 assert!(result.contains("a=1&b=2"));
1803 assert!(!result.contains("&"));
1804 assert!(!result.contains("""));
1805 assert!(!result.contains(" "));
1806
1807 assert!(!result.contains('\u{200B}'));
1809 assert!(!result.contains('\u{200C}'));
1810 }
1811
1812 #[test]
1813 fn test_complex_image_tag_with_attributes_before_src() {
1814 let input = r#"<img width="50" height="50" src="https://example.com/logo.png" class="thumbnail" alt="Logo" decoding="async" />"#;
1816 let result = clean_html_from_markdown(input);
1817
1818 assert!(result.contains(""));
1820 assert!(!result.contains("<img"));
1821 assert!(!result.contains("width="));
1822 assert!(!result.contains("class="));
1823 }
1824
1825 #[test]
1826 fn test_doctype_and_document_declarations() {
1827 let input = r#"
1830Example API response:
1831```json
1832{
1833 "html": "<!DOCTYPE html><body class=\"main\">content</body>",
1834 "data": "<![CDATA[some data]]>"
1835}
1836```
1837
1838Also test standalone: <!DOCTYPE html> and <?xml version="1.0"?>
1839 "#;
1840 let result = clean_html_from_markdown(input);
1841
1842 assert!(!result.contains("Also test standalone: <!DOCTYPE html>"), "Standalone DOCTYPE should be removed");
1844 assert!(!result.contains("<?xml"), "Standalone XML declaration should be removed");
1845
1846 assert!(result.contains("<!DOCTYPE html>"), "DOCTYPE inside code fence should be preserved");
1848 assert!(result.contains("<![CDATA["), "CDATA inside code fence should be preserved");
1849
1850 assert!(result.contains("Example API response"));
1852 }
1853
1854 #[test]
1855 fn test_picture_and_svg_elements() {
1856 let input = r#"
1858 <picture>
1859 <source srcset="image.webp" type="image/webp">
1860 <img src="image.png" alt="Test">
1861 </picture>
1862 <svg><path d="M10 10"/><circle cx="5" cy="5" r="3"/></svg>
1863 "#;
1864 let result = clean_html_from_markdown(input);
1865
1866 assert!(!result.contains("<picture"));
1868 assert!(!result.contains("<source"));
1869 assert!(!result.contains("<svg"));
1870 assert!(!result.contains("<path"));
1871 assert!(!result.contains("<circle"));
1872 assert!(result.contains(""));
1873 }
1874
1875 #[test]
1876 fn test_multiple_complex_images() {
1877 let input = r#"
1879 <img width="100" src="image1.jpg" alt="First">
1880 <img alt="Second" height="50" src="image2.png" class="thumb">
1881 <img src="image3.gif">
1882 "#;
1883 let result = clean_html_from_markdown(input);
1884
1885 assert!(result.contains(""));
1886 assert!(result.contains(""));
1887 assert!(result.contains(""));
1888 assert!(!result.contains("<img"));
1889 }
1890
1891 #[test]
1892 fn test_apple_footnote_cleaning() {
1893 let html = "iPhone 17<sup class=\"footnote\"><a aria-label=\"footnote 1\" href=\"#footnote-1\">1</a></sup> features";
1895 let result = clean_html_from_markdown(html);
1896
1897 assert!(!result.contains("<sup"));
1899 assert!(!result.contains("<a"));
1900 assert!(!result.contains("</a>"));
1901 assert!(!result.contains("</sup>"));
1902
1903 assert!(result.contains("iPhone 17"));
1905 assert!(result.contains("features"));
1906 }
1907
1908 #[test]
1909 fn test_semantic_html_tag_conversion() {
1910 let html = r#"<strong>Bold</strong> <em>italic</em> <mark>highlight</mark> <code>code</code> text"#;
1912 let result = clean_html_from_markdown(html);
1913
1914 assert!(!result.contains("<strong"));
1915 assert!(!result.contains("<em"));
1916 assert!(!result.contains("<mark"));
1917 assert!(!result.contains("<code"));
1918 assert!(result.contains("**Bold**"), "Expected **Bold**, got: {}", result);
1919 assert!(result.contains("_italic_"), "Expected _italic_, got: {}", result);
1920 assert!(result.contains("highlight"));
1921 assert!(result.contains("`code`"), "Expected `code`, got: {}", result);
1922 }
1923
1924 #[test]
1926 fn test_heading_tag_removal() {
1927 let html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3><h4>Subsection</h4><h5>Minor</h5><h6>Smallest</h6>";
1928 let result = clean_html_from_markdown(html);
1929 assert!(!result.contains("<h1"));
1930 assert!(!result.contains("<h2"));
1931 assert!(!result.contains("<h3"));
1932 assert!(!result.contains("<h4"));
1933 assert!(!result.contains("<h5"));
1934 assert!(!result.contains("<h6"));
1935 assert!(result.contains("Title"));
1936 assert!(result.contains("Subtitle"));
1937 assert!(result.contains("Section"));
1938 }
1939
1940 #[test]
1941 fn test_list_tag_removal() {
1942 let html = "<ul><li>Item 1</li><li>Item 2</li></ul><ol><li>First</li><li>Second</li></ol>";
1943 let result = clean_html_from_markdown(html);
1944 assert!(!result.contains("<ul"));
1945 assert!(!result.contains("<ol"));
1946 assert!(!result.contains("<li"));
1947 assert!(result.contains("Item 1"));
1948 assert!(result.contains("Item 2"));
1949 assert!(result.contains("First"));
1950 assert!(result.contains("Second"));
1951 }
1952
1953 #[test]
1954 fn test_table_tag_removal() {
1955 let html = "<table><thead><tr><th>Header 1</th><th>Header 2</th></tr></thead><tbody><tr><td>Cell 1</td><td>Cell 2</td></tr></tbody></table>";
1956 let result = clean_html_from_markdown(html);
1957 assert!(!result.contains("<table"));
1958 assert!(!result.contains("<thead"));
1959 assert!(!result.contains("<tbody"));
1960 assert!(!result.contains("<tr"));
1961 assert!(!result.contains("<th"));
1962 assert!(!result.contains("<td"));
1963 assert!(result.contains("Header 1"));
1964 assert!(result.contains("Header 2"));
1965 assert!(result.contains("Cell 1"));
1966 assert!(result.contains("Cell 2"));
1967 }
1968
1969 #[test]
1970 fn test_metadata_tag_removal() {
1971 let html = r#"<head><meta charset="utf-8"><link rel="stylesheet" href="style.css"><title>Page Title</title></head><body>Content</body>"#;
1972 let result = clean_html_from_markdown(html);
1973 assert!(!result.contains("<head"));
1974 assert!(!result.contains("<meta"));
1975 assert!(!result.contains("<link"));
1976 assert!(!result.contains("<title"));
1977 assert!(!result.contains("<body"));
1978 assert!(!result.contains("<html"));
1979 assert!(result.contains("Content"));
1980 }
1981
1982 #[test]
1983 fn test_semantic_block_tag_removal() {
1984 let html = r#"<blockquote>Quote</blockquote><pre>Code block</pre><hr>After line"#;
1985 let result = clean_html_from_markdown(html);
1986 assert!(!result.contains("<blockquote"));
1987 assert!(!result.contains("<pre"));
1988 assert!(!result.contains("<hr"));
1989 assert!(result.contains("Quote"));
1990 assert!(result.contains("Code block"));
1991 assert!(result.contains("After line"));
1992 }
1993
1994 #[test]
1995 fn test_definition_list_tag_removal() {
1996 let html =
1997 "<dl><dt>Term 1</dt><dd>Definition 1</dd><dt>Term 2</dt><dd>Definition 2</dd></dl>";
1998 let result = clean_html_from_markdown(html);
1999 assert!(!result.contains("<dl"));
2000 assert!(!result.contains("<dt"));
2001 assert!(!result.contains("<dd"));
2002 assert!(result.contains("Term 1"));
2003 assert!(result.contains("Definition 1"));
2004 assert!(result.contains("Term 2"));
2005 }
2006
2007 #[test]
2008 fn test_media_tag_removal() {
2009 let html = r#"<video src="video.mp4"></video><audio src="audio.mp3"></audio><canvas></canvas><svg><path d="M0,0"/></svg>"#;
2010 let result = clean_html_from_markdown(html);
2011 assert!(!result.contains("<video"));
2012 assert!(!result.contains("<audio"));
2013 assert!(!result.contains("<canvas"));
2014 assert!(!result.contains("<svg"));
2015 assert!(!result.contains("<path"));
2016 }
2017
2018 #[test]
2019 fn test_container_tag_removal() {
2020 let html = r#"<figure><figcaption>Caption</figcaption><img src="img.jpg"></figure><details><summary>Summary</summary>Content</details>"#;
2021 let result = clean_html_from_markdown(html);
2022 assert!(!result.contains("<figure"));
2023 assert!(!result.contains("<figcaption"));
2024 assert!(!result.contains("<details"));
2025 assert!(!result.contains("<summary"));
2026 assert!(result.contains("Caption"));
2027 assert!(result.contains("Summary"));
2028 assert!(result.contains("Content"));
2029 }
2030
2031 #[test]
2032 fn test_html_comment_removal() {
2033 let html = r#"<!-- This is a comment -->Content<!-- Another comment -->"#;
2034 let result = clean_html_from_markdown(html);
2035 assert!(!result.contains("<!--"));
2036 assert!(!result.contains("-->"));
2037 assert!(result.contains("Content"));
2038 assert!(!result.contains("This is a comment"));
2039 assert!(!result.contains("Another comment"));
2040 }
2041
2042 #[test]
2043 fn test_comprehensive_tag_cleanup() {
2044 let html = r#"
2046 <html>
2047 <head><title>Test</title><meta charset="utf-8"></head>
2048 <body>
2049 <!-- Comment -->
2050 <h1>Heading</h1>
2051 <ul><li>List item</li></ul>
2052 <table><tr><td>Table cell</td></tr></table>
2053 <video src="v.mp4"></video>
2054 <figure><figcaption>Fig</figcaption></figure>
2055 </body>
2056 </html>
2057 "#;
2058 let result = clean_html_from_markdown(html);
2059
2060 assert!(!result.contains("<html"));
2062 assert!(!result.contains("<head"));
2063 assert!(!result.contains("<title"));
2064 assert!(!result.contains("<meta"));
2065 assert!(!result.contains("<body"));
2066 assert!(!result.contains("<h1"));
2067 assert!(!result.contains("<ul"));
2068 assert!(!result.contains("<li"));
2069 assert!(!result.contains("<table"));
2070 assert!(!result.contains("<tr"));
2071 assert!(!result.contains("<td"));
2072 assert!(!result.contains("<video"));
2073 assert!(!result.contains("<figure"));
2074 assert!(!result.contains("<figcaption"));
2075 assert!(!result.contains("<!--"));
2076
2077 assert!(result.contains("Heading"));
2079 assert!(result.contains("List item"));
2080 assert!(result.contains("Table cell"));
2081 assert!(result.contains("Fig"));
2082 }
2083
2084 #[test]
2085 fn test_github_token_reduction() {
2086 let github_with_noise = r#"
2088 <html>
2089 <body>
2090 <div class="file-navigation">
2091 <div>src/</div><div>lib/</div><div>tests/</div><div>docs/</div>
2092 <div>Very long file tree that goes on and on...</div>
2093 </div>
2094 <div class="Layout-sidebar">
2095 <div class="contributors-wrapper">
2096 <img src="avatar1.png"><img src="avatar2.png">
2097 <div>Contributor 1</div><div>Contributor 2</div>
2098 </div>
2099 </div>
2100 <div id="readme">
2101 <h1>Project</h1>
2102 <p>Short README content.</p>
2103 </div>
2104 </body>
2105 </html>
2106 "#;
2107
2108 let extracted = extract_main_content_html(github_with_noise).unwrap();
2109
2110 assert!(extracted.len() < github_with_noise.len() / 2);
2112
2113 assert!(extracted.contains("Project"));
2115 assert!(extracted.contains("Short README"));
2116
2117 assert!(!extracted.contains("file-navigation"));
2119 assert!(!extracted.contains("contributors-wrapper"));
2120 assert!(!extracted.contains("Contributor 1"));
2121 }
2122
2123 #[test]
2125 fn test_strip_layout_tables_hacker_news_pattern() {
2126 let hn_html = r#"
2128 <table border="0" cellpadding="0" cellspacing="0">
2129 <tr>
2130 <td>
2131 <table border="0">
2132 <tr><td>Story 1</td></tr>
2133 <tr><td>Story 2</td></tr>
2134 </table>
2135 </td>
2136 </tr>
2137 </table>
2138 "#;
2139
2140 let result = strip_layout_tables(hn_html);
2141
2142 assert!(!result.contains("<table"));
2144 assert!(!result.contains("cellpadding"));
2145
2146 assert!(result.contains("Story 1"));
2148 assert!(result.contains("Story 2"));
2149 }
2150
2151 #[test]
2152 fn test_strip_layout_tables_preserves_data_tables() {
2153 let data_table_html = r#"
2155 <table>
2156 <tr><th>Name</th><th>Value</th></tr>
2157 <tr><td>Item 1</td><td>100</td></tr>
2158 <tr><td>Item 2</td><td>200</td></tr>
2159 </table>
2160 "#;
2161
2162 let result = strip_layout_tables(data_table_html);
2163
2164 assert!(result.contains("<table"));
2166 assert!(result.contains("<th>"));
2167
2168 assert!(result.contains("Name"));
2170 assert!(result.contains("Value"));
2171 assert!(result.contains("Item 1"));
2172 }
2173
2174 #[test]
2175 fn test_layout_table_with_cellpadding_stripped() {
2176 let layout_html =
2177 r#"<table cellpadding="5" cellspacing="0"><tr><td>Content</td></tr></table>"#;
2178 let result = strip_layout_tables(layout_html);
2179
2180 assert!(!result.contains("<table"));
2181 assert!(result.contains("Content"));
2182 }
2183
2184 #[test]
2185 fn test_simple_table_without_headers_stripped() {
2186 let layout_html =
2188 r#"<table border="0"><tr><td>Nav Item 1</td><td>Nav Item 2</td></tr></table>"#;
2189 let result = strip_layout_tables(layout_html);
2190
2191 assert!(!result.contains("<table"));
2192 assert!(result.contains("Nav Item 1"));
2193 assert!(result.contains("Nav Item 2"));
2194 }
2195
2196 #[test]
2197 fn test_hacker_news_markdown_bloat_fix() {
2198 let hn_html = r#"
2200 <html>
2201 <body>
2202 <table border="0" cellpadding="0" cellspacing="0" width="85%">
2203 <tr>
2204 <td>
2205 <table border="0">
2206 <tr><td class="title">Article Title 1</td></tr>
2207 <tr><td class="subtext">100 points by user1</td></tr>
2208 </table>
2209 </td>
2210 </tr>
2211 <tr>
2212 <td>
2213 <table border="0">
2214 <tr><td class="title">Article Title 2</td></tr>
2215 <tr><td class="subtext">200 points by user2</td></tr>
2216 </table>
2217 </td>
2218 </tr>
2219 </table>
2220 </body>
2221 </html>
2222 "#;
2223
2224 let result = html_to_markdown(hn_html, "https://news.ycombinator.com", false).unwrap();
2225
2226 assert!(
2228 result.len() < 1000,
2229 "Markdown output too large: {} bytes",
2230 result.len()
2231 );
2232
2233 assert!(result.contains("Article Title 1"));
2235 assert!(result.contains("Article Title 2"));
2236
2237 let pipe_count = result.chars().filter(|&c| c == '|').count();
2240 assert!(pipe_count < 10, "Too many table delimiters: {}", pipe_count);
2241 }
2242
2243 #[test]
2245 fn test_convert_relative_image_to_absolute() {
2246 let md = "";
2247 let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
2248
2249 assert_eq!(result, "");
2250 }
2251
2252 #[test]
2253 fn test_convert_relative_link_to_absolute() {
2254 let md = "[Home](../index.html)";
2255 let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
2256
2257 assert_eq!(result, "[Home](https://example.com/index.html)");
2258 }
2259
2260 #[test]
2261 fn test_keep_absolute_urls_unchanged() {
2262 let md = "";
2263 let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
2264
2265 assert_eq!(result, "");
2266 }
2267
2268 #[test]
2269 fn test_keep_data_uris_unchanged() {
2270 let md = "";
2271 let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
2272
2273 assert_eq!(result, "");
2274 }
2275
2276 #[test]
2277 fn test_keep_anchors_unchanged() {
2278 let md = "[Section](#heading)";
2279 let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
2280
2281 assert_eq!(result, "[Section](#heading)");
2282 }
2283
2284 #[test]
2285 fn test_complex_relative_paths() {
2286 let md = "";
2287 let result =
2288 convert_urls_to_absolute(md, "https://example.com/a/b/c/page.html").unwrap();
2289
2290 assert_eq!(result, "");
2291 }
2292
2293 #[test]
2294 fn test_root_relative_urls() {
2295 let md = "";
2296 let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
2297
2298 assert_eq!(result, "");
2299 }
2300
2301 #[test]
2302 fn test_protocol_relative_urls() {
2303 let md = "";
2304 let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
2305
2306 assert_eq!(result, "");
2307 }
2308
2309 #[test]
2310 fn test_urls_with_query_params() {
2311 let md = "[API](../api/v1?foo=bar&baz=qux)";
2312 let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
2313
2314 assert_eq!(
2315 result,
2316 "[API](https://example.com/api/v1?foo=bar&baz=qux)"
2317 );
2318 }
2319
2320 #[test]
2321 fn test_urls_with_fragments() {
2322 let md = "[Section](../page.html#section)";
2323 let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
2324
2325 assert_eq!(result, "[Section](https://example.com/page.html#section)");
2326 }
2327
2328 #[test]
2329 fn test_multiple_images_and_links() {
2330 let md = r#"
2331
2332[Home](../index.html)
2333
2334[Absolute](https://other.com/page)
2335"#;
2336 let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();
2337
2338 assert!(result.contains(""));
2339 assert!(result.contains("[Home](https://example.com/index.html)"));
2340 assert!(result.contains(""));
2341 assert!(result.contains("[Absolute](https://other.com/page)"));
2342 }
2343
2344 #[test]
2345 fn test_full_pipeline_with_url_conversion() {
2346 let html = r#"
2347 <html>
2348 <body>
2349 <img src="../images/logo.png" alt="Logo">
2350 <a href="../about.html">About</a>
2351 <img src="https://cdn.example.com/banner.jpg" alt="Banner">
2352 </body>
2353 </html>
2354 "#;
2355
2356 let result = html_to_markdown(html, "https://example.com/docs/page.html", false).unwrap();
2357
2358 assert!(result.contains(""));
2360
2361 assert!(result.contains(""));
2363
2364 assert!(!result.contains("../images/logo.png"));
2366 }
2367
2368 #[test]
2369 fn test_edge_case_empty_alt_text() {
2370 let md = "";
2371 let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();
2372
2373 assert_eq!(result, "");
2374 }
2375
2376 #[test]
2377 fn test_edge_case_special_chars_in_url() {
2378 let md = "[Link](path%20with%20spaces.html)";
2379 let result = convert_urls_to_absolute(md, "https://example.com/").unwrap();
2380
2381 assert_eq!(result, "[Link](https://example.com/path%20with%20spaces.html)");
2382 }
2383
2384 #[test]
2386 fn test_escape_multiline_links_simple() {
2387 let input = "[This is a\nlink](#heading)";
2388 let result = escape_multiline_links(input);
2389 assert_eq!(result, "[This is a link](#heading)");
2390 }
2391
2392 #[test]
2393 fn test_escape_multiline_links_multiple() {
2394 let input = "[Line 1\nLine 2\nLine 3](url)";
2395 let result = escape_multiline_links(input);
2396 assert_eq!(result, "[Line 1 Line 2 Line 3](url)");
2397 }
2398
2399 #[test]
2400 fn test_escape_multiline_links_nested_brackets() {
2401 let input = "[[inner\nlink]](url)";
2402 let result = escape_multiline_links(input);
2403 assert_eq!(result, "[[inner link]](url)");
2404 }
2405
2406 #[test]
2407 fn test_escape_multiline_links_no_newlines() {
2408 let input = "[Normal link](url)";
2409 let result = escape_multiline_links(input);
2410 assert_eq!(result, "[Normal link](url)");
2411 }
2412
2413 #[test]
2414 fn test_escape_multiline_links_outside_links() {
2415 let input = "Text before\n[link](url)\nText after";
2416 let result = escape_multiline_links(input);
2417 assert_eq!(result, "Text before\n[link](url)\nText after");
2418 }
2419
2420 #[test]
2421 fn test_escape_multiline_links_multiple_links() {
2422 let input = "[First\nlink](url1) and [second\nlink](url2)";
2423 let result = escape_multiline_links(input);
2424 assert_eq!(result, "[First link](url1) and [second link](url2)");
2425 }
2426
2427 #[test]
2428 fn test_escape_multiline_links_empty_link() {
2429 let input = "[](url)";
2430 let result = escape_multiline_links(input);
2431 assert_eq!(result, "[](url)");
2432 }
2433
2434 #[test]
2435 fn test_escape_multiline_links_image_syntax() {
2436 let input = "";
2438 let result = escape_multiline_links(input);
2439 assert_eq!(result, "");
2440 }
2441
2442 #[test]
2443 fn test_escape_multiline_links_unmatched_bracket() {
2444 let input = "[unclosed link\nwithout closing bracket";
2446 let result = escape_multiline_links(input);
2447 assert_eq!(result, "[unclosed link without closing bracket");
2449 }
2450
2451 #[test]
2453 fn test_remove_skip_to_content() {
2454 let input = "[Skip to Content](#main)\n\n# Welcome\n\nContent here.";
2455 let expected = "# Welcome\n\nContent here.";
2456 assert_eq!(remove_accessibility_links(input), expected);
2457 }
2458
2459 #[test]
2460 fn test_remove_skip_to_main() {
2461 let input = "[Skip to Main](#main-content)\n\nActual content.";
2462 let expected = "Actual content.";
2463 assert_eq!(remove_accessibility_links(input), expected);
2464 }
2465
2466 #[test]
2467 fn test_remove_skip_to_navigation() {
2468 let input = "[Skip to Navigation](#nav)\n\nPage content.";
2469 let expected = "Page content.";
2470 assert_eq!(remove_accessibility_links(input), expected);
2471 }
2472
2473 #[test]
2474 fn test_remove_jump_to_content() {
2475 let input = "[Jump to Content](#content)\n\nMain text.";
2476 let expected = "Main text.";
2477 assert_eq!(remove_accessibility_links(input), expected);
2478 }
2479
2480 #[test]
2481 fn test_remove_multiple_skip_links() {
2482 let input = "[Skip to Content](#main)\n[Skip to Navigation](#nav)\n\nContent.";
2483 let expected = "Content.";
2484 assert_eq!(remove_accessibility_links(input), expected);
2485 }
2486
2487 #[test]
2488 fn test_preserve_regular_links() {
2489 let input = "[Regular Link](https://example.com)\n\nContent.";
2490 let expected = "[Regular Link](https://example.com)\n\nContent.";
2491 assert_eq!(remove_accessibility_links(input), expected);
2492 }
2493
2494 #[test]
2495 fn test_case_insensitive_skip_links() {
2496 let input = "[SKIP TO CONTENT](#main)\n\nContent.";
2497 let expected = "Content.";
2498 assert_eq!(remove_accessibility_links(input), expected);
2499 }
2500
2501 #[test]
2502 fn test_screen_reader_text() {
2503 let input = "[Screen reader only: Navigation menu](#nav)\n\nContent.";
2504 let expected = "Content.";
2505 assert_eq!(remove_accessibility_links(input), expected);
2506 }
2507
2508 #[test]
2509 fn test_no_removal_in_middle_of_text() {
2510 let input = "Some text [Skip to Content](#main) more text.";
2511 assert!(remove_accessibility_links(input).contains("Skip to Content"));
2513 }
2514
2515 #[test]
2516 fn test_back_to_top_removal() {
2517 let input = "Content here\n[Back to Top](#top)";
2518 let expected = "Content here\n";
2519 assert_eq!(remove_accessibility_links(input), expected);
2520 }
2521
2522 #[test]
2523 fn test_go_to_content_removal() {
2524 let input = "[Go to Main](#main)\n\nPage content.";
2525 let expected = "Page content.";
2526 assert_eq!(remove_accessibility_links(input), expected);
2527 }
2528
2529 #[test]
2530 fn test_skip_navigation_lowercase() {
2531 let input = "[Skip navigation](#nav)\n\nContent.";
2532 let expected = "Content.";
2533 assert_eq!(remove_accessibility_links(input), expected);
2534 }
2535
2536 #[test]
2537 fn test_multiple_accessibility_variants() {
2538 let input = "[Skip to Content](#main)\n[Jump to Navigation](#nav)\n[Back to Top](#top)\n\nActual content.";
2539 let expected = "Actual content.";
2540 assert_eq!(remove_accessibility_links(input), expected);
2541 }
2542
2543 #[test]
2544 fn test_debug_regex_pattern() {
2545 let input = "[Skip to Main](#main)\n\nArticle Title\n==========";
2546 let result = remove_accessibility_links(input);
2547
2548 assert!(!result.contains("Skip to Main"));
2550 assert_eq!(result, "Article Title\n==========");
2551 }
2552
2553 #[test]
2555 fn test_full_pipeline_with_all_improvements() {
2556 let html = r##"
2557 <html>
2558 <body>
2559 <nav><a href="#main">Skip to content</a></nav>
2560 <main>
2561 <h1>Test Page</h1>
2562 <p>Regular content here.</p>
2563 <img srcset="small.jpg 300w, medium.jpg 600w, large.jpg 1200w" alt="Test Image">
2564 <p>More content with <a href="#section">multi-line
2565link text</a>.</p>
2566 </main>
2567 <footer><a href="#top">Back to Top</a></footer>
2568 </body>
2569 </html>
2570 "##;
2571
2572 let result = html_to_markdown(html, "https://example.com", false).unwrap();
2573
2574 assert!(result.contains(""));
2576
2577 assert!(!result.contains("Skip to content"));
2579 assert!(!result.contains("Back to Top"));
2580
2581 assert!(!result.contains("small.jpg"));
2583 assert!(!result.contains("medium.jpg"));
2584
2585 assert!(result.contains("Test Page"));
2587 assert!(result.contains("Regular content"));
2588 }
2589
2590 #[test]
2591 fn test_srcset_resolution_integration() {
2592 let html = r#"
2593 <img srcset="img-400.jpg 400w, img-800.jpg 800w, img-1600.jpg 1600w" alt="Responsive">
2594 <img srcset="icon@1x.png 1x, icon@2x.png 2x, icon@3x.png 3x" alt="Retina">
2595 <img src="regular.jpg" alt="Normal">
2596 "#;
2597
2598 let result = html_to_markdown(html, "https://cdn.example.com", false).unwrap();
2599
2600 eprintln!("Result markdown:\n{}", result);
2602
2603 assert!(result.contains("") ||
2605 result.contains("img-1600.jpg"), "Expected to find img-1600.jpg in output");
2606
2607 assert!(result.contains("") ||
2609 result.contains("icon@3x.png"), "Expected to find icon@3x.png in output");
2610
2611 assert!(result.contains("") ||
2613 result.contains("regular.jpg"), "Expected to find regular.jpg in output");
2614 }
2615
2616 #[test]
2617 fn test_multiline_link_escaping_integration() {
2618 let html = r#"
2619 <a href="https://example.com">This is a
2620 multi-line
2621 link</a>
2622 "#;
2623
2624 let result = html_to_markdown(html, "https://example.com", false).unwrap();
2625
2626 eprintln!("Multiline link result:\n{}", result);
2628
2629 assert!(result.contains("["));
2632 assert!(result.contains("]"));
2633 assert!(result.contains("(https://example.com)"));
2634 }
2635
2636 #[test]
2637 fn test_accessibility_link_removal_integration() {
2638 let html = r##"
2639 <nav>
2640 <a href="#content">Skip to Content</a>
2641 <a href="#main">Skip to Main</a>
2642 </nav>
2643 <main id="content">
2644 <h1>Article Title</h1>
2645 <p>Article content.</p>
2646 <a href="https://example.com">Normal Link</a>
2647 </main>
2648 <footer>
2649 <a href="#top">Back to Top</a>
2650 </footer>
2651 "##;
2652
2653 let result = html_to_markdown(html, "https://example.com", false).unwrap();
2654
2655 assert!(!result.contains("Skip to Content"));
2657 assert!(!result.contains("Skip to Main"));
2658 assert!(!result.contains("Back to Top"));
2659
2660 assert!(result.contains("Article Title"));
2662 assert!(result.contains("Article content"));
2663 }
2664
2665 #[test]
2667 fn test_setext_h1_to_atx() {
2668 let md = "Title\n=====\n\nContent";
2669 let cleaned = clean_markdown(md);
2670 assert!(cleaned.contains("# Title"), "Expected ATX h1, got: {}", cleaned);
2671 assert!(!cleaned.contains("====="));
2672 }
2673
2674 #[test]
2675 fn test_setext_h2_to_atx() {
2676 let md = "Subtitle\n--------\n\nContent";
2677 let cleaned = clean_markdown(md);
2678 assert!(cleaned.contains("## Subtitle"), "Expected ATX h2, got: {}", cleaned);
2679 assert!(!cleaned.contains("--------"));
2680 }
2681
2682 #[test]
2683 fn test_setext_preserves_existing_atx() {
2684 let md = "# Already ATX\n\nContent";
2685 let cleaned = clean_markdown(md);
2686 assert!(cleaned.contains("# Already ATX"));
2687 }
2688
2689 #[test]
2690 fn test_setext_multiple_headings() {
2691 let md = "First\n=====\n\nSecond\n------\n\nThird\n=====";
2692 let cleaned = clean_markdown(md);
2693 assert!(cleaned.contains("# First"));
2694 assert!(cleaned.contains("## Second"));
2695 assert!(cleaned.contains("# Third"));
2696 }
2697
2698 #[test]
2700 fn test_base64_image_replacement() {
2701 let md = "";
2702 let cleaned = clean_markdown(md);
2703 assert_eq!(cleaned, "");
2704 }
2705
2706 #[test]
2707 fn test_base64_image_preserves_normal_images() {
2708 let md = "";
2709 let cleaned = clean_markdown(md);
2710 assert!(cleaned.contains(""));
2711 }
2712
2713 #[test]
2714 fn test_base64_image_mixed() {
2715 let md = " and ";
2716 let cleaned = clean_markdown(md);
2717 assert!(cleaned.contains(""));
2718 assert!(cleaned.contains(""));
2719 assert!(!cleaned.contains("base64"));
2720 }
2721
2722 #[test]
2724 fn test_json_object_detection() {
2725 let json = r#"{"name": "test", "value": 42}"#;
2726 let result = html_to_markdown(json, "https://example.com", false).unwrap();
2727 assert!(result.contains("# JSON Response"), "Expected heading, got: {}", result);
2728 assert!(result.contains("```json\n"));
2729 assert!(result.ends_with("\n```"));
2730 assert!(result.contains(r#""name": "test""#));
2731 }
2732
2733 #[test]
2734 fn test_json_array_detection() {
2735 let json = r#"[{"id": 1}, {"id": 2}]"#;
2736 let result = html_to_markdown(json, "https://example.com", false).unwrap();
2737 assert!(result.contains("```json\n"));
2738 assert!(result.contains(r#""id": 1"#));
2739 }
2740
2741 #[test]
2742 fn test_html_not_detected_as_json() {
2743 let html = "<html><body><p>Hello</p></body></html>";
2744 let result = html_to_markdown(html, "https://example.com", false).unwrap();
2745 assert!(!result.starts_with("```json"));
2746 assert!(result.contains("Hello"));
2747 }
2748
2749 #[test]
2751 fn test_empty_result_fallback() {
2752 let html = r#"
2754 <html>
2755 <body>
2756 <main><span></span></main>
2757 <div>Actual content is here with enough text to be useful for AI agents.</div>
2758 </body>
2759 </html>
2760 "#;
2761 let result = html_to_markdown(html, "https://example.com", true).unwrap();
2762 assert!(result.contains("Actual content is here"));
2764 }
2765
2766 #[test]
2768 fn test_modal_noise_selectors_present() {
2769 let html = r#"
2774 <html>
2775 <body>
2776 <div class="modal">
2777 <h2>Sign up now!</h2>
2778 <form>
2779 <input type="email" placeholder="Email">
2780 <button>Subscribe</button>
2781 </form>
2782 </div>
2783 <div class="overlay" style="position:fixed">
2784 <p>Overlay content</p>
2785 </div>
2786 <h1>Main Page Title</h1>
2787 <p>This is the main content of the page that should be preserved.</p>
2788 <p>More content paragraphs here.</p>
2789 </body>
2790 </html>
2791 "#;
2792 let result = extract_main_content_html(html).unwrap();
2793 assert!(!result.contains("Sign up now"), "Modal content should be removed");
2795 assert!(!result.contains("Overlay content"), "Overlay content should be removed");
2796 assert!(result.contains("Main Page Title"));
2798 assert!(result.contains("main content of the page"));
2799 }
2800
2801 #[test]
2803 fn test_escaped_html_tag_removal() {
2804 let md = r#"Content \<style\> \</style\> more text \</a\> end"#;
2805 let cleaned = clean_markdown(md);
2806 assert!(!cleaned.contains(r"\<style\>"), "Escaped style tag should be removed");
2807 assert!(!cleaned.contains(r"\</a\>"), "Escaped closing tag should be removed");
2808 assert!(cleaned.contains("Content"));
2809 assert!(cleaned.contains("more text"));
2810 assert!(cleaned.contains("end"));
2811 }
2812
2813 #[test]
2814 fn test_escaped_html_comment_removal() {
2815 let md = r#"Before \<!-- comment --\> After"#;
2816 let cleaned = clean_markdown(md);
2817 assert!(!cleaned.contains("comment"), "Escaped comment should be removed");
2818 assert!(cleaned.contains("Before"));
2819 assert!(cleaned.contains("After"));
2820 }
2821
2822 #[test]
2823 fn test_escaped_tags_preserve_normal_content() {
2824 let md = "The value is a < b and 5 > 3";
2826 let cleaned = clean_markdown(md);
2827 assert!(cleaned.contains("a < b"), "Normal < should be preserved: {}", cleaned);
2828 }
2829
2830 #[test]
2832 fn test_code_fence_protection() {
2833 let md = "Some text\n\n```html\n<div class=\"container\">\n <p>Hello</p>\n</div>\n```\n\nMore text";
2834 let result = clean_html_from_markdown(md);
2835 assert!(result.contains("<div class=\"container\">"), "HTML in code fence should be preserved: {}", result);
2837 assert!(result.contains("<p>Hello</p>"), "HTML tags in code fence should be preserved: {}", result);
2838 assert!(result.contains("Some text"));
2840 assert!(result.contains("More text"));
2841 }
2842
2843 #[test]
2844 fn test_code_fence_protection_multiple_blocks() {
2845 let md = "Text\n\n```html\n<strong>bold</strong>\n```\n\nMiddle <div>removed</div>\n\n```js\nconst x = '<span>test</span>';\n```\n\nEnd";
2846 let result = clean_html_from_markdown(md);
2847 assert!(result.contains("<strong>bold</strong>"), "HTML in first code fence preserved");
2849 assert!(!result.contains("<div>removed</div>"), "HTML outside code fence should be stripped");
2851 assert!(result.contains("removed"));
2852 assert!(result.contains("<span>test</span>"), "HTML in second code fence preserved");
2854 }
2855
2856 #[test]
2858 fn test_inline_bold_conversion() {
2859 let html = "<strong>important</strong> text <b>also bold</b>";
2860 let result = clean_html_from_markdown(html);
2861 assert!(result.contains("**important**"), "Expected **important**, got: {}", result);
2862 assert!(result.contains("**also bold**"), "Expected **also bold**, got: {}", result);
2863 }
2864
2865 #[test]
2866 fn test_inline_italic_conversion() {
2867 let html = "<em>emphasized</em> text <i>also italic</i>";
2868 let result = clean_html_from_markdown(html);
2869 assert!(result.contains("_emphasized_"), "Expected _emphasized_, got: {}", result);
2870 assert!(result.contains("_also italic_"), "Expected _also italic_, got: {}", result);
2871 }
2872
2873 #[test]
2874 fn test_inline_code_conversion() {
2875 let html = "Use <code>console.log()</code> for debugging";
2876 let result = clean_html_from_markdown(html);
2877 assert!(result.contains("`console.log()`"), "Expected `console.log()`, got: {}", result);
2878 }
2879
2880 #[test]
2881 fn test_pre_code_language_detection() {
2882 let html = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;
2883 let result = clean_html_from_markdown(html);
2884 assert!(result.contains("```rust"), "Expected ```rust, got: {}", result);
2885 assert!(result.contains("fn main() {}"), "Expected code content");
2886 }
2887
2888 #[test]
2889 fn test_pre_code_lang_prefix() {
2890 let html = r#"<pre><code class="lang-python">print("hello")</code></pre>"#;
2891 let result = clean_html_from_markdown(html);
2892 assert!(result.contains("```python"), "Expected ```python, got: {}", result);
2893 }
2894
2895 #[test]
2896 fn test_pre_code_highlight_prefix() {
2897 let html = r#"<pre><code class="highlight-javascript">const x = 1;</code></pre>"#;
2898 let result = clean_html_from_markdown(html);
2899 assert!(result.contains("```javascript"), "Expected ```javascript, got: {}", result);
2900 }
2901
2902 #[test]
2903 fn test_anchor_to_markdown_link() {
2904 let html = r#"Visit <a href="https://example.com">Example</a> for details."#;
2905 let result = clean_html_from_markdown(html);
2906 assert!(result.contains("[Example](https://example.com)"), "Expected markdown link, got: {}", result);
2907 }
2908
2909 #[test]
2910 fn test_anchor_javascript_href_stripped() {
2911 let html = r#"<a href="javascript:void(0)">Click me</a>"#;
2912 let result = clean_html_from_markdown(html);
2913 assert!(result.contains("Click me"));
2914 assert!(!result.contains("javascript:"));
2915 }
2916
2917 #[test]
2918 fn test_preprocess_resolves_relative_urls() {
2919 let html = r#"<a href="/about">About</a> <img src="/logo.png">"#;
2920 let result = preprocess_html_for_conversion(html, "https://example.com");
2921 assert!(result.contains("https://example.com/about"), "Expected absolute href, got: {}", result);
2922 assert!(result.contains("https://example.com/logo.png"), "Expected absolute src, got: {}", result);
2923 }
2924
2925 #[test]
2926 fn test_preprocess_preserves_absolute_urls() {
2927 let html = r#"<a href="https://other.com/page">Link</a>"#;
2928 let result = preprocess_html_for_conversion(html, "https://example.com");
2929 assert!(result.contains("https://other.com/page"));
2930 }
2931
2932 #[test]
2933 fn test_preprocess_strips_gutter_elements() {
2934 let html = r#"<pre><td class="gutter"><span>1</span></td><td class="code">let x = 1;</td></pre>"#;
2935 let result = preprocess_html_for_conversion(html, "https://example.com");
2936 assert!(!result.contains("gutter"), "Gutter should be stripped, got: {}", result);
2937 assert!(result.contains("let x = 1;"));
2938 }
2939
2940 #[test]
2941 fn test_ui_noise_loading_sponsored() {
2942 let md = "# Products\n\nLoading...\n\nSponsored\n\nSome product here\n\nNotifications";
2943 let cleaned = clean_markdown(md);
2944 assert!(!cleaned.contains("Loading..."));
2945 assert!(!cleaned.contains("Sponsored"));
2946 assert!(!cleaned.contains("Notifications"));
2947 assert!(cleaned.contains("# Products"));
2948 assert!(cleaned.contains("Some product here"));
2949 }
2950
2951 #[test]
2952 fn test_copyright_footer_removal() {
2953 let md = "# Page\n\nContent here\n\nCopyright © 2024 Acme Inc. All Rights Reserved.\n\nMore content";
2954 let cleaned = clean_markdown(md);
2955 assert!(!cleaned.contains("Copyright ©"));
2956 assert!(cleaned.contains("Content here"));
2957 assert!(cleaned.contains("More content"));
2958 }
2959
2960 #[test]
2961 fn test_link_whitespace_normalization() {
2962 let md = "[ Apple ](https://example.com)";
2963 let cleaned = clean_markdown(md);
2964 assert!(cleaned.contains("[Apple](https://example.com)"), "Got: {}", cleaned);
2965 }
2966
2967 #[test]
2968 fn test_link_text_deduplication() {
2969 let md = "[Apple Apple](https://example.com)";
2970 let cleaned = clean_markdown(md);
2971 assert!(cleaned.contains("[Apple](https://example.com)"), "Got: {}", cleaned);
2972 }
2973
2974 #[test]
2975 fn test_link_text_dedup_multiword() {
2976 let md = "[New York New York](https://example.com)";
2978 let cleaned = clean_markdown(md);
2979 assert!(cleaned.contains("[New York](https://example.com)"), "Got: {}", cleaned);
2980 }
2981
2982 #[test]
2983 fn test_link_text_no_false_dedup() {
2984 let md = "[Apple Samsung](https://example.com)";
2986 let cleaned = clean_markdown(md);
2987 assert!(cleaned.contains("[Apple Samsung](https://example.com)"), "Got: {}", cleaned);
2988 }
2989
2990 #[test]
2991 fn test_repeated_list_items_collapsed() {
2992 let md = "* Product info page\n\n* Product info page\n\n* Product info page\n\n* Product info page\n\nOther content";
2993 let cleaned = clean_markdown(md);
2994 let count = cleaned.matches("Product info page").count();
2996 assert!(count <= 2, "Expected <= 2 occurrences but got {}: {}", count, cleaned);
2997 }
2998}