essence-engine 0.2.0

use crate::error::Result;
use regex::Regex;
use scraper::{Html, Selector};
use std::sync::LazyLock;
use url::Url;

macro_rules! cached_regex {
    ($name:ident, $pattern:expr) => {
        static $name: LazyLock<Regex> = LazyLock::new(|| Regex::new($pattern).unwrap());
    };
}

// Pre-compiled regexes for strip_non_content_tags
cached_regex!(RE_SCRIPT, r"(?is)<script[^>]*?>.*?</script>");
cached_regex!(RE_STYLE, r"(?is)<style[^>]*?>.*?</style>");
cached_regex!(RE_NOSCRIPT, r"(?is)<noscript[^>]*?>.*?</noscript>");
cached_regex!(RE_SVG, r"(?is)<svg[^>]*?>.*?</svg>");
cached_regex!(RE_HEAD, r"(?is)<head[^>]*?>.*?</head>");
cached_regex!(RE_COMMENT, r"(?is)<!--.*?-->");

// (strip_layout_tables uses function-local LazyLock)

// Pre-compiled regexes for clean_markdown
cached_regex!(RE_SETEXT_H1, r"(?m)^[ \t]*(.+)\n[ \t]*={3,}\s*$");
cached_regex!(RE_SETEXT_H2, r"(?m)^[ \t]*(.+)\n[ \t]*-{3,}\s*$");
cached_regex!(RE_ESCAPED_TAG, r"\\</?[a-zA-Z!][^\n>]*?\\?>");
cached_regex!(RE_CSS_ROOT, r":root\{--[^}]+\}");
cached_regex!(RE_BASE64_IMG, r"(!\[[^\]]*\])\(data:image/[^;]+;base64,[^)]*\)");
cached_regex!(RE_EMPTY_LINK, r"\[([^\]]*)\]\(\s*\)");
cached_regex!(RE_EMPTY_LIST_RUN, r"(?m)(?:^\s*\*\s*\n){3,}");
cached_regex!(RE_COLLAPSE_NEWLINES, r"\n\s*\n\s*\n+");

// (convert_urls_to_absolute uses function-local LazyLock)

cached_regex!(RE_IMG_TAG, r#"<img\s[^>]*?>"#);
cached_regex!(RE_IMG_SRC, r#"src\s*=\s*["']([^"']+)["']"#);
cached_regex!(RE_IMG_ALT, r#"alt\s*=\s*["']([^"']*?)["']"#);
cached_regex!(RE_CATCHALL_TAG, r"</?[a-zA-Z][a-zA-Z0-9]*(?:\s[^>]*)?>"); 
cached_regex!(RE_MULTI_NEWLINE, r"\n{3,}");

// Pre-compiled regexes for link conversion (new: convert <a> to markdown instead of stripping)
cached_regex!(RE_ANCHOR_TAG, r#"(?is)<a\s[^>]*?href\s*=\s*["']([^"']*)["'][^>]*?>(.*?)</a>"#);

// Pre-compiled regexes for pre/code block preservation  
// Captures the full <pre>...<code class="language-X">...</code>...</pre> block
cached_regex!(RE_PRE_CODE, r#"(?is)<pre[^>]*?>\s*<code([^>]*)>(.*?)</code>\s*</pre>"#);
cached_regex!(RE_PRE_BARE, r#"(?is)<pre[^>]*?>(.*?)</pre>"#);
// For extracting language from class attribute (Firecrawl: language-X, lang-X, highlight-X)
cached_regex!(RE_LANG_CLASS, r#"(?i)\b(?:language|lang|highlight)-([a-zA-Z0-9_+-]+)"#);

// (bloat detection uses per-function LazyLock for selective table removal)

// Pre-compiled regexes for heading-in-link extraction and tracking pixel removal
cached_regex!(RE_HEADING_IN_LINK, r"\[\s*(#{1,6})\s+(.+?)\s*#{0,6}\s*\]\(([^)]+)\)");
cached_regex!(RE_EMPTY_TEXT_LINK, r"(^|[^!])\[\]\([^)]+\)");
cached_regex!(RE_TRACKING_PIXEL, r"!\[[^\]]*\]\([^)]*(?:s_1x2\.gif|pixel\.gif|spacer\.gif|blank\.gif|clear\.gif)[^)]*\)");

// (escape_multiline_links uses char-by-char iteration, no regex needed)

/// Convert HTML to Markdown using html2md
pub fn html_to_markdown(html: &str, base_url: &str, only_main: bool) -> Result<String> {
    // Detect JSON responses and wrap in code fences instead of parsing as HTML
    let trimmed = html.trim();
    if (trimmed.starts_with('{') && trimmed.ends_with('}'))
        || (trimmed.starts_with('[') && trimmed.ends_with(']'))
    {
        // Validate it's actually JSON, not HTML that starts with a brace
        if serde_json::from_str::<serde_json::Value>(trimmed).is_ok() {
            return Ok(format!("# JSON Response\n\n```json\n{}\n```", trimmed));
        }
    }

    // Detect plain text responses (no HTML tags) and return as-is
    // This handles text/plain content like RFCs, READMEs, etc.
    if !trimmed.is_empty() && !trimmed.contains('<') {
        return Ok(trimmed.to_string());
    }

    let content = if only_main {
        extract_main_content_html(html)?
    } else {
        html.to_string()
    };

    // Rescue <img> tags from <noscript> blocks before they get stripped
    let content = crate::format::image_processing::rescue_noscript_images(&content);

    // Strip script/style/noscript BEFORE markdown conversion.
    let content = strip_non_content_tags(&content);

    // Resolve <picture> elements to simple <img> tags (pick largest source)
    let content = crate::format::image_processing::resolve_picture_elements(&content);

    // Pre-process HTML: resolve URLs, strip gutters (Firecrawl technique)
    let content = preprocess_html_for_conversion(&content, base_url);

    // Resolve srcset before markdown conversion to pick largest images
    let content = crate::format::image_processing::resolve_srcsets(&content);

    // Resolve lazy-loaded images (data-src, data-lazy-src, data-original, data-lazy-load) → src
    let content = crate::format::image_processing::resolve_lazy_images(&content);

    // Extract video poster frames as images
    let content = crate::format::image_processing::resolve_video_posters(&content);

    // Remove layout tables before markdown conversion to prevent mega-cell bloat
    let content = strip_layout_tables(&content);

    // Aggressively strip non-data tables for large HTML with heavy table content
    let content = strip_excessive_tables(&content);

    // Use html2md for conversion (in a thread with larger stack to handle deeply nested HTML)
    let markdown = safe_parse_html(&content);

    // BLOAT DETECTION: If markdown is massively larger than the HTML input,
    // selectively strip only large tables while preserving small/important ones
    // (infoboxes, data tables with few rows).
    let markdown = if markdown.len() > content.len() * 3 && content.len() > 10000 {
        static RE_INDIVIDUAL_TABLE: LazyLock<Regex> = LazyLock::new(|| {
            Regex::new(r"(?is)<table[^>]*>(.*?)</table>").unwrap()
        });
        let selective = RE_INDIVIDUAL_TABLE.replace_all(&content, |caps: &regex::Captures| {
            let full_match = &caps[0];
            let table_attrs = full_match.split('>').next().unwrap_or("");
            // Preserve infoboxes, wikitables, and small tables
            let is_important = table_attrs.contains("infobox")
                || table_attrs.contains("wikitable")
                || table_attrs.contains("data-table");
            let row_count = full_match.matches("<tr").count();
            if is_important || row_count <= 30 {
                full_match.to_string()
            } else {
                "\n".to_string()
            }
        }).to_string();
        safe_parse_html(&selective)
    } else {
        markdown
    };

    // Compress table cell whitespace (html2md pads cells for ASCII alignment, wasting tokens)
    let markdown = compress_markdown_tables(&markdown);

    // Safety cap: truncate excessively large output (e.g. Wikipedia with deeply nested tables)
    const MAX_MARKDOWN_BYTES: usize = 500_000;
    let markdown = if markdown.len() > MAX_MARKDOWN_BYTES {
        let truncated = &markdown[..MAX_MARKDOWN_BYTES];
        let cutoff = truncated.rfind('\n').unwrap_or(MAX_MARKDOWN_BYTES);
        format!(
            "{}\n\n[Content truncated: {} chars total]",
            &markdown[..cutoff],
            markdown.len()
        )
    } else {
        markdown
    };

    // Clean up the markdown
    let cleaned = clean_markdown(&markdown);

    // NEW: Escape multi-line links to prevent broken syntax
    let escaped = escape_multiline_links(&cleaned);

    // NEW: Remove accessibility links (skip to content, back to top)
    let no_skip_links = remove_accessibility_links(&escaped);

    let collapsed = RE_COLLAPSE_NEWLINES
        .replace_all(&no_skip_links, "\n\n")
        .to_string();

    // Inject page title as H1 if the markdown has no headings (e.g. paulgraham.com essays)
    let collapsed = if !collapsed.lines().any(|l| l.trim_start().starts_with('#')) {
        if let Some(title) = extract_title_from_html(html) {
            if !title.is_empty() {
                format!("# {}\n\n{}", title.trim(), collapsed)
            } else {
                collapsed
            }
        } else {
            collapsed
        }
    } else {
        collapsed
    };

    // Fallback: if main content extraction produced near-empty markdown, retry without it
    if only_main && collapsed.trim().len() < 50 {
        return html_to_markdown(html, base_url, false);
    }

    // Convert relative URLs to absolute for portability
    let portable = convert_urls_to_absolute(&collapsed, base_url)?;

    Ok(portable)
}

/// Extract <title> text from raw HTML for title injection on bare pages.
/// Uses regex to avoid re-parsing the full document.
fn extract_title_from_html(html: &str) -> Option<String> {
    static RE_TITLE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?is)<title[^>]*>(.*?)</title>").unwrap()
    });
    RE_TITLE.captures(html).map(|caps| {
        let raw = caps[1].trim().to_string();
        // Decode common entities in title
        raw.replace("&amp;", "&")
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", "\"")
            .replace("&#39;", "'")
            .replace("&nbsp;", " ")
    })
}

/// Extract main content from HTML (remove nav, footer, etc.)
pub fn extract_main_content_html(html: &str) -> Result<String> {
    let document = Html::parse_document(html);

    // Try common main content selectors in priority order
    // GitHub-specific selectors first for 5x token reduction
    let main_selectors = [
        "#readme",        // GitHub README content
        ".markdown-body", // GitHub markdown content
        // Wikipedia / MediaWiki
        "#mw-content-text",
        ".mw-parser-output",
        // Documentation: prefer article inside main (excludes sidebar)
        "main article",
        "[role='main'] article",
        // Docs frameworks that nest content specifically
        ".docs-content",
        ".doc-content",
        "[data-docs-content]",
        ".prose",          // Tailwind prose (Next.js docs, etc.)
        ".article-body",
        // Generic
        "main",
        "article",
        "[role='main']",
        ".main-content",
        "#main-content",
        ".content",
        "#content",
        ".post-content",
        ".entry-content",
        ".article-content",
        ".page-content",
        ".body-content",
        // Forum / community patterns
        "#inside",
        ".stories",
        ".itemlist",
    ];

    let html_len = html.len();
    for selector_str in &main_selectors {
        if let Ok(selector) = Selector::parse(selector_str) {
            if let Some(element) = document.select(&selector).next() {
                let content = element.html();
                // Skip if the matched element is too small relative to the page.
                // This prevents grabbing a single <article> product card on pages
                // with many <article> elements (e.g., books.toscrape.com).
                // Require at least 10% of the original HTML size.
                let min_size = html_len / 10;
                if content.len() >= min_size {
                    // Post-extraction: remove nav/sidebar elements nested inside main content
                    return Ok(remove_nested_nav(&content));
                }
                // Element too small, continue to next selector
            }
        }
    }

    // If no main content found, remove common non-content elements
    let mut cleaned_html = html.to_string();

    static REMOVE_SELECTORS_PARSED: LazyLock<Vec<Selector>> = LazyLock::new(|| {
        [
            // GitHub
            ".Layout-sidebar", ".file-navigation", ".BorderGrid",
            ".Layout-sidebar-left", ".Layout-sidebar-right", ".repository-content",
            ".file-tree", ".js-file-line-container", ".blob-wrapper",
            ".contributors-wrapper", ".discussion-sidebar",
            // Standard non-content
            "nav", "header", "footer", "aside",
            ".navigation", ".sidebar", ".menu", ".header", ".footer",
            "#header", "#footer", "#navigation",
            // Docs-specific sidebars/navs
            ".docs-sidebar", ".doc-sidebar", ".sidebar-nav",
            ".toc-sidebar", ".page-sidebar", ".left-sidebar",
            ".side-nav", ".sidenav",
            "#sidebar", "#toc",
            // ARIA roles for nav/complementary
            "[role='navigation']", "[role='complementary']",
            // Table of contents
            ".toc", ".table-of-contents",
            // Skip/accessibility links
            ".skip-link", ".skip-to-content",
            // Wikipedia-specific noise
            ".mw-editsection",   // [edit] links
            "#mw-panel",         // Left sidebar
            "#mw-head",          // Top nav
            ".navbox",           // Navigation boxes at bottom
            ".catlinks",         // Category links
            ".mw-indicators",    // Page status indicators
            ".sistersitebox",    // Sister project links
            "#p-lang-btn",       // Language button
            ".vector-page-toolbar", // Page tools
            ".vector-column-start", // Left column nav
            // Cookie/privacy
            ".cookie-banner", ".cookie-consent", ".cookie-notice",
            "#cookie-banner", "#cookie-consent",
            // Social/sharing
            ".share-buttons", ".social-share", ".social-links",
            // Ads
            ".ad", ".advertisement", ".ads",
            // Navigation noise
            ".breadcrumb", ".breadcrumbs",
            ".search-form", ".search-box",
            // Modals/overlays
            ".modal", ".popup", "#modal", ".overlay",
            // Widgets
            ".widget", "#widget",
            // Language selectors
            ".lang-selector", ".language", "#language-selector",
            // Bars
            ".top-bar", ".bottom-bar",
            ".gh-header", "#gh-header",
            // Raw noise elements
            "script", "style", "noscript", "svg",
        ]
        .iter()
        .filter_map(|s| Selector::parse(s).ok())
        .collect()
    });

    let doc = Html::parse_document(&cleaned_html);
    let mut to_remove = String::new();

    for selector in REMOVE_SELECTORS_PARSED.iter() {
        for element in doc.select(selector) {
            to_remove.push_str(&element.html());
        }
    }

    // Also remove with attribute-based selectors (can't be pre-parsed as easily)
    let attr_selectors = [
        "[class*='cookie']", "[aria-label='breadcrumb']",
        "[class*='cart']", "[class*='wishlist']", "[class*='account-']",
        "[class*='sponsored']", "[class*='banner']",
        "[class*='notification']", "[class*='alert']",
    ];
    for selector_str in &attr_selectors {
        if let Ok(selector) = Selector::parse(selector_str) {
            for element in doc.select(&selector) {
                to_remove.push_str(&element.html());
            }
        }
    }

    // Simple removal (not perfect but works for basic cases)
    for line in to_remove.lines() {
        if !line.trim().is_empty() {
            cleaned_html = cleaned_html.replace(line, "");
        }
    }

    Ok(if cleaned_html.trim().is_empty() {
        html.to_string()
    } else {
        cleaned_html
    })
}

/// Remove <nav>, <aside>, and sidebar elements nested inside extracted main content.
/// When extract_main_content_html selects <main>, sidebar navigation inside it survives.
/// This strips those elements from the fragment using regex (fast, no re-parse needed).
fn remove_nested_nav(html: &str) -> String {
    static NESTED_NAV_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
        vec![
            // <nav> elements (with any attributes or content)
            Regex::new(r"(?is)<nav[^>]*>.*?</nav>").unwrap(),
            // <aside> elements
            Regex::new(r"(?is)<aside[^>]*>.*?</aside>").unwrap(),
            // Common sidebar class patterns
            Regex::new(r#"(?is)<div[^>]*class\s*=\s*["'][^"']*\b(?:sidebar|side-nav|sidenav|sidebar-nav|toc-sidebar|page-sidebar)\b[^"']*["'][^>]*>.*?</div>"#).unwrap(),
            // role="navigation" or role="complementary"
            Regex::new(r#"(?is)<\w+[^>]*role\s*=\s*["'](?:navigation|complementary)["'][^>]*>.*?</\w+>"#).unwrap(),
        ]
    });

    let mut result = html.to_string();
    for re in NESTED_NAV_PATTERNS.iter() {
        result = re.replace_all(&result, "").to_string();
    }
    result
}

/// Strip layout tables (tables used for page structure, not data) to prevent markdown bloat.
///
/// Layout tables are characterized by:
/// - No <th> tags (data tables have headers)
/// - Nested tables inside cells
///
/// Run html2md::parse_html in a thread with a larger stack to handle deeply nested HTML
/// (e.g., Amazon product pages with 100+ nesting levels that overflow the default 8MB stack).
fn safe_parse_html(html: &str) -> String {
    // For small HTML, use the current thread directly
    if html.len() < 500_000 {
        return html2md::parse_html(html);
    }

    // For large HTML, spawn a thread with 32MB stack
    let html_owned = html.to_string();
    let result = std::thread::Builder::new()
        .name("html2md-parser".to_string())
        .stack_size(32 * 1024 * 1024) // 32MB stack
        .spawn(move || html2md::parse_html(&html_owned))
        .and_then(|handle| {
            handle.join().map_err(|_| {
                std::io::Error::other("html2md thread panicked")
            })
        });

    match result {
        Ok(markdown) => markdown,
        Err(e) => {
            tracing::warn!("html2md failed with large HTML ({}KB): {}", html.len() / 1024, e);
            // Fallback: extract text content without markdown formatting
            let doc = Html::parse_document(html);
            doc.root_element()
                .text()
                .collect::<Vec<_>>()
                .join(" ")
        }
    }
}

/// - cellpadding/cellspacing/border attributes (old-school layout)
/// - Used for page structure (like Hacker News)
///
/// Pre-process HTML before markdown conversion (like Firecrawl's approach):
///  1. Resolve relative URLs to absolute on <a href> and <img src> tags
///  2. Convert <pre><code> blocks to placeholder fenced code (with language detection)
///  3. Filter gutter/line-number elements from code blocks
///
/// Doing this in HTML-space (before html2md) produces much cleaner markdown output.
fn preprocess_html_for_conversion(html: &str, base_url: &str) -> String {
    let base = match Url::parse(base_url) {
        Ok(u) => u,
        Err(_) => return html.to_string(),
    };

    // Resolve <a href="..."> to absolute
    static RE_HREF: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r#"(<a\s[^>]*?href\s*=\s*["'])([^"']+)(["'])"#).unwrap()
    });
    let result = RE_HREF.replace_all(html, |caps: &regex::Captures| {
        let prefix = &caps[1];
        let href = &caps[2];
        let suffix = &caps[3];
        if href.starts_with("http://") || href.starts_with("https://") || href.starts_with("data:") || href.starts_with("javascript:") {
            caps[0].to_string()
        } else if href.starts_with('#') {
            // Resolve #anchor to full URL for portability
            let base_str = base.as_str().split('#').next().unwrap_or(base.as_str());
            format!("{}{}{}{}", prefix, base_str, href, suffix)
        } else if href.starts_with("//") {
            format!("{}https:{}{}", prefix, href, suffix)
        } else {
            match base.join(href) {
                Ok(abs) => format!("{}{}{}", prefix, abs, suffix),
                Err(_) => caps[0].to_string(),
            }
        }
    }).to_string();

    // Resolve <img src="..."> to absolute
    static RE_IMG_SRC_ATTR: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r#"(<img\s[^>]*?src\s*=\s*["'])([^"']+)(["'])"#).unwrap()
    });
    let result = RE_IMG_SRC_ATTR.replace_all(&result, |caps: &regex::Captures| {
        let prefix = &caps[1];
        let src = &caps[2];
        let suffix = &caps[3];
        if src.starts_with("http://") || src.starts_with("https://") || src.starts_with("data:") {
            caps[0].to_string()
        } else if src.starts_with("//") {
            format!("{}https:{}{}", prefix, src, suffix)
        } else {
            match base.join(src) {
                Ok(abs) => format!("{}{}{}", prefix, abs, suffix),
                Err(_) => caps[0].to_string(),
            }
        }
    }).to_string();

    // Remove gutter/line-number elements from code blocks (Firecrawl technique)
    static RE_GUTTER: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r#"(?is)<(?:td|span|div)[^>]*class\s*=\s*["'][^"']*(?:gutter|line-number|linenumber|hljs-ln-numbers|blob-num)[^"']*["'][^>]*>.*?</(?:td|span|div)>"#).unwrap()
    });
    let result = RE_GUTTER.replace_all(&result, "").to_string();

    result
}

/// Remove <script>, <style>, <noscript>, <svg>, and <head> tags with their content
/// before markdown conversion. html2md extracts text from these tags, producing
/// JavaScript/CSS/SVG noise in the output.
fn strip_non_content_tags(html: &str) -> String {
    let regexes: &[&Regex] = &[&RE_SCRIPT, &RE_STYLE, &RE_NOSCRIPT, &RE_SVG, &RE_HEAD, &RE_COMMENT];
    let mut result = html.to_string();
    for re in regexes {
        result = re.replace_all(&result, "").to_string();
    }
    result
}

fn strip_layout_tables(html: &str) -> String {
    // Use regex to find and remove layout tables
    // (?s) flag makes . match newlines

    let mut result = html.to_string();

    // Pattern 1: Tables with cellpadding/cellspacing/border="0" (classic layout tables)
    // These are almost always layout tables, not data tables
    // Note: (?s) makes . match newlines, .*? is non-greedy
    static RE_LAYOUT_TBL: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r#"(?s)<table[^>]*(cellpadding|cellspacing|border=["']?0["']?)[^>]*>.*?</table>"#).unwrap()
    });
    let layout_table_regex = &*RE_LAYOUT_TBL;

    // For nested tables, we need to process iteratively until no more matches
    // because simple regex can't handle balanced tags
    loop {
        let mut replacements = Vec::new();

        for cap in layout_table_regex.find_iter(&result) {
            let table_html = cap.as_str();

            // If the table has <th> tags, it's likely a data table, so preserve it
            if table_html.contains("<th") || table_html.contains("<th>") {
                continue;
            }

            // It's a layout table - extract text content only
            let table_doc = Html::parse_fragment(table_html);
            let text_content = table_doc
                .root_element()
                .text()
                .collect::<Vec<_>>()
                .join("\n");

            replacements.push((
                table_html.to_string(),
                format!(
                    "<div class=\"extracted-from-layout-table\">\n{}\n</div>",
                    text_content
                ),
            ));
        }

        // If no more replacements, we're done
        if replacements.is_empty() {
            break;
        }

        // Apply replacements
        for (old, new) in replacements {
            result = result.replace(&old, &new);
        }
    }

    result
}

/// Aggressively strip non-data tables when HTML is large and table-heavy.
/// Wikipedia articles can have deeply nested template/layout tables that
/// don't have cellpadding/cellspacing attributes (so strip_layout_tables misses them).
/// This catches them by checking total table bytes vs HTML size.
fn strip_excessive_tables(html: &str) -> String {
    // Only apply for large HTML documents
    if html.len() < 50_000 {
        return html.to_string();
    }

    // Count total bytes inside <table> tags
    static RE_TABLE_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?is)<table[^>]*>.*?</table>").unwrap()
    });

    let total_table_bytes: usize = RE_TABLE_BLOCK
        .find_iter(html)
        .map(|m| m.as_str().len())
        .sum();

    // If tables aren't dominant (< 50% of HTML), leave them alone
    if total_table_bytes < html.len() / 2 {
        return html.to_string();
    }

    // Tables are dominant — strip non-data tables aggressively
    RE_TABLE_BLOCK
        .replace_all(html, |caps: &regex::Captures| {
            let table_html = &caps[0];
            let table_attrs = table_html.split('>').next().unwrap_or("");

            // Always preserve known data table classes
            let is_data = table_attrs.contains("infobox")
                || table_attrs.contains("wikitable")
                || table_attrs.contains("data-table")
                || table_attrs.contains("sortable");

            // Check for <th> elements (header = data table)
            let has_headers = table_html.contains("<th") || table_html.contains("<th>");

            let row_count = table_html.matches("<tr").count();

            if is_data || (has_headers && row_count <= 50) || row_count <= 15 {
                // Keep data tables and small tables
                table_html.to_string()
            } else {
                // Layout/template table — extract text content only
                let doc = Html::parse_fragment(table_html);
                let text: String = doc
                    .root_element()
                    .text()
                    .collect::<Vec<_>>()
                    .join(" ");
                let trimmed = text.trim();
                if trimmed.is_empty() {
                    "\n".to_string()
                } else {
                    format!("\n{}\n", trimmed)
                }
            }
        })
        .to_string()
}

/// Compress whitespace in markdown table cells.
/// html2md pads table cells with spaces for ASCII column alignment,
/// which wastes thousands of tokens for tables with one long cell value.
fn compress_markdown_tables(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    for line in markdown.lines() {
        let trimmed = line.trim();
        if trimmed.starts_with('|') && trimmed.ends_with('|') {
            // This is a table row — compress whitespace in each cell
            let cells: Vec<&str> = trimmed.split('|').collect();
            let compressed: Vec<String> = cells
                .iter()
                .map(|cell| {
                    let t = cell.trim();
                    if t.is_empty() {
                        String::new()
                    } else if t.chars().all(|c| c == '-' || c == ':' || c == ' ') {
                        // Separator row — normalize to minimal form
                        let t = t.trim();
                        if t.starts_with(':') && t.ends_with(':') {
                            " :---: ".to_string()
                        } else if t.ends_with(':') {
                            " ---: ".to_string()
                        } else if t.starts_with(':') {
                            " :--- ".to_string()
                        } else {
                            " --- ".to_string()
                        }
                    } else {
                        format!(" {} ", t)
                    }
                })
                .collect();
            result.push_str(&compressed.join("|"));
        } else {
            result.push_str(line);
        }
        result.push('\n');
    }
    // Remove trailing newline added by the loop
    if result.ends_with('\n') && !markdown.ends_with('\n') {
        result.pop();
    }
    result
}

/// Clean up markdown output - remove HTML tags and convert images
fn clean_markdown(markdown: &str) -> String {
    // First, clean HTML tags that leaked through
    let cleaned = clean_html_from_markdown(markdown);

    // FIX #3: Remove invisible Unicode characters (zero-width spaces, BOM, etc.)
    let cleaned = strip_invisible_unicode(&cleaned);

    // Convert setext headings to ATX style for better AI agent parsing
    let cleaned = RE_SETEXT_H1.replace_all(&cleaned, "# $1").to_string();
    let cleaned = RE_SETEXT_H2.replace_all(&cleaned, "## $1").to_string();

    // Strip trailing duplicate hashes from ATX headings: "### Title ###" → "### Title"
    static RE_TRAILING_HASHES: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?m)^(#{1,6}\s+.+?)\s+#+\s*$").unwrap());
    let cleaned = RE_TRAILING_HASHES.replace_all(&cleaned, "$1").to_string();

    let lines: Vec<String> = cleaned.lines().map(|l| l.trim_end().to_string()).collect();

    // Remove excessive blank lines (more than 2 consecutive)
    let mut result = Vec::new();
    let mut blank_count = 0;

    for line in lines.iter() {
        if line.trim().is_empty() {
            blank_count += 1;
            if blank_count <= 2 {
                result.push(line.clone());
            }
        } else {
            blank_count = 0;
            result.push(line.clone());
        }
    }

    // Join and trim
    let joined = result.join("\n").trim().to_string();

    let joined = RE_ESCAPED_TAG.replace_all(&joined, "").to_string();
    let joined = joined.replace("\\ ", " ").replace("\\\\", "");
    let joined = RE_CSS_ROOT.replace_all(&joined, "").to_string();
    let joined = RE_BASE64_IMG.replace_all(&joined, "$1(data:image-removed)").to_string();
    let joined = RE_EMPTY_LINK.replace_all(&joined, "").to_string();

    // Collapse runs of empty list items (nav boilerplate from JS-rendered menus)
    let joined = RE_EMPTY_LIST_RUN.replace_all(&joined, "\n").to_string();

    // Remove common UI interactive noise (standalone lines)
    static UI_NOISE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(concat!(
            r"(?m)^\s*(?:",
            r"Ask about this section|Copy for LLM|View as Markdown|Copy as Markdown",
            r"|Open (?:Markdown|in Claude)(?:\s*Ask Docs AI)?(?:\s*Open in Claude)?",
            r"|Ask Docs AI\s*Open in Claude",
            r"|Was this (?:section |page )?helpful\s*(?:to you)?\??",
            r"|(?:Share|Tweet|Pin it|Email)",
            r"|(?:Table of [Cc]ontents|In this article|On this page)",
            r"|Show more|Read more|Load more|See all|Expand all|Collapse all",
            r"|Scroll to top|Back to top",
            r"|Primary navigation",
            // E-commerce / general UI noise
            r"|Loading\.\.\.",
            r"|Sponsored",
            r"|Notifications",
            r"|Expand (?:Cart|Watch List|My eBay)",
            r"|Shop by category",
            r"|All Categories",
            // Wikipedia-specific
            r"|Toggle the table of contents",
            r"|move to sidebar\s*hide",
            r"|\d+\s+languages?",
            r"|Edit links?",
            // Docs-specific
            r"|Edit this page on GitHub\s*",
            r"|Was this page helpful\s*(?:to you)?\??\s*(?:Yes|No)?",
            r"|Suggest (?:changes|edits?)",
            r"|Report (?:an? )?(?:issue|bug)",
            r")\s*$"
        )).unwrap()
    });
    let cleaned = UI_NOISE.replace_all(&joined, "").to_string();

    // Remove Wikipedia edit section links: [[edit](url)]
    static RE_EDIT_LINKS: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"\s*\[\[edit\]\([^)]*\)\]").unwrap()
    });
    let cleaned = RE_EDIT_LINKS.replace_all(&cleaned, "").to_string();

    // Extract headings trapped inside link text: [### Title ###](url) → ### [Title](url)
    let cleaned = RE_HEADING_IN_LINK
        .replace_all(&cleaned, "$1 [$2]($3)")
        .to_string();

    // Remove empty-text links [](url) — but not images ![](url)
    let cleaned = RE_EMPTY_TEXT_LINK.replace_all(&cleaned, "$1").to_string();

    // Remove tracking pixel images (1x2 spacer GIFs, etc.)
    let cleaned = RE_TRACKING_PIXEL.replace_all(&cleaned, "").to_string();

    // Remove leaked JavaScript code (inline scripts that escaped HTML stripping)
    // Handles escaped underscores in variable names (e.g. csell\_token\_map from html2md)
    // \w+(?:\\?\w+)* matches word chars optionally separated by backslash-escaped chars
    static RE_LEAKED_JS: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?m)^\s*(?:var|let|const)\s+\w+(?:\\?\w+)*\s*=.*$").unwrap()
    });
    let cleaned = RE_LEAKED_JS.replace_all(&cleaned, "").to_string();

    // Remove JavaScript-style property assignments (obj.prop = ...; or obj['key'] = ...;)
    // Handles escaped underscores in property names
    static RE_JS_PROP_ASSIGN: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r#"(?m)^\s*\w+(?:\\?\w+)*(?:\.\w+(?:\\?\w+)*|\[['"][^'"]*['"]\])\s*=\s*.*;\s*$"#).unwrap()
    });
    let cleaned = RE_JS_PROP_ASSIGN.replace_all(&cleaned, "").to_string();

    // Remove JavaScript function calls on their own line (e.g. csell\_GLOBAL\_INIT\_TAG();)
    static RE_JS_FUNC_CALL: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?m)^\s*\w+(?:\\?\w+)*(?:\.\w+(?:\\?\w+)*)*\([^)]*\)\s*;\s*$").unwrap()
    });
    let cleaned = RE_JS_FUNC_CALL.replace_all(&cleaned, "").to_string();

    // Strip copyright footer lines (common boilerplate)
    static RE_COPYRIGHT: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?m)^\s*Copyright\s+©.*$").unwrap()
    });
    let cleaned = RE_COPYRIGHT.replace_all(&cleaned, "").to_string();

    // Normalize excessive whitespace inside markdown link text: [  text  ](url) → [text](url)
    static RE_LINK_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"\[\s{2,}([^\]]*?)\s{2,}\]\(").unwrap()
    });
    let cleaned = RE_LINK_WHITESPACE.replace_all(&cleaned, "[$1](").to_string();

    // Collapse internal whitespace runs in link text: [  Apple   Apple  ](url)
    static RE_LINK_INNER_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"\[([^\]]*?)\s{2,}([^\]]*?)\]\(").unwrap()
    });
    // Apply multiple times since regex doesn't backtrack into replaced text
    let mut cleaned = cleaned;
    for _ in 0..3 {
        cleaned = RE_LINK_INNER_WHITESPACE.replace_all(&cleaned, "[$1 $2](").to_string();
    }

    // Deduplicate adjacent identical phrases in link text: [Apple Apple](url) → [Apple](url)
    static RE_LINK_TEXT: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"\[([^\]]+)\]\(").unwrap()
    });
    let cleaned = RE_LINK_TEXT.replace_all(&cleaned, |caps: &regex::Captures| {
        let text = caps[1].trim();
        let words: Vec<&str> = text.split_whitespace().collect();
        let len = words.len();
        // Check if text is exactly two identical halves (e.g. "Apple Apple", "HP HP")
        if len >= 2 && len.is_multiple_of(2) {
            let half = len / 2;
            if words[..half] == words[half..] {
                return format!("[{}](", words[..half].join(" "));
            }
        }
        format!("[{}](", text)
    }).to_string();

    // Collapse repeated identical list items (3+ in a row) to a single instance
    // e.g., "* Product information page\n\n* Product information page\n\n..."
    let cleaned = {
        let lines: Vec<&str> = cleaned.lines().collect();
        let mut result_lines: Vec<&str> = Vec::with_capacity(lines.len());
        let mut prev_item: Option<&str> = None;
        let mut repeat_count = 0u32;
        for line in &lines {
            let trimmed = line.trim();
            if trimmed.starts_with("* ") || trimmed.starts_with("- ") {
                if Some(trimmed) == prev_item {
                    repeat_count += 1;
                    if repeat_count < 2 {
                        result_lines.push(line);
                    }
                    // Skip 3rd+ consecutive identical list items
                } else {
                    prev_item = Some(trimmed);
                    repeat_count = 0;
                    result_lines.push(line);
                }
            } else {
                if !trimmed.is_empty() {
                    prev_item = None;
                    repeat_count = 0;
                }
                result_lines.push(line);
            }
        }
        result_lines.join("\n")
    };

    RE_COLLAPSE_NEWLINES.replace_all(&cleaned, "\n\n").to_string()
}

/// FIX #3: Remove invisible Unicode characters that waste tokens and break parsers
fn strip_invisible_unicode(text: &str) -> String {
    text.replace(['\u{200B}', '\u{FEFF}', '\u{200C}', '\u{200D}', '\u{2060}', '\u{FFFE}'], "") // Invalid BOM
}

/// FIX #2: Decode HTML entities to save tokens and fix URL parsing
fn decode_html_entities(text: &str) -> String {
    text.replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
        .replace("&nbsp;", " ")
        .replace("&ndash;", "\u{2013}")
        .replace("&mdash;", "\u{2014}")
        .replace("&hellip;", "\u{2026}")
        .replace("&lsquo;", "\u{2018}")
        .replace("&rsquo;", "\u{2019}")
        .replace("&ldquo;", "\u{201C}")
        .replace("&rdquo;", "\u{201D}")
        .replace("&bull;", "\u{2022}")
        .replace("&middot;", "\u{00B7}")
        .replace("&copy;", "\u{00A9}")
        .replace("&reg;", "\u{00AE}")
        .replace("&trade;", "\u{2122}")
}


/// Clean HTML tags from markdown output
/// Converts <img> tags to markdown format and removes other HTML
fn clean_html_from_markdown(text: &str) -> String {
    // STEP 0: Protect code fences AND inline code spans from HTML stripping.
    // Code fences (```...```)
    static RE_CODE_FENCE_LOCAL: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"(?s)```[^\n]*\n.*?```").unwrap());
    // Inline code spans (`...`) — must not be empty; code fences already protected above
    static RE_INLINE_CODE_SPAN: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"`[^`\n]+?`").unwrap());
    let mut code_blocks: Vec<String> = Vec::new();
    let text = RE_CODE_FENCE_LOCAL
        .replace_all(text, |caps: &regex::Captures| {
            let placeholder = format!("\x00CODE_FENCE_{}\x00", code_blocks.len());
            code_blocks.push(caps[0].to_string());
            placeholder
        })
        .to_string();
    // Protect inline code spans (e.g. `<head>`, `<title>`) from tag stripping
    let mut inline_code_spans: Vec<String> = Vec::new();
    let text = RE_INLINE_CODE_SPAN
        .replace_all(&text, |caps: &regex::Captures| {
            let placeholder = format!("\x00INLINE_CODE_{}\x00", inline_code_spans.len());
            inline_code_spans.push(caps[0].to_string());
            placeholder
        })
        .to_string();

    // STEP 1: Convert remaining <pre><code> blocks to markdown fences before stripping
    // Detect language from class="language-X" or class="lang-X" (Firecrawl technique)
    let mut result = RE_PRE_CODE
        .replace_all(&text, |caps: &regex::Captures| {
            let attrs = caps.get(1).map_or("", |m| m.as_str());
            let lang = RE_LANG_CLASS
                .captures(attrs)
                .and_then(|c| c.get(1))
                .map_or("", |m| m.as_str());
            let code_content = decode_html_entities(caps.get(2).map_or("", |m| m.as_str()));
            let trimmed = code_content.trim();
            if trimmed.is_empty() {
                String::new()
            } else {
                format!("\n```{}\n{}\n```\n", lang, trimmed)
            }
        })
        .to_string();

    // Convert bare <pre> blocks (no <code> child) to code fences
    result = RE_PRE_BARE
        .replace_all(&result, |caps: &regex::Captures| {
            let content = caps.get(1).map_or("", |m| m.as_str()).trim();
            if content.is_empty() {
                String::new()
            } else {
                format!("\n```\n{}\n```\n", decode_html_entities(content))
            }
        })
        .to_string();

    // STEP 2: Convert remaining <a> tags to markdown links instead of stripping
    result = RE_ANCHOR_TAG
        .replace_all(&result, |caps: &regex::Captures| {
            let href = &caps[1];
            let link_text = caps[2].trim();
            if link_text.is_empty() || href.is_empty() || href.starts_with("javascript:") {
                link_text.to_string()
            } else {
                format!("[{}]({})", link_text, href)
            }
        })
        .to_string();

    // STEP 3: Convert <img> tags to markdown images
    result = RE_IMG_TAG
        .replace_all(&result, |caps: &regex::Captures| {
            let img_tag = &caps[0];
            let src = RE_IMG_SRC
                .captures(img_tag)
                .and_then(|c| c.get(1))
                .map(|m| m.as_str())
                .unwrap_or("");
            let alt = RE_IMG_ALT
                .captures(img_tag)
                .and_then(|c| c.get(1))
                .map(|m| m.as_str())
                .unwrap_or("");
            if !src.is_empty() {
                format!("![{}]({})", alt, src)
            } else {
                String::new()
            }
        })
        .to_string();

    // STEP 3.5: Protect HTML tag names inside markdown link text from being stripped.
    // e.g. [<head>](url) → [`<head>`](url) so the tag survives the cleanup below.
    // Also protects inline content like "the <title> tag" → "the `<title>` tag"
    static RE_MD_LINK_WITH_TAG: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"\[([^\]]*<[a-zA-Z][a-zA-Z0-9]*[^]]*)\]\(([^)]+)\)").unwrap()
    });
    static RE_BARE_HTML_TAG_NAME: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"<(/?)([a-zA-Z][a-zA-Z0-9]*)>").unwrap()
    });
    result = RE_MD_LINK_WITH_TAG
        .replace_all(&result, |caps: &regex::Captures| {
            let link_text = &caps[1];
            let url = &caps[2];
            let protected = RE_BARE_HTML_TAG_NAME.replace_all(link_text, "`<$1$2>`");
            format!("[{}]({})", protected, url)
        })
        .to_string();

    // STEP 3.6: Protect inline HTML tag references in running text BEFORE TAG_PATTERNS strips them.
    // e.g. "Use the <title> tag in HTML" → "Use the `<title>` tag in HTML"
    // Only protects bare tags (no attributes) preceded and followed by text characters.
    // Excludes formatting tags that TAG_PATTERNS converts to markdown (em, strong, b, i, code, etc.)
    // Rust regex doesn't support lookbehinds, so we capture surrounding context.
    static RE_INLINE_TAG_REF: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"([a-zA-Z.,;:!?\s`])(</?[a-zA-Z][a-zA-Z0-9]*>)([a-zA-Z.,;:!?\s`])").unwrap()
    });
    // Tags that TAG_PATTERNS converts to markdown formatting — don't protect these
    static FORMATTING_TAGS: &[&str] = &[
        "em", "strong", "b", "i", "u", "s", "code", "kbd", "samp", "var",
        "mark", "small", "sup", "sub", "abbr", "cite", "dfn", "time", "data",
        "del", "ins", "q",
    ];
    // Apply twice to catch adjacent tags (first pass consumes trailing context char)
    for _ in 0..2 {
        result = RE_INLINE_TAG_REF
            .replace_all(&result, |caps: &regex::Captures| {
                let pre = &caps[1];
                let tag = &caps[2];
                let post = &caps[3];
                // Extract tag name (strip < / >)
                let tag_name = tag.trim_start_matches('<')
                    .trim_start_matches('/')
                    .trim_end_matches('>')
                    .to_lowercase();
                if FORMATTING_TAGS.contains(&tag_name.as_str()) {
                    // Let TAG_PATTERNS handle it
                    format!("{}{}{}", pre, tag, post)
                } else {
                    format!("{}`{}`{}", pre, tag, post)
                }
            })
            .to_string();
    }

    // STEP 4: Remove all remaining HTML tags using cached compiled regexes
    static TAG_PATTERNS: LazyLock<Vec<(Regex, &str)>> = LazyLock::new(|| {
        vec![
            (Regex::new(r"</?div[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?span[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?p[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"<br\s*/?>\s*").unwrap(), "\n"),
            (Regex::new(r"</?section[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?article[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?header[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?footer[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?nav[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?aside[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?main[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?button[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?form[^>]*?>").unwrap(), ""),
            (Regex::new(r"<input[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?select[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?option[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?textarea[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?label[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?fieldset[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?legend[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?sup[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?sub[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?small[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?mark[^>]*?>").unwrap(), ""),
            (Regex::new(r"<em[^>]*?>").unwrap(), "_"),
            (Regex::new(r"</em>").unwrap(), "_"),
            (Regex::new(r"<strong[^>]*?>").unwrap(), "**"),
            (Regex::new(r"</strong>").unwrap(), "**"),
            (Regex::new(r"<b[^>]*?>").unwrap(), "**"),
            (Regex::new(r"</b>").unwrap(), "**"),
            (Regex::new(r"<i[^>]*?>").unwrap(), "_"),
            (Regex::new(r"</i>").unwrap(), "_"),
            (Regex::new(r"</?u[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?s(?:\s[^>]*?)?>").unwrap(), ""),
            (Regex::new(r"<code[^>]*?>").unwrap(), "`"),
            (Regex::new(r"</code>").unwrap(), "`"),
            (Regex::new(r"</?kbd[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?samp[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?var[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?abbr[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?cite[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?dfn[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?time[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?data[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?h[1-6][^>]*?>").unwrap(), ""),
            (Regex::new(r"</?ul[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"</?ol[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"<li[^>]*?>").unwrap(), "- "),
            (Regex::new(r"</li>").unwrap(), "\n"),
            (Regex::new(r"</?table[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"</?thead[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?tbody[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?tfoot[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?tr[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"</?th[^>]*?>").unwrap(), " | "),
            (Regex::new(r"</?td[^>]*?>").unwrap(), " "),
            (Regex::new(r"</?caption[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"</?colgroup[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?col[^>]*?>").unwrap(), ""),
            (Regex::new(r"<!DOCTYPE[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?meta[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?link[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?title[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?base[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?head[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?body[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?html[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?blockquote[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"</?pre[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"<hr[^>]*?>").unwrap(), "\n---\n"),
            (Regex::new(r"</?dl[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"</?dt[^>]*?>").unwrap(), "\n"),
            (Regex::new(r"</?dd[^>]*?>").unwrap(), "  "),
            (Regex::new(r"</?picture[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?video[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?audio[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?source[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?track[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?canvas[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?figure[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?figcaption[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?details[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?summary[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?dialog[^>]*?>").unwrap(), ""),
            (Regex::new(r"(?is)<script[^>]*?>.*?</script>").unwrap(), ""),
            (Regex::new(r"(?is)<style[^>]*?>.*?</style>").unwrap(), ""),
            (Regex::new(r"(?is)<noscript[^>]*?>.*?</noscript>").unwrap(), ""),
            (Regex::new(r"(?is)<!--.*?-->").unwrap(), ""),
            (Regex::new(r"(?is)<!\[CDATA\[.*?\]\]>").unwrap(), ""),
            (Regex::new(r"(?is)<\?xml[^>]*?\?>").unwrap(), ""),
            (Regex::new(r"</?address[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?ins[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?del[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?q[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?wbr[^>]*?/?>").unwrap(), ""),
            (Regex::new(r"</?ruby[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?rt[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?rp[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?bdi[^>]*?>").unwrap(), ""),
            (Regex::new(r"</?bdo[^>]*?>").unwrap(), ""),
            (Regex::new(r"(?is)<iframe[^>]*?>.*?</iframe>").unwrap(), ""),
            (Regex::new(r"<iframe[^>]*?/?>").unwrap(), ""),
            (Regex::new(r"(?is)<object[^>]*?>.*?</object>").unwrap(), ""),
            (Regex::new(r"<embed[^>]*?/?>").unwrap(), ""),
            (Regex::new(r"</?param[^>]*?>").unwrap(), ""),
            (Regex::new(r"(?is)<template[^>]*?>.*?</template>").unwrap(), ""),
            (Regex::new(r"</?slot[^>]*?>").unwrap(), ""),
        ]
    });

    for (regex, replacement) in TAG_PATTERNS.iter() {
        result = regex.replace_all(&result, *replacement).to_string();
    }

    // Before catchall: convert remaining bare HTML element references to backtick code
    // e.g. "the <title> tag" → "the `<title>` tag". Only simple tags with no attributes.
    static RE_BARE_ELEMENT: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(</?[a-zA-Z][a-zA-Z0-9]*>)").unwrap()
    });
    result = RE_BARE_ELEMENT.replace_all(&result, "`$1`").to_string();

    // Catchall: remove any remaining HTML tags (those with attributes)
    result = RE_CATCHALL_TAG.replace_all(&result, "").to_string();
    result = RE_MULTI_NEWLINE.replace_all(&result, "\n\n").to_string();

    result = decode_html_entities(&result);

    // Post-entity-decode: entity decoding can create new HTML tags
    result = RE_CATCHALL_TAG.replace_all(&result, "").to_string();

    // Restore protected inline code spans
    for (i, span) in inline_code_spans.iter().enumerate() {
        let placeholder = format!("\x00INLINE_CODE_{}\x00", i);
        result = result.replace(&placeholder, span);
    }

    // Restore protected code fence blocks
    for (i, block) in code_blocks.iter().enumerate() {
        let placeholder = format!("\x00CODE_FENCE_{}\x00", i);
        result = result.replace(&placeholder, block);
    }

    result
}

/// Collapse newlines inside markdown link text to spaces.
/// This prevents broken link syntax and cleans up messy multiline links
/// like [\n\nRuby\n\n]() → [Ruby]() which are much cleaner for LLMs.
fn escape_multiline_links(markdown: &str) -> String {
    let mut result = String::with_capacity(markdown.len());
    let mut in_link_text = false;
    let mut bracket_depth: i32 = 0;

    for ch in markdown.chars() {
        match ch {
            '[' => {
                bracket_depth += 1;
                in_link_text = true;
                result.push(ch);
            }
            ']' if in_link_text => {
                bracket_depth = bracket_depth.saturating_sub(1);
                if bracket_depth == 0 {
                    in_link_text = false;
                }
                result.push(ch);
            }
            '\n' if in_link_text && bracket_depth > 0 => {
                // Collapse newline to space inside link text for cleaner output
                result.push(' ');
            }
            _ => result.push(ch),
        }
    }

    result
}

/// Remove accessibility skip links that add noise to LLM context
/// Examples: [Skip to Content](#main), [Skip to Navigation](#nav)
fn remove_accessibility_links(markdown: &str) -> String {
    static SKIP_LINKS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
        vec![
            Regex::new(r"(?mi)^\s*\[Skip to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*").unwrap(),
            Regex::new(r"(?mi)^\s*\[Jump to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*").unwrap(),
            Regex::new(r"(?mi)^\s*\[Go to (Content|Main|Navigation|Footer|Top|Bottom)\]\([^)]*\)\s*").unwrap(),
            Regex::new(r"(?mi)^\s*\[Skip (navigation|nav|to main content|to content)\]\([^)]*\)\s*").unwrap(),
            Regex::new(r"(?mi)^\s*\[Back to (Top|Main|Content)\]\([^)]*\)\s*").unwrap(),
        ]
    });
    static SCREEN_READER: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(r"(?mi)^\s*\[Screen reader only:?[^\]]*\]\([^)]*\)\s*").unwrap()
    });

    let mut result = markdown.to_string();
    let mut changed = true;
    while changed {
        changed = false;
        for regex in SKIP_LINKS.iter() {
            let new_result = regex.replace_all(&result, "").to_string();
            if new_result != result {
                changed = true;
                result = new_result;
            }
        }
    }

    SCREEN_READER.replace_all(&result, "").to_string()
}

/// Convert relative URLs in markdown to absolute URLs for portability
///
/// Handles:
/// - Images: ![alt](url)
/// - Links: [text](url)
/// - Protocol-relative URLs: //cdn.example.com/image.png
/// - Root-relative URLs: /assets/logo.png
/// - Preserves absolute URLs, data URIs, and anchor links
fn convert_urls_to_absolute(markdown: &str, base_url: &str) -> Result<String> {
    use crate::error::ScrapeError;

    let base = Url::parse(base_url)
        .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid base URL: {}", e)))?;

    static RE_IMG_URL: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
    let img_regex = &*RE_IMG_URL;
    let mut result = img_regex
        .replace_all(markdown, |caps: &regex::Captures| {
            let alt = &caps[1];
            let url = &caps[2];

            // Skip if already absolute or data URI
            if url.starts_with("http://")
                || url.starts_with("https://")
                || url.starts_with("data:")
            {
                return caps[0].to_string();
            }

            // Handle protocol-relative URLs
            if url.starts_with("//") {
                return format!("![{}](https:{})", alt, url);
            }

            // Convert to absolute
            match base.join(url) {
                Ok(absolute) => format!("![{}]({})", alt, absolute),
                Err(_) => caps[0].to_string(), // Keep original if conversion fails
            }
        })
        .to_string();

    static RE_LINK_URL: LazyLock<Regex> =
        LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
    let link_regex = &*RE_LINK_URL;
    result = link_regex
        .replace_all(&result, |caps: &regex::Captures| {
            let text = &caps[1];
            let url = &caps[2];

            // Skip if already absolute or anchor
            if url.starts_with("http://") || url.starts_with("https://") || url.starts_with("#") {
                return caps[0].to_string();
            }

            // Handle protocol-relative URLs
            if url.starts_with("//") {
                return format!("[{}](https:{})", text, url);
            }

            // Convert to absolute
            match base.join(url) {
                Ok(absolute) => format!("[{}]({})", text, absolute),
                Err(_) => caps[0].to_string(),
            }
        })
        .to_string();

    Ok(result)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_html_to_markdown_simple() {
        let html = "<h1>Hello</h1><p>World</p>";
        let result = html_to_markdown(html, "https://example.com", false);
        assert!(result.is_ok());
        let md = result.unwrap();
        assert!(md.contains("Hello"));
        assert!(md.contains("World"));
    }

    #[test]
    fn test_extract_main_content() {
        let html = r#"
            <html>
                <body>
                    <nav>Navigation</nav>
                    <main>
                        <h1>Main Content</h1>
                        <p>This is the main content.</p>
                    </main>
                    <footer>Footer</footer>
                </body>
            </html>
        "#;
        let result = extract_main_content_html(html);
        assert!(result.is_ok());
        let content = result.unwrap();
        assert!(content.contains("Main Content"));
        assert!(!content.contains("Navigation"));
    }

    #[test]
    fn test_clean_markdown() {
        let markdown = "# Hello\n\n\n\n\nWorld\n\n\n";
        let cleaned = clean_markdown(markdown);
        // After HTML cleaning and excessive newline removal, we get 2 newlines max
        assert_eq!(cleaned, "# Hello\n\nWorld");
    }

    #[test]
    fn test_clean_html_from_markdown_images() {
        // Test image with alt and src
        let input =
            r#"Some text <img src="https://example.com/logo.png" alt="Company Logo"> more text"#;
        let result = clean_html_from_markdown(input);
        assert!(result.contains("![Company Logo](https://example.com/logo.png)"));
        assert!(!result.contains("<img"));

        // Test image with alt first
        let input = r#"<img alt="Logo" src="logo.png">"#;
        let result = clean_html_from_markdown(input);
        assert!(result.contains("![Logo](logo.png)"));

        // Test image without alt
        let input = r#"<img src="image.jpg">"#;
        let result = clean_html_from_markdown(input);
        assert!(result.contains("![](image.jpg)"));
    }

    #[test]
    fn test_clean_html_from_markdown_images_with_attributes() {
        // Test image with extra attributes (width, height, title, etc.)
        let input = r#"<img src="/path/image.jpg" alt="Local Image" title="A title" width="300" height="200">"#;
        let result = clean_html_from_markdown(input);
        assert!(result.contains("![Local Image](/path/image.jpg)"));
        assert!(!result.contains("width"));
        assert!(!result.contains("height"));
        assert!(!result.contains("title"));
    }

    #[test]
    fn test_clean_html_from_markdown_multiple_images() {
        let input = r#"
            <h1>Gallery</h1>
            <img src="photo1.jpg" alt="Photo One">
            <img src="photo2.jpg" alt="Photo Two">
            <img src="photo3.jpg">
        "#;
        let result = clean_html_from_markdown(input);
        assert!(result.contains("![Photo One](photo1.jpg)"));
        assert!(result.contains("![Photo Two](photo2.jpg)"));
        assert!(result.contains("![](photo3.jpg)"));
    }

    #[test]
    fn test_clean_html_from_markdown_remove_tags() {
        // Test removal of div, span, etc.
        let input = r#"<div class="container"><span>Hello</span> <p>World</p></div>"#;
        let result = clean_html_from_markdown(input);
        assert!(!result.contains("<div"));
        assert!(!result.contains("<span"));
        assert!(!result.contains("<p>"));
        assert!(result.contains("Hello"));
        assert!(result.contains("World"));
    }

    #[test]
    fn test_clean_html_from_markdown_br_tags() {
        let input = "Line 1<br>Line 2<br />Line 3";
        let result = clean_html_from_markdown(input);
        assert!(!result.contains("<br"));
        assert!(result.contains("Line 1"));
        assert!(result.contains("Line 2"));
        assert!(result.contains("Line 3"));
    }

    #[test]
    fn test_clean_html_from_markdown_form_elements() {
        let input = r#"<form><input type="text" name="email"><button>Submit</button></form>"#;
        let result = clean_html_from_markdown(input);
        assert!(!result.contains("<form"));
        assert!(!result.contains("<input"));
        assert!(!result.contains("<button"));
    }

    #[test]
    fn test_clean_html_from_markdown_removes_multiline_script_blocks() {
        let input = r#"
            Before
            <script>
                var d = data[i].join(" ");
                console.log("template");
            </script>
            After
        "#;
        let result = clean_html_from_markdown(input);

        assert!(result.contains("Before"));
        assert!(result.contains("After"));
        assert!(!result.contains("var d = data"));
        assert!(!result.contains("console.log"));
        assert!(!result.contains("<script"));
    }

    #[test]
    fn test_clean_html_from_markdown_removes_multiline_noscript_and_comments() {
        let input = r#"
            Keep this
            <!--
                multi-line comment
            -->
            <noscript>
                fallback
                content
            </noscript>
            <![CDATA[
                hidden payload
            ]]>
            Done
        "#;
        let result = clean_html_from_markdown(input);

        assert!(result.contains("Keep this"));
        assert!(result.contains("Done"));
        assert!(!result.contains("multi-line comment"));
        assert!(!result.contains("fallback"));
        assert!(!result.contains("hidden payload"));
    }

    #[test]
    fn test_strip_non_content_tags_removes_scripts() {
        let html = r#"<html><body>
            <p>Real content</p>
            <script>var x = "malicious"; console.log(x);</script>
            <p>More content</p>
        </body></html>"#;
        let result = strip_non_content_tags(html);
        assert!(result.contains("Real content"));
        assert!(result.contains("More content"));
        assert!(!result.contains("malicious"));
        assert!(!result.contains("console.log"));
    }

    #[test]
    fn test_strip_non_content_tags_removes_style_and_svg() {
        let html = r#"<html><body>
            <p>Content</p>
            <style>.foo { color: red; } :root { --bg: #000; }</style>
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100"><circle cx="50" cy="50" r="40"/></svg>
            <p>End</p>
        </body></html>"#;
        let result = strip_non_content_tags(html);
        assert!(result.contains("Content"));
        assert!(result.contains("End"));
        assert!(!result.contains("color: red"));
        assert!(!result.contains("circle"));
        assert!(!result.contains("<svg"));
    }

    #[test]
    fn test_strip_non_content_tags_removes_head() {
        let html = r#"<html>
            <head><title>Page</title><meta charset="utf-8"><link rel="stylesheet" href="x.css"></head>
            <body><p>Body content</p></body>
        </html>"#;
        let result = strip_non_content_tags(html);
        assert!(result.contains("Body content"));
        assert!(!result.contains("x.css"));
    }

    #[test]
    fn test_strip_non_content_tags_removes_html_comments() {
        let html = r#"<p>Before</p>
            <!-- This is a long multi-line
                 HTML comment that should be removed -->
            <p>After</p>"#;
        let result = strip_non_content_tags(html);
        assert!(result.contains("Before"));
        assert!(result.contains("After"));
        assert!(!result.contains("long multi-line"));
    }

    #[test]
    fn test_full_pipeline_strips_inline_javascript() {
        // Simulates the eBay-style issue: massive inline JS that html2md
        // would extract as text if not stripped beforehand
        let html = r#"<html>
        <head><script>var tracking = "analytics_data"; function init(){}</script></head>
        <body>
            <script>$ssgST=new Date().getTime(); var config = {key: "val"};</script>
            <h1>Product Listing</h1>
            <p>Buy the best laptops here.</p>
            <script type="application/json">{"@context":"schema.org"}</script>
        </body></html>"#;
        let result = html_to_markdown(html, "https://example.com", false).unwrap();
        assert!(result.contains("Product Listing"));
        assert!(result.contains("Buy the best laptops"));
        assert!(!result.contains("$ssgST"), "Inline JS should be stripped before markdown conversion");
        assert!(!result.contains("analytics_data"), "Head scripts should be stripped");
        assert!(!result.contains("function init"), "Script functions should not appear in output");
    }

    #[test]
    fn test_full_pipeline_preserves_content_around_scripts() {
        let html = r#"<html><body>
            <h1>Title</h1>
            <script>alert('bad');</script>
            <p>Paragraph one.</p>
            <style>body { margin: 0; }</style>
            <p>Paragraph two.</p>
            <noscript><p>Please enable JavaScript</p></noscript>
            <p>Final paragraph.</p>
        </body></html>"#;
        let result = html_to_markdown(html, "https://example.com", false).unwrap();
        assert!(result.contains("Title"));
        assert!(result.contains("Paragraph one"));
        assert!(result.contains("Paragraph two"));
        assert!(result.contains("Final paragraph"));
        assert!(!result.contains("alert"));
        assert!(!result.contains("margin: 0"));
        assert!(!result.contains("enable JavaScript"));
    }

    #[test]
    fn test_html_to_markdown_with_images() {
        let html = r#"
            <html>
            <body>
                <h1>Test Page</h1>
                <p>Welcome to the test page.</p>
                <img src="logo.png" alt="Site Logo">
                <p>More content here.</p>
            </body>
            </html>
        "#;
        let result = html_to_markdown(html, "https://example.com", false);
        assert!(result.is_ok());
        let md = result.unwrap();

        // Should contain markdown image format with absolute URL
        assert!(md.contains("![Site Logo](https://example.com/logo.png)"));

        // Should not contain HTML image tag
        assert!(!md.contains("<img"));

        // Should not contain relative URLs
        assert!(!md.contains("![Site Logo](logo.png)"));

        // Should contain the text content
        assert!(md.contains("Test Page"));
        assert!(md.contains("Welcome"));
    }

    #[test]
    fn test_token_reduction_with_image_cleaning() {
        // Test that HTML tags are longer than markdown equivalents
        let html_version =
            r#"<img src="https://example.com/very-long-url-path/image.png" alt="Description">"#;
        let cleaned = clean_html_from_markdown(html_version);

        // The cleaned version should be markdown format
        assert!(
            cleaned.contains("![Description](https://example.com/very-long-url-path/image.png)")
        );
        assert!(!cleaned.contains("<img"));

        // Verify it's actually cleaner (no HTML attributes)
        assert!(!cleaned.contains("src="));
        assert!(!cleaned.contains("alt="));
    }

    // FIX #1: Test GitHub-specific content extraction
    #[test]
    fn test_github_content_extraction() {
        let github_html = r#"
            <html>
                <body>
                    <div class="Layout-sidebar">
                        <div class="file-navigation">File Tree Noise</div>
                        <div class="contributors-wrapper">Contributors Widget</div>
                    </div>
                    <div id="readme">
                        <h1>Project README</h1>
                        <p>This is the actual content we want.</p>
                    </div>
                    <div class="BorderGrid">
                        <div>Sidebar noise</div>
                    </div>
                </body>
            </html>
        "#;

        let result = extract_main_content_html(github_html).unwrap();

        // Should extract README content
        assert!(result.contains("Project README"));
        assert!(result.contains("actual content we want"));

        // Should NOT contain sidebar noise
        assert!(!result.contains("File Tree Noise"));
        assert!(!result.contains("Contributors Widget"));
        assert!(!result.contains("Sidebar noise"));
    }

    #[test]
    fn test_github_markdown_body_extraction() {
        let github_html = r#"
            <html>
                <body>
                    <nav>Navigation Bar</nav>
                    <div class="markdown-body">
                        <h1>Documentation</h1>
                        <p>Main documentation content here.</p>
                    </div>
                    <aside class="Layout-sidebar">Sidebar content</aside>
                </body>
            </html>
        "#;

        let result = extract_main_content_html(github_html).unwrap();

        // Should extract markdown-body content
        assert!(result.contains("Documentation"));
        assert!(result.contains("Main documentation content"));

        // Should NOT contain navigation or sidebar
        assert!(!result.contains("Navigation Bar"));
        assert!(!result.contains("Sidebar content"));
    }

    // FIX #2: Test HTML entity decoding
    #[test]
    fn test_html_entity_decoding() {
        let text_with_entities =
            "Copyright &copy; 2024 &amp; Company&trade;. Click &quot;here&quot; for more info.";
        let decoded = decode_html_entities(text_with_entities);

        assert_eq!(
            decoded,
            "Copyright © 2024 & Company™. Click \"here\" for more info."
        );
        assert!(!decoded.contains("&amp;"));
        assert!(!decoded.contains("&copy;"));
        assert!(!decoded.contains("&trade;"));
        assert!(!decoded.contains("&quot;"));
    }

    #[test]
    fn test_html_entity_in_urls() {
        // FIX #4: Anchor tags are now removed entirely, so test just entity decoding
        let html = "Text with &amp; entity &quot;quoted&quot; content";
        let cleaned = clean_html_from_markdown(html);

        // Should decode entities
        assert!(cleaned.contains("Text with & entity"));
        assert!(cleaned.contains("\"quoted\""));
        assert!(!cleaned.contains("&amp;"));
        assert!(!cleaned.contains("&quot;"));
    }

    #[test]
    fn test_html_entity_common_cases() {
        let input = "Less than &lt; greater than &gt; and nbsp&nbsp;space";
        let decoded = decode_html_entities(input);

        assert_eq!(decoded, "Less than < greater than > and nbsp space");
    }

    // FIX #3: Test invisible Unicode character removal
    #[test]
    fn test_strip_invisible_unicode() {
        // Zero-width space (U+200B)
        let text_with_zwsp = "Hello\u{200B}World";
        let cleaned = strip_invisible_unicode(text_with_zwsp);
        assert_eq!(cleaned, "HelloWorld");

        // BOM (U+FEFF)
        let text_with_bom = "\u{FEFF}Content";
        let cleaned = strip_invisible_unicode(text_with_bom);
        assert_eq!(cleaned, "Content");

        // Multiple invisible chars
        let text_with_multiple = "A\u{200B}\u{200C}\u{200D}B\u{2060}C";
        let cleaned = strip_invisible_unicode(text_with_multiple);
        assert_eq!(cleaned, "ABC");
    }

    #[test]
    fn test_invisible_unicode_in_anchor_links() {
        // Simulates the broken anchor link case: [\n\n](#heading)
        let markdown = "[\u{200B}\u{200B}\n\n](#heading)";
        let cleaned = clean_markdown(markdown);

        // Should remove zero-width space and excessive newlines
        assert!(!cleaned.contains('\u{200B}'));
        assert!(!cleaned.contains("\n\n\n"));
    }

    #[test]
    fn test_full_pipeline_with_all_fixes() {
        // Test all 3 fixes together in the full pipeline
        let html = r#"
            <html>
                <body>
                    <div class="Layout-sidebar">Sidebar noise</div>
                    <div id="readme">
                        <h1>Test &amp; Demo</h1>
                        <p>Content with entities&nbsp;here &quot;quoted&quot;.</p>
                        <a href="page?a=1&amp;b=2">Link</a>
                        <p>Invisible\u{200B}chars\u{200C}removed</p>
                    </div>
                </body>
            </html>
        "#;

        let result = html_to_markdown(html, "https://example.com", true).unwrap();

        // FIX #1: Should extract README, not sidebar
        assert!(result.contains("Test & Demo"));
        assert!(!result.contains("Sidebar noise"));

        // FIX #2: Should decode entities
        assert!(result.contains("&"));
        assert!(result.contains("\"quoted\""));
        assert!(result.contains("a=1&b=2"));
        assert!(!result.contains("&amp;"));
        assert!(!result.contains("&quot;"));
        assert!(!result.contains("&nbsp;"));

        // FIX #3: Should remove invisible unicode
        assert!(!result.contains('\u{200B}'));
        assert!(!result.contains('\u{200C}'));
    }

    #[test]
    fn test_complex_image_tag_with_attributes_before_src() {
        // FIX: Test for bug where images with width/height before src weren't converted
        let input = r#"<img width="50" height="50" src="https://example.com/logo.png" class="thumbnail" alt="Logo" decoding="async" />"#;
        let result = clean_html_from_markdown(input);

        // Should convert to markdown format
        assert!(result.contains("![Logo](https://example.com/logo.png)"));
        assert!(!result.contains("<img"));
        assert!(!result.contains("width="));
        assert!(!result.contains("class="));
    }

    #[test]
    fn test_doctype_and_document_declarations() {
        // FIX: Remove DOCTYPE, XML declarations outside code fences.
        // Code fence content should be preserved intact.
        let input = r#"
Example API response:
```json
{
  "html": "<!DOCTYPE html><body class=\"main\">content</body>",
  "data": "<![CDATA[some data]]>"
}
```

Also test standalone: <!DOCTYPE html> and <?xml version="1.0"?>
        "#;
        let result = clean_html_from_markdown(input);

        // Standalone HTML outside code fences should be removed
        assert!(!result.contains("Also test standalone: <!DOCTYPE html>"), "Standalone DOCTYPE should be removed");
        assert!(!result.contains("<?xml"), "Standalone XML declaration should be removed");

        // Code fence content should be PRESERVED (not stripped)
        assert!(result.contains("<!DOCTYPE html>"), "DOCTYPE inside code fence should be preserved");
        assert!(result.contains("<![CDATA["), "CDATA inside code fence should be preserved");

        // Should preserve the actual content
        assert!(result.contains("Example API response"));
    }

    #[test]
    fn test_picture_and_svg_elements() {
        // FIX: Remove picture, source, and SVG elements
        let input = r#"
        <picture>
            <source srcset="image.webp" type="image/webp">
            <img src="image.png" alt="Test">
        </picture>
        <svg><path d="M10 10"/><circle cx="5" cy="5" r="3"/></svg>
        "#;
        let result = clean_html_from_markdown(input);

        // Should remove all tags but preserve alt text in markdown format
        assert!(!result.contains("<picture"));
        assert!(!result.contains("<source"));
        assert!(!result.contains("<svg"));
        assert!(!result.contains("<path"));
        assert!(!result.contains("<circle"));
        assert!(result.contains("![Test](image.png)"));
    }

    #[test]
    fn test_multiple_complex_images() {
        // Test multiple images with various attribute orders
        let input = r#"
            <img width="100" src="image1.jpg" alt="First">
            <img alt="Second" height="50" src="image2.png" class="thumb">
            <img src="image3.gif">
        "#;
        let result = clean_html_from_markdown(input);

        assert!(result.contains("![First](image1.jpg)"));
        assert!(result.contains("![Second](image2.png)"));
        assert!(result.contains("![](image3.gif)"));
        assert!(!result.contains("<img"));
    }

    #[test]
    fn test_apple_footnote_cleaning() {
        // FIX #4: Test Apple.com-style footnotes with <sup> and <a> tags
        let html = "iPhone 17<sup class=\"footnote\"><a aria-label=\"footnote 1\" href=\"#footnote-1\">1</a></sup> features";
        let result = clean_html_from_markdown(html);

        // Should remove all footnote tags
        assert!(!result.contains("<sup"));
        assert!(!result.contains("<a"));
        assert!(!result.contains("</a>"));
        assert!(!result.contains("</sup>"));

        // Should preserve main text content
        assert!(result.contains("iPhone 17"));
        assert!(result.contains("features"));
    }

    #[test]
    fn test_semantic_html_tag_conversion() {
        // Test conversion of inline formatting tags to markdown equivalents
        let html = r#"<strong>Bold</strong> <em>italic</em> <mark>highlight</mark> <code>code</code> text"#;
        let result = clean_html_from_markdown(html);

        assert!(!result.contains("<strong"));
        assert!(!result.contains("<em"));
        assert!(!result.contains("<mark"));
        assert!(!result.contains("<code"));
        assert!(result.contains("**Bold**"), "Expected **Bold**, got: {}", result);
        assert!(result.contains("_italic_"), "Expected _italic_, got: {}", result);
        assert!(result.contains("highlight"));
        assert!(result.contains("`code`"), "Expected `code`, got: {}", result);
    }

    // FIX #6: Test missing structural HTML tag removal
    #[test]
    fn test_heading_tag_removal() {
        let html = "<h1>Title</h1><h2>Subtitle</h2><h3>Section</h3><h4>Subsection</h4><h5>Minor</h5><h6>Smallest</h6>";
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<h1"));
        assert!(!result.contains("<h2"));
        assert!(!result.contains("<h3"));
        assert!(!result.contains("<h4"));
        assert!(!result.contains("<h5"));
        assert!(!result.contains("<h6"));
        assert!(result.contains("Title"));
        assert!(result.contains("Subtitle"));
        assert!(result.contains("Section"));
    }

    #[test]
    fn test_list_tag_removal() {
        let html = "<ul><li>Item 1</li><li>Item 2</li></ul><ol><li>First</li><li>Second</li></ol>";
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<ul"));
        assert!(!result.contains("<ol"));
        assert!(!result.contains("<li"));
        assert!(result.contains("Item 1"));
        assert!(result.contains("Item 2"));
        assert!(result.contains("First"));
        assert!(result.contains("Second"));
    }

    #[test]
    fn test_table_tag_removal() {
        let html = "<table><thead><tr><th>Header 1</th><th>Header 2</th></tr></thead><tbody><tr><td>Cell 1</td><td>Cell 2</td></tr></tbody></table>";
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<table"));
        assert!(!result.contains("<thead"));
        assert!(!result.contains("<tbody"));
        assert!(!result.contains("<tr"));
        assert!(!result.contains("<th"));
        assert!(!result.contains("<td"));
        assert!(result.contains("Header 1"));
        assert!(result.contains("Header 2"));
        assert!(result.contains("Cell 1"));
        assert!(result.contains("Cell 2"));
    }

    #[test]
    fn test_metadata_tag_removal() {
        let html = r#"<head><meta charset="utf-8"><link rel="stylesheet" href="style.css"><title>Page Title</title></head><body>Content</body>"#;
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<head"));
        assert!(!result.contains("<meta"));
        assert!(!result.contains("<link"));
        assert!(!result.contains("<title"));
        assert!(!result.contains("<body"));
        assert!(!result.contains("<html"));
        assert!(result.contains("Content"));
    }

    #[test]
    fn test_semantic_block_tag_removal() {
        let html = r#"<blockquote>Quote</blockquote><pre>Code block</pre><hr>After line"#;
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<blockquote"));
        assert!(!result.contains("<pre"));
        assert!(!result.contains("<hr"));
        assert!(result.contains("Quote"));
        assert!(result.contains("Code block"));
        assert!(result.contains("After line"));
    }

    #[test]
    fn test_definition_list_tag_removal() {
        let html =
            "<dl><dt>Term 1</dt><dd>Definition 1</dd><dt>Term 2</dt><dd>Definition 2</dd></dl>";
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<dl"));
        assert!(!result.contains("<dt"));
        assert!(!result.contains("<dd"));
        assert!(result.contains("Term 1"));
        assert!(result.contains("Definition 1"));
        assert!(result.contains("Term 2"));
    }

    #[test]
    fn test_media_tag_removal() {
        let html = r#"<video src="video.mp4"></video><audio src="audio.mp3"></audio><canvas></canvas><svg><path d="M0,0"/></svg>"#;
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<video"));
        assert!(!result.contains("<audio"));
        assert!(!result.contains("<canvas"));
        assert!(!result.contains("<svg"));
        assert!(!result.contains("<path"));
    }

    #[test]
    fn test_container_tag_removal() {
        let html = r#"<figure><figcaption>Caption</figcaption><img src="img.jpg"></figure><details><summary>Summary</summary>Content</details>"#;
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<figure"));
        assert!(!result.contains("<figcaption"));
        assert!(!result.contains("<details"));
        assert!(!result.contains("<summary"));
        assert!(result.contains("Caption"));
        assert!(result.contains("Summary"));
        assert!(result.contains("Content"));
    }

    #[test]
    fn test_html_comment_removal() {
        let html = r#"<!-- This is a comment -->Content<!-- Another comment -->"#;
        let result = clean_html_from_markdown(html);
        assert!(!result.contains("<!--"));
        assert!(!result.contains("-->"));
        assert!(result.contains("Content"));
        assert!(!result.contains("This is a comment"));
        assert!(!result.contains("Another comment"));
    }

    #[test]
    fn test_comprehensive_tag_cleanup() {
        // Test multiple categories of tags together
        let html = r#"
            <html>
                <head><title>Test</title><meta charset="utf-8"></head>
                <body>
                    <!-- Comment -->
                    <h1>Heading</h1>
                    <ul><li>List item</li></ul>
                    <table><tr><td>Table cell</td></tr></table>
                    <video src="v.mp4"></video>
                    <figure><figcaption>Fig</figcaption></figure>
                </body>
            </html>
        "#;
        let result = clean_html_from_markdown(html);

        // Should not contain any HTML tags or comments
        assert!(!result.contains("<html"));
        assert!(!result.contains("<head"));
        assert!(!result.contains("<title"));
        assert!(!result.contains("<meta"));
        assert!(!result.contains("<body"));
        assert!(!result.contains("<h1"));
        assert!(!result.contains("<ul"));
        assert!(!result.contains("<li"));
        assert!(!result.contains("<table"));
        assert!(!result.contains("<tr"));
        assert!(!result.contains("<td"));
        assert!(!result.contains("<video"));
        assert!(!result.contains("<figure"));
        assert!(!result.contains("<figcaption"));
        assert!(!result.contains("<!--"));

        // Should preserve content
        assert!(result.contains("Heading"));
        assert!(result.contains("List item"));
        assert!(result.contains("Table cell"));
        assert!(result.contains("Fig"));
    }

    #[test]
    fn test_github_token_reduction() {
        // Simulate GitHub page with lots of noise vs clean README
        let github_with_noise = r#"
            <html>
                <body>
                    <div class="file-navigation">
                        <div>src/</div><div>lib/</div><div>tests/</div><div>docs/</div>
                        <div>Very long file tree that goes on and on...</div>
                    </div>
                    <div class="Layout-sidebar">
                        <div class="contributors-wrapper">
                            <img src="avatar1.png"><img src="avatar2.png">
                            <div>Contributor 1</div><div>Contributor 2</div>
                        </div>
                    </div>
                    <div id="readme">
                        <h1>Project</h1>
                        <p>Short README content.</p>
                    </div>
                </body>
            </html>
        "#;

        let extracted = extract_main_content_html(github_with_noise).unwrap();

        // Extracted content should be much smaller (only README)
        assert!(extracted.len() < github_with_noise.len() / 2);

        // Should contain README
        assert!(extracted.contains("Project"));
        assert!(extracted.contains("Short README"));

        // Should NOT contain file tree or contributors
        assert!(!extracted.contains("file-navigation"));
        assert!(!extracted.contains("contributors-wrapper"));
        assert!(!extracted.contains("Contributor 1"));
    }

    // FIX #5: Test layout table stripping (Hacker News mega-cell bloat)
    #[test]
    fn test_strip_layout_tables_hacker_news_pattern() {
        // Simulate HN's nested table layout structure
        let hn_html = r#"
            <table border="0" cellpadding="0" cellspacing="0">
                <tr>
                    <td>
                        <table border="0">
                            <tr><td>Story 1</td></tr>
                            <tr><td>Story 2</td></tr>
                        </table>
                    </td>
                </tr>
            </table>
        "#;

        let result = strip_layout_tables(hn_html);

        // Should not contain <table> tags anymore
        assert!(!result.contains("<table"));
        assert!(!result.contains("cellpadding"));

        // Should still contain the content
        assert!(result.contains("Story 1"));
        assert!(result.contains("Story 2"));
    }

    #[test]
    fn test_strip_layout_tables_preserves_data_tables() {
        // Data tables with <th> headers should be preserved
        let data_table_html = r#"
            <table>
                <tr><th>Name</th><th>Value</th></tr>
                <tr><td>Item 1</td><td>100</td></tr>
                <tr><td>Item 2</td><td>200</td></tr>
            </table>
        "#;

        let result = strip_layout_tables(data_table_html);

        // Should still contain <table> tags (data table preserved)
        assert!(result.contains("<table"));
        assert!(result.contains("<th>"));

        // Content should be intact
        assert!(result.contains("Name"));
        assert!(result.contains("Value"));
        assert!(result.contains("Item 1"));
    }

    #[test]
    fn test_layout_table_with_cellpadding_stripped() {
        let layout_html =
            r#"<table cellpadding="5" cellspacing="0"><tr><td>Content</td></tr></table>"#;
        let result = strip_layout_tables(layout_html);

        assert!(!result.contains("<table"));
        assert!(result.contains("Content"));
    }

    #[test]
    fn test_simple_table_without_headers_stripped() {
        // Table without headers and with border="0" is layout table
        let layout_html =
            r#"<table border="0"><tr><td>Nav Item 1</td><td>Nav Item 2</td></tr></table>"#;
        let result = strip_layout_tables(layout_html);

        assert!(!result.contains("<table"));
        assert!(result.contains("Nav Item 1"));
        assert!(result.contains("Nav Item 2"));
    }

    #[test]
    fn test_hacker_news_markdown_bloat_fix() {
        // Test the full pipeline with HN-style nested tables
        let hn_html = r#"
            <html>
                <body>
                    <table border="0" cellpadding="0" cellspacing="0" width="85%">
                        <tr>
                            <td>
                                <table border="0">
                                    <tr><td class="title">Article Title 1</td></tr>
                                    <tr><td class="subtext">100 points by user1</td></tr>
                                </table>
                            </td>
                        </tr>
                        <tr>
                            <td>
                                <table border="0">
                                    <tr><td class="title">Article Title 2</td></tr>
                                    <tr><td class="subtext">200 points by user2</td></tr>
                                </table>
                            </td>
                        </tr>
                    </table>
                </body>
            </html>
        "#;

        let result = html_to_markdown(hn_html, "https://news.ycombinator.com", false).unwrap();

        // Result should be reasonably sized (not 4MB!)
        assert!(
            result.len() < 1000,
            "Markdown output too large: {} bytes",
            result.len()
        );

        // Should contain the actual content
        assert!(result.contains("Article Title 1"));
        assert!(result.contains("Article Title 2"));

        // Should NOT contain table markdown syntax (which would create mega-cells)
        // Count pipe characters - if it's a huge table, there will be many
        let pipe_count = result.chars().filter(|&c| c == '|').count();
        assert!(pipe_count < 10, "Too many table delimiters: {}", pipe_count);
    }

    // URL conversion tests
    #[test]
    fn test_convert_relative_image_to_absolute() {
        let md = "![Logo](../images/logo.png)";
        let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();

        assert_eq!(result, "![Logo](https://example.com/images/logo.png)");
    }

    #[test]
    fn test_convert_relative_link_to_absolute() {
        let md = "[Home](../index.html)";
        let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();

        assert_eq!(result, "[Home](https://example.com/index.html)");
    }

    #[test]
    fn test_keep_absolute_urls_unchanged() {
        let md = "![Logo](https://cdn.example.com/logo.png)";
        let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();

        assert_eq!(result, "![Logo](https://cdn.example.com/logo.png)");
    }

    #[test]
    fn test_keep_data_uris_unchanged() {
        let md = "![Inline](data:image/png;base64,ABC123)";
        let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();

        assert_eq!(result, "![Inline](data:image/png;base64,ABC123)");
    }

    #[test]
    fn test_keep_anchors_unchanged() {
        let md = "[Section](#heading)";
        let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();

        assert_eq!(result, "[Section](#heading)");
    }

    #[test]
    fn test_complex_relative_paths() {
        let md = "![](../../assets/img.jpg)";
        let result =
            convert_urls_to_absolute(md, "https://example.com/a/b/c/page.html").unwrap();

        assert_eq!(result, "![](https://example.com/a/assets/img.jpg)");
    }

    #[test]
    fn test_root_relative_urls() {
        let md = "![Logo](/assets/logo.png)";
        let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();

        assert_eq!(result, "![Logo](https://example.com/assets/logo.png)");
    }

    #[test]
    fn test_protocol_relative_urls() {
        let md = "![CDN](//cdn.example.com/image.png)";
        let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();

        assert_eq!(result, "![CDN](https://cdn.example.com/image.png)");
    }

    #[test]
    fn test_urls_with_query_params() {
        let md = "[API](../api/v1?foo=bar&baz=qux)";
        let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();

        assert_eq!(
            result,
            "[API](https://example.com/api/v1?foo=bar&baz=qux)"
        );
    }

    #[test]
    fn test_urls_with_fragments() {
        let md = "[Section](../page.html#section)";
        let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();

        assert_eq!(result, "[Section](https://example.com/page.html#section)");
    }

    #[test]
    fn test_multiple_images_and_links() {
        let md = r#"
![Logo](./logo.png)
[Home](../index.html)
![Banner](/assets/banner.jpg)
[Absolute](https://other.com/page)
"#;
        let result = convert_urls_to_absolute(md, "https://example.com/docs/page.html").unwrap();

        assert!(result.contains("![Logo](https://example.com/docs/logo.png)"));
        assert!(result.contains("[Home](https://example.com/index.html)"));
        assert!(result.contains("![Banner](https://example.com/assets/banner.jpg)"));
        assert!(result.contains("[Absolute](https://other.com/page)"));
    }

    #[test]
    fn test_full_pipeline_with_url_conversion() {
        let html = r#"
            <html>
                <body>
                    <img src="../images/logo.png" alt="Logo">
                    <a href="../about.html">About</a>
                    <img src="https://cdn.example.com/banner.jpg" alt="Banner">
                </body>
            </html>
        "#;

        let result = html_to_markdown(html, "https://example.com/docs/page.html", false).unwrap();

        // Relative image converted to absolute
        assert!(result.contains("![Logo](https://example.com/images/logo.png)"));

        // Absolute image unchanged (but the <a> tags are removed by clean_html_from_markdown)
        assert!(result.contains("![Banner](https://cdn.example.com/banner.jpg)"));

        // Should not have relative URLs
        assert!(!result.contains("../images/logo.png"));
    }

    #[test]
    fn test_edge_case_empty_alt_text() {
        let md = "![](relative/path.png)";
        let result = convert_urls_to_absolute(md, "https://example.com/page.html").unwrap();

        assert_eq!(result, "![](https://example.com/relative/path.png)");
    }

    #[test]
    fn test_edge_case_special_chars_in_url() {
        let md = "[Link](path%20with%20spaces.html)";
        let result = convert_urls_to_absolute(md, "https://example.com/").unwrap();

        assert_eq!(result, "[Link](https://example.com/path%20with%20spaces.html)");
    }

    // NEW: Tests for escape_multiline_links
    #[test]
    fn test_escape_multiline_links_simple() {
        let input = "[This is a\nlink](#heading)";
        let result = escape_multiline_links(input);
        assert_eq!(result, "[This is a link](#heading)");
    }

    #[test]
    fn test_escape_multiline_links_multiple() {
        let input = "[Line 1\nLine 2\nLine 3](url)";
        let result = escape_multiline_links(input);
        assert_eq!(result, "[Line 1 Line 2 Line 3](url)");
    }

    #[test]
    fn test_escape_multiline_links_nested_brackets() {
        let input = "[[inner\nlink]](url)";
        let result = escape_multiline_links(input);
        assert_eq!(result, "[[inner link]](url)");
    }

    #[test]
    fn test_escape_multiline_links_no_newlines() {
        let input = "[Normal link](url)";
        let result = escape_multiline_links(input);
        assert_eq!(result, "[Normal link](url)");
    }

    #[test]
    fn test_escape_multiline_links_outside_links() {
        let input = "Text before\n[link](url)\nText after";
        let result = escape_multiline_links(input);
        assert_eq!(result, "Text before\n[link](url)\nText after");
    }

    #[test]
    fn test_escape_multiline_links_multiple_links() {
        let input = "[First\nlink](url1) and [second\nlink](url2)";
        let result = escape_multiline_links(input);
        assert_eq!(result, "[First link](url1) and [second link](url2)");
    }

    #[test]
    fn test_escape_multiline_links_empty_link() {
        let input = "[](url)";
        let result = escape_multiline_links(input);
        assert_eq!(result, "[](url)");
    }

    #[test]
    fn test_escape_multiline_links_image_syntax() {
        // Image links should also have newlines collapsed
        let input = "![Alt text\nwith newline](image.jpg)";
        let result = escape_multiline_links(input);
        assert_eq!(result, "![Alt text with newline](image.jpg)");
    }

    #[test]
    fn test_escape_multiline_links_unmatched_bracket() {
        // Unmatched brackets should be handled gracefully
        let input = "[unclosed link\nwithout closing bracket";
        let result = escape_multiline_links(input);
        // The newline should be collapsed because we're inside bracket depth > 0
        assert_eq!(result, "[unclosed link without closing bracket");
    }

    // NEW: Tests for remove_accessibility_links
    #[test]
    fn test_remove_skip_to_content() {
        let input = "[Skip to Content](#main)\n\n# Welcome\n\nContent here.";
        let expected = "# Welcome\n\nContent here.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_remove_skip_to_main() {
        let input = "[Skip to Main](#main-content)\n\nActual content.";
        let expected = "Actual content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_remove_skip_to_navigation() {
        let input = "[Skip to Navigation](#nav)\n\nPage content.";
        let expected = "Page content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_remove_jump_to_content() {
        let input = "[Jump to Content](#content)\n\nMain text.";
        let expected = "Main text.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_remove_multiple_skip_links() {
        let input = "[Skip to Content](#main)\n[Skip to Navigation](#nav)\n\nContent.";
        let expected = "Content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_preserve_regular_links() {
        let input = "[Regular Link](https://example.com)\n\nContent.";
        let expected = "[Regular Link](https://example.com)\n\nContent.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_case_insensitive_skip_links() {
        let input = "[SKIP TO CONTENT](#main)\n\nContent.";
        let expected = "Content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_screen_reader_text() {
        let input = "[Screen reader only: Navigation menu](#nav)\n\nContent.";
        let expected = "Content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_no_removal_in_middle_of_text() {
        let input = "Some text [Skip to Content](#main) more text.";
        // Should NOT remove if not at start of line
        assert!(remove_accessibility_links(input).contains("Skip to Content"));
    }

    #[test]
    fn test_back_to_top_removal() {
        let input = "Content here\n[Back to Top](#top)";
        let expected = "Content here\n";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_go_to_content_removal() {
        let input = "[Go to Main](#main)\n\nPage content.";
        let expected = "Page content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_skip_navigation_lowercase() {
        let input = "[Skip navigation](#nav)\n\nContent.";
        let expected = "Content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_multiple_accessibility_variants() {
        let input = "[Skip to Content](#main)\n[Jump to Navigation](#nav)\n[Back to Top](#top)\n\nActual content.";
        let expected = "Actual content.";
        assert_eq!(remove_accessibility_links(input), expected);
    }

    #[test]
    fn test_debug_regex_pattern() {
        let input = "[Skip to Main](#main)\n\nArticle Title\n==========";
        let result = remove_accessibility_links(input);

        // Should remove the skip link
        assert!(!result.contains("Skip to Main"));
        assert_eq!(result, "Article Title\n==========");
    }

    // Integration test for all 3 markdown post-processing improvements
    #[test]
    fn test_full_pipeline_with_all_improvements() {
        let html = r##"
            <html>
                <body>
                    <nav><a href="#main">Skip to content</a></nav>
                    <main>
                        <h1>Test Page</h1>
                        <p>Regular content here.</p>
                        <img srcset="small.jpg 300w, medium.jpg 600w, large.jpg 1200w" alt="Test Image">
                        <p>More content with <a href="#section">multi-line
link text</a>.</p>
                    </main>
                    <footer><a href="#top">Back to Top</a></footer>
                </body>
            </html>
        "##;

        let result = html_to_markdown(html, "https://example.com", false).unwrap();

        // Should resolve srcset to largest image
        assert!(result.contains("![Test Image](https://example.com/large.jpg)"));

        // Should NOT contain accessibility links
        assert!(!result.contains("Skip to content"));
        assert!(!result.contains("Back to Top"));

        // Should NOT contain small/medium images
        assert!(!result.contains("small.jpg"));
        assert!(!result.contains("medium.jpg"));

        // Should contain regular content
        assert!(result.contains("Test Page"));
        assert!(result.contains("Regular content"));
    }

    #[test]
    fn test_srcset_resolution_integration() {
        let html = r#"
            <img srcset="img-400.jpg 400w, img-800.jpg 800w, img-1600.jpg 1600w" alt="Responsive">
            <img srcset="icon@1x.png 1x, icon@2x.png 2x, icon@3x.png 3x" alt="Retina">
            <img src="regular.jpg" alt="Normal">
        "#;

        let result = html_to_markdown(html, "https://cdn.example.com", false).unwrap();

        // Debug: print the actual result
        eprintln!("Result markdown:\n{}", result);

        // Should pick largest from first srcset
        assert!(result.contains("![Responsive](https://cdn.example.com/img-1600.jpg)") ||
                result.contains("img-1600.jpg"), "Expected to find img-1600.jpg in output");

        // Should pick largest retina version
        assert!(result.contains("![Retina](https://cdn.example.com/icon@3x.png)") ||
                result.contains("icon@3x.png"), "Expected to find icon@3x.png in output");

        // Should keep regular image unchanged
        assert!(result.contains("![Normal](https://cdn.example.com/regular.jpg)") ||
                result.contains("regular.jpg"), "Expected to find regular.jpg in output");
    }

    #[test]
    fn test_multiline_link_escaping_integration() {
        let html = r#"
            <a href="https://example.com">This is a
            multi-line
            link</a>
        "#;

        let result = html_to_markdown(html, "https://example.com", false).unwrap();

        // Debug: print the actual result
        eprintln!("Multiline link result:\n{}", result);

        // The html2md converter might collapse whitespace, so the newlines might not be preserved
        // Instead, just check that the link is valid
        assert!(result.contains("["));
        assert!(result.contains("]"));
        assert!(result.contains("(https://example.com)"));
    }

    #[test]
    fn test_accessibility_link_removal_integration() {
        let html = r##"
            <nav>
                <a href="#content">Skip to Content</a>
                <a href="#main">Skip to Main</a>
            </nav>
            <main id="content">
                <h1>Article Title</h1>
                <p>Article content.</p>
                <a href="https://example.com">Normal Link</a>
            </main>
            <footer>
                <a href="#top">Back to Top</a>
            </footer>
        "##;

        let result = html_to_markdown(html, "https://example.com", false).unwrap();

        // Should remove all accessibility links
        assert!(!result.contains("Skip to Content"));
        assert!(!result.contains("Skip to Main"));
        assert!(!result.contains("Back to Top"));

        // Should keep normal content
        assert!(result.contains("Article Title"));
        assert!(result.contains("Article content"));
    }

    // NEW: Tests for setext-to-ATX heading conversion
    #[test]
    fn test_setext_h1_to_atx() {
        let md = "Title\n=====\n\nContent";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("# Title"), "Expected ATX h1, got: {}", cleaned);
        assert!(!cleaned.contains("====="));
    }

    #[test]
    fn test_setext_h2_to_atx() {
        let md = "Subtitle\n--------\n\nContent";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("## Subtitle"), "Expected ATX h2, got: {}", cleaned);
        assert!(!cleaned.contains("--------"));
    }

    #[test]
    fn test_setext_preserves_existing_atx() {
        let md = "# Already ATX\n\nContent";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("# Already ATX"));
    }

    #[test]
    fn test_setext_multiple_headings() {
        let md = "First\n=====\n\nSecond\n------\n\nThird\n=====";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("# First"));
        assert!(cleaned.contains("## Second"));
        assert!(cleaned.contains("# Third"));
    }

    // NEW: Tests for base64 image replacement
    #[test]
    fn test_base64_image_replacement() {
        let md = "![Logo](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==)";
        let cleaned = clean_markdown(md);
        assert_eq!(cleaned, "![Logo](data:image-removed)");
    }

    #[test]
    fn test_base64_image_preserves_normal_images() {
        let md = "![Photo](https://example.com/photo.jpg)";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("![Photo](https://example.com/photo.jpg)"));
    }

    #[test]
    fn test_base64_image_mixed() {
        let md = "![Normal](https://example.com/img.png) and ![Inline](data:image/gif;base64,R0lGODlhAQABAIAAAP///wAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw==)";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("![Normal](https://example.com/img.png)"));
        assert!(cleaned.contains("![Inline](data:image-removed)"));
        assert!(!cleaned.contains("base64"));
    }

    // NEW: Tests for JSON content detection
    #[test]
    fn test_json_object_detection() {
        let json = r#"{"name": "test", "value": 42}"#;
        let result = html_to_markdown(json, "https://example.com", false).unwrap();
        assert!(result.contains("# JSON Response"), "Expected heading, got: {}", result);
        assert!(result.contains("```json\n"));
        assert!(result.ends_with("\n```"));
        assert!(result.contains(r#""name": "test""#));
    }

    #[test]
    fn test_json_array_detection() {
        let json = r#"[{"id": 1}, {"id": 2}]"#;
        let result = html_to_markdown(json, "https://example.com", false).unwrap();
        assert!(result.contains("```json\n"));
        assert!(result.contains(r#""id": 1"#));
    }

    #[test]
    fn test_html_not_detected_as_json() {
        let html = "<html><body><p>Hello</p></body></html>";
        let result = html_to_markdown(html, "https://example.com", false).unwrap();
        assert!(!result.starts_with("```json"));
        assert!(result.contains("Hello"));
    }

    // NEW: Tests for empty-result fallback
    #[test]
    fn test_empty_result_fallback() {
        // HTML where main content selector matches but extracts too little
        let html = r#"
            <html>
                <body>
                    <main><span></span></main>
                    <div>Actual content is here with enough text to be useful for AI agents.</div>
                </body>
            </html>
        "#;
        let result = html_to_markdown(html, "https://example.com", true).unwrap();
        // Should fallback and include the div content
        assert!(result.contains("Actual content is here"));
    }

    // NEW: Tests for noise removal selectors
    #[test]
    fn test_modal_noise_selectors_present() {
        // Verify the new selectors are listed in the remove_selectors array
        // by checking that a document with these classes gets them stripped
        // when running through the full pipeline (the fallback removal path
        // uses line-by-line removal which is fragile, so we test via full pipeline)
        let html = r#"
            <html>
                <body>
                    <div class="modal">
                        <h2>Sign up now!</h2>
                        <form>
                            <input type="email" placeholder="Email">
                            <button>Subscribe</button>
                        </form>
                    </div>
                    <div class="overlay" style="position:fixed">
                        <p>Overlay content</p>
                    </div>
                    <h1>Main Page Title</h1>
                    <p>This is the main content of the page that should be preserved.</p>
                    <p>More content paragraphs here.</p>
                </body>
            </html>
        "#;
        let result = extract_main_content_html(html).unwrap();
        // The modal and overlay elements should be removed
        assert!(!result.contains("Sign up now"), "Modal content should be removed");
        assert!(!result.contains("Overlay content"), "Overlay content should be removed");
        // Main content should remain
        assert!(result.contains("Main Page Title"));
        assert!(result.contains("main content of the page"));
    }

    // NEW: Test escaped HTML tag removal
    #[test]
    fn test_escaped_html_tag_removal() {
        let md = r#"Content \<style\> \</style\> more text \</a\> end"#;
        let cleaned = clean_markdown(md);
        assert!(!cleaned.contains(r"\<style\>"), "Escaped style tag should be removed");
        assert!(!cleaned.contains(r"\</a\>"), "Escaped closing tag should be removed");
        assert!(cleaned.contains("Content"));
        assert!(cleaned.contains("more text"));
        assert!(cleaned.contains("end"));
    }

    #[test]
    fn test_escaped_html_comment_removal() {
        let md = r#"Before \<!-- comment --\> After"#;
        let cleaned = clean_markdown(md);
        assert!(!cleaned.contains("comment"), "Escaped comment should be removed");
        assert!(cleaned.contains("Before"));
        assert!(cleaned.contains("After"));
    }

    #[test]
    fn test_escaped_tags_preserve_normal_content() {
        // Normal < usage in content should not be affected
        let md = "The value is a < b and 5 > 3";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("a < b"), "Normal < should be preserved: {}", cleaned);
    }

    // NEW: Test code fence protection from HTML stripping
    #[test]
    fn test_code_fence_protection() {
        let md = "Some text\n\n```html\n<div class=\"container\">\n  <p>Hello</p>\n</div>\n```\n\nMore text";
        let result = clean_html_from_markdown(md);
        // HTML inside code fences should be preserved
        assert!(result.contains("<div class=\"container\">"), "HTML in code fence should be preserved: {}", result);
        assert!(result.contains("<p>Hello</p>"), "HTML tags in code fence should be preserved: {}", result);
        // Text outside code fences should still be cleaned
        assert!(result.contains("Some text"));
        assert!(result.contains("More text"));
    }

    #[test]
    fn test_code_fence_protection_multiple_blocks() {
        let md = "Text\n\n```html\n<strong>bold</strong>\n```\n\nMiddle <div>removed</div>\n\n```js\nconst x = '<span>test</span>';\n```\n\nEnd";
        let result = clean_html_from_markdown(md);
        // First code fence: HTML preserved
        assert!(result.contains("<strong>bold</strong>"), "HTML in first code fence preserved");
        // Outside code fence: HTML stripped
        assert!(!result.contains("<div>removed</div>"), "HTML outside code fence should be stripped");
        assert!(result.contains("removed"));
        // Second code fence: HTML preserved
        assert!(result.contains("<span>test</span>"), "HTML in second code fence preserved");
    }

    // NEW: Test inline formatting conversion
    #[test]
    fn test_inline_bold_conversion() {
        let html = "<strong>important</strong> text <b>also bold</b>";
        let result = clean_html_from_markdown(html);
        assert!(result.contains("**important**"), "Expected **important**, got: {}", result);
        assert!(result.contains("**also bold**"), "Expected **also bold**, got: {}", result);
    }

    #[test]
    fn test_inline_italic_conversion() {
        let html = "<em>emphasized</em> text <i>also italic</i>";
        let result = clean_html_from_markdown(html);
        assert!(result.contains("_emphasized_"), "Expected _emphasized_, got: {}", result);
        assert!(result.contains("_also italic_"), "Expected _also italic_, got: {}", result);
    }

    #[test]
    fn test_inline_code_conversion() {
        let html = "Use <code>console.log()</code> for debugging";
        let result = clean_html_from_markdown(html);
        assert!(result.contains("`console.log()`"), "Expected `console.log()`, got: {}", result);
    }

    #[test]
    fn test_pre_code_language_detection() {
        let html = r#"<pre><code class="language-rust">fn main() {}</code></pre>"#;
        let result = clean_html_from_markdown(html);
        assert!(result.contains("```rust"), "Expected ```rust, got: {}", result);
        assert!(result.contains("fn main() {}"), "Expected code content");
    }

    #[test]
    fn test_pre_code_lang_prefix() {
        let html = r#"<pre><code class="lang-python">print("hello")</code></pre>"#;
        let result = clean_html_from_markdown(html);
        assert!(result.contains("```python"), "Expected ```python, got: {}", result);
    }

    #[test]
    fn test_pre_code_highlight_prefix() {
        let html = r#"<pre><code class="highlight-javascript">const x = 1;</code></pre>"#;
        let result = clean_html_from_markdown(html);
        assert!(result.contains("```javascript"), "Expected ```javascript, got: {}", result);
    }

    #[test]
    fn test_anchor_to_markdown_link() {
        let html = r#"Visit <a href="https://example.com">Example</a> for details."#;
        let result = clean_html_from_markdown(html);
        assert!(result.contains("[Example](https://example.com)"), "Expected markdown link, got: {}", result);
    }

    #[test]
    fn test_anchor_javascript_href_stripped() {
        let html = r#"<a href="javascript:void(0)">Click me</a>"#;
        let result = clean_html_from_markdown(html);
        assert!(result.contains("Click me"));
        assert!(!result.contains("javascript:"));
    }

    #[test]
    fn test_preprocess_resolves_relative_urls() {
        let html = r#"<a href="/about">About</a> <img src="/logo.png">"#;
        let result = preprocess_html_for_conversion(html, "https://example.com");
        assert!(result.contains("https://example.com/about"), "Expected absolute href, got: {}", result);
        assert!(result.contains("https://example.com/logo.png"), "Expected absolute src, got: {}", result);
    }

    #[test]
    fn test_preprocess_preserves_absolute_urls() {
        let html = r#"<a href="https://other.com/page">Link</a>"#;
        let result = preprocess_html_for_conversion(html, "https://example.com");
        assert!(result.contains("https://other.com/page"));
    }

    #[test]
    fn test_preprocess_strips_gutter_elements() {
        let html = r#"<pre><td class="gutter"><span>1</span></td><td class="code">let x = 1;</td></pre>"#;
        let result = preprocess_html_for_conversion(html, "https://example.com");
        assert!(!result.contains("gutter"), "Gutter should be stripped, got: {}", result);
        assert!(result.contains("let x = 1;"));
    }

    #[test]
    fn test_ui_noise_loading_sponsored() {
        let md = "# Products\n\nLoading...\n\nSponsored\n\nSome product here\n\nNotifications";
        let cleaned = clean_markdown(md);
        assert!(!cleaned.contains("Loading..."));
        assert!(!cleaned.contains("Sponsored"));
        assert!(!cleaned.contains("Notifications"));
        assert!(cleaned.contains("# Products"));
        assert!(cleaned.contains("Some product here"));
    }

    #[test]
    fn test_copyright_footer_removal() {
        let md = "# Page\n\nContent here\n\nCopyright © 2024 Acme Inc. All Rights Reserved.\n\nMore content";
        let cleaned = clean_markdown(md);
        assert!(!cleaned.contains("Copyright ©"));
        assert!(cleaned.contains("Content here"));
        assert!(cleaned.contains("More content"));
    }

    #[test]
    fn test_link_whitespace_normalization() {
        let md = "[   Apple   ](https://example.com)";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("[Apple](https://example.com)"), "Got: {}", cleaned);
    }

    #[test]
    fn test_link_text_deduplication() {
        let md = "[Apple Apple](https://example.com)";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("[Apple](https://example.com)"), "Got: {}", cleaned);
    }

    #[test]
    fn test_link_text_dedup_multiword() {
        // Multi-word dedup: [New York New York] → [New York]
        let md = "[New York New York](https://example.com)";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("[New York](https://example.com)"), "Got: {}", cleaned);
    }

    #[test]
    fn test_link_text_no_false_dedup() {
        // Don't dedup when halves are different
        let md = "[Apple Samsung](https://example.com)";
        let cleaned = clean_markdown(md);
        assert!(cleaned.contains("[Apple Samsung](https://example.com)"), "Got: {}", cleaned);
    }

    #[test]
    fn test_repeated_list_items_collapsed() {
        let md = "* Product info page\n\n* Product info page\n\n* Product info page\n\n* Product info page\n\nOther content";
        let cleaned = clean_markdown(md);
        // Should collapse to at most 2 instances
        let count = cleaned.matches("Product info page").count();
        assert!(count <= 2, "Expected <= 2 occurrences but got {}: {}", count, cleaned);
    }
}