index-transformer 1.0.0

//! Typestate transformer pipeline.

use std::marker::PhantomData;

use index_core::{
    DiagnosticAction, DiagnosticConfidence, DiagnosticRecord, DiagnosticSeverity, DiagnosticSource,
    DocumentQuality, DocumentQualityCategory, FailureCause, FailureDiagnostic, IndexDocument,
    IndexNode, Link, SectionRole,
};
use index_dom::{HtmlDocument, parse_html};
use index_headless::{AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot};
use index_readability::ReadablePage;

pub mod adapter;
pub mod cache;
pub mod instruction;
pub mod manifest;
pub mod state;

use adapter::{AdapterContext, AdapterRegistry};
use instruction::{ApplyMetadata, EmitLinks, EmitReadableNodes, EmitTitle, Instruction};
use state::{Empty, Extracted, Fetched, Parsed, Transformed};

pub use cache::{TransformCacheKey, TransformedDocumentCache};
pub use manifest::apply_index_manifest_hints;

/// A staged transformer.
///
/// `S` is a marker type recording which pipeline stage the value is in. It is
/// held only through `PhantomData`, so it contributes no runtime data; the
/// `impl` block for each stage exposes the steps that are valid in that stage.
/// Every stage carries the same optional slots, filled in as the pipeline
/// advances.
#[derive(Debug, Clone)]
pub struct Transformer<S> {
    // HTML text supplied through `fetched`; `None` in the `Empty` stage.
    raw_html: Option<String>,
    // DOM produced by `parse`; `None` before that step.
    parsed: Option<HtmlDocument>,
    // Readable-content extraction produced by `extract`; `None` before that step.
    extracted: Option<ReadablePage>,
    // Final document produced by `transform`; `None` before that step.
    document: Option<IndexDocument>,
    // Zero-sized marker tying this value to stage `S`.
    _state: PhantomData<S>,
}

/// Transforms HTML into an [`IndexDocument`], consulting `cache` first.
///
/// The cache key is built from `source_url` and the HTML text. On a hit the
/// cached document is returned directly. On a miss the full typestate pipeline
/// (`fetched` → `parse` → `extract` → `transform`) runs, a clone of the result
/// is stored under the key, and the result is returned.
#[must_use]
pub fn transform_html_cached(
    cache: &mut TransformedDocumentCache,
    source_url: Option<&str>,
    html: impl Into<String>,
) -> IndexDocument {
    let markup: String = html.into();
    let cache_key = TransformCacheKey::new(source_url, &markup);

    let lookup = cache.get(&cache_key);
    match lookup {
        Some(hit) => hit,
        None => {
            let fresh = Transformer::<Empty>::new()
                .fetched(markup)
                .parse()
                .extract()
                .transform()
                .into_document();
            cache.insert(cache_key, fresh.clone());
            fresh
        }
    }
}

impl Transformer<Empty> {
    /// Builds a transformer in the `Empty` stage with every slot unset.
    #[must_use]
    pub const fn new() -> Self {
        Self {
            _state: PhantomData,
            document: None,
            extracted: None,
            parsed: None,
            raw_html: None,
        }
    }

    /// Moves to the `Fetched` stage by storing HTML that was obtained elsewhere.
    ///
    /// All later-stage slots start out unset.
    #[must_use]
    pub fn fetched(self, raw_html: impl Into<String>) -> Transformer<Fetched> {
        let html: String = raw_html.into();

        Transformer::<Fetched> {
            _state: PhantomData,
            document: None,
            extracted: None,
            parsed: None,
            raw_html: Some(html),
        }
    }
}

impl Default for Transformer<Empty> {
    fn default() -> Self {
        Self::new()
    }
}

impl Transformer<Fetched> {
    /// Parses the stored HTML and moves to the `Parsed` stage.
    ///
    /// A missing HTML slot is treated as an empty string. The raw text is kept
    /// alongside the parsed DOM, so the parser receives a copy.
    #[must_use]
    pub fn parse(self) -> Transformer<Parsed> {
        let source = match self.raw_html {
            Some(html) => html,
            None => String::new(),
        };
        let tree = parse_html(source.clone());

        Transformer::<Parsed> {
            _state: PhantomData,
            document: None,
            extracted: None,
            parsed: Some(tree),
            raw_html: Some(source),
        }
    }
}

impl Transformer<Parsed> {
    /// Runs readable-content extraction over the parsed DOM and moves to the
    /// `Extracted` stage.
    ///
    /// If no DOM is present the extraction slot stays unset.
    #[must_use]
    pub fn extract(self) -> Transformer<Extracted> {
        let Self {
            raw_html, parsed, ..
        } = self;
        let extracted = parsed.as_ref().map(ReadablePage::from_html);

        Transformer::<Extracted> {
            _state: PhantomData,
            document: None,
            extracted,
            parsed,
            raw_html,
        }
    }
}

impl Transformer<Extracted> {
    /// Builds the Index document from the extracted page and moves to the
    /// `Transformed` stage.
    ///
    /// A missing extraction is replaced by an empty page titled "Untitled".
    /// The default adapter registry is consulted first; when it yields nothing,
    /// the generic transformer produces the document.
    #[must_use]
    pub fn transform(self) -> Transformer<Transformed> {
        let Self {
            raw_html,
            parsed,
            extracted,
            ..
        } = self;

        let page = match extracted {
            Some(page) => page,
            None => ReadablePage {
                title: "Untitled".to_owned(),
                paragraphs: Vec::new(),
                nodes: Vec::new(),
                links: Vec::new(),
                forms: Vec::new(),
                metadata: Default::default(),
            },
        };

        let adapted =
            AdapterRegistry::default_registry().transform(&AdapterContext { page: &page });
        let document = match adapted {
            Some(document) => document,
            None => transform_generic(&page),
        };

        Transformer::<Transformed> {
            _state: PhantomData,
            document: Some(document),
            extracted: Some(page),
            parsed,
            raw_html,
        }
    }
}

impl Transformer<Transformed> {
    /// Consumes the transformer and returns the final document.
    #[must_use]
    pub fn into_document(self) -> IndexDocument {
        self.document.unwrap_or_default()
    }
}

/// Transforms a rendered DOM/accessibility snapshot into an Index document.
///
/// Strategies are tried in this order:
///
/// 1. If an accessibility tree is present and yields confident evidence, the
///    document is built from it and DOM links are merged in.
/// 2. Otherwise, if the rendered DOM has a readable body, the adapter registry
///    is consulted, falling back to the generic transformer.
/// 3. Otherwise, the accessibility tree's flattened text (if any) becomes a
///    single paragraph with a `Fallback` quality score of 55.
/// 4. If nothing readable was found, a failure diagnostic document is returned.
///
/// When the DOM declares no canonical URL, the snapshot's final URL is injected
/// into the parsed metadata before extraction. The accessibility, text-fallback
/// and failure documents all carry the final URL as their canonical URL.
#[must_use]
pub fn transform_headless_snapshot(snapshot: &HeadlessSnapshot) -> IndexDocument {
    let mut parsed = parse_html(snapshot.dom_html.clone());
    if parsed.metadata.canonical_url.is_none() {
        parsed.metadata.canonical_url = Some(snapshot.final_url.to_string());
    }

    let page = ReadablePage::from_html(&parsed);
    if let Some(accessibility) = &snapshot.accessibility {
        if let Some(mut document) = accessibility_document(accessibility, snapshot, &page) {
            merge_dom_links(&mut document, &page);
            return document;
        }
    }

    if page.has_body() {
        return AdapterRegistry::default_registry()
            .transform(&AdapterContext { page: &page })
            .unwrap_or_else(|| transform_generic(&page));
    }

    let mut document = IndexDocument::titled("Headless snapshot");
    document.metadata.canonical_url = Some(snapshot.final_url.to_string());
    if let Some(accessibility) = &snapshot.accessibility {
        let text = accessibility.text_content();
        if !text.is_empty() {
            document.push(IndexNode::Paragraph(text));
        }
    }
    if document.is_empty() {
        let mut failure = FailureDiagnostic::new(
            "Headless snapshot unreadable",
            DiagnosticSource::Headless,
            DiagnosticConfidence::Failed,
            "headless snapshot did not contain readable content",
        )
        .with_fallback("accessibility tree text extraction")
        .with_tried("headless DOM snapshot")
        .with_tried("accessibility tree extraction")
        .with_actions([DiagnosticAction::Retry, DiagnosticAction::Capture])
        .with_command(":capture save headless-unreadable.capture")
        .with_record(DiagnosticRecord::new(
            DiagnosticSeverity::Error,
            "INDEX-HEADLESS-EMPTY",
            format!("final_url={}", snapshot.final_url),
        ))
        .into_document();
        // The diagnostic is a fresh document, so re-attach the URL; otherwise this
        // would be the only return path whose result cannot be traced to its page.
        failure.metadata.canonical_url = Some(snapshot.final_url.to_string());
        return failure;
    }
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Fallback,
        55,
        [
            "headless accessibility fallback".to_owned(),
            "DOM body was not readable".to_owned(),
        ],
    ));
    document
}

/// Builds a document from the accessibility tree, or returns `None` when the
/// tree does not provide confident evidence.
///
/// The title is the first non-blank heading found in the converted nodes, then
/// the DOM page title, then the literal "Headless snapshot". The snapshot's
/// final URL becomes the canonical URL, and the quality score comes from the
/// collected evidence.
fn accessibility_document(
    accessibility: &AccessibilitySnapshot,
    snapshot: &HeadlessSnapshot,
    page: &ReadablePage,
) -> Option<IndexDocument> {
    let mut evidence = AccessibilityEvidence::default();
    let mut converted = Vec::new();
    for root in &accessibility.nodes {
        append_accessibility_node(root, &mut converted, &mut evidence);
    }
    if !evidence.is_confident() {
        return None;
    }

    let heading_title = converted
        .iter()
        .find_map(first_heading_text)
        .filter(|candidate| !candidate.trim().is_empty());
    let title = match heading_title {
        Some(heading) => heading,
        None if page.title.trim().is_empty() => "Headless snapshot".to_owned(),
        None => page.title.clone(),
    };

    let quality = DocumentQuality::new(
        DocumentQualityCategory::Fallback,
        evidence.score(),
        [
            "accessibility tree supplied semantic roles".to_owned(),
            "headless DOM links merged when available".to_owned(),
        ],
    );

    let mut document = IndexDocument::titled(title);
    document.nodes = converted;
    document.metadata.canonical_url = Some(snapshot.final_url.to_string());
    document.metadata.quality = Some(quality);
    Some(document)
}

/// Returns the text of the first non-blank heading at or below `node`.
///
/// Only headings and sections are inspected; sections are searched depth-first
/// in child order. A blank heading yields `None`.
fn first_heading_text(node: &IndexNode) -> Option<String> {
    if let IndexNode::Heading { text, .. } = node {
        return (!text.trim().is_empty()).then(|| text.clone());
    }
    if let IndexNode::Section { nodes, .. } = node {
        return nodes.iter().find_map(first_heading_text);
    }
    None
}

/// Running tally of how much usable structure an accessibility tree provided.
#[derive(Debug, Clone, Copy, Default)]
struct AccessibilityEvidence {
    // Nodes observed with a non-blank name.
    named_nodes: usize,
    // Nodes observed whose role was flagged as semantic by the caller.
    semantic_nodes: usize,
}

impl AccessibilityEvidence {
    /// Records one node: counts it as named when `name` is non-blank and as
    /// semantic when `semantic` is set.
    fn observe(&mut self, semantic: bool, name: &str) {
        let is_named = !name.trim().is_empty();
        self.named_nodes += usize::from(is_named);
        self.semantic_nodes += usize::from(semantic);
    }

    /// Reports whether the tally justifies preferring the accessibility tree:
    /// two or more semantic nodes, or one semantic node plus two named nodes.
    fn is_confident(self) -> bool {
        match self.semantic_nodes {
            0 => false,
            1 => self.named_nodes >= 2,
            _ => true,
        }
    }

    /// Computes a quality score: base 50, plus 8 per semantic node and 3 per
    /// named node (each counted up to four), capped at 82.
    fn score(self) -> u8 {
        let semantic_bonus = self.semantic_nodes.min(4) * 8;
        let named_bonus = self.named_nodes.min(4) * 3;
        let uncapped = 50 + semantic_bonus + named_bonus;
        // At most 82 after the cap, so the narrowing cast cannot truncate.
        uncapped.min(82) as u8
    }
}

/// Converts one accessibility node (and, where applicable, its subtree) into
/// Index nodes appended to `output`, recording evidence along the way.
///
/// Role handling, after trimming and ASCII-lowercasing the role:
/// - landmark roles become a `Section` wrapping their converted children and
///   are skipped entirely when no child produced output;
/// - `heading`, text-like roles, controls and `listitem` are leaves: they emit
///   a node only when named, and their children are not visited;
/// - `list` emits an unordered list built from its children's names;
/// - any other role emits its name as a paragraph (if named) and then converts
///   its children in place at the same level.
fn append_accessibility_node(
    node: &AccessibilityNode,
    output: &mut Vec<IndexNode>,
    evidence: &mut AccessibilityEvidence,
) {
    let role = node.role.trim().to_ascii_lowercase();
    let label = node.name.trim();
    let labelled = !label.is_empty();

    match role.as_str() {
        "main" | "article" | "navigation" | "complementary" | "contentinfo" | "footer" => {
            let mut nested = Vec::new();
            for child in &node.children {
                append_accessibility_node(child, &mut nested, evidence);
            }
            if nested.is_empty() {
                return;
            }
            evidence.observe(true, label);
            // Only primary content regions start expanded.
            let expanded = matches!(role.as_str(), "main" | "article");
            let title = if labelled {
                Some(label.to_owned())
            } else {
                None
            };
            output.push(IndexNode::Section {
                role: accessibility_section_role(&role),
                title,
                collapsed: !expanded,
                nodes: nested,
            });
        }
        "heading" => {
            if !labelled {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::Heading {
                level: 2,
                text: label.to_owned(),
            });
        }
        "paragraph" | "text" | "statictext" | "generic" => {
            if !labelled {
                return;
            }
            evidence.observe(false, label);
            output.push(IndexNode::Paragraph(label.to_owned()));
        }
        "link" | "button" | "searchbox" | "textbox" | "checkbox" => {
            if !labelled {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::Paragraph(format!("{role}: {label}")));
        }
        "list" => {
            let items = accessibility_list_items(&node.children);
            if items.is_empty() {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::List {
                ordered: false,
                items,
            });
        }
        "listitem" => {
            if !labelled {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::Paragraph(label.to_owned()));
        }
        _ => {
            if labelled {
                evidence.observe(false, label);
                output.push(IndexNode::Paragraph(label.to_owned()));
            }
            // Unknown containers are transparent: children land at this level.
            for child in &node.children {
                append_accessibility_node(child, output, evidence);
            }
        }
    }
}

/// Maps a lowercase accessibility landmark role to the matching section role.
///
/// Roles outside the known landmark set map to `SectionRole::Unknown`.
fn accessibility_section_role(role: &str) -> SectionRole {
    if matches!(role, "main" | "article") {
        SectionRole::Main
    } else if role == "navigation" {
        SectionRole::Navigation
    } else if role == "complementary" {
        SectionRole::Aside
    } else if matches!(role, "contentinfo" | "footer") {
        SectionRole::Footer
    } else {
        SectionRole::Unknown
    }
}

/// Produces one list item string per child that has usable text.
///
/// A child's own trimmed name is used when non-empty. Otherwise the trimmed,
/// non-empty names of its direct children are joined with single spaces.
/// Children that yield no text either way are skipped.
fn accessibility_list_items(children: &[AccessibilityNode]) -> Vec<String> {
    let mut items = Vec::new();
    for child in children {
        let own_label = child.name.trim();
        if !own_label.is_empty() {
            items.push(own_label.to_owned());
            continue;
        }

        let nested_labels: Vec<&str> = child
            .children
            .iter()
            .map(|grandchild| grandchild.name.trim())
            .filter(|label| !label.is_empty())
            .collect();
        if !nested_labels.is_empty() {
            items.push(nested_labels.join(" "));
        }
    }
    items
}

/// Appends DOM links to `document`, skipping any whose label is already known.
///
/// A label counts as known when it belongs to a link already in the document
/// or to a DOM link appended earlier in this call, so each distinct label is
/// added at most once. Links are appended in `page.links` order.
fn merge_dom_links(document: &mut IndexDocument, page: &ReadablePage) {
    // A hash set keeps de-duplication linear in the number of links instead of
    // re-scanning a growing list for every candidate.
    let mut seen: std::collections::HashSet<String> =
        document_link_labels(document).into_iter().collect();
    for link in &page.links {
        if seen.contains(&link.text) {
            continue;
        }
        seen.insert(link.text.clone());
        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
    }
}

/// Collects the label of every link in the document, including links nested
/// inside sections at any depth, in document order.
fn document_link_labels(document: &IndexDocument) -> Vec<String> {
    let mut labels = Vec::new();
    collect_link_labels(&document.nodes, &mut labels);
    labels
}

/// Appends the labels of all links found in `nodes` to `labels`, descending
/// into sections.
///
/// Every link in a section is reported, not just the first, so callers that
/// de-duplicate against the result see all existing labels.
fn collect_link_labels(nodes: &[IndexNode], labels: &mut Vec<String>) {
    for node in nodes {
        match node {
            IndexNode::Link(link) => labels.push(link.text.clone()),
            IndexNode::Section { nodes, .. } => collect_link_labels(nodes, labels),
            _ => {}
        }
    }
}

/// Turns a headless backend error into a failure diagnostic document.
///
/// The error's display text is used both as the diagnostic summary and as the
/// message of the attached `INDEX-HEADLESS-FAILED` record.
#[must_use]
pub fn transform_headless_failure(error: &HeadlessError) -> IndexDocument {
    let message = error.to_string();
    let record = DiagnosticRecord::new(
        DiagnosticSeverity::Error,
        "INDEX-HEADLESS-FAILED",
        message.clone(),
    );

    let diagnostic = FailureDiagnostic::new(
        "Headless fallback failed",
        DiagnosticSource::Headless,
        DiagnosticConfidence::Failed,
        message,
    )
    .with_fallback("static transform or retry")
    .with_tried("headless backend execution")
    .with_actions([DiagnosticAction::Retry, DiagnosticAction::Capture])
    .with_command(":capture save headless-failure.capture")
    .with_record(record);

    diagnostic.into_document()
}

/// Returns the instruction sequence used by the generic transformer, in
/// execution order: metadata, title, readable nodes, then links.
fn default_program() -> Vec<Box<dyn Instruction>> {
    let steps: [Box<dyn Instruction>; 4] = [
        Box::new(ApplyMetadata),
        Box::new(EmitTitle),
        Box::new(EmitReadableNodes),
        Box::new(EmitLinks),
    ];
    Vec::from(steps)
}

fn transform_generic(page: &ReadablePage) -> IndexDocument {
    let mut document = IndexDocument::titled(page.title.clone());
    let program = default_program();

    for instruction in program {
        instruction.execute(page, &mut document);
    }

    if let Some(blocked_flow_class) = blocked_flow_hint(page) {
        return generic_blocked_flow_document(page, blocked_flow_class);
    }

    if !page.has_body() && page.links.is_empty() && page.forms.is_empty() {
        document = FailureDiagnostic::new(
            page.title.clone(),
            DiagnosticSource::GenericTransformer,
            DiagnosticConfidence::Failed,
            "generic transformer did not find readable page content",
        )
        .with_fallback("generic static reader")
        .with_tried("static HTML parse")
        .with_tried("readability extraction")
        .with_tried("generic instruction program")
        .with_actions([
            DiagnosticAction::TryHeadless,
            DiagnosticAction::Extract,
            DiagnosticAction::Capture,
            DiagnosticAction::AddFixture,
        ])
        .with_command(":extract links")
        .with_command(":capture save unsupported-page.capture")
        .with_record(
            DiagnosticRecord::new(
                DiagnosticSeverity::Error,
                "INDEX-GENERIC-EMPTY",
                "no readable headings, paragraphs, tables, forms, links, or sections were emitted",
            )
            .with_field("title", &page.title),
        )
        .into_document();
    } else if page.paragraphs.is_empty() && page.nodes.len() <= 2 {
        document.push(IndexNode::Section {
            role: index_core::SectionRole::Unknown,
            title: Some("Diagnostic".to_owned()),
            collapsed: true,
            nodes: vec![
                IndexNode::Error(
                    "low-confidence transform: only sparse page structure was found".to_owned(),
                ),
                IndexNode::List {
                    ordered: false,
                    items: vec![
                        "try :extract links or :extract json".to_owned(),
                        "capture a redacted fixture if the page shape matters".to_owned(),
                    ],
                },
            ],
        });
        document.metadata.quality = Some(DocumentQuality::new(
            DocumentQualityCategory::PartialGeneric,
            45,
            [
                "sparse generic structure".to_owned(),
                "diagnostic section attached".to_owned(),
            ],
        ));
    } else {
        document.metadata.quality = Some(DocumentQuality::new(
            DocumentQualityCategory::StrongGeneric,
            82,
            [
                "generic reader emitted semantic content".to_owned(),
                "no low-confidence diagnostic attached".to_owned(),
            ],
        ));
    }

    document
}

/// Classifies a sparse page as a blocked flow based on phrases in its title
/// and paragraphs, returning the class name or `None`.
///
/// Only pages with at most two paragraphs, no forms and at most three links are
/// considered. Matching is by ASCII-lowercased substring; classes are tested in
/// table order and the first class with any matching phrase wins.
fn blocked_flow_hint(page: &ReadablePage) -> Option<&'static str> {
    const SIGNATURES: [(&str, &[&str]); 6] = [
        (
            "bot-gate",
            &[
                "captcha",
                "verify you are human",
                "robot check",
                "cloudflare",
            ],
        ),
        (
            "geo-gate",
            &[
                "not available in your region",
                "not available in your country",
                "geo-restricted",
                "geoblocked",
            ],
        ),
        (
            "age-gate",
            &[
                "age verification",
                "adults only",
                "18+",
                "confirm your age",
            ],
        ),
        (
            "policy-blocked",
            &[
                "access denied",
                "forbidden",
                "blocked by policy",
                "violates our terms",
                "not permitted",
            ],
        ),
        (
            "script-gate",
            &[
                "enable javascript",
                "requires javascript",
                "continue in app",
                "app is not available",
            ],
        ),
        (
            "auth-wall",
            &[
                "log in",
                "sign in",
                "create account",
                "authentication required",
                "please log in",
            ],
        ),
    ];

    // Content-rich pages are never treated as gates, whatever they mention.
    if page.paragraphs.len() > 2 || !page.forms.is_empty() || page.links.len() > 3 {
        return None;
    }

    let mut haystack = page.title.to_ascii_lowercase();
    for paragraph in &page.paragraphs {
        haystack.push('\n');
        haystack.push_str(&paragraph.to_ascii_lowercase());
    }

    for &(class, phrases) in &SIGNATURES {
        if phrases.iter().any(|phrase| haystack.contains(*phrase)) {
            return Some(class);
        }
    }
    None
}

/// Builds the low-confidence diagnostic document for a page classified as a
/// blocked flow.
///
/// The class name appears in the summary and in the `INDEX-GENERIC-BLOCKED`
/// record. The page's canonical URL and language are copied onto the result.
fn generic_blocked_flow_document(page: &ReadablePage, blocked_flow_class: &str) -> IndexDocument {
    let record = DiagnosticRecord::new(
        DiagnosticSeverity::Warning,
        "INDEX-GENERIC-BLOCKED",
        format!("blocked-flow class: {blocked_flow_class}"),
    )
    .with_field("title", &page.title)
    .with_field("blocked_flow_class", blocked_flow_class);

    let summary = format!("generic transform indicates a blocked flow ({blocked_flow_class})");
    let diagnostic = FailureDiagnostic::new(
        page.title.clone(),
        DiagnosticSource::GenericTransformer,
        DiagnosticConfidence::Low,
        summary,
    )
    .with_likely_cause(FailureCause::BlockedByPolicy)
    .with_fallback("read-only extraction and fixture capture")
    .with_tried("static HTML parse")
    .with_tried("readability extraction")
    .with_tried("generic instruction program")
    .with_actions([
        DiagnosticAction::TryHeadless,
        DiagnosticAction::Extract,
        DiagnosticAction::Capture,
        DiagnosticAction::AddFixture,
    ])
    .with_command(":extract links")
    .with_command(":capture save blocked-flow.capture")
    .with_record(record);

    let mut blocked = diagnostic.into_document();
    blocked.metadata.language = page.metadata.language.clone();
    blocked.metadata.canonical_url = page.metadata.canonical_url.clone();
    blocked
}

#[cfg(test)]
mod tests {
    use index_core::{DocumentQualityCategory, IndexNode, SectionRole};
    use index_headless::{
        AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot,
    };

    use super::{
        Transformer, state::Empty, transform_headless_failure, transform_headless_snapshot,
        transform_html_cached,
    };

    fn count_links(nodes: &[IndexNode]) -> usize {
        nodes
            .iter()
            .map(|node| match node {
                IndexNode::Link(_) => 1,
                IndexNode::Section { nodes, .. } => count_links(nodes),
                _ => 0,
            })
            .sum()
    }

    #[test]
    fn typestate_pipeline_emits_document() {
        let document = Transformer::<Empty>::new()
            .fetched(r#"<title>Hello</title><p>Index works.</p>"#)
            .parse()
            .extract()
            .transform()
            .into_document();

        assert_eq!(document.title, "Hello");
        assert!(!document.nodes.is_empty());
        assert_eq!(
            document
                .metadata
                .quality
                .as_ref()
                .map(|quality| quality.category),
            Some(DocumentQualityCategory::StrongGeneric)
        );
    }

    #[test]
    fn cached_transform_reuses_matching_source_and_content() {
        let mut cache = super::TransformedDocumentCache::new();
        let first = transform_html_cached(
            &mut cache,
            Some("https://example.org"),
            r#"<title>Hello</title><p>Index works.</p>"#,
        );
        let second = transform_html_cached(
            &mut cache,
            Some("https://example.org"),
            r#"<title>Hello</title><p>Index works.</p>"#,
        );

        assert_eq!(first.title, second.title);
        assert_eq!(cache.len(), 1);
    }

    #[test]
    fn performance_fixtures_transform_through_cache() {
        let fixtures = [
            include_str!("../tests/fixtures/performance/large-doc.html"),
            include_str!("../tests/fixtures/performance/large-table.html"),
            include_str!("../tests/fixtures/performance/listing.html"),
            include_str!("../tests/fixtures/performance/forum.html"),
        ];
        let mut cache = super::TransformedDocumentCache::new();

        for (index, fixture) in fixtures.iter().enumerate() {
            let document =
                transform_html_cached(&mut cache, Some("fixture://performance"), *fixture);
            assert!(
                !document.nodes.is_empty(),
                "performance fixture {index} should transform"
            );
        }

        assert_eq!(cache.len(), fixtures.len());
    }

    #[test]
    fn generic_transform_bounds_very_large_link_sets() {
        let mut html = String::from("<html><head><title>Large Links</title></head><body><main>");
        for index in 0..1200 {
            html.push_str(&format!(
                "<a href=\"https://example.com/{index}\">Link {index}</a>"
            ));
        }
        html.push_str("</main></body></html>");

        let document = Transformer::<Empty>::new()
            .fetched(html)
            .parse()
            .extract()
            .transform()
            .into_document();

        assert!(count_links(&document.nodes) <= 300);
        assert!(document.nodes.iter().any(|node| matches!(
            node,
            IndexNode::Section {
                title: Some(title),
                ..
            } if title == "Diagnostic"
        )));
    }

    #[test]
    fn transformer_emits_links_after_paragraphs() {
        let document = Transformer::<Empty>::new()
            .fetched(r#"<title>Hello</title><p>Body.</p><a href="https://example.com">Example</a>"#)
            .parse()
            .extract()
            .transform()
            .into_document();

        let link_position = document
            .nodes
            .iter()
            .position(|node| matches!(node, IndexNode::Link(_)));
        let paragraph_position = document
            .nodes
            .iter()
            .position(|node| matches!(node, IndexNode::Paragraph(_)));

        assert!(paragraph_position < link_position);
    }

    #[test]
    fn transformer_emits_static_reader_nodes_and_metadata() {
        let document = Transformer::<Empty>::new()
            .fetched(
                r#"
                <html>
                  <head>
                    <meta name="description" content="Reader docs">
                    <link rel="canonical" href="https://example.com/docs">
                  </head>
                  <main>
                    <h2>Install</h2>
                    <ul><li>Read docs</li><li>Run locally</li></ul>
                    <pre><code class="language-sh">cargo install index</code></pre>
                    <table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
                    <img src="logo.png" alt="Index logo">
                  </main>
                </html>
                "#,
            )
            .parse()
            .extract()
            .transform()
            .into_document();

        assert_eq!(
            document.metadata.description.as_deref(),
            Some("Reader docs")
        );
        assert!(document.nodes.iter().any(
            |node| matches!(node, IndexNode::Heading { level: 2, text } if text == "Install")
        ));
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::CodeBlock { .. }))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::List { .. }))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Table { .. }))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Image { alt, .. } if alt == "Index logo"))
        );
    }

    #[test]
    fn transformer_uses_site_adapter_when_canonical_url_matches() {
        let document = Transformer::<Empty>::new()
            .fetched(
                r#"
                <head><link rel="canonical" href="https://github.com/index-rs/index"></head>
                <main><p>Generic repository noise.</p><a href="/issues">Issues</a></main>
                "#,
            )
            .parse()
            .extract()
            .transform()
            .into_document();

        assert_eq!(
            document.metadata.adapter_id.as_ref().map(|id| id.as_str()),
            Some("github.repository")
        );
        assert_eq!(
            document
                .metadata
                .quality
                .as_ref()
                .map(|quality| quality.category),
            Some(DocumentQualityCategory::Adapter)
        );
        assert!(document.title.contains("GitHub repository"));
    }

    #[test]
    fn transformer_falls_back_to_generic_transformer_for_unknown_sites() {
        let document = Transformer::<Empty>::new()
            .fetched(
                r#"
                <head><link rel="canonical" href="https://example.com/article"></head>
                <main><p>Generic article body.</p></main>
                "#,
            )
            .parse()
            .extract()
            .transform()
            .into_document();

        assert_eq!(document.metadata.adapter_id, None);
        assert!(document.nodes.iter().any(
            |node| matches!(node, IndexNode::Paragraph(text) if text == "Generic article body.")
        ));
    }

    #[test]
    fn transforms_rendered_dom_snapshot() -> Result<(), Box<dyn std::error::Error>> {
        let snapshot = HeadlessSnapshot {
            final_url: index_core::IndexUrl::parse("https://example.com/app")?,
            dom_html: "<main><h1>Rendered</h1><p>Loaded by fallback.</p></main>".to_owned(),
            accessibility: None,
        };

        let document = transform_headless_snapshot(&snapshot);

        assert_eq!(
            document.metadata.canonical_url.as_deref(),
            Some("https://example.com/app")
        );
        assert!(document.nodes.iter().any(
            |node| matches!(node, IndexNode::Paragraph(text) if text == "Loaded by fallback.")
        ));
        Ok(())
    }

    #[test]
    fn transforms_accessibility_snapshot_when_dom_is_empty()
    -> Result<(), Box<dyn std::error::Error>> {
        let snapshot = HeadlessSnapshot {
            final_url: index_core::IndexUrl::parse("https://example.com/spa")?,
            dom_html: "<main></main>".to_owned(),
            accessibility: Some(AccessibilitySnapshot {
                nodes: vec![AccessibilityNode::leaf("button", "Search")],
            }),
        };

        let document = transform_headless_snapshot(&snapshot);

        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "button: Search"))
        );
        assert_eq!(
            document
                .metadata
                .quality
                .as_ref()
                .map(|quality| quality.category),
            Some(DocumentQualityCategory::Fallback)
        );
        Ok(())
    }

    #[test]
    fn accessibility_first_maps_roles_and_scores_confidence()
    -> Result<(), Box<dyn std::error::Error>> {
        let snapshot = HeadlessSnapshot {
            final_url: index_core::IndexUrl::parse("https://example.com/a11y")?,
            dom_html: "<main><p>DOM fallback should not win.</p></main>".to_owned(),
            accessibility: Some(AccessibilitySnapshot {
                nodes: vec![AccessibilityNode {
                    role: "main".to_owned(),
                    name: "Application".to_owned(),
                    children: vec![
                        AccessibilityNode::leaf("heading", "Accessible Title"),
                        AccessibilityNode::leaf("paragraph", "Readable accessible text."),
                        AccessibilityNode {
                            role: "list".to_owned(),
                            name: String::new(),
                            children: vec![
                                AccessibilityNode::leaf("listitem", "First"),
                                AccessibilityNode::leaf("listitem", "Second"),
                            ],
                        },
                    ],
                }],
            }),
        };

        let document = transform_headless_snapshot(&snapshot);

        assert_eq!(document.title, "Accessible Title");
        assert_eq!(
            document
                .metadata
                .quality
                .as_ref()
                .map(|quality| (quality.category, quality.score)),
            Some((DocumentQualityCategory::Fallback, 82))
        );
        assert!(document.nodes.iter().any(|node| matches!(
            node,
            IndexNode::Section {
                role: SectionRole::Main,
                collapsed: false,
                ..
            }
        )));
        assert!(!document.nodes.iter().any(
            |node| matches!(node, IndexNode::Paragraph(text) if text == "DOM fallback should not win.")
        ));
        Ok(())
    }

    /// Checks that a snapshot whose DOM repeats the same `/docs` anchor twice, and whose
    /// accessibility tree carries a `link` leaf named "Docs", produces exactly one
    /// `IndexNode::Link` with the text "Docs" plus a paragraph reading "link: Docs".
    #[test]
    fn accessibility_first_merges_dom_links_without_duplicate_link_nodes()
    -> Result<(), Box<dyn std::error::Error>> {
        let final_url = index_core::IndexUrl::parse("https://example.com/app")?;
        let dom_html =
            "<main><a href=\"/docs\">Docs</a><a href=\"/docs\">Docs</a></main>".to_owned();
        let accessibility = AccessibilitySnapshot {
            nodes: vec![
                AccessibilityNode::leaf("heading", "App"),
                AccessibilityNode::leaf("link", "Docs"),
            ],
        };
        let snapshot = HeadlessSnapshot {
            final_url,
            dom_html,
            accessibility: Some(accessibility),
        };

        let document = transform_headless_snapshot(&snapshot);

        // Only link nodes labelled "Docs" are counted; the repeated anchor must show up once.
        let docs_link_count = document
            .nodes
            .iter()
            .filter_map(|node| match node {
                IndexNode::Link(link) if link.text == "Docs" => Some(link),
                _ => None,
            })
            .count();
        let has_link_paragraph = document
            .nodes
            .iter()
            .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "link: Docs"));

        assert_eq!(docs_link_count, 1);
        assert!(has_link_paragraph);
        Ok(())
    }

    /// Checks that when the accessibility tree holds only a single `generic` leaf, the output
    /// contains the DOM paragraph "DOM body wins." and no paragraph reading "Sparse label".
    #[test]
    fn sparse_accessibility_falls_back_to_rendered_dom() -> Result<(), Box<dyn std::error::Error>> {
        let final_url = index_core::IndexUrl::parse("https://example.com/sparse")?;
        let sparse_tree = AccessibilitySnapshot {
            nodes: vec![AccessibilityNode::leaf("generic", "Sparse label")],
        };
        let snapshot = HeadlessSnapshot {
            final_url,
            dom_html: "<main><h1>Rendered</h1><p>DOM body wins.</p></main>".to_owned(),
            accessibility: Some(sparse_tree),
        };

        let document = transform_headless_snapshot(&snapshot);
        // Both assertions ask the same question about different paragraph texts.
        let has_paragraph = |expected: &str| {
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Paragraph(text) if text == expected))
        };

        assert!(has_paragraph("DOM body wins."));
        assert!(!has_paragraph("Sparse label"));
        Ok(())
    }

    /// Checks how `navigation`, `complementary` and `footer` accessibility regions and a
    /// top-level `textbox` leaf are mapped: each region must appear as a collapsed section with
    /// the `Navigation`, `Aside` and `Footer` role respectively, the textbox must appear as the
    /// paragraph "textbox: Search docs", and the title must be "Controls" (the DOM `<title>`).
    #[test]
    fn accessibility_maps_secondary_regions_and_controls() -> Result<(), Box<dyn std::error::Error>>
    {
        let final_url = index_core::IndexUrl::parse("https://example.com/controls")?;
        let navigation = AccessibilityNode {
            role: "navigation".to_owned(),
            name: "Site navigation".to_owned(),
            children: vec![AccessibilityNode::leaf("link", "Home")],
        };
        let complementary = AccessibilityNode {
            role: "complementary".to_owned(),
            name: "Related".to_owned(),
            children: vec![AccessibilityNode::leaf("button", "Subscribe")],
        };
        let footer = AccessibilityNode {
            role: "footer".to_owned(),
            name: "Footer".to_owned(),
            children: vec![AccessibilityNode::leaf("checkbox", "Accept")],
        };
        let search_box = AccessibilityNode::leaf("textbox", "Search docs");
        let snapshot = HeadlessSnapshot {
            final_url,
            dom_html: "<title>Controls</title><main><p>DOM backup.</p></main>".to_owned(),
            accessibility: Some(AccessibilitySnapshot {
                nodes: vec![navigation, complementary, footer, search_box],
            }),
        };

        let document = transform_headless_snapshot(&snapshot);
        let nodes = &document.nodes;

        let has_collapsed_navigation = nodes.iter().any(|node| {
            matches!(
                node,
                IndexNode::Section {
                    role: SectionRole::Navigation,
                    collapsed: true,
                    ..
                }
            )
        });
        let has_collapsed_aside = nodes.iter().any(|node| {
            matches!(
                node,
                IndexNode::Section {
                    role: SectionRole::Aside,
                    collapsed: true,
                    ..
                }
            )
        });
        let has_collapsed_footer = nodes.iter().any(|node| {
            matches!(
                node,
                IndexNode::Section {
                    role: SectionRole::Footer,
                    collapsed: true,
                    ..
                }
            )
        });
        let has_textbox_paragraph = nodes.iter().any(
            |node| matches!(node, IndexNode::Paragraph(text) if text == "textbox: Search docs"),
        );

        assert_eq!(document.title, "Controls");
        assert!(has_collapsed_navigation);
        assert!(has_collapsed_aside);
        assert!(has_collapsed_footer);
        assert!(has_textbox_paragraph);
        Ok(())
    }

    /// Checks that a nameless `listitem` whose children are two `staticText` leaves ("Alpha" and
    /// "Beta") ends up as the single list item "Alpha Beta" in an `IndexNode::List`.
    #[test]
    fn accessibility_lists_can_use_nested_child_names() -> Result<(), Box<dyn std::error::Error>> {
        let final_url = index_core::IndexUrl::parse("https://example.com/list")?;
        // The fixture is assembled bottom-up: text leaves, then the item, then the list.
        let list_item = AccessibilityNode {
            role: "listitem".to_owned(),
            name: String::new(),
            children: vec![
                AccessibilityNode::leaf("staticText", "Alpha"),
                AccessibilityNode::leaf("staticText", "Beta"),
            ],
        };
        let list = AccessibilityNode {
            role: "list".to_owned(),
            name: String::new(),
            children: vec![list_item],
        };
        let snapshot = HeadlessSnapshot {
            final_url,
            dom_html: "<main></main>".to_owned(),
            accessibility: Some(AccessibilitySnapshot {
                nodes: vec![AccessibilityNode::leaf("heading", "Nested List"), list],
            }),
        };

        let document = transform_headless_snapshot(&snapshot);

        let expected_items = vec!["Alpha Beta".to_owned()];
        let has_expected_list = document
            .nodes
            .iter()
            .any(|node| matches!(node, IndexNode::List { items, .. } if items == &expected_items));

        assert!(has_expected_list);
        Ok(())
    }

    /// Checks that a node with the role `custom-widget` (the "unknown role" of the test name)
    /// still contributes its name as the paragraph "Widget", and that its heading child
    /// "Widget Title" is emitted as a heading node.
    #[test]
    fn accessibility_unknown_roles_keep_names_and_children()
    -> Result<(), Box<dyn std::error::Error>> {
        let final_url = index_core::IndexUrl::parse("https://example.com/custom")?;
        let widget = AccessibilityNode {
            role: "custom-widget".to_owned(),
            name: "Widget".to_owned(),
            children: vec![AccessibilityNode::leaf("heading", "Widget Title")],
        };
        let snapshot = HeadlessSnapshot {
            final_url,
            dom_html: "<main></main>".to_owned(),
            accessibility: Some(AccessibilitySnapshot {
                nodes: vec![widget],
            }),
        };

        let document = transform_headless_snapshot(&snapshot);
        let nodes = &document.nodes;

        let keeps_widget_name = nodes
            .iter()
            .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "Widget"));
        let keeps_child_heading = nodes
            .iter()
            .any(|node| matches!(node, IndexNode::Heading { text, .. } if text == "Widget Title"));

        assert!(keeps_widget_name);
        assert!(keeps_child_heading);
        Ok(())
    }

    /// Checks that a `HeadlessError::TimedOut` with `timeout_ms: 10` becomes a document titled
    /// "Headless fallback failed" containing an error node that mentions "timed out after 10ms".
    #[test]
    fn transforms_headless_failure_to_deterministic_error_document() {
        let failure = HeadlessError::TimedOut { timeout_ms: 10 };

        let document = transform_headless_failure(&failure);
        let reports_timeout = document.nodes.iter().any(
            |node| matches!(node, IndexNode::Error(text) if text.contains("timed out after 10ms")),
        );

        assert_eq!(document.title, "Headless fallback failed");
        assert!(reports_timeout);
    }

    /// Checks that a page with a title but an empty `<main>` is reported as a failure: the
    /// document must contain an error node mentioning "did not find readable", a list with an
    /// item containing "confidence: failed", and a quality category of `Failed`.
    #[test]
    fn generic_transformer_reports_missing_readable_content() {
        let html = "<html><title>Empty</title><main></main></html>";
        let document = Transformer::<Empty>::new()
            .fetched(html)
            .parse()
            .extract()
            .transform()
            .into_document();

        let has_missing_content_error = document.nodes.iter().any(
            |node| matches!(node, IndexNode::Error(text) if text.contains("did not find readable")),
        );
        let has_failed_confidence_item = document.nodes.iter().any(|node| {
            matches!(
                node,
                IndexNode::List { items, .. }
                    if items.iter().any(|item| item.contains("confidence: failed"))
            )
        });
        let quality_category = document
            .metadata
            .quality
            .as_ref()
            .map(|quality| quality.category);

        assert!(has_missing_content_error);
        assert!(has_failed_confidence_item);
        assert_eq!(quality_category, Some(DocumentQualityCategory::Failed));
    }

    /// Checks that a page whose `<main>` holds only a single link gets a collapsed section
    /// titled "Diagnostic" and is classified as `DocumentQualityCategory::PartialGeneric`.
    #[test]
    fn sparse_pages_include_low_confidence_diagnostic_section() {
        let html =
            "<html><title>Sparse</title><main><a href=\"/only\">Only link</a></main></html>";
        let document = Transformer::<Empty>::new()
            .fetched(html)
            .parse()
            .extract()
            .transform()
            .into_document();

        let has_collapsed_diagnostic = document.nodes.iter().any(|node| {
            matches!(
                node,
                IndexNode::Section {
                    title: Some(title),
                    collapsed: true,
                    ..
                } if title == "Diagnostic"
            )
        });
        let quality_category = document
            .metadata
            .quality
            .as_ref()
            .map(|quality| quality.category);

        assert!(has_collapsed_diagnostic);
        assert_eq!(
            quality_category,
            Some(DocumentQualityCategory::PartialGeneric)
        );
    }

    /// Runs one HTML fixture per blocked-flow class (`auth-wall`, `script-gate`, `bot-gate`,
    /// `geo-gate`, `age-gate`, `policy-blocked`) through the full pipeline and checks, for each
    /// one, that the debug rendering of the output nodes contains the `INDEX-GENERIC-BLOCKED`
    /// code, the class name and the capture guidance, and that the quality category is `Failed`.
    ///
    /// Every assertion names the class under test so a failure inside the loop identifies the
    /// offending fixture.
    #[test]
    fn blocked_flow_guardrails_cover_required_classes() {
        // (expected class name in the diagnostic, HTML fixture that should trigger it)
        let cases = [
            (
                "auth-wall",
                "<html><title>Sign in</title><main><p>Please log in to continue</p></main></html>",
            ),
            (
                "script-gate",
                "<html><title>JavaScript required</title><main><p>Enable JavaScript to continue in app</p></main></html>",
            ),
            (
                "bot-gate",
                "<html><title>Robot check</title><main><p>Captcha: verify you are human</p></main></html>",
            ),
            (
                "geo-gate",
                "<html><title>Not available</title><main><p>This content is not available in your region</p></main></html>",
            ),
            (
                "age-gate",
                "<html><title>Age verification</title><main><p>Confirm your age (18+) to continue</p></main></html>",
            ),
            (
                "policy-blocked",
                "<html><title>Forbidden</title><main><p>Access denied by policy</p></main></html>",
            ),
        ];

        for (class_name, html) in cases {
            let document = Transformer::<Empty>::new()
                .fetched(html)
                .parse()
                .extract()
                .transform()
                .into_document();
            // The diagnostic text is searched in the debug rendering of the whole node list, so
            // the checks do not depend on which node variant carries each string.
            let rendered = format!("{:?}", document.nodes);

            assert!(
                rendered.contains("INDEX-GENERIC-BLOCKED"),
                "missing blocked diagnostic code for {class_name}"
            );
            assert!(
                rendered.contains(class_name),
                "missing blocked-flow class in diagnostic for {class_name}"
            );
            assert!(
                rendered.contains(":capture save blocked-flow.capture"),
                "missing capture guidance for {class_name}"
            );
            assert_eq!(
                document
                    .metadata
                    .quality
                    .as_ref()
                    .map(|quality| quality.category),
                Some(DocumentQualityCategory::Failed),
                "blocked flow should be classified as failed for {class_name}"
            );
        }
    }

    /// Checks that running the same blocked-flow HTML through the pipeline twice yields two
    /// documents that compare equal.
    #[test]
    fn blocked_flow_failure_document_is_deterministic() {
        let html = "<html><title>Access denied</title><main><p>Blocked by policy</p></main></html>";
        // A single closure keeps both runs on exactly the same pipeline stages.
        let run_pipeline = || {
            Transformer::<Empty>::new()
                .fetched(html)
                .parse()
                .extract()
                .transform()
                .into_document()
        };

        let first = run_pipeline();
        let second = run_pipeline();

        assert_eq!(first, second);
    }

    /// Checks that a page whose `<main>` contains only an empty `<canvas>` and `<template>` is
    /// not presented as a success: the debug rendering of the nodes must contain
    /// `INDEX-GENERIC-EMPTY` and "confidence: failed", and the quality category must be `Failed`.
    #[test]
    fn unsupported_page_shape_never_looks_successful() {
        let html =
            "<html><title>Unsupported</title><main><canvas></canvas><template></template></main></html>";
        let document = Transformer::<Empty>::new()
            .fetched(html)
            .parse()
            .extract()
            .transform()
            .into_document();

        let rendered = format!("{:?}", document.nodes);
        let quality_category = document
            .metadata
            .quality
            .as_ref()
            .map(|quality| quality.category);

        assert!(
            rendered.contains("INDEX-GENERIC-EMPTY"),
            "unsupported page should emit generic empty diagnostic"
        );
        assert!(
            rendered.contains("confidence: failed"),
            "unsupported page should be marked failed"
        );
        assert_eq!(quality_category, Some(DocumentQualityCategory::Failed));
    }
}