index-transformer 1.0.0

Typestate transformer and orthogonal instruction set for Index.
Documentation
//! Static reader fixture tests for Milestone 1.

use index_core::IndexNode;
use index_extract::{ExtractFormat, extract_document, validate_document_json_schema};
use index_renderer::{RenderOptions, render_document};
use index_transformer::{Transformer, state::Empty};

/// Drives `html` through each transformer stage in order (`fetched`, `parse`,
/// `extract`, `transform`) and returns the document produced by
/// `into_document`.
fn transform(html: &str) -> index_core::IndexDocument {
    let empty = Transformer::<Empty>::new();
    let fetched = empty.fetched(html);
    let parsed = fetched.parse();
    let extracted = parsed.extract();
    let transformed = extracted.transform();
    transformed.into_document()
}

/// Renders the article fixture and checks that the body text and first link
/// are present, that the `Pricing` and `noisy footer` strings are absent, and
/// that the canonical URL metadata is populated.
#[test]
fn article_fixture_renders_without_nav_or_footer_noise() {
    let article = transform(include_str!("fixtures/article.html"));
    let output = render_document(&article, RenderOptions::default());

    // Fragments the rendered output must contain.
    let wanted = [
        "Index should keep the core article paragraph.",
        "Reader mode should drop surrounding navigation",
        "[1] Read more -> https://example.com/articles/quiet/more",
    ];
    for fragment in wanted {
        assert!(
            output.contains(fragment),
            "rendered output is missing {fragment:?}"
        );
    }

    // Fragments the rendered output must not contain.
    for fragment in ["Pricing", "noisy footer"] {
        assert!(
            !output.contains(fragment),
            "rendered output unexpectedly contains {fragment:?}"
        );
    }

    let canonical = article.metadata.canonical_url.as_deref();
    assert_eq!(canonical, Some("https://example.com/articles/quiet"));
}

/// Transforms the documentation fixture and checks that a level-2 heading, a
/// shell code block, a two-row table, and an image node are all present,
/// along with the Open Graph title metadata.
#[test]
fn documentation_fixture_preserves_static_reader_components() {
    let docs = transform(include_str!("fixtures/documentation.html"));
    let nodes = &docs.nodes;

    let has_install_heading = nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Heading { level: 2, text } if text == "Install"
        )
    });
    assert!(has_install_heading);

    let has_shell_block = nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::CodeBlock { language: Some(language), code }
                if language == "sh" && code.contains("cargo install")
        )
    });
    assert!(has_shell_block);

    let has_two_row_table = nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::Table { rows } if rows.len() == 2)
    });
    assert!(has_two_row_table);

    let has_screenshot = nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Image { alt, src: Some(src) }
                if alt == "Index terminal screenshot"
                    && src == "https://example.com/docs/assets/screenshot.png"
        )
    });
    assert!(has_screenshot);

    let og_title = docs.metadata.open_graph_title.as_deref();
    assert_eq!(og_title, Some("Index Documentation"));
}

/// Transforms and renders the malformed fixture, then checks that the
/// expected text fragments are present in the rendered output.
#[test]
fn malformed_fixture_does_not_panic_and_keeps_content() {
    let recovered = transform(include_str!("fixtures/malformed.html"));
    let output = render_document(&recovered, RenderOptions::default());

    let expected_fragments = [
        "Broken Page",
        "Malformed pages should still produce readable text.",
        "Broken link -> https://example.com/broken",
    ];
    for fragment in expected_fragments {
        assert!(
            output.contains(fragment),
            "rendered output is missing {fragment:?}"
        );
    }
}

/// Renders the link-heavy fixture and checks that links one through three
/// appear with their numbered addresses.
#[test]
fn link_heavy_fixture_uses_stable_link_only_addresses() {
    let page = transform(include_str!("fixtures/link-heavy.html"));
    let output = render_document(&page, RenderOptions::default());

    let numbered_links = [
        "[1] One -> https://example.com/one",
        "[2] Two -> https://example.com/two",
        "[3] Three -> https://example.com/three",
    ];
    for entry in numbered_links {
        assert!(
            output.contains(entry),
            "rendered output is missing {entry:?}"
        );
    }
}

/// Transforms the empty fixture and checks that the document still carries
/// its title, a matching level-1 heading, and an error node whose text
/// contains "did not find readable".
#[test]
fn empty_fixture_still_emits_a_titled_document() {
    let empty = transform(include_str!("fixtures/empty.html"));
    assert_eq!(empty.title, "Empty Fixture");

    let has_title_heading = empty.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Heading { level: 1, text } if text == "Empty Fixture"
        )
    });
    assert!(has_title_heading);

    let has_error_notice = empty.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Error(text) if text.contains("did not find readable")
        )
    });
    assert!(has_error_notice);
}

/// Transforms the wiki reference fixture and checks the title, canonical URL,
/// a list node containing the expected item, and the heading and bullet in
/// the Markdown extract.
#[test]
fn wiki_reference_fixture_reaches_actionable_generic_tier() {
    let wiki = transform(include_str!("fixtures/wiki-reference.html"));
    let md = extract_document(&wiki, ExtractFormat::Markdown);

    assert_eq!(wiki.title, "Public Knowledge Base - Hypertext");

    let canonical = wiki.metadata.canonical_url.as_deref();
    assert_eq!(canonical, Some("https://knowledge.example/wiki/Hypertext"));

    let has_links_item = wiki.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::List { items, .. }
                if items.iter().any(|item| item.contains("Links connect documents"))
        )
    });
    assert!(has_links_item);

    for fragment in ["## Core properties", "- Links connect documents."] {
        assert!(
            md.contains(fragment),
            "markdown extract is missing {fragment:?}"
        );
    }
}

/// Renders the forum thread fixture and checks for both author headings, the
/// first numbered step, and the numbered "Next thread" link.
#[test]
fn forum_thread_fixture_preserves_thread_posts_and_steps() {
    let thread = transform(include_str!("fixtures/forum-thread.html"));
    let output = render_document(&thread, RenderOptions::default());

    let expected_fragments = [
        "## Maintainer",
        "## Archivist",
        "1. Collect public docs.",
        "[1] Next thread -> https://forum.example/t/43/",
    ];
    for fragment in expected_fragments {
        assert!(
            output.contains(fragment),
            "rendered output is missing {fragment:?}"
        );
    }
}

/// Transforms the search results fixture and checks for the `search` form
/// with its action URL, an ordered list of three items, and the first and
/// third entries of the links extract.
#[test]
fn search_results_fixture_preserves_form_results_and_links() {
    let results = transform(include_str!("fixtures/search-results.html"));
    let link_table = extract_document(&results, ExtractFormat::Links);

    let has_search_form = results.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Form(form)
                if form.name == "search" && form.action == "https://search.example/search"
        )
    });
    assert!(has_search_form);

    let has_three_results = results.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::List { ordered: true, items } if items.len() == 3
        )
    });
    assert!(has_three_results);

    // The expected rows are tab-separated: index, label, URL.
    let expected_rows = [
        "1\tIndex documentation\thttps://search.example/result/index-docs",
        "3\tPublic archives\thttps://search.example/result/public-archives",
    ];
    for row in expected_rows {
        assert!(link_table.contains(row), "links extract is missing {row:?}");
    }
}

/// Transforms the catalog listing fixture and checks for a three-row table
/// whose first cell is `Name`, a list item mentioning "Transit stops CSV",
/// and the table header and download link in the Markdown extract.
#[test]
fn catalog_listing_fixture_preserves_table_list_and_download_links() {
    let listing = transform(include_str!("fixtures/catalog-listing.html"));
    let md = extract_document(&listing, ExtractFormat::Markdown);

    let has_catalog_table = listing.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Table { rows } if rows.len() == 3 && rows[0][0] == "Name"
        )
    });
    assert!(has_catalog_table);

    let has_transit_item = listing.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::List { items, .. }
                if items.iter().any(|item| item.contains("Transit stops CSV"))
        )
    });
    assert!(has_transit_item);

    let expected_fragments = [
        "| Name | Format | Updated |",
        "[Library hours JSON](https://data.example/catalog/library-hours.json)",
    ];
    for fragment in expected_fragments {
        assert!(
            md.contains(fragment),
            "markdown extract is missing {fragment:?}"
        );
    }
}

/// Renders the mailing-list archive fixture and checks for the intro
/// sentence, a bulleted thread entry, and the second numbered link.
#[test]
fn archive_fixture_preserves_thread_listing_and_links() {
    let archive = transform(include_str!("fixtures/archive-mailing-list.html"));
    let output = render_document(&archive, RenderOptions::default());

    let expected_fragments = [
        "Messages from the public knowledge-infrastructure list.",
        "• [index] Static readers and public manuals - 12 replies",
        "[2] Fixture sharing thread -> https://lists.example/archive/2026/05/thread-fixtures.html",
    ];
    for fragment in expected_fragments {
        assert!(
            output.contains(fragment),
            "rendered output is missing {fragment:?}"
        );
    }
}

/// Checks that every robustness fixture path, prefixed with the crate's test
/// directory, is mentioned in both `COVERAGE_CATALOG.md` and
/// `FIXTURE_MATRIX.md`.
#[test]
fn robustness_fixture_matrix_paths_are_cataloged() {
    let catalog = include_str!("../../../docs/COVERAGE_CATALOG.md");
    let matrix = include_str!("../../../docs/FIXTURE_MATRIX.md");

    let relative_paths = [
        "fixtures/robust-malformed-v2.html",
        "fixtures/robust-sparse.html",
        "fixtures/code-heavy-doc.html",
        "fixtures/table-nested-list.html",
        "fixtures/nav-sidebar-heavy.html",
        "fixtures/international-es.html",
        "fixtures/rtl-ar.html",
        "fixtures/cjk-reference.html",
    ];

    // The docs reference fixtures by repository-relative path, so expand each
    // entry before searching.
    let repo_paths = relative_paths
        .iter()
        .map(|fixture| format!("crates/index-transformer/tests/{fixture}"));

    for catalog_path in repo_paths {
        let in_catalog = catalog.contains(&catalog_path);
        assert!(in_catalog, "{catalog_path} missing from catalog");

        let in_matrix = matrix.contains(&catalog_path);
        assert!(in_matrix, "{catalog_path} missing from matrix");
    }
}

/// Runs every robustness fixture through transform, Markdown/JSON extraction,
/// and rendering, and checks that each stage yields non-empty output.
///
/// Each fixture is paired with its path so that a failing assertion names the
/// fixture that triggered it; without that, a failure in this loop only shows
/// the failed expression and gives no hint which of the eight inputs broke.
#[test]
fn robustness_fixtures_transform_extract_and_render() {
    // Expands to `(path, contents)` so the label and the included file cannot
    // drift apart.
    macro_rules! fixture {
        ($path:literal) => {
            ($path, include_str!($path))
        };
    }

    let fixtures = [
        fixture!("fixtures/robust-malformed-v2.html"),
        fixture!("fixtures/robust-sparse.html"),
        fixture!("fixtures/code-heavy-doc.html"),
        fixture!("fixtures/table-nested-list.html"),
        fixture!("fixtures/nav-sidebar-heavy.html"),
        fixture!("fixtures/international-es.html"),
        fixture!("fixtures/rtl-ar.html"),
        fixture!("fixtures/cjk-reference.html"),
    ];

    for (name, html) in fixtures {
        let document = transform(html);
        let markdown = extract_document(&document, ExtractFormat::Markdown);
        let json = extract_document(&document, ExtractFormat::Json);
        let rendered = render_document(&document, RenderOptions::default());

        assert!(
            !document.title.trim().is_empty(),
            "{name}: document title is blank"
        );
        assert!(!document.nodes.is_empty(), "{name}: document has no nodes");
        // The JSON extract is expected to open with a brace followed by a newline.
        assert!(
            json.starts_with("{\n"),
            "{name}: JSON extract has an unexpected prefix"
        );
        assert!(
            !markdown.trim().is_empty(),
            "{name}: markdown extract is blank"
        );
        assert!(
            !rendered.trim().is_empty(),
            "{name}: rendered output is blank"
        );
    }
}

/// Renders the second malformed robustness fixture and checks that the
/// expected text, the `Parser` string, and the kept link are all present.
#[test]
fn robust_malformed_v2_keeps_content_table_and_link() {
    let recovered = transform(include_str!("fixtures/robust-malformed-v2.html"));
    let output = render_document(&recovered, RenderOptions::default());

    let expected_fragments = [
        "Readable text should survive",
        "Parser",
        "Kept reference -> /kept",
    ];
    for fragment in expected_fragments {
        assert!(
            output.contains(fragment),
            "rendered output is missing {fragment:?}"
        );
    }
}

/// Transforms the code-heavy fixture and checks that a code block keeps the
/// two commands on separate lines, and that the Markdown extract contains a
/// code fence and the font name string.
#[test]
fn code_heavy_fixture_preserves_preformatted_commands() {
    let docs = transform(include_str!("fixtures/code-heavy-doc.html"));
    let md = extract_document(&docs, ExtractFormat::Markdown);

    let keeps_command_lines = docs.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::CodeBlock { code, .. }
                if code.contains("cargo install index\nindex --version")
        )
    });
    assert!(keeps_command_lines);

    for fragment in ["```", "JetBrainsMono Nerd Font Mono"] {
        assert!(
            md.contains(fragment),
            "markdown extract is missing {fragment:?}"
        );
    }
}

/// Transforms the table/nested-list fixture and checks for a three-row table
/// whose first cell is `Area`, plus the expected table row and list line in
/// the Markdown extract.
#[test]
fn table_nested_list_fixture_preserves_dense_reference_shape() {
    let reference = transform(include_str!("fixtures/table-nested-list.html"));
    let md = extract_document(&reference, ExtractFormat::Markdown);

    let has_area_table = reference.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Table { rows } if rows.len() == 3 && rows[0][0] == "Area"
        )
    });
    assert!(has_area_table);

    let expected_lines = [
        "| Parser | HTML, metadata, links | covered |",
        "- Documents Manuals Reference pages",
    ];
    for line in expected_lines {
        assert!(md.contains(line), "markdown extract is missing {line:?}");
    }
}

/// Renders the navigation-heavy fixture and checks that the primary article
/// text and link are present and that the output does not begin with
/// `Pricing`.
#[test]
fn navigation_heavy_fixture_prioritizes_main_content() {
    let page = transform(include_str!("fixtures/nav-sidebar-heavy.html"));
    let output = render_document(&page, RenderOptions::default());

    let keeps_article = output.contains("The primary article should remain visible");
    assert!(keeps_article);

    let keeps_primary_link = output.contains("Primary source -> /primary");
    assert!(keeps_primary_link);

    // Only the leading position is checked here; `Pricing` may still occur
    // later in the output.
    let leads_with_nav = output.starts_with("Pricing");
    assert!(!leads_with_nav);
}

/// Transforms the Spanish, Arabic, and Japanese fixtures and checks, per
/// language, the detected `language` metadata and that the non-English text
/// and links survive in the rendered, JSON, or Markdown output.
#[test]
fn international_fixtures_keep_non_english_text_addressable() {
    // Spanish: rendered text, language metadata, schema-valid JSON, and a
    // form whose action is `/buscar`.
    let es = transform(include_str!("fixtures/international-es.html"));
    let es_json = extract_document(&es, ExtractFormat::Json);
    let es_rendered = render_document(&es, RenderOptions::default());

    assert!(es_rendered.contains("Índice debe conservar"));
    assert_eq!(es.metadata.language.as_deref(), Some("es"));
    assert!(es_json.contains("\"language\": \"es\""));
    assert!(validate_document_json_schema(&es_json).is_ok());
    let has_buscar_form = es.nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::Form(form) if form.action == "/buscar")
    });
    assert!(has_buscar_form);

    // Arabic: language metadata plus rendered text and numbered link.
    let ar = transform(include_str!("fixtures/rtl-ar.html"));
    let ar_rendered = render_document(&ar, RenderOptions::default());

    assert_eq!(ar.metadata.language.as_deref(), Some("ar"));
    assert!(ar_rendered.contains("دليل عام"));
    assert!(ar_rendered.contains("[1] الأرشيف -> /archive"));

    // Japanese: language metadata plus Markdown text and inline link.
    let ja = transform(include_str!("fixtures/cjk-reference.html"));
    let ja_markdown = extract_document(&ja, ExtractFormat::Markdown);

    assert_eq!(ja.metadata.language.as_deref(), Some("ja"));
    assert!(ja_markdown.contains("公開リファレンス"));
    assert!(ja_markdown.contains("[参考資料](/reference)"));
}

/// Transforms the Slate article fixture and checks the canonical URL along
/// with the article text and permalink in the rendered output.
#[test]
fn slate_article_fixture_preserves_main_article_content() {
    let article = transform(include_str!("fixtures/slate-article-heavy-nav.html"));
    let output = render_document(&article, RenderOptions::default());

    let canonical = article.metadata.canonical_url.as_deref();
    assert_eq!(
        canonical,
        Some(
            "https://slate.com/technology/2004/11/the-death-of-the-last-maverick-tech-company.html"
        )
    );

    let expected_fragments = [
        "Historical perspective on Nullsoft",
        "Article body should remain readable",
        "Permalink -> https://slate.com/technology/2004/11/the-death-of-the-last-maverick-tech-company.html",
    ];
    for fragment in expected_fragments {
        assert!(
            output.contains(fragment),
            "rendered output is missing {fragment:?}"
        );
    }
}

/// Renders the readability-v2 article fixture and checks that the main
/// paragraph is present while the `Pricing` and `Sign up for updates`
/// strings are absent.
#[test]
fn readability_v2_article_suppresses_chrome_and_keeps_dense_body() {
    let article = transform(include_str!("fixtures/readability-v2-article.html"));
    let output = render_document(&article, RenderOptions::default());

    let keeps_body =
        output.contains("Main paragraphs should win even if a chrome-like main appears first.");
    assert!(keeps_body);

    for fragment in ["Pricing", "Sign up for updates"] {
        assert!(
            !output.contains(fragment),
            "rendered output unexpectedly contains {fragment:?}"
        );
    }
}

/// Transforms the readability-v2 docs fixture and checks for the expected
/// command in a code block, the indented text in the Markdown extract, and
/// the API reference row in the links extract.
#[test]
fn readability_v2_docs_preserve_code_whitespace_and_links() {
    let docs = transform(include_str!("fixtures/readability-v2-docs.html"));
    let md = extract_document(&docs, ExtractFormat::Markdown);
    let link_table = extract_document(&docs, ExtractFormat::Links);

    let has_backlog_command = docs.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::CodeBlock { code, .. }
                if code.contains("index compatibility-backlog --top 20")
        )
    });
    assert!(has_backlog_command);

    // The newline and two-space indent inside the literal are the point of
    // this check.
    let keeps_indentation = md.contains("keep raw\n  indentation");
    assert!(keeps_indentation);

    let lists_api_reference =
        link_table.contains("API reference\thttps://reader.example/docs/v2/reference");
    assert!(lists_api_reference);
}

/// Transforms the readability-v2 news fixture and checks that at least one
/// spacer node with one or more lines is present.
#[test]
fn readability_v2_news_keeps_vertical_rhythm_spacers() {
    let news = transform(include_str!("fixtures/readability-v2-news.html"));

    let has_spacer = news.nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::Spacer { lines } if *lines >= 1)
    });
    assert!(has_spacer);
}

/// Compares the Markdown extract of the readability-v2 docs and news
/// fixtures against their golden files, ignoring trailing whitespace.
#[test]
fn readability_v2_markdown_snapshots_are_stable() {
    // Each entry pairs a fixture's HTML with its golden Markdown.
    let snapshots = [
        (
            include_str!("fixtures/readability-v2-docs.html"),
            include_str!("golden/readability-v2-docs.markdown"),
        ),
        (
            include_str!("fixtures/readability-v2-news.html"),
            include_str!("golden/readability-v2-news.markdown"),
        ),
    ];

    for (html, golden) in snapshots {
        let document = transform(html);
        let markdown = extract_document(&document, ExtractFormat::Markdown);
        assert_eq!(markdown.trim_end(), golden.trim_end());
    }
}

/// Renders the readability-v2 portal fixture and checks that the dense-region
/// body text is present and the `Sports` string is absent.
#[test]
fn readability_v2_portal_falls_back_to_dense_region_without_main_landmark() {
    let portal = transform(include_str!("fixtures/readability-v2-portal.html"));
    let output = render_document(&portal, RenderOptions::default());

    let keeps_dense_body = output.contains("Dense region fallback should select this body");
    assert!(keeps_dense_body);

    let leaks_section_nav = output.contains("Sports");
    assert!(!leaks_section_nav);
}