use index_core::IndexNode;
use index_extract::{ExtractFormat, extract_document, validate_document_json_schema};
use index_renderer::{RenderOptions, render_document};
use index_transformer::{Transformer, state::Empty};
/// Runs `html` through the `Transformer` stages in order (`fetched`, `parse`, `extract`,
/// `transform`) and returns the document produced by `into_document`.
fn transform(html: &str) -> index_core::IndexDocument {
    let empty = Transformer::<Empty>::new();
    let fetched = empty.fetched(html);
    let parsed = fetched.parse();
    let extracted = parsed.extract();
    let transformed = extracted.transform();
    transformed.into_document()
}
/// Renders the article fixture and checks that body text and the numbered link are present,
/// that the "Pricing" and "noisy footer" strings are absent, and that the canonical URL is set.
#[test]
fn article_fixture_renders_without_nav_or_footer_noise() {
    let article = transform(include_str!("fixtures/article.html"));
    let output = render_document(&article, RenderOptions::default());

    // Text that must appear in the rendered output.
    for kept in [
        "Index should keep the core article paragraph.",
        "Reader mode should drop surrounding navigation",
        "[1] Read more -> https://example.com/articles/quiet/more",
    ] {
        assert!(output.contains(kept));
    }

    // Text that must not appear in the rendered output.
    for dropped in ["Pricing", "noisy footer"] {
        assert!(!output.contains(dropped));
    }

    let canonical = article.metadata.canonical_url.as_deref();
    assert_eq!(canonical, Some("https://example.com/articles/quiet"));
}
/// Checks that the documentation fixture yields a level-2 "Install" heading, an `sh` code block
/// containing "cargo install", a two-row table, the screenshot image, and the Open Graph title.
#[test]
fn documentation_fixture_preserves_static_reader_components() {
    let docs = transform(include_str!("fixtures/documentation.html"));
    let nodes = &docs.nodes;

    let has_install_heading = nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::Heading { level: 2, text } if text == "Install")
    });
    assert!(has_install_heading);

    let has_shell_snippet = nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::CodeBlock { language: Some(language), code }
                if language == "sh" && code.contains("cargo install")
        )
    });
    assert!(has_shell_snippet);

    let has_two_row_table = nodes
        .iter()
        .any(|candidate| matches!(candidate, IndexNode::Table { rows } if rows.len() == 2));
    assert!(has_two_row_table);

    let has_screenshot = nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Image { alt, src: Some(src) }
                if alt == "Index terminal screenshot"
                    && src == "https://example.com/docs/assets/screenshot.png"
        )
    });
    assert!(has_screenshot);

    let og_title = docs.metadata.open_graph_title.as_deref();
    assert_eq!(og_title, Some("Index Documentation"));
}
/// Transforms and renders the malformed fixture, then checks that its title, paragraph text,
/// and link line are all present in the rendered output.
#[test]
fn malformed_fixture_does_not_panic_and_keeps_content() {
    let broken = transform(include_str!("fixtures/malformed.html"));
    let output = render_document(&broken, RenderOptions::default());

    for expected in [
        "Broken Page",
        "Malformed pages should still produce readable text.",
        "Broken link -> https://example.com/broken",
    ] {
        assert!(output.contains(expected));
    }
}
/// Renders the link-heavy fixture and checks that its three links appear with the indices
/// `[1]`, `[2]`, and `[3]`.
#[test]
fn link_heavy_fixture_uses_stable_link_only_addresses() {
    let page = transform(include_str!("fixtures/link-heavy.html"));
    let output = render_document(&page, RenderOptions::default());

    let numbered_links = [
        "[1] One -> https://example.com/one",
        "[2] Two -> https://example.com/two",
        "[3] Three -> https://example.com/three",
    ];
    for link_line in numbered_links {
        assert!(output.contains(link_line));
    }
}
/// Checks that the empty fixture still produces a title, a matching level-1 heading, and an
/// `Error` node whose text contains "did not find readable".
#[test]
fn empty_fixture_still_emits_a_titled_document() {
    let empty = transform(include_str!("fixtures/empty.html"));
    assert_eq!(empty.title, "Empty Fixture");

    let has_title_heading = empty.nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::Heading { level: 1, text } if text == "Empty Fixture")
    });
    assert!(has_title_heading);

    let has_error_node = empty.nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::Error(text) if text.contains("did not find readable"))
    });
    assert!(has_error_node);
}
/// Checks the wiki fixture's title, canonical URL, a list item mentioning "Links connect
/// documents", and the corresponding heading and bullet in the Markdown extract.
#[test]
fn wiki_reference_fixture_reaches_actionable_generic_tier() {
    let wiki = transform(include_str!("fixtures/wiki-reference.html"));
    let md = extract_document(&wiki, ExtractFormat::Markdown);

    assert_eq!(wiki.title, "Public Knowledge Base - Hypertext");
    let canonical = wiki.metadata.canonical_url.as_deref();
    assert_eq!(canonical, Some("https://knowledge.example/wiki/Hypertext"));

    let has_links_item = wiki.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::List { items, .. }
                if items.iter().any(|item| item.contains("Links connect documents"))
        )
    });
    assert!(has_links_item);

    for expected in ["## Core properties", "- Links connect documents."] {
        assert!(md.contains(expected));
    }
}
/// Renders the forum-thread fixture and checks for both poster headings, the first numbered
/// step, and the "Next thread" link line.
#[test]
fn forum_thread_fixture_preserves_thread_posts_and_steps() {
    let thread = transform(include_str!("fixtures/forum-thread.html"));
    let output = render_document(&thread, RenderOptions::default());

    for expected in [
        "## Maintainer",
        "## Archivist",
        "1. Collect public docs.",
        "[1] Next thread -> https://forum.example/t/43/",
    ] {
        assert!(output.contains(expected));
    }
}
/// Checks that the search-results fixture keeps its search form, an ordered list of three
/// items, and the first and third result rows in the `Links` extract.
#[test]
fn search_results_fixture_preserves_form_results_and_links() {
    let results = transform(include_str!("fixtures/search-results.html"));
    let link_rows = extract_document(&results, ExtractFormat::Links);

    let has_search_form = results.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Form(form)
                if form.name == "search" && form.action == "https://search.example/search"
        )
    });
    assert!(has_search_form);

    let has_three_ordered_results = results.nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::List { ordered: true, items } if items.len() == 3)
    });
    assert!(has_three_ordered_results);

    // The expected rows are tab-separated: index, link text, URL.
    for row in [
        "1\tIndex documentation\thttps://search.example/result/index-docs",
        "3\tPublic archives\thttps://search.example/result/public-archives",
    ] {
        assert!(link_rows.contains(row));
    }
}
/// Checks that the catalog fixture keeps a three-row table headed by "Name", a list item for
/// the transit CSV, and the table header and JSON download link in the Markdown extract.
#[test]
fn catalog_listing_fixture_preserves_table_list_and_download_links() {
    let catalog = transform(include_str!("fixtures/catalog-listing.html"));
    let md = extract_document(&catalog, ExtractFormat::Markdown);

    let has_catalog_table = catalog.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Table { rows } if rows.len() == 3 && rows[0][0] == "Name"
        )
    });
    assert!(has_catalog_table);

    let has_transit_item = catalog.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::List { items, .. }
                if items.iter().any(|item| item.contains("Transit stops CSV"))
        )
    });
    assert!(has_transit_item);

    for expected in [
        "| Name | Format | Updated |",
        "[Library hours JSON](https://data.example/catalog/library-hours.json)",
    ] {
        assert!(md.contains(expected));
    }
}
/// Renders the mailing-list archive fixture and checks for the intro sentence, a bulleted
/// thread entry, and the second numbered link line.
#[test]
fn archive_fixture_preserves_thread_listing_and_links() {
    let archive = transform(include_str!("fixtures/archive-mailing-list.html"));
    let output = render_document(&archive, RenderOptions::default());

    for expected in [
        "Messages from the public knowledge-infrastructure list.",
        "• [index] Static readers and public manuals - 12 replies",
        "[2] Fixture sharing thread -> https://lists.example/archive/2026/05/thread-fixtures.html",
    ] {
        assert!(output.contains(expected));
    }
}
/// Checks that every robustness fixture path is mentioned in both `COVERAGE_CATALOG.md` and
/// `FIXTURE_MATRIX.md`, using the repository-relative path of each fixture.
#[test]
fn robustness_fixture_matrix_paths_are_cataloged() {
    const ROBUSTNESS_FIXTURES: [&str; 8] = [
        "fixtures/robust-malformed-v2.html",
        "fixtures/robust-sparse.html",
        "fixtures/code-heavy-doc.html",
        "fixtures/table-nested-list.html",
        "fixtures/nav-sidebar-heavy.html",
        "fixtures/international-es.html",
        "fixtures/rtl-ar.html",
        "fixtures/cjk-reference.html",
    ];
    let catalog = include_str!("../../../docs/COVERAGE_CATALOG.md");
    let matrix = include_str!("../../../docs/FIXTURE_MATRIX.md");

    // Paths are built lazily so each fixture is checked before the next one is formatted.
    let catalog_paths = ROBUSTNESS_FIXTURES
        .iter()
        .map(|fixture| format!("crates/index-transformer/tests/{fixture}"));
    for catalog_path in catalog_paths {
        let needle = catalog_path.as_str();
        assert!(catalog.contains(needle), "{catalog_path} missing from catalog");
        assert!(matrix.contains(needle), "{catalog_path} missing from matrix");
    }
}
/// Smoke-tests every robustness fixture through transform, Markdown/JSON extraction, and
/// rendering, requiring a non-blank title, at least one node, and non-blank output of each kind.
///
/// Each fixture's contents are paired with its path so that a failing assertion names the
/// fixture that triggered it; without that, a failure inside the loop only reports the
/// expression and gives no hint which of the eight inputs was being processed.
#[test]
fn robustness_fixtures_transform_extract_and_render() {
    let fixtures = [
        (
            "fixtures/robust-malformed-v2.html",
            include_str!("fixtures/robust-malformed-v2.html"),
        ),
        (
            "fixtures/robust-sparse.html",
            include_str!("fixtures/robust-sparse.html"),
        ),
        (
            "fixtures/code-heavy-doc.html",
            include_str!("fixtures/code-heavy-doc.html"),
        ),
        (
            "fixtures/table-nested-list.html",
            include_str!("fixtures/table-nested-list.html"),
        ),
        (
            "fixtures/nav-sidebar-heavy.html",
            include_str!("fixtures/nav-sidebar-heavy.html"),
        ),
        (
            "fixtures/international-es.html",
            include_str!("fixtures/international-es.html"),
        ),
        (
            "fixtures/rtl-ar.html",
            include_str!("fixtures/rtl-ar.html"),
        ),
        (
            "fixtures/cjk-reference.html",
            include_str!("fixtures/cjk-reference.html"),
        ),
    ];
    for (path, html) in fixtures {
        let document = transform(html);
        let markdown = extract_document(&document, ExtractFormat::Markdown);
        let json = extract_document(&document, ExtractFormat::Json);
        let rendered = render_document(&document, RenderOptions::default());

        assert!(
            !document.title.trim().is_empty(),
            "{path}: document title is blank"
        );
        assert!(!document.nodes.is_empty(), "{path}: document has no nodes");
        assert!(
            json.starts_with("{\n"),
            "{path}: JSON extract does not start with an opening brace and newline"
        );
        assert!(
            !markdown.trim().is_empty(),
            "{path}: Markdown extract is blank"
        );
        assert!(
            !rendered.trim().is_empty(),
            "{path}: rendered output is blank"
        );
    }
}
/// Renders the second malformed fixture and checks that its paragraph text, the "Parser"
/// string, and the relative "/kept" link line are present.
#[test]
fn robust_malformed_v2_keeps_content_table_and_link() {
    let broken = transform(include_str!("fixtures/robust-malformed-v2.html"));
    let output = render_document(&broken, RenderOptions::default());

    for expected in [
        "Readable text should survive",
        "Parser",
        "Kept reference -> /kept",
    ] {
        assert!(output.contains(expected));
    }
}
/// Checks that the code-heavy fixture keeps a code block with both commands on separate lines,
/// and that the Markdown extract contains a code fence and the font name.
#[test]
fn code_heavy_fixture_preserves_preformatted_commands() {
    let page = transform(include_str!("fixtures/code-heavy-doc.html"));
    let md = extract_document(&page, ExtractFormat::Markdown);

    let has_install_commands = page.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::CodeBlock { code, .. }
                if code.contains("cargo install index\nindex --version")
        )
    });
    assert!(has_install_commands);

    for expected in ["```", "JetBrainsMono Nerd Font Mono"] {
        assert!(md.contains(expected));
    }
}
/// Checks that the table/nested-list fixture keeps a three-row table headed by "Area", and that
/// the Markdown extract contains the "Parser" table row and the flattened list item.
#[test]
fn table_nested_list_fixture_preserves_dense_reference_shape() {
    let page = transform(include_str!("fixtures/table-nested-list.html"));
    let md = extract_document(&page, ExtractFormat::Markdown);

    let has_area_table = page.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::Table { rows } if rows.len() == 3 && rows[0][0] == "Area"
        )
    });
    assert!(has_area_table);

    for expected in [
        "| Parser | HTML, metadata, links | covered |",
        "- Documents Manuals Reference pages",
    ] {
        assert!(md.contains(expected));
    }
}
/// Renders the nav/sidebar-heavy fixture and checks that the primary article text and link are
/// present and that the output does not begin with "Pricing".
#[test]
fn navigation_heavy_fixture_prioritizes_main_content() {
    let page = transform(include_str!("fixtures/nav-sidebar-heavy.html"));
    let output = render_document(&page, RenderOptions::default());

    for expected in [
        "The primary article should remain visible",
        "Primary source -> /primary",
    ] {
        assert!(output.contains(expected));
    }

    // NOTE(review): this only rejects output that *starts* with "Pricing"; other tests in this
    // file use `!contains("Pricing")`. Confirm whether the weaker check is intentional here.
    let leads_with_nav = output.starts_with("Pricing");
    assert!(!leads_with_nav);
}
/// Checks the Spanish, Arabic, and Japanese fixtures: each reports its language code in the
/// metadata, and its non-English text and links are present in the rendered or extracted output.
#[test]
fn international_fixtures_keep_non_english_text_addressable() {
    let es = transform(include_str!("fixtures/international-es.html"));
    let ar = transform(include_str!("fixtures/rtl-ar.html"));
    let ja = transform(include_str!("fixtures/cjk-reference.html"));
    let es_json = extract_document(&es, ExtractFormat::Json);
    let ar_text = render_document(&ar, RenderOptions::default());
    let ja_md = extract_document(&ja, ExtractFormat::Markdown);

    // Spanish: rendered text, language metadata, schema-valid JSON, and the search form.
    let es_text = render_document(&es, RenderOptions::default());
    assert!(es_text.contains("Índice debe conservar"));
    assert_eq!(es.metadata.language.as_deref(), Some("es"));
    assert!(es_json.contains("\"language\": \"es\""));
    let schema_check = validate_document_json_schema(&es_json);
    assert!(schema_check.is_ok());
    let has_buscar_form = es.nodes.iter().any(|candidate| {
        matches!(candidate, IndexNode::Form(form) if form.action == "/buscar")
    });
    assert!(has_buscar_form);

    // Arabic: language metadata plus rendered text and numbered link.
    assert_eq!(ar.metadata.language.as_deref(), Some("ar"));
    for expected in ["دليل عام", "[1] الأرشيف -> /archive"] {
        assert!(ar_text.contains(expected));
    }

    // Japanese: language metadata plus Markdown text and link.
    assert_eq!(ja.metadata.language.as_deref(), Some("ja"));
    for expected in ["公開リファレンス", "[参考資料](/reference)"] {
        assert!(ja_md.contains(expected));
    }
}
/// Checks the nav-heavy Slate article fixture: the canonical URL is extracted, and the rendered
/// output contains the article text and the permalink line.
#[test]
fn slate_article_fixture_preserves_main_article_content() {
    let article = transform(include_str!("fixtures/slate-article-heavy-nav.html"));
    let output = render_document(&article, RenderOptions::default());

    let canonical = article.metadata.canonical_url.as_deref();
    assert_eq!(
        canonical,
        Some(
            "https://slate.com/technology/2004/11/the-death-of-the-last-maverick-tech-company.html"
        )
    );

    for expected in [
        "Historical perspective on Nullsoft",
        "Article body should remain readable",
        "Permalink -> https://slate.com/technology/2004/11/the-death-of-the-last-maverick-tech-company.html",
    ] {
        assert!(output.contains(expected));
    }
}
/// Renders the readability-v2 article fixture and checks that the main paragraph is present
/// while the "Pricing" and "Sign up for updates" strings are absent.
#[test]
fn readability_v2_article_suppresses_chrome_and_keeps_dense_body() {
    let article = transform(include_str!("fixtures/readability-v2-article.html"));
    let output = render_document(&article, RenderOptions::default());

    let keeps_body =
        output.contains("Main paragraphs should win even if a chrome-like main appears first.");
    assert!(keeps_body);

    for chrome in ["Pricing", "Sign up for updates"] {
        assert!(!output.contains(chrome));
    }
}
/// Checks that the readability-v2 docs fixture keeps the backlog command in a code block, the
/// newline-plus-indentation text in the Markdown extract, and the API reference link row.
#[test]
fn readability_v2_docs_preserve_code_whitespace_and_links() {
    let docs = transform(include_str!("fixtures/readability-v2-docs.html"));
    let md = extract_document(&docs, ExtractFormat::Markdown);
    let link_rows = extract_document(&docs, ExtractFormat::Links);

    let has_backlog_command = docs.nodes.iter().any(|candidate| {
        matches!(
            candidate,
            IndexNode::CodeBlock { code, .. }
                if code.contains("index compatibility-backlog --top 20")
        )
    });
    assert!(has_backlog_command);

    assert!(md.contains("keep raw\n indentation"));
    assert!(link_rows.contains("API reference\thttps://reader.example/docs/v2/reference"));
}
/// Checks that the readability-v2 news fixture produces at least one `Spacer` node with one or
/// more lines.
#[test]
fn readability_v2_news_keeps_vertical_rhythm_spacers() {
    let news = transform(include_str!("fixtures/readability-v2-news.html"));

    let has_spacer = news
        .nodes
        .iter()
        .any(|candidate| matches!(candidate, IndexNode::Spacer { lines } if *lines >= 1));
    assert!(has_spacer);
}
/// Compares the Markdown extracts of the readability-v2 docs and news fixtures against their
/// golden files, ignoring trailing whitespace on both sides.
#[test]
fn readability_v2_markdown_snapshots_are_stable() {
    let docs_document = transform(include_str!("fixtures/readability-v2-docs.html"));
    let news_document = transform(include_str!("fixtures/readability-v2-news.html"));
    let docs_actual = extract_document(&docs_document, ExtractFormat::Markdown);
    let news_actual = extract_document(&news_document, ExtractFormat::Markdown);
    let docs_expected = include_str!("golden/readability-v2-docs.markdown");
    let news_expected = include_str!("golden/readability-v2-news.markdown");

    // Docs is compared first, then news.
    for (actual, expected) in [
        (&docs_actual, docs_expected),
        (&news_actual, news_expected),
    ] {
        assert_eq!(actual.trim_end(), expected.trim_end());
    }
}
/// Renders the readability-v2 portal fixture and checks that the dense-region body text is
/// present while the "Sports" string is absent.
#[test]
fn readability_v2_portal_falls_back_to_dense_region_without_main_landmark() {
    let portal = transform(include_str!("fixtures/readability-v2-portal.html"));
    let output = render_document(&portal, RenderOptions::default());

    let keeps_dense_body = output.contains("Dense region fallback should select this body");
    assert!(keeps_dense_body);

    let leaks_portal_nav = output.contains("Sports");
    assert!(!leaks_portal_nav);
}