use std::marker::PhantomData;
use index_core::{
DiagnosticAction, DiagnosticConfidence, DiagnosticRecord, DiagnosticSeverity, DiagnosticSource,
DocumentQuality, DocumentQualityCategory, FailureCause, FailureDiagnostic, IndexDocument,
IndexNode, Link, SectionRole,
};
use index_dom::{HtmlDocument, parse_html};
use index_headless::{AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot};
use index_readability::ReadablePage;
pub mod adapter;
pub mod cache;
pub mod instruction;
pub mod manifest;
pub mod state;
use adapter::{AdapterContext, AdapterRegistry};
use instruction::{ApplyMetadata, EmitLinks, EmitReadableNodes, EmitTitle, Instruction};
use state::{Empty, Extracted, Fetched, Parsed, Transformed};
pub use cache::{TransformCacheKey, TransformedDocumentCache};
pub use manifest::apply_index_manifest_hints;
/// Typestate pipeline that turns raw HTML into an [`IndexDocument`].
///
/// `S` is a marker type (held only through `PhantomData`) naming the current stage. Each stage
/// method consumes the value and returns a `Transformer` tagged with the next stage, so the
/// stages can only be chained in order.
#[derive(Debug, Clone)]
pub struct Transformer<S> {
/// Raw HTML text; populated by `fetched`.
raw_html: Option<String>,
/// Result of `parse_html`; populated by `parse`.
parsed: Option<HtmlDocument>,
/// Readability extraction of `parsed`; populated by `extract`.
extracted: Option<ReadablePage>,
/// Final document; populated by `transform`.
document: Option<IndexDocument>,
/// Stage marker; stores no runtime data.
_state: PhantomData<S>,
}
/// Transforms `html` into an [`IndexDocument`], consulting `cache` first.
///
/// The cache key is derived from `source_url` together with the HTML text. A hit returns the
/// cached document unchanged; a miss runs the full typestate pipeline, stores a clone of the
/// result under the key, and returns the freshly built document.
#[must_use]
pub fn transform_html_cached(
    cache: &mut TransformedDocumentCache,
    source_url: Option<&str>,
    html: impl Into<String>,
) -> IndexDocument {
    let markup: String = html.into();
    let cache_key = TransformCacheKey::new(source_url, &markup);

    let cached = cache.get(&cache_key);
    match cached {
        Some(hit) => hit,
        None => {
            let pipeline = Transformer::<Empty>::new().fetched(markup);
            let fresh = pipeline.parse().extract().transform().into_document();
            cache.insert(cache_key, fresh.clone());
            fresh
        }
    }
}
impl Transformer<Empty> {
    /// Creates a pipeline with no input and every stage artifact unset.
    #[must_use]
    pub const fn new() -> Self {
        Transformer {
            _state: PhantomData,
            raw_html: None,
            parsed: None,
            extracted: None,
            document: None,
        }
    }

    /// Supplies the raw HTML and moves the pipeline into the `Fetched` stage.
    ///
    /// Nothing is read from the receiver; the returned value starts with only `raw_html` set.
    #[must_use]
    pub fn fetched(self, raw_html: impl Into<String>) -> Transformer<Fetched> {
        let source: String = raw_html.into();
        Transformer {
            _state: PhantomData,
            raw_html: Some(source),
            parsed: None,
            extracted: None,
            document: None,
        }
    }
}
impl Default for Transformer<Empty> {
fn default() -> Self {
Self::new()
}
}
impl Transformer<Fetched> {
    /// Parses the fetched HTML and moves the pipeline into the `Parsed` stage.
    ///
    /// A missing `raw_html` is treated as the empty string. The raw text is kept alongside the
    /// parsed document, which is why a copy is handed to `parse_html`.
    #[must_use]
    pub fn parse(self) -> Transformer<Parsed> {
        let Transformer { raw_html, .. } = self;
        let source = raw_html.unwrap_or_default();
        let dom = parse_html(source.clone());
        Transformer {
            _state: PhantomData,
            raw_html: Some(source),
            parsed: Some(dom),
            extracted: None,
            document: None,
        }
    }
}
impl Transformer<Parsed> {
    /// Runs readability extraction over the parsed document and moves to the `Extracted` stage.
    ///
    /// When no parsed document is present, `extracted` stays `None`.
    #[must_use]
    pub fn extract(self) -> Transformer<Extracted> {
        let Transformer {
            raw_html, parsed, ..
        } = self;
        let extracted = match &parsed {
            Some(dom) => Some(ReadablePage::from_html(dom)),
            None => None,
        };
        Transformer {
            _state: PhantomData,
            raw_html,
            parsed,
            extracted,
            document: None,
        }
    }
}
impl Transformer<Extracted> {
    /// Builds the final document and moves the pipeline into the `Transformed` stage.
    ///
    /// The default adapter registry gets the first chance to transform the page; when it
    /// declines, `transform_generic` is used. A missing extraction is replaced by an empty page
    /// titled `"Untitled"`.
    #[must_use]
    pub fn transform(self) -> Transformer<Transformed> {
        let Transformer {
            raw_html,
            parsed,
            extracted,
            ..
        } = self;

        let page = match extracted {
            Some(page) => page,
            None => ReadablePage {
                title: "Untitled".to_owned(),
                paragraphs: Vec::new(),
                nodes: Vec::new(),
                links: Vec::new(),
                forms: Vec::new(),
                metadata: Default::default(),
            },
        };

        let context = AdapterContext { page: &page };
        let adapted = AdapterRegistry::default_registry().transform(&context);
        let document = match adapted {
            Some(document) => document,
            None => transform_generic(&page),
        };

        Transformer {
            _state: PhantomData,
            raw_html,
            parsed,
            extracted: Some(page),
            document: Some(document),
        }
    }
}
impl Transformer<Transformed> {
#[must_use]
pub fn into_document(self) -> IndexDocument {
self.document.unwrap_or_default()
}
}
/// Converts a headless browser snapshot into an [`IndexDocument`].
///
/// Strategies are tried in this order:
/// 1. the accessibility tree, when `accessibility_document` accepts it (DOM links are merged in);
/// 2. the rendered DOM, through the adapter registry or the generic transformer, when the
///    readable page has a body;
/// 3. the plain text of the accessibility tree, marked as a fallback-quality document;
/// 4. a failure diagnostic document when none of the above produced content.
#[must_use]
pub fn transform_headless_snapshot(snapshot: &HeadlessSnapshot) -> IndexDocument {
    let mut parsed = parse_html(snapshot.dom_html.clone());
    // Only fill in the canonical URL when the markup did not declare one itself.
    parsed.metadata.canonical_url = parsed
        .metadata
        .canonical_url
        .take()
        .or_else(|| Some(snapshot.final_url.to_string()));

    let page = ReadablePage::from_html(&parsed);
    let accessibility = snapshot.accessibility.as_ref();

    let accessible = accessibility.and_then(|tree| accessibility_document(tree, snapshot, &page));
    if let Some(mut document) = accessible {
        merge_dom_links(&mut document, &page);
        return document;
    }

    if page.has_body() {
        let context = AdapterContext { page: &page };
        return AdapterRegistry::default_registry()
            .transform(&context)
            .unwrap_or_else(|| transform_generic(&page));
    }

    let mut document = IndexDocument::titled("Headless snapshot");
    document.metadata.canonical_url = Some(snapshot.final_url.to_string());
    let fallback_text = accessibility
        .map(|tree| tree.text_content())
        .filter(|text| !text.is_empty());
    if let Some(text) = fallback_text {
        document.push(IndexNode::Paragraph(text));
    }

    if !document.is_empty() {
        document.metadata.quality = Some(DocumentQuality::new(
            DocumentQualityCategory::Fallback,
            55,
            [
                "headless accessibility fallback".to_owned(),
                "DOM body was not readable".to_owned(),
            ],
        ));
        return document;
    }

    // Nothing readable anywhere in the snapshot: report it as a failure document.
    let record = DiagnosticRecord::new(
        DiagnosticSeverity::Error,
        "INDEX-HEADLESS-EMPTY",
        format!("final_url={}", snapshot.final_url),
    );
    FailureDiagnostic::new(
        "Headless snapshot unreadable",
        DiagnosticSource::Headless,
        DiagnosticConfidence::Failed,
        "headless snapshot did not contain readable content",
    )
    .with_fallback("accessibility tree text extraction")
    .with_tried("headless DOM snapshot")
    .with_tried("accessibility tree extraction")
    .with_actions([DiagnosticAction::Retry, DiagnosticAction::Capture])
    .with_command(":capture save headless-unreadable.capture")
    .with_record(record)
    .into_document()
}
/// Builds a document from the accessibility tree, or returns `None` when the tree does not carry
/// enough evidence (see `AccessibilityEvidence::is_confident`).
///
/// The title is the first non-blank heading found in the converted nodes; otherwise the readable
/// page title, or `"Headless snapshot"` when that is blank too. The canonical URL is the
/// snapshot's final URL and the quality score comes from the collected evidence.
fn accessibility_document(
    accessibility: &AccessibilitySnapshot,
    snapshot: &HeadlessSnapshot,
    page: &ReadablePage,
) -> Option<IndexDocument> {
    let mut converted = Vec::new();
    let mut evidence = AccessibilityEvidence::default();
    for root in &accessibility.nodes {
        append_accessibility_node(root, &mut converted, &mut evidence);
    }
    if !evidence.is_confident() {
        return None;
    }

    let heading_title = converted
        .iter()
        .find_map(first_heading_text)
        .filter(|candidate| !candidate.trim().is_empty());
    let title = match heading_title {
        Some(text) => text,
        None if page.title.trim().is_empty() => "Headless snapshot".to_owned(),
        None => page.title.clone(),
    };

    let quality = DocumentQuality::new(
        DocumentQualityCategory::Fallback,
        evidence.score(),
        [
            "accessibility tree supplied semantic roles".to_owned(),
            "headless DOM links merged when available".to_owned(),
        ],
    );

    let mut document = IndexDocument::titled(title);
    document.metadata.canonical_url = Some(snapshot.final_url.to_string());
    document.nodes = converted;
    document.metadata.quality = Some(quality);
    Some(document)
}
/// Returns the text of the first non-blank heading at or below `node`.
///
/// Only headings and sections are inspected; sections are searched depth-first in node order.
fn first_heading_text(node: &IndexNode) -> Option<String> {
    if let IndexNode::Heading { text, .. } = node {
        let has_text = !text.trim().is_empty();
        return has_text.then(|| text.clone());
    }
    if let IndexNode::Section { nodes, .. } = node {
        return nodes.iter().find_map(first_heading_text);
    }
    None
}
/// Running tally of how much usable structure an accessibility tree provided.
#[derive(Debug, Clone, Copy, Default)]
struct AccessibilityEvidence {
    /// Number of observed nodes whose name was not blank.
    named_nodes: usize,
    /// Number of observed nodes flagged as semantic by the caller.
    semantic_nodes: usize,
}

impl AccessibilityEvidence {
    /// Records one node: counts it as named when `name` is not blank, and as semantic when
    /// `semantic` is set. The two counters are independent.
    fn observe(&mut self, semantic: bool, name: &str) {
        let has_name = !name.trim().is_empty();
        self.named_nodes += usize::from(has_name);
        self.semantic_nodes += usize::from(semantic);
    }

    /// Confident means at least two semantic nodes, or exactly one semantic node backed by at
    /// least two named nodes.
    fn is_confident(self) -> bool {
        match self.semantic_nodes {
            0 => false,
            1 => self.named_nodes >= 2,
            _ => true,
        }
    }

    /// Quality score: base 50, plus 8 per semantic node and 3 per named node (each counter
    /// capped at 4), with the total capped at 82.
    fn score(self) -> u8 {
        // Both counters are clamped to 4 first, so the narrowing casts cannot truncate.
        let semantic_bonus = self.semantic_nodes.min(4) as u8 * 8;
        let named_bonus = self.named_nodes.min(4) as u8 * 3;
        (50 + semantic_bonus + named_bonus).min(82)
    }
}
/// Converts one accessibility node (and, where applicable, its subtree) into `IndexNode`s
/// appended to `output`, recording what was found in `evidence`.
///
/// Roles are compared after trimming and ASCII-lowercasing:
/// - landmark roles become a `Section` wrapping their converted children (skipped when empty);
/// - `heading` becomes a level-2 heading;
/// - text-like roles become a paragraph holding the name;
/// - interactive roles become a paragraph of the form `"<role>: <name>"`;
/// - `list` becomes an unordered list built by `accessibility_list_items`;
/// - `listitem` becomes a paragraph;
/// - any other role emits its name (if present) as a paragraph and then converts its children
///   into the same `output`.
///
/// Apart from landmarks, lists, and unknown roles, children are not visited.
fn append_accessibility_node(
    node: &AccessibilityNode,
    output: &mut Vec<IndexNode>,
    evidence: &mut AccessibilityEvidence,
) {
    let role = node.role.trim().to_ascii_lowercase();
    let label = node.name.trim();
    let unlabeled = label.is_empty();

    match role.as_str() {
        "main" | "article" | "navigation" | "complementary" | "contentinfo" | "footer" => {
            // Children are converted (and observed) before we know whether the section is kept.
            let mut nested = Vec::new();
            for child in &node.children {
                append_accessibility_node(child, &mut nested, evidence);
            }
            if nested.is_empty() {
                return;
            }
            evidence.observe(true, label);
            let is_primary = matches!(role.as_str(), "main" | "article");
            let title = if unlabeled {
                None
            } else {
                Some(label.to_owned())
            };
            output.push(IndexNode::Section {
                role: accessibility_section_role(&role),
                title,
                collapsed: !is_primary,
                nodes: nested,
            });
        }
        "heading" => {
            if unlabeled {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::Heading {
                level: 2,
                text: label.to_owned(),
            });
        }
        "paragraph" | "text" | "statictext" | "generic" => {
            if unlabeled {
                return;
            }
            evidence.observe(false, label);
            output.push(IndexNode::Paragraph(label.to_owned()));
        }
        "link" | "button" | "searchbox" | "textbox" | "checkbox" => {
            if unlabeled {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::Paragraph(format!("{role}: {label}")));
        }
        "list" => {
            let items = accessibility_list_items(&node.children);
            if items.is_empty() {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::List {
                ordered: false,
                items,
            });
        }
        "listitem" => {
            if unlabeled {
                return;
            }
            evidence.observe(true, label);
            output.push(IndexNode::Paragraph(label.to_owned()));
        }
        _ => {
            if !unlabeled {
                evidence.observe(false, label);
                output.push(IndexNode::Paragraph(label.to_owned()));
            }
            // Unknown containers are flattened: their children land in the caller's output.
            for child in &node.children {
                append_accessibility_node(child, output, evidence);
            }
        }
    }
}
/// Maps a lowercased accessibility landmark role to the matching [`SectionRole`].
///
/// Roles that are not recognised map to `SectionRole::Unknown`.
fn accessibility_section_role(role: &str) -> SectionRole {
    if matches!(role, "main" | "article") {
        SectionRole::Main
    } else if role == "navigation" {
        SectionRole::Navigation
    } else if role == "complementary" {
        SectionRole::Aside
    } else if matches!(role, "contentinfo" | "footer") {
        SectionRole::Footer
    } else {
        SectionRole::Unknown
    }
}
/// Produces one list item per child of an accessibility `list` node.
///
/// A child's own trimmed name is used when present. Otherwise the trimmed, non-blank names of
/// its direct children are joined with a single space. Children that yield no text are dropped.
fn accessibility_list_items(children: &[AccessibilityNode]) -> Vec<String> {
    let mut items = Vec::new();
    for child in children {
        let own_label = child.name.trim();
        if !own_label.is_empty() {
            items.push(own_label.to_owned());
            continue;
        }
        let nested_labels: Vec<&str> = child
            .children
            .iter()
            .map(|grandchild| grandchild.name.trim())
            .filter(|label| !label.is_empty())
            .collect();
        if !nested_labels.is_empty() {
            items.push(nested_labels.join(" "));
        }
    }
    items
}
/// Appends the readable page's links to `document`, skipping any link whose label is already
/// present in the document or was already appended earlier in this call.
///
/// Labels are tracked in a hash set so each membership check is constant time, instead of
/// rescanning a growing list for every DOM link.
fn merge_dom_links(document: &mut IndexDocument, page: &ReadablePage) {
    let mut seen_labels: std::collections::HashSet<String> =
        document_link_labels(document).into_iter().collect();
    for link in &page.links {
        // `insert` returns false when the label was already recorded.
        if !seen_labels.insert(link.text.clone()) {
            continue;
        }
        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
    }
}
/// Collects the label of every link in `document`, including links nested inside sections at
/// any depth, in document order.
fn document_link_labels(document: &IndexDocument) -> Vec<String> {
    let mut labels = Vec::new();
    collect_link_labels(&document.nodes, &mut labels);
    labels
}

/// Walks `nodes` depth-first and appends each link label to `labels`.
///
/// Sections are recursed into here rather than delegated to `link_label`, because `link_label`
/// reports at most one label per section and would hide the remaining nested links.
fn collect_link_labels(nodes: &[IndexNode], labels: &mut Vec<String>) {
    for node in nodes {
        match node {
            IndexNode::Section { nodes, .. } => collect_link_labels(nodes, labels),
            other => labels.extend(link_label(other)),
        }
    }
}

/// Returns the label of `node` when it is a link, or the first link label found inside it when
/// it is a section; `None` for every other node kind.
fn link_label(node: &IndexNode) -> Option<String> {
    match node {
        IndexNode::Link(link) => Some(link.text.clone()),
        IndexNode::Section { nodes, .. } => nodes.iter().find_map(link_label),
        _ => None,
    }
}
/// Renders a headless backend error as a failure diagnostic document.
///
/// The error's display text is used both as the diagnostic summary and as the message of the
/// `INDEX-HEADLESS-FAILED` record.
#[must_use]
pub fn transform_headless_failure(error: &HeadlessError) -> IndexDocument {
    let message = error.to_string();
    let record = DiagnosticRecord::new(
        DiagnosticSeverity::Error,
        "INDEX-HEADLESS-FAILED",
        message.clone(),
    );
    let recovery_actions = [DiagnosticAction::Retry, DiagnosticAction::Capture];

    FailureDiagnostic::new(
        "Headless fallback failed",
        DiagnosticSource::Headless,
        DiagnosticConfidence::Failed,
        message,
    )
    .with_fallback("static transform or retry")
    .with_tried("headless backend execution")
    .with_actions(recovery_actions)
    .with_command(":capture save headless-failure.capture")
    .with_record(record)
    .into_document()
}
/// The instruction sequence run by the generic transformer, in execution order:
/// metadata, title, readable nodes, then links.
fn default_program() -> Vec<Box<dyn Instruction>> {
    let steps: [Box<dyn Instruction>; 4] = [
        Box::new(ApplyMetadata),
        Box::new(EmitTitle),
        Box::new(EmitReadableNodes),
        Box::new(EmitLinks),
    ];
    Vec::from(steps)
}
fn transform_generic(page: &ReadablePage) -> IndexDocument {
let mut document = IndexDocument::titled(page.title.clone());
let program = default_program();
for instruction in program {
instruction.execute(page, &mut document);
}
if let Some(blocked_flow_class) = blocked_flow_hint(page) {
return generic_blocked_flow_document(page, blocked_flow_class);
}
if !page.has_body() && page.links.is_empty() && page.forms.is_empty() {
document = FailureDiagnostic::new(
page.title.clone(),
DiagnosticSource::GenericTransformer,
DiagnosticConfidence::Failed,
"generic transformer did not find readable page content",
)
.with_fallback("generic static reader")
.with_tried("static HTML parse")
.with_tried("readability extraction")
.with_tried("generic instruction program")
.with_actions([
DiagnosticAction::TryHeadless,
DiagnosticAction::Extract,
DiagnosticAction::Capture,
DiagnosticAction::AddFixture,
])
.with_command(":extract links")
.with_command(":capture save unsupported-page.capture")
.with_record(
DiagnosticRecord::new(
DiagnosticSeverity::Error,
"INDEX-GENERIC-EMPTY",
"no readable headings, paragraphs, tables, forms, links, or sections were emitted",
)
.with_field("title", &page.title),
)
.into_document();
} else if page.paragraphs.is_empty() && page.nodes.len() <= 2 {
document.push(IndexNode::Section {
role: index_core::SectionRole::Unknown,
title: Some("Diagnostic".to_owned()),
collapsed: true,
nodes: vec![
IndexNode::Error(
"low-confidence transform: only sparse page structure was found".to_owned(),
),
IndexNode::List {
ordered: false,
items: vec![
"try :extract links or :extract json".to_owned(),
"capture a redacted fixture if the page shape matters".to_owned(),
],
},
],
});
document.metadata.quality = Some(DocumentQuality::new(
DocumentQualityCategory::PartialGeneric,
45,
[
"sparse generic structure".to_owned(),
"diagnostic section attached".to_owned(),
],
));
} else {
document.metadata.quality = Some(DocumentQuality::new(
DocumentQualityCategory::StrongGeneric,
82,
[
"generic reader emitted semantic content".to_owned(),
"no low-confidence diagnostic attached".to_owned(),
],
));
}
document
}
fn blocked_flow_hint(page: &ReadablePage) -> Option<&'static str> {
let sparse_shape = page.paragraphs.len() <= 2 && page.forms.is_empty() && page.links.len() <= 3;
if !sparse_shape {
return None;
}
let mut haystack = page.title.to_ascii_lowercase();
for paragraph in &page.paragraphs {
haystack.push('\n');
haystack.push_str(¶graph.to_ascii_lowercase());
}
if haystack.contains("captcha")
|| haystack.contains("verify you are human")
|| haystack.contains("robot check")
|| haystack.contains("cloudflare")
{
return Some("bot-gate");
}
if haystack.contains("not available in your region")
|| haystack.contains("not available in your country")
|| haystack.contains("geo-restricted")
|| haystack.contains("geoblocked")
{
return Some("geo-gate");
}
if haystack.contains("age verification")
|| haystack.contains("adults only")
|| haystack.contains("18+")
|| haystack.contains("confirm your age")
{
return Some("age-gate");
}
if haystack.contains("access denied")
|| haystack.contains("forbidden")
|| haystack.contains("blocked by policy")
|| haystack.contains("violates our terms")
|| haystack.contains("not permitted")
{
return Some("policy-blocked");
}
if haystack.contains("enable javascript")
|| haystack.contains("requires javascript")
|| haystack.contains("continue in app")
|| haystack.contains("app is not available")
{
return Some("script-gate");
}
if haystack.contains("log in")
|| haystack.contains("sign in")
|| haystack.contains("create account")
|| haystack.contains("authentication required")
|| haystack.contains("please log in")
{
return Some("auth-wall");
}
None
}
/// Builds the low-confidence diagnostic document emitted when the generic transformer decides
/// the page is a blocked flow of class `blocked_flow_class`.
///
/// The page's canonical URL and language are copied onto the resulting document.
fn generic_blocked_flow_document(page: &ReadablePage, blocked_flow_class: &str) -> IndexDocument {
    let summary = format!("generic transform indicates a blocked flow ({blocked_flow_class})");
    let record = DiagnosticRecord::new(
        DiagnosticSeverity::Warning,
        "INDEX-GENERIC-BLOCKED",
        format!("blocked-flow class: {blocked_flow_class}"),
    )
    .with_field("title", &page.title)
    .with_field("blocked_flow_class", blocked_flow_class);
    let recovery_actions = [
        DiagnosticAction::TryHeadless,
        DiagnosticAction::Extract,
        DiagnosticAction::Capture,
        DiagnosticAction::AddFixture,
    ];

    let mut document = FailureDiagnostic::new(
        page.title.clone(),
        DiagnosticSource::GenericTransformer,
        DiagnosticConfidence::Low,
        summary,
    )
    .with_likely_cause(FailureCause::BlockedByPolicy)
    .with_fallback("read-only extraction and fixture capture")
    .with_tried("static HTML parse")
    .with_tried("readability extraction")
    .with_tried("generic instruction program")
    .with_actions(recovery_actions)
    .with_command(":extract links")
    .with_command(":capture save blocked-flow.capture")
    .with_record(record)
    .into_document();

    document.metadata.language = page.metadata.language.clone();
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document
}
#[cfg(test)]
mod tests {
use index_core::{DocumentQualityCategory, IndexNode, SectionRole};
use index_headless::{
AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot,
};
use super::{
Transformer, state::Empty, transform_headless_failure, transform_headless_snapshot,
transform_html_cached,
};
/// Counts `IndexNode::Link` nodes in `nodes`, descending into sections.
fn count_links(nodes: &[IndexNode]) -> usize {
    let mut total = 0;
    for node in nodes {
        total += match node {
            IndexNode::Link(_) => 1,
            IndexNode::Section { nodes: nested, .. } => count_links(nested),
            _ => 0,
        };
    }
    total
}
// Smoke test for the full typestate chain: a title plus one paragraph must yield a titled,
// non-empty document rated `StrongGeneric`.
#[test]
fn typestate_pipeline_emits_document() {
let document = Transformer::<Empty>::new()
.fetched(r#"<title>Hello</title><p>Index works.</p>"#)
.parse()
.extract()
.transform()
.into_document();
assert_eq!(document.title, "Hello");
assert!(!document.nodes.is_empty());
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| quality.category),
Some(DocumentQualityCategory::StrongGeneric)
);
}
// Transforming the same URL and HTML twice must leave exactly one cache entry and produce
// documents with the same title.
#[test]
fn cached_transform_reuses_matching_source_and_content() {
let mut cache = super::TransformedDocumentCache::new();
let first = transform_html_cached(
&mut cache,
Some("https://example.org"),
r#"<title>Hello</title><p>Index works.</p>"#,
);
let second = transform_html_cached(
&mut cache,
Some("https://example.org"),
r#"<title>Hello</title><p>Index works.</p>"#,
);
assert_eq!(first.title, second.title);
assert_eq!(cache.len(), 1);
}
// Runs each bundled performance fixture through the cached transform. All fixtures share one
// source URL, so the final length check expects the cache to keep one entry per distinct HTML.
#[test]
fn performance_fixtures_transform_through_cache() {
let fixtures = [
include_str!("../tests/fixtures/performance/large-doc.html"),
include_str!("../tests/fixtures/performance/large-table.html"),
include_str!("../tests/fixtures/performance/listing.html"),
include_str!("../tests/fixtures/performance/forum.html"),
];
let mut cache = super::TransformedDocumentCache::new();
for (index, fixture) in fixtures.iter().enumerate() {
let document =
transform_html_cached(&mut cache, Some("fixture://performance"), *fixture);
assert!(
!document.nodes.is_empty(),
"performance fixture {index} should transform"
);
}
assert_eq!(cache.len(), fixtures.len());
}
// Feeds a page made of 1200 anchors and no paragraphs. The output must contain at most 300 link
// nodes (counted through sections) and must carry the "Diagnostic" section.
#[test]
fn generic_transform_bounds_very_large_link_sets() {
let mut html = String::from("<html><head><title>Large Links</title></head><body><main>");
for index in 0..1200 {
html.push_str(&format!(
"<a href=\"https://example.com/{index}\">Link {index}</a>"
));
}
html.push_str("</main></body></html>");
let document = Transformer::<Empty>::new()
.fetched(html)
.parse()
.extract()
.transform()
.into_document();
assert!(count_links(&document.nodes) <= 300);
assert!(document.nodes.iter().any(|node| matches!(
node,
IndexNode::Section {
title: Some(title),
..
} if title == "Diagnostic"
)));
}
/// The generic program must emit paragraph nodes before link nodes.
///
/// Both positions are unwrapped before comparing. `Option` orders `None` below every `Some`, so
/// comparing the two `Option<usize>` values directly would also pass when no paragraph was
/// emitted at all.
#[test]
fn transformer_emits_links_after_paragraphs() {
    let document = Transformer::<Empty>::new()
        .fetched(r#"<title>Hello</title><p>Body.</p><a href="https://example.com">Example</a>"#)
        .parse()
        .extract()
        .transform()
        .into_document();
    let paragraph_position = document
        .nodes
        .iter()
        .position(|node| matches!(node, IndexNode::Paragraph(_)));
    let link_position = document
        .nodes
        .iter()
        .position(|node| matches!(node, IndexNode::Link(_)));
    match (paragraph_position, link_position) {
        (Some(paragraph), Some(link)) => assert!(
            paragraph < link,
            "paragraph at {paragraph} should precede link at {link}"
        ),
        (paragraph, link) => panic!(
            "expected both a paragraph and a link node, got paragraph={paragraph:?} link={link:?}"
        ),
    }
}
// Checks that the meta description reaches the document metadata and that a heading, list, code
// block, table, and image from the markup each appear as a top-level node.
#[test]
fn transformer_emits_static_reader_nodes_and_metadata() {
let document = Transformer::<Empty>::new()
.fetched(
r#"
<html>
<head>
<meta name="description" content="Reader docs">
<link rel="canonical" href="https://example.com/docs">
</head>
<main>
<h2>Install</h2>
<ul><li>Read docs</li><li>Run locally</li></ul>
<pre><code class="language-sh">cargo install index</code></pre>
<table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
<img src="logo.png" alt="Index logo">
</main>
</html>
"#,
)
.parse()
.extract()
.transform()
.into_document();
assert_eq!(
document.metadata.description.as_deref(),
Some("Reader docs")
);
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::Heading { level: 2, text } if text == "Install")
));
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::CodeBlock { .. }))
);
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::List { .. }))
);
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::Table { .. }))
);
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::Image { alt, .. } if alt == "Index logo"))
);
}
// A page whose canonical URL points at a GitHub repository must be handled by the
// `github.repository` adapter: adapter id set, `Adapter` quality, adapter-specific title.
#[test]
fn transformer_uses_site_adapter_when_canonical_url_matches() {
let document = Transformer::<Empty>::new()
.fetched(
r#"
<head><link rel="canonical" href="https://github.com/index-rs/index"></head>
<main><p>Generic repository noise.</p><a href="/issues">Issues</a></main>
"#,
)
.parse()
.extract()
.transform()
.into_document();
assert_eq!(
document.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("github.repository")
);
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| quality.category),
Some(DocumentQualityCategory::Adapter)
);
assert!(document.title.contains("GitHub repository"));
}
// A canonical URL on example.com must leave `adapter_id` unset, and the paragraph text must
// appear as a top-level paragraph node.
#[test]
fn transformer_falls_back_to_generic_transformer_for_unknown_sites() {
let document = Transformer::<Empty>::new()
.fetched(
r#"
<head><link rel="canonical" href="https://example.com/article"></head>
<main><p>Generic article body.</p></main>
"#,
)
.parse()
.extract()
.transform()
.into_document();
assert_eq!(document.metadata.adapter_id, None);
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::Paragraph(text) if text == "Generic article body.")
));
}
// With no accessibility tree, the snapshot is transformed from its DOM: the final URL becomes
// the canonical URL and the rendered paragraph is emitted.
#[test]
fn transforms_rendered_dom_snapshot() -> Result<(), Box<dyn std::error::Error>> {
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/app")?,
dom_html: "<main><h1>Rendered</h1><p>Loaded by fallback.</p></main>".to_owned(),
accessibility: None,
};
let document = transform_headless_snapshot(&snapshot);
assert_eq!(
document.metadata.canonical_url.as_deref(),
Some("https://example.com/app")
);
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::Paragraph(text) if text == "Loaded by fallback.")
));
Ok(())
}
// Empty DOM body plus a single labelled button in the accessibility tree: the document must
// contain the paragraph "button: Search" and be rated `Fallback`.
// NOTE(review): one semantic, one named node is below the `is_confident` threshold, so this
// presumably exercises the `text_content()` fallback path — confirm against index_headless.
#[test]
fn transforms_accessibility_snapshot_when_dom_is_empty()
-> Result<(), Box<dyn std::error::Error>> {
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/spa")?,
dom_html: "<main></main>".to_owned(),
accessibility: Some(AccessibilitySnapshot {
nodes: vec![AccessibilityNode::leaf("button", "Search")],
}),
};
let document = transform_headless_snapshot(&snapshot);
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::Paragraph(text) if text == "button: Search"))
);
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| quality.category),
Some(DocumentQualityCategory::Fallback)
);
Ok(())
}
// A rich accessibility tree (main landmark with heading, paragraph, and list) must win over the
// DOM: the title comes from the heading, the quality is `Fallback` with the capped score 82, the
// main section is expanded, and the DOM paragraph is absent.
#[test]
fn accessibility_first_maps_roles_and_scores_confidence()
-> Result<(), Box<dyn std::error::Error>> {
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/a11y")?,
dom_html: "<main><p>DOM fallback should not win.</p></main>".to_owned(),
accessibility: Some(AccessibilitySnapshot {
nodes: vec![AccessibilityNode {
role: "main".to_owned(),
name: "Application".to_owned(),
children: vec![
AccessibilityNode::leaf("heading", "Accessible Title"),
AccessibilityNode::leaf("paragraph", "Readable accessible text."),
AccessibilityNode {
role: "list".to_owned(),
name: String::new(),
children: vec![
AccessibilityNode::leaf("listitem", "First"),
AccessibilityNode::leaf("listitem", "Second"),
],
},
],
}],
}),
};
let document = transform_headless_snapshot(&snapshot);
assert_eq!(document.title, "Accessible Title");
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| (quality.category, quality.score)),
Some((DocumentQualityCategory::Fallback, 82))
);
assert!(document.nodes.iter().any(|node| matches!(
node,
IndexNode::Section {
role: SectionRole::Main,
collapsed: false,
..
}
)));
assert!(!document.nodes.iter().any(
|node| matches!(node, IndexNode::Paragraph(text) if text == "DOM fallback should not win.")
));
Ok(())
}
// The DOM contains the same "Docs" anchor twice. After merging into the accessibility document
// there must be exactly one top-level `Link` labelled "Docs", alongside the "link: Docs"
// paragraph produced from the accessibility tree.
#[test]
fn accessibility_first_merges_dom_links_without_duplicate_link_nodes()
-> Result<(), Box<dyn std::error::Error>> {
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/app")?,
dom_html: "<main><a href=\"/docs\">Docs</a><a href=\"/docs\">Docs</a></main>"
.to_owned(),
accessibility: Some(AccessibilitySnapshot {
nodes: vec![
AccessibilityNode::leaf("heading", "App"),
AccessibilityNode::leaf("link", "Docs"),
],
}),
};
let document = transform_headless_snapshot(&snapshot);
let links = document
.nodes
.iter()
.filter(|node| matches!(node, IndexNode::Link(link) if link.text == "Docs"))
.count();
assert_eq!(links, 1);
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::Paragraph(text) if text == "link: Docs"))
);
Ok(())
}
// A single non-semantic "generic" node is not enough evidence, so the rendered DOM is used: the
// DOM paragraph must be present and the accessibility label must not be.
#[test]
fn sparse_accessibility_falls_back_to_rendered_dom() -> Result<(), Box<dyn std::error::Error>> {
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/sparse")?,
dom_html: "<main><h1>Rendered</h1><p>DOM body wins.</p></main>".to_owned(),
accessibility: Some(AccessibilitySnapshot {
nodes: vec![AccessibilityNode::leaf("generic", "Sparse label")],
}),
};
let document = transform_headless_snapshot(&snapshot);
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::Paragraph(text) if text == "DOM body wins."))
);
assert!(
!document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::Paragraph(text) if text == "Sparse label"))
);
Ok(())
}
// Navigation, complementary, and footer landmarks must map to collapsed Navigation, Aside, and
// Footer sections; a top-level textbox becomes a "textbox: ..." paragraph. With no heading in
// the tree, the title falls back to the DOM `<title>`.
#[test]
fn accessibility_maps_secondary_regions_and_controls() -> Result<(), Box<dyn std::error::Error>>
{
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/controls")?,
dom_html: "<title>Controls</title><main><p>DOM backup.</p></main>".to_owned(),
accessibility: Some(AccessibilitySnapshot {
nodes: vec![
AccessibilityNode {
role: "navigation".to_owned(),
name: "Site navigation".to_owned(),
children: vec![AccessibilityNode::leaf("link", "Home")],
},
AccessibilityNode {
role: "complementary".to_owned(),
name: "Related".to_owned(),
children: vec![AccessibilityNode::leaf("button", "Subscribe")],
},
AccessibilityNode {
role: "footer".to_owned(),
name: "Footer".to_owned(),
children: vec![AccessibilityNode::leaf("checkbox", "Accept")],
},
AccessibilityNode::leaf("textbox", "Search docs"),
],
}),
};
let document = transform_headless_snapshot(&snapshot);
assert_eq!(document.title, "Controls");
assert!(document.nodes.iter().any(|node| matches!(
node,
IndexNode::Section {
role: SectionRole::Navigation,
collapsed: true,
..
}
)));
assert!(document.nodes.iter().any(|node| matches!(
node,
IndexNode::Section {
role: SectionRole::Aside,
collapsed: true,
..
}
)));
assert!(document.nodes.iter().any(|node| matches!(
node,
IndexNode::Section {
role: SectionRole::Footer,
collapsed: true,
..
}
)));
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::Paragraph(text) if text == "textbox: Search docs")
));
Ok(())
}
// An unnamed list item takes its text from its children: "Alpha" and "Beta" must be joined
// into the single list item "Alpha Beta".
#[test]
fn accessibility_lists_can_use_nested_child_names() -> Result<(), Box<dyn std::error::Error>> {
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/list")?,
dom_html: "<main></main>".to_owned(),
accessibility: Some(AccessibilitySnapshot {
nodes: vec![
AccessibilityNode::leaf("heading", "Nested List"),
AccessibilityNode {
role: "list".to_owned(),
name: String::new(),
children: vec![AccessibilityNode {
role: "listitem".to_owned(),
name: String::new(),
children: vec![
AccessibilityNode::leaf("staticText", "Alpha"),
AccessibilityNode::leaf("staticText", "Beta"),
],
}],
},
],
}),
};
let document = transform_headless_snapshot(&snapshot);
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::List { items, .. } if items == &vec!["Alpha Beta".to_owned()])
));
Ok(())
}
// A node with an unrecognised role must still contribute its name as a paragraph, and its
// heading child must be emitted at the same (top) level.
#[test]
fn accessibility_unknown_roles_keep_names_and_children()
-> Result<(), Box<dyn std::error::Error>> {
let snapshot = HeadlessSnapshot {
final_url: index_core::IndexUrl::parse("https://example.com/custom")?,
dom_html: "<main></main>".to_owned(),
accessibility: Some(AccessibilitySnapshot {
nodes: vec![AccessibilityNode {
role: "custom-widget".to_owned(),
name: "Widget".to_owned(),
children: vec![AccessibilityNode::leaf("heading", "Widget Title")],
}],
}),
};
let document = transform_headless_snapshot(&snapshot);
assert!(
document
.nodes
.iter()
.any(|node| matches!(node, IndexNode::Paragraph(text) if text == "Widget"))
);
assert!(
document.nodes.iter().any(
|node| matches!(node, IndexNode::Heading { text, .. } if text == "Widget Title")
)
);
Ok(())
}
// A timeout error must become a document titled "Headless fallback failed" whose error node
// quotes the timeout message.
#[test]
fn transforms_headless_failure_to_deterministic_error_document() {
let document = transform_headless_failure(&HeadlessError::TimedOut { timeout_ms: 10 });
assert_eq!(document.title, "Headless fallback failed");
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::Error(text) if text.contains("timed out after 10ms"))
));
}
// A page with a title but an empty `<main>` must produce the "did not find readable" error, a
// list item mentioning "confidence: failed", and a `Failed` quality category.
#[test]
fn generic_transformer_reports_missing_readable_content() {
let document = Transformer::<Empty>::new()
.fetched("<html><title>Empty</title><main></main></html>")
.parse()
.extract()
.transform()
.into_document();
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::Error(text) if text.contains("did not find readable"))
));
assert!(document.nodes.iter().any(
|node| matches!(node, IndexNode::List { items, .. } if items.iter().any(|item| item.contains("confidence: failed")))
));
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| quality.category),
Some(DocumentQualityCategory::Failed)
);
}
// A page containing only one link must keep its content but gain the collapsed "Diagnostic"
// section and a `PartialGeneric` quality category.
#[test]
fn sparse_pages_include_low_confidence_diagnostic_section() {
let document = Transformer::<Empty>::new()
.fetched(
"<html><title>Sparse</title><main><a href=\"/only\">Only link</a></main></html>",
)
.parse()
.extract()
.transform()
.into_document();
assert!(document.nodes.iter().any(|node| matches!(
node,
IndexNode::Section {
title: Some(title),
collapsed: true,
..
} if title == "Diagnostic"
)));
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| quality.category),
Some(DocumentQualityCategory::PartialGeneric)
);
}
// One fixture per blocked-flow class. For each, the debug rendering of the document nodes must
// mention the `INDEX-GENERIC-BLOCKED` code, the class name, and the capture command, and the
// quality category must be `Failed`.
#[test]
fn blocked_flow_guardrails_cover_required_classes() {
let cases = [
(
"auth-wall",
"<html><title>Sign in</title><main><p>Please log in to continue</p></main></html>",
),
(
"script-gate",
"<html><title>JavaScript required</title><main><p>Enable JavaScript to continue in app</p></main></html>",
),
(
"bot-gate",
"<html><title>Robot check</title><main><p>Captcha: verify you are human</p></main></html>",
),
(
"geo-gate",
"<html><title>Not available</title><main><p>This content is not available in your region</p></main></html>",
),
(
"age-gate",
"<html><title>Age verification</title><main><p>Confirm your age (18+) to continue</p></main></html>",
),
(
"policy-blocked",
"<html><title>Forbidden</title><main><p>Access denied by policy</p></main></html>",
),
];
for (class_name, html) in cases {
let document = Transformer::<Empty>::new()
.fetched(html)
.parse()
.extract()
.transform()
.into_document();
let rendered = format!("{:?}", document.nodes);
assert!(
rendered.contains("INDEX-GENERIC-BLOCKED"),
"missing blocked diagnostic code for {class_name}"
);
assert!(
rendered.contains(class_name),
"missing blocked-flow class in diagnostic for {class_name}"
);
assert!(
rendered.contains(":capture save blocked-flow.capture"),
"missing capture guidance for {class_name}"
);
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| quality.category),
Some(DocumentQualityCategory::Failed)
);
}
}
// Transforming the same blocked-flow page twice must yield equal documents.
#[test]
fn blocked_flow_failure_document_is_deterministic() {
let html = "<html><title>Access denied</title><main><p>Blocked by policy</p></main></html>";
let first = Transformer::<Empty>::new()
.fetched(html)
.parse()
.extract()
.transform()
.into_document();
let second = Transformer::<Empty>::new()
.fetched(html)
.parse()
.extract()
.transform()
.into_document();
assert_eq!(first, second);
}
// A page whose `<main>` holds only `<canvas>` and `<template>` must be reported as a failure:
// `INDEX-GENERIC-EMPTY` code, "confidence: failed" text, and a `Failed` quality category.
#[test]
fn unsupported_page_shape_never_looks_successful() {
let document = Transformer::<Empty>::new()
.fetched(
"<html><title>Unsupported</title><main><canvas></canvas><template></template></main></html>",
)
.parse()
.extract()
.transform()
.into_document();
let rendered = format!("{:?}", document.nodes);
assert!(
rendered.contains("INDEX-GENERIC-EMPTY"),
"unsupported page should emit generic empty diagnostic"
);
assert!(
rendered.contains("confidence: failed"),
"unsupported page should be marked failed"
);
assert_eq!(
document
.metadata
.quality
.as_ref()
.map(|quality| quality.category),
Some(DocumentQualityCategory::Failed)
);
}
}