1use std::marker::PhantomData;
4
5use index_core::{
6 DiagnosticAction, DiagnosticConfidence, DiagnosticRecord, DiagnosticSeverity, DiagnosticSource,
7 DocumentQuality, DocumentQualityCategory, FailureCause, FailureDiagnostic, IndexDocument,
8 IndexNode, Link, SectionRole,
9};
10use index_dom::{HtmlDocument, parse_html};
11use index_headless::{AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot};
12use index_readability::ReadablePage;
13
14pub mod adapter;
15pub mod cache;
16pub mod instruction;
17pub mod manifest;
18pub mod state;
19
20use adapter::{AdapterContext, AdapterRegistry};
21use instruction::{ApplyMetadata, EmitLinks, EmitReadableNodes, EmitTitle, Instruction};
22use state::{Empty, Extracted, Fetched, Parsed, Transformed};
23
24pub use cache::{TransformCacheKey, TransformedDocumentCache};
25pub use manifest::apply_index_manifest_hints;
26
27#[derive(Debug, Clone)]
29pub struct Transformer<S> {
30 raw_html: Option<String>,
31 parsed: Option<HtmlDocument>,
32 extracted: Option<ReadablePage>,
33 document: Option<IndexDocument>,
34 _state: PhantomData<S>,
35}
36
37#[must_use]
39pub fn transform_html_cached(
40 cache: &mut TransformedDocumentCache,
41 source_url: Option<&str>,
42 html: impl Into<String>,
43) -> IndexDocument {
44 let html = html.into();
45 let key = TransformCacheKey::new(source_url, &html);
46 if let Some(document) = cache.get(&key) {
47 return document;
48 }
49
50 let document = Transformer::<Empty>::new()
51 .fetched(html)
52 .parse()
53 .extract()
54 .transform()
55 .into_document();
56 cache.insert(key, document.clone());
57 document
58}
59
60impl Transformer<Empty> {
61 #[must_use]
63 pub const fn new() -> Self {
64 Self {
65 raw_html: None,
66 parsed: None,
67 extracted: None,
68 document: None,
69 _state: PhantomData,
70 }
71 }
72
73 #[must_use]
75 pub fn fetched(self, raw_html: impl Into<String>) -> Transformer<Fetched> {
76 Transformer {
77 raw_html: Some(raw_html.into()),
78 parsed: None,
79 extracted: None,
80 document: None,
81 _state: PhantomData,
82 }
83 }
84}
85
86impl Default for Transformer<Empty> {
87 fn default() -> Self {
88 Self::new()
89 }
90}
91
92impl Transformer<Fetched> {
93 #[must_use]
95 pub fn parse(self) -> Transformer<Parsed> {
96 let raw_html = self.raw_html.unwrap_or_default();
97 let parsed = parse_html(raw_html.clone());
98
99 Transformer {
100 raw_html: Some(raw_html),
101 parsed: Some(parsed),
102 extracted: None,
103 document: None,
104 _state: PhantomData,
105 }
106 }
107}
108
109impl Transformer<Parsed> {
110 #[must_use]
112 pub fn extract(self) -> Transformer<Extracted> {
113 let extracted = self.parsed.as_ref().map(ReadablePage::from_html);
114
115 Transformer {
116 raw_html: self.raw_html,
117 parsed: self.parsed,
118 extracted,
119 document: None,
120 _state: PhantomData,
121 }
122 }
123}
124
125impl Transformer<Extracted> {
126 #[must_use]
128 pub fn transform(self) -> Transformer<Transformed> {
129 let page = self.extracted.unwrap_or_else(|| ReadablePage {
130 title: "Untitled".to_owned(),
131 paragraphs: Vec::new(),
132 nodes: Vec::new(),
133 links: Vec::new(),
134 forms: Vec::new(),
135 metadata: Default::default(),
136 });
137
138 let context = AdapterContext { page: &page };
139 let document = AdapterRegistry::default_registry()
140 .transform(&context)
141 .unwrap_or_else(|| transform_generic(&page));
142
143 Transformer {
144 raw_html: self.raw_html,
145 parsed: self.parsed,
146 extracted: Some(page),
147 document: Some(document),
148 _state: PhantomData,
149 }
150 }
151}
152
153impl Transformer<Transformed> {
154 #[must_use]
156 pub fn into_document(self) -> IndexDocument {
157 self.document.unwrap_or_default()
158 }
159}
160
161#[must_use]
163pub fn transform_headless_snapshot(snapshot: &HeadlessSnapshot) -> IndexDocument {
164 let mut parsed = parse_html(snapshot.dom_html.clone());
165 if parsed.metadata.canonical_url.is_none() {
166 parsed.metadata.canonical_url = Some(snapshot.final_url.to_string());
167 }
168
169 let page = ReadablePage::from_html(&parsed);
170 if let Some(accessibility) = &snapshot.accessibility {
171 if let Some(mut document) = accessibility_document(accessibility, snapshot, &page) {
172 merge_dom_links(&mut document, &page);
173 return document;
174 }
175 }
176
177 if page.has_body() {
178 return AdapterRegistry::default_registry()
179 .transform(&AdapterContext { page: &page })
180 .unwrap_or_else(|| transform_generic(&page));
181 }
182
183 let mut document = IndexDocument::titled("Headless snapshot");
184 document.metadata.canonical_url = Some(snapshot.final_url.to_string());
185 if let Some(accessibility) = &snapshot.accessibility {
186 let text = accessibility.text_content();
187 if !text.is_empty() {
188 document.push(IndexNode::Paragraph(text));
189 }
190 }
191 if document.is_empty() {
192 return FailureDiagnostic::new(
193 "Headless snapshot unreadable",
194 DiagnosticSource::Headless,
195 DiagnosticConfidence::Failed,
196 "headless snapshot did not contain readable content",
197 )
198 .with_fallback("accessibility tree text extraction")
199 .with_tried("headless DOM snapshot")
200 .with_tried("accessibility tree extraction")
201 .with_actions([DiagnosticAction::Retry, DiagnosticAction::Capture])
202 .with_command(":capture save headless-unreadable.capture")
203 .with_record(DiagnosticRecord::new(
204 DiagnosticSeverity::Error,
205 "INDEX-HEADLESS-EMPTY",
206 format!("final_url={}", snapshot.final_url),
207 ))
208 .into_document();
209 }
210 document.metadata.quality = Some(DocumentQuality::new(
211 DocumentQualityCategory::Fallback,
212 55,
213 [
214 "headless accessibility fallback".to_owned(),
215 "DOM body was not readable".to_owned(),
216 ],
217 ));
218 document
219}
220
221fn accessibility_document(
222 accessibility: &AccessibilitySnapshot,
223 snapshot: &HeadlessSnapshot,
224 page: &ReadablePage,
225) -> Option<IndexDocument> {
226 let mut nodes = Vec::new();
227 let mut evidence = AccessibilityEvidence::default();
228 for node in &accessibility.nodes {
229 append_accessibility_node(node, &mut nodes, &mut evidence);
230 }
231 if !evidence.is_confident() {
232 return None;
233 }
234
235 let title = nodes
236 .iter()
237 .find_map(first_heading_text)
238 .filter(|title| !title.trim().is_empty())
239 .unwrap_or_else(|| {
240 if page.title.trim().is_empty() {
241 "Headless snapshot".to_owned()
242 } else {
243 page.title.clone()
244 }
245 });
246 let mut document = IndexDocument::titled(title);
247 document.metadata.canonical_url = Some(snapshot.final_url.to_string());
248 document.nodes = nodes;
249 document.metadata.quality = Some(DocumentQuality::new(
250 DocumentQualityCategory::Fallback,
251 evidence.score(),
252 [
253 "accessibility tree supplied semantic roles".to_owned(),
254 "headless DOM links merged when available".to_owned(),
255 ],
256 ));
257 Some(document)
258}
259
260fn first_heading_text(node: &IndexNode) -> Option<String> {
261 match node {
262 IndexNode::Heading { text, .. } if !text.trim().is_empty() => Some(text.clone()),
263 IndexNode::Section { nodes, .. } => nodes.iter().find_map(first_heading_text),
264 _ => None,
265 }
266}
267
/// Running tally of how much usable structure an accessibility tree has supplied.
#[derive(Clone, Copy, Debug, Default)]
struct AccessibilityEvidence {
    /// Number of observed nodes with a non-blank name.
    named_nodes: usize,
    /// Number of observed nodes flagged as semantic by the caller.
    semantic_nodes: usize,
}

impl AccessibilityEvidence {
    /// Records one node: counts it as named when `name` is non-blank and as semantic
    /// when `semantic` is set.
    fn observe(&mut self, semantic: bool, name: &str) {
        self.named_nodes += usize::from(!name.trim().is_empty());
        self.semantic_nodes += usize::from(semantic);
    }

    /// True with at least two semantic nodes, or one semantic node plus two named nodes.
    fn is_confident(self) -> bool {
        match self.semantic_nodes {
            0 => false,
            1 => self.named_nodes >= 2,
            _ => true,
        }
    }

    /// Quality score: base 50, +8 per semantic node and +3 per named node (each counted
    /// up to four times), capped at 82.
    fn score(self) -> u8 {
        // Both counts are clamped to 4 before the cast, so the narrowing is lossless.
        let semantic_bonus = self.semantic_nodes.min(4) as u8 * 8;
        let named_bonus = self.named_nodes.min(4) as u8 * 3;
        (50 + semantic_bonus + named_bonus).min(82)
    }
}
294
295fn append_accessibility_node(
296 node: &AccessibilityNode,
297 output: &mut Vec<IndexNode>,
298 evidence: &mut AccessibilityEvidence,
299) {
300 let role = node.role.trim().to_ascii_lowercase();
301 let name = node.name.trim();
302 match role.as_str() {
303 "main" | "article" | "navigation" | "complementary" | "contentinfo" | "footer" => {
304 let mut children = Vec::new();
305 for child in &node.children {
306 append_accessibility_node(child, &mut children, evidence);
307 }
308 if !children.is_empty() {
309 evidence.observe(true, name);
310 output.push(IndexNode::Section {
311 role: accessibility_section_role(&role),
312 title: (!name.is_empty()).then(|| name.to_owned()),
313 collapsed: !matches!(role.as_str(), "main" | "article"),
314 nodes: children,
315 });
316 }
317 }
318 "heading" => {
319 if !name.is_empty() {
320 evidence.observe(true, name);
321 output.push(IndexNode::Heading {
322 level: 2,
323 text: name.to_owned(),
324 });
325 }
326 }
327 "paragraph" | "text" | "statictext" | "generic" => {
328 if !name.is_empty() {
329 evidence.observe(false, name);
330 output.push(IndexNode::Paragraph(name.to_owned()));
331 }
332 }
333 "link" | "button" | "searchbox" | "textbox" | "checkbox" => {
334 if !name.is_empty() {
335 evidence.observe(true, name);
336 output.push(IndexNode::Paragraph(format!("{role}: {name}")));
337 }
338 }
339 "list" => {
340 let items = accessibility_list_items(&node.children);
341 if !items.is_empty() {
342 evidence.observe(true, name);
343 output.push(IndexNode::List {
344 ordered: false,
345 items,
346 });
347 }
348 }
349 "listitem" => {
350 if !name.is_empty() {
351 evidence.observe(true, name);
352 output.push(IndexNode::Paragraph(name.to_owned()));
353 }
354 }
355 _ => {
356 if !name.is_empty() {
357 evidence.observe(false, name);
358 output.push(IndexNode::Paragraph(name.to_owned()));
359 }
360 for child in &node.children {
361 append_accessibility_node(child, output, evidence);
362 }
363 }
364 }
365}
366
367fn accessibility_section_role(role: &str) -> SectionRole {
368 match role {
369 "main" | "article" => SectionRole::Main,
370 "navigation" => SectionRole::Navigation,
371 "complementary" => SectionRole::Aside,
372 "contentinfo" | "footer" => SectionRole::Footer,
373 _ => SectionRole::Unknown,
374 }
375}
376
377fn accessibility_list_items(children: &[AccessibilityNode]) -> Vec<String> {
378 children
379 .iter()
380 .filter_map(|child| {
381 let name = child.name.trim();
382 if name.is_empty() {
383 let nested = child
384 .children
385 .iter()
386 .filter_map(|grandchild| {
387 let name = grandchild.name.trim();
388 (!name.is_empty()).then(|| name.to_owned())
389 })
390 .collect::<Vec<_>>();
391 (!nested.is_empty()).then(|| nested.join(" "))
392 } else {
393 Some(name.to_owned())
394 }
395 })
396 .collect()
397}
398
399fn merge_dom_links(document: &mut IndexDocument, page: &ReadablePage) {
400 let mut existing = document_link_labels(document);
401 for link in &page.links {
402 if existing.iter().any(|label| label == &link.text) {
403 continue;
404 }
405 existing.push(link.text.clone());
406 document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
407 }
408}
409
410fn document_link_labels(document: &IndexDocument) -> Vec<String> {
411 document.nodes.iter().filter_map(link_label).collect()
412}
413
414fn link_label(node: &IndexNode) -> Option<String> {
415 match node {
416 IndexNode::Link(link) => Some(link.text.clone()),
417 IndexNode::Section { nodes, .. } => nodes.iter().find_map(link_label),
418 _ => None,
419 }
420}
421
422#[must_use]
424pub fn transform_headless_failure(error: &HeadlessError) -> IndexDocument {
425 FailureDiagnostic::new(
426 "Headless fallback failed",
427 DiagnosticSource::Headless,
428 DiagnosticConfidence::Failed,
429 error.to_string(),
430 )
431 .with_fallback("static transform or retry")
432 .with_tried("headless backend execution")
433 .with_actions([DiagnosticAction::Retry, DiagnosticAction::Capture])
434 .with_command(":capture save headless-failure.capture")
435 .with_record(DiagnosticRecord::new(
436 DiagnosticSeverity::Error,
437 "INDEX-HEADLESS-FAILED",
438 error.to_string(),
439 ))
440 .into_document()
441}
442
443fn default_program() -> Vec<Box<dyn Instruction>> {
444 vec![
445 Box::new(ApplyMetadata),
446 Box::new(EmitTitle),
447 Box::new(EmitReadableNodes),
448 Box::new(EmitLinks),
449 ]
450}
451
452fn transform_generic(page: &ReadablePage) -> IndexDocument {
453 let mut document = IndexDocument::titled(page.title.clone());
454 let program = default_program();
455
456 for instruction in program {
457 instruction.execute(page, &mut document);
458 }
459
460 if let Some(blocked_flow_class) = blocked_flow_hint(page) {
461 return generic_blocked_flow_document(page, blocked_flow_class);
462 }
463
464 if !page.has_body() && page.links.is_empty() && page.forms.is_empty() {
465 document = FailureDiagnostic::new(
466 page.title.clone(),
467 DiagnosticSource::GenericTransformer,
468 DiagnosticConfidence::Failed,
469 "generic transformer did not find readable page content",
470 )
471 .with_fallback("generic static reader")
472 .with_tried("static HTML parse")
473 .with_tried("readability extraction")
474 .with_tried("generic instruction program")
475 .with_actions([
476 DiagnosticAction::TryHeadless,
477 DiagnosticAction::Extract,
478 DiagnosticAction::Capture,
479 DiagnosticAction::AddFixture,
480 ])
481 .with_command(":extract links")
482 .with_command(":capture save unsupported-page.capture")
483 .with_record(
484 DiagnosticRecord::new(
485 DiagnosticSeverity::Error,
486 "INDEX-GENERIC-EMPTY",
487 "no readable headings, paragraphs, tables, forms, links, or sections were emitted",
488 )
489 .with_field("title", &page.title),
490 )
491 .into_document();
492 } else if page.paragraphs.is_empty() && page.nodes.len() <= 2 {
493 document.push(IndexNode::Section {
494 role: index_core::SectionRole::Unknown,
495 title: Some("Diagnostic".to_owned()),
496 collapsed: true,
497 nodes: vec![
498 IndexNode::Error(
499 "low-confidence transform: only sparse page structure was found".to_owned(),
500 ),
501 IndexNode::List {
502 ordered: false,
503 items: vec![
504 "try :extract links or :extract json".to_owned(),
505 "capture a redacted fixture if the page shape matters".to_owned(),
506 ],
507 },
508 ],
509 });
510 document.metadata.quality = Some(DocumentQuality::new(
511 DocumentQualityCategory::PartialGeneric,
512 45,
513 [
514 "sparse generic structure".to_owned(),
515 "diagnostic section attached".to_owned(),
516 ],
517 ));
518 } else {
519 document.metadata.quality = Some(DocumentQuality::new(
520 DocumentQualityCategory::StrongGeneric,
521 82,
522 [
523 "generic reader emitted semantic content".to_owned(),
524 "no low-confidence diagnostic attached".to_owned(),
525 ],
526 ));
527 }
528
529 document
530}
531
532fn blocked_flow_hint(page: &ReadablePage) -> Option<&'static str> {
533 let sparse_shape = page.paragraphs.len() <= 2 && page.forms.is_empty() && page.links.len() <= 3;
534 if !sparse_shape {
535 return None;
536 }
537
538 let mut haystack = page.title.to_ascii_lowercase();
539 for paragraph in &page.paragraphs {
540 haystack.push('\n');
541 haystack.push_str(¶graph.to_ascii_lowercase());
542 }
543
544 if haystack.contains("captcha")
545 || haystack.contains("verify you are human")
546 || haystack.contains("robot check")
547 || haystack.contains("cloudflare")
548 {
549 return Some("bot-gate");
550 }
551 if haystack.contains("not available in your region")
552 || haystack.contains("not available in your country")
553 || haystack.contains("geo-restricted")
554 || haystack.contains("geoblocked")
555 {
556 return Some("geo-gate");
557 }
558 if haystack.contains("age verification")
559 || haystack.contains("adults only")
560 || haystack.contains("18+")
561 || haystack.contains("confirm your age")
562 {
563 return Some("age-gate");
564 }
565 if haystack.contains("access denied")
566 || haystack.contains("forbidden")
567 || haystack.contains("blocked by policy")
568 || haystack.contains("violates our terms")
569 || haystack.contains("not permitted")
570 {
571 return Some("policy-blocked");
572 }
573 if haystack.contains("enable javascript")
574 || haystack.contains("requires javascript")
575 || haystack.contains("continue in app")
576 || haystack.contains("app is not available")
577 {
578 return Some("script-gate");
579 }
580 if haystack.contains("log in")
581 || haystack.contains("sign in")
582 || haystack.contains("create account")
583 || haystack.contains("authentication required")
584 || haystack.contains("please log in")
585 {
586 return Some("auth-wall");
587 }
588 None
589}
590
591fn generic_blocked_flow_document(page: &ReadablePage, blocked_flow_class: &str) -> IndexDocument {
592 let mut document = FailureDiagnostic::new(
593 page.title.clone(),
594 DiagnosticSource::GenericTransformer,
595 DiagnosticConfidence::Low,
596 format!("generic transform indicates a blocked flow ({blocked_flow_class})"),
597 )
598 .with_likely_cause(FailureCause::BlockedByPolicy)
599 .with_fallback("read-only extraction and fixture capture")
600 .with_tried("static HTML parse")
601 .with_tried("readability extraction")
602 .with_tried("generic instruction program")
603 .with_actions([
604 DiagnosticAction::TryHeadless,
605 DiagnosticAction::Extract,
606 DiagnosticAction::Capture,
607 DiagnosticAction::AddFixture,
608 ])
609 .with_command(":extract links")
610 .with_command(":capture save blocked-flow.capture")
611 .with_record(
612 DiagnosticRecord::new(
613 DiagnosticSeverity::Warning,
614 "INDEX-GENERIC-BLOCKED",
615 format!("blocked-flow class: {blocked_flow_class}"),
616 )
617 .with_field("title", &page.title)
618 .with_field("blocked_flow_class", blocked_flow_class),
619 )
620 .into_document();
621 document.metadata.canonical_url = page.metadata.canonical_url.clone();
622 document.metadata.language = page.metadata.language.clone();
623 document
624}
625
626#[cfg(test)]
627mod tests {
628 use index_core::{DocumentQualityCategory, IndexNode, SectionRole};
629 use index_headless::{
630 AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot,
631 };
632
633 use super::{
634 Transformer, state::Empty, transform_headless_failure, transform_headless_snapshot,
635 transform_html_cached,
636 };
637
638 fn count_links(nodes: &[IndexNode]) -> usize {
639 nodes
640 .iter()
641 .map(|node| match node {
642 IndexNode::Link(_) => 1,
643 IndexNode::Section { nodes, .. } => count_links(nodes),
644 _ => 0,
645 })
646 .sum()
647 }
648
649 #[test]
650 fn typestate_pipeline_emits_document() {
651 let document = Transformer::<Empty>::new()
652 .fetched(r#"<title>Hello</title><p>Index works.</p>"#)
653 .parse()
654 .extract()
655 .transform()
656 .into_document();
657
658 assert_eq!(document.title, "Hello");
659 assert!(!document.nodes.is_empty());
660 assert_eq!(
661 document
662 .metadata
663 .quality
664 .as_ref()
665 .map(|quality| quality.category),
666 Some(DocumentQualityCategory::StrongGeneric)
667 );
668 }
669
670 #[test]
671 fn cached_transform_reuses_matching_source_and_content() {
672 let mut cache = super::TransformedDocumentCache::new();
673 let first = transform_html_cached(
674 &mut cache,
675 Some("https://example.org"),
676 r#"<title>Hello</title><p>Index works.</p>"#,
677 );
678 let second = transform_html_cached(
679 &mut cache,
680 Some("https://example.org"),
681 r#"<title>Hello</title><p>Index works.</p>"#,
682 );
683
684 assert_eq!(first.title, second.title);
685 assert_eq!(cache.len(), 1);
686 }
687
688 #[test]
689 fn performance_fixtures_transform_through_cache() {
690 let fixtures = [
691 include_str!("../tests/fixtures/performance/large-doc.html"),
692 include_str!("../tests/fixtures/performance/large-table.html"),
693 include_str!("../tests/fixtures/performance/listing.html"),
694 include_str!("../tests/fixtures/performance/forum.html"),
695 ];
696 let mut cache = super::TransformedDocumentCache::new();
697
698 for (index, fixture) in fixtures.iter().enumerate() {
699 let document =
700 transform_html_cached(&mut cache, Some("fixture://performance"), *fixture);
701 assert!(
702 !document.nodes.is_empty(),
703 "performance fixture {index} should transform"
704 );
705 }
706
707 assert_eq!(cache.len(), fixtures.len());
708 }
709
710 #[test]
711 fn generic_transform_bounds_very_large_link_sets() {
712 let mut html = String::from("<html><head><title>Large Links</title></head><body><main>");
713 for index in 0..1200 {
714 html.push_str(&format!(
715 "<a href=\"https://example.com/{index}\">Link {index}</a>"
716 ));
717 }
718 html.push_str("</main></body></html>");
719
720 let document = Transformer::<Empty>::new()
721 .fetched(html)
722 .parse()
723 .extract()
724 .transform()
725 .into_document();
726
727 assert!(count_links(&document.nodes) <= 300);
728 assert!(document.nodes.iter().any(|node| matches!(
729 node,
730 IndexNode::Section {
731 title: Some(title),
732 ..
733 } if title == "Diagnostic"
734 )));
735 }
736
737 #[test]
738 fn transformer_emits_links_after_paragraphs() {
739 let document = Transformer::<Empty>::new()
740 .fetched(r#"<title>Hello</title><p>Body.</p><a href="https://example.com">Example</a>"#)
741 .parse()
742 .extract()
743 .transform()
744 .into_document();
745
746 let link_position = document
747 .nodes
748 .iter()
749 .position(|node| matches!(node, IndexNode::Link(_)));
750 let paragraph_position = document
751 .nodes
752 .iter()
753 .position(|node| matches!(node, IndexNode::Paragraph(_)));
754
755 assert!(paragraph_position < link_position);
756 }
757
758 #[test]
759 fn transformer_emits_static_reader_nodes_and_metadata() {
760 let document = Transformer::<Empty>::new()
761 .fetched(
762 r#"
763 <html>
764 <head>
765 <meta name="description" content="Reader docs">
766 <link rel="canonical" href="https://example.com/docs">
767 </head>
768 <main>
769 <h2>Install</h2>
770 <ul><li>Read docs</li><li>Run locally</li></ul>
771 <pre><code class="language-sh">cargo install index</code></pre>
772 <table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
773 <img src="logo.png" alt="Index logo">
774 </main>
775 </html>
776 "#,
777 )
778 .parse()
779 .extract()
780 .transform()
781 .into_document();
782
783 assert_eq!(
784 document.metadata.description.as_deref(),
785 Some("Reader docs")
786 );
787 assert!(document.nodes.iter().any(
788 |node| matches!(node, IndexNode::Heading { level: 2, text } if text == "Install")
789 ));
790 assert!(
791 document
792 .nodes
793 .iter()
794 .any(|node| matches!(node, IndexNode::CodeBlock { .. }))
795 );
796 assert!(
797 document
798 .nodes
799 .iter()
800 .any(|node| matches!(node, IndexNode::List { .. }))
801 );
802 assert!(
803 document
804 .nodes
805 .iter()
806 .any(|node| matches!(node, IndexNode::Table { .. }))
807 );
808 assert!(
809 document
810 .nodes
811 .iter()
812 .any(|node| matches!(node, IndexNode::Image { alt, .. } if alt == "Index logo"))
813 );
814 }
815
816 #[test]
817 fn transformer_uses_site_adapter_when_canonical_url_matches() {
818 let document = Transformer::<Empty>::new()
819 .fetched(
820 r#"
821 <head><link rel="canonical" href="https://github.com/index-rs/index"></head>
822 <main><p>Generic repository noise.</p><a href="/issues">Issues</a></main>
823 "#,
824 )
825 .parse()
826 .extract()
827 .transform()
828 .into_document();
829
830 assert_eq!(
831 document.metadata.adapter_id.as_ref().map(|id| id.as_str()),
832 Some("github.repository")
833 );
834 assert_eq!(
835 document
836 .metadata
837 .quality
838 .as_ref()
839 .map(|quality| quality.category),
840 Some(DocumentQualityCategory::Adapter)
841 );
842 assert!(document.title.contains("GitHub repository"));
843 }
844
845 #[test]
846 fn transformer_falls_back_to_generic_transformer_for_unknown_sites() {
847 let document = Transformer::<Empty>::new()
848 .fetched(
849 r#"
850 <head><link rel="canonical" href="https://example.com/article"></head>
851 <main><p>Generic article body.</p></main>
852 "#,
853 )
854 .parse()
855 .extract()
856 .transform()
857 .into_document();
858
859 assert_eq!(document.metadata.adapter_id, None);
860 assert!(document.nodes.iter().any(
861 |node| matches!(node, IndexNode::Paragraph(text) if text == "Generic article body.")
862 ));
863 }
864
865 #[test]
866 fn transforms_rendered_dom_snapshot() -> Result<(), Box<dyn std::error::Error>> {
867 let snapshot = HeadlessSnapshot {
868 final_url: index_core::IndexUrl::parse("https://example.com/app")?,
869 dom_html: "<main><h1>Rendered</h1><p>Loaded by fallback.</p></main>".to_owned(),
870 accessibility: None,
871 };
872
873 let document = transform_headless_snapshot(&snapshot);
874
875 assert_eq!(
876 document.metadata.canonical_url.as_deref(),
877 Some("https://example.com/app")
878 );
879 assert!(document.nodes.iter().any(
880 |node| matches!(node, IndexNode::Paragraph(text) if text == "Loaded by fallback.")
881 ));
882 Ok(())
883 }
884
885 #[test]
886 fn transforms_accessibility_snapshot_when_dom_is_empty()
887 -> Result<(), Box<dyn std::error::Error>> {
888 let snapshot = HeadlessSnapshot {
889 final_url: index_core::IndexUrl::parse("https://example.com/spa")?,
890 dom_html: "<main></main>".to_owned(),
891 accessibility: Some(AccessibilitySnapshot {
892 nodes: vec![AccessibilityNode::leaf("button", "Search")],
893 }),
894 };
895
896 let document = transform_headless_snapshot(&snapshot);
897
898 assert!(
899 document
900 .nodes
901 .iter()
902 .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "button: Search"))
903 );
904 assert_eq!(
905 document
906 .metadata
907 .quality
908 .as_ref()
909 .map(|quality| quality.category),
910 Some(DocumentQualityCategory::Fallback)
911 );
912 Ok(())
913 }
914
915 #[test]
916 fn accessibility_first_maps_roles_and_scores_confidence()
917 -> Result<(), Box<dyn std::error::Error>> {
918 let snapshot = HeadlessSnapshot {
919 final_url: index_core::IndexUrl::parse("https://example.com/a11y")?,
920 dom_html: "<main><p>DOM fallback should not win.</p></main>".to_owned(),
921 accessibility: Some(AccessibilitySnapshot {
922 nodes: vec![AccessibilityNode {
923 role: "main".to_owned(),
924 name: "Application".to_owned(),
925 children: vec![
926 AccessibilityNode::leaf("heading", "Accessible Title"),
927 AccessibilityNode::leaf("paragraph", "Readable accessible text."),
928 AccessibilityNode {
929 role: "list".to_owned(),
930 name: String::new(),
931 children: vec![
932 AccessibilityNode::leaf("listitem", "First"),
933 AccessibilityNode::leaf("listitem", "Second"),
934 ],
935 },
936 ],
937 }],
938 }),
939 };
940
941 let document = transform_headless_snapshot(&snapshot);
942
943 assert_eq!(document.title, "Accessible Title");
944 assert_eq!(
945 document
946 .metadata
947 .quality
948 .as_ref()
949 .map(|quality| (quality.category, quality.score)),
950 Some((DocumentQualityCategory::Fallback, 82))
951 );
952 assert!(document.nodes.iter().any(|node| matches!(
953 node,
954 IndexNode::Section {
955 role: SectionRole::Main,
956 collapsed: false,
957 ..
958 }
959 )));
960 assert!(!document.nodes.iter().any(
961 |node| matches!(node, IndexNode::Paragraph(text) if text == "DOM fallback should not win.")
962 ));
963 Ok(())
964 }
965
966 #[test]
967 fn accessibility_first_merges_dom_links_without_duplicate_link_nodes()
968 -> Result<(), Box<dyn std::error::Error>> {
969 let snapshot = HeadlessSnapshot {
970 final_url: index_core::IndexUrl::parse("https://example.com/app")?,
971 dom_html: "<main><a href=\"/docs\">Docs</a><a href=\"/docs\">Docs</a></main>"
972 .to_owned(),
973 accessibility: Some(AccessibilitySnapshot {
974 nodes: vec![
975 AccessibilityNode::leaf("heading", "App"),
976 AccessibilityNode::leaf("link", "Docs"),
977 ],
978 }),
979 };
980
981 let document = transform_headless_snapshot(&snapshot);
982 let links = document
983 .nodes
984 .iter()
985 .filter(|node| matches!(node, IndexNode::Link(link) if link.text == "Docs"))
986 .count();
987
988 assert_eq!(links, 1);
989 assert!(
990 document
991 .nodes
992 .iter()
993 .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "link: Docs"))
994 );
995 Ok(())
996 }
997
998 #[test]
999 fn sparse_accessibility_falls_back_to_rendered_dom() -> Result<(), Box<dyn std::error::Error>> {
1000 let snapshot = HeadlessSnapshot {
1001 final_url: index_core::IndexUrl::parse("https://example.com/sparse")?,
1002 dom_html: "<main><h1>Rendered</h1><p>DOM body wins.</p></main>".to_owned(),
1003 accessibility: Some(AccessibilitySnapshot {
1004 nodes: vec![AccessibilityNode::leaf("generic", "Sparse label")],
1005 }),
1006 };
1007
1008 let document = transform_headless_snapshot(&snapshot);
1009
1010 assert!(
1011 document
1012 .nodes
1013 .iter()
1014 .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "DOM body wins."))
1015 );
1016 assert!(
1017 !document
1018 .nodes
1019 .iter()
1020 .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "Sparse label"))
1021 );
1022 Ok(())
1023 }
1024
1025 #[test]
1026 fn accessibility_maps_secondary_regions_and_controls() -> Result<(), Box<dyn std::error::Error>>
1027 {
1028 let snapshot = HeadlessSnapshot {
1029 final_url: index_core::IndexUrl::parse("https://example.com/controls")?,
1030 dom_html: "<title>Controls</title><main><p>DOM backup.</p></main>".to_owned(),
1031 accessibility: Some(AccessibilitySnapshot {
1032 nodes: vec![
1033 AccessibilityNode {
1034 role: "navigation".to_owned(),
1035 name: "Site navigation".to_owned(),
1036 children: vec![AccessibilityNode::leaf("link", "Home")],
1037 },
1038 AccessibilityNode {
1039 role: "complementary".to_owned(),
1040 name: "Related".to_owned(),
1041 children: vec![AccessibilityNode::leaf("button", "Subscribe")],
1042 },
1043 AccessibilityNode {
1044 role: "footer".to_owned(),
1045 name: "Footer".to_owned(),
1046 children: vec![AccessibilityNode::leaf("checkbox", "Accept")],
1047 },
1048 AccessibilityNode::leaf("textbox", "Search docs"),
1049 ],
1050 }),
1051 };
1052
1053 let document = transform_headless_snapshot(&snapshot);
1054
1055 assert_eq!(document.title, "Controls");
1056 assert!(document.nodes.iter().any(|node| matches!(
1057 node,
1058 IndexNode::Section {
1059 role: SectionRole::Navigation,
1060 collapsed: true,
1061 ..
1062 }
1063 )));
1064 assert!(document.nodes.iter().any(|node| matches!(
1065 node,
1066 IndexNode::Section {
1067 role: SectionRole::Aside,
1068 collapsed: true,
1069 ..
1070 }
1071 )));
1072 assert!(document.nodes.iter().any(|node| matches!(
1073 node,
1074 IndexNode::Section {
1075 role: SectionRole::Footer,
1076 collapsed: true,
1077 ..
1078 }
1079 )));
1080 assert!(document.nodes.iter().any(
1081 |node| matches!(node, IndexNode::Paragraph(text) if text == "textbox: Search docs")
1082 ));
1083 Ok(())
1084 }
1085
1086 #[test]
1087 fn accessibility_lists_can_use_nested_child_names() -> Result<(), Box<dyn std::error::Error>> {
1088 let snapshot = HeadlessSnapshot {
1089 final_url: index_core::IndexUrl::parse("https://example.com/list")?,
1090 dom_html: "<main></main>".to_owned(),
1091 accessibility: Some(AccessibilitySnapshot {
1092 nodes: vec![
1093 AccessibilityNode::leaf("heading", "Nested List"),
1094 AccessibilityNode {
1095 role: "list".to_owned(),
1096 name: String::new(),
1097 children: vec![AccessibilityNode {
1098 role: "listitem".to_owned(),
1099 name: String::new(),
1100 children: vec![
1101 AccessibilityNode::leaf("staticText", "Alpha"),
1102 AccessibilityNode::leaf("staticText", "Beta"),
1103 ],
1104 }],
1105 },
1106 ],
1107 }),
1108 };
1109
1110 let document = transform_headless_snapshot(&snapshot);
1111
1112 assert!(document.nodes.iter().any(
1113 |node| matches!(node, IndexNode::List { items, .. } if items == &vec!["Alpha Beta".to_owned()])
1114 ));
1115 Ok(())
1116 }
1117
1118 #[test]
1119 fn accessibility_unknown_roles_keep_names_and_children()
1120 -> Result<(), Box<dyn std::error::Error>> {
1121 let snapshot = HeadlessSnapshot {
1122 final_url: index_core::IndexUrl::parse("https://example.com/custom")?,
1123 dom_html: "<main></main>".to_owned(),
1124 accessibility: Some(AccessibilitySnapshot {
1125 nodes: vec![AccessibilityNode {
1126 role: "custom-widget".to_owned(),
1127 name: "Widget".to_owned(),
1128 children: vec![AccessibilityNode::leaf("heading", "Widget Title")],
1129 }],
1130 }),
1131 };
1132
1133 let document = transform_headless_snapshot(&snapshot);
1134
1135 assert!(
1136 document
1137 .nodes
1138 .iter()
1139 .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "Widget"))
1140 );
1141 assert!(
1142 document.nodes.iter().any(
1143 |node| matches!(node, IndexNode::Heading { text, .. } if text == "Widget Title")
1144 )
1145 );
1146 Ok(())
1147 }
1148
1149 #[test]
1150 fn transforms_headless_failure_to_deterministic_error_document() {
1151 let document = transform_headless_failure(&HeadlessError::TimedOut { timeout_ms: 10 });
1152
1153 assert_eq!(document.title, "Headless fallback failed");
1154 assert!(document.nodes.iter().any(
1155 |node| matches!(node, IndexNode::Error(text) if text.contains("timed out after 10ms"))
1156 ));
1157 }
1158
1159 #[test]
1160 fn generic_transformer_reports_missing_readable_content() {
1161 let document = Transformer::<Empty>::new()
1162 .fetched("<html><title>Empty</title><main></main></html>")
1163 .parse()
1164 .extract()
1165 .transform()
1166 .into_document();
1167
1168 assert!(document.nodes.iter().any(
1169 |node| matches!(node, IndexNode::Error(text) if text.contains("did not find readable"))
1170 ));
1171 assert!(document.nodes.iter().any(
1172 |node| matches!(node, IndexNode::List { items, .. } if items.iter().any(|item| item.contains("confidence: failed")))
1173 ));
1174 assert_eq!(
1175 document
1176 .metadata
1177 .quality
1178 .as_ref()
1179 .map(|quality| quality.category),
1180 Some(DocumentQualityCategory::Failed)
1181 );
1182 }
1183
1184 #[test]
1185 fn sparse_pages_include_low_confidence_diagnostic_section() {
1186 let document = Transformer::<Empty>::new()
1187 .fetched(
1188 "<html><title>Sparse</title><main><a href=\"/only\">Only link</a></main></html>",
1189 )
1190 .parse()
1191 .extract()
1192 .transform()
1193 .into_document();
1194
1195 assert!(document.nodes.iter().any(|node| matches!(
1196 node,
1197 IndexNode::Section {
1198 title: Some(title),
1199 collapsed: true,
1200 ..
1201 } if title == "Diagnostic"
1202 )));
1203 assert_eq!(
1204 document
1205 .metadata
1206 .quality
1207 .as_ref()
1208 .map(|quality| quality.category),
1209 Some(DocumentQualityCategory::PartialGeneric)
1210 );
1211 }
1212
1213 #[test]
1214 fn blocked_flow_guardrails_cover_required_classes() {
1215 let cases = [
1216 (
1217 "auth-wall",
1218 "<html><title>Sign in</title><main><p>Please log in to continue</p></main></html>",
1219 ),
1220 (
1221 "script-gate",
1222 "<html><title>JavaScript required</title><main><p>Enable JavaScript to continue in app</p></main></html>",
1223 ),
1224 (
1225 "bot-gate",
1226 "<html><title>Robot check</title><main><p>Captcha: verify you are human</p></main></html>",
1227 ),
1228 (
1229 "geo-gate",
1230 "<html><title>Not available</title><main><p>This content is not available in your region</p></main></html>",
1231 ),
1232 (
1233 "age-gate",
1234 "<html><title>Age verification</title><main><p>Confirm your age (18+) to continue</p></main></html>",
1235 ),
1236 (
1237 "policy-blocked",
1238 "<html><title>Forbidden</title><main><p>Access denied by policy</p></main></html>",
1239 ),
1240 ];
1241
1242 for (class_name, html) in cases {
1243 let document = Transformer::<Empty>::new()
1244 .fetched(html)
1245 .parse()
1246 .extract()
1247 .transform()
1248 .into_document();
1249 let rendered = format!("{:?}", document.nodes);
1250
1251 assert!(
1252 rendered.contains("INDEX-GENERIC-BLOCKED"),
1253 "missing blocked diagnostic code for {class_name}"
1254 );
1255 assert!(
1256 rendered.contains(class_name),
1257 "missing blocked-flow class in diagnostic for {class_name}"
1258 );
1259 assert!(
1260 rendered.contains(":capture save blocked-flow.capture"),
1261 "missing capture guidance for {class_name}"
1262 );
1263 assert_eq!(
1264 document
1265 .metadata
1266 .quality
1267 .as_ref()
1268 .map(|quality| quality.category),
1269 Some(DocumentQualityCategory::Failed)
1270 );
1271 }
1272 }
1273
1274 #[test]
1275 fn blocked_flow_failure_document_is_deterministic() {
1276 let html = "<html><title>Access denied</title><main><p>Blocked by policy</p></main></html>";
1277 let first = Transformer::<Empty>::new()
1278 .fetched(html)
1279 .parse()
1280 .extract()
1281 .transform()
1282 .into_document();
1283 let second = Transformer::<Empty>::new()
1284 .fetched(html)
1285 .parse()
1286 .extract()
1287 .transform()
1288 .into_document();
1289
1290 assert_eq!(first, second);
1291 }
1292
1293 #[test]
1294 fn unsupported_page_shape_never_looks_successful() {
1295 let document = Transformer::<Empty>::new()
1296 .fetched(
1297 "<html><title>Unsupported</title><main><canvas></canvas><template></template></main></html>",
1298 )
1299 .parse()
1300 .extract()
1301 .transform()
1302 .into_document();
1303 let rendered = format!("{:?}", document.nodes);
1304
1305 assert!(
1306 rendered.contains("INDEX-GENERIC-EMPTY"),
1307 "unsupported page should emit generic empty diagnostic"
1308 );
1309 assert!(
1310 rendered.contains("confidence: failed"),
1311 "unsupported page should be marked failed"
1312 );
1313 assert_eq!(
1314 document
1315 .metadata
1316 .quality
1317 .as_ref()
1318 .map(|quality| quality.category),
1319 Some(DocumentQualityCategory::Failed)
1320 );
1321 }
1322}