index_transformer/
lib.rs

1//! Typestate transformer pipeline.
2
3use std::marker::PhantomData;
4
5use index_core::{
6    DiagnosticAction, DiagnosticConfidence, DiagnosticRecord, DiagnosticSeverity, DiagnosticSource,
7    DocumentQuality, DocumentQualityCategory, FailureCause, FailureDiagnostic, IndexDocument,
8    IndexNode, Link, SectionRole,
9};
10use index_dom::{HtmlDocument, parse_html};
11use index_headless::{AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot};
12use index_readability::ReadablePage;
13
14pub mod adapter;
15pub mod cache;
16pub mod instruction;
17pub mod manifest;
18pub mod state;
19
20use adapter::{AdapterContext, AdapterRegistry};
21use instruction::{ApplyMetadata, EmitLinks, EmitReadableNodes, EmitTitle, Instruction};
22use state::{Empty, Extracted, Fetched, Parsed, Transformed};
23
24pub use cache::{TransformCacheKey, TransformedDocumentCache};
25pub use manifest::apply_index_manifest_hints;
26
/// A staged transformer.
///
/// The type parameter `S` is a state marker that is stored only as
/// `PhantomData`; each pipeline step is implemented on one specific state and
/// returns the transformer in the next one.
#[derive(Debug, Clone)]
pub struct Transformer<S> {
    /// Raw HTML supplied through `fetched`; `None` in the initial state.
    raw_html: Option<String>,
    /// DOM produced by `parse`; `None` before that step.
    parsed: Option<HtmlDocument>,
    /// Readable page produced by `extract`; `transform` stores back the page
    /// it actually used (a placeholder page when nothing was extracted).
    extracted: Option<ReadablePage>,
    /// Final document produced by `transform`; `None` before that step.
    document: Option<IndexDocument>,
    /// Zero-sized marker recording the current pipeline state.
    _state: PhantomData<S>,
}
36
37/// Transforms HTML using an in-memory transformed document cache.
38#[must_use]
39pub fn transform_html_cached(
40    cache: &mut TransformedDocumentCache,
41    source_url: Option<&str>,
42    html: impl Into<String>,
43) -> IndexDocument {
44    let html = html.into();
45    let key = TransformCacheKey::new(source_url, &html);
46    if let Some(document) = cache.get(&key) {
47        return document;
48    }
49
50    let document = Transformer::<Empty>::new()
51        .fetched(html)
52        .parse()
53        .extract()
54        .transform()
55        .into_document();
56    cache.insert(key, document.clone());
57    document
58}
59
60impl Transformer<Empty> {
61    /// Creates an empty transformer.
62    #[must_use]
63    pub const fn new() -> Self {
64        Self {
65            raw_html: None,
66            parsed: None,
67            extracted: None,
68            document: None,
69            _state: PhantomData,
70        }
71    }
72
73    /// Accepts already-fetched HTML.
74    #[must_use]
75    pub fn fetched(self, raw_html: impl Into<String>) -> Transformer<Fetched> {
76        Transformer {
77            raw_html: Some(raw_html.into()),
78            parsed: None,
79            extracted: None,
80            document: None,
81            _state: PhantomData,
82        }
83    }
84}
85
86impl Default for Transformer<Empty> {
87    fn default() -> Self {
88        Self::new()
89    }
90}
91
92impl Transformer<Fetched> {
93    /// Parses fetched HTML.
94    #[must_use]
95    pub fn parse(self) -> Transformer<Parsed> {
96        let raw_html = self.raw_html.unwrap_or_default();
97        let parsed = parse_html(raw_html.clone());
98
99        Transformer {
100            raw_html: Some(raw_html),
101            parsed: Some(parsed),
102            extracted: None,
103            document: None,
104            _state: PhantomData,
105        }
106    }
107}
108
109impl Transformer<Parsed> {
110    /// Extracts readable content.
111    #[must_use]
112    pub fn extract(self) -> Transformer<Extracted> {
113        let extracted = self.parsed.as_ref().map(ReadablePage::from_html);
114
115        Transformer {
116            raw_html: self.raw_html,
117            parsed: self.parsed,
118            extracted,
119            document: None,
120            _state: PhantomData,
121        }
122    }
123}
124
125impl Transformer<Extracted> {
126    /// Transforms extracted content into an Index document.
127    #[must_use]
128    pub fn transform(self) -> Transformer<Transformed> {
129        let page = self.extracted.unwrap_or_else(|| ReadablePage {
130            title: "Untitled".to_owned(),
131            paragraphs: Vec::new(),
132            nodes: Vec::new(),
133            links: Vec::new(),
134            forms: Vec::new(),
135            metadata: Default::default(),
136        });
137
138        let context = AdapterContext { page: &page };
139        let document = AdapterRegistry::default_registry()
140            .transform(&context)
141            .unwrap_or_else(|| transform_generic(&page));
142
143        Transformer {
144            raw_html: self.raw_html,
145            parsed: self.parsed,
146            extracted: Some(page),
147            document: Some(document),
148            _state: PhantomData,
149        }
150    }
151}
152
153impl Transformer<Transformed> {
154    /// Consumes the transformer and returns the final document.
155    #[must_use]
156    pub fn into_document(self) -> IndexDocument {
157        self.document.unwrap_or_default()
158    }
159}
160
161/// Transforms a rendered DOM/accessibility snapshot into an Index document.
162#[must_use]
163pub fn transform_headless_snapshot(snapshot: &HeadlessSnapshot) -> IndexDocument {
164    let mut parsed = parse_html(snapshot.dom_html.clone());
165    if parsed.metadata.canonical_url.is_none() {
166        parsed.metadata.canonical_url = Some(snapshot.final_url.to_string());
167    }
168
169    let page = ReadablePage::from_html(&parsed);
170    if let Some(accessibility) = &snapshot.accessibility {
171        if let Some(mut document) = accessibility_document(accessibility, snapshot, &page) {
172            merge_dom_links(&mut document, &page);
173            return document;
174        }
175    }
176
177    if page.has_body() {
178        return AdapterRegistry::default_registry()
179            .transform(&AdapterContext { page: &page })
180            .unwrap_or_else(|| transform_generic(&page));
181    }
182
183    let mut document = IndexDocument::titled("Headless snapshot");
184    document.metadata.canonical_url = Some(snapshot.final_url.to_string());
185    if let Some(accessibility) = &snapshot.accessibility {
186        let text = accessibility.text_content();
187        if !text.is_empty() {
188            document.push(IndexNode::Paragraph(text));
189        }
190    }
191    if document.is_empty() {
192        return FailureDiagnostic::new(
193            "Headless snapshot unreadable",
194            DiagnosticSource::Headless,
195            DiagnosticConfidence::Failed,
196            "headless snapshot did not contain readable content",
197        )
198        .with_fallback("accessibility tree text extraction")
199        .with_tried("headless DOM snapshot")
200        .with_tried("accessibility tree extraction")
201        .with_actions([DiagnosticAction::Retry, DiagnosticAction::Capture])
202        .with_command(":capture save headless-unreadable.capture")
203        .with_record(DiagnosticRecord::new(
204            DiagnosticSeverity::Error,
205            "INDEX-HEADLESS-EMPTY",
206            format!("final_url={}", snapshot.final_url),
207        ))
208        .into_document();
209    }
210    document.metadata.quality = Some(DocumentQuality::new(
211        DocumentQualityCategory::Fallback,
212        55,
213        [
214            "headless accessibility fallback".to_owned(),
215            "DOM body was not readable".to_owned(),
216        ],
217    ));
218    document
219}
220
221fn accessibility_document(
222    accessibility: &AccessibilitySnapshot,
223    snapshot: &HeadlessSnapshot,
224    page: &ReadablePage,
225) -> Option<IndexDocument> {
226    let mut nodes = Vec::new();
227    let mut evidence = AccessibilityEvidence::default();
228    for node in &accessibility.nodes {
229        append_accessibility_node(node, &mut nodes, &mut evidence);
230    }
231    if !evidence.is_confident() {
232        return None;
233    }
234
235    let title = nodes
236        .iter()
237        .find_map(first_heading_text)
238        .filter(|title| !title.trim().is_empty())
239        .unwrap_or_else(|| {
240            if page.title.trim().is_empty() {
241                "Headless snapshot".to_owned()
242            } else {
243                page.title.clone()
244            }
245        });
246    let mut document = IndexDocument::titled(title);
247    document.metadata.canonical_url = Some(snapshot.final_url.to_string());
248    document.nodes = nodes;
249    document.metadata.quality = Some(DocumentQuality::new(
250        DocumentQualityCategory::Fallback,
251        evidence.score(),
252        [
253            "accessibility tree supplied semantic roles".to_owned(),
254            "headless DOM links merged when available".to_owned(),
255        ],
256    ));
257    Some(document)
258}
259
260fn first_heading_text(node: &IndexNode) -> Option<String> {
261    match node {
262        IndexNode::Heading { text, .. } if !text.trim().is_empty() => Some(text.clone()),
263        IndexNode::Section { nodes, .. } => nodes.iter().find_map(first_heading_text),
264        _ => None,
265    }
266}
267
/// Running tally of how much usable structure an accessibility tree offered.
#[derive(Debug, Clone, Copy, Default)]
struct AccessibilityEvidence {
    /// Observed nodes whose name was not blank.
    named_nodes: usize,
    /// Observed nodes flagged as semantic by the caller.
    semantic_nodes: usize,
}

impl AccessibilityEvidence {
    /// Records one node: counts it as named when `name` is not blank and as
    /// semantic when `semantic` is set.
    fn observe(&mut self, semantic: bool, name: &str) {
        let named = !name.trim().is_empty();
        self.named_nodes += usize::from(named);
        self.semantic_nodes += usize::from(semantic);
    }

    /// True with at least two semantic nodes, or with one semantic node backed
    /// by at least two named nodes.
    fn is_confident(self) -> bool {
        match self.semantic_nodes {
            0 => false,
            1 => self.named_nodes >= 2,
            _ => true,
        }
    }

    /// Quality score: 50, plus 8 per semantic node and 3 per named node (each
    /// counted up to four times), capped at 82.
    fn score(self) -> u8 {
        const BASE: u8 = 50;
        const CEILING: u8 = 82;

        // Both counts are clamped to 4 first, so the casts cannot truncate.
        let semantic_bonus = self.semantic_nodes.min(4) as u8 * 8;
        let named_bonus = self.named_nodes.min(4) as u8 * 3;
        (BASE + semantic_bonus + named_bonus).min(CEILING)
    }
}
294
295fn append_accessibility_node(
296    node: &AccessibilityNode,
297    output: &mut Vec<IndexNode>,
298    evidence: &mut AccessibilityEvidence,
299) {
300    let role = node.role.trim().to_ascii_lowercase();
301    let name = node.name.trim();
302    match role.as_str() {
303        "main" | "article" | "navigation" | "complementary" | "contentinfo" | "footer" => {
304            let mut children = Vec::new();
305            for child in &node.children {
306                append_accessibility_node(child, &mut children, evidence);
307            }
308            if !children.is_empty() {
309                evidence.observe(true, name);
310                output.push(IndexNode::Section {
311                    role: accessibility_section_role(&role),
312                    title: (!name.is_empty()).then(|| name.to_owned()),
313                    collapsed: !matches!(role.as_str(), "main" | "article"),
314                    nodes: children,
315                });
316            }
317        }
318        "heading" => {
319            if !name.is_empty() {
320                evidence.observe(true, name);
321                output.push(IndexNode::Heading {
322                    level: 2,
323                    text: name.to_owned(),
324                });
325            }
326        }
327        "paragraph" | "text" | "statictext" | "generic" => {
328            if !name.is_empty() {
329                evidence.observe(false, name);
330                output.push(IndexNode::Paragraph(name.to_owned()));
331            }
332        }
333        "link" | "button" | "searchbox" | "textbox" | "checkbox" => {
334            if !name.is_empty() {
335                evidence.observe(true, name);
336                output.push(IndexNode::Paragraph(format!("{role}: {name}")));
337            }
338        }
339        "list" => {
340            let items = accessibility_list_items(&node.children);
341            if !items.is_empty() {
342                evidence.observe(true, name);
343                output.push(IndexNode::List {
344                    ordered: false,
345                    items,
346                });
347            }
348        }
349        "listitem" => {
350            if !name.is_empty() {
351                evidence.observe(true, name);
352                output.push(IndexNode::Paragraph(name.to_owned()));
353            }
354        }
355        _ => {
356            if !name.is_empty() {
357                evidence.observe(false, name);
358                output.push(IndexNode::Paragraph(name.to_owned()));
359            }
360            for child in &node.children {
361                append_accessibility_node(child, output, evidence);
362            }
363        }
364    }
365}
366
367fn accessibility_section_role(role: &str) -> SectionRole {
368    match role {
369        "main" | "article" => SectionRole::Main,
370        "navigation" => SectionRole::Navigation,
371        "complementary" => SectionRole::Aside,
372        "contentinfo" | "footer" => SectionRole::Footer,
373        _ => SectionRole::Unknown,
374    }
375}
376
377fn accessibility_list_items(children: &[AccessibilityNode]) -> Vec<String> {
378    children
379        .iter()
380        .filter_map(|child| {
381            let name = child.name.trim();
382            if name.is_empty() {
383                let nested = child
384                    .children
385                    .iter()
386                    .filter_map(|grandchild| {
387                        let name = grandchild.name.trim();
388                        (!name.is_empty()).then(|| name.to_owned())
389                    })
390                    .collect::<Vec<_>>();
391                (!nested.is_empty()).then(|| nested.join(" "))
392            } else {
393                Some(name.to_owned())
394            }
395        })
396        .collect()
397}
398
399fn merge_dom_links(document: &mut IndexDocument, page: &ReadablePage) {
400    let mut existing = document_link_labels(document);
401    for link in &page.links {
402        if existing.iter().any(|label| label == &link.text) {
403            continue;
404        }
405        existing.push(link.text.clone());
406        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
407    }
408}
409
410fn document_link_labels(document: &IndexDocument) -> Vec<String> {
411    document.nodes.iter().filter_map(link_label).collect()
412}
413
414fn link_label(node: &IndexNode) -> Option<String> {
415    match node {
416        IndexNode::Link(link) => Some(link.text.clone()),
417        IndexNode::Section { nodes, .. } => nodes.iter().find_map(link_label),
418        _ => None,
419    }
420}
421
422/// Converts a headless failure into a deterministic document.
423#[must_use]
424pub fn transform_headless_failure(error: &HeadlessError) -> IndexDocument {
425    FailureDiagnostic::new(
426        "Headless fallback failed",
427        DiagnosticSource::Headless,
428        DiagnosticConfidence::Failed,
429        error.to_string(),
430    )
431    .with_fallback("static transform or retry")
432    .with_tried("headless backend execution")
433    .with_actions([DiagnosticAction::Retry, DiagnosticAction::Capture])
434    .with_command(":capture save headless-failure.capture")
435    .with_record(DiagnosticRecord::new(
436        DiagnosticSeverity::Error,
437        "INDEX-HEADLESS-FAILED",
438        error.to_string(),
439    ))
440    .into_document()
441}
442
443fn default_program() -> Vec<Box<dyn Instruction>> {
444    vec![
445        Box::new(ApplyMetadata),
446        Box::new(EmitTitle),
447        Box::new(EmitReadableNodes),
448        Box::new(EmitLinks),
449    ]
450}
451
452fn transform_generic(page: &ReadablePage) -> IndexDocument {
453    let mut document = IndexDocument::titled(page.title.clone());
454    let program = default_program();
455
456    for instruction in program {
457        instruction.execute(page, &mut document);
458    }
459
460    if let Some(blocked_flow_class) = blocked_flow_hint(page) {
461        return generic_blocked_flow_document(page, blocked_flow_class);
462    }
463
464    if !page.has_body() && page.links.is_empty() && page.forms.is_empty() {
465        document = FailureDiagnostic::new(
466            page.title.clone(),
467            DiagnosticSource::GenericTransformer,
468            DiagnosticConfidence::Failed,
469            "generic transformer did not find readable page content",
470        )
471        .with_fallback("generic static reader")
472        .with_tried("static HTML parse")
473        .with_tried("readability extraction")
474        .with_tried("generic instruction program")
475        .with_actions([
476            DiagnosticAction::TryHeadless,
477            DiagnosticAction::Extract,
478            DiagnosticAction::Capture,
479            DiagnosticAction::AddFixture,
480        ])
481        .with_command(":extract links")
482        .with_command(":capture save unsupported-page.capture")
483        .with_record(
484            DiagnosticRecord::new(
485                DiagnosticSeverity::Error,
486                "INDEX-GENERIC-EMPTY",
487                "no readable headings, paragraphs, tables, forms, links, or sections were emitted",
488            )
489            .with_field("title", &page.title),
490        )
491        .into_document();
492    } else if page.paragraphs.is_empty() && page.nodes.len() <= 2 {
493        document.push(IndexNode::Section {
494            role: index_core::SectionRole::Unknown,
495            title: Some("Diagnostic".to_owned()),
496            collapsed: true,
497            nodes: vec![
498                IndexNode::Error(
499                    "low-confidence transform: only sparse page structure was found".to_owned(),
500                ),
501                IndexNode::List {
502                    ordered: false,
503                    items: vec![
504                        "try :extract links or :extract json".to_owned(),
505                        "capture a redacted fixture if the page shape matters".to_owned(),
506                    ],
507                },
508            ],
509        });
510        document.metadata.quality = Some(DocumentQuality::new(
511            DocumentQualityCategory::PartialGeneric,
512            45,
513            [
514                "sparse generic structure".to_owned(),
515                "diagnostic section attached".to_owned(),
516            ],
517        ));
518    } else {
519        document.metadata.quality = Some(DocumentQuality::new(
520            DocumentQualityCategory::StrongGeneric,
521            82,
522            [
523                "generic reader emitted semantic content".to_owned(),
524                "no low-confidence diagnostic attached".to_owned(),
525            ],
526        ));
527    }
528
529    document
530}
531
532fn blocked_flow_hint(page: &ReadablePage) -> Option<&'static str> {
533    let sparse_shape = page.paragraphs.len() <= 2 && page.forms.is_empty() && page.links.len() <= 3;
534    if !sparse_shape {
535        return None;
536    }
537
538    let mut haystack = page.title.to_ascii_lowercase();
539    for paragraph in &page.paragraphs {
540        haystack.push('\n');
541        haystack.push_str(&paragraph.to_ascii_lowercase());
542    }
543
544    if haystack.contains("captcha")
545        || haystack.contains("verify you are human")
546        || haystack.contains("robot check")
547        || haystack.contains("cloudflare")
548    {
549        return Some("bot-gate");
550    }
551    if haystack.contains("not available in your region")
552        || haystack.contains("not available in your country")
553        || haystack.contains("geo-restricted")
554        || haystack.contains("geoblocked")
555    {
556        return Some("geo-gate");
557    }
558    if haystack.contains("age verification")
559        || haystack.contains("adults only")
560        || haystack.contains("18+")
561        || haystack.contains("confirm your age")
562    {
563        return Some("age-gate");
564    }
565    if haystack.contains("access denied")
566        || haystack.contains("forbidden")
567        || haystack.contains("blocked by policy")
568        || haystack.contains("violates our terms")
569        || haystack.contains("not permitted")
570    {
571        return Some("policy-blocked");
572    }
573    if haystack.contains("enable javascript")
574        || haystack.contains("requires javascript")
575        || haystack.contains("continue in app")
576        || haystack.contains("app is not available")
577    {
578        return Some("script-gate");
579    }
580    if haystack.contains("log in")
581        || haystack.contains("sign in")
582        || haystack.contains("create account")
583        || haystack.contains("authentication required")
584        || haystack.contains("please log in")
585    {
586        return Some("auth-wall");
587    }
588    None
589}
590
591fn generic_blocked_flow_document(page: &ReadablePage, blocked_flow_class: &str) -> IndexDocument {
592    let mut document = FailureDiagnostic::new(
593        page.title.clone(),
594        DiagnosticSource::GenericTransformer,
595        DiagnosticConfidence::Low,
596        format!("generic transform indicates a blocked flow ({blocked_flow_class})"),
597    )
598    .with_likely_cause(FailureCause::BlockedByPolicy)
599    .with_fallback("read-only extraction and fixture capture")
600    .with_tried("static HTML parse")
601    .with_tried("readability extraction")
602    .with_tried("generic instruction program")
603    .with_actions([
604        DiagnosticAction::TryHeadless,
605        DiagnosticAction::Extract,
606        DiagnosticAction::Capture,
607        DiagnosticAction::AddFixture,
608    ])
609    .with_command(":extract links")
610    .with_command(":capture save blocked-flow.capture")
611    .with_record(
612        DiagnosticRecord::new(
613            DiagnosticSeverity::Warning,
614            "INDEX-GENERIC-BLOCKED",
615            format!("blocked-flow class: {blocked_flow_class}"),
616        )
617        .with_field("title", &page.title)
618        .with_field("blocked_flow_class", blocked_flow_class),
619    )
620    .into_document();
621    document.metadata.canonical_url = page.metadata.canonical_url.clone();
622    document.metadata.language = page.metadata.language.clone();
623    document
624}
625
626#[cfg(test)]
627mod tests {
628    use index_core::{DocumentQualityCategory, IndexNode, SectionRole};
629    use index_headless::{
630        AccessibilityNode, AccessibilitySnapshot, HeadlessError, HeadlessSnapshot,
631    };
632
633    use super::{
634        Transformer, state::Empty, transform_headless_failure, transform_headless_snapshot,
635        transform_html_cached,
636    };
637
638    fn count_links(nodes: &[IndexNode]) -> usize {
639        nodes
640            .iter()
641            .map(|node| match node {
642                IndexNode::Link(_) => 1,
643                IndexNode::Section { nodes, .. } => count_links(nodes),
644                _ => 0,
645            })
646            .sum()
647    }
648
649    #[test]
650    fn typestate_pipeline_emits_document() {
651        let document = Transformer::<Empty>::new()
652            .fetched(r#"<title>Hello</title><p>Index works.</p>"#)
653            .parse()
654            .extract()
655            .transform()
656            .into_document();
657
658        assert_eq!(document.title, "Hello");
659        assert!(!document.nodes.is_empty());
660        assert_eq!(
661            document
662                .metadata
663                .quality
664                .as_ref()
665                .map(|quality| quality.category),
666            Some(DocumentQualityCategory::StrongGeneric)
667        );
668    }
669
670    #[test]
671    fn cached_transform_reuses_matching_source_and_content() {
672        let mut cache = super::TransformedDocumentCache::new();
673        let first = transform_html_cached(
674            &mut cache,
675            Some("https://example.org"),
676            r#"<title>Hello</title><p>Index works.</p>"#,
677        );
678        let second = transform_html_cached(
679            &mut cache,
680            Some("https://example.org"),
681            r#"<title>Hello</title><p>Index works.</p>"#,
682        );
683
684        assert_eq!(first.title, second.title);
685        assert_eq!(cache.len(), 1);
686    }
687
688    #[test]
689    fn performance_fixtures_transform_through_cache() {
690        let fixtures = [
691            include_str!("../tests/fixtures/performance/large-doc.html"),
692            include_str!("../tests/fixtures/performance/large-table.html"),
693            include_str!("../tests/fixtures/performance/listing.html"),
694            include_str!("../tests/fixtures/performance/forum.html"),
695        ];
696        let mut cache = super::TransformedDocumentCache::new();
697
698        for (index, fixture) in fixtures.iter().enumerate() {
699            let document =
700                transform_html_cached(&mut cache, Some("fixture://performance"), *fixture);
701            assert!(
702                !document.nodes.is_empty(),
703                "performance fixture {index} should transform"
704            );
705        }
706
707        assert_eq!(cache.len(), fixtures.len());
708    }
709
710    #[test]
711    fn generic_transform_bounds_very_large_link_sets() {
712        let mut html = String::from("<html><head><title>Large Links</title></head><body><main>");
713        for index in 0..1200 {
714            html.push_str(&format!(
715                "<a href=\"https://example.com/{index}\">Link {index}</a>"
716            ));
717        }
718        html.push_str("</main></body></html>");
719
720        let document = Transformer::<Empty>::new()
721            .fetched(html)
722            .parse()
723            .extract()
724            .transform()
725            .into_document();
726
727        assert!(count_links(&document.nodes) <= 300);
728        assert!(document.nodes.iter().any(|node| matches!(
729            node,
730            IndexNode::Section {
731                title: Some(title),
732                ..
733            } if title == "Diagnostic"
734        )));
735    }
736
737    #[test]
738    fn transformer_emits_links_after_paragraphs() {
739        let document = Transformer::<Empty>::new()
740            .fetched(r#"<title>Hello</title><p>Body.</p><a href="https://example.com">Example</a>"#)
741            .parse()
742            .extract()
743            .transform()
744            .into_document();
745
746        let link_position = document
747            .nodes
748            .iter()
749            .position(|node| matches!(node, IndexNode::Link(_)));
750        let paragraph_position = document
751            .nodes
752            .iter()
753            .position(|node| matches!(node, IndexNode::Paragraph(_)));
754
755        assert!(paragraph_position < link_position);
756    }
757
758    #[test]
759    fn transformer_emits_static_reader_nodes_and_metadata() {
760        let document = Transformer::<Empty>::new()
761            .fetched(
762                r#"
763                <html>
764                  <head>
765                    <meta name="description" content="Reader docs">
766                    <link rel="canonical" href="https://example.com/docs">
767                  </head>
768                  <main>
769                    <h2>Install</h2>
770                    <ul><li>Read docs</li><li>Run locally</li></ul>
771                    <pre><code class="language-sh">cargo install index</code></pre>
772                    <table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
773                    <img src="logo.png" alt="Index logo">
774                  </main>
775                </html>
776                "#,
777            )
778            .parse()
779            .extract()
780            .transform()
781            .into_document();
782
783        assert_eq!(
784            document.metadata.description.as_deref(),
785            Some("Reader docs")
786        );
787        assert!(document.nodes.iter().any(
788            |node| matches!(node, IndexNode::Heading { level: 2, text } if text == "Install")
789        ));
790        assert!(
791            document
792                .nodes
793                .iter()
794                .any(|node| matches!(node, IndexNode::CodeBlock { .. }))
795        );
796        assert!(
797            document
798                .nodes
799                .iter()
800                .any(|node| matches!(node, IndexNode::List { .. }))
801        );
802        assert!(
803            document
804                .nodes
805                .iter()
806                .any(|node| matches!(node, IndexNode::Table { .. }))
807        );
808        assert!(
809            document
810                .nodes
811                .iter()
812                .any(|node| matches!(node, IndexNode::Image { alt, .. } if alt == "Index logo"))
813        );
814    }
815
816    #[test]
817    fn transformer_uses_site_adapter_when_canonical_url_matches() {
818        let document = Transformer::<Empty>::new()
819            .fetched(
820                r#"
821                <head><link rel="canonical" href="https://github.com/index-rs/index"></head>
822                <main><p>Generic repository noise.</p><a href="/issues">Issues</a></main>
823                "#,
824            )
825            .parse()
826            .extract()
827            .transform()
828            .into_document();
829
830        assert_eq!(
831            document.metadata.adapter_id.as_ref().map(|id| id.as_str()),
832            Some("github.repository")
833        );
834        assert_eq!(
835            document
836                .metadata
837                .quality
838                .as_ref()
839                .map(|quality| quality.category),
840            Some(DocumentQualityCategory::Adapter)
841        );
842        assert!(document.title.contains("GitHub repository"));
843    }
844
845    #[test]
846    fn transformer_falls_back_to_generic_transformer_for_unknown_sites() {
847        let document = Transformer::<Empty>::new()
848            .fetched(
849                r#"
850                <head><link rel="canonical" href="https://example.com/article"></head>
851                <main><p>Generic article body.</p></main>
852                "#,
853            )
854            .parse()
855            .extract()
856            .transform()
857            .into_document();
858
859        assert_eq!(document.metadata.adapter_id, None);
860        assert!(document.nodes.iter().any(
861            |node| matches!(node, IndexNode::Paragraph(text) if text == "Generic article body.")
862        ));
863    }
864
865    #[test]
866    fn transforms_rendered_dom_snapshot() -> Result<(), Box<dyn std::error::Error>> {
867        let snapshot = HeadlessSnapshot {
868            final_url: index_core::IndexUrl::parse("https://example.com/app")?,
869            dom_html: "<main><h1>Rendered</h1><p>Loaded by fallback.</p></main>".to_owned(),
870            accessibility: None,
871        };
872
873        let document = transform_headless_snapshot(&snapshot);
874
875        assert_eq!(
876            document.metadata.canonical_url.as_deref(),
877            Some("https://example.com/app")
878        );
879        assert!(document.nodes.iter().any(
880            |node| matches!(node, IndexNode::Paragraph(text) if text == "Loaded by fallback.")
881        ));
882        Ok(())
883    }
884
885    #[test]
886    fn transforms_accessibility_snapshot_when_dom_is_empty()
887    -> Result<(), Box<dyn std::error::Error>> {
888        let snapshot = HeadlessSnapshot {
889            final_url: index_core::IndexUrl::parse("https://example.com/spa")?,
890            dom_html: "<main></main>".to_owned(),
891            accessibility: Some(AccessibilitySnapshot {
892                nodes: vec![AccessibilityNode::leaf("button", "Search")],
893            }),
894        };
895
896        let document = transform_headless_snapshot(&snapshot);
897
898        assert!(
899            document
900                .nodes
901                .iter()
902                .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "button: Search"))
903        );
904        assert_eq!(
905            document
906                .metadata
907                .quality
908                .as_ref()
909                .map(|quality| quality.category),
910            Some(DocumentQualityCategory::Fallback)
911        );
912        Ok(())
913    }
914
915    #[test]
916    fn accessibility_first_maps_roles_and_scores_confidence()
917    -> Result<(), Box<dyn std::error::Error>> {
918        let snapshot = HeadlessSnapshot {
919            final_url: index_core::IndexUrl::parse("https://example.com/a11y")?,
920            dom_html: "<main><p>DOM fallback should not win.</p></main>".to_owned(),
921            accessibility: Some(AccessibilitySnapshot {
922                nodes: vec![AccessibilityNode {
923                    role: "main".to_owned(),
924                    name: "Application".to_owned(),
925                    children: vec![
926                        AccessibilityNode::leaf("heading", "Accessible Title"),
927                        AccessibilityNode::leaf("paragraph", "Readable accessible text."),
928                        AccessibilityNode {
929                            role: "list".to_owned(),
930                            name: String::new(),
931                            children: vec![
932                                AccessibilityNode::leaf("listitem", "First"),
933                                AccessibilityNode::leaf("listitem", "Second"),
934                            ],
935                        },
936                    ],
937                }],
938            }),
939        };
940
941        let document = transform_headless_snapshot(&snapshot);
942
943        assert_eq!(document.title, "Accessible Title");
944        assert_eq!(
945            document
946                .metadata
947                .quality
948                .as_ref()
949                .map(|quality| (quality.category, quality.score)),
950            Some((DocumentQualityCategory::Fallback, 82))
951        );
952        assert!(document.nodes.iter().any(|node| matches!(
953            node,
954            IndexNode::Section {
955                role: SectionRole::Main,
956                collapsed: false,
957                ..
958            }
959        )));
960        assert!(!document.nodes.iter().any(
961            |node| matches!(node, IndexNode::Paragraph(text) if text == "DOM fallback should not win.")
962        ));
963        Ok(())
964    }
965
966    #[test]
967    fn accessibility_first_merges_dom_links_without_duplicate_link_nodes()
968    -> Result<(), Box<dyn std::error::Error>> {
969        let snapshot = HeadlessSnapshot {
970            final_url: index_core::IndexUrl::parse("https://example.com/app")?,
971            dom_html: "<main><a href=\"/docs\">Docs</a><a href=\"/docs\">Docs</a></main>"
972                .to_owned(),
973            accessibility: Some(AccessibilitySnapshot {
974                nodes: vec![
975                    AccessibilityNode::leaf("heading", "App"),
976                    AccessibilityNode::leaf("link", "Docs"),
977                ],
978            }),
979        };
980
981        let document = transform_headless_snapshot(&snapshot);
982        let links = document
983            .nodes
984            .iter()
985            .filter(|node| matches!(node, IndexNode::Link(link) if link.text == "Docs"))
986            .count();
987
988        assert_eq!(links, 1);
989        assert!(
990            document
991                .nodes
992                .iter()
993                .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "link: Docs"))
994        );
995        Ok(())
996    }
997
998    #[test]
999    fn sparse_accessibility_falls_back_to_rendered_dom() -> Result<(), Box<dyn std::error::Error>> {
1000        let snapshot = HeadlessSnapshot {
1001            final_url: index_core::IndexUrl::parse("https://example.com/sparse")?,
1002            dom_html: "<main><h1>Rendered</h1><p>DOM body wins.</p></main>".to_owned(),
1003            accessibility: Some(AccessibilitySnapshot {
1004                nodes: vec![AccessibilityNode::leaf("generic", "Sparse label")],
1005            }),
1006        };
1007
1008        let document = transform_headless_snapshot(&snapshot);
1009
1010        assert!(
1011            document
1012                .nodes
1013                .iter()
1014                .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "DOM body wins."))
1015        );
1016        assert!(
1017            !document
1018                .nodes
1019                .iter()
1020                .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "Sparse label"))
1021        );
1022        Ok(())
1023    }
1024
1025    #[test]
1026    fn accessibility_maps_secondary_regions_and_controls() -> Result<(), Box<dyn std::error::Error>>
1027    {
1028        let snapshot = HeadlessSnapshot {
1029            final_url: index_core::IndexUrl::parse("https://example.com/controls")?,
1030            dom_html: "<title>Controls</title><main><p>DOM backup.</p></main>".to_owned(),
1031            accessibility: Some(AccessibilitySnapshot {
1032                nodes: vec![
1033                    AccessibilityNode {
1034                        role: "navigation".to_owned(),
1035                        name: "Site navigation".to_owned(),
1036                        children: vec![AccessibilityNode::leaf("link", "Home")],
1037                    },
1038                    AccessibilityNode {
1039                        role: "complementary".to_owned(),
1040                        name: "Related".to_owned(),
1041                        children: vec![AccessibilityNode::leaf("button", "Subscribe")],
1042                    },
1043                    AccessibilityNode {
1044                        role: "footer".to_owned(),
1045                        name: "Footer".to_owned(),
1046                        children: vec![AccessibilityNode::leaf("checkbox", "Accept")],
1047                    },
1048                    AccessibilityNode::leaf("textbox", "Search docs"),
1049                ],
1050            }),
1051        };
1052
1053        let document = transform_headless_snapshot(&snapshot);
1054
1055        assert_eq!(document.title, "Controls");
1056        assert!(document.nodes.iter().any(|node| matches!(
1057            node,
1058            IndexNode::Section {
1059                role: SectionRole::Navigation,
1060                collapsed: true,
1061                ..
1062            }
1063        )));
1064        assert!(document.nodes.iter().any(|node| matches!(
1065            node,
1066            IndexNode::Section {
1067                role: SectionRole::Aside,
1068                collapsed: true,
1069                ..
1070            }
1071        )));
1072        assert!(document.nodes.iter().any(|node| matches!(
1073            node,
1074            IndexNode::Section {
1075                role: SectionRole::Footer,
1076                collapsed: true,
1077                ..
1078            }
1079        )));
1080        assert!(document.nodes.iter().any(
1081            |node| matches!(node, IndexNode::Paragraph(text) if text == "textbox: Search docs")
1082        ));
1083        Ok(())
1084    }
1085
1086    #[test]
1087    fn accessibility_lists_can_use_nested_child_names() -> Result<(), Box<dyn std::error::Error>> {
1088        let snapshot = HeadlessSnapshot {
1089            final_url: index_core::IndexUrl::parse("https://example.com/list")?,
1090            dom_html: "<main></main>".to_owned(),
1091            accessibility: Some(AccessibilitySnapshot {
1092                nodes: vec![
1093                    AccessibilityNode::leaf("heading", "Nested List"),
1094                    AccessibilityNode {
1095                        role: "list".to_owned(),
1096                        name: String::new(),
1097                        children: vec![AccessibilityNode {
1098                            role: "listitem".to_owned(),
1099                            name: String::new(),
1100                            children: vec![
1101                                AccessibilityNode::leaf("staticText", "Alpha"),
1102                                AccessibilityNode::leaf("staticText", "Beta"),
1103                            ],
1104                        }],
1105                    },
1106                ],
1107            }),
1108        };
1109
1110        let document = transform_headless_snapshot(&snapshot);
1111
1112        assert!(document.nodes.iter().any(
1113            |node| matches!(node, IndexNode::List { items, .. } if items == &vec!["Alpha Beta".to_owned()])
1114        ));
1115        Ok(())
1116    }
1117
1118    #[test]
1119    fn accessibility_unknown_roles_keep_names_and_children()
1120    -> Result<(), Box<dyn std::error::Error>> {
1121        let snapshot = HeadlessSnapshot {
1122            final_url: index_core::IndexUrl::parse("https://example.com/custom")?,
1123            dom_html: "<main></main>".to_owned(),
1124            accessibility: Some(AccessibilitySnapshot {
1125                nodes: vec![AccessibilityNode {
1126                    role: "custom-widget".to_owned(),
1127                    name: "Widget".to_owned(),
1128                    children: vec![AccessibilityNode::leaf("heading", "Widget Title")],
1129                }],
1130            }),
1131        };
1132
1133        let document = transform_headless_snapshot(&snapshot);
1134
1135        assert!(
1136            document
1137                .nodes
1138                .iter()
1139                .any(|node| matches!(node, IndexNode::Paragraph(text) if text == "Widget"))
1140        );
1141        assert!(
1142            document.nodes.iter().any(
1143                |node| matches!(node, IndexNode::Heading { text, .. } if text == "Widget Title")
1144            )
1145        );
1146        Ok(())
1147    }
1148
1149    #[test]
1150    fn transforms_headless_failure_to_deterministic_error_document() {
1151        let document = transform_headless_failure(&HeadlessError::TimedOut { timeout_ms: 10 });
1152
1153        assert_eq!(document.title, "Headless fallback failed");
1154        assert!(document.nodes.iter().any(
1155            |node| matches!(node, IndexNode::Error(text) if text.contains("timed out after 10ms"))
1156        ));
1157    }
1158
1159    #[test]
1160    fn generic_transformer_reports_missing_readable_content() {
1161        let document = Transformer::<Empty>::new()
1162            .fetched("<html><title>Empty</title><main></main></html>")
1163            .parse()
1164            .extract()
1165            .transform()
1166            .into_document();
1167
1168        assert!(document.nodes.iter().any(
1169            |node| matches!(node, IndexNode::Error(text) if text.contains("did not find readable"))
1170        ));
1171        assert!(document.nodes.iter().any(
1172            |node| matches!(node, IndexNode::List { items, .. } if items.iter().any(|item| item.contains("confidence: failed")))
1173        ));
1174        assert_eq!(
1175            document
1176                .metadata
1177                .quality
1178                .as_ref()
1179                .map(|quality| quality.category),
1180            Some(DocumentQualityCategory::Failed)
1181        );
1182    }
1183
1184    #[test]
1185    fn sparse_pages_include_low_confidence_diagnostic_section() {
1186        let document = Transformer::<Empty>::new()
1187            .fetched(
1188                "<html><title>Sparse</title><main><a href=\"/only\">Only link</a></main></html>",
1189            )
1190            .parse()
1191            .extract()
1192            .transform()
1193            .into_document();
1194
1195        assert!(document.nodes.iter().any(|node| matches!(
1196            node,
1197            IndexNode::Section {
1198                title: Some(title),
1199                collapsed: true,
1200                ..
1201            } if title == "Diagnostic"
1202        )));
1203        assert_eq!(
1204            document
1205                .metadata
1206                .quality
1207                .as_ref()
1208                .map(|quality| quality.category),
1209            Some(DocumentQualityCategory::PartialGeneric)
1210        );
1211    }
1212
1213    #[test]
1214    fn blocked_flow_guardrails_cover_required_classes() {
1215        let cases = [
1216            (
1217                "auth-wall",
1218                "<html><title>Sign in</title><main><p>Please log in to continue</p></main></html>",
1219            ),
1220            (
1221                "script-gate",
1222                "<html><title>JavaScript required</title><main><p>Enable JavaScript to continue in app</p></main></html>",
1223            ),
1224            (
1225                "bot-gate",
1226                "<html><title>Robot check</title><main><p>Captcha: verify you are human</p></main></html>",
1227            ),
1228            (
1229                "geo-gate",
1230                "<html><title>Not available</title><main><p>This content is not available in your region</p></main></html>",
1231            ),
1232            (
1233                "age-gate",
1234                "<html><title>Age verification</title><main><p>Confirm your age (18+) to continue</p></main></html>",
1235            ),
1236            (
1237                "policy-blocked",
1238                "<html><title>Forbidden</title><main><p>Access denied by policy</p></main></html>",
1239            ),
1240        ];
1241
1242        for (class_name, html) in cases {
1243            let document = Transformer::<Empty>::new()
1244                .fetched(html)
1245                .parse()
1246                .extract()
1247                .transform()
1248                .into_document();
1249            let rendered = format!("{:?}", document.nodes);
1250
1251            assert!(
1252                rendered.contains("INDEX-GENERIC-BLOCKED"),
1253                "missing blocked diagnostic code for {class_name}"
1254            );
1255            assert!(
1256                rendered.contains(class_name),
1257                "missing blocked-flow class in diagnostic for {class_name}"
1258            );
1259            assert!(
1260                rendered.contains(":capture save blocked-flow.capture"),
1261                "missing capture guidance for {class_name}"
1262            );
1263            assert_eq!(
1264                document
1265                    .metadata
1266                    .quality
1267                    .as_ref()
1268                    .map(|quality| quality.category),
1269                Some(DocumentQualityCategory::Failed)
1270            );
1271        }
1272    }
1273
1274    #[test]
1275    fn blocked_flow_failure_document_is_deterministic() {
1276        let html = "<html><title>Access denied</title><main><p>Blocked by policy</p></main></html>";
1277        let first = Transformer::<Empty>::new()
1278            .fetched(html)
1279            .parse()
1280            .extract()
1281            .transform()
1282            .into_document();
1283        let second = Transformer::<Empty>::new()
1284            .fetched(html)
1285            .parse()
1286            .extract()
1287            .transform()
1288            .into_document();
1289
1290        assert_eq!(first, second);
1291    }
1292
1293    #[test]
1294    fn unsupported_page_shape_never_looks_successful() {
1295        let document = Transformer::<Empty>::new()
1296            .fetched(
1297                "<html><title>Unsupported</title><main><canvas></canvas><template></template></main></html>",
1298            )
1299            .parse()
1300            .extract()
1301            .transform()
1302            .into_document();
1303        let rendered = format!("{:?}", document.nodes);
1304
1305        assert!(
1306            rendered.contains("INDEX-GENERIC-EMPTY"),
1307            "unsupported page should emit generic empty diagnostic"
1308        );
1309        assert!(
1310            rendered.contains("confidence: failed"),
1311            "unsupported page should be marked failed"
1312        );
1313        assert_eq!(
1314            document
1315                .metadata
1316                .quality
1317                .as_ref()
1318                .map(|quality| quality.category),
1319            Some(DocumentQualityCategory::Failed)
1320        );
1321    }
1322}
index_transformer/lib.rs

index_transformer/
lib.rs