Skip to main content

index_extract/
lib.rs

//! Deterministic extraction and scripting policy for Index documents.
//!
//! This crate consumes the Index Document Model. It does not parse HTML,
//! render terminal UI, or execute local commands.
5
6use std::collections::BTreeSet;
7use std::fmt::{Display, Formatter};
8
9use index_core::{
10    ButtonAction, DocumentQuality, Form, IndexDocument, IndexNode, Input, Link, Metadata,
11    SectionRole,
12};
13
/// Output size cap, in bytes, used when no explicit limit is supplied (1 MiB).
pub const DEFAULT_MAX_EXTRACTION_BYTES: usize = 1024 * 1024;
16
/// Output formats a document can be extracted into.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractFormat {
    /// Markdown rendering of the document.
    Markdown,
    /// Numbered list of the document's links.
    Links,
    /// Machine-readable JSON rendering.
    Json,
}

impl ExtractFormat {
    /// Looks up a format by name.
    ///
    /// Surrounding whitespace is ignored and ASCII case is not significant.
    /// Recognised names are `markdown` (alias `md`), `links`, and `json`;
    /// anything else yields `None`.
    #[must_use]
    pub fn parse(input: &str) -> Option<Self> {
        const NAMES: [(&str, ExtractFormat); 4] = [
            ("markdown", ExtractFormat::Markdown),
            ("md", ExtractFormat::Markdown),
            ("links", ExtractFormat::Links),
            ("json", ExtractFormat::Json),
        ];

        let wanted = input.trim();
        NAMES
            .iter()
            .find(|(name, _)| wanted.eq_ignore_ascii_case(name))
            .map(|&(_, format)| format)
    }

    /// Returns the canonical lowercase name of this format.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match *self {
            Self::Json => "json",
            Self::Links => "links",
            Self::Markdown => "markdown",
        }
    }
}

impl Display for ExtractFormat {
    /// Writes the canonical format name.
    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
        let name = self.as_str();
        formatter.write_str(name)
    }
}
56
57/// Limits applied to extraction output.
58#[derive(Debug, Clone, Copy, PartialEq, Eq)]
59pub struct ExtractionLimits {
60    /// Maximum output size in bytes.
61    pub max_output_bytes: usize,
62}
63
64impl ExtractionLimits {
65    /// Creates extraction limits.
66    #[must_use]
67    pub const fn new(max_output_bytes: usize) -> Self {
68        Self { max_output_bytes }
69    }
70}
71
72impl Default for ExtractionLimits {
73    fn default() -> Self {
74        Self::new(DEFAULT_MAX_EXTRACTION_BYTES)
75    }
76}
77
78/// Extraction failure.
79#[derive(Debug, Clone, PartialEq, Eq)]
80pub enum ExtractionError {
81    /// The rendered extraction exceeded the configured output limit.
82    OutputTooLarge {
83        /// Requested extraction format.
84        format: ExtractFormat,
85        /// Configured byte limit.
86        limit: usize,
87        /// Actual output byte length.
88        actual: usize,
89    },
90}
91
92impl Display for ExtractionError {
93    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
94        match self {
95            Self::OutputTooLarge {
96                format,
97                limit,
98                actual,
99            } => write!(
100                f,
101                "{format} extraction output too large: {actual} bytes exceeds limit {limit}"
102            ),
103        }
104    }
105}
106
107impl std::error::Error for ExtractionError {}
108
109/// Extracts a document in the requested format.
110#[must_use]
111pub fn extract_document(document: &IndexDocument, format: ExtractFormat) -> String {
112    match format {
113        ExtractFormat::Markdown => extract_markdown(document),
114        ExtractFormat::Links => extract_links(document),
115        ExtractFormat::Json => extract_json(document),
116    }
117}
118
119/// Extracts a document and rejects oversized output deterministically.
120pub fn try_extract_document(
121    document: &IndexDocument,
122    format: ExtractFormat,
123    limits: ExtractionLimits,
124) -> Result<String, ExtractionError> {
125    let output = extract_document(document, format);
126    let actual = output.len();
127    if actual > limits.max_output_bytes {
128        Err(ExtractionError::OutputTooLarge {
129            format,
130            limit: limits.max_output_bytes,
131            actual,
132        })
133    } else {
134        Ok(output)
135    }
136}
137
138/// Extracts a document as deterministic Markdown.
139#[must_use]
140pub fn extract_markdown(document: &IndexDocument) -> String {
141    let mut output = String::new();
142    if !document.title.trim().is_empty() {
143        output.push_str("# ");
144        output.push_str(document.title.trim());
145        output.push_str("\n\n");
146    }
147
148    for node in &document.nodes {
149        write_markdown_node(node, &mut output);
150    }
151
152    trim_trailing_blank_lines(&mut output);
153    output.push('\n');
154    output
155}
156
157/// Extracts document links as stable numeric addresses.
158#[must_use]
159pub fn extract_links(document: &IndexDocument) -> String {
160    let mut links = Vec::new();
161    collect_links_from_nodes(&document.nodes, &mut links);
162
163    let mut output = String::new();
164    for (index, link) in links.iter().enumerate() {
165        output.push_str(&(index + 1).to_string());
166        output.push('\t');
167        output.push_str(&link.text);
168        output.push('\t');
169        output.push_str(&link.href);
170        output.push('\n');
171    }
172    output
173}
174
/// An external reference taken from a document link, with a stable number.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Citation {
    /// One-based position of this citation among the extracted citations.
    pub index: usize,
    /// Text of the link the citation came from.
    pub text: String,
    /// URL the citation points at.
    pub href: String,
}
185
186/// Extracts external HTTP(S) citations and references in document order.
187#[must_use]
188pub fn extract_citations(document: &IndexDocument) -> Vec<Citation> {
189    let mut links = Vec::new();
190    collect_links_from_nodes(&document.nodes, &mut links);
191    let mut seen = BTreeSet::new();
192    let mut citations = Vec::new();
193
194    for link in links {
195        let href = link.href.trim();
196        if !(href.starts_with("http://") || href.starts_with("https://")) {
197            continue;
198        }
199        if !seen.insert(href.to_owned()) {
200            continue;
201        }
202        citations.push(Citation {
203            index: citations.len() + 1,
204            text: link.text.trim().to_owned(),
205            href: href.to_owned(),
206        });
207    }
208
209    citations
210}
211
212/// Extracts external citations as deterministic TSV.
213#[must_use]
214pub fn extract_citations_tsv(document: &IndexDocument) -> String {
215    let mut output = String::new();
216    for citation in extract_citations(document) {
217        output.push_str(&citation.index.to_string());
218        output.push('\t');
219        output.push_str(&citation.text);
220        output.push('\t');
221        output.push_str(&citation.href);
222        output.push('\n');
223    }
224    output
225}
226
227/// Exports the first section whose heading or section title matches as Markdown.
228#[must_use]
229pub fn export_section_markdown(document: &IndexDocument, selector: &str) -> Option<String> {
230    let selector = selector.trim();
231    if selector.is_empty() {
232        return None;
233    }
234
235    let mut output = String::new();
236    if write_selected_section(&document.nodes, selector, &mut output) {
237        trim_trailing_blank_lines(&mut output);
238        output.push('\n');
239        Some(output)
240    } else {
241        None
242    }
243}
244
245/// Extracts a document as deterministic JSON.
246#[must_use]
247pub fn extract_json(document: &IndexDocument) -> String {
248    let mut output = String::new();
249    output.push_str("{\n");
250    output.push_str("  \"title\": ");
251    push_json_string(&mut output, &document.title);
252    output.push_str(",\n");
253    output.push_str("  \"metadata\": ");
254    push_json_metadata(&mut output, &document.metadata, 2);
255    output.push_str(",\n");
256    output.push_str("  \"nodes\": [\n");
257    for (index, node) in document.nodes.iter().enumerate() {
258        output.push_str("    ");
259        push_json_node(&mut output, node);
260        if index + 1 != document.nodes.len() {
261            output.push(',');
262        }
263        output.push('\n');
264    }
265    output.push_str("  ]\n");
266    output.push_str("}\n");
267    output
268}
269
/// Ways the document JSON can fail the shape check.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum JsonSchemaError {
    /// The top-level `title` field was not found.
    MissingTitle,
    /// The top-level `metadata` object was not found.
    MissingMetadata,
    /// The top-level `nodes` array was not found.
    MissingNodes,
    /// A node object lacks its `type` discriminator.
    MissingNodeType,
}

impl Display for JsonSchemaError {
    /// Writes a short description of the missing part.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MissingTitle => "document JSON is missing title",
            Self::MissingMetadata => "document JSON is missing metadata",
            Self::MissingNodes => "document JSON is missing nodes",
            Self::MissingNodeType => "document JSON node is missing type",
        };
        f.write_str(message)
    }
}

impl std::error::Error for JsonSchemaError {}
295
296/// Validates the stable JSON shape emitted by `extract_json`.
297///
298/// This is a lightweight schema guard for the dependency-light extraction
299/// layer. It verifies the top-level contract and node discriminators without
300/// accepting arbitrary JSON as input.
301pub fn validate_document_json_schema(json: &str) -> Result<(), JsonSchemaError> {
302    if !json.contains("\"title\":") {
303        return Err(JsonSchemaError::MissingTitle);
304    }
305    if !json.contains("\"metadata\":") {
306        return Err(JsonSchemaError::MissingMetadata);
307    }
308    if !json.contains("\"nodes\":") {
309        return Err(JsonSchemaError::MissingNodes);
310    }
311    if json.contains("{\"type\"") || json.contains("{ \"type\"") || json.contains("    {\"type\"") {
312        return Ok(());
313    }
314    if json.contains("\"nodes\": [\n  ]") || json.contains("\"nodes\": []") {
315        return Ok(());
316    }
317    Err(JsonSchemaError::MissingNodeType)
318}
319
320/// Policy decision for a `:pipe` command.
321#[derive(Debug, Clone, PartialEq, Eq)]
322pub enum PipeDecision {
323    /// The command is allowed because it used explicit confirmation.
324    Allowed(PipeCommand),
325    /// The command is syntactically safe but needs explicit confirmation.
326    RequiresConfirmation(PipeCommand),
327    /// The command is denied by policy.
328    Denied(PipeDeniedReason),
329}
330
/// Command text that the host app may feed extracted document output into.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PipeCommand {
    command: String,
}

impl PipeCommand {
    /// Wraps the given command text without inspecting it.
    #[must_use]
    pub fn new(command: impl Into<String>) -> Self {
        let command = command.into();
        Self { command }
    }

    /// Borrows the command text.
    #[must_use]
    pub fn as_str(&self) -> &str {
        self.command.as_str()
    }
}
352
/// Why a pipe command was rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PipeDeniedReason {
    /// The command text was empty.
    Empty,
    /// The command contained shell metacharacters or control characters.
    ShellSyntax,
    /// The command's program (its first word) is not permitted by this policy.
    ProgramNotAllowed(String),
}

impl Display for PipeDeniedReason {
    /// Writes a one-line explanation of the denial.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let program = match self {
            Self::Empty => return f.write_str("empty pipe command"),
            Self::ShellSyntax => {
                return f.write_str("shell syntax is not allowed in pipe commands");
            }
            Self::ProgramNotAllowed(program) => program,
        };
        write!(f, "program is not allowed in pipe commands: {program}")
    }
}

impl std::error::Error for PipeDeniedReason {}
377
378/// Classifies a `:pipe` command without executing it.
379#[must_use]
380pub fn classify_pipe_command(input: &str) -> PipeDecision {
381    let trimmed = input.trim();
382    let Some(command) = trimmed.strip_prefix("--confirm ") else {
383        return match validate_pipe_command(trimmed) {
384            Ok(()) => PipeDecision::RequiresConfirmation(PipeCommand::new(trimmed)),
385            Err(error) => PipeDecision::Denied(error),
386        };
387    };
388
389    match validate_pipe_command(command.trim()) {
390        Ok(()) => PipeDecision::Allowed(PipeCommand::new(command.trim())),
391        Err(error) => PipeDecision::Denied(error),
392    }
393}
394
395fn validate_pipe_command(command: &str) -> Result<(), PipeDeniedReason> {
396    if command.is_empty() {
397        return Err(PipeDeniedReason::Empty);
398    }
399    if command.chars().any(is_shell_syntax) {
400        return Err(PipeDeniedReason::ShellSyntax);
401    }
402
403    let program = command.split_whitespace().next().unwrap_or_default();
404    if allowed_pipe_program(program) {
405        Ok(())
406    } else {
407        Err(PipeDeniedReason::ProgramNotAllowed(program.to_owned()))
408    }
409}
410
/// Reports whether `program` is on the pipe allow-list.
///
/// The comparison is exact and case-sensitive.
// NOTE(review): some of these tools can write files or start other programs
// through their own options (for example GNU sed's `w`/`e` commands,
// `sort -o`, `rg --pre`) — confirm the host app accounts for that.
fn allowed_pipe_program(program: &str) -> bool {
    const ALLOWED: [&str; 11] = [
        "cat", "cut", "grep", "head", "jq", "rg", "sed", "sort", "tail", "uniq", "wc",
    ];
    ALLOWED.contains(&program)
}
417
/// Reports whether `ch` must never appear in a pipe command.
///
/// Rejected characters are the shell metacharacters `;`, `|`, `&`, `>`, `<`,
/// `` ` ``, `$`, `(`, `)` and every control character other than horizontal
/// tab. This matches the documented meaning of
/// `PipeDeniedReason::ShellSyntax` ("shell metacharacters or control
/// characters"): besides `\n` and `\r`, characters such as NUL, ESC, and DEL
/// are refused as well.
fn is_shell_syntax(ch: char) -> bool {
    match ch {
        ';' | '|' | '&' | '>' | '<' | '`' | '$' | '(' | ')' => true,
        // Tab stays permitted: like a space, it only separates arguments.
        '\t' => false,
        _ => ch.is_control(),
    }
}
424
425fn write_markdown_node(node: &IndexNode, output: &mut String) {
426    match node {
427        IndexNode::Heading { level, text } => {
428            output.push_str(&"#".repeat(usize::from((*level).clamp(1, 6))));
429            output.push(' ');
430            output.push_str(text.trim());
431            output.push_str("\n\n");
432        }
433        IndexNode::Paragraph(text) => {
434            output.push_str(text.trim());
435            output.push_str("\n\n");
436        }
437        IndexNode::Link(link) => {
438            output.push('[');
439            output.push_str(link.text.trim());
440            output.push_str("](");
441            output.push_str(link.href.trim());
442            output.push_str(")\n\n");
443        }
444        IndexNode::List { ordered, items } => {
445            for (index, item) in items.iter().enumerate() {
446                if *ordered {
447                    output.push_str(&(index + 1).to_string());
448                    output.push_str(". ");
449                } else {
450                    output.push_str("- ");
451                }
452                output.push_str(item.trim());
453                output.push('\n');
454            }
455            output.push('\n');
456        }
457        IndexNode::CodeBlock { language, code } => {
458            output.push_str("```");
459            if let Some(language) = language {
460                output.push_str(language.trim());
461            }
462            output.push('\n');
463            output.push_str(code.trim_end());
464            output.push_str("\n```\n\n");
465        }
466        IndexNode::Table { rows } => write_markdown_table(rows, output),
467        IndexNode::Spacer { lines } => {
468            for _ in 0..(*lines).clamp(1, 3) {
469                output.push('\n');
470            }
471        }
472        IndexNode::Section {
473            role,
474            title,
475            collapsed,
476            nodes,
477        } => {
478            let marker = if *collapsed { "▸" } else { "▾" };
479            output.push_str("> ");
480            output.push_str(marker);
481            output.push(' ');
482            output.push_str(&section_label(*role, title.as_deref()));
483            output.push_str(" (");
484            output.push_str(&section_item_count(nodes).to_string());
485            output.push_str(" items)\n\n");
486            if !collapsed {
487                for node in nodes {
488                    write_markdown_node(node, output);
489                }
490            }
491        }
492        IndexNode::Image { alt, src } => {
493            output.push_str("![");
494            output.push_str(alt.trim());
495            output.push_str("](");
496            if let Some(src) = src {
497                output.push_str(src.trim());
498            }
499            output.push_str(")\n\n");
500        }
501        IndexNode::Form(form) => write_markdown_form(form, output),
502        IndexNode::Error(message) => {
503            output.push_str("> [error] ");
504            output.push_str(message.trim());
505            output.push_str("\n\n");
506        }
507    }
508}
509
/// Appends `rows` to `output` as pipe-delimited lines followed by a blank line.
///
/// Every cell is trimmed and written as ` cell |` after the row's leading
/// `|`. Nothing is written when `rows` is empty.
fn write_markdown_table(rows: &[Vec<String>], output: &mut String) {
    if rows.is_empty() {
        return;
    }

    for row in rows {
        let cells: String = row
            .iter()
            .map(|cell| format!(" {} |", cell.trim()))
            .collect();
        output.push('|');
        output.push_str(&cells);
        output.push('\n');
    }
    output.push('\n');
}
526
527fn write_markdown_form(form: &Form, output: &mut String) {
528    output.push_str("> Form ");
529    output.push_str(&form.method);
530    output.push(' ');
531    output.push_str(&form.name);
532    output.push_str(" -> ");
533    output.push_str(&form.action);
534    output.push('\n');
535    for input in &form.inputs {
536        output.push_str("> - ");
537        output.push_str(&input.name);
538        output.push_str(" (");
539        output.push_str(&input.kind);
540        if input.required {
541            output.push_str(", required");
542        }
543        output.push_str(")\n");
544    }
545    output.push('\n');
546}
547
/// Removes every trailing `\n` from `output`; other whitespace is kept.
fn trim_trailing_blank_lines(output: &mut String) {
    let kept = output.trim_end_matches('\n').len();
    output.truncate(kept);
}
556
557fn collect_links_from_nodes<'a>(nodes: &'a [IndexNode], links: &mut Vec<&'a Link>) {
558    for node in nodes {
559        if let IndexNode::Link(link) = node {
560            links.push(link);
561        } else if let IndexNode::Section { nodes, .. } = node {
562            collect_links_from_nodes(nodes, links);
563        }
564    }
565}
566
567fn write_selected_section(nodes: &[IndexNode], selector: &str, output: &mut String) -> bool {
568    if write_flat_heading_section(nodes, selector, output) {
569        return true;
570    }
571
572    for node in nodes {
573        if let IndexNode::Section { title, nodes, .. } = node {
574            if title
575                .as_deref()
576                .is_some_and(|title| text_matches(title, selector))
577            {
578                for node in nodes {
579                    write_markdown_node(node, output);
580                }
581                return true;
582            }
583            if write_selected_section(nodes, selector, output) {
584                return true;
585            }
586        }
587    }
588
589    false
590}
591
592fn write_flat_heading_section(nodes: &[IndexNode], selector: &str, output: &mut String) -> bool {
593    let mut selected_level = None;
594    for node in nodes {
595        if let IndexNode::Heading { level, text } = node {
596            if selected_level.is_none() && text_matches(text, selector) {
597                selected_level = Some(*level);
598            } else if selected_level.is_some_and(|selected| *level <= selected) {
599                break;
600            }
601        }
602
603        if selected_level.is_some() {
604            write_markdown_node(node, output);
605        }
606    }
607
608    selected_level.is_some()
609}
610
/// Compares trimmed `text` with `selector`, ignoring ASCII case.
///
/// Only `text` is trimmed; `selector` is compared as given.
fn text_matches(text: &str, selector: &str) -> bool {
    let candidate = text.trim();
    candidate.eq_ignore_ascii_case(selector)
}
614
615fn push_json_metadata(output: &mut String, metadata: &Metadata, indent: usize) {
616    output.push_str("{\n");
617    push_json_option_field(
618        output,
619        "canonical_url",
620        metadata.canonical_url.as_deref(),
621        indent + 2,
622    );
623    output.push_str(",\n");
624    push_json_option_field(output, "author", metadata.author.as_deref(), indent + 2);
625    output.push_str(",\n");
626    push_json_option_field(output, "language", metadata.language.as_deref(), indent + 2);
627    output.push_str(",\n");
628    push_json_option_field(
629        output,
630        "description",
631        metadata.description.as_deref(),
632        indent + 2,
633    );
634    output.push_str(",\n");
635    push_json_option_field(
636        output,
637        "open_graph_title",
638        metadata.open_graph_title.as_deref(),
639        indent + 2,
640    );
641    output.push_str(",\n");
642    push_json_option_field(
643        output,
644        "open_graph_description",
645        metadata.open_graph_description.as_deref(),
646        indent + 2,
647    );
648    output.push_str(",\n");
649    push_json_option_field(
650        output,
651        "adapter_id",
652        metadata.adapter_id.as_ref().map(|adapter| adapter.as_str()),
653        indent + 2,
654    );
655    output.push_str(",\n");
656    output.push_str(&" ".repeat(indent + 2));
657    output.push_str("\"quality\": ");
658    push_json_quality(output, metadata.quality.as_ref());
659    output.push('\n');
660    output.push_str(&" ".repeat(indent));
661    output.push('}');
662}
663
664fn push_json_quality(output: &mut String, quality: Option<&DocumentQuality>) {
665    if let Some(quality) = quality {
666        output.push_str("{\"category\": ");
667        push_json_string(output, quality.category.as_str());
668        output.push_str(", \"score\": ");
669        output.push_str(&quality.score.to_string());
670        output.push_str(", \"reasons\": ");
671        push_json_string_array(output, &quality.reasons);
672        output.push('}');
673    } else {
674        output.push_str("null");
675    }
676}
677
678fn push_json_option_field(output: &mut String, name: &str, value: Option<&str>, indent: usize) {
679    output.push_str(&" ".repeat(indent));
680    output.push('"');
681    output.push_str(name);
682    output.push_str("\": ");
683    push_json_option_string(output, value);
684}
685
686fn push_json_node(output: &mut String, node: &IndexNode) {
687    match node {
688        IndexNode::Heading { level, text } => {
689            output.push_str("{\"type\": \"heading\", \"level\": ");
690            output.push_str(&level.to_string());
691            output.push_str(", \"text\": ");
692            push_json_string(output, text);
693            output.push('}');
694        }
695        IndexNode::Paragraph(text) => {
696            output.push_str("{\"type\": \"paragraph\", \"text\": ");
697            push_json_string(output, text);
698            output.push('}');
699        }
700        IndexNode::Link(link) => {
701            output.push_str("{\"type\": \"link\", \"text\": ");
702            push_json_string(output, &link.text);
703            output.push_str(", \"href\": ");
704            push_json_string(output, &link.href);
705            output.push('}');
706        }
707        IndexNode::List { ordered, items } => {
708            output.push_str("{\"type\": \"list\", \"ordered\": ");
709            output.push_str(if *ordered { "true" } else { "false" });
710            output.push_str(", \"items\": ");
711            push_json_string_array(output, items);
712            output.push('}');
713        }
714        IndexNode::CodeBlock { language, code } => {
715            output.push_str("{\"type\": \"code_block\", \"language\": ");
716            push_json_option_string(output, language.as_deref());
717            output.push_str(", \"code\": ");
718            push_json_string(output, code);
719            output.push('}');
720        }
721        IndexNode::Table { rows } => {
722            output.push_str("{\"type\": \"table\", \"headers\": ");
723            push_json_string_array(output, &table_headers(rows));
724            output.push_str(", \"row_labels\": ");
725            push_json_string_array(output, &table_row_labels(rows));
726            output.push_str(", \"rows\": ");
727            push_json_table(output, rows);
728            output.push('}');
729        }
730        IndexNode::Spacer { lines } => {
731            output.push_str("{\"type\": \"spacer\", \"lines\": ");
732            output.push_str(&(*lines).clamp(1, 3).to_string());
733            output.push('}');
734        }
735        IndexNode::Section {
736            role,
737            title,
738            collapsed,
739            nodes,
740        } => {
741            output.push_str("{\"type\": \"section\", \"role\": ");
742            push_json_string(output, role.as_str());
743            output.push_str(", \"title\": ");
744            push_json_option_string(output, title.as_deref());
745            output.push_str(", \"collapsed\": ");
746            output.push_str(if *collapsed { "true" } else { "false" });
747            output.push_str(", \"nodes\": [");
748            for (index, node) in nodes.iter().enumerate() {
749                push_json_node(output, node);
750                if index + 1 != nodes.len() {
751                    output.push_str(", ");
752                }
753            }
754            output.push_str("]}");
755        }
756        IndexNode::Image { alt, src } => {
757            output.push_str("{\"type\": \"image\", \"alt\": ");
758            push_json_string(output, alt);
759            output.push_str(", \"src\": ");
760            push_json_option_string(output, src.as_deref());
761            output.push('}');
762        }
763        IndexNode::Form(form) => push_json_form(output, form),
764        IndexNode::Error(message) => {
765            output.push_str("{\"type\": \"error\", \"message\": ");
766            push_json_string(output, message);
767            output.push('}');
768        }
769    }
770}
771
772fn section_label(role: SectionRole, title: Option<&str>) -> String {
773    match title.map(str::trim).filter(|title| !title.is_empty()) {
774        Some(title) => format!("{}: {title}", role.as_str()),
775        None => role.as_str().to_owned(),
776    }
777}
778
779fn section_item_count(nodes: &[IndexNode]) -> usize {
780    nodes
781        .iter()
782        .filter(|node| !matches!(node, IndexNode::Spacer { .. }))
783        .count()
784}
785
786fn push_json_form(output: &mut String, form: &Form) {
787    output.push_str("{\"type\": \"form\", \"name\": ");
788    push_json_string(output, &form.name);
789    output.push_str(", \"method\": ");
790    push_json_string(output, &form.method);
791    output.push_str(", \"action\": ");
792    push_json_string(output, &form.action);
793    output.push_str(", \"inputs\": [");
794    for (index, input) in form.inputs.iter().enumerate() {
795        push_json_input(output, input);
796        if index + 1 != form.inputs.len() {
797            output.push_str(", ");
798        }
799    }
800    output.push_str("], \"buttons\": [");
801    for (index, button) in form.buttons.iter().enumerate() {
802        push_json_button(output, button);
803        if index + 1 != form.buttons.len() {
804            output.push_str(", ");
805        }
806    }
807    output.push_str("]}");
808}
809
810fn push_json_input(output: &mut String, input: &Input) {
811    output.push_str("{\"name\": ");
812    push_json_string(output, &input.name);
813    output.push_str(", \"kind\": ");
814    push_json_string(output, &input.kind);
815    output.push_str(", \"value\": ");
816    push_json_option_string(output, input.value.as_deref());
817    output.push_str(", \"required\": ");
818    output.push_str(if input.required { "true" } else { "false" });
819    output.push('}');
820}
821
822fn push_json_button(output: &mut String, button: &ButtonAction) {
823    output.push_str("{\"name\": ");
824    push_json_option_string(output, button.name.as_deref());
825    output.push_str(", \"value\": ");
826    push_json_option_string(output, button.value.as_deref());
827    output.push_str(", \"label\": ");
828    push_json_string(output, &button.label);
829    output.push('}');
830}
831
832fn push_json_string_array(output: &mut String, items: &[String]) {
833    output.push('[');
834    for (index, item) in items.iter().enumerate() {
835        push_json_string(output, item);
836        if index + 1 != items.len() {
837            output.push_str(", ");
838        }
839    }
840    output.push(']');
841}
842
843fn push_json_table(output: &mut String, rows: &[Vec<String>]) {
844    output.push('[');
845    for (row_index, row) in rows.iter().enumerate() {
846        push_json_string_array(output, row);
847        if row_index + 1 != rows.len() {
848            output.push_str(", ");
849        }
850    }
851    output.push(']');
852}
853
/// Returns a copy of the first row, or an empty vector when there are no rows.
fn table_headers(rows: &[Vec<String>]) -> Vec<String> {
    match rows {
        [header, ..] => header.clone(),
        [] => Vec::new(),
    }
}
857
/// Collects the first cell of every row after the first.
///
/// Rows without cells, and rows whose first cell is blank after trimming, are
/// skipped. Kept labels are copied untrimmed.
fn table_row_labels(rows: &[Vec<String>]) -> Vec<String> {
    let mut labels = Vec::new();
    for row in rows.iter().skip(1) {
        if let Some(label) = row.first() {
            if !label.trim().is_empty() {
                labels.push(label.clone());
            }
        }
    }
    labels
}
866
867fn push_json_option_string(output: &mut String, value: Option<&str>) {
868    if let Some(value) = value {
869        push_json_string(output, value);
870    } else {
871        output.push_str("null");
872    }
873}
874
/// Appends `value` to `output` as a quoted, escaped JSON string.
///
/// Quotes and backslashes are escaped, the common control characters use
/// their short forms (`\n`, `\r`, `\t`, `\b`, `\f`), and any other control
/// character is written as a lowercase `\uXXXX` escape. Everything else is
/// copied through unchanged.
fn push_json_string(output: &mut String, value: &str) {
    output.push('"');
    for ch in value.chars() {
        let short_escape = match ch {
            '"' => "\\\"",
            '\\' => "\\\\",
            '\n' => "\\n",
            '\r' => "\\r",
            '\t' => "\\t",
            '\u{08}' => "\\b",
            '\u{0c}' => "\\f",
            _ if ch.is_control() => {
                output.push_str(&format!("\\u{:04x}", u32::from(ch)));
                continue;
            }
            _ => {
                output.push(ch);
                continue;
            }
        };
        output.push_str(short_escape);
    }
    output.push('"');
}
895
// Unit tests covering the extraction formats, section export, JSON schema
// validation, output limits, and pipe-command classification.
#[cfg(test)]
mod tests {
    use index_core::{
        ButtonAction, DocumentQuality, DocumentQualityCategory, Form, IndexDocument, IndexNode,
        Input, Link, SectionRole,
    };

    use super::{
        ExtractFormat, ExtractionError, ExtractionLimits, JsonSchemaError, PipeCommand,
        PipeDecision, PipeDeniedReason, classify_pipe_command, export_section_markdown,
        extract_citations, extract_citations_tsv, extract_json, extract_links, extract_markdown,
        try_extract_document, validate_document_json_schema,
    };

    /// Builds the shared fixture: a titled document with a description, a
    /// quality record, and one node each of heading, paragraph, spacer, link,
    /// collapsed navigation section, ordered list, code block, table, image,
    /// and form.
    fn fixture_document() -> IndexDocument {
        // Converts a slice of literals into owned strings for list/table cells.
        let owned = |items: &[&str]| -> Vec<String> {
            items.iter().map(|item| String::from(*item)).collect()
        };

        let search_form = Form {
            name: String::from("search"),
            method: String::from("GET"),
            action: String::from("/search"),
            inputs: vec![Input {
                name: String::from("q"),
                kind: String::from("text"),
                value: None,
                required: true,
            }],
            buttons: vec![ButtonAction {
                name: Some(String::from("go")),
                value: Some(String::from("1")),
                label: String::from("Search"),
            }],
        };

        let site_navigation = IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some(String::from("Site")),
            collapsed: true,
            nodes: vec![IndexNode::Link(Link::new(
                "About",
                "https://example.com/about",
            ))],
        };

        // Nodes are listed in the exact order they must appear in the document;
        // the snapshot tests below depend on it.
        let body = [
            IndexNode::Heading {
                level: 2,
                text: String::from("Overview"),
            },
            IndexNode::Paragraph(String::from("Hello from Index.")),
            IndexNode::Spacer { lines: 2 },
            IndexNode::Link(Link::new("Docs", "https://example.com/docs")),
            site_navigation,
            IndexNode::List {
                ordered: true,
                items: owned(&["First", "Second"]),
            },
            IndexNode::CodeBlock {
                language: Some(String::from("rust")),
                code: String::from("fn main() {}\n"),
            },
            IndexNode::Table {
                rows: vec![
                    owned(&["Name", "Value"]),
                    owned(&["Index", "Semantic browser"]),
                ],
            },
            IndexNode::Image {
                alt: String::from("Diagram"),
                src: Some(String::from("diagram.png")),
            },
            IndexNode::Form(search_form),
        ];

        let mut fixture = IndexDocument::titled("Fixture");
        fixture.metadata.description = Some(String::from("Document description"));
        fixture.metadata.quality = Some(DocumentQuality::new(
            DocumentQualityCategory::StrongGeneric,
            82,
            ["generic reader emitted semantic content"],
        ));
        for node in body {
            fixture.push(node);
        }
        fixture
    }

    /// Pins the exact Markdown rendering of the fixture document.
    #[test]
    fn markdown_snapshot_is_deterministic() {
        let expected = "# Fixture\n\n## Overview\n\nHello from Index.\n\n\n\n[Docs](https://example.com/docs)\n\n> ▸ navigation: Site (1 items)\n\n1. First\n2. Second\n\n```rust\nfn main() {}\n```\n\n| Name | Value |\n| Index | Semantic browser |\n\n![Diagram](diagram.png)\n\n> Form GET search -> /search\n> - q (text, required)\n";

        let rendered = extract_markdown(&fixture_document());

        assert_eq!(rendered, expected);
    }

    /// The top-level link and the link nested in the navigation section are
    /// expected as rows 1 and 2 of the tab-separated link list.
    #[test]
    fn links_use_stable_numeric_addresses() {
        let expected = "1\tDocs\thttps://example.com/docs\n2\tAbout\thttps://example.com/about\n";

        let listing = extract_links(&fixture_document());

        assert_eq!(listing, expected);
    }

    /// Appends a link with an already-seen href and a link to `/local`, then
    /// expects only the two original `https://example.com/...` links to be
    /// cited, in document order.
    #[test]
    fn citations_use_external_links_once_in_document_order() {
        let mut document = fixture_document();
        let extra_links = [
            Link::new("Docs duplicate", "https://example.com/docs"),
            Link::new("Local", "/local"),
        ];
        for link in extra_links {
            document.push(IndexNode::Link(link));
        }

        let citations = extract_citations(&document);

        // Length is checked first so the indexing below cannot go out of bounds.
        assert_eq!(citations.len(), 2);
        let (first, second) = (&citations[0], &citations[1]);
        assert_eq!(first.index, 1);
        assert_eq!(first.text, "Docs");
        assert_eq!(first.href, "https://example.com/docs");
        assert_eq!(second.text, "About");

        let expected_tsv =
            "1\tDocs\thttps://example.com/docs\n2\tAbout\thttps://example.com/about\n";
        assert_eq!(extract_citations_tsv(&document), expected_tsv);
    }

    /// Selecting the level-2 heading "Keep" (queried in lowercase) is expected
    /// to export it together with its nested level-3 heading, stopping before
    /// the next level-2 heading.
    #[test]
    fn selected_heading_section_exports_until_next_peer_heading() {
        let heading = |level, text: &str| IndexNode::Heading {
            level,
            text: text.to_owned(),
        };
        let paragraph = |text: &str| IndexNode::Paragraph(text.to_owned());

        let outline = [
            heading(2, "Keep"),
            paragraph("selected"),
            heading(3, "Nested"),
            paragraph("still selected"),
            heading(2, "Stop"),
            paragraph("not selected"),
        ];
        let mut document = IndexDocument::titled("Sections");
        for node in outline {
            document.push(node);
        }

        let exported = export_section_markdown(&document, "keep");

        let expected = Some("## Keep\n\nselected\n\n### Nested\n\nstill selected\n");
        assert_eq!(exported.as_deref(), expected);
    }

    /// Selecting a section by its title (queried in lowercase) is expected to
    /// export the section's child nodes.
    #[test]
    fn selected_region_title_exports_section_nodes() {
        let article = IndexNode::Section {
            role: SectionRole::Main,
            title: Some(String::from("Article")),
            collapsed: false,
            nodes: vec![IndexNode::Paragraph(String::from("body"))],
        };
        let mut document = IndexDocument::titled("Sections");
        document.push(article);

        let exported = export_section_markdown(&document, "article");

        assert_eq!(exported.as_deref(), Some("body\n"));
    }

    /// The fixture's JSON must pass schema validation and contain each of the
    /// listed fragments.
    #[test]
    fn json_output_validates_against_document_schema() {
        let json = extract_json(&fixture_document());

        assert!(validate_document_json_schema(&json).is_ok());

        let required_fragments = [
            "\"type\": \"spacer\"",
            "\"type\": \"section\"",
            "\"type\": \"form\"",
            "\"quality\": {\"category\": \"strong-generic\", \"score\": 82",
            "\"headers\": [\"Name\", \"Value\"]",
            "\"row_labels\": [\"Index\"]",
            "\"required\": true",
        ];
        for fragment in required_fragments {
            assert!(
                json.contains(fragment),
                "JSON output is missing fragment: {fragment}"
            );
        }
    }

    /// A 256-character paragraph rendered under a 32-byte limit is expected to
    /// fail with `OutputTooLarge` reporting the format, the limit, and an
    /// actual size above the limit.
    #[test]
    fn bounded_extraction_rejects_oversized_output() {
        let mut document = IndexDocument::titled("Large export");
        document.push(IndexNode::Paragraph("x".repeat(256)));
        let limits = ExtractionLimits::new(32);

        let outcome = try_extract_document(&document, ExtractFormat::Markdown, limits);

        let rejected_as_too_large = matches!(
            outcome,
            Err(ExtractionError::OutputTooLarge {
                format: ExtractFormat::Markdown,
                limit: 32,
                actual
            }) if actual > 32
        );
        assert!(rejected_as_too_large);
    }

    /// JSON with a `nodes` key but no title is expected to be rejected with
    /// `MissingTitle`.
    #[test]
    fn json_schema_validation_rejects_missing_fields() {
        let outcome = validate_document_json_schema("{\"nodes\": []}");

        assert_eq!(outcome, Err(JsonSchemaError::MissingTitle));
    }

    /// Every supported format name parses to its variant; an unsupported name
    /// yields `None`.
    #[test]
    fn extract_format_parses_supported_names() {
        let cases = [
            ("markdown", Some(ExtractFormat::Markdown)),
            ("md", Some(ExtractFormat::Markdown)),
            ("links", Some(ExtractFormat::Links)),
            ("json", Some(ExtractFormat::Json)),
            ("xml", None),
        ];

        for (name, expected) in cases {
            assert_eq!(ExtractFormat::parse(name), expected);
        }
    }

    /// `wc -l` without a confirmation flag is expected to require confirmation.
    #[test]
    fn pipe_requires_confirmation_for_safe_programs() {
        let decision = classify_pipe_command("wc -l");

        let expected = PipeDecision::RequiresConfirmation(PipeCommand::new("wc -l"));
        assert_eq!(decision, expected);
    }

    /// With a leading `--confirm`, `jq .title` is expected to be allowed and
    /// the resulting command no longer carries the flag.
    #[test]
    fn pipe_allows_confirmed_safe_programs() {
        let decision = classify_pipe_command("--confirm jq .title");

        let expected = PipeDecision::Allowed(PipeCommand::new("jq .title"));
        assert_eq!(decision, expected);
    }

    /// The command `wc -l; rm -rf target` is expected to be denied as shell
    /// syntax.
    #[test]
    fn pipe_denies_shell_syntax_by_default() {
        let decision = classify_pipe_command("wc -l; rm -rf target");

        let expected = PipeDecision::Denied(PipeDeniedReason::ShellSyntax);
        assert_eq!(decision, expected);
    }

    /// `python script.py` is expected to be denied, naming `python` as the
    /// program that is not allowed.
    #[test]
    fn pipe_denies_unapproved_programs() {
        let decision = classify_pipe_command("python script.py");

        let expected =
            PipeDecision::Denied(PipeDeniedReason::ProgramNotAllowed(String::from("python")));
        assert_eq!(decision, expected);
    }
}