1use std::collections::BTreeSet;
7use std::fmt::{Display, Formatter};
8
9use index_core::{
10 ButtonAction, DocumentQuality, Form, IndexDocument, IndexNode, Input, Link, Metadata,
11 SectionRole,
12};
13
/// Default upper bound, in bytes, on extraction output (1 MiB).
///
/// Used by [`ExtractionLimits::default`].
pub const DEFAULT_MAX_EXTRACTION_BYTES: usize = 1_048_576;
16
/// Output formats supported by document extraction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractFormat {
    /// Markdown text.
    Markdown,
    /// Numbered, tab-separated link listing.
    Links,
    /// JSON serialization of the document.
    Json,
}

impl ExtractFormat {
    /// Parses a format name, ignoring surrounding whitespace and ASCII case.
    ///
    /// Recognizes `markdown`, `md`, `links` and `json`; any other input
    /// yields `None`.
    #[must_use]
    pub fn parse(input: &str) -> Option<Self> {
        const NAMES: [(&str, ExtractFormat); 4] = [
            ("markdown", ExtractFormat::Markdown),
            ("md", ExtractFormat::Markdown),
            ("links", ExtractFormat::Links),
            ("json", ExtractFormat::Json),
        ];

        let wanted = input.trim();
        NAMES
            .iter()
            .find(|(name, _)| wanted.eq_ignore_ascii_case(name))
            .map(|&(_, format)| format)
    }

    /// Returns the canonical lowercase name of the format.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match *self {
            Self::Json => "json",
            Self::Links => "links",
            Self::Markdown => "markdown",
        }
    }
}

impl Display for ExtractFormat {
    /// Writes the canonical name returned by [`ExtractFormat::as_str`].
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let name = self.as_str();
        f.write_str(name)
    }
}
56
/// Size limits applied to extraction output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ExtractionLimits {
    /// Largest permitted output size, in bytes.
    pub max_output_bytes: usize,
}

impl ExtractionLimits {
    /// Creates limits with the given maximum output size in bytes.
    #[must_use]
    pub const fn new(max_output_bytes: usize) -> Self {
        Self { max_output_bytes }
    }
}

impl Default for ExtractionLimits {
    /// Uses [`DEFAULT_MAX_EXTRACTION_BYTES`] as the output limit.
    fn default() -> Self {
        Self::new(DEFAULT_MAX_EXTRACTION_BYTES)
    }
}
77
/// Errors reported by bounded extraction.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExtractionError {
    /// The rendered output was larger than the configured limit.
    OutputTooLarge {
        /// Format that was being rendered.
        format: ExtractFormat,
        /// Configured maximum output size in bytes.
        limit: usize,
        /// Size in bytes of the output that was actually produced.
        actual: usize,
    },
}

impl Display for ExtractionError {
    /// Formats a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::OutputTooLarge {
                format,
                limit,
                actual,
            } => write!(
                f,
                "{format} extraction output too large: {actual} bytes exceeds limit {limit}"
            ),
        }
    }
}

impl std::error::Error for ExtractionError {}
108
109#[must_use]
111pub fn extract_document(document: &IndexDocument, format: ExtractFormat) -> String {
112 match format {
113 ExtractFormat::Markdown => extract_markdown(document),
114 ExtractFormat::Links => extract_links(document),
115 ExtractFormat::Json => extract_json(document),
116 }
117}
118
119pub fn try_extract_document(
121 document: &IndexDocument,
122 format: ExtractFormat,
123 limits: ExtractionLimits,
124) -> Result<String, ExtractionError> {
125 let output = extract_document(document, format);
126 let actual = output.len();
127 if actual > limits.max_output_bytes {
128 Err(ExtractionError::OutputTooLarge {
129 format,
130 limit: limits.max_output_bytes,
131 actual,
132 })
133 } else {
134 Ok(output)
135 }
136}
137
138#[must_use]
140pub fn extract_markdown(document: &IndexDocument) -> String {
141 let mut output = String::new();
142 if !document.title.trim().is_empty() {
143 output.push_str("# ");
144 output.push_str(document.title.trim());
145 output.push_str("\n\n");
146 }
147
148 for node in &document.nodes {
149 write_markdown_node(node, &mut output);
150 }
151
152 trim_trailing_blank_lines(&mut output);
153 output.push('\n');
154 output
155}
156
157#[must_use]
159pub fn extract_links(document: &IndexDocument) -> String {
160 let mut links = Vec::new();
161 collect_links_from_nodes(&document.nodes, &mut links);
162
163 let mut output = String::new();
164 for (index, link) in links.iter().enumerate() {
165 output.push_str(&(index + 1).to_string());
166 output.push('\t');
167 output.push_str(&link.text);
168 output.push('\t');
169 output.push_str(&link.href);
170 output.push('\n');
171 }
172 output
173}
174
/// One entry of the citation list produced by [`extract_citations`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Citation {
    /// 1-based position of the citation in the list.
    pub index: usize,
    /// Link text with surrounding whitespace removed.
    pub text: String,
    /// Link target with surrounding whitespace removed.
    pub href: String,
}
185
186#[must_use]
188pub fn extract_citations(document: &IndexDocument) -> Vec<Citation> {
189 let mut links = Vec::new();
190 collect_links_from_nodes(&document.nodes, &mut links);
191 let mut seen = BTreeSet::new();
192 let mut citations = Vec::new();
193
194 for link in links {
195 let href = link.href.trim();
196 if !(href.starts_with("http://") || href.starts_with("https://")) {
197 continue;
198 }
199 if !seen.insert(href.to_owned()) {
200 continue;
201 }
202 citations.push(Citation {
203 index: citations.len() + 1,
204 text: link.text.trim().to_owned(),
205 href: href.to_owned(),
206 });
207 }
208
209 citations
210}
211
212#[must_use]
214pub fn extract_citations_tsv(document: &IndexDocument) -> String {
215 let mut output = String::new();
216 for citation in extract_citations(document) {
217 output.push_str(&citation.index.to_string());
218 output.push('\t');
219 output.push_str(&citation.text);
220 output.push('\t');
221 output.push_str(&citation.href);
222 output.push('\n');
223 }
224 output
225}
226
227#[must_use]
229pub fn export_section_markdown(document: &IndexDocument, selector: &str) -> Option<String> {
230 let selector = selector.trim();
231 if selector.is_empty() {
232 return None;
233 }
234
235 let mut output = String::new();
236 if write_selected_section(&document.nodes, selector, &mut output) {
237 trim_trailing_blank_lines(&mut output);
238 output.push('\n');
239 Some(output)
240 } else {
241 None
242 }
243}
244
245#[must_use]
247pub fn extract_json(document: &IndexDocument) -> String {
248 let mut output = String::new();
249 output.push_str("{\n");
250 output.push_str(" \"title\": ");
251 push_json_string(&mut output, &document.title);
252 output.push_str(",\n");
253 output.push_str(" \"metadata\": ");
254 push_json_metadata(&mut output, &document.metadata, 2);
255 output.push_str(",\n");
256 output.push_str(" \"nodes\": [\n");
257 for (index, node) in document.nodes.iter().enumerate() {
258 output.push_str(" ");
259 push_json_node(&mut output, node);
260 if index + 1 != document.nodes.len() {
261 output.push(',');
262 }
263 output.push('\n');
264 }
265 output.push_str(" ]\n");
266 output.push_str("}\n");
267 output
268}
269
/// Reasons a JSON string fails [`validate_document_json_schema`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum JsonSchemaError {
    /// No `"title":` key was found.
    MissingTitle,
    /// No `"metadata":` key was found.
    MissingMetadata,
    /// No `"nodes":` key was found.
    MissingNodes,
    /// The nodes array is not empty, yet no object starting with `"type"` was found.
    MissingNodeType,
}

impl Display for JsonSchemaError {
    /// Formats a one-line, human-readable description of the failure.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MissingTitle => "document JSON is missing title",
            Self::MissingMetadata => "document JSON is missing metadata",
            Self::MissingNodes => "document JSON is missing nodes",
            Self::MissingNodeType => "document JSON node is missing type",
        };
        f.write_str(message)
    }
}

impl std::error::Error for JsonSchemaError {}

/// Checks that `json` looks like a serialized document.
///
/// This is a substring heuristic, not a JSON parser: it only looks for the
/// `"title":`, `"metadata":` and `"nodes":` keys and then for either an
/// object that begins with a `"type"` key or an empty `nodes` array. Input
/// that is not well-formed JSON can therefore pass.
///
/// # Errors
///
/// Returns the [`JsonSchemaError`] for the first missing key, checked in the
/// order title, metadata, nodes, or [`JsonSchemaError::MissingNodeType`] when
/// neither a typed node nor an empty `nodes` array is found.
pub fn validate_document_json_schema(json: &str) -> Result<(), JsonSchemaError> {
    let required_keys = [
        ("\"title\":", JsonSchemaError::MissingTitle),
        ("\"metadata\":", JsonSchemaError::MissingMetadata),
        ("\"nodes\":", JsonSchemaError::MissingNodes),
    ];
    for (key, error) in required_keys {
        if !json.contains(key) {
            return Err(error);
        }
    }

    // A marker with a leading space before `{"type"` would be redundant:
    // any text containing it also contains the first marker below.
    let has_typed_node = ["{\"type\"", "{ \"type\""]
        .iter()
        .any(|marker| json.contains(*marker));
    let nodes_are_empty = json.contains("\"nodes\": [\n ]") || json.contains("\"nodes\": []");

    if has_typed_node || nodes_are_empty {
        Ok(())
    } else {
        Err(JsonSchemaError::MissingNodeType)
    }
}
319
/// Outcome of classifying a pipe command with [`classify_pipe_command`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PipeDecision {
    /// The command passed validation and was given with the `--confirm` prefix.
    Allowed(PipeCommand),
    /// The command passed validation but was given without the `--confirm` prefix.
    RequiresConfirmation(PipeCommand),
    /// The command failed validation for the contained reason.
    Denied(PipeDeniedReason),
}
330
/// A pipe command line stored as text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PipeCommand {
    command: String,
}

impl PipeCommand {
    /// Wraps `command` without validating or normalizing it.
    #[must_use]
    pub fn new(command: impl Into<String>) -> Self {
        let command = command.into();
        Self { command }
    }

    /// Returns the command line exactly as it was stored.
    #[must_use]
    pub fn as_str(&self) -> &str {
        self.command.as_str()
    }
}
352
/// Why a pipe command was rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PipeDeniedReason {
    /// The command was empty.
    Empty,
    /// The command contained a character treated as shell syntax.
    ShellSyntax,
    /// The first word of the command is not an allowed program; carries that word.
    ProgramNotAllowed(String),
}

impl Display for PipeDeniedReason {
    /// Formats a one-line, human-readable description of the rejection.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        if let Self::ProgramNotAllowed(program) = self {
            return write!(f, "program is not allowed in pipe commands: {program}");
        }

        let message = match self {
            Self::Empty => "empty pipe command",
            _ => "shell syntax is not allowed in pipe commands",
        };
        f.write_str(message)
    }
}

impl std::error::Error for PipeDeniedReason {}
377
378#[must_use]
380pub fn classify_pipe_command(input: &str) -> PipeDecision {
381 let trimmed = input.trim();
382 let Some(command) = trimmed.strip_prefix("--confirm ") else {
383 return match validate_pipe_command(trimmed) {
384 Ok(()) => PipeDecision::RequiresConfirmation(PipeCommand::new(trimmed)),
385 Err(error) => PipeDecision::Denied(error),
386 };
387 };
388
389 match validate_pipe_command(command.trim()) {
390 Ok(()) => PipeDecision::Allowed(PipeCommand::new(command.trim())),
391 Err(error) => PipeDecision::Denied(error),
392 }
393}
394
395fn validate_pipe_command(command: &str) -> Result<(), PipeDeniedReason> {
396 if command.is_empty() {
397 return Err(PipeDeniedReason::Empty);
398 }
399 if command.chars().any(is_shell_syntax) {
400 return Err(PipeDeniedReason::ShellSyntax);
401 }
402
403 let program = command.split_whitespace().next().unwrap_or_default();
404 if allowed_pipe_program(program) {
405 Ok(())
406 } else {
407 Err(PipeDeniedReason::ProgramNotAllowed(program.to_owned()))
408 }
409}
410
/// Returns `true` when `program` is on the pipe allowlist.
///
/// The comparison is exact and case-sensitive; paths such as `/bin/cat` do
/// not match.
///
/// NOTE(review): only the program name is checked here, not its arguments.
/// Several of these tools have options that write files or start other
/// programs (for example `sort -o` and `rg --pre`) — confirm that whatever
/// executes the command accounts for that.
fn allowed_pipe_program(program: &str) -> bool {
    const ALLOWED: [&str; 11] = [
        "cat", "cut", "grep", "head", "jq", "rg", "sed", "sort", "tail", "uniq", "wc",
    ];
    ALLOWED.contains(&program)
}
417
/// Returns `true` for characters that pipe commands may not contain.
///
/// The set is `;`, `|`, `&`, `>`, `<`, backtick, `$`, `(`, `)`, line feed and
/// carriage return.
fn is_shell_syntax(ch: char) -> bool {
    const FORBIDDEN: &str = ";|&><`$()\n\r";
    FORBIDDEN.contains(ch)
}
424
425fn write_markdown_node(node: &IndexNode, output: &mut String) {
426 match node {
427 IndexNode::Heading { level, text } => {
428 output.push_str(&"#".repeat(usize::from((*level).clamp(1, 6))));
429 output.push(' ');
430 output.push_str(text.trim());
431 output.push_str("\n\n");
432 }
433 IndexNode::Paragraph(text) => {
434 output.push_str(text.trim());
435 output.push_str("\n\n");
436 }
437 IndexNode::Link(link) => {
438 output.push('[');
439 output.push_str(link.text.trim());
440 output.push_str("](");
441 output.push_str(link.href.trim());
442 output.push_str(")\n\n");
443 }
444 IndexNode::List { ordered, items } => {
445 for (index, item) in items.iter().enumerate() {
446 if *ordered {
447 output.push_str(&(index + 1).to_string());
448 output.push_str(". ");
449 } else {
450 output.push_str("- ");
451 }
452 output.push_str(item.trim());
453 output.push('\n');
454 }
455 output.push('\n');
456 }
457 IndexNode::CodeBlock { language, code } => {
458 output.push_str("```");
459 if let Some(language) = language {
460 output.push_str(language.trim());
461 }
462 output.push('\n');
463 output.push_str(code.trim_end());
464 output.push_str("\n```\n\n");
465 }
466 IndexNode::Table { rows } => write_markdown_table(rows, output),
467 IndexNode::Spacer { lines } => {
468 for _ in 0..(*lines).clamp(1, 3) {
469 output.push('\n');
470 }
471 }
472 IndexNode::Section {
473 role,
474 title,
475 collapsed,
476 nodes,
477 } => {
478 let marker = if *collapsed { "▸" } else { "▾" };
479 output.push_str("> ");
480 output.push_str(marker);
481 output.push(' ');
482 output.push_str(§ion_label(*role, title.as_deref()));
483 output.push_str(" (");
484 output.push_str(§ion_item_count(nodes).to_string());
485 output.push_str(" items)\n\n");
486 if !collapsed {
487 for node in nodes {
488 write_markdown_node(node, output);
489 }
490 }
491 }
492 IndexNode::Image { alt, src } => {
493 output.push_str(";
496 if let Some(src) = src {
497 output.push_str(src.trim());
498 }
499 output.push_str(")\n\n");
500 }
501 IndexNode::Form(form) => write_markdown_form(form, output),
502 IndexNode::Error(message) => {
503 output.push_str("> [error] ");
504 output.push_str(message.trim());
505 output.push_str("\n\n");
506 }
507 }
508}
509
/// Appends `rows` to `output` as pipe-delimited lines followed by a blank line.
///
/// Each cell is trimmed and written as ` cell |` after a leading `|`. No
/// header separator row is emitted and cell contents are not escaped. An
/// empty `rows` slice writes nothing at all.
fn write_markdown_table(rows: &[Vec<String>], output: &mut String) {
    if rows.is_empty() {
        return;
    }

    for cells in rows {
        let line: String = cells
            .iter()
            .map(|cell| format!(" {} |", cell.trim()))
            .collect();
        output.push('|');
        output.push_str(&line);
        output.push('\n');
    }
    output.push('\n');
}
526
527fn write_markdown_form(form: &Form, output: &mut String) {
528 output.push_str("> Form ");
529 output.push_str(&form.method);
530 output.push(' ');
531 output.push_str(&form.name);
532 output.push_str(" -> ");
533 output.push_str(&form.action);
534 output.push('\n');
535 for input in &form.inputs {
536 output.push_str("> - ");
537 output.push_str(&input.name);
538 output.push_str(" (");
539 output.push_str(&input.kind);
540 if input.required {
541 output.push_str(", required");
542 }
543 output.push_str(")\n");
544 }
545 output.push('\n');
546}
547
/// Removes every trailing `\n` from `output`.
///
/// Despite the name, the final line terminator is removed as well; callers
/// append a single `\n` afterwards. Other trailing whitespace is left in
/// place.
fn trim_trailing_blank_lines(output: &mut String) {
    // One scan finds the new length, then one truncate applies it.
    let kept = output.trim_end_matches('\n').len();
    output.truncate(kept);
}
556
557fn collect_links_from_nodes<'a>(nodes: &'a [IndexNode], links: &mut Vec<&'a Link>) {
558 for node in nodes {
559 if let IndexNode::Link(link) = node {
560 links.push(link);
561 } else if let IndexNode::Section { nodes, .. } = node {
562 collect_links_from_nodes(nodes, links);
563 }
564 }
565}
566
567fn write_selected_section(nodes: &[IndexNode], selector: &str, output: &mut String) -> bool {
568 if write_flat_heading_section(nodes, selector, output) {
569 return true;
570 }
571
572 for node in nodes {
573 if let IndexNode::Section { title, nodes, .. } = node {
574 if title
575 .as_deref()
576 .is_some_and(|title| text_matches(title, selector))
577 {
578 for node in nodes {
579 write_markdown_node(node, output);
580 }
581 return true;
582 }
583 if write_selected_section(nodes, selector, output) {
584 return true;
585 }
586 }
587 }
588
589 false
590}
591
592fn write_flat_heading_section(nodes: &[IndexNode], selector: &str, output: &mut String) -> bool {
593 let mut selected_level = None;
594 for node in nodes {
595 if let IndexNode::Heading { level, text } = node {
596 if selected_level.is_none() && text_matches(text, selector) {
597 selected_level = Some(*level);
598 } else if selected_level.is_some_and(|selected| *level <= selected) {
599 break;
600 }
601 }
602
603 if selected_level.is_some() {
604 write_markdown_node(node, output);
605 }
606 }
607
608 selected_level.is_some()
609}
610
/// Returns `true` when `text`, trimmed, equals `selector` ignoring ASCII case.
///
/// `selector` is compared as given; only `text` is trimmed.
fn text_matches(text: &str, selector: &str) -> bool {
    let candidate = text.trim();
    selector.eq_ignore_ascii_case(candidate)
}
614
615fn push_json_metadata(output: &mut String, metadata: &Metadata, indent: usize) {
616 output.push_str("{\n");
617 push_json_option_field(
618 output,
619 "canonical_url",
620 metadata.canonical_url.as_deref(),
621 indent + 2,
622 );
623 output.push_str(",\n");
624 push_json_option_field(output, "author", metadata.author.as_deref(), indent + 2);
625 output.push_str(",\n");
626 push_json_option_field(output, "language", metadata.language.as_deref(), indent + 2);
627 output.push_str(",\n");
628 push_json_option_field(
629 output,
630 "description",
631 metadata.description.as_deref(),
632 indent + 2,
633 );
634 output.push_str(",\n");
635 push_json_option_field(
636 output,
637 "open_graph_title",
638 metadata.open_graph_title.as_deref(),
639 indent + 2,
640 );
641 output.push_str(",\n");
642 push_json_option_field(
643 output,
644 "open_graph_description",
645 metadata.open_graph_description.as_deref(),
646 indent + 2,
647 );
648 output.push_str(",\n");
649 push_json_option_field(
650 output,
651 "adapter_id",
652 metadata.adapter_id.as_ref().map(|adapter| adapter.as_str()),
653 indent + 2,
654 );
655 output.push_str(",\n");
656 output.push_str(&" ".repeat(indent + 2));
657 output.push_str("\"quality\": ");
658 push_json_quality(output, metadata.quality.as_ref());
659 output.push('\n');
660 output.push_str(&" ".repeat(indent));
661 output.push('}');
662}
663
664fn push_json_quality(output: &mut String, quality: Option<&DocumentQuality>) {
665 if let Some(quality) = quality {
666 output.push_str("{\"category\": ");
667 push_json_string(output, quality.category.as_str());
668 output.push_str(", \"score\": ");
669 output.push_str(&quality.score.to_string());
670 output.push_str(", \"reasons\": ");
671 push_json_string_array(output, &quality.reasons);
672 output.push('}');
673 } else {
674 output.push_str("null");
675 }
676}
677
678fn push_json_option_field(output: &mut String, name: &str, value: Option<&str>, indent: usize) {
679 output.push_str(&" ".repeat(indent));
680 output.push('"');
681 output.push_str(name);
682 output.push_str("\": ");
683 push_json_option_string(output, value);
684}
685
/// Appends `node` to `output` as a single-line JSON object.
///
/// Every object starts with a `"type"` member naming the node kind, followed
/// by that kind's own members. Sections serialize their children
/// recursively, whether or not they are collapsed.
fn push_json_node(output: &mut String, node: &IndexNode) {
    match node {
        IndexNode::Heading { level, text } => {
            // The level is written as stored; it is not clamped here.
            output.push_str("{\"type\": \"heading\", \"level\": ");
            output.push_str(&level.to_string());
            output.push_str(", \"text\": ");
            push_json_string(output, text);
            output.push('}');
        }
        IndexNode::Paragraph(text) => {
            output.push_str("{\"type\": \"paragraph\", \"text\": ");
            push_json_string(output, text);
            output.push('}');
        }
        IndexNode::Link(link) => {
            output.push_str("{\"type\": \"link\", \"text\": ");
            push_json_string(output, &link.text);
            output.push_str(", \"href\": ");
            push_json_string(output, &link.href);
            output.push('}');
        }
        IndexNode::List { ordered, items } => {
            output.push_str("{\"type\": \"list\", \"ordered\": ");
            output.push_str(if *ordered { "true" } else { "false" });
            output.push_str(", \"items\": ");
            push_json_string_array(output, items);
            output.push('}');
        }
        IndexNode::CodeBlock { language, code } => {
            output.push_str("{\"type\": \"code_block\", \"language\": ");
            push_json_option_string(output, language.as_deref());
            output.push_str(", \"code\": ");
            push_json_string(output, code);
            output.push('}');
        }
        IndexNode::Table { rows } => {
            // `headers` and `row_labels` are derived from `rows`, which is
            // also written in full.
            output.push_str("{\"type\": \"table\", \"headers\": ");
            push_json_string_array(output, &table_headers(rows));
            output.push_str(", \"row_labels\": ");
            push_json_string_array(output, &table_row_labels(rows));
            output.push_str(", \"rows\": ");
            push_json_table(output, rows);
            output.push('}');
        }
        IndexNode::Spacer { lines } => {
            // The line count is clamped to 1..=3 before it is written.
            output.push_str("{\"type\": \"spacer\", \"lines\": ");
            output.push_str(&(*lines).clamp(1, 3).to_string());
            output.push('}');
        }
        IndexNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            output.push_str("{\"type\": \"section\", \"role\": ");
            push_json_string(output, role.as_str());
            output.push_str(", \"title\": ");
            push_json_option_string(output, title.as_deref());
            output.push_str(", \"collapsed\": ");
            output.push_str(if *collapsed { "true" } else { "false" });
            output.push_str(", \"nodes\": [");
            for (index, node) in nodes.iter().enumerate() {
                push_json_node(output, node);
                // Separate children with ", " but leave no trailing separator.
                if index + 1 != nodes.len() {
                    output.push_str(", ");
                }
            }
            output.push_str("]}");
        }
        IndexNode::Image { alt, src } => {
            output.push_str("{\"type\": \"image\", \"alt\": ");
            push_json_string(output, alt);
            output.push_str(", \"src\": ");
            push_json_option_string(output, src.as_deref());
            output.push('}');
        }
        IndexNode::Form(form) => push_json_form(output, form),
        IndexNode::Error(message) => {
            output.push_str("{\"type\": \"error\", \"message\": ");
            push_json_string(output, message);
            output.push('}');
        }
    }
}
771
772fn section_label(role: SectionRole, title: Option<&str>) -> String {
773 match title.map(str::trim).filter(|title| !title.is_empty()) {
774 Some(title) => format!("{}: {title}", role.as_str()),
775 None => role.as_str().to_owned(),
776 }
777}
778
779fn section_item_count(nodes: &[IndexNode]) -> usize {
780 nodes
781 .iter()
782 .filter(|node| !matches!(node, IndexNode::Spacer { .. }))
783 .count()
784}
785
786fn push_json_form(output: &mut String, form: &Form) {
787 output.push_str("{\"type\": \"form\", \"name\": ");
788 push_json_string(output, &form.name);
789 output.push_str(", \"method\": ");
790 push_json_string(output, &form.method);
791 output.push_str(", \"action\": ");
792 push_json_string(output, &form.action);
793 output.push_str(", \"inputs\": [");
794 for (index, input) in form.inputs.iter().enumerate() {
795 push_json_input(output, input);
796 if index + 1 != form.inputs.len() {
797 output.push_str(", ");
798 }
799 }
800 output.push_str("], \"buttons\": [");
801 for (index, button) in form.buttons.iter().enumerate() {
802 push_json_button(output, button);
803 if index + 1 != form.buttons.len() {
804 output.push_str(", ");
805 }
806 }
807 output.push_str("]}");
808}
809
810fn push_json_input(output: &mut String, input: &Input) {
811 output.push_str("{\"name\": ");
812 push_json_string(output, &input.name);
813 output.push_str(", \"kind\": ");
814 push_json_string(output, &input.kind);
815 output.push_str(", \"value\": ");
816 push_json_option_string(output, input.value.as_deref());
817 output.push_str(", \"required\": ");
818 output.push_str(if input.required { "true" } else { "false" });
819 output.push('}');
820}
821
822fn push_json_button(output: &mut String, button: &ButtonAction) {
823 output.push_str("{\"name\": ");
824 push_json_option_string(output, button.name.as_deref());
825 output.push_str(", \"value\": ");
826 push_json_option_string(output, button.value.as_deref());
827 output.push_str(", \"label\": ");
828 push_json_string(output, &button.label);
829 output.push('}');
830}
831
832fn push_json_string_array(output: &mut String, items: &[String]) {
833 output.push('[');
834 for (index, item) in items.iter().enumerate() {
835 push_json_string(output, item);
836 if index + 1 != items.len() {
837 output.push_str(", ");
838 }
839 }
840 output.push(']');
841}
842
843fn push_json_table(output: &mut String, rows: &[Vec<String>]) {
844 output.push('[');
845 for (row_index, row) in rows.iter().enumerate() {
846 push_json_string_array(output, row);
847 if row_index + 1 != rows.len() {
848 output.push_str(", ");
849 }
850 }
851 output.push(']');
852}
853
/// Returns a copy of the first row, which is treated as the table header.
///
/// An empty table yields an empty vector.
fn table_headers(rows: &[Vec<String>]) -> Vec<String> {
    match rows.first() {
        Some(header) => header.clone(),
        None => Vec::new(),
    }
}
857
/// Returns the first cell of every row after the header row.
///
/// Rows without cells are skipped, as are rows whose first cell is blank.
/// Labels are copied as stored, without trimming.
fn table_row_labels(rows: &[Vec<String>]) -> Vec<String> {
    let mut labels = Vec::new();
    for row in rows.iter().skip(1) {
        if let Some(label) = row.first() {
            if !label.trim().is_empty() {
                labels.push(label.clone());
            }
        }
    }
    labels
}
866
867fn push_json_option_string(output: &mut String, value: Option<&str>) {
868 if let Some(value) = value {
869 push_json_string(output, value);
870 } else {
871 output.push_str("null");
872 }
873}
874
/// Appends `value` to `output` as a quoted, escaped JSON string.
///
/// Quotes, backslashes and the common whitespace controls use their short
/// escapes. Any other control character is written as a lowercase `\uXXXX`
/// escape. All remaining characters are copied unchanged.
fn push_json_string(output: &mut String, value: &str) {
    output.push('"');
    for ch in value.chars() {
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\u{08}' => Some("\\b"),
            '\u{0c}' => Some("\\f"),
            _ => None,
        };

        match short_escape {
            Some(escape) => output.push_str(escape),
            None if ch.is_control() => {
                output.push_str("\\u");
                output.push_str(&format!("{:04x}", ch as u32));
            }
            None => output.push(ch),
        }
    }
    output.push('"');
}
895
#[cfg(test)]
mod tests {
    use index_core::{
        ButtonAction, DocumentQuality, DocumentQualityCategory, Form, IndexDocument, IndexNode,
        Input, Link, SectionRole,
    };

    use super::{
        ExtractFormat, ExtractionError, ExtractionLimits, JsonSchemaError, PipeDecision,
        PipeDeniedReason, classify_pipe_command, export_section_markdown, extract_citations,
        extract_citations_tsv, extract_json, extract_links, extract_markdown, try_extract_document,
        validate_document_json_schema,
    };

    /// Builds a document that contains one node of nearly every kind.
    fn fixture_document() -> IndexDocument {
        let mut document = IndexDocument::titled("Fixture");
        document.metadata.description = Some("Document description".to_owned());
        document.metadata.quality = Some(DocumentQuality::new(
            DocumentQualityCategory::StrongGeneric,
            82,
            ["generic reader emitted semantic content"],
        ));
        document.push(IndexNode::Heading {
            level: 2,
            text: "Overview".to_owned(),
        });
        document.push(IndexNode::Paragraph("Hello from Index.".to_owned()));
        document.push(IndexNode::Spacer { lines: 2 });
        document.push(IndexNode::Link(Link::new(
            "Docs",
            "https://example.com/docs",
        )));
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Site".to_owned()),
            collapsed: true,
            nodes: vec![IndexNode::Link(Link::new(
                "About",
                "https://example.com/about",
            ))],
        });
        document.push(IndexNode::List {
            ordered: true,
            items: vec!["First".to_owned(), "Second".to_owned()],
        });
        document.push(IndexNode::CodeBlock {
            language: Some("rust".to_owned()),
            code: "fn main() {}\n".to_owned(),
        });
        document.push(IndexNode::Table {
            rows: vec![
                vec!["Name".to_owned(), "Value".to_owned()],
                vec!["Index".to_owned(), "Semantic browser".to_owned()],
            ],
        });
        document.push(IndexNode::Image {
            alt: "Diagram".to_owned(),
            src: Some("diagram.png".to_owned()),
        });
        document.push(IndexNode::Form(Form {
            name: "search".to_owned(),
            method: "GET".to_owned(),
            action: "/search".to_owned(),
            inputs: vec![Input {
                name: "q".to_owned(),
                kind: "text".to_owned(),
                value: None,
                required: true,
            }],
            buttons: vec![ButtonAction {
                name: Some("go".to_owned()),
                value: Some("1".to_owned()),
                label: "Search".to_owned(),
            }],
        }));
        document
    }

    /// Pins the exact Markdown produced for the fixture, including the image line.
    #[test]
    fn markdown_snapshot_is_deterministic() {
        let markdown = extract_markdown(&fixture_document());
        assert_eq!(
            markdown,
            "# Fixture\n\n## Overview\n\nHello from Index.\n\n\n\n[Docs](https://example.com/docs)\n\n> ▸ navigation: Site (1 items)\n\n1. First\n2. Second\n\n```rust\nfn main() {}\n```\n\n| Name | Value |\n| Index | Semantic browser |\n\n![Diagram](diagram.png)\n\n> Form GET search -> /search\n> - q (text, required)\n"
        );
    }

    /// Links are numbered from 1 and include links nested in collapsed sections.
    #[test]
    fn links_use_stable_numeric_addresses() {
        let links = extract_links(&fixture_document());
        assert_eq!(
            links,
            "1\tDocs\thttps://example.com/docs\n2\tAbout\thttps://example.com/about\n"
        );
    }

    /// Duplicate hrefs and non-HTTP links are left out of the citation list.
    #[test]
    fn citations_use_external_links_once_in_document_order() {
        let mut document = fixture_document();
        document.push(IndexNode::Link(Link::new(
            "Docs duplicate",
            "https://example.com/docs",
        )));
        document.push(IndexNode::Link(Link::new("Local", "/local")));

        let citations = extract_citations(&document);

        assert_eq!(citations.len(), 2);
        assert_eq!(citations[0].index, 1);
        assert_eq!(citations[0].text, "Docs");
        assert_eq!(citations[0].href, "https://example.com/docs");
        assert_eq!(citations[1].text, "About");
        assert_eq!(
            extract_citations_tsv(&document),
            "1\tDocs\thttps://example.com/docs\n2\tAbout\thttps://example.com/about\n"
        );
    }

    /// A heading selection keeps deeper headings and stops at the next peer heading.
    #[test]
    fn selected_heading_section_exports_until_next_peer_heading() {
        let mut document = IndexDocument::titled("Sections");
        document.push(IndexNode::Heading {
            level: 2,
            text: "Keep".to_owned(),
        });
        document.push(IndexNode::Paragraph("selected".to_owned()));
        document.push(IndexNode::Heading {
            level: 3,
            text: "Nested".to_owned(),
        });
        document.push(IndexNode::Paragraph("still selected".to_owned()));
        document.push(IndexNode::Heading {
            level: 2,
            text: "Stop".to_owned(),
        });
        document.push(IndexNode::Paragraph("not selected".to_owned()));

        let exported = export_section_markdown(&document, "keep");

        assert_eq!(
            exported.as_deref(),
            Some("## Keep\n\nselected\n\n### Nested\n\nstill selected\n")
        );
    }

    /// Selecting a section by title exports its children without the summary line.
    #[test]
    fn selected_region_title_exports_section_nodes() {
        let mut document = IndexDocument::titled("Sections");
        document.push(IndexNode::Section {
            role: SectionRole::Main,
            title: Some("Article".to_owned()),
            collapsed: false,
            nodes: vec![IndexNode::Paragraph("body".to_owned())],
        });

        let exported = export_section_markdown(&document, "article");

        assert_eq!(exported.as_deref(), Some("body\n"));
    }

    /// The JSON export passes the schema check and carries the expected members.
    #[test]
    fn json_output_validates_against_document_schema() {
        let json = extract_json(&fixture_document());
        assert!(validate_document_json_schema(&json).is_ok());
        assert!(json.contains("\"type\": \"spacer\""));
        assert!(json.contains("\"type\": \"section\""));
        assert!(json.contains("\"type\": \"form\""));
        assert!(json.contains("\"quality\": {\"category\": \"strong-generic\", \"score\": 82"));
        assert!(json.contains("\"headers\": [\"Name\", \"Value\"]"));
        assert!(json.contains("\"row_labels\": [\"Index\"]"));
        assert!(json.contains("\"required\": true"));
    }

    /// Output larger than the limit is reported with the format, limit and actual size.
    #[test]
    fn bounded_extraction_rejects_oversized_output() {
        let mut document = IndexDocument::titled("Large export");
        document.push(IndexNode::Paragraph("x".repeat(256)));

        let result = try_extract_document(
            &document,
            ExtractFormat::Markdown,
            ExtractionLimits::new(32),
        );

        assert!(matches!(
            result,
            Err(ExtractionError::OutputTooLarge {
                format: ExtractFormat::Markdown,
                limit: 32,
                actual
            }) if actual > 32
        ));
    }

    /// A document without a title key is rejected before anything else is checked.
    #[test]
    fn json_schema_validation_rejects_missing_fields() {
        let result = validate_document_json_schema("{\"nodes\": []}");
        assert_eq!(result, Err(JsonSchemaError::MissingTitle));
    }

    /// Every supported format name parses; unknown names do not.
    #[test]
    fn extract_format_parses_supported_names() {
        assert_eq!(
            ExtractFormat::parse("markdown"),
            Some(ExtractFormat::Markdown)
        );
        assert_eq!(ExtractFormat::parse("md"), Some(ExtractFormat::Markdown));
        assert_eq!(ExtractFormat::parse("links"), Some(ExtractFormat::Links));
        assert_eq!(ExtractFormat::parse("json"), Some(ExtractFormat::Json));
        assert_eq!(ExtractFormat::parse("xml"), None);
    }

    /// An allowed program given without `--confirm` still needs confirmation.
    #[test]
    fn pipe_requires_confirmation_for_safe_programs() {
        assert_eq!(
            classify_pipe_command("wc -l"),
            PipeDecision::RequiresConfirmation(super::PipeCommand::new("wc -l"))
        );
    }

    /// The `--confirm` prefix is removed from the allowed command.
    #[test]
    fn pipe_allows_confirmed_safe_programs() {
        assert_eq!(
            classify_pipe_command("--confirm jq .title"),
            PipeDecision::Allowed(super::PipeCommand::new("jq .title"))
        );
    }

    /// Shell metacharacters are rejected even when the program is allowed.
    #[test]
    fn pipe_denies_shell_syntax_by_default() {
        assert_eq!(
            classify_pipe_command("wc -l; rm -rf target"),
            PipeDecision::Denied(PipeDeniedReason::ShellSyntax)
        );
    }

    /// Programs outside the allowlist are denied and named in the reason.
    #[test]
    fn pipe_denies_unapproved_programs() {
        assert_eq!(
            classify_pipe_command("python script.py"),
            PipeDecision::Denied(PipeDeniedReason::ProgramNotAllowed("python".to_owned()))
        );
    }
}