use std::collections::BTreeSet;
use std::fmt::{Display, Formatter};
use index_core::{
ButtonAction, DocumentQuality, Form, IndexDocument, IndexNode, Input, Link, Metadata,
SectionRole,
};
/// Default upper bound, in bytes, on rendered extraction output (1 MiB).
pub const DEFAULT_MAX_EXTRACTION_BYTES: usize = 1024 * 1024;
/// Output formats a document can be extracted into.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractFormat {
    /// Markdown text.
    Markdown,
    /// Link listing.
    Links,
    /// JSON rendering.
    Json,
}

impl ExtractFormat {
    /// Parses a format name, ignoring surrounding whitespace and ASCII case.
    ///
    /// Recognises `markdown` (alias `md`), `links` and `json`; anything else
    /// yields `None`.
    #[must_use]
    pub fn parse(input: &str) -> Option<Self> {
        let name = input.trim();
        let named = |candidate: &str| name.eq_ignore_ascii_case(candidate);
        if named("markdown") || named("md") {
            Some(Self::Markdown)
        } else if named("links") {
            Some(Self::Links)
        } else if named("json") {
            Some(Self::Json)
        } else {
            None
        }
    }

    /// Returns the canonical lowercase name of the format.
    #[must_use]
    pub const fn as_str(&self) -> &'static str {
        match *self {
            ExtractFormat::Markdown => "markdown",
            ExtractFormat::Links => "links",
            ExtractFormat::Json => "json",
        }
    }
}

impl Display for ExtractFormat {
    /// Writes the canonical name as-is (formatter width/fill are not applied).
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let name = self.as_str();
        f.write_str(name)
    }
}
/// Size limits applied when extracting a document.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ExtractionLimits {
    /// Largest accepted output size, in bytes.
    pub max_output_bytes: usize,
}

impl ExtractionLimits {
    /// Builds limits with the given maximum output size in bytes.
    #[must_use]
    pub const fn new(max_output_bytes: usize) -> Self {
        ExtractionLimits { max_output_bytes }
    }
}

impl Default for ExtractionLimits {
    /// Uses [`DEFAULT_MAX_EXTRACTION_BYTES`] as the output ceiling.
    fn default() -> Self {
        Self {
            max_output_bytes: DEFAULT_MAX_EXTRACTION_BYTES,
        }
    }
}
/// Failures reported by bounded extraction.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExtractionError {
    /// The rendered output was larger than the configured limit.
    OutputTooLarge {
        /// Format that was being rendered.
        format: ExtractFormat,
        /// Configured maximum size in bytes.
        limit: usize,
        /// Size of the rendered output in bytes.
        actual: usize,
    },
}

impl Display for ExtractionError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Single-variant enum: an irrefutable destructure keeps this exhaustive.
        let Self::OutputTooLarge {
            format,
            limit,
            actual,
        } = self;
        write!(
            f,
            "{format} extraction output too large: {actual} bytes exceeds limit {limit}"
        )
    }
}

impl std::error::Error for ExtractionError {}
#[must_use]
pub fn extract_document(document: &IndexDocument, format: ExtractFormat) -> String {
match format {
ExtractFormat::Markdown => extract_markdown(document),
ExtractFormat::Links => extract_links(document),
ExtractFormat::Json => extract_json(document),
}
}
/// Renders `document` in `format` and rejects output larger than `limits`.
///
/// The output is rendered in full first; its byte length is then compared
/// against `limits.max_output_bytes`.
///
/// # Errors
///
/// Returns [`ExtractionError::OutputTooLarge`] when the rendered output is
/// strictly larger than the limit.
pub fn try_extract_document(
    document: &IndexDocument,
    format: ExtractFormat,
    limits: ExtractionLimits,
) -> Result<String, ExtractionError> {
    let rendered = extract_document(document, format);
    let limit = limits.max_output_bytes;
    match rendered.len() {
        actual if actual > limit => Err(ExtractionError::OutputTooLarge {
            format,
            limit,
            actual,
        }),
        _ => Ok(rendered),
    }
}
/// Renders the document as Markdown.
///
/// A non-blank title becomes a level-1 heading, every top-level node is
/// rendered in order, and the result ends with exactly one newline.
#[must_use]
pub fn extract_markdown(document: &IndexDocument) -> String {
    let mut rendered = String::new();
    let title = document.title.trim();
    if !title.is_empty() {
        rendered.push_str("# ");
        rendered.push_str(title);
        rendered.push_str("\n\n");
    }
    for node in document.nodes.iter() {
        write_markdown_node(node, &mut rendered);
    }
    // Normalise the tail: drop every trailing newline, then add one back.
    trim_trailing_blank_lines(&mut rendered);
    rendered.push('\n');
    rendered
}
/// Lists every link in the document, one per line, as
/// `<number>\t<text>\t<href>\n` with numbering starting at 1.
///
/// Link text and href are emitted exactly as stored.
#[must_use]
pub fn extract_links(document: &IndexDocument) -> String {
    let mut found = Vec::new();
    collect_links_from_nodes(&document.nodes, &mut found);

    let mut listing = String::new();
    for (number, link) in (1usize..).zip(&found) {
        let number = number.to_string();
        listing.push_str(&number);
        listing.push('\t');
        listing.push_str(&link.text);
        listing.push('\t');
        listing.push_str(&link.href);
        listing.push('\n');
    }
    listing
}
/// A numbered reference to an `http://` or `https://` link, as produced by
/// `extract_citations`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Citation {
/// 1-based position of this citation in the returned list.
pub index: usize,
/// Link text with surrounding whitespace removed.
pub text: String,
/// Link target with surrounding whitespace removed.
pub href: String,
}
/// Collects the document's external links as numbered citations.
///
/// Only links whose trimmed href starts with `http://` or `https://` are
/// kept. Each distinct href is cited once, at its first occurrence, and
/// citations are numbered from 1 in document order.
#[must_use]
pub fn extract_citations(document: &IndexDocument) -> Vec<Citation> {
    let mut links = Vec::new();
    collect_links_from_nodes(&document.nodes, &mut links);

    let mut seen: BTreeSet<&str> = BTreeSet::new();
    let mut citations = Vec::new();
    for link in links {
        let href = link.href.trim();
        let is_external = href.starts_with("http://") || href.starts_with("https://");
        // `insert` returns false for an href that was already cited.
        if is_external && seen.insert(href) {
            let index = citations.len() + 1;
            citations.push(Citation {
                index,
                text: link.text.trim().to_owned(),
                href: href.to_owned(),
            });
        }
    }
    citations
}
/// Renders the citations of `document` as tab-separated lines of the form
/// `<index>\t<text>\t<href>\n`.
#[must_use]
pub fn extract_citations_tsv(document: &IndexDocument) -> String {
    extract_citations(document)
        .into_iter()
        .fold(String::new(), |mut tsv, citation| {
            tsv.push_str(&citation.index.to_string());
            tsv.push('\t');
            tsv.push_str(&citation.text);
            tsv.push('\t');
            tsv.push_str(&citation.href);
            tsv.push('\n');
            tsv
        })
}
/// Exports the section selected by `selector` as Markdown.
///
/// The selector is trimmed first. Returns `None` when it is blank or when no
/// heading or section title matches; otherwise the Markdown ends with exactly
/// one newline.
#[must_use]
pub fn export_section_markdown(document: &IndexDocument, selector: &str) -> Option<String> {
    let selector = selector.trim();
    if selector.is_empty() {
        return None;
    }

    let mut section = String::new();
    if !write_selected_section(&document.nodes, selector, &mut section) {
        return None;
    }
    trim_trailing_blank_lines(&mut section);
    section.push('\n');
    Some(section)
}
/// Renders the document as a JSON object with `title`, `metadata` and
/// `nodes` members, one top-level node per line.
#[must_use]
pub fn extract_json(document: &IndexDocument) -> String {
    let mut json = String::new();
    json.push_str("{\n");

    json.push_str(" \"title\": ");
    push_json_string(&mut json, &document.title);
    json.push_str(",\n");

    json.push_str(" \"metadata\": ");
    push_json_metadata(&mut json, &document.metadata, 2);
    json.push_str(",\n");

    json.push_str(" \"nodes\": [\n");
    let total = document.nodes.len();
    for (position, node) in document.nodes.iter().enumerate() {
        json.push_str(" ");
        push_json_node(&mut json, node);
        // Every node except the last is followed by a comma.
        let is_last = position + 1 == total;
        if !is_last {
            json.push(',');
        }
        json.push('\n');
    }
    json.push_str(" ]\n");

    json.push_str("}\n");
    json
}
/// Reasons a JSON string fails the document schema check.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum JsonSchemaError {
    /// No `"title":` key was found.
    MissingTitle,
    /// No `"metadata":` key was found.
    MissingMetadata,
    /// No `"nodes":` key was found.
    MissingNodes,
    /// Nodes appear to be present but none carries a `"type"` key.
    MissingNodeType,
}

impl Display for JsonSchemaError {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::MissingTitle => "document JSON is missing title",
            Self::MissingMetadata => "document JSON is missing metadata",
            Self::MissingNodes => "document JSON is missing nodes",
            Self::MissingNodeType => "document JSON node is missing type",
        };
        f.write_str(message)
    }
}

impl std::error::Error for JsonSchemaError {}

/// Checks, by substring search only, that `json` looks like a document export.
///
/// The text is not parsed. The three required keys are checked in the order
/// title, metadata, nodes; then either some object must open with a `"type"`
/// key or the nodes array must be spelled as empty.
///
/// # Errors
///
/// Returns the [`JsonSchemaError`] variant for the first check that fails.
pub fn validate_document_json_schema(json: &str) -> Result<(), JsonSchemaError> {
    let required_keys = [
        ("\"title\":", JsonSchemaError::MissingTitle),
        ("\"metadata\":", JsonSchemaError::MissingMetadata),
        ("\"nodes\":", JsonSchemaError::MissingNodes),
    ];
    for (key, missing) in required_keys {
        if !json.contains(key) {
            return Err(missing);
        }
    }

    let typed_node_markers = ["{\"type\"", "{ \"type\"", " {\"type\""];
    let empty_nodes_markers = ["\"nodes\": [\n ]", "\"nodes\": []"];
    let accepted = typed_node_markers
        .iter()
        .chain(empty_nodes_markers.iter())
        .any(|marker| json.contains(*marker));

    if accepted {
        Ok(())
    } else {
        Err(JsonSchemaError::MissingNodeType)
    }
}
/// Outcome of classifying a requested pipe command.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PipeDecision {
    /// The command passed validation and carried the `--confirm ` prefix.
    Allowed(PipeCommand),
    /// The command passed validation but was not confirmed.
    RequiresConfirmation(PipeCommand),
    /// The command failed validation.
    Denied(PipeDeniedReason),
}

/// A pipe command line that passed validation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PipeCommand {
    command: String,
}

impl PipeCommand {
    /// Wraps `command` without validating it.
    #[must_use]
    pub fn new(command: impl Into<String>) -> Self {
        let command = command.into();
        Self { command }
    }

    /// Returns the command line text.
    #[must_use]
    pub fn as_str(&self) -> &str {
        self.command.as_str()
    }
}

/// Why a pipe command was rejected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PipeDeniedReason {
    /// The command was empty.
    Empty,
    /// The command contained a character treated as shell syntax.
    ShellSyntax,
    /// The first word of the command is not on the allow-list.
    ProgramNotAllowed(String),
}

impl Display for PipeDeniedReason {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ProgramNotAllowed(program) => {
                write!(f, "program is not allowed in pipe commands: {program}")
            }
            Self::Empty => f.write_str("empty pipe command"),
            Self::ShellSyntax => f.write_str("shell syntax is not allowed in pipe commands"),
        }
    }
}

impl std::error::Error for PipeDeniedReason {}

/// Classifies a pipe command typed by the user.
///
/// The input is trimmed. A leading `--confirm ` prefix marks the command as
/// confirmed; the remainder is trimmed again. A command that fails validation
/// is denied whether or not it was confirmed.
#[must_use]
pub fn classify_pipe_command(input: &str) -> PipeDecision {
    let trimmed = input.trim();
    let (command, confirmed) = match trimmed.strip_prefix("--confirm ") {
        Some(rest) => (rest.trim(), true),
        None => (trimmed, false),
    };
    match validate_pipe_command(command) {
        Err(reason) => PipeDecision::Denied(reason),
        Ok(()) if confirmed => PipeDecision::Allowed(PipeCommand::new(command)),
        Ok(()) => PipeDecision::RequiresConfirmation(PipeCommand::new(command)),
    }
}

/// Validates a command line: it must be non-empty, free of shell-syntax
/// characters, and start with an allow-listed program. Checks run in that order.
fn validate_pipe_command(command: &str) -> Result<(), PipeDeniedReason> {
    if command.is_empty() {
        return Err(PipeDeniedReason::Empty);
    }
    if command.contains(is_shell_syntax) {
        return Err(PipeDeniedReason::ShellSyntax);
    }
    let program = command.split_whitespace().next().unwrap_or("");
    if !allowed_pipe_program(program) {
        return Err(PipeDeniedReason::ProgramNotAllowed(program.to_owned()));
    }
    Ok(())
}

/// Returns whether `program` is exactly one of the allow-listed program names.
fn allowed_pipe_program(program: &str) -> bool {
    const ALLOWED_PROGRAMS: [&str; 11] = [
        "cat", "cut", "grep", "head", "jq", "rg", "sed", "sort", "tail", "uniq", "wc",
    ];
    ALLOWED_PROGRAMS.contains(&program)
}

/// Returns whether `ch` is one of the characters rejected as shell syntax.
fn is_shell_syntax(ch: char) -> bool {
    const SHELL_SYNTAX: [char; 11] = [';', '|', '&', '>', '<', '`', '$', '(', ')', '\n', '\r'];
    SHELL_SYNTAX.contains(&ch)
}
fn write_markdown_node(node: &IndexNode, output: &mut String) {
match node {
IndexNode::Heading { level, text } => {
output.push_str(&"#".repeat(usize::from((*level).clamp(1, 6))));
output.push(' ');
output.push_str(text.trim());
output.push_str("\n\n");
}
IndexNode::Paragraph(text) => {
output.push_str(text.trim());
output.push_str("\n\n");
}
IndexNode::Link(link) => {
output.push('[');
output.push_str(link.text.trim());
output.push_str("](");
output.push_str(link.href.trim());
output.push_str(")\n\n");
}
IndexNode::List { ordered, items } => {
for (index, item) in items.iter().enumerate() {
if *ordered {
output.push_str(&(index + 1).to_string());
output.push_str(". ");
} else {
output.push_str("- ");
}
output.push_str(item.trim());
output.push('\n');
}
output.push('\n');
}
IndexNode::CodeBlock { language, code } => {
output.push_str("```");
if let Some(language) = language {
output.push_str(language.trim());
}
output.push('\n');
output.push_str(code.trim_end());
output.push_str("\n```\n\n");
}
IndexNode::Table { rows } => write_markdown_table(rows, output),
IndexNode::Spacer { lines } => {
for _ in 0..(*lines).clamp(1, 3) {
output.push('\n');
}
}
IndexNode::Section {
role,
title,
collapsed,
nodes,
} => {
let marker = if *collapsed { "â–¸" } else { "â–¾" };
output.push_str("> ");
output.push_str(marker);
output.push(' ');
output.push_str(§ion_label(*role, title.as_deref()));
output.push_str(" (");
output.push_str(§ion_item_count(nodes).to_string());
output.push_str(" items)\n\n");
if !collapsed {
for node in nodes {
write_markdown_node(node, output);
}
}
}
IndexNode::Image { alt, src } => {
output.push_str(";
if let Some(src) = src {
output.push_str(src.trim());
}
output.push_str(")\n\n");
}
IndexNode::Form(form) => write_markdown_form(form, output),
IndexNode::Error(message) => {
output.push_str("> [error] ");
output.push_str(message.trim());
output.push_str("\n\n");
}
}
}
/// Appends `rows` as pipe-delimited lines followed by a blank line.
///
/// Each cell is trimmed and written as ` cell |` after a leading `|`.
/// Nothing is written for an empty table.
fn write_markdown_table(rows: &[Vec<String>], output: &mut String) {
    for cells in rows {
        let line: String = cells
            .iter()
            .map(|cell| format!(" {} |", cell.trim()))
            .collect();
        output.push('|');
        output.push_str(&line);
        output.push('\n');
    }
    if !rows.is_empty() {
        output.push('\n');
    }
}
/// Appends a blockquote summary of `form`: a `> Form <method> <name> -> <action>`
/// line, then one `> - <name> (<kind>[, required])` line per input, then a
/// blank line. Buttons are not rendered.
fn write_markdown_form(form: &Form, output: &mut String) {
    let header: [&str; 7] = [
        "> Form ",
        &form.method,
        " ",
        &form.name,
        " -> ",
        &form.action,
        "\n",
    ];
    for piece in header {
        output.push_str(piece);
    }

    for field in &form.inputs {
        let suffix = if field.required { ", required)\n" } else { ")\n" };
        output.push_str("> - ");
        output.push_str(&field.name);
        output.push_str(" (");
        output.push_str(&field.kind);
        output.push_str(suffix);
    }
    output.push('\n');
}
/// Removes every trailing `\n` from `output` in place.
///
/// Only line-feed characters are stripped; other trailing whitespace stays.
fn trim_trailing_blank_lines(output: &mut String) {
    let kept = output.trim_end_matches('\n').len();
    output.truncate(kept);
}
/// Appends references to every link in `nodes` to `links`, in document
/// order, descending into sections (collapsed or not).
fn collect_links_from_nodes<'a>(nodes: &'a [IndexNode], links: &mut Vec<&'a Link>) {
    for node in nodes {
        match node {
            IndexNode::Link(link) => links.push(link),
            IndexNode::Section {
                nodes: children, ..
            } => collect_links_from_nodes(children, links),
            _ => {}
        }
    }
}
/// Writes the section selected by `selector` to `output`; returns whether
/// anything matched.
///
/// At each level a matching heading wins first. Otherwise sections are
/// visited in order: a section whose title matches contributes its children,
/// and any other section is searched recursively.
fn write_selected_section(nodes: &[IndexNode], selector: &str, output: &mut String) -> bool {
    if write_flat_heading_section(nodes, selector, output) {
        return true;
    }

    for candidate in nodes {
        let IndexNode::Section {
            title,
            nodes: children,
            ..
        } = candidate
        else {
            continue;
        };

        let title_matches = matches!(title.as_deref(), Some(title) if text_matches(title, selector));
        if title_matches {
            for child in children {
                write_markdown_node(child, output);
            }
            return true;
        }
        if write_selected_section(children, selector, output) {
            return true;
        }
    }
    false
}
/// Writes the run of sibling nodes that starts at the first heading matching
/// `selector` and stops before the next heading of the same or a shallower
/// level. Returns whether a matching heading was found; nothing is written
/// when there is none.
fn write_flat_heading_section(nodes: &[IndexNode], selector: &str, output: &mut String) -> bool {
    let anchor = nodes
        .iter()
        .enumerate()
        .find_map(|(position, node)| match node {
            IndexNode::Heading { level, text } if text_matches(text, selector) => {
                Some((position, *level))
            }
            _ => None,
        });
    let Some((start, selected)) = anchor else {
        return false;
    };

    write_markdown_node(&nodes[start], output);
    for node in &nodes[start + 1..] {
        // A peer or parent heading closes the selected section.
        if matches!(node, IndexNode::Heading { level, .. } if *level <= selected) {
            break;
        }
        write_markdown_node(node, output);
    }
    true
}
/// Returns whether `text`, once trimmed, equals `selector` ignoring ASCII
/// case. `selector` is compared as given (it is not trimmed here).
fn text_matches(text: &str, selector: &str) -> bool {
    let candidate = text.trim();
    selector.eq_ignore_ascii_case(candidate)
}
/// Appends `metadata` as a multi-line JSON object.
///
/// Fields are written at `indent + 2` spaces and the closing brace at
/// `indent` spaces. The optional string fields come first, in a fixed order,
/// followed by `quality`.
fn push_json_metadata(output: &mut String, metadata: &Metadata, indent: usize) {
    let field_indent = indent + 2;
    let string_fields: [(&str, Option<&str>); 7] = [
        ("canonical_url", metadata.canonical_url.as_deref()),
        ("author", metadata.author.as_deref()),
        ("language", metadata.language.as_deref()),
        ("description", metadata.description.as_deref()),
        ("open_graph_title", metadata.open_graph_title.as_deref()),
        (
            "open_graph_description",
            metadata.open_graph_description.as_deref(),
        ),
        (
            "adapter_id",
            metadata.adapter_id.as_ref().map(|adapter| adapter.as_str()),
        ),
    ];

    output.push_str("{\n");
    for (name, value) in string_fields {
        push_json_option_field(output, name, value, field_indent);
        output.push_str(",\n");
    }
    output.push_str(&" ".repeat(field_indent));
    output.push_str("\"quality\": ");
    push_json_quality(output, metadata.quality.as_ref());
    output.push('\n');
    output.push_str(&" ".repeat(indent));
    output.push('}');
}
/// Appends `quality` as a single-line JSON object with `category`, `score`
/// and `reasons`, or `null` when absent.
fn push_json_quality(output: &mut String, quality: Option<&DocumentQuality>) {
    let Some(quality) = quality else {
        output.push_str("null");
        return;
    };
    output.push_str("{\"category\": ");
    push_json_string(output, quality.category.as_str());
    output.push_str(", \"score\": ");
    let score = quality.score.to_string();
    output.push_str(&score);
    output.push_str(", \"reasons\": ");
    push_json_string_array(output, &quality.reasons);
    output.push('}');
}
/// Appends `"<name>": <value>` indented by `indent` spaces, where the value
/// is a JSON string or `null`. `name` is written without escaping.
fn push_json_option_field(output: &mut String, name: &str, value: Option<&str>, indent: usize) {
    let padding = " ".repeat(indent);
    output.push_str(&padding);
    for piece in ["\"", name, "\": "] {
        output.push_str(piece);
    }
    push_json_option_string(output, value);
}
// Appends `node` to `output` as a single-line JSON object.
//
// Every object starts with a "type" member naming the node kind, followed by
// that kind's fields in a fixed order. Sections recurse into their children.
fn push_json_node(output: &mut String, node: &IndexNode) {
match node {
IndexNode::Heading { level, text } => {
output.push_str("{\"type\": \"heading\", \"level\": ");
// The level is written as stored; it is not clamped here.
output.push_str(&level.to_string());
output.push_str(", \"text\": ");
push_json_string(output, text);
output.push('}');
}
IndexNode::Paragraph(text) => {
output.push_str("{\"type\": \"paragraph\", \"text\": ");
push_json_string(output, text);
output.push('}');
}
IndexNode::Link(link) => {
output.push_str("{\"type\": \"link\", \"text\": ");
push_json_string(output, &link.text);
output.push_str(", \"href\": ");
push_json_string(output, &link.href);
output.push('}');
}
IndexNode::List { ordered, items } => {
output.push_str("{\"type\": \"list\", \"ordered\": ");
output.push_str(if *ordered { "true" } else { "false" });
output.push_str(", \"items\": ");
push_json_string_array(output, items);
output.push('}');
}
IndexNode::CodeBlock { language, code } => {
output.push_str("{\"type\": \"code_block\", \"language\": ");
push_json_option_string(output, language.as_deref());
output.push_str(", \"code\": ");
push_json_string(output, code);
output.push('}');
}
IndexNode::Table { rows } => {
// "headers" is the first row and "row_labels" the first cell of each later
// row; "rows" still contains every row, including the first.
output.push_str("{\"type\": \"table\", \"headers\": ");
push_json_string_array(output, &table_headers(rows));
output.push_str(", \"row_labels\": ");
push_json_string_array(output, &table_row_labels(rows));
output.push_str(", \"rows\": ");
push_json_table(output, rows);
output.push('}');
}
IndexNode::Spacer { lines } => {
output.push_str("{\"type\": \"spacer\", \"lines\": ");
// The reported line count is clamped to the range 1..=3.
output.push_str(&(*lines).clamp(1, 3).to_string());
output.push('}');
}
IndexNode::Section {
role,
title,
collapsed,
nodes,
} => {
output.push_str("{\"type\": \"section\", \"role\": ");
push_json_string(output, role.as_str());
output.push_str(", \"title\": ");
push_json_option_string(output, title.as_deref());
output.push_str(", \"collapsed\": ");
output.push_str(if *collapsed { "true" } else { "false" });
output.push_str(", \"nodes\": [");
// Children are serialized whether or not the section is collapsed.
for (index, node) in nodes.iter().enumerate() {
push_json_node(output, node);
if index + 1 != nodes.len() {
output.push_str(", ");
}
}
output.push_str("]}");
}
IndexNode::Image { alt, src } => {
output.push_str("{\"type\": \"image\", \"alt\": ");
push_json_string(output, alt);
output.push_str(", \"src\": ");
push_json_option_string(output, src.as_deref());
output.push('}');
}
IndexNode::Form(form) => push_json_form(output, form),
IndexNode::Error(message) => {
output.push_str("{\"type\": \"error\", \"message\": ");
push_json_string(output, message);
output.push('}');
}
}
}
/// Builds the display label for a section: `<role>: <title>` when the
/// trimmed title is non-empty, otherwise just `<role>`.
fn section_label(role: SectionRole, title: Option<&str>) -> String {
    match title.map(str::trim) {
        Some(title) if !title.is_empty() => format!("{}: {title}", role.as_str()),
        _ => role.as_str().to_owned(),
    }
}
/// Counts the direct children in `nodes` that are not spacers.
fn section_item_count(nodes: &[IndexNode]) -> usize {
    let mut items = 0;
    for node in nodes {
        if let IndexNode::Spacer { .. } = node {
            continue;
        }
        items += 1;
    }
    items
}
/// Appends `form` as a single-line JSON object of type `form`, with its
/// name, method, action, inputs and buttons.
fn push_json_form(output: &mut String, form: &Form) {
    output.push_str("{\"type\": \"form\", \"name\": ");
    push_json_string(output, &form.name);
    output.push_str(", \"method\": ");
    push_json_string(output, &form.method);
    output.push_str(", \"action\": ");
    push_json_string(output, &form.action);

    output.push_str(", \"inputs\": [");
    for (position, input) in form.inputs.iter().enumerate() {
        // Separator goes before every element except the first.
        if position > 0 {
            output.push_str(", ");
        }
        push_json_input(output, input);
    }

    output.push_str("], \"buttons\": [");
    for (position, button) in form.buttons.iter().enumerate() {
        if position > 0 {
            output.push_str(", ");
        }
        push_json_button(output, button);
    }
    output.push_str("]}");
}
/// Appends a form input as a JSON object with `name`, `kind`, `value`
/// (string or `null`) and `required`.
fn push_json_input(output: &mut String, input: &Input) {
    let required = if input.required { "true" } else { "false" };

    output.push_str("{\"name\": ");
    push_json_string(output, &input.name);
    output.push_str(", \"kind\": ");
    push_json_string(output, &input.kind);
    output.push_str(", \"value\": ");
    push_json_option_string(output, input.value.as_deref());
    output.push_str(", \"required\": ");
    output.push_str(required);
    output.push('}');
}
/// Appends a form button as a JSON object with `name` and `value` (each a
/// string or `null`) and `label`.
fn push_json_button(output: &mut String, button: &ButtonAction) {
    let optional_members = [
        ("{\"name\": ", button.name.as_deref()),
        (", \"value\": ", button.value.as_deref()),
    ];
    for (prefix, value) in optional_members {
        output.push_str(prefix);
        push_json_option_string(output, value);
    }
    output.push_str(", \"label\": ");
    push_json_string(output, &button.label);
    output.push('}');
}
fn push_json_string_array(output: &mut String, items: &[String]) {
output.push('[');
for (index, item) in items.iter().enumerate() {
push_json_string(output, item);
if index + 1 != items.len() {
output.push_str(", ");
}
}
output.push(']');
}
/// Appends `rows` as a JSON array of string arrays, rows separated by `, `.
fn push_json_table(output: &mut String, rows: &[Vec<String>]) {
    output.push('[');
    for (position, cells) in rows.iter().enumerate() {
        if position > 0 {
            output.push_str(", ");
        }
        push_json_string_array(output, cells);
    }
    output.push(']');
}
/// Returns a copy of the first row, or an empty list for an empty table.
fn table_headers(rows: &[Vec<String>]) -> Vec<String> {
    match rows.first() {
        Some(header_row) => header_row.clone(),
        None => Vec::new(),
    }
}
/// Returns the first cell of every row after the first, skipping rows that
/// are empty or whose first cell is blank. Labels are copied untrimmed.
fn table_row_labels(rows: &[Vec<String>]) -> Vec<String> {
    let mut labels = Vec::new();
    for row in rows.iter().skip(1) {
        if let Some(label) = row.first() {
            if !label.trim().is_empty() {
                labels.push(label.clone());
            }
        }
    }
    labels
}
/// Appends `value` as a JSON string, or the literal `null` when absent.
fn push_json_option_string(output: &mut String, value: Option<&str>) {
    match value {
        Some(text) => push_json_string(output, text),
        None => output.push_str("null"),
    }
}
/// Appends `value` as a double-quoted JSON string.
///
/// Quote, backslash and the common whitespace controls use their short
/// escapes; any other control character is written as a four-digit
/// lowercase `\u` escape; everything else is copied unchanged.
fn push_json_string(output: &mut String, value: &str) {
    output.push('"');
    for ch in value.chars() {
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\u{08}' => Some("\\b"),
            '\u{0c}' => Some("\\f"),
            _ => None,
        };
        match short_escape {
            Some(escape) => output.push_str(escape),
            None if ch.is_control() => {
                output.push_str("\\u");
                output.push_str(&format!("{:04x}", u32::from(ch)));
            }
            None => output.push(ch),
        }
    }
    output.push('"');
}
#[cfg(test)]
mod tests {
    use index_core::{
        ButtonAction, DocumentQuality, DocumentQualityCategory, Form, IndexDocument, IndexNode,
        Input, Link, SectionRole,
    };

    use super::{
        ExtractFormat, ExtractionError, ExtractionLimits, JsonSchemaError, PipeDecision,
        PipeDeniedReason, classify_pipe_command, export_section_markdown, extract_citations,
        extract_citations_tsv, extract_json, extract_links, extract_markdown, try_extract_document,
        validate_document_json_schema,
    };

    /// Builds a document containing one node of most kinds, used by the
    /// snapshot-style tests below.
    fn fixture_document() -> IndexDocument {
        let mut document = IndexDocument::titled("Fixture");
        document.metadata.description = Some("Document description".to_owned());
        document.metadata.quality = Some(DocumentQuality::new(
            DocumentQualityCategory::StrongGeneric,
            82,
            ["generic reader emitted semantic content"],
        ));
        document.push(IndexNode::Heading {
            level: 2,
            text: "Overview".to_owned(),
        });
        document.push(IndexNode::Paragraph("Hello from Index.".to_owned()));
        document.push(IndexNode::Spacer { lines: 2 });
        document.push(IndexNode::Link(Link::new(
            "Docs",
            "https://example.com/docs",
        )));
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Site".to_owned()),
            collapsed: true,
            nodes: vec![IndexNode::Link(Link::new(
                "About",
                "https://example.com/about",
            ))],
        });
        document.push(IndexNode::List {
            ordered: true,
            items: vec!["First".to_owned(), "Second".to_owned()],
        });
        document.push(IndexNode::CodeBlock {
            language: Some("rust".to_owned()),
            code: "fn main() {}\n".to_owned(),
        });
        document.push(IndexNode::Table {
            rows: vec![
                vec!["Name".to_owned(), "Value".to_owned()],
                vec!["Index".to_owned(), "Semantic browser".to_owned()],
            ],
        });
        document.push(IndexNode::Image {
            alt: "Diagram".to_owned(),
            src: Some("diagram.png".to_owned()),
        });
        document.push(IndexNode::Form(Form {
            name: "search".to_owned(),
            method: "GET".to_owned(),
            action: "/search".to_owned(),
            inputs: vec![Input {
                name: "q".to_owned(),
                kind: "text".to_owned(),
                value: None,
                required: true,
            }],
            buttons: vec![ButtonAction {
                name: Some("go".to_owned()),
                value: Some("1".to_owned()),
                label: "Search".to_owned(),
            }],
        }));
        document
    }

    #[test]
    fn markdown_snapshot_is_deterministic() {
        let markdown = extract_markdown(&fixture_document());
        // The collapsed-section marker is U+25B8, written as an escape so the
        // expectation survives encoding round-trips. The fixture image is
        // expected as a Markdown image line between the table and the form.
        assert_eq!(
            markdown,
            "# Fixture\n\n## Overview\n\nHello from Index.\n\n\n\n[Docs](https://example.com/docs)\n\n> \u{25b8} navigation: Site (1 items)\n\n1. First\n2. Second\n\n```rust\nfn main() {}\n```\n\n| Name | Value |\n| Index | Semantic browser |\n\n![Diagram](diagram.png)\n\n> Form GET search -> /search\n> - q (text, required)\n"
        );
    }

    #[test]
    fn links_use_stable_numeric_addresses() {
        let links = extract_links(&fixture_document());
        assert_eq!(
            links,
            "1\tDocs\thttps://example.com/docs\n2\tAbout\thttps://example.com/about\n"
        );
    }

    #[test]
    fn citations_use_external_links_once_in_document_order() {
        let mut document = fixture_document();
        document.push(IndexNode::Link(Link::new(
            "Docs duplicate",
            "https://example.com/docs",
        )));
        document.push(IndexNode::Link(Link::new("Local", "/local")));
        let citations = extract_citations(&document);
        assert_eq!(citations.len(), 2);
        assert_eq!(citations[0].index, 1);
        assert_eq!(citations[0].text, "Docs");
        assert_eq!(citations[0].href, "https://example.com/docs");
        assert_eq!(citations[1].text, "About");
        assert_eq!(
            extract_citations_tsv(&document),
            "1\tDocs\thttps://example.com/docs\n2\tAbout\thttps://example.com/about\n"
        );
    }

    #[test]
    fn selected_heading_section_exports_until_next_peer_heading() {
        let mut document = IndexDocument::titled("Sections");
        document.push(IndexNode::Heading {
            level: 2,
            text: "Keep".to_owned(),
        });
        document.push(IndexNode::Paragraph("selected".to_owned()));
        document.push(IndexNode::Heading {
            level: 3,
            text: "Nested".to_owned(),
        });
        document.push(IndexNode::Paragraph("still selected".to_owned()));
        document.push(IndexNode::Heading {
            level: 2,
            text: "Stop".to_owned(),
        });
        document.push(IndexNode::Paragraph("not selected".to_owned()));
        let exported = export_section_markdown(&document, "keep");
        assert_eq!(
            exported.as_deref(),
            Some("## Keep\n\nselected\n\n### Nested\n\nstill selected\n")
        );
    }

    #[test]
    fn selected_region_title_exports_section_nodes() {
        let mut document = IndexDocument::titled("Sections");
        document.push(IndexNode::Section {
            role: SectionRole::Main,
            title: Some("Article".to_owned()),
            collapsed: false,
            nodes: vec![IndexNode::Paragraph("body".to_owned())],
        });
        let exported = export_section_markdown(&document, "article");
        assert_eq!(exported.as_deref(), Some("body\n"));
    }

    #[test]
    fn json_output_validates_against_document_schema() {
        let json = extract_json(&fixture_document());
        assert!(validate_document_json_schema(&json).is_ok());
        assert!(json.contains("\"type\": \"spacer\""));
        assert!(json.contains("\"type\": \"section\""));
        assert!(json.contains("\"type\": \"form\""));
        assert!(json.contains("\"quality\": {\"category\": \"strong-generic\", \"score\": 82"));
        assert!(json.contains("\"headers\": [\"Name\", \"Value\"]"));
        assert!(json.contains("\"row_labels\": [\"Index\"]"));
        assert!(json.contains("\"required\": true"));
    }

    #[test]
    fn bounded_extraction_rejects_oversized_output() {
        let mut document = IndexDocument::titled("Large export");
        document.push(IndexNode::Paragraph("x".repeat(256)));
        let result = try_extract_document(
            &document,
            ExtractFormat::Markdown,
            ExtractionLimits::new(32),
        );
        assert!(matches!(
            result,
            Err(ExtractionError::OutputTooLarge {
                format: ExtractFormat::Markdown,
                limit: 32,
                actual
            }) if actual > 32
        ));
    }

    #[test]
    fn json_schema_validation_rejects_missing_fields() {
        let result = validate_document_json_schema("{\"nodes\": []}");
        assert_eq!(result, Err(JsonSchemaError::MissingTitle));
    }

    #[test]
    fn extract_format_parses_supported_names() {
        assert_eq!(
            ExtractFormat::parse("markdown"),
            Some(ExtractFormat::Markdown)
        );
        assert_eq!(ExtractFormat::parse("md"), Some(ExtractFormat::Markdown));
        assert_eq!(ExtractFormat::parse("links"), Some(ExtractFormat::Links));
        assert_eq!(ExtractFormat::parse("json"), Some(ExtractFormat::Json));
        assert_eq!(ExtractFormat::parse("xml"), None);
    }

    #[test]
    fn pipe_requires_confirmation_for_safe_programs() {
        assert_eq!(
            classify_pipe_command("wc -l"),
            PipeDecision::RequiresConfirmation(super::PipeCommand::new("wc -l"))
        );
    }

    #[test]
    fn pipe_allows_confirmed_safe_programs() {
        assert_eq!(
            classify_pipe_command("--confirm jq .title"),
            PipeDecision::Allowed(super::PipeCommand::new("jq .title"))
        );
    }

    #[test]
    fn pipe_denies_shell_syntax_by_default() {
        assert_eq!(
            classify_pipe_command("wc -l; rm -rf target"),
            PipeDecision::Denied(PipeDeniedReason::ShellSyntax)
        );
    }

    #[test]
    fn pipe_denies_unapproved_programs() {
        assert_eq!(
            classify_pipe_command("python script.py"),
            PipeDecision::Denied(PipeDeniedReason::ProgramNotAllowed("python".to_owned()))
        );
    }
}