use anyhow::{Context, Result};
use kreuzberg::{
BatchFileItem, ExtractionConfig, ExtractionResult, FileExtractionConfig, batch_extract_files_sync,
extract_file_sync,
};
use std::path::PathBuf;
use std::time::Instant;
use crate::{
WireFormat,
output::{BatchEnvelope, ExtractEnvelope},
style,
};
/// Extracts a single file and writes the result to stdout in the requested wire format.
///
/// Output per format:
/// * `WireFormat::Text` prints only `result.content`, with no trailing newline added.
/// * `WireFormat::Json` pretty-prints an `ExtractEnvelope` holding the result and the
///   measured extraction time in milliseconds.
/// * `WireFormat::Toon` prints the bare extraction result (no timing envelope).
///
/// # Arguments
/// * `path` - File to extract; converted lossily to a string before being handed to the extractor.
/// * `config` - Extraction configuration forwarded to `extract_file_sync`.
/// * `mime_type` - Optional MIME type forwarded to `extract_file_sync`.
/// * `format` - Output encoding to use on stdout.
///
/// # Errors
/// Returns an error if extraction fails or if the result cannot be serialized
/// to the selected format.
pub fn extract_command(
    path: PathBuf,
    config: ExtractionConfig,
    mime_type: Option<String>,
    format: WireFormat,
) -> Result<()> {
    let source = path.to_string_lossy().into_owned();

    // Only the extraction call itself is timed; serialization and printing are excluded.
    let started = Instant::now();
    let outcome = extract_file_sync(&source, mime_type.as_deref(), &config);
    let result = outcome.with_context(|| {
        format!(
            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
            path.display()
        )
    })?;
    let extraction_time_ms = started.elapsed().as_secs_f64() * 1000.0;

    match format {
        WireFormat::Text => print!("{}", result.content),
        WireFormat::Json => {
            let envelope = ExtractEnvelope {
                result,
                extraction_time_ms,
            };
            let json = serde_json::to_string_pretty(&envelope)
                .context("Failed to serialize extraction result to JSON")?;
            println!("{}", json);
        }
        WireFormat::Toon => {
            let toon = serde_toon::to_string(&result)
                .context("Failed to serialize extraction result to TOON")?;
            println!("{}", toon);
        }
    }

    Ok(())
}
/// Extracts several files and writes the results to stdout in the requested wire format.
///
/// Output per format:
/// * `WireFormat::Json` extracts the files one at a time so each can be timed, then
///   pretty-prints a `BatchEnvelope` with the results, the total elapsed milliseconds
///   and the per-file elapsed milliseconds.
/// * `WireFormat::Text` runs one batch extraction and prints a header, the MIME type and
///   the content for every document.
/// * `WireFormat::Toon` runs one batch extraction and prints the serialized result list.
///
/// # Arguments
/// * `paths` - Files to extract, processed in the given order.
/// * `file_configs_map` - Optional per-file overrides keyed by the lossy string form of
///   each path; each value is deserialized into a `FileExtractionConfig`.
/// * `config` - Base extraction configuration applied to every file.
/// * `format` - Output encoding to use on stdout.
///
/// # Errors
/// Returns an error if a per-file config cannot be parsed, if extraction fails, if a
/// per-file batch call yields no result, or if serialization fails.
pub fn batch_command(
    paths: Vec<PathBuf>,
    file_configs_map: Option<std::collections::HashMap<String, serde_json::Value>>,
    config: ExtractionConfig,
    format: WireFormat,
) -> Result<()> {
    match format {
        WireFormat::Json => {
            let mut results: Vec<ExtractionResult> = Vec::with_capacity(paths.len());
            let mut per_file_ms: Vec<f64> = Vec::with_capacity(paths.len());
            let total_t0 = Instant::now();

            for path in &paths {
                let path_str = path.to_string_lossy().to_string();
                // Look the override up once; the match below both tests for it and uses it.
                let raw_file_config = file_configs_map.as_ref().and_then(|m| m.get(&path_str));

                // Timing starts after the lookup, so it covers config parsing and extraction.
                let t0 = Instant::now();
                let result = match raw_file_config {
                    Some(raw) => {
                        let file_config = serde_json::from_value::<FileExtractionConfig>(raw.clone())
                            .with_context(|| format!("Failed to parse file config for '{}'", path_str))?;
                        // Files with an override go through the batch API as a one-item batch.
                        let batch_results = batch_extract_files_sync(
                            vec![BatchFileItem {
                                path: path.clone(),
                                config: Some(file_config),
                            }],
                            &config,
                        )
                        .with_context(|| {
                            format!(
                                "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
                                path.display()
                            )
                        })?;
                        // Take the first element instead of indexing: an empty result vector
                        // becomes a reported error rather than a panic.
                        batch_results.into_iter().next().with_context(|| {
                            format!("Batch extraction returned no result for file '{}'.", path.display())
                        })?
                    }
                    None => extract_file_sync(&path_str, None, &config).with_context(|| {
                        format!(
                            "Failed to extract file '{}'. Ensure the file is readable and the format is supported.",
                            path.display()
                        )
                    })?,
                };
                per_file_ms.push(t0.elapsed().as_secs_f64() * 1000.0);
                results.push(result);
            }

            let total_ms = total_t0.elapsed().as_secs_f64() * 1000.0;
            let envelope = BatchEnvelope {
                results,
                total_ms,
                per_file_ms,
            };
            println!(
                "{}",
                serde_json::to_string_pretty(&envelope)
                    .context("Failed to serialize batch extraction results to JSON")?
            );
        }
        WireFormat::Text => {
            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
            for (i, result) in results.iter().enumerate() {
                println!("{}", style::header(&format!("=== Document {} ===", i + 1)));
                println!("{} {}", style::label("MIME Type:"), style::success(&result.mime_type));
                println!("{}\n{}", style::label("Content:"), result.content);
                println!();
            }
        }
        WireFormat::Toon => {
            let results = run_batch_sync(&paths, file_configs_map.as_ref(), &config)?;
            println!(
                "{}",
                serde_toon::to_string(&results).context("Failed to serialize batch extraction results to TOON")?
            );
        }
    }
    Ok(())
}
/// Builds one `BatchFileItem` per path and extracts them all in a single batch call.
///
/// For each path, the lossy string form of the path is used as the key into
/// `file_configs_map`; when an entry exists it is deserialized into a
/// `FileExtractionConfig` and attached to the item, otherwise the item carries no
/// per-file config.
///
/// # Errors
/// Returns an error on the first per-file config that fails to deserialize (before any
/// extraction is attempted), or if the batch extraction itself fails.
fn run_batch_sync(
    paths: &[PathBuf],
    file_configs_map: Option<&std::collections::HashMap<String, serde_json::Value>>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    let mut items: Vec<BatchFileItem> = Vec::with_capacity(paths.len());

    for p in paths {
        let key = p.to_string_lossy().to_string();
        let file_config = match file_configs_map.and_then(|m| m.get(&key)) {
            Some(raw) => Some(
                serde_json::from_value::<FileExtractionConfig>(raw.clone())
                    .with_context(|| format!("Failed to parse file config for '{}'", key))?,
            ),
            None => None,
        };
        items.push(BatchFileItem {
            path: p.clone(),
            config: file_config,
        });
    }

    batch_extract_files_sync(items, config)
        .context("Failed to batch extract documents. Check that all files are readable and formats are supported.")
}