use clap::{Parser, Subcommand};
use std::process::ExitCode;
mod cache;
mod client;
mod commands;
mod error;
mod output;
use commands::{
cache as cache_cmd, convert, create_document, extract, files, fill, segment, track_changes,
workflows,
};
use output::{Output, Progress};
/// Long-form description of the CLI, referenced by the `long_about` attribute on `Cli`.
///
/// Each `concat!` piece is one logical line (or half of a wrapped sentence) of the
/// final text; pieces that end a line carry an explicit `\n`.
const LONG_ABOUT: &str = concat!(
    "A powerful command-line interface for converting, extracting, and processing ",
    "documents using the Datalab API.\n",
    "All commands output JSON to stdout for easy integration with scripts and agents. ",
    "Progress events are streamed to stderr as JSON when running interactively.\n",
    "GETTING STARTED:\n",
    "1. Get your API key from https://www.datalab.to/app/keys\n",
    "2. Export it: export DATALAB_API_KEY=\"your-key\"\n",
    "3. Convert a document: datalab convert document.pdf\n",
    "For detailed documentation, visit: https://documentation.datalab.to/",
);
/// Footer text for the top-level help, referenced by the `after_help` attribute on `Cli`.
///
/// Lists the environment variables the text mentions and a handful of example
/// invocations. Every `concat!` piece is exactly one output line.
const AFTER_HELP: &str = concat!(
    "ENVIRONMENT VARIABLES:\n",
    "DATALAB_API_KEY API key for authentication (required)\n",
    "DATALAB_BASE_URL Custom API endpoint for on-premises deployments\n",
    "NO_COLOR Disable colored output when set\n",
    "EXAMPLES:\n",
    "# Convert a PDF to markdown\n",
    "datalab convert invoice.pdf\n",
    "# Convert with high accuracy and chart understanding\n",
    "datalab convert report.pdf --mode accurate --extras chart_understanding\n",
    "# Extract structured data using a schema\n",
    "datalab extract contract.pdf --schema schema.json --save-checkpoint\n",
    "# Fill a form with field values\n",
    "datalab fill form.pdf --fields '{\"name\": \"John Doe\", \"date\": \"2024-01-15\"}'\n",
    "# Upload and manage files\n",
    "datalab files upload document.pdf\n",
    "datalab files list\n",
    "For more examples, see: https://documentation.datalab.to/docs/welcome/quickstart",
);
// Top-level command-line definition for the `datalab` binary.
//
// Plain `//` comments are used here on purpose: clap's derive macros turn `///`
// doc comments into help text, so adding them would change the `--help` output.
#[derive(Parser)]
#[command(name = "datalab", author = "Datalab <support@datalab.to>", version)]
#[command(
    about = "CLI for the Datalab document processing API",
    long_about = LONG_ABOUT,
    after_help = AFTER_HELP
)]
#[command(
    propagate_version = true,
    subcommand_required = true,
    arg_required_else_help = true
)]
struct Cli {
    // Global `-q` / `--quiet` switch; `main` forwards it to `Progress::new`.
    // NOTE(review): no help text is attached to this flag — confirm the intended
    // wording and add it if the omission is unintentional.
    #[arg(long, short, global = true)]
    quiet: bool,

    // Global `-v` / `--verbose` switch; `main` forwards it to `Progress::new`.
    // NOTE(review): no help text is attached to this flag either.
    #[arg(long, short, global = true)]
    verbose: bool,

    // The subcommand selected by the user.
    #[command(subcommand)]
    command: Commands,
}
// Every subcommand the `datalab` binary accepts.
//
// Each variant's `///` line is picked up by clap's derive macro as the short
// `about` summary shown in the top-level command listing; the explicit
// `long_about` / `after_help` attributes supply the detailed per-command help.
// The enum itself is documented with `//` rather than `///` so that no help
// text is derived from this header.
#[derive(Subcommand)]
enum Commands {
    /// Convert PDF, images, or documents to structured formats.
    #[command(
        long_about = "Convert PDF, images, or documents to structured formats.\n\n\
            Supports markdown (default), HTML, JSON, and chunked output. \
            Use --save-checkpoint to enable efficient follow-up extraction or segmentation \
            on the same document.",
        after_help = "EXAMPLES:\n \
            # Basic conversion to markdown\n \
            datalab convert document.pdf\n\n \
            # High-quality conversion with chart understanding\n \
            datalab convert report.pdf --mode accurate --extras chart_understanding\n\n \
            # Convert specific pages and save to file\n \
            datalab convert book.pdf --page-range \"0-10\" --output result.json\n\n \
            # Convert from URL\n \
            datalab convert https://example.com/document.pdf"
    )]
    Convert(convert::ConvertArgs),

    /// Extract structured data from documents using a JSON schema.
    #[command(
        long_about = "Extract structured data from documents using a JSON schema.\n\n\
            Define fields to extract with their types, and the API will locate and \
            extract the values with source citations. Use --checkpoint-id to extract \
            from a previously converted document without re-processing.",
        after_help = "EXAMPLES:\n \
            # Extract with inline schema\n \
            datalab extract invoice.pdf --schema '{\"fields\": [{\"name\": \"total\"}]}'\n\n \
            # Extract with schema file\n \
            datalab extract contract.pdf --schema schema.json --save-checkpoint\n\n \
            # Reuse checkpoint from previous conversion\n \
            datalab extract invoice.pdf --schema schema.json --checkpoint-id abc123"
    )]
    Extract(extract::ExtractArgs),

    /// Split multi-document PDFs and identify logical sections.
    #[command(
        long_about = "Split multi-document PDFs and identify logical sections.\n\n\
            Useful for processing document bundles, identifying section boundaries, \
            or splitting large documents into manageable parts.",
        after_help = "EXAMPLES:\n \
            # Segment a document bundle\n \
            datalab segment bundle.pdf --schema '{\"segments\": [\"invoice\", \"receipt\"]}'\n\n \
            # Reuse checkpoint\n \
            datalab segment bundle.pdf --schema schema.json --checkpoint-id abc123"
    )]
    Segment(segment::SegmentArgs),

    /// Automatically fill PDF or image forms with provided field values.
    #[command(
        long_about = "Automatically fill PDF or image forms with provided field values.\n\n\
            The API matches your field names to form fields using AI, so exact \
            field name matching is not required. Adjust --confidence-threshold \
            to control matching strictness.",
        after_help = "EXAMPLES:\n \
            # Fill with inline JSON\n \
            datalab fill form.pdf --fields '{\"name\": \"John Doe\"}' --output filled.pdf\n\n \
            # Fill with field file and context\n \
            datalab fill application.pdf --fields fields.json --context \"Job application\"\n\n \
            # Strict field matching\n \
            datalab fill form.pdf --fields data.json --confidence-threshold 0.8"
    )]
    Fill(fill::FillArgs),

    /// Extract insertions, deletions, and comments from Word documents.
    #[command(
        long_about = "Extract insertions, deletions, and comments from Word documents.\n\n\
            Returns the document content with revision markup in HTML or markdown format.",
        after_help = "EXAMPLES:\n \
            # Extract track changes as HTML and markdown\n \
            datalab track-changes document.docx\n\n \
            # Get only HTML output\n \
            datalab track-changes document.docx --output-format html"
    )]
    TrackChanges(track_changes::TrackChangesArgs),

    /// Generate DOCX files from markdown content.
    #[command(
        long_about = "Generate DOCX files from markdown content.\n\n\
            Supports revision markup with <ins>, <del>, and <comment> tags \
            that become native Word track changes.",
        after_help = "EXAMPLES:\n \
            # Create from inline markdown\n \
            datalab create-document --markdown \"# Report\\n\\nContent here\" --output report.docx\n\n \
            # Create from markdown file\n \
            datalab create-document --markdown content.md --output document.docx"
    )]
    CreateDocument(create_document::CreateDocumentArgs),

    /// Score previously extracted data with per-field confidence ratings.
    #[command(
        long_about = "Score previously extracted data with per-field confidence ratings.\n\n\
            Requires a checkpoint from a previous extraction with --save-checkpoint.",
        after_help = "EXAMPLES:\n \
            # Score extraction results\n \
            datalab extract-score --checkpoint-id abc123"
    )]
    ExtractScore(extract::ExtractScoreArgs),

    /// Upload and manage files.
    #[command(subcommand)]
    Files(files::FilesCommand),

    // NOTE(review): no summary is attached here because this file gives no
    // description of the workflows command group — add a `///` line once the
    // intended wording is confirmed.
    #[command(subcommand)]
    Workflows(workflows::WorkflowsCommand),

    // NOTE(review): likewise, no description of the cache command group is
    // available in this file; add a `///` summary when the wording is known.
    #[command(subcommand)]
    Cache(cache_cmd::CacheCommand),
}
#[tokio::main]
async fn main() -> ExitCode {
let cli = Cli::parse();
let progress = Progress::new(cli.quiet, cli.verbose);
let output = Output::new();
let result = match cli.command {
Commands::Convert(args) => convert::execute(args, &progress).await,
Commands::Extract(args) => extract::execute(args, &progress).await,
Commands::Segment(args) => segment::execute(args, &progress).await,
Commands::Fill(args) => fill::execute(args, &progress).await,
Commands::TrackChanges(args) => track_changes::execute(args, &progress).await,
Commands::CreateDocument(args) => create_document::execute(args, &progress).await,
Commands::ExtractScore(args) => extract::execute_score(args, &progress).await,
Commands::Files(cmd) => files::execute(cmd, &progress).await,
Commands::Workflows(cmd) => workflows::execute(cmd, &progress).await,
Commands::Cache(cmd) => cache_cmd::execute(cmd).await,
};
match result {
Ok(()) => {
progress.complete();
ExitCode::SUCCESS
}
Err(e) => {
progress.error(&e);
output.error(&e);
ExitCode::FAILURE
}
}
}