//! datalab-cli 0.1.0
//!
//! A powerful CLI for converting, extracting, and processing documents using the Datalab API.
//!
//! Documentation: https://documentation.datalab.to/
use clap::{Parser, Subcommand};
use std::process::ExitCode;

mod cache;
mod client;
mod commands;
mod error;
mod output;

use commands::{
    cache as cache_cmd, convert, create_document, extract, files, fill, segment, track_changes,
    workflows,
};
use output::{Output, Progress};

/// Extended description of the top-level command; handed to clap through the
/// `long_about` attribute on `Cli`.
///
/// Inside the literal, a `\` at the very end of a line is a Rust line
/// continuation: the newline and the next line's leading whitespace are
/// dropped, so those lines are joined into one paragraph. Lines that end
/// without a `\` keep their newline, which is what produces the blank lines
/// and the numbered list in the rendered text.
const LONG_ABOUT: &str = "\
A powerful command-line interface for converting, extracting, and processing \
documents using the Datalab API.

All commands output JSON to stdout for easy integration with scripts and agents. \
Progress events are streamed to stderr as JSON when running interactively.

GETTING STARTED:
  1. Get your API key from https://www.datalab.to/app/keys
  2. Export it: export DATALAB_API_KEY=\"your-key\"
  3. Convert a document: datalab convert document.pdf

For detailed documentation, visit: https://documentation.datalab.to/";

/// Footer text for the top-level command; handed to clap through the
/// `after_help` attribute on `Cli`.
///
/// It lists the environment variables the tool documents for users and a few
/// example invocations. The leading `\` after the opening quote is a line
/// continuation, so the text starts directly at `ENVIRONMENT VARIABLES:`
/// without a leading newline. Escaped quotes (`\"`) appear as literal double
/// quotes in the output.
const AFTER_HELP: &str = "\
ENVIRONMENT VARIABLES:
  DATALAB_API_KEY     API key for authentication (required)
  DATALAB_BASE_URL    Custom API endpoint for on-premises deployments
  NO_COLOR            Disable colored output when set

EXAMPLES:
  # Convert a PDF to markdown
  datalab convert invoice.pdf

  # Convert with high accuracy and chart understanding
  datalab convert report.pdf --mode accurate --extras chart_understanding

  # Extract structured data using a schema
  datalab extract contract.pdf --schema schema.json --save-checkpoint

  # Fill a form with field values
  datalab fill form.pdf --fields '{\"name\": \"John Doe\", \"date\": \"2024-01-15\"}'

  # Upload and manage files
  datalab files upload document.pdf
  datalab files list

For more examples, see: https://documentation.datalab.to/docs/welcome/quickstart";

// Top-level command-line definition for the `datalab` binary, built with
// clap's `Parser` derive.
//
// NOTE: plain `//` comments are used for maintainer notes in this block on
// purpose. With clap's derive, `///` doc comments on the struct and its fields
// are turned into help text, so editing them changes what users see.
#[derive(Parser)]
#[command(
    name = "datalab",
    author = "Datalab <support@datalab.to>",
    // Bare `version`: no literal is given here, so clap supplies the value.
    version,
    about = "CLI for the Datalab document processing API",
    long_about = LONG_ABOUT,
    after_help = AFTER_HELP,
    // Make the version flag available on subcommands as well.
    propagate_version = true,
    // A subcommand must always be given; with no arguments at all, help is
    // printed instead of an error.
    subcommand_required = true,
    arg_required_else_help = true,
)]
struct Cli {
    // `global = true` lets the flag be given after the subcommand name too.
    // Both flags are forwarded to `Progress::new` in `main`.
    /// Suppress all progress output
    #[arg(short, long, global = true)]
    quiet: bool,

    /// Enable verbose progress output (even when piped)
    #[arg(short, long, global = true)]
    verbose: bool,

    // The selected subcommand together with its parsed arguments.
    #[command(subcommand)]
    command: Commands,
}

// All subcommands of the `datalab` binary, built with clap's `Subcommand`
// derive. Each variant carries the argument type declared in the matching
// `commands::*` module and is dispatched to that module's `execute` function
// in `main`.
//
// NOTE: the `///` line on every variant is the short help text clap shows for
// that subcommand, and the `long_about` / `after_help` strings are printed as
// help output. Treat all of them as user-facing text; maintainer notes go in
// plain `//` comments.
//
// In the `after_help` literals, the sequence `\n  \` ends the visible line,
// emits a two-space indent for the next one, and then uses a Rust line
// continuation to skip the source indentation that follows.
#[derive(Subcommand)]
enum Commands {
    /// Convert a document to markdown, HTML, JSON, or chunks
    #[command(
        long_about = "Convert PDF, images, or documents to structured formats.\n\n\
                      Supports markdown (default), HTML, JSON, and chunked output. \
                      Use --save-checkpoint to enable efficient follow-up extraction or segmentation \
                      on the same document.",
        after_help = "EXAMPLES:\n  \
                      # Basic conversion to markdown\n  \
                      datalab convert document.pdf\n\n  \
                      # High-quality conversion with chart understanding\n  \
                      datalab convert report.pdf --mode accurate --extras chart_understanding\n\n  \
                      # Convert specific pages and save to file\n  \
                      datalab convert book.pdf --page-range \"0-10\" --output result.json\n\n  \
                      # Convert from URL\n  \
                      datalab convert https://example.com/document.pdf"
    )]
    Convert(convert::ConvertArgs),

    /// Extract structured data from a document using a JSON schema
    #[command(
        long_about = "Extract structured data from documents using a JSON schema.\n\n\
                      Define fields to extract with their types, and the API will locate and \
                      extract the values with source citations. Use --checkpoint-id to extract \
                      from a previously converted document without re-processing.",
        after_help = "EXAMPLES:\n  \
                      # Extract with inline schema\n  \
                      datalab extract invoice.pdf --schema '{\"fields\": [{\"name\": \"total\"}]}'\n\n  \
                      # Extract with schema file\n  \
                      datalab extract contract.pdf --schema schema.json --save-checkpoint\n\n  \
                      # Reuse checkpoint from previous conversion\n  \
                      datalab extract invoice.pdf --schema schema.json --checkpoint-id abc123"
    )]
    Extract(extract::ExtractArgs),

    /// Segment a document into logical sections
    #[command(
        long_about = "Split multi-document PDFs and identify logical sections.\n\n\
                      Useful for processing document bundles, identifying section boundaries, \
                      or splitting large documents into manageable parts.",
        after_help = "EXAMPLES:\n  \
                      # Segment a document bundle\n  \
                      datalab segment bundle.pdf --schema '{\"segments\": [\"invoice\", \"receipt\"]}'\n\n  \
                      # Reuse checkpoint\n  \
                      datalab segment bundle.pdf --schema schema.json --checkpoint-id abc123"
    )]
    Segment(segment::SegmentArgs),

    /// Fill a form with provided field data
    #[command(
        long_about = "Automatically fill PDF or image forms with provided field values.\n\n\
                      The API matches your field names to form fields using AI, so exact \
                      field name matching is not required. Adjust --confidence-threshold \
                      to control matching strictness.",
        after_help = "EXAMPLES:\n  \
                      # Fill with inline JSON\n  \
                      datalab fill form.pdf --fields '{\"name\": \"John Doe\"}' --output filled.pdf\n\n  \
                      # Fill with field file and context\n  \
                      datalab fill application.pdf --fields fields.json --context \"Job application\"\n\n  \
                      # Strict field matching\n  \
                      datalab fill form.pdf --fields data.json --confidence-threshold 0.8"
    )]
    Fill(fill::FillArgs),

    /// Extract track changes from a Word document
    #[command(
        long_about = "Extract insertions, deletions, and comments from Word documents.\n\n\
                      Returns the document content with revision markup in HTML or markdown format.",
        after_help = "EXAMPLES:\n  \
                      # Extract track changes as HTML and markdown\n  \
                      datalab track-changes document.docx\n\n  \
                      # Get only HTML output\n  \
                      datalab track-changes document.docx --output-format html"
    )]
    TrackChanges(track_changes::TrackChangesArgs),

    /// Create a document from markdown
    #[command(
        long_about = "Generate DOCX files from markdown content.\n\n\
                      Supports revision markup with <ins>, <del>, and <comment> tags \
                      that become native Word track changes.",
        after_help = "EXAMPLES:\n  \
                      # Create from inline markdown\n  \
                      datalab create-document --markdown \"# Report\\n\\nContent here\" --output report.docx\n\n  \
                      # Create from markdown file\n  \
                      datalab create-document --markdown content.md --output document.docx"
    )]
    CreateDocument(create_document::CreateDocumentArgs),

    // Shares the `extract` module with `Extract`; `main` routes it to
    // `extract::execute_score`.
    /// Score extraction results with confidence ratings
    #[command(
        long_about = "Score previously extracted data with per-field confidence ratings.\n\n\
                      Requires a checkpoint from a previous extraction with --save-checkpoint.",
        after_help = "EXAMPLES:\n  \
                      # Score extraction results\n  \
                      datalab extract-score --checkpoint-id abc123"
    )]
    ExtractScore(extract::ExtractScoreArgs),

    // The remaining variants are marked `#[command(subcommand)]`: their payload
    // is itself a subcommand type defined in the respective module.
    /// File management operations
    #[command(subcommand)]
    Files(files::FilesCommand),

    /// Workflow management operations
    #[command(subcommand)]
    Workflows(workflows::WorkflowsCommand),

    /// Cache management operations
    #[command(subcommand)]
    Cache(cache_cmd::CacheCommand),
}

/// Binary entry point: parses the command line, runs the chosen subcommand,
/// and translates its result into the process exit status.
///
/// When the subcommand succeeds, the progress reporter is marked complete and
/// `ExitCode::SUCCESS` is returned. When it fails, the error is passed to the
/// progress reporter and then to the output writer, and `ExitCode::FAILURE`
/// is returned.
#[tokio::main]
async fn main() -> ExitCode {
    let parsed = Cli::parse();

    // The reporter is configured from the global --quiet / --verbose flags.
    let progress = Progress::new(parsed.quiet, parsed.verbose);
    let output = Output::new();

    // Hand the parsed arguments to the module that implements the subcommand.
    // Every handler except the cache one receives the progress reporter.
    let outcome = match parsed.command {
        Commands::Convert(opts) => convert::execute(opts, &progress).await,
        Commands::Extract(opts) => extract::execute(opts, &progress).await,
        Commands::Segment(opts) => segment::execute(opts, &progress).await,
        Commands::Fill(opts) => fill::execute(opts, &progress).await,
        Commands::TrackChanges(opts) => track_changes::execute(opts, &progress).await,
        Commands::CreateDocument(opts) => create_document::execute(opts, &progress).await,
        Commands::ExtractScore(opts) => extract::execute_score(opts, &progress).await,
        Commands::Files(sub) => files::execute(sub, &progress).await,
        Commands::Workflows(sub) => workflows::execute(sub, &progress).await,
        Commands::Cache(sub) => cache_cmd::execute(sub).await,
    };

    // Failure path first: report through both channels, then bail out.
    if let Err(failure) = outcome {
        progress.error(&failure);
        output.error(&failure);
        return ExitCode::FAILURE;
    }

    progress.complete();
    ExitCode::SUCCESS
}