use clap::{Parser, Subcommand, builder::styling};
use std::path::PathBuf;
use crate::client::commands::access_groups::AccessGroupCommands;
use crate::client::commands::admin::AdminCommands;
use crate::client::commands::compute_nodes::ComputeNodeCommands;
use crate::client::commands::config::ConfigCommands;
use crate::client::commands::events::EventCommands;
use crate::client::commands::failure_handlers::FailureHandlerCommands;
use crate::client::commands::files::FileCommands;
use crate::client::commands::hpc::HpcCommands;
use crate::client::commands::job_dependencies::JobDependencyCommands;
use crate::client::commands::jobs::JobCommands;
use crate::client::commands::logs::LogCommands;
use crate::client::commands::remote::RemoteCommands;
use crate::client::commands::resource_requirements::ResourceRequirementsCommands;
use crate::client::commands::results::ResultCommands;
use crate::client::commands::ro_crate::RoCrateCommands;
use crate::client::commands::scheduled_compute_nodes::ScheduledComputeNodeCommands;
use crate::client::commands::slurm::SlurmCommands;
use crate::client::commands::user_data::UserDataCommands;
use crate::client::commands::workflows::WorkflowCommands;
use crate::plot_resources_cmd;
use crate::tui_runner;
// ANSI color styling applied to clap's generated help output:
// bold green for section headers and the usage line, cyan for
// literal tokens (bold) and value placeholders (regular).
// The builder's setter calls are independent, so their order is irrelevant.
const STYLES: styling::Styles = styling::Styles::styled()
    .literal(styling::AnsiColor::Cyan.on_default().bold())
    .placeholder(styling::AnsiColor::Cyan.on_default())
    .usage(styling::AnsiColor::Green.on_default().bold())
    .header(styling::AnsiColor::Green.on_default().bold());
// Custom clap help template for the top-level `torc --help` output.
// `{all-args}` renders only the visible top-level options; every subcommand
// below is declared `hide = true`, so the subcommands are instead advertised
// here in hand-curated, thematically grouped sections. Raw ANSI escapes are
// embedded directly: \x1b[1;32m = bold green (group heading),
// \x1b[1;36m = bold cyan (command name), \x1b[0m = reset.
// NOTE(review): the `access-groups` subcommand (defined in `Commands`) is
// not listed here, and `help` is advertised even though
// `disable_help_subcommand = true` is set on `Cli` — confirm both are
// intentional.
const HELP_TEMPLATE: &str = "\
{before-help}{name} {version}
{about-with-newline}
{usage-heading} {usage}
{all-args}
\x1b[1;32mWorkflow Lifecycle:\x1b[0m
\x1b[1;36mcreate\x1b[0m Create a workflow from spec file
\x1b[1;36mrun\x1b[0m Run a workflow locally
\x1b[1;36mexec\x1b[0m Run inline commands as a synthesized workflow
\x1b[1;36msubmit\x1b[0m Submit a workflow to scheduler
\x1b[1;36mstatus\x1b[0m Show workflow status and job summary
\x1b[1;36mwatch\x1b[0m Watch workflow and recover from failures
\x1b[1;36mrecover\x1b[0m Recover a Slurm workflow from failures
\x1b[1;36mcancel\x1b[0m Cancel a workflow and Slurm jobs
\x1b[1;36mdelete\x1b[0m Delete a workflow
\x1b[1;32mWorkflow Management:\x1b[0m
\x1b[1;36mworkflows\x1b[0m Workflow management commands
\x1b[1;36mjobs\x1b[0m Job management commands
\x1b[1;36mfiles\x1b[0m File management commands
\x1b[1;36muser-data\x1b[0m User data management commands
\x1b[1;36mevents\x1b[0m Event management commands
\x1b[1;36mresource-requirements\x1b[0m Resource requirements management
\x1b[1;36mresults\x1b[0m Result management commands
\x1b[1;36mfailure-handlers\x1b[0m Failure handler management
\x1b[1;36mcompute-nodes\x1b[0m Compute node management
\x1b[1;36mscheduled-compute-nodes\x1b[0m Scheduled compute node management
\x1b[1;36mtui\x1b[0m Interactive terminal UI
\x1b[1;32mScheduler & Compute:\x1b[0m
\x1b[1;36mslurm\x1b[0m Slurm scheduler commands
\x1b[1;36mhpc\x1b[0m HPC system profiles and partitions
\x1b[1;36mremote\x1b[0m Remote worker execution (SSH)
\x1b[1;32mAnalysis & Debugging:\x1b[0m
\x1b[1;36mlogs\x1b[0m Bundle and analyze workflow logs
\x1b[1;36mjob-dependencies\x1b[0m Job dependency queries
\x1b[1;36mro-crate\x1b[0m RO-Crate metadata management
\x1b[1;32mServer Administration:\x1b[0m
\x1b[1;36madmin\x1b[0m Server administration commands
\x1b[1;36mping\x1b[0m Check server connectivity
\x1b[1;32mConfiguration & Utilities:\x1b[0m
\x1b[1;36mconfig\x1b[0m Manage configuration settings
\x1b[1;36mplot-resources\x1b[0m Generate HTML resource plots
\x1b[1;36mcompletions\x1b[0m Generate shell completions
\x1b[1;36mhelp\x1b[0m Print help for a subcommand
{after-help}";
// Top-level CLI entry point for the `torc` binary, parsed by clap's derive
// API. Global options apply to every subcommand.
// NOTE: fields deliberately use plain `//` comments — `///` doc comments on
// clap-derive fields become help text and would change the rendered --help.
#[derive(Parser)]
#[command(author, version, about = "Torc workflow orchestration system", long_about = None)]
#[command(styles = STYLES, help_template = HELP_TEMPLATE, disable_help_subcommand = true, subcommand_help_heading = None)]
pub struct Cli {
// Log filter string; also read from the RUST_LOG environment variable.
#[arg(long, env = "RUST_LOG")]
pub log_level: Option<String>,
// Output format for command results; defaults to "table"
// (examples above also show `-f json`).
#[arg(short, long, default_value = "table")]
pub format: String,
// Base URL of the torc API server; also read from TORC_API_URL.
#[arg(long, env = "TORC_API_URL")]
pub url: Option<String>,
// Server password; also read from TORC_PASSWORD.
#[arg(long, env = "TORC_PASSWORD")]
pub password: Option<String>,
// If set, prompt interactively for the password instead of taking it
// from --password/TORC_PASSWORD.
#[arg(long)]
pub prompt_password: bool,
// Skip the client/server version compatibility check.
#[arg(long)]
pub skip_version_check: bool,
// Path to a CA certificate for TLS verification; also TORC_TLS_CA_CERT.
#[arg(long, env = "TORC_TLS_CA_CERT")]
pub tls_ca_cert: Option<String>,
// Disable TLS certificate verification; also TORC_TLS_INSECURE.
#[arg(long, env = "TORC_TLS_INSECURE")]
pub tls_insecure: bool,
// Raw Cookie header to send with requests; `hide_env_values` keeps the
// secret out of rendered help output.
#[arg(long, env = "TORC_COOKIE_HEADER", hide_env_values = true)]
pub cookie_header: Option<String>,
// Standalone mode (`-s`): per the `exec` examples, runs without a
// separate running server.
#[arg(short = 's', long)]
pub standalone: bool,
// Database path used in standalone mode — presumably SQLite, per the
// `tui --database /path/to/db.sqlite` example; TODO confirm.
#[arg(long, value_name = "PATH")]
pub db: Option<PathBuf>,
// Path or name of the torc-server binary to launch for embedded/standalone
// operation; also read from TORC_SERVER_BIN.
#[arg(
long,
env = "TORC_SERVER_BIN",
value_name = "PATH",
default_value = "torc-server"
)]
pub torc_server_bin: String,
// The selected subcommand (required).
#[command(subcommand)]
pub command: Commands,
}
// All top-level subcommands. Every variant is `hide = true`, which
// suppresses clap's auto-generated subcommand listing; the commands are
// instead advertised manually (and grouped) in HELP_TEMPLATE. The
// `after_long_help` strings below are runtime help text shown by
// `torc <cmd> --help`.
#[derive(Subcommand)]
pub enum Commands {
// ---- Workflow lifecycle ----
// Create a workflow from a spec file without running it.
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Create workflow from YAML
torc create my_workflow.yaml
# Validate spec before creating
torc create --dry-run my_workflow.yaml
# Get JSON output with workflow ID
torc -f json create my_workflow.yaml
"
)]
Create {
// Path to the workflow spec file (YAML per the examples).
#[arg()]
file: String,
// NOTE(review): this file mixes `default_value = "false"` (string form)
// and `default_value_t = false` for bools — both work, but the style is
// inconsistent; confirm before normalizing.
#[arg(long, default_value = "false")]
no_resource_monitoring: bool,
#[arg(long, default_value = "false")]
skip_checks: bool,
// Validate only; do not create the workflow.
#[arg(long)]
dry_run: bool,
},
// Run a workflow locally; accepts either a spec file path or an
// existing workflow id (see examples).
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Run from spec file
torc run workflow.yaml
# Run existing workflow
torc run 123
# With resource limits
torc run --num-cpus 8 --memory-gb 32 --num-gpus 2 workflow.yaml
# Limit parallel jobs
torc run --max-parallel-jobs 4 workflow.yaml
# Custom output directory
torc run -o /path/to/torc_output workflow.yaml
SEE ALSO:
torc exec Run ad-hoc commands inline without a spec file.
"
)]
Run {
// Spec file path or numeric workflow id, disambiguated downstream.
#[arg()]
workflow_spec_or_id: String,
#[arg(long)]
max_parallel_jobs: Option<i64>,
#[arg(long)]
num_cpus: Option<i64>,
#[arg(long)]
memory_gb: Option<f64>,
#[arg(long)]
num_gpus: Option<i64>,
#[arg(short, long)]
poll_interval: Option<f64>,
#[arg(short, long)]
output_dir: Option<PathBuf>,
#[arg(long)]
time_limit: Option<String>,
#[arg(long)]
end_time: Option<String>,
#[arg(long, default_value = "false")]
skip_checks: bool,
},
// Run ad-hoc commands as a synthesized workflow without a spec file.
// Supports parameter expansion (--param) with product/zip linking.
#[command(
hide = true,
after_long_help = "\
EXAMPLES (standalone — no running server required):
# Monitor CPU/memory of a single command
torc -s exec -c 'bash long_script.sh'
# Run a batch of commands with a parallelism cap
torc -s exec -c 'bash work.sh 1' -c 'bash work.sh 2' -c 'bash work.sh 3' -j 2
# Commands from a file (one per line)
torc -s exec -C commands.txt
# Shell-style invocation (everything after '--' is one command)
torc -s exec -- python train.py --epochs 10
# Parameterized template (Cartesian product = 9 jobs)
torc -s exec -c 'python train.py --lr {lr} --bs {bs}' \\
--param lr='[0.001,0.01,0.1]' \\
--param bs='[32,64,128]'
# Parameters zipped element-wise (3 jobs)
torc -s exec -c 'curl -o {out} {url}' \\
--param url=@urls.txt \\
--param out=@outfiles.txt \\
--link zip
# Inspect the persisted results later
torc -s results list
SEE ALSO:
torc run Run a workflow defined in a spec file.
"
)]
Exec {
#[arg(short = 'n', long, value_name = "NAME")]
name: Option<String>,
#[arg(long, value_name = "TEXT")]
description: Option<String>,
// Repeatable: each -c/--command adds one command to the batch.
#[arg(short = 'c', long = "command", value_name = "CMD")]
command: Vec<String>,
// Alternative to -c: read commands from a file, one per line.
#[arg(short = 'C', long = "commands-file", value_name = "FILE")]
commands_file: Option<String>,
// Repeatable NAME=VALUE template parameters expanded into commands.
#[arg(long = "param", value_name = "NAME=VALUE")]
param: Vec<String>,
// How multiple --param lists combine: Cartesian "product" (default)
// or element-wise "zip".
#[arg(long = "link", value_name = "MODE", default_value = "product", value_parser = clap::builder::PossibleValuesParser::new(["product", "zip"]))]
link: String,
#[arg(short = 'j', long = "max-parallel-jobs")]
max_parallel_jobs: Option<i64>,
#[arg(short = 'o', long)]
output_dir: Option<PathBuf>,
#[arg(long, default_value_t = false)]
dry_run: bool,
// Per-job resource monitoring mode.
#[arg(long, value_name = "MODE", default_value = "summary",
value_parser = clap::builder::PossibleValuesParser::new(
["off", "summary", "time-series"]))]
monitor: String,
// Compute-node-level monitoring mode (off by default).
#[arg(long, value_name = "MODE", default_value = "off",
value_parser = clap::builder::PossibleValuesParser::new(
["off", "summary", "time-series"]))]
monitor_compute_node: String,
#[arg(long, default_value_t = false)]
generate_plots: bool,
// Monitoring sample interval; range(1..) rejects 0 at parse time.
#[arg(
short = 'i',
long = "sample-interval-seconds",
value_name = "SECS",
value_parser = clap::value_parser!(u32).range(1..)
)]
sample_interval_seconds: Option<u32>,
#[arg(long, value_name = "MODE",
value_parser = clap::builder::PossibleValuesParser::new(
["separate", "combined", "no-stdout", "no-stderr", "none"]))]
stdio: Option<String>,
// Captures everything after `--` as a single shell-style command
// (see the "torc -s exec -- python ..." example); hidden from help.
#[arg(hide = true, trailing_var_arg = true)]
trailing: Vec<String>,
},
// Submit a workflow to the scheduler (spec file path or workflow id).
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Submit from spec file (must have on_workflow_start action)
torc submit workflow_with_actions.yaml
# Submit existing workflow
torc submit 123
# Ignore missing input data
torc submit -i workflow.yaml
# Custom output directory and poll interval
torc submit -o /scratch/output -p 60 workflow.yaml
# Limit parallel jobs per worker
torc submit --max-parallel-jobs 4 workflow.yaml
"
)]
Submit {
#[arg()]
workflow_spec_or_id: String,
#[arg(short, long, default_value = "false")]
ignore_missing_data: bool,
#[arg(long, default_value = "false")]
skip_checks: bool,
#[arg(long)]
max_parallel_jobs: Option<i32>,
#[arg(short, long, default_value = "torc_output")]
output_dir: String,
#[arg(short, long)]
poll_interval: Option<i32>,
},
// Continuously monitor a workflow; optional automatic failure recovery
// (--recover) and Slurm auto-scheduling (--auto-schedule).
#[command(
hide = true,
after_long_help = "\
USAGE MODES:
1. Basic monitoring (no recovery):
torc watch 123
Reports failures and exits. Use for manual intervention or AI-assisted recovery.
2. With automatic recovery (--recover):
torc watch 123 --recover
Automatically diagnoses OOM/timeout failures, adjusts resources, and retries.
Runs until all jobs complete or max retries exceeded.
3. With auto-scheduling (--auto-schedule):
torc watch 123 --auto-schedule
Automatically submits new Slurm allocations when retry jobs are waiting.
Essential for workflows using failure handlers that create retry jobs.
EXAMPLES:
# Basic: watch until completion, report failures
torc watch 123
# Recovery: automatically fix OOM/timeout failures
torc watch 123 --recover
# Recovery with aggressive resource increases
torc watch 123 --recover --memory-multiplier 2.0 --runtime-multiplier 2.0
# Recovery including unknown failures (transient errors)
torc watch 123 --recover --retry-unknown
# Auto-schedule: ensure retry jobs get scheduled
torc watch 123 --auto-schedule
# Full production setup: recovery + auto-scheduling
torc watch 123 --recover --auto-schedule
# Custom auto-schedule settings
torc watch 123 --auto-schedule \\
--auto-schedule-threshold 10 \\
--auto-schedule-cooldown 3600 \\
--auto-schedule-stranded-timeout 14400
AUTO-SCHEDULING BEHAVIOR:
When --auto-schedule is enabled:
1. No schedulers available: Immediately submits new allocations if ready jobs exist.
2. Threshold exceeded: If retry jobs (attempt_id > 1) exceed --auto-schedule-threshold
while schedulers are running, submits additional allocations after cooldown.
3. Stranded jobs: If retry jobs are below threshold but waiting longer than
--auto-schedule-stranded-timeout, schedules anyway to prevent indefinite waiting.
Defaults: threshold=5 jobs, cooldown=30min, stranded-timeout=2hrs
SEE ALSO:
torc recover One-shot recovery (no continuous monitoring)
Docs: https://nrel.github.io/torc/specialized/fault-tolerance/automatic-recovery.html
"
)]
Watch {
#[arg()]
workflow_id: i64,
// Seconds between status polls (default 60).
#[arg(short, long, default_value = "60")]
poll_interval: u64,
#[arg(short, long)]
recover: bool,
#[arg(short, long)]
max_retries: Option<u32>,
// Resource multipliers applied on retry when recovery is enabled.
#[arg(long, default_value = "1.5")]
memory_multiplier: f64,
#[arg(long, default_value = "1.5")]
runtime_multiplier: f64,
// Also retry failures that are not classified as OOM/timeout.
#[arg(long)]
retry_unknown: bool,
// Optional external command hook invoked during recovery.
#[arg(long)]
recovery_hook: Option<String>,
#[arg(short, long, default_value = "torc_output")]
output_dir: PathBuf,
#[arg(short, long)]
show_job_counts: bool,
// Auto-scheduling knobs; defaults documented in after_long_help above
// (threshold=5, cooldown=1800s, stranded-timeout=7200s).
#[arg(long)]
auto_schedule: bool,
#[arg(long, default_value = "5")]
auto_schedule_threshold: u32,
#[arg(long, default_value = "1800")]
auto_schedule_cooldown: u64,
#[arg(long, default_value = "7200")]
auto_schedule_stranded_timeout: u64,
#[arg(long, verbatim_doc_comment)]
ai_recovery: bool,
#[arg(long, default_value = "claude", verbatim_doc_comment)]
ai_agent: String,
#[arg(long)]
partition: Option<String>,
#[arg(long)]
walltime: Option<String>,
},
// One-shot recovery of a failed workflow (contrast with `watch --recover`,
// which monitors continuously).
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Interactive recovery (default)
torc recover 123
# Dry run to preview changes without modifying anything
torc recover 123 --dry-run
# Skip interactive prompts (for scripting)
torc recover 123 --no-prompts
# Custom resource multipliers (with or without prompts)
torc recover 123 --memory-multiplier 2.0 --runtime-multiplier 1.5
# Also retry unknown failures (not just OOM/timeout)
torc recover 123 --retry-unknown
# With custom recovery hook for domain-specific fixes
torc recover 123 --recovery-hook 'bash fix-cluster.sh'
WHEN TO USE:
Use `torc recover` for:
- One-shot recovery after a workflow has completed with failures
- Manual investigation before retrying (use --dry-run first)
- Workflows where you want to inspect failures before retrying
Use `torc watch --recover` instead for:
- Continuous monitoring of long-running workflows
- Fully automated recovery without manual intervention
- Production workflows that should self-heal
SEE ALSO:
torc watch --recover Continuous monitoring with automatic recovery
Docs: https://nrel.github.io/torc/specialized/fault-tolerance/automatic-recovery.html
"
)]
Recover {
#[arg()]
workflow_id: i64,
#[arg(short, long, default_value = "torc_output")]
output_dir: PathBuf,
#[arg(long, default_value = "1.5")]
memory_multiplier: f64,
// NOTE(review): default 1.4 here vs 1.5 for Watch's runtime_multiplier —
// confirm whether the mismatch is intentional.
#[arg(long, default_value = "1.4")]
runtime_multiplier: f64,
#[arg(long)]
retry_unknown: bool,
#[arg(long)]
recovery_hook: Option<String>,
#[arg(long)]
dry_run: bool,
#[arg(long)]
no_prompts: bool,
#[arg(long, verbatim_doc_comment)]
ai_recovery: bool,
#[arg(long, default_value = "claude", verbatim_doc_comment)]
ai_agent: String,
},
// Cancel a workflow and its Slurm jobs. Workflow id is optional;
// presumably a default/current workflow is resolved downstream — TODO confirm.
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Cancel a workflow and its Slurm jobs
torc cancel 123
# Get JSON status of cancellation
torc -f json cancel 123
"
)]
Cancel {
#[arg()]
workflow_id: Option<i64>,
},
// Show workflow status and job summary.
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Show status for a workflow
torc status 123
# Get JSON output for scripting
torc -f json status 123
"
)]
Status {
#[arg()]
workflow_id: Option<i64>,
},
// Delete one or more workflows; at least one id is required.
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Delete a single workflow
torc delete 123
# Delete multiple workflows
torc delete 123 456 789
# Delete without confirmation (use with caution)
torc delete --force 123
"
)]
Delete {
#[arg(required = true)]
workflow_ids: Vec<i64>,
// Skip the confirmation prompt.
#[arg(long)]
force: bool,
},
// ---- Interactive UI ----
// Launch the terminal UI; arguments are defined in tui_runner::Args.
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Connect to running server
torc tui
# Standalone mode (starts embedded server)
torc tui --standalone
# Standalone with custom settings
torc tui --standalone --port 9090 --database /path/to/db.sqlite
"
)]
Tui(tui_runner::Args),
// ---- Management command groups (each delegates to its own subcommand
// enum defined in crate::client::commands) ----
#[command(hide = true)]
Workflows {
#[command(subcommand)]
command: WorkflowCommands,
},
#[command(hide = true)]
Jobs {
#[command(subcommand)]
command: JobCommands,
},
#[command(hide = true)]
Files {
#[command(subcommand)]
command: FileCommands,
},
#[command(hide = true)]
UserData {
#[command(subcommand)]
command: UserDataCommands,
},
#[command(hide = true)]
Events {
#[command(subcommand)]
command: EventCommands,
},
#[command(hide = true)]
Results {
#[command(subcommand)]
command: ResultCommands,
},
#[command(hide = true)]
Slurm {
#[command(subcommand)]
command: SlurmCommands,
},
#[command(hide = true)]
Hpc {
#[command(subcommand)]
command: HpcCommands,
},
#[command(hide = true)]
ComputeNodes {
#[command(subcommand)]
command: ComputeNodeCommands,
},
#[command(hide = true)]
ScheduledComputeNodes {
#[command(subcommand)]
command: ScheduledComputeNodeCommands,
},
#[command(hide = true)]
Remote {
#[command(subcommand)]
command: RemoteCommands,
},
#[command(hide = true)]
Logs {
#[command(subcommand)]
command: LogCommands,
},
#[command(hide = true)]
JobDependencies {
#[command(subcommand)]
command: JobDependencyCommands,
},
#[command(hide = true)]
ResourceRequirements {
#[command(subcommand)]
command: ResourceRequirementsCommands,
},
#[command(hide = true)]
FailureHandlers {
#[command(subcommand)]
command: FailureHandlerCommands,
},
// Explicit name matches the kebab-case form used in HELP_TEMPLATE.
#[command(name = "ro-crate", hide = true)]
RoCrate {
#[command(subcommand)]
command: RoCrateCommands,
},
// NOTE(review): `access-groups` is the only command group not listed in
// HELP_TEMPLATE — confirm whether that omission is intentional.
#[command(hide = true)]
AccessGroups {
#[command(subcommand)]
command: AccessGroupCommands,
},
#[command(hide = true)]
Admin {
#[command(subcommand)]
command: AdminCommands,
},
#[command(hide = true)]
Config {
#[command(subcommand)]
command: ConfigCommands,
},
// ---- Utilities ----
// Generate HTML resource plots; arguments defined in plot_resources_cmd::Args.
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
torc plot-resources output/resource_metrics.db
torc plot-resources -o /reports/ resource_metrics.db
torc plot-resources -j job1,job2,job3 resource_metrics.db
"
)]
PlotResources(plot_resources_cmd::Args),
// Check server connectivity; takes no arguments.
#[command(hide = true)]
Ping,
// Emit a shell completion script for the given shell.
#[command(
hide = true,
after_long_help = "\
EXAMPLES:
# Bash (add to ~/.bashrc)
torc completions bash > ~/.local/share/bash-completion/completions/torc
# Zsh (add to ~/.zshrc: fpath=(~/.zfunc $fpath))
torc completions zsh > ~/.zfunc/_torc
# Fish
torc completions fish > ~/.config/fish/completions/torc.fish
"
)]
Completions {
#[arg(value_enum)]
shell: clap_complete::Shell,
},
}