use clap::{Parser, Subcommand, builder::styling};
use std::path::PathBuf;
use crate::client::commands::access_groups::AccessGroupCommands;
use crate::client::commands::admin::AdminCommands;
use crate::client::commands::compute_nodes::ComputeNodeCommands;
use crate::client::commands::config::ConfigCommands;
use crate::client::commands::events::EventCommands;
use crate::client::commands::failure_handlers::FailureHandlerCommands;
use crate::client::commands::files::FileCommands;
use crate::client::commands::hpc::HpcCommands;
use crate::client::commands::job_dependencies::JobDependencyCommands;
use crate::client::commands::jobs::JobCommands;
use crate::client::commands::logs::LogCommands;
use crate::client::commands::remote::RemoteCommands;
use crate::client::commands::reports::ReportCommands;
use crate::client::commands::resource_requirements::ResourceRequirementsCommands;
use crate::client::commands::results::ResultCommands;
use crate::client::commands::ro_crate::RoCrateCommands;
use crate::client::commands::scheduled_compute_nodes::ScheduledComputeNodeCommands;
use crate::client::commands::slurm::{GroupByStrategy, SlurmCommands};
use crate::client::commands::user_data::UserDataCommands;
use crate::client::commands::workflows::WorkflowCommands;
use crate::plot_resources_cmd;
use crate::tui_runner;
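
// Colors for clap-generated help output (headers/usage in green, literals and
// placeholders in cyan), matching the ANSI codes hard-coded in HELP_TEMPLATE below.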
const STYLES: styling::Styles = styling::Styles::styled()
    .header(styling::AnsiColor::Green.on_default().bold())
    .usage(styling::AnsiColor::Green.on_default().bold())
    .literal(styling::AnsiColor::Cyan.on_default().bold())
    .placeholder(styling::AnsiColor::Cyan.on_default());
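
// Hand-rolled help template. clap substitutes the {placeholders} ({name},
// {version}, {usage}, {all-args}, ...); the escape sequences are ANSI colors
// (\x1b[1;32m = bold green headers, \x1b[1;36m = bold cyan command names)
// chosen to match STYLES above. Subcommands are listed here manually, grouped
// by task, because most are declared with `hide = true` below and are
// therefore excluded from {all-args}.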
const HELP_TEMPLATE: &str = "\
{before-help}{name} {version}
{about-with-newline}
{usage-heading} {usage}

{all-args}

\x1b[1;32mWorkflow Execution:\x1b[0m
  \x1b[1;36mrun\x1b[0m                      Run a workflow locally
  \x1b[1;36msubmit\x1b[0m                   Submit a workflow to a scheduler
  \x1b[1;36msubmit-slurm\x1b[0m             Submit to Slurm with auto-generated schedulers
  \x1b[1;36mwatch\x1b[0m                    Watch a workflow and recover from failures
  \x1b[1;36mrecover\x1b[0m                  Recover a Slurm workflow from failures

\x1b[1;32mWorkflow Management:\x1b[0m
  \x1b[1;36mworkflows\x1b[0m                Workflow management commands
  \x1b[1;36mjobs\x1b[0m                     Job management commands
  \x1b[1;36mfiles\x1b[0m                    File management commands
  \x1b[1;36muser-data\x1b[0m                User data management commands
  \x1b[1;36mevents\x1b[0m                   Event management commands
  \x1b[1;36mresource-requirements\x1b[0m    Resource requirements management
  \x1b[1;36mresults\x1b[0m                  Result management commands
  \x1b[1;36mfailure-handlers\x1b[0m         Failure handler management
  \x1b[1;36mcompute-nodes\x1b[0m            Compute node management
  \x1b[1;36mscheduled-compute-nodes\x1b[0m  Scheduled compute node management
  \x1b[1;36mtui\x1b[0m                      Interactive terminal UI

\x1b[1;32mScheduler & Compute:\x1b[0m
  \x1b[1;36mslurm\x1b[0m                    Slurm scheduler commands
  \x1b[1;36mhpc\x1b[0m                      HPC system profiles and partitions
  \x1b[1;36mremote\x1b[0m                   Remote worker execution (SSH)

\x1b[1;32mAnalysis & Debugging:\x1b[0m
  \x1b[1;36mreports\x1b[0m                  Generate reports and analytics
  \x1b[1;36mlogs\x1b[0m                     Bundle and analyze workflow logs
  \x1b[1;36mjob-dependencies\x1b[0m         Job dependency queries

\x1b[1;32mServer Administration:\x1b[0m
  \x1b[1;36madmin\x1b[0m                    Server administration commands

\x1b[1;32mConfiguration & Utilities:\x1b[0m
  \x1b[1;36mconfig\x1b[0m                   Manage configuration settings
  \x1b[1;36mplot-resources\x1b[0m           Generate HTML resource plots
  \x1b[1;36mcompletions\x1b[0m              Generate shell completions
  \x1b[1;36mhelp\x1b[0m                     Print help for a subcommand
{after-help}";
#[derive(Parser)]
#[command(author, version, about = "Torc workflow orchestration system", long_about = None)]
#[command(styles = STYLES, help_template = HELP_TEMPLATE, disable_help_subcommand = true, subcommand_help_heading = None)]
pub struct Cli {
    /// Log level (also read from RUST_LOG)
    #[arg(long, env = "RUST_LOG")]
    pub log_level: Option<String>,

    /// Output format
    #[arg(short, long, default_value = "table")]
    pub format: String,

    /// URL of the torc server API
    #[arg(long, env = "TORC_API_URL")]
    pub url: Option<String>,

    /// Password for server authentication
    #[arg(long, env = "TORC_PASSWORD")]
    pub password: Option<String>,

    /// Prompt for the password interactively
    #[arg(long)]
    pub prompt_password: bool,

    /// Skip the client/server version compatibility check
    #[arg(long)]
    pub skip_version_check: bool,

    /// Path to a CA certificate for TLS verification
    #[arg(long, env = "TORC_TLS_CA_CERT")]
    pub tls_ca_cert: Option<String>,

    /// Skip TLS certificate verification
    #[arg(long, env = "TORC_TLS_INSECURE")]
    pub tls_insecure: bool,

    #[command(subcommand)]
    pub command: Commands,
}
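
/// All torc subcommands. Most variants are marked `hide = true` so that the
/// curated, categorized listing in HELP_TEMPLATE replaces clap's default flat
/// subcommand list.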
#[derive(Subcommand)]
pub enum Commands {
    /// Run a workflow locally
    #[command(
        hide = true,
        after_long_help = "\
EXAMPLES:
  # Run from a spec file
  torc run workflow.yaml

  # Run an existing workflow
  torc run 123

  # With resource limits
  torc run --num-cpus 8 --memory-gb 32 --num-gpus 2 workflow.yaml

  # Limit parallel jobs
  torc run --max-parallel-jobs 4 workflow.yaml

  # Custom output directory
  torc run -o /path/to/torc_output workflow.yaml
"
    )]
    Run {
        /// Path to a workflow specification file, or the ID of an existing workflow
        #[arg()]
        workflow_spec_or_id: String,

        /// Maximum number of jobs to run in parallel
        #[arg(long)]
        max_parallel_jobs: Option<i64>,

        /// Number of CPUs to make available to jobs
        #[arg(long)]
        num_cpus: Option<i64>,

        /// Amount of memory (GB) to make available to jobs
        #[arg(long)]
        memory_gb: Option<f64>,

        /// Number of GPUs to make available to jobs
        #[arg(long)]
        num_gpus: Option<i64>,

        /// Poll interval in seconds
        #[arg(short, long)]
        poll_interval: Option<f64>,

        /// Output directory
        #[arg(short, long)]
        output_dir: Option<PathBuf>,

        /// Time limit for the run
        #[arg(long)]
        time_limit: Option<String>,

        /// End time for the run
        #[arg(long)]
        end_time: Option<String>,

        /// Skip pre-run validation checks
        #[arg(long, default_value = "false")]
        skip_checks: bool,
    },

    /// Submit a workflow to a scheduler
    #[command(
        hide = true,
        after_long_help = "\
EXAMPLES:
  # Submit from a spec file (must have an on_workflow_start action)
  torc submit workflow_with_actions.yaml

  # Submit an existing workflow
  torc submit 123

  # Ignore missing input data
  torc submit -i workflow.yaml

  # Custom output directory and poll interval
  torc submit -o /scratch/output -p 60 workflow.yaml

  # Limit parallel jobs per worker
  torc submit --max-parallel-jobs 4 workflow.yaml
"
    )]
    Submit {
        /// Path to a workflow specification file, or the ID of an existing workflow
        #[arg()]
        workflow_spec_or_id: String,

        /// Ignore missing input data
        #[arg(short, long, default_value = "false")]
        ignore_missing_data: bool,

        /// Skip pre-submit validation checks
        #[arg(long, default_value = "false")]
        skip_checks: bool,

        /// Maximum number of parallel jobs per worker
        #[arg(long)]
        max_parallel_jobs: Option<i32>,

        /// Output directory
        #[arg(short, long, default_value = "torc_output")]
        output_dir: String,

        /// Poll interval in seconds
        #[arg(short, long)]
        poll_interval: Option<i32>,
    },

    /// Submit to Slurm with auto-generated schedulers
    #[command(
        name = "submit-slurm",
        hide = true,
        after_long_help = "\
EXAMPLES:
  # Submit with auto-generated Slurm schedulers
  torc submit-slurm --account myproject workflow.yaml

  # Specify an HPC profile
  torc submit-slurm --account myproject --hpc-profile kestrel workflow.yaml

  # Single-allocation mode
  torc submit-slurm --account myproject --single-allocation workflow.yaml

  # Group by partition
  torc submit-slurm --account myproject --group-by partition workflow.yaml

  # Custom output directory and poll interval
  torc submit-slurm --account myproject -o /scratch/output -p 60 workflow.yaml

  # Limit parallel jobs per worker
  torc submit-slurm --account myproject --max-parallel-jobs 4 workflow.yaml
"
    )]
    SubmitSlurm {
        /// Path to a workflow specification file
        #[arg()]
        workflow_spec: String,

        /// Slurm account to charge
        #[arg(short, long)]
        account: Option<String>,

        /// HPC system profile to use (e.g. kestrel)
        #[arg(long)]
        hpc_profile: Option<String>,

        /// Run all jobs in a single allocation
        #[arg(long)]
        single_allocation: bool,

        /// Strategy for grouping jobs into schedulers
        #[arg(long, value_enum, default_value_t = GroupByStrategy::ResourceRequirements)]
        group_by: GroupByStrategy,

        /// Ignore missing input data
        #[arg(short, long, default_value = "false")]
        ignore_missing_data: bool,

        /// Skip pre-submit validation checks
        #[arg(long, default_value = "false")]
        skip_checks: bool,

        /// Overwrite previously generated schedulers
        #[arg(long, default_value = "false")]
        overwrite: bool,

        /// Maximum number of parallel jobs per worker
        #[arg(long)]
        max_parallel_jobs: Option<i32>,

        /// Output directory
        #[arg(short, long, default_value = "torc_output")]
        output_dir: String,

        /// Poll interval in seconds
        #[arg(short, long)]
        poll_interval: Option<i32>,
    },

    /// Watch a workflow and recover from failures
    #[command(
        hide = true,
        after_long_help = "\
USAGE MODES:
  1. Basic monitoring (no recovery):
       torc watch 123
     Reports failures and exits. Use for manual intervention or AI-assisted recovery.

  2. With automatic recovery (--recover):
       torc watch 123 --recover
     Automatically diagnoses OOM/timeout failures, adjusts resources, and retries.
     Runs until all jobs complete or max retries are exceeded.

  3. With auto-scheduling (--auto-schedule):
       torc watch 123 --auto-schedule
     Automatically submits new Slurm allocations when retry jobs are waiting.
     Essential for workflows using failure handlers that create retry jobs.

EXAMPLES:
  # Basic: watch until completion, report failures
  torc watch 123

  # Recovery: automatically fix OOM/timeout failures
  torc watch 123 --recover

  # Recovery with aggressive resource increases
  torc watch 123 --recover --memory-multiplier 2.0 --runtime-multiplier 2.0

  # Recovery including unknown failures (transient errors)
  torc watch 123 --recover --retry-unknown

  # Auto-schedule: ensure retry jobs get scheduled
  torc watch 123 --auto-schedule

  # Full production setup: recovery + auto-scheduling
  torc watch 123 --recover --auto-schedule

  # Custom auto-schedule settings
  torc watch 123 --auto-schedule \\
      --auto-schedule-threshold 10 \\
      --auto-schedule-cooldown 3600 \\
      --auto-schedule-stranded-timeout 14400

AUTO-SCHEDULING BEHAVIOR:
  When --auto-schedule is enabled:
  1. No schedulers available: immediately submits new allocations if ready jobs exist.
  2. Threshold exceeded: if retry jobs (attempt_id > 1) exceed --auto-schedule-threshold
     while schedulers are running, submits additional allocations after the cooldown.
  3. Stranded jobs: if retry jobs are below the threshold but have waited longer than
     --auto-schedule-stranded-timeout, schedules anyway to prevent indefinite waiting.

  Defaults: threshold=5 jobs, cooldown=30 min, stranded-timeout=2 h

SEE ALSO:
  torc recover    One-shot recovery (no continuous monitoring)
  Docs: https://nrel.github.io/torc/specialized/fault-tolerance/automatic-recovery.html
"
    )]
    Watch {
        /// ID of the workflow to watch
        #[arg()]
        workflow_id: i64,

        /// Poll interval in seconds
        #[arg(short, long, default_value = "60")]
        poll_interval: u64,

        /// Automatically recover from detected failures
        #[arg(short, long)]
        recover: bool,

        /// Maximum number of recovery attempts per job
        #[arg(short, long)]
        max_retries: Option<u32>,

        /// Multiplier applied to memory limits when retrying OOM failures
        #[arg(long, default_value = "1.5")]
        memory_multiplier: f64,

        /// Multiplier applied to runtime limits when retrying timeout failures
        #[arg(long, default_value = "1.5")]
        runtime_multiplier: f64,

        /// Also retry failures with unknown causes
        #[arg(long)]
        retry_unknown: bool,

        /// Command to run for domain-specific fixes before retrying
        #[arg(long)]
        recovery_hook: Option<String>,

        /// Output directory
        #[arg(short, long, default_value = "torc_output")]
        output_dir: PathBuf,

        /// Show job counts while watching
        #[arg(short, long)]
        show_job_counts: bool,

        /// Automatically submit new Slurm allocations when retry jobs are waiting
        #[arg(long)]
        auto_schedule: bool,

        /// Number of waiting retry jobs that triggers auto-scheduling
        #[arg(long, default_value = "5")]
        auto_schedule_threshold: u32,

        /// Minimum seconds between auto-scheduled submissions
        #[arg(long, default_value = "1800")]
        auto_schedule_cooldown: u64,

        /// Seconds retry jobs may wait below the threshold before being scheduled anyway
        #[arg(long, default_value = "7200")]
        auto_schedule_stranded_timeout: u64,

        /// Use an AI agent to diagnose and recover from failures
        #[arg(long, verbatim_doc_comment)]
        ai_recovery: bool,

        /// AI agent to use for recovery
        #[arg(long, default_value = "claude", verbatim_doc_comment)]
        ai_agent: String,

        /// Slurm partition for auto-scheduled allocations
        #[arg(long)]
        partition: Option<String>,

        /// Walltime for auto-scheduled allocations
        #[arg(long)]
        walltime: Option<String>,
    },

    /// Recover a Slurm workflow from failures
    #[command(
        hide = true,
        after_long_help = "\
EXAMPLES:
  # Basic recovery
  torc recover 123

  # Dry run to preview changes without modifying anything
  torc recover 123 --dry-run

  # Custom resource multipliers
  torc recover 123 --memory-multiplier 2.0 --runtime-multiplier 1.5

  # Also retry unknown failures (not just OOM/timeout)
  torc recover 123 --retry-unknown

  # With a custom recovery hook for domain-specific fixes
  torc recover 123 --recovery-hook 'bash fix-cluster.sh'

WHEN TO USE:
  Use `torc recover` for:
  - One-shot recovery after a workflow has completed with failures
  - Manual investigation before retrying (use --dry-run first)
  - Workflows where you want to inspect failures before retrying

  Use `torc watch --recover` instead for:
  - Continuous monitoring of long-running workflows
  - Fully automated recovery without manual intervention
  - Production workflows that should self-heal

SEE ALSO:
  torc watch --recover    Continuous monitoring with automatic recovery
  Docs: https://nrel.github.io/torc/specialized/fault-tolerance/automatic-recovery.html
"
    )]
    Recover {
        /// ID of the workflow to recover
        #[arg()]
        workflow_id: i64,

        /// Output directory
        #[arg(short, long, default_value = "torc_output")]
        output_dir: PathBuf,

        /// Multiplier applied to memory limits when retrying OOM failures
        #[arg(long, default_value = "1.5")]
        memory_multiplier: f64,

        /// Multiplier applied to runtime limits when retrying timeout failures
        #[arg(long, default_value = "1.4")]
        runtime_multiplier: f64,

        /// Also retry failures with unknown causes
        #[arg(long)]
        retry_unknown: bool,

        /// Command to run for domain-specific fixes before retrying
        #[arg(long)]
        recovery_hook: Option<String>,

        /// Preview changes without modifying anything
        #[arg(long)]
        dry_run: bool,

        /// Use an AI agent to diagnose and recover from failures
        #[arg(long, verbatim_doc_comment)]
        ai_recovery: bool,

        /// AI agent to use for recovery
        #[arg(long, default_value = "claude", verbatim_doc_comment)]
        ai_agent: String,
    },

    /// Interactive terminal UI
    #[command(
        hide = true,
        after_long_help = "\
EXAMPLES:
  # Connect to a running server
  torc tui

  # Standalone mode (starts an embedded server)
  torc tui --standalone

  # Standalone with custom settings
  torc tui --standalone --port 9090 --database /path/to/db.sqlite
"
    )]
    Tui(tui_runner::Args),

    /// Workflow management commands
    #[command(hide = true)]
    Workflows {
        #[command(subcommand)]
        command: WorkflowCommands,
    },

    /// Job management commands
    #[command(hide = true)]
    Jobs {
        #[command(subcommand)]
        command: JobCommands,
    },

    /// File management commands
    #[command(hide = true)]
    Files {
        #[command(subcommand)]
        command: FileCommands,
    },

    /// User data management commands
    #[command(hide = true)]
    UserData {
        #[command(subcommand)]
        command: UserDataCommands,
    },

    /// Event management commands
    #[command(hide = true)]
    Events {
        #[command(subcommand)]
        command: EventCommands,
    },

    /// Result management commands
    #[command(hide = true)]
    Results {
        #[command(subcommand)]
        command: ResultCommands,
    },

    /// Slurm scheduler commands
    #[command(hide = true)]
    Slurm {
        #[command(subcommand)]
        command: SlurmCommands,
    },

    /// HPC system profiles and partitions
    #[command(hide = true)]
    Hpc {
        #[command(subcommand)]
        command: HpcCommands,
    },

    /// Compute node management
    #[command(hide = true)]
    ComputeNodes {
        #[command(subcommand)]
        command: ComputeNodeCommands,
    },

    /// Scheduled compute node management
    #[command(hide = true)]
    ScheduledComputeNodes {
        #[command(subcommand)]
        command: ScheduledComputeNodeCommands,
    },

    /// Remote worker execution (SSH)
    #[command(hide = true)]
    Remote {
        #[command(subcommand)]
        command: RemoteCommands,
    },

    /// Generate reports and analytics
    #[command(hide = true)]
    Reports {
        #[command(subcommand)]
        command: ReportCommands,
    },

    /// Bundle and analyze workflow logs
    #[command(hide = true)]
    Logs {
        #[command(subcommand)]
        command: LogCommands,
    },

    /// Job dependency queries
    #[command(hide = true)]
    JobDependencies {
        #[command(subcommand)]
        command: JobDependencyCommands,
    },

    /// Resource requirements management
    #[command(hide = true)]
    ResourceRequirements {
        #[command(subcommand)]
        command: ResourceRequirementsCommands,
    },

    /// Failure handler management
    #[command(hide = true)]
    FailureHandlers {
        #[command(subcommand)]
        command: FailureHandlerCommands,
    },
#[command(name = "ro-crate")]
RoCrate {
#[command(subcommand)]
command: RoCrateCommands,
},

    /// Access group management commands
    #[command(hide = true)]
    AccessGroups {
        #[command(subcommand)]
        command: AccessGroupCommands,
    },

    /// Server administration commands
    #[command(hide = true)]
    Admin {
        #[command(subcommand)]
        command: AdminCommands,
    },

    /// Manage configuration settings
    #[command(hide = true)]
    Config {
        #[command(subcommand)]
        command: ConfigCommands,
    },

    /// Generate HTML resource plots
    #[command(
        hide = true,
        after_long_help = "\
EXAMPLES:
  torc plot-resources output/resource_metrics.db
  torc plot-resources -o /reports/ resource_metrics.db
  torc plot-resources -j job1,job2,job3 resource_metrics.db
"
    )]
    PlotResources(plot_resources_cmd::Args),

    /// Check connectivity to the torc server
    Ping,

    /// Generate shell completions
    #[command(
        hide = true,
        after_long_help = "\
EXAMPLES:
  # Bash (add to ~/.bashrc)
  torc completions bash > ~/.local/share/bash-completion/completions/torc

  # Zsh (add to ~/.zshrc: fpath=(~/.zfunc $fpath))
  torc completions zsh > ~/.zfunc/_torc

  # Fish
  torc completions fish > ~/.config/fish/completions/torc.fish
"
    )]
    Completions {
        /// Shell to generate completions for
        #[arg(value_enum)]
        shell: clap_complete::Shell,
    },
}
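
// A minimal sanity check, assuming clap 4's derive API: `debug_assert`
// validates the whole command tree (duplicate or conflicting flags, bad
// default values, and similar definition errors) when tests run. This is the
// pattern recommended in clap's own documentation.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn verify_cli() {
        use clap::CommandFactory;
        Cli::command().debug_assert();
    }
}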