// torc 0.21.0
//
// Workflow management system
// Machine Learning Hyperparameter Sweep Workflow with Shared Parameters
// Demonstrates using workflow-level shared parameters to avoid repeating parameter definitions
// This creates 3 * 3 * 2 = 18 training jobs with different hyperparameter combinations
//
// Compare this to hyperparameter_sweep.kdl - same result, but parameters defined only once!

name "hyperparameter_sweep_shared_params"

description "Grid search over learning rate, batch size, and optimizer (using shared parameters)"

// Workflow-level shared parameters - defined once, used by multiple jobs and files
// Shared parameter grid. Jobs/files opt in with `use_parameters`; each opted-in
// spec is expanded once per combination (3 lr * 3 batch_size * 2 optimizer = 18).
parameters {
    lr "[0.0001,0.001,0.01]"      // learning rates (3 values)
    batch_size "[16,32,64]"       // batch sizes (3 values)
    optimizer "['adam','sgd']"    // optimizers (2 values)
}

// Resource requirements
// Small CPU-only allocation; used by the aggregate_results job below.
resource_requirements "minimal" {
    num_cpus 1
    num_gpus 0
    num_nodes 1
    memory "2g"
    runtime "PT5M"    // ISO 8601 duration: 5 minutes
}

// Multi-core CPU allocation for the one-time dataset preparation jobs.
resource_requirements "data_prep" {
    num_cpus 4
    num_gpus 0
    num_nodes 1
    memory "8g"
    runtime "PT30M"    // ISO 8601 duration: 30 minutes
}

// Single-GPU allocation used by every expanded training job.
resource_requirements "gpu_training" {
    num_cpus 8
    num_gpus 1
    num_nodes 1
    memory "32g"
    runtime "PT2H"    // ISO 8601 duration: 2 hours
}

// File specifications
// Prepared dataset artifacts; produced by prepare_train_data / prepare_val_data
// and consumed as inputs by every training job.
file "train_data" path="/data/train.pkl"
file "val_data" path="/data/validation.pkl"

// Model files - one per hyperparameter combination
// Trained-model artifact, expanded to one file per (lr, batch_size, optimizer)
// combination. `{lr:.4f}` renders lr with 4 decimal places so names stay unique
// (e.g. 0.0001 vs 0.0010) — presumably printf-style formatting; confirm in torc docs.
file "model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}" {
    path "/models/model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.pt"
    use_parameters "lr" "batch_size" "optimizer"
}

// Metrics files - one per hyperparameter combination
// Per-run metrics artifact, expanded to one JSON file per (lr, batch_size,
// optimizer) combination; consumed by the aggregate_results job.
file "metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}" {
    path "/results/metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.json"
    use_parameters "lr" "batch_size" "optimizer"
}

// Prepare datasets (one-time setup jobs)
// One-time setup: builds the training split and publishes it as the
// "train_data" file that all training jobs depend on.
job "prepare_train_data" {
    command "python scripts/prepare_data.py --split=train --output=/data/train.pkl"
    resource_requirements "data_prep"
    output_file "train_data"
}

// One-time setup: builds the validation split and publishes it as the
// "val_data" file that all training jobs depend on.
job "prepare_val_data" {
    command "python scripts/prepare_data.py --split=validation --output=/data/validation.pkl"
    resource_requirements "data_prep"
    output_file "val_data"
}

// Training jobs - one job per hyperparameter combination
// This single job spec expands to 18 jobs
// Training job template. `use_parameters` expands this single spec into 18 jobs,
// one per (lr, batch_size, optimizer) combination; the same placeholders appear
// in the job name, command, and output-file references so each expanded job
// lines up with its matching model/metrics file specs above.
job "train_lr{lr:.4f}_bs{batch_size}_opt{optimizer}" {
    command "python train.py --learning-rate={lr} --batch-size={batch_size} --optimizer={optimizer} --train-data=/data/train.pkl --val-data=/data/validation.pkl --model-output=/models/model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.pt --metrics-output=/results/metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.json"
    resource_requirements "gpu_training"
    // Both dataset-preparation jobs must finish before any training job starts.
    depends_on_job "prepare_train_data"
    depends_on_job "prepare_val_data"
    input_file "train_data"
    input_file "val_data"
    // Per-combination artifacts declared in the parameterized file specs above.
    output_file "model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}"
    output_file "metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}"
    use_parameters "lr" "batch_size" "optimizer"
}

// Aggregate results from all training runs
// Final fan-in job: reads every per-run metrics JSON from /results and writes a
// single summary CSV. Because its dependency/input references are parameterized,
// expansion makes it wait on all 18 training jobs and their metrics files.
job "aggregate_results" {
    command "python scripts/aggregate_metrics.py --input-dir=/results --output=/results/summary.csv"
    resource_requirements "minimal"
    // This will be expanded to wait for all 18 training jobs
    depends_on_job "train_lr{lr:.4f}_bs{batch_size}_opt{optimizer}"
    // Wait for all metrics files
    input_file "metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}"
    use_parameters "lr" "batch_size" "optimizer"
}