---
# Grid-search sweep definition: jobs below that list `use_parameters` are
# expanded once per combination of the referenced parameter values.
name: hyperparameter_sweep_shared_params
description: Grid search over learning rate, batch size, and optimizer (using shared parameters)

# Shared sweep parameters. Each value is a string holding a Python-style list
# literal — presumably evaluated by the consuming tool; confirm its parser
# before reformatting these as native YAML sequences.
parameters:
  lr: "[0.0001,0.001,0.01]"
  batch_size: "[16,32,64]"
  optimizer: "['adam','sgd']"
# Job graph: data preparation -> parameterized training fan-out -> aggregation.
jobs:
  # One-off preparation of the training split (not parameterized).
  - name: prepare_train_data
    command: python scripts/prepare_data.py --split=train --output=/data/train.pkl
    resource_requirements: data_prep
    output_files:
      - train_data

  # One-off preparation of the validation split.
  - name: prepare_val_data
    command: python scripts/prepare_data.py --split=validation --output=/data/validation.pkl
    resource_requirements: data_prep
    output_files:
      - val_data

  # Training job, expanded once per (lr, batch_size, optimizer) combination
  # (3 * 3 * 2 = 18 jobs given the parameter lists above). The {lr:.4f}-style
  # placeholders keep expanded job/file names unique per combination.
  - name: train_lr{lr:.4f}_bs{batch_size}_opt{optimizer}
    command: |
      python train.py \
        --learning-rate={lr} \
        --batch-size={batch_size} \
        --optimizer={optimizer} \
        --train-data=/data/train.pkl \
        --val-data=/data/validation.pkl \
        --model-output=/models/model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.pt \
        --metrics-output=/results/metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.json
    resource_requirements: gpu_training
    depends_on:
      - prepare_train_data
      - prepare_val_data
    input_files:
      - train_data
      - val_data
    output_files:
      - model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}
      - metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}
    use_parameters:
      - lr
      - batch_size
      - optimizer

  # NOTE(review): this job also lists use_parameters, so it presumably expands
  # into one aggregate job per parameter combination, each re-reading the whole
  # /results directory. If a single fan-in aggregation over all training jobs
  # was intended, confirm the tool's semantics for a parameterized depends_on
  # on a job without use_parameters.
  - name: aggregate_results
    command: python scripts/aggregate_metrics.py --input-dir=/results --output=/results/summary.csv
    resource_requirements: minimal
    depends_on:
      - train_lr{lr:.4f}_bs{batch_size}_opt{optimizer}
    input_files:
      - metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}
    use_parameters:
      - lr
      - batch_size
      - optimizer
# Logical file registry: maps the names used in input_files/output_files above
# to concrete paths. Templated entries expand with the same parameters as the
# jobs that produce/consume them.
files:
  - name: train_data
    path: /data/train.pkl
  - name: val_data
    path: /data/validation.pkl
  - name: model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}
    path: /models/model_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.pt
    use_parameters:
      - lr
      - batch_size
      - optimizer
  - name: metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}
    path: /results/metrics_lr{lr:.4f}_bs{batch_size}_opt{optimizer}.json
    use_parameters:
      - lr
      - batch_size
      - optimizer
# Named resource profiles referenced by jobs via `resource_requirements`.
# `runtime` values are ISO-8601 durations; `memory` uses a "<n>g" shorthand
# (presumably gigabytes — confirm against the tool's units).
resource_requirements:
  # Lightweight CPU-only profile (aggregation).
  - name: minimal
    num_cpus: 1
    num_gpus: 0
    num_nodes: 1
    memory: 2g
    runtime: PT5M
  # Multi-core CPU profile for data preparation.
  - name: data_prep
    num_cpus: 4
    num_gpus: 0
    num_nodes: 1
    memory: 8g
    runtime: PT30M
  # Single-GPU profile for the training fan-out.
  - name: gpu_training
    num_cpus: 8
    num_gpus: 1
    num_nodes: 1
    memory: 32g
    runtime: PT2H