/// Training pipeline subcommands (forjar-style plan/apply).
///
/// Thin CLI wrappers around entrenar's training plan/apply infrastructure.
#[derive(Subcommand, Debug)]
pub enum TrainCommands {
/// Generate a training plan without touching the GPU.
///
/// Validates data quality, checks model compatibility, builds HPO search space,
/// estimates resource usage, and runs pre-flight checks. Outputs a serializable
/// plan manifest (text, JSON, or YAML).
///
/// Analogous to `forjar plan` — shows what will happen before committing GPU time.
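// Example invocations (illustrative paths/values; long flags are clap's default
// kebab-case rendering of the fields below):
//   apr train plan --data data/train.jsonl --num-classes 5 --strategy tpe --budget 20 --format yaml
//   apr train plan --task pretrain --config configs/pretrain.yaml --format json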
Plan {
/// Path to training data (JSONL) — required for --task classify
#[arg(long, value_name = "FILE")]
data: Option<PathBuf>,
/// Model size: "0.5B", "7B", "9B", "13B"
#[arg(long, default_value = "0.5B")]
model_size: String,
/// Path to model weights directory
#[arg(long, value_name = "DIR")]
model_path: Option<PathBuf>,
/// Number of output classes
#[arg(long, default_value = "5")]
num_classes: usize,
/// Task type: classify, pretrain
#[arg(long, default_value = "classify")]
task: String,
/// YAML training config (for --task pretrain)
#[arg(long, value_name = "FILE")]
config: Option<PathBuf>,
/// Output directory for checkpoints
#[arg(short, long, default_value = "/tmp/training-output")]
output: PathBuf,
/// HPO strategy: tpe, grid, random, manual
#[arg(long, default_value = "tpe")]
strategy: String,
/// HPO budget (number of trials)
#[arg(long, default_value = "20")]
budget: usize,
/// Scout mode: 1 epoch per trial for fast exploration
#[arg(long)]
scout: bool,
/// Maximum epochs per trial
#[arg(long, default_value = "3")]
max_epochs: usize,
/// Manual learning rate (only used with --strategy manual)
#[arg(long)]
learning_rate: Option<f32>,
/// Manual LoRA rank (only used with --strategy manual)
#[arg(long)]
lora_rank: Option<usize>,
/// Manual batch size (only used with --strategy manual)
#[arg(long)]
batch_size: Option<usize>,
/// Validation data file (JSONL)
#[arg(long, value_name = "FILE")]
val_data: Option<PathBuf>,
/// Test data file (JSONL)
#[arg(long, value_name = "FILE")]
test_data: Option<PathBuf>,
/// Output format: text, json, yaml
#[arg(long, default_value = "text")]
format: String,
},
/// Execute a training plan (allocate GPU, run trials).
///
/// Reads a previously generated plan (YAML/JSON) and executes it:
/// - Manual strategy: single training run with specified hyperparameters
/// - HPO strategy: multiple trials with automatic hyperparameter tuning
///
/// Analogous to `forjar apply` — commits resources and executes the plan.
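// Example invocations (illustrative paths/values):
//   apr train apply --plan plan.yaml
//   apr train apply --data data/train.jsonl --strategy manual --learning-rate 1e-4 --lora-rank 16 --batch-size 8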
Apply {
/// Path to a saved plan file (YAML or JSON from `apr train plan`)
#[arg(long, value_name = "FILE")]
plan: Option<PathBuf>,
/// YAML training config (for --task pretrain)
#[arg(long, value_name = "FILE")]
config: Option<PathBuf>,
/// Task type: classify, pretrain
#[arg(long, default_value = "classify")]
task: String,
// ── Inline plan params (used when no --plan file is given) ─────
/// Path to training data (JSONL)
#[arg(long, value_name = "FILE")]
data: Option<PathBuf>,
/// Model size: "0.5B", "7B", "9B", "13B"
#[arg(long, default_value = "0.5B")]
model_size: String,
/// Path to model weights directory
#[arg(long, value_name = "DIR")]
model_path: Option<PathBuf>,
/// Number of output classes
#[arg(long, default_value = "5")]
num_classes: usize,
/// Output directory for checkpoints and leaderboard
#[arg(short, long, default_value = "/tmp/training-output")]
output: PathBuf,
/// HPO strategy: tpe, grid, random, manual
#[arg(long, default_value = "tpe")]
strategy: String,
/// HPO budget (number of trials)
#[arg(long, default_value = "20")]
budget: usize,
/// Scout mode: 1 epoch per trial
#[arg(long)]
scout: bool,
/// Maximum epochs per trial
#[arg(long, default_value = "3")]
max_epochs: usize,
/// Manual learning rate (only used with --strategy manual)
#[arg(long)]
learning_rate: Option<f32>,
/// Manual LoRA rank (only used with --strategy manual)
#[arg(long)]
lora_rank: Option<usize>,
/// Manual batch size (only used with --strategy manual)
#[arg(long)]
batch_size: Option<usize>,
// ── Distributed training params (tickets #131-#140, aprender #393) ──
/// Enable distributed data-parallel training
#[arg(long)]
distributed: bool,
/// Total number of workers (default: auto-detect GPUs)
#[arg(long, value_name = "N")]
world_size: Option<usize>,
/// This worker's global rank (default: 0 = coordinator)
#[arg(long, value_name = "N")]
rank: Option<usize>,
/// Coordinator address for distributed training (default: 0.0.0.0:9000)
#[arg(long, value_name = "HOST:PORT")]
coordinator_addr: Option<String>,
// ── Reproducibility params (R-084 C-DETERM-001) ──
/// Enable bitwise deterministic training (CUBLAS_WORKSPACE_CONFIG, cuDNN deterministic)
#[arg(long)]
deterministic: bool,
/// Random seed for reproducibility (default: from YAML or 42)
#[arg(long, value_name = "N")]
seed: Option<u64>,
// ── Profiling params (PMAT-486) ──
/// Enable StepProfiler for per-phase wall-clock timing (KAIZEN-047)
#[arg(long)]
profile: bool,
/// StepProfiler report interval (every N steps, default: 50)
#[arg(long, value_name = "N", default_value = "50")]
profile_interval: usize,
},
/// Watch a training run with automatic restart on crash and hang detection.
///
/// Monitors a running or to-be-started training process:
/// - Detects crashes (SIGABRT, SIGSEGV, OOM) and restarts with backoff
/// - Detects hangs via heartbeat/training_state.json staleness
/// - Captures GPU state and crash diagnostics
/// - Auto-enables CUDA_LAUNCH_BLOCKING on async crash pattern
///
/// Sovereign Rust replacement for train-guard.sh.
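// Example invocation (illustrative; subcommand name assumes clap's default
// kebab-case rendering of the variant):
//   apr train watch --config configs/pretrain.yaml --max-restarts 3 --heartbeat-timeout 600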
Watch {
/// YAML training config to run and watch
#[arg(long, value_name = "FILE")]
config: PathBuf,
/// Maximum number of restart attempts
#[arg(long, default_value = "5")]
max_restarts: usize,
/// Heartbeat staleness threshold in seconds
#[arg(long, default_value = "300")]
heartbeat_timeout: u64,
/// Initial backoff delay in seconds
#[arg(long, default_value = "30")]
backoff_initial: u64,
/// Maximum backoff delay in seconds
#[arg(long, default_value = "600")]
backoff_max: u64,
},
/// Generate hyperparameter sweep configs from a base YAML.
///
/// Creates N training configs with varied hyperparameters using grid
/// or random search. Each config is a complete YAML that can be
/// passed to `apr train apply --task pretrain --config <file>`.
///
/// Sovereign Rust replacement for hyperparam-sweep.py.
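// Example invocation (illustrative path):
//   apr train sweep --config configs/pretrain.yaml --strategy random --num-configs 10 --output-dir sweeps/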
Sweep {
/// Base YAML training config to sweep from
#[arg(long, value_name = "FILE")]
config: PathBuf,
/// Search strategy: grid or random
#[arg(long, default_value = "random")]
strategy: String,
/// Number of configs to generate (random) or max combinations (grid)
#[arg(long, default_value = "10")]
num_configs: usize,
/// Output directory for generated configs
#[arg(long, default_value = "sweeps/")]
output_dir: PathBuf,
/// Seed for random search reproducibility
#[arg(long, default_value = "42")]
seed: u64,
},
/// Run successive halving HPO on sweep configs (C-HPO-001).
///
/// Takes a directory of sweep configs (from `apr train sweep`), runs each
/// for `--steps-per-round` steps, kills the worst half by val_ppl, doubles
/// steps, and repeats for `--rounds` rounds. Reports the winner with
/// μTransfer-scaled LR for the target model width.
///
/// References: Hyperband (Li et al. 2018, arXiv:1603.06560),
/// μTransfer (Yang et al. 2022, arXiv:2203.03466).
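// Example invocation (sweep dir as produced by `apr train sweep`; widths illustrative):
//   apr train halving --sweep-dir sweeps/ --rounds 3 --steps-per-round 500 --source-width 512 --target-width 1024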
Halving {
/// Directory containing sweep-*.yaml configs (from `apr train sweep`)
#[arg(long, value_name = "DIR")]
sweep_dir: PathBuf,
/// Number of halving rounds (default: 3)
#[arg(long, default_value = "3")]
rounds: usize,
/// Training steps in first round (doubles each round)
#[arg(long, default_value = "500")]
steps_per_round: usize,
/// Proxy model hidden_size (for μTransfer scaling)
#[arg(long, default_value = "512")]
source_width: usize,
/// Target model hidden_size (for μTransfer scaling)
#[arg(long, default_value = "1024")]
target_width: usize,
/// Output JSON file for results
#[arg(long, default_value = "sweeps/hpo-results.json")]
output: PathBuf,
},
/// Archive a checkpoint into a release bundle.
///
/// Packages model weights, config, training state, and metadata
/// into a self-contained directory with integrity manifest.
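// Example invocation (illustrative paths; the checkpoint directory is positional):
//   apr train archive checkpoints/step-10000 --output releases/v1.0 --release-version v1.0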
Archive {
/// Path to checkpoint directory
#[arg(value_name = "CHECKPOINT_DIR")]
checkpoint_dir: PathBuf,
/// Output archive directory
#[arg(short, long, value_name = "DIR")]
output: PathBuf,
/// Release version tag (e.g., "v1.0")
#[arg(long = "release-version")]
release_version: Option<String>,
/// Release notes
#[arg(long)]
notes: Option<String>,
},
/// Submit multi-adapter training jobs to a cluster (GPU-SHARE Phase 3).
///
/// Reads a cluster.yaml config, places adapter jobs across nodes using
/// the greedy placement algorithm, and generates launch commands.
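// Example invocation (illustrative paths; --adapter may be repeated, one DATA:CHECKPOINT pair per adapter):
//   apr train submit --cluster cluster.yaml --model model.apr --adapter data/a.jsonl:out/a --adapter data/b.jsonl:out/b --dry-run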
Submit {
/// Path to cluster config YAML
#[arg(long, value_name = "FILE")]
cluster: PathBuf,
/// Model checkpoint path (.apr)
#[arg(long, value_name = "FILE")]
model: PathBuf,
/// Adapter specs: DATA:CHECKPOINT pairs (one per adapter)
#[arg(long = "adapter", value_name = "DATA:CHECKPOINT")]
adapters: Vec<String>,
/// LoRA rank
#[arg(long, default_value = "16")]
rank: u32,
/// Number of training epochs
#[arg(long, default_value = "3")]
epochs: u32,
/// Estimated VRAM budget per adapter (MB)
#[arg(long, default_value = "6000")]
budget_mb: u64,
/// Dry run: show placement and commands without executing
#[arg(long)]
dry_run: bool,
},
/// Show cluster status: nodes, GPUs, adapter capacity (GPU-SHARE Phase 3).
///
/// Reads a cluster.yaml config and displays node health, VRAM availability,
/// and adapter placement capacity.
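// Example invocation (subcommand name assumes clap's default kebab-case rendering):
//   apr train cluster-status --cluster cluster.yaml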
ClusterStatus {
/// Path to cluster config YAML
#[arg(long, value_name = "FILE")]
cluster: PathBuf,
},
}
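
// A minimal sketch (hypothetical helper, not part of the original CLI wiring)
// showing how a caller might destructure these variants; the one-line summaries
// mirror the doc comments above.
#[allow(dead_code)]
pub fn describe_train_command(cmd: &TrainCommands) -> &'static str {
    match cmd {
        TrainCommands::Plan { .. } => "plan: validate data, build the HPO search space, emit a plan manifest",
        TrainCommands::Apply { .. } => "apply: execute a plan (single manual run or HPO trials)",
        TrainCommands::Watch { .. } => "watch: supervise a run with restarts, backoff, and hang detection",
        TrainCommands::Sweep { .. } => "sweep: generate hyperparameter sweep configs from a base YAML",
        TrainCommands::Halving { .. } => "halving: successive-halving HPO over sweep configs",
        TrainCommands::Archive { .. } => "archive: bundle a checkpoint into a release directory",
        TrainCommands::Submit { .. } => "submit: place multi-adapter jobs across cluster nodes",
        TrainCommands::ClusterStatus { .. } => "cluster-status: show node health and adapter capacity",
    }
}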