#[derive(Subcommand, Debug)]
pub enum ModelOpsCommands {
    /// Fine-tune model with LoRA/QLoRA (GH-244)
    #[cfg(feature = "training")]
    Finetune {
        /// Input model file
        #[arg(value_name = "FILE")]
        file: Option<PathBuf>,
        /// Fine-tuning method: auto, full, lora, qlora
        #[arg(long, short = 'm', default_value = "auto")]
        method: String,
        /// LoRA rank (default: auto-selected)
        #[arg(long, short = 'r')]
        rank: Option<u32>,
        /// Available VRAM in GB
        #[arg(long, default_value = "16.0")]
        vram: f64,
        /// Plan mode (estimate only)
        #[arg(long)]
        plan: bool,
        /// Training data file (JSONL format)
        #[arg(long, short = 'd', value_name = "FILE")]
        data: Option<PathBuf>,
        /// Output path (adapter dir or merged model)
        #[arg(short, long)]
        output: Option<PathBuf>,
        /// Adapter path for merge mode
        #[arg(long)]
        adapter: Option<PathBuf>,
        /// Merge adapter into base model
        #[arg(long)]
        merge: bool,
        /// Training epochs
        #[arg(long, default_value = "3")]
        epochs: u32,
        /// Learning rate
        #[arg(long, default_value = "0.0002")]
        learning_rate: f64,
        /// Model size for planning (e.g., "7B", "1.5B")
        #[arg(long, value_name = "SIZE")]
        model_size: Option<String>,
        /// Fine-tuning task: classify (sequence classification)
        #[arg(long)]
        task: Option<String>,
        /// Number of classes for classification task
        #[arg(long, default_value = "5")]
        num_classes: usize,
        /// Output format for checkpoints: apr, safetensors, or both (comma-separated)
        #[arg(long, value_name = "FORMAT", default_value = "apr,safetensors")]
        checkpoint_format: String,
        /// Oversample minority classes to match majority (for imbalanced datasets)
        #[arg(long)]
        oversample: bool,
        /// Maximum sequence length for GPU buffer allocation (lower = less VRAM)
        #[arg(long, value_name = "LEN")]
        max_seq_len: Option<usize>,
        /// Quantize frozen weights to NF4 (4-bit) for QLoRA training (~8x VRAM savings)
        #[arg(long)]
        quantize_nf4: bool,
        /// GPU indices for data-parallel training (e.g., "0,1" for dual GPU)
        #[arg(long, value_name = "INDICES")]
        gpus: Option<String>,
        /// GPU backend selection: auto, cuda, wgpu
        #[arg(long, default_value = "auto")]
        gpu_backend: String,
        /// Distributed training role: coordinator or worker
        #[arg(long, value_name = "ROLE")]
        role: Option<String>,
        /// Address to bind (coordinator) or connect to (worker)
        #[arg(long, value_name = "ADDR")]
        bind: Option<String>,
        /// Coordinator address for worker nodes (e.g., "intel:9000")
        #[arg(long, value_name = "ADDR")]
        coordinator: Option<String>,
        /// Expected number of workers (coordinator only)
        #[arg(long, value_name = "N")]
        expect_workers: Option<usize>,
        /// Wait for VRAM availability before training (timeout in seconds, 0 = no wait)
        #[arg(long, value_name = "SECS", default_value = "0")]
        wait_gpu: u64,
        /// Multi-adapter training: data:checkpoint pairs (GPU-SHARE Phase 2)
        /// Format: --adapters data/corpus-a.jsonl:checkpoints/adapter-a
        /// Can be specified multiple times for concurrent adapter training.
        #[arg(long, value_name = "DATA:CHECKPOINT")]
        adapters: Vec<String>,
        /// Multi-adapter config file: TOML with [[adapter]] entries (GPU-SHARE §2.4)
        #[arg(long, value_name = "FILE")]
        adapters_config: Option<PathBuf>,
        /// Enable experimental CUDA MPS for concurrent GPU sharing (GPU-SHARE §1.5).
        /// WARNING: A GPU fault in any MPS client will crash ALL clients on that GPU.
        #[arg(long)]
        experimental_mps: bool,
        /// MPS thread percentage (1-100). Controls SM allocation per process.
        /// Only effective with --experimental-mps. Default: 50.
        #[arg(long, value_name = "PCT", default_value = "50")]
        gpu_share: u32,
        /// PMAT-486: Enable StepProfiler for per-phase wall-clock timing
        #[arg(long)]
        profile: bool,
    },
    /// Prune model (structured/unstructured pruning) (GH-247)
    Prune {
        /// Input model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Pruning method: magnitude, structured, depth, width, wanda, sparsegpt
        #[arg(long, short = 'm', default_value = "magnitude")]
        method: String,
        /// Target pruning ratio (0-1)
        #[arg(long, default_value = "0.5")]
        target_ratio: f32,
        /// Sparsity level (0-1)
        #[arg(long, default_value = "0.0")]
        sparsity: f32,
        /// Output file path
        #[arg(short, long)]
        output: Option<PathBuf>,
        /// Layers to remove for depth pruning (e.g., "20-24")
        #[arg(long)]
        remove_layers: Option<String>,
        /// Analyze mode (identify pruning opportunities)
        #[arg(long)]
        analyze: bool,
        /// Plan mode (estimate only)
        #[arg(long)]
        plan: bool,
        /// Calibration data file
        #[arg(long, value_name = "FILE")]
        calibration: Option<PathBuf>,
    },
    /// Knowledge distillation (teacher -> student) (GH-247, ALB-011)
    Distill {
        /// Teacher model file (positional, for file-based mode)
        #[arg(value_name = "TEACHER")]
        teacher: Option<PathBuf>,
        /// Student model file
        #[arg(long, value_name = "FILE")]
        student: Option<PathBuf>,
        /// Training data file
        #[arg(long, short = 'd', value_name = "FILE")]
        data: Option<PathBuf>,
        /// Output file path
        #[arg(short, long)]
        output: Option<PathBuf>,
        /// Distillation strategy: standard, progressive, ensemble
        #[arg(long, default_value = "standard")]
        strategy: String,
        /// Temperature for softmax scaling
        #[arg(long, default_value = "3.0")]
        temperature: f64,
        /// Alpha weight for KL vs task loss
        #[arg(long, default_value = "0.7")]
        alpha: f64,
        /// Training epochs
        #[arg(long, default_value = "3")]
        epochs: u32,
        /// Plan mode (estimate only)
        #[arg(long)]
        plan: bool,
        /// YAML config file for two-stage distillation (ALB-011)
        #[arg(long, value_name = "FILE")]
        config: Option<PathBuf>,
        /// Distillation stage: precompute, train (logit KD), or generate (text-based, GH-455)
        #[arg(long, value_name = "STAGE")]
        stage: Option<String>,
    },
}