// apr_cli/extended_commands.rs
1
/// Extended CLI commands (analysis, profiling, QA, benchmarks, and advanced tools).
///
/// Flattened into `Commands` via `#[command(flatten)]` so all subcommands remain
/// top-level from the user's perspective (e.g., `apr chat`, `apr profile`).
//
// NOTE: every `///` doc comment below is emitted by clap as `--help` text for the
// corresponding subcommand or flag — treat those lines as user-facing strings, not
// free-form comments. Only the `//` lines are reviewer/maintainer commentary.
#[derive(Subcommand, Debug)]
pub enum ExtendedCommands {
    /// Interactive chat with language model
    Chat {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Sampling temperature (0 = greedy, higher = more random)
        #[arg(long, default_value = "0.7")]
        temperature: f32,
        /// Nucleus sampling threshold
        #[arg(long, default_value = "0.9")]
        top_p: f32,
        /// Maximum tokens to generate per response
        #[arg(long, default_value = "512")]
        max_tokens: usize,
        /// System prompt to set model behavior
        #[arg(long)]
        system: Option<String>,
        /// Show inspection info (top-k probs, tokens/sec)
        #[arg(long)]
        inspect: bool,
        // NOTE(review): --no-gpu and --gpu are not declared mutually exclusive
        // (no `conflicts_with`), so clap accepts both flags together; presumably
        // the handler defines which one wins — confirm before relying on either.
        /// Disable GPU acceleration (use CPU)
        #[arg(long)]
        no_gpu: bool,
        /// Force GPU acceleration (requires CUDA)
        #[arg(long)]
        gpu: bool,
        // Tracing flags (APR-TRACE-001): independent toggles layered on top of
        // the base chat loop.
        /// Enable inference tracing (APR-TRACE-001)
        #[arg(long)]
        trace: bool,
        /// Trace specific steps only (comma-separated)
        #[arg(long, value_delimiter = ',')]
        trace_steps: Option<Vec<String>>,
        /// Verbose tracing
        #[arg(long)]
        trace_verbose: bool,
        /// Save trace output to JSON file
        #[arg(long, value_name = "FILE")]
        trace_output: Option<PathBuf>,
        // NOTE(review): stringly-typed — the documented values (none, basic,
        // layer, payload) are not validated at parse time; a clap `ValueEnum`
        // would reject typos before the handler runs.
        /// Trace detail level (none, basic, layer, payload)
        #[arg(long, value_name = "LEVEL", default_value = "basic")]
        trace_level: String,
        /// Enable inline Roofline profiling (PMAT-SHOWCASE-METHODOLOGY-001)
        #[arg(long)]
        profile: bool,
    },
    /// Benchmark throughput (spec H12: >= 10 tok/s)
    Bench {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Number of measurement iterations
        #[arg(long, default_value = "5")]
        iterations: usize,
        /// Max tokens to generate per iteration
        #[arg(long, default_value = "32")]
        max_tokens: usize,
        /// Test prompt
        #[arg(long)]
        prompt: Option<String>,
        /// Use realizar for fast inference (vs aprender baseline)
        #[arg(long)]
        fast: bool,
        /// Benchmark specific brick
        #[arg(long)]
        brick: Option<String>,
    },
    /// Evaluate model perplexity (spec H13: PPL <= 20)
    Eval {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        // NOTE(review): stringly-typed dataset selector; `text` below is only
        // meaningful when dataset=custom, but that pairing is not enforced by
        // clap (no `requires`/`required_if_eq`) — presumably checked in the
        // handler. TODO confirm.
        /// Dataset: wikitext-2, lambada, or custom
        #[arg(long, default_value = "wikitext-2")]
        dataset: String,
        /// Custom text (when dataset=custom)
        #[arg(long)]
        text: Option<String>,
        /// Maximum tokens to evaluate
        #[arg(long, default_value = "512")]
        max_tokens: usize,
        /// Perplexity threshold for pass/fail
        #[arg(long, default_value = "20.0")]
        threshold: f32,
    },
    /// Deep profiling with Roofline analysis
    Profile {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Layer-by-layer granular analysis
        #[arg(long)]
        granular: bool,
        /// Output format (human, json, flamegraph)
        #[arg(long, default_value = "human")]
        format: String,
        /// Focus on specific operation
        #[arg(long)]
        focus: Option<String>,
        /// Detect naive implementations
        #[arg(long)]
        detect_naive: bool,
        // NOTE(review): this threshold (f64, GFLOPS) is unrelated to
        // Eval::threshold (f32, perplexity) despite the shared field name.
        /// GFLOPS threshold for naive detection
        #[arg(long, default_value = "10.0")]
        threshold: f64,
        /// Compare against HuggingFace baseline
        #[arg(long)]
        compare_hf: Option<String>,
        /// Measure energy consumption (requires RAPL)
        #[arg(long)]
        energy: bool,
        /// Compute performance grade (vs Ollama baseline)
        #[arg(long)]
        perf_grade: bool,
        /// Show call graph
        #[arg(long)]
        callgraph: bool,
        /// Exit non-zero if naive implementation detected
        #[arg(long)]
        fail_on_naive: bool,
        /// Output file path for flamegraph SVG (GH-174, PMAT-182)
        #[arg(long, short = 'o')]
        output: Option<PathBuf>,

        // PMAT-192: CI Assertion Mode (GH-180)
        /// Enable CI mode with assertion checks (exits 1 on failure)
        #[arg(long)]
        ci: bool,
        /// Minimum throughput in tok/s (CI assertion, exits 1 if below)
        #[arg(long)]
        assert_throughput: Option<f64>,
        /// Maximum p99 latency in ms (CI assertion, exits 1 if above)
        #[arg(long)]
        assert_p99: Option<f64>,
        /// Maximum p50 latency in ms (CI assertion, exits 1 if above)
        #[arg(long)]
        assert_p50: Option<f64>,
        /// Warmup passes before measurement (default: 3)
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Measurement passes (default: 10)
        #[arg(long, default_value = "10")]
        measure: usize,
        /// Number of tokens to generate per measurement pass (default: 32)
        #[arg(long, default_value = "32")]
        tokens: usize,
        /// Compare against Ollama baseline (runs ollama for comparison)
        #[arg(long)]
        ollama: bool,
        /// Disable GPU (force CPU-only profiling)
        #[arg(long)]
        no_gpu: bool,
        /// Compare against another model format (F-PROFILE-011)
        #[arg(long, value_name = "FILE")]
        compare: Option<PathBuf>,
    },
    /// Falsifiable QA checklist for model releases
    Qa {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        // Assertion thresholds: each is optional; when absent the corresponding
        // gate presumably runs without a hard pass/fail bound — TODO confirm.
        /// Minimum throughput threshold in tok/s
        #[arg(long, value_name = "TPS")]
        assert_tps: Option<f64>,
        /// Minimum speedup vs Ollama
        #[arg(long, value_name = "SPEEDUP")]
        assert_speedup: Option<f64>,
        /// Minimum GPU vs CPU speedup (F-PERF-042)
        #[arg(long, value_name = "SPEEDUP")]
        assert_gpu_speedup: Option<f64>,
        // Gate skip toggles: each flag disables one QA gate.
        /// Skip golden output test
        #[arg(long)]
        skip_golden: bool,
        /// Skip throughput benchmark
        #[arg(long)]
        skip_throughput: bool,
        /// Skip Ollama parity comparison
        #[arg(long)]
        skip_ollama: bool,
        /// Skip GPU vs CPU speedup test (F-PERF-042)
        #[arg(long)]
        skip_gpu_speedup: bool,
        /// Skip tensor contract validation (PMAT-235)
        #[arg(long)]
        skip_contract: bool,
        /// Skip cross-format parity test (F-QUAL-032)
        #[arg(long)]
        skip_format_parity: bool,
        /// Skip PTX parity validation (GH-219)
        #[arg(long)]
        skip_ptx_parity: bool,
        /// SafeTensors model path for cross-format parity test (F-QUAL-032)
        #[arg(long, value_name = "PATH")]
        safetensors_path: Option<PathBuf>,
        /// Number of benchmark iterations
        #[arg(long, default_value = "10")]
        iterations: usize,
        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Maximum tokens to generate
        #[arg(long, default_value = "32")]
        max_tokens: usize,
        /// Output as JSON (for CI integration)
        #[arg(long)]
        json: bool,
        /// Verbose output
        #[arg(short, long)]
        verbose: bool,
        // Guards against a QA run that silently skips everything: fail if
        // fewer than N gates actually executed.
        /// Minimum number of gates that must execute (fail if fewer)
        #[arg(long, value_name = "N")]
        min_executed: Option<usize>,
        /// Previous QA report for regression detection
        #[arg(long, value_name = "FILE")]
        previous_report: Option<PathBuf>,
        /// Maximum allowed performance regression ratio (default: 0.10 = 10%)
        #[arg(long, value_name = "RATIO")]
        regression_threshold: Option<f64>,
        /// Skip GPU state isolation test
        #[arg(long)]
        skip_gpu_state: bool,
        /// Skip metadata plausibility validation (Bug 210, GH-222)
        #[arg(long)]
        skip_metadata: bool,
        /// Skip GPU capability match gate (GH-280)
        #[arg(long)]
        skip_capability: bool,
        /// Assert classifier head presence and shape (F-CLASS-004)
        #[arg(long)]
        assert_classifier_head: bool,
    },
    /// GPU/CPU parity check (PMAT-232: genchi genbutsu — see where GPU diverges)
    Parity {
        /// Path to GGUF model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Prompt text (default: "What is 2+2?")
        #[arg(short, long, default_value = "What is 2+2?")]
        prompt: String,
        // `assert` is a plain identifier here (not a reserved word), exposed
        // as the --assert flag.
        /// Assert parity (exit non-zero on divergence)
        #[arg(long)]
        assert: bool,
    },
    /// Model-to-PTX source mapping (Mieruka: make GPU kernel dispatch visible)
    #[command(name = "ptx-map")]
    PtxMap {
        /// Path to GGUF model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter to specific kernel (e.g., --kernel Q4KGemv)
        #[arg(long)]
        kernel: Option<String>,
        /// Reverse lookup: kernel name -> which layers/steps use it
        #[arg(long)]
        reverse: Option<String>,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Full PTX snippets and detailed analysis
        #[arg(short, long)]
        verbose: bool,
        /// Show batched prefill kernel variants instead of decode
        #[arg(long)]
        prefill: bool,
    },
    /// PTX analysis and bug detection (trueno-explain: register pressure, roofline, 15+ bug detectors)
    #[command(name = "ptx")]
    Ptx {
        // NOTE(review): both `file` and `kernel` are optional and no clap group
        // requires at least one of them; presumably the handler errors when
        // neither is given — TODO confirm.
        /// Path to a PTX source file
        #[arg(value_name = "FILE")]
        file: Option<PathBuf>,
        /// Analyze a named kernel from trueno-gpu
        #[arg(long, short)]
        kernel: Option<String>,
        /// Strict mode (no performance whitelist)
        #[arg(long)]
        strict: bool,
        /// Show only bug analysis (skip register/memory/roofline)
        #[arg(long)]
        bugs: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Verbose output (include PTX source listing)
        #[arg(short, long)]
        verbose: bool,
    },
    /// ML tuning: LoRA/QLoRA configuration and memory planning (GH-176)
    Tune {
        /// Path to model file (optional if using --model)
        #[arg(value_name = "FILE")]
        file: Option<PathBuf>,
        // NOTE(review): stringly-typed method selector (auto|full|lora|qlora),
        // unvalidated at parse time.
        /// Tuning method: auto, full, lora, qlora
        #[arg(long, short = 'm', default_value = "auto")]
        method: String,
        /// LoRA rank (default: auto-selected)
        #[arg(long, short = 'r')]
        rank: Option<u32>,
        /// Available VRAM in GB
        #[arg(long, default_value = "16.0")]
        vram: f64,
        /// Only plan configuration, don't train
        #[arg(long)]
        plan: bool,
        /// Model size for planning (e.g., "7B", "1.5B")
        #[arg(long, value_name = "SIZE")]
        model: Option<String>,
        /// Freeze base model weights
        #[arg(long)]
        freeze_base: bool,
        /// Training data file (JSONL format)
        #[arg(long, value_name = "FILE")]
        train_data: Option<PathBuf>,
        /// Output as JSON (for CI integration)
        #[arg(long)]
        json: bool,
    },
    /// Attach live TUI to a running training session
    Monitor {
        /// Experiment output directory (same as finetune -o)
        #[arg(value_name = "DIR")]
        dir: PathBuf,
        /// Refresh interval in milliseconds
        #[arg(long, default_value = "500")]
        refresh_ms: u64,
        /// Compact display mode
        #[arg(long)]
        compact: bool,
    },
    /// ComputeBrick pipeline monitor (cbtop)
    Cbtop {
        /// Model name (e.g., qwen2.5-coder-1.5b)
        #[arg(long)]
        model: Option<String>,
        /// Attach to running realizar process
        #[arg(long)]
        attach: Option<String>,
        /// Path to GGUF model file for real profiling
        #[arg(long, value_name = "MODEL")]
        model_path: Option<PathBuf>,
        // NOTE(review): --json and --output document a dependency on --headless,
        // but no `requires` attribute enforces it — presumably checked in the
        // handler. TODO confirm.
        /// Run in headless mode (no TUI, for CI/automation)
        #[arg(long)]
        headless: bool,
        /// Output JSON format (requires --headless)
        #[arg(long)]
        json: bool,
        /// Output file path (requires --headless)
        #[arg(long, value_name = "FILE")]
        output: Option<PathBuf>,
        /// CI mode: exit with code 1 if thresholds not met
        #[arg(long)]
        ci: bool,
        /// Minimum throughput threshold in tok/s (for --ci)
        #[arg(long, value_name = "TOK_S")]
        throughput: Option<f64>,
        /// Minimum brick score threshold 0-100 (for --ci)
        #[arg(long, value_name = "SCORE")]
        brick_score: Option<u32>,
        /// Number of warmup iterations before measurement
        #[arg(long, default_value = "10")]
        warmup: usize,
        /// Number of measurement iterations
        #[arg(long, default_value = "100")]
        iterations: usize,
        /// PAR-100: Enable speculative decoding benchmark
        #[arg(long)]
        speculative: bool,
        /// PAR-100: Number of tokens to draft speculatively (default: 4)
        #[arg(long, default_value = "4")]
        speculation_k: usize,
        /// PAR-099: Path to draft model for speculative decoding
        #[arg(long, value_name = "DRAFT_MODEL")]
        draft_model: Option<PathBuf>,
        /// PAR-102: Number of concurrent requests
        #[arg(long, default_value = "1")]
        concurrent: usize,
        /// Use simulated data (for CI testing only)
        #[arg(long)]
        simulated: bool,
    },
    /// Export for probar visual testing
    Probar {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Output directory for test artifacts
        #[arg(short, long, default_value = "./probar-export")]
        output: PathBuf,
        /// Export format: json, png, or both
        #[arg(long, default_value = "both")]
        format: String,
        /// Golden reference directory for comparison
        #[arg(long)]
        golden: Option<PathBuf>,
        /// Filter layers by name pattern
        #[arg(long)]
        layer: Option<String>,
    },
    /// Compare APR model against HuggingFace source
    #[command(name = "compare-hf")]
    CompareHf {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// HuggingFace repo ID (e.g., openai/whisper-tiny)
        #[arg(long)]
        hf: String,
        /// Filter tensors by name pattern
        #[arg(long)]
        tensor: Option<String>,
        /// Comparison threshold (default: 1e-5)
        #[arg(long, default_value = "1e-5")]
        threshold: f64,
        /// Output as JSON
        #[arg(long)]
        json: bool,
    },
    /// Format-aware binary forensics (10X better than xxd)
    Hex {
        /// Path to model file (APR, GGUF, or SafeTensors)
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter tensors by name pattern
        #[arg(long)]
        tensor: Option<String>,
        /// Limit bytes/values to display
        #[arg(long, default_value = "64")]
        limit: usize,
        // View-mode toggles: each flag selects an additional analysis overlay;
        // they are independent booleans, not mutually exclusive.
        /// Show tensor statistics
        #[arg(long)]
        stats: bool,
        /// List tensor names only
        #[arg(long)]
        list: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Annotated file header (magic, version, tensor count, metadata)
        #[arg(long)]
        header: bool,
        /// Q4K/Q6K/Q8_0 super-block structure with field annotations
        #[arg(long)]
        blocks: bool,
        /// Value histogram + entropy + kurtosis analysis
        #[arg(long)]
        distribution: bool,
        /// Layout contract verification overlay per tensor
        #[arg(long)]
        contract: bool,
        /// Per-region byte entropy analysis
        #[arg(long)]
        entropy: bool,
        /// Raw bytes (like xxd but format-aware, with ASCII column)
        #[arg(long)]
        raw: bool,
        // String rather than integer so the handler can accept a 0x hex prefix.
        /// Start at byte offset (supports 0x prefix for hex)
        #[arg(long, default_value = "0")]
        offset: String,
        /// Bytes per row for raw output (default: 16)
        #[arg(long, default_value = "16")]
        width: usize,
        /// Slice range for partial tensor reads (e.g., 0:3 for first 3 elements)
        #[arg(long)]
        slice: Option<String>,
    },
    /// Model architecture tree view
    Tree {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter by component pattern
        #[arg(long)]
        filter: Option<String>,
        /// Output format: ascii, dot, mermaid, json
        #[arg(long, default_value = "ascii")]
        format: String,
        /// Show tensor sizes
        #[arg(long)]
        sizes: bool,
        /// Maximum tree depth
        #[arg(long)]
        depth: Option<usize>,
    },
    /// Data flow visualization
    Flow {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter by layer pattern
        #[arg(long)]
        layer: Option<String>,
        /// Component to visualize: full, encoder, decoder, etc.
        #[arg(long, default_value = "full")]
        component: String,
        /// Verbose output with statistics
        #[arg(short, long)]
        verbose: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
    },
    /// Cross-subcommand smoke test (does every tool handle this model?)
    Qualify {
        /// Path to model file (APR, GGUF, or SafeTensors)
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Testing tier: smoke (Phase 1), standard (+contracts), full (+playbook)
        #[arg(long, default_value = "smoke")]
        tier: String,
        /// Timeout per gate in seconds
        #[arg(long, default_value = "120")]
        timeout: u64,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Show subcommand output (disable stdout suppression)
        #[arg(short, long)]
        verbose: bool,
        /// Skip specific gates (comma-separated)
        #[arg(long, value_delimiter = ',')]
        skip: Option<Vec<String>>,
    },
    // Nested flatten: ToolCommands' variants are merged into this enum's
    // namespace, so its subcommands also appear top-level to the user.
    /// Publishing, conversion, and analysis tools
    #[command(flatten)]
    Tools(ToolCommands),
}