apr_cli/extended_commands.rs
1
// NOTE(review): every `///` doc comment in this enum doubles as clap help text
// (shown by `apr <cmd> --help`), and every field name becomes a `--flag` name.
// Editing either changes the user-visible CLI. Plain `//` comments like this
// one are ignored by clap and safe to add or change.
/// Extended CLI commands (analysis, profiling, QA, benchmarks, and advanced tools).
///
/// Flattened into `Commands` via `#[command(flatten)]` so all subcommands remain
/// top-level from the user's perspective (e.g., `apr chat`, `apr profile`).
#[derive(Subcommand, Debug)]
pub enum ExtendedCommands {
    /// Interactive chat with language model
    Chat {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Sampling temperature (0 = greedy, higher = more random)
        #[arg(long, default_value = "0.7")]
        temperature: f32,
        /// Nucleus sampling threshold
        #[arg(long, default_value = "0.9")]
        top_p: f32,
        /// Maximum tokens to generate per response
        #[arg(long, default_value = "512")]
        max_tokens: usize,
        /// System prompt to set model behavior
        #[arg(long)]
        system: Option<String>,
        /// Show inspection info (top-k probs, tokens/sec)
        #[arg(long)]
        inspect: bool,
        // NOTE(review): --no-gpu and --gpu have no `conflicts_with` declared, so
        // clap accepts both at once — presumably the handler resolves the
        // precedence. Confirm against the dispatch code.
        /// Disable GPU acceleration (use CPU)
        #[arg(long)]
        no_gpu: bool,
        /// Force GPU acceleration (requires CUDA)
        #[arg(long)]
        gpu: bool,
        // Tracing flags below (APR-TRACE-001): opt-in instrumentation of a run.
        /// Enable inference tracing (APR-TRACE-001)
        #[arg(long)]
        trace: bool,
        /// Trace specific steps only (comma-separated)
        #[arg(long, value_delimiter = ',')]
        trace_steps: Option<Vec<String>>,
        /// Verbose tracing
        #[arg(long)]
        trace_verbose: bool,
        /// Save trace output to JSON file
        #[arg(long, value_name = "FILE")]
        trace_output: Option<PathBuf>,
        // NOTE(review): trace_level is a free-form String; the listed values
        // (none/basic/layer/payload) are not enforced by clap here — validation
        // presumably happens in the handler.
        /// Trace detail level (none, basic, layer, payload)
        #[arg(long, value_name = "LEVEL", default_value = "basic")]
        trace_level: String,
        /// Enable inline Roofline profiling (PMAT-SHOWCASE-METHODOLOGY-001)
        #[arg(long)]
        profile: bool,
    },
    /// Benchmark throughput (spec H12: >= 10 tok/s)
    Bench {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Number of measurement iterations
        #[arg(long, default_value = "5")]
        iterations: usize,
        /// Max tokens to generate per iteration
        #[arg(long, default_value = "32")]
        max_tokens: usize,
        /// Test prompt
        #[arg(long)]
        prompt: Option<String>,
        /// Use realizar for fast inference (vs aprender baseline)
        #[arg(long)]
        fast: bool,
        /// Benchmark specific brick
        #[arg(long)]
        brick: Option<String>,
    },
    /// Evaluate model perplexity (spec H13: PPL <= 20)
    Eval {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Dataset: wikitext-2, lambada, or custom
        #[arg(long, default_value = "wikitext-2")]
        dataset: String,
        /// Custom text (when dataset=custom)
        #[arg(long)]
        text: Option<String>,
        /// Maximum tokens to evaluate
        #[arg(long, default_value = "512")]
        max_tokens: usize,
        // NOTE(review): this threshold is f32 while Profile's threshold is f64;
        // intentional or historical — changing either type would break the
        // handler match arms, so left as-is.
        /// Perplexity threshold for pass/fail
        #[arg(long, default_value = "20.0")]
        threshold: f32,
    },
    /// Deep profiling with Roofline analysis
    Profile {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Layer-by-layer granular analysis
        #[arg(long)]
        granular: bool,
        /// Output format (human, json, flamegraph)
        #[arg(long, default_value = "human")]
        format: String,
        /// Focus on specific operation
        #[arg(long)]
        focus: Option<String>,
        /// Detect naive implementations
        #[arg(long)]
        detect_naive: bool,
        /// GFLOPS threshold for naive detection
        #[arg(long, default_value = "10.0")]
        threshold: f64,
        /// Compare against HuggingFace baseline
        #[arg(long)]
        compare_hf: Option<String>,
        /// Measure energy consumption (requires RAPL)
        #[arg(long)]
        energy: bool,
        /// Compute performance grade (vs Ollama baseline)
        #[arg(long)]
        perf_grade: bool,
        /// Show call graph
        #[arg(long)]
        callgraph: bool,
        /// Exit non-zero if naive implementation detected
        #[arg(long)]
        fail_on_naive: bool,
        /// Output file path for flamegraph SVG (GH-174, PMAT-182)
        #[arg(long, short = 'o')]
        output: Option<PathBuf>,

        // PMAT-192: CI Assertion Mode (GH-180)
        /// Enable CI mode with assertion checks (exits 1 on failure)
        #[arg(long)]
        ci: bool,
        /// Minimum throughput in tok/s (CI assertion, exits 1 if below)
        #[arg(long)]
        assert_throughput: Option<f64>,
        /// Maximum p99 latency in ms (CI assertion, exits 1 if above)
        #[arg(long)]
        assert_p99: Option<f64>,
        /// Maximum p50 latency in ms (CI assertion, exits 1 if above)
        #[arg(long)]
        assert_p50: Option<f64>,
        /// Warmup passes before measurement (default: 3)
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Measurement passes (default: 10)
        #[arg(long, default_value = "10")]
        measure: usize,
        /// Number of tokens to generate per measurement pass (default: 32)
        #[arg(long, default_value = "32")]
        tokens: usize,
        /// Compare against Ollama baseline (runs ollama for comparison)
        #[arg(long)]
        ollama: bool,
        /// Disable GPU (force CPU-only profiling)
        #[arg(long)]
        no_gpu: bool,
        /// Compare against another model format (F-PROFILE-011)
        #[arg(long, value_name = "FILE")]
        compare: Option<PathBuf>,
    },
    /// Falsifiable QA checklist for model releases
    Qa {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        // Assertion thresholds: each gate only fails when its flag is provided.
        /// Minimum throughput threshold in tok/s
        #[arg(long, value_name = "TPS")]
        assert_tps: Option<f64>,
        /// Minimum speedup vs Ollama
        #[arg(long, value_name = "SPEEDUP")]
        assert_speedup: Option<f64>,
        /// Minimum GPU vs CPU speedup (F-PERF-042)
        #[arg(long, value_name = "SPEEDUP")]
        assert_gpu_speedup: Option<f64>,
        // Gate skips: each `skip_*` flag disables one QA gate.
        /// Skip golden output test
        #[arg(long)]
        skip_golden: bool,
        /// Skip throughput benchmark
        #[arg(long)]
        skip_throughput: bool,
        /// Skip Ollama parity comparison
        #[arg(long)]
        skip_ollama: bool,
        /// Skip GPU vs CPU speedup test (F-PERF-042)
        #[arg(long)]
        skip_gpu_speedup: bool,
        /// Skip tensor contract validation (PMAT-235)
        #[arg(long)]
        skip_contract: bool,
        /// Skip cross-format parity test (F-QUAL-032)
        #[arg(long)]
        skip_format_parity: bool,
        /// Skip PTX parity validation (GH-219)
        #[arg(long)]
        skip_ptx_parity: bool,
        /// SafeTensors model path for cross-format parity test (F-QUAL-032)
        #[arg(long, value_name = "PATH")]
        safetensors_path: Option<PathBuf>,
        /// Number of benchmark iterations
        #[arg(long, default_value = "10")]
        iterations: usize,
        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Maximum tokens to generate
        #[arg(long, default_value = "32")]
        max_tokens: usize,
        /// Output as JSON (for CI integration)
        #[arg(long)]
        json: bool,
        /// Verbose output
        #[arg(short, long)]
        verbose: bool,
        // NOTE(review): min_executed guards against a QA run "passing" because
        // too many gates were skipped (see the skip_* flags above).
        /// Minimum number of gates that must execute (fail if fewer)
        #[arg(long, value_name = "N")]
        min_executed: Option<usize>,
        /// Previous QA report for regression detection
        #[arg(long, value_name = "FILE")]
        previous_report: Option<PathBuf>,
        /// Maximum allowed performance regression ratio (default: 0.10 = 10%)
        #[arg(long, value_name = "RATIO")]
        regression_threshold: Option<f64>,
        /// Skip GPU state isolation test
        #[arg(long)]
        skip_gpu_state: bool,
        /// Skip metadata plausibility validation (Bug 210, GH-222)
        #[arg(long)]
        skip_metadata: bool,
        /// Skip GPU capability match gate (GH-280)
        #[arg(long)]
        skip_capability: bool,
    },
    /// GPU/CPU parity check (PMAT-232: genchi genbutsu — see where GPU diverges)
    Parity {
        /// Path to GGUF model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Prompt text (default: "What is 2+2?")
        #[arg(short, long, default_value = "What is 2+2?")]
        prompt: String,
        // NOTE: `assert` is a valid Rust identifier (only `assert!` the macro
        // is special), so this field compiles fine and yields `--assert`.
        /// Assert parity (exit non-zero on divergence)
        #[arg(long)]
        assert: bool,
    },
    /// Model-to-PTX source mapping (Mieruka: make GPU kernel dispatch visible)
    #[command(name = "ptx-map")]
    PtxMap {
        /// Path to GGUF model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter to specific kernel (e.g., --kernel Q4KGemv)
        #[arg(long)]
        kernel: Option<String>,
        /// Reverse lookup: kernel name -> which layers/steps use it
        #[arg(long)]
        reverse: Option<String>,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Full PTX snippets and detailed analysis
        #[arg(short, long)]
        verbose: bool,
        /// Show batched prefill kernel variants instead of decode
        #[arg(long)]
        prefill: bool,
    },
    /// PTX analysis and bug detection (trueno-explain: register pressure, roofline, 15+ bug detectors)
    #[command(name = "ptx")]
    Ptx {
        // Unlike most commands, FILE is optional here: either a PTX file path
        // or --kernel (a named trueno-gpu kernel) selects the analysis target.
        /// Path to a PTX source file
        #[arg(value_name = "FILE")]
        file: Option<PathBuf>,
        /// Analyze a named kernel from trueno-gpu
        #[arg(long, short)]
        kernel: Option<String>,
        /// Strict mode (no performance whitelist)
        #[arg(long)]
        strict: bool,
        /// Show only bug analysis (skip register/memory/roofline)
        #[arg(long)]
        bugs: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Verbose output (include PTX source listing)
        #[arg(short, long)]
        verbose: bool,
    },
    /// ML tuning: LoRA/QLoRA configuration and memory planning (GH-176)
    Tune {
        /// Path to model file (optional if using --model)
        #[arg(value_name = "FILE")]
        file: Option<PathBuf>,
        /// Tuning method: auto, full, lora, qlora
        #[arg(long, short = 'm', default_value = "auto")]
        method: String,
        /// LoRA rank (default: auto-selected)
        #[arg(long, short = 'r')]
        rank: Option<u32>,
        /// Available VRAM in GB
        #[arg(long, default_value = "16.0")]
        vram: f64,
        /// Only plan configuration, don't train
        #[arg(long)]
        plan: bool,
        /// Model size for planning (e.g., "7B", "1.5B")
        #[arg(long, value_name = "SIZE")]
        model: Option<String>,
        /// Freeze base model weights
        #[arg(long)]
        freeze_base: bool,
        /// Training data file (JSONL format)
        #[arg(long, value_name = "FILE")]
        train_data: Option<PathBuf>,
        /// Output as JSON (for CI integration)
        #[arg(long)]
        json: bool,
    },
    /// ComputeBrick pipeline monitor (cbtop)
    Cbtop {
        /// Model name (e.g., qwen2.5-coder-1.5b)
        #[arg(long)]
        model: Option<String>,
        /// Attach to running realizar process
        #[arg(long)]
        attach: Option<String>,
        /// Path to GGUF model file for real profiling
        #[arg(long, value_name = "MODEL")]
        model_path: Option<PathBuf>,
        // Headless/CI group: --json and --output document a dependency on
        // --headless, and --throughput/--brick-score on --ci, but no `requires`
        // is declared — presumably enforced in the handler. Confirm.
        /// Run in headless mode (no TUI, for CI/automation)
        #[arg(long)]
        headless: bool,
        /// Output JSON format (requires --headless)
        #[arg(long)]
        json: bool,
        /// Output file path (requires --headless)
        #[arg(long, value_name = "FILE")]
        output: Option<PathBuf>,
        /// CI mode: exit with code 1 if thresholds not met
        #[arg(long)]
        ci: bool,
        /// Minimum throughput threshold in tok/s (for --ci)
        #[arg(long, value_name = "TOK_S")]
        throughput: Option<f64>,
        /// Minimum brick score threshold 0-100 (for --ci)
        #[arg(long, value_name = "SCORE")]
        brick_score: Option<u32>,
        /// Number of warmup iterations before measurement
        #[arg(long, default_value = "10")]
        warmup: usize,
        /// Number of measurement iterations
        #[arg(long, default_value = "100")]
        iterations: usize,
        /// PAR-100: Enable speculative decoding benchmark
        #[arg(long)]
        speculative: bool,
        /// PAR-100: Number of tokens to draft speculatively (default: 4)
        #[arg(long, default_value = "4")]
        speculation_k: usize,
        /// PAR-099: Path to draft model for speculative decoding
        #[arg(long, value_name = "DRAFT_MODEL")]
        draft_model: Option<PathBuf>,
        /// PAR-102: Number of concurrent requests
        #[arg(long, default_value = "1")]
        concurrent: usize,
        /// Use simulated data (for CI testing only)
        #[arg(long)]
        simulated: bool,
    },
    /// Export for probar visual testing
    Probar {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Output directory for test artifacts
        #[arg(short, long, default_value = "./probar-export")]
        output: PathBuf,
        /// Export format: json, png, or both
        #[arg(long, default_value = "both")]
        format: String,
        /// Golden reference directory for comparison
        #[arg(long)]
        golden: Option<PathBuf>,
        /// Filter layers by name pattern
        #[arg(long)]
        layer: Option<String>,
    },
    /// Compare APR model against HuggingFace source
    #[command(name = "compare-hf")]
    CompareHf {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        // NOTE: no default and not Option — `--hf <REPO>` is a required flag.
        /// HuggingFace repo ID (e.g., openai/whisper-tiny)
        #[arg(long)]
        hf: String,
        /// Filter tensors by name pattern
        #[arg(long)]
        tensor: Option<String>,
        /// Comparison threshold (default: 1e-5)
        #[arg(long, default_value = "1e-5")]
        threshold: f64,
        /// Output as JSON
        #[arg(long)]
        json: bool,
    },
    /// Format-aware binary forensics (10X better than xxd)
    Hex {
        /// Path to model file (APR, GGUF, or SafeTensors)
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter tensors by name pattern
        #[arg(long)]
        tensor: Option<String>,
        /// Limit bytes/values to display
        #[arg(long, default_value = "64")]
        limit: usize,
        /// Show tensor statistics
        #[arg(long)]
        stats: bool,
        /// List tensor names only
        #[arg(long)]
        list: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Annotated file header (magic, version, tensor count, metadata)
        #[arg(long)]
        header: bool,
        /// Q4K/Q6K/Q8_0 super-block structure with field annotations
        #[arg(long)]
        blocks: bool,
        /// Value histogram + entropy + kurtosis analysis
        #[arg(long)]
        distribution: bool,
        /// Layout contract verification overlay per tensor
        #[arg(long)]
        contract: bool,
        /// Per-region byte entropy analysis
        #[arg(long)]
        entropy: bool,
        /// Raw bytes (like xxd but format-aware, with ASCII column)
        #[arg(long)]
        raw: bool,
        // Kept as String (not usize) so the handler can accept both decimal
        // and 0x-prefixed hex, per the help text.
        /// Start at byte offset (supports 0x prefix for hex)
        #[arg(long, default_value = "0")]
        offset: String,
        /// Bytes per row for raw output (default: 16)
        #[arg(long, default_value = "16")]
        width: usize,
        /// Slice range for partial tensor reads (e.g., 0:3 for first 3 elements)
        #[arg(long)]
        slice: Option<String>,
    },
    /// Model architecture tree view
    Tree {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter by component pattern
        #[arg(long)]
        filter: Option<String>,
        /// Output format: ascii, dot, mermaid, json
        #[arg(long, default_value = "ascii")]
        format: String,
        /// Show tensor sizes
        #[arg(long)]
        sizes: bool,
        /// Maximum tree depth
        #[arg(long)]
        depth: Option<usize>,
    },
    /// Data flow visualization
    Flow {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter by layer pattern
        #[arg(long)]
        layer: Option<String>,
        /// Component to visualize: full, encoder, decoder, etc.
        #[arg(long, default_value = "full")]
        component: String,
        /// Verbose output with statistics
        #[arg(short, long)]
        verbose: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
    },
    // Nested subcommand set merged in at the same level as the variants above.
    /// Publishing, conversion, and analysis tools
    #[command(flatten)]
    Tools(ToolCommands),
}
499}