Skip to main content

apr_cli/
extended_commands.rs

1
/// Extended CLI commands (analysis, profiling, QA, benchmarks, and advanced tools).
///
/// Flattened into `Commands` via `#[command(flatten)]` so all subcommands remain
/// top-level from the user's perspective (e.g., `apr chat`, `apr profile`).
//
// NOTE(review): every `///` doc comment below is rendered verbatim by clap as
// user-facing `--help` text, so it is runtime behavior — do not reword casually.
// Non-doc `//` comments like this one are invisible to clap and safe to edit.
#[derive(Subcommand, Debug)]
pub enum ExtendedCommands {
    /// Interactive chat with language model
    Chat {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Sampling temperature (0 = greedy, higher = more random)
        #[arg(long, default_value = "0.7")]
        temperature: f32,
        /// Nucleus sampling threshold
        #[arg(long, default_value = "0.9")]
        top_p: f32,
        /// Maximum tokens to generate per response
        #[arg(long, default_value = "512")]
        max_tokens: usize,
        /// System prompt to set model behavior
        #[arg(long)]
        system: Option<String>,
        /// Show inspection info (top-k probs, tokens/sec)
        #[arg(long)]
        inspect: bool,
        // NOTE(review): --no-gpu and --gpu are not declared mutually exclusive
        // (no `conflicts_with`); presumably the handler resolves precedence when
        // both are passed — confirm, or add `conflicts_with = "gpu"` here.
        /// Disable GPU acceleration (use CPU)
        #[arg(long)]
        no_gpu: bool,
        /// Force GPU acceleration (requires CUDA)
        #[arg(long)]
        gpu: bool,
        /// Enable inference tracing (APR-TRACE-001)
        #[arg(long)]
        trace: bool,
        /// Trace specific steps only (comma-separated)
        #[arg(long, value_delimiter = ',')]
        trace_steps: Option<Vec<String>>,
        /// Verbose tracing
        #[arg(long)]
        trace_verbose: bool,
        /// Save trace output to JSON file
        #[arg(long, value_name = "FILE")]
        trace_output: Option<PathBuf>,
        // Stringly-typed level; a clap `ValueEnum` would reject bad values at
        // parse time, but converting would change the handler-side contract.
        /// Trace detail level (none, basic, layer, payload)
        #[arg(long, value_name = "LEVEL", default_value = "basic")]
        trace_level: String,
        /// Enable inline Roofline profiling (PMAT-SHOWCASE-METHODOLOGY-001)
        #[arg(long)]
        profile: bool,
    },
    /// Benchmark throughput (spec H12: >= 10 tok/s)
    Bench {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Number of measurement iterations
        #[arg(long, default_value = "5")]
        iterations: usize,
        /// Max tokens to generate per iteration
        #[arg(long, default_value = "32")]
        max_tokens: usize,
        /// Test prompt
        #[arg(long)]
        prompt: Option<String>,
        /// Use realizar for fast inference (vs aprender baseline)
        #[arg(long)]
        fast: bool,
        /// Benchmark specific brick
        #[arg(long)]
        brick: Option<String>,
    },
    /// Evaluate model perplexity (spec H13: PPL <= 20)
    Eval {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Dataset: wikitext-2, lambada, or custom
        #[arg(long, default_value = "wikitext-2")]
        dataset: String,
        /// Custom text (when dataset=custom)
        #[arg(long)]
        text: Option<String>,
        /// Maximum tokens to evaluate
        #[arg(long, default_value = "512")]
        max_tokens: usize,
        /// Perplexity threshold for pass/fail
        #[arg(long, default_value = "20.0")]
        threshold: f32,
    },
    /// Deep profiling with Roofline analysis
    Profile {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Layer-by-layer granular analysis
        #[arg(long)]
        granular: bool,
        /// Output format (human, json, flamegraph)
        #[arg(long, default_value = "human")]
        format: String,
        /// Focus on specific operation
        #[arg(long)]
        focus: Option<String>,
        /// Detect naive implementations
        #[arg(long)]
        detect_naive: bool,
        /// GFLOPS threshold for naive detection
        #[arg(long, default_value = "10.0")]
        threshold: f64,
        /// Compare against HuggingFace baseline
        #[arg(long)]
        compare_hf: Option<String>,
        /// Measure energy consumption (requires RAPL)
        #[arg(long)]
        energy: bool,
        /// Compute performance grade (vs Ollama baseline)
        #[arg(long)]
        perf_grade: bool,
        /// Show call graph
        #[arg(long)]
        callgraph: bool,
        /// Exit non-zero if naive implementation detected
        #[arg(long)]
        fail_on_naive: bool,
        /// Output file path for flamegraph SVG (GH-174, PMAT-182)
        #[arg(long, short = 'o')]
        output: Option<PathBuf>,

        // PMAT-192: CI Assertion Mode (GH-180)
        // NOTE(review): the assert_* thresholds presumably only take effect
        // with --ci (no `requires = "ci"` declared) — confirm in the handler.
        /// Enable CI mode with assertion checks (exits 1 on failure)
        #[arg(long)]
        ci: bool,
        /// Minimum throughput in tok/s (CI assertion, exits 1 if below)
        #[arg(long)]
        assert_throughput: Option<f64>,
        /// Maximum p99 latency in ms (CI assertion, exits 1 if above)
        #[arg(long)]
        assert_p99: Option<f64>,
        /// Maximum p50 latency in ms (CI assertion, exits 1 if above)
        #[arg(long)]
        assert_p50: Option<f64>,
        /// Warmup passes before measurement (default: 3)
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Measurement passes (default: 10)
        #[arg(long, default_value = "10")]
        measure: usize,
        /// Number of tokens to generate per measurement pass (default: 32)
        #[arg(long, default_value = "32")]
        tokens: usize,
        /// Compare against Ollama baseline (runs ollama for comparison)
        #[arg(long)]
        ollama: bool,
        /// Disable GPU (force CPU-only profiling)
        #[arg(long)]
        no_gpu: bool,
        /// Compare against another model format (F-PROFILE-011)
        #[arg(long, value_name = "FILE")]
        compare: Option<PathBuf>,
    },
    /// Falsifiable QA checklist for model releases
    Qa {
        /// Path to model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Minimum throughput threshold in tok/s
        #[arg(long, value_name = "TPS")]
        assert_tps: Option<f64>,
        /// Minimum speedup vs Ollama
        #[arg(long, value_name = "SPEEDUP")]
        assert_speedup: Option<f64>,
        /// Minimum GPU vs CPU speedup (F-PERF-042)
        #[arg(long, value_name = "SPEEDUP")]
        assert_gpu_speedup: Option<f64>,
        /// Skip golden output test
        #[arg(long)]
        skip_golden: bool,
        /// Skip throughput benchmark
        #[arg(long)]
        skip_throughput: bool,
        /// Skip Ollama parity comparison
        #[arg(long)]
        skip_ollama: bool,
        /// Skip GPU vs CPU speedup test (F-PERF-042)
        #[arg(long)]
        skip_gpu_speedup: bool,
        /// Skip tensor contract validation (PMAT-235)
        #[arg(long)]
        skip_contract: bool,
        /// Skip cross-format parity test (F-QUAL-032)
        #[arg(long)]
        skip_format_parity: bool,
        /// Skip PTX parity validation (GH-219)
        #[arg(long)]
        skip_ptx_parity: bool,
        /// SafeTensors model path for cross-format parity test (F-QUAL-032)
        #[arg(long, value_name = "PATH")]
        safetensors_path: Option<PathBuf>,
        /// Number of benchmark iterations
        #[arg(long, default_value = "10")]
        iterations: usize,
        /// Number of warmup iterations
        #[arg(long, default_value = "3")]
        warmup: usize,
        /// Maximum tokens to generate
        #[arg(long, default_value = "32")]
        max_tokens: usize,
        /// Output as JSON (for CI integration)
        #[arg(long)]
        json: bool,
        /// Verbose output
        #[arg(short, long)]
        verbose: bool,
        /// Minimum number of gates that must execute (fail if fewer)
        #[arg(long, value_name = "N")]
        min_executed: Option<usize>,
        /// Previous QA report for regression detection
        #[arg(long, value_name = "FILE")]
        previous_report: Option<PathBuf>,
        /// Maximum allowed performance regression ratio (default: 0.10 = 10%)
        #[arg(long, value_name = "RATIO")]
        regression_threshold: Option<f64>,
        /// Skip GPU state isolation test
        #[arg(long)]
        skip_gpu_state: bool,
        /// Skip metadata plausibility validation (Bug 210, GH-222)
        #[arg(long)]
        skip_metadata: bool,
        /// Skip GPU capability match gate (GH-280)
        #[arg(long)]
        skip_capability: bool,
        /// Assert classifier head presence and shape (F-CLASS-004)
        #[arg(long)]
        assert_classifier_head: bool,
    },
    /// GPU/CPU parity check (PMAT-232: genchi genbutsu — see where GPU diverges)
    Parity {
        /// Path to GGUF model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Prompt text (default: "What is 2+2?")
        #[arg(short, long, default_value = "What is 2+2?")]
        prompt: String,
        // `assert` is not a reserved keyword in Rust, so no r#-escape is needed.
        /// Assert parity (exit non-zero on divergence)
        #[arg(long)]
        assert: bool,
    },
    /// Model-to-PTX source mapping (Mieruka: make GPU kernel dispatch visible)
    #[command(name = "ptx-map")]
    PtxMap {
        /// Path to GGUF model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter to specific kernel (e.g., --kernel Q4KGemv)
        #[arg(long)]
        kernel: Option<String>,
        /// Reverse lookup: kernel name -> which layers/steps use it
        #[arg(long)]
        reverse: Option<String>,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Full PTX snippets and detailed analysis
        #[arg(short, long)]
        verbose: bool,
        /// Show batched prefill kernel variants instead of decode
        #[arg(long)]
        prefill: bool,
    },
    /// PTX analysis and bug detection (trueno-explain: register pressure, roofline, 15+ bug detectors)
    #[command(name = "ptx")]
    Ptx {
        // NOTE(review): `file` and --kernel are both optional; presumably the
        // handler errors when neither is given — a clap `group(required = true)`
        // could enforce "exactly one source" at parse time. Confirm intent.
        /// Path to a PTX source file
        #[arg(value_name = "FILE")]
        file: Option<PathBuf>,
        /// Analyze a named kernel from trueno-gpu
        #[arg(long, short)]
        kernel: Option<String>,
        /// Strict mode (no performance whitelist)
        #[arg(long)]
        strict: bool,
        /// Show only bug analysis (skip register/memory/roofline)
        #[arg(long)]
        bugs: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Verbose output (include PTX source listing)
        #[arg(short, long)]
        verbose: bool,
    },
    /// ML tuning: LoRA/QLoRA configuration and memory planning (GH-176)
    Tune {
        /// Path to model file (optional if using --model)
        #[arg(value_name = "FILE")]
        file: Option<PathBuf>,
        /// Tuning method: auto, full, lora, qlora
        #[arg(long, short = 'm', default_value = "auto")]
        method: String,
        /// LoRA rank (default: auto-selected)
        #[arg(long, short = 'r')]
        rank: Option<u32>,
        /// Available VRAM in GB
        #[arg(long, default_value = "16.0")]
        vram: f64,
        /// Only plan configuration, don't train
        #[arg(long)]
        plan: bool,
        /// Model size for planning (e.g., "7B", "1.5B")
        #[arg(long, value_name = "SIZE")]
        model: Option<String>,
        /// Freeze base model weights
        #[arg(long)]
        freeze_base: bool,
        /// Training data file (JSONL format)
        #[arg(long, value_name = "FILE")]
        train_data: Option<PathBuf>,
        /// Output as JSON (for CI integration)
        #[arg(long)]
        json: bool,
    },
    /// Attach live TUI to a running training session
    Monitor {
        /// Experiment output directory (same as finetune -o)
        #[arg(value_name = "DIR")]
        dir: PathBuf,
        /// Refresh interval in milliseconds
        #[arg(long, default_value = "500")]
        refresh_ms: u64,
        /// Compact display mode
        #[arg(long)]
        compact: bool,
    },
    /// ComputeBrick pipeline monitor (cbtop)
    Cbtop {
        /// Model name (e.g., qwen2.5-coder-1.5b)
        #[arg(long)]
        model: Option<String>,
        /// Attach to running realizar process
        #[arg(long)]
        attach: Option<String>,
        /// Path to GGUF model file for real profiling
        #[arg(long, value_name = "MODEL")]
        model_path: Option<PathBuf>,
        /// Run in headless mode (no TUI, for CI/automation)
        #[arg(long)]
        headless: bool,
        // NOTE(review): --json and --output document "requires --headless" but
        // declare no `requires = "headless"` — presumably enforced in the
        // handler; confirm, or encode it here so clap rejects the combination.
        /// Output JSON format (requires --headless)
        #[arg(long)]
        json: bool,
        /// Output file path (requires --headless)
        #[arg(long, value_name = "FILE")]
        output: Option<PathBuf>,
        /// CI mode: exit with code 1 if thresholds not met
        #[arg(long)]
        ci: bool,
        /// Minimum throughput threshold in tok/s (for --ci)
        #[arg(long, value_name = "TOK_S")]
        throughput: Option<f64>,
        /// Minimum brick score threshold 0-100 (for --ci)
        #[arg(long, value_name = "SCORE")]
        brick_score: Option<u32>,
        /// Number of warmup iterations before measurement
        #[arg(long, default_value = "10")]
        warmup: usize,
        /// Number of measurement iterations
        #[arg(long, default_value = "100")]
        iterations: usize,
        /// PAR-100: Enable speculative decoding benchmark
        #[arg(long)]
        speculative: bool,
        /// PAR-100: Number of tokens to draft speculatively (default: 4)
        #[arg(long, default_value = "4")]
        speculation_k: usize,
        /// PAR-099: Path to draft model for speculative decoding
        #[arg(long, value_name = "DRAFT_MODEL")]
        draft_model: Option<PathBuf>,
        /// PAR-102: Number of concurrent requests
        #[arg(long, default_value = "1")]
        concurrent: usize,
        /// Use simulated data (for CI testing only)
        #[arg(long)]
        simulated: bool,
    },
    /// Export for probar visual testing
    Probar {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Output directory for test artifacts
        #[arg(short, long, default_value = "./probar-export")]
        output: PathBuf,
        /// Export format: json, png, or both
        #[arg(long, default_value = "both")]
        format: String,
        /// Golden reference directory for comparison
        #[arg(long)]
        golden: Option<PathBuf>,
        /// Filter layers by name pattern
        #[arg(long)]
        layer: Option<String>,
    },
    /// Compare APR model against HuggingFace source
    #[command(name = "compare-hf")]
    CompareHf {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// HuggingFace repo ID (e.g., openai/whisper-tiny)
        #[arg(long)]
        hf: String,
        /// Filter tensors by name pattern
        #[arg(long)]
        tensor: Option<String>,
        /// Comparison threshold (default: 1e-5)
        #[arg(long, default_value = "1e-5")]
        threshold: f64,
        /// Output as JSON
        #[arg(long)]
        json: bool,
    },
    /// Format-aware binary forensics (10X better than xxd)
    Hex {
        /// Path to model file (APR, GGUF, or SafeTensors)
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter tensors by name pattern
        #[arg(long)]
        tensor: Option<String>,
        /// Limit bytes/values to display
        #[arg(long, default_value = "64")]
        limit: usize,
        /// Show tensor statistics
        #[arg(long)]
        stats: bool,
        /// List tensor names only
        #[arg(long)]
        list: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Annotated file header (magic, version, tensor count, metadata)
        #[arg(long)]
        header: bool,
        /// Q4K/Q6K/Q8_0 super-block structure with field annotations
        #[arg(long)]
        blocks: bool,
        /// Value histogram + entropy + kurtosis analysis
        #[arg(long)]
        distribution: bool,
        /// Layout contract verification overlay per tensor
        #[arg(long)]
        contract: bool,
        /// Per-region byte entropy analysis
        #[arg(long)]
        entropy: bool,
        /// Raw bytes (like xxd but format-aware, with ASCII column)
        #[arg(long)]
        raw: bool,
        // Kept as String (not usize) so the handler can accept "0x..." hex
        // offsets, which clap's integer parsing would reject.
        /// Start at byte offset (supports 0x prefix for hex)
        #[arg(long, default_value = "0")]
        offset: String,
        /// Bytes per row for raw output (default: 16)
        #[arg(long, default_value = "16")]
        width: usize,
        /// Slice range for partial tensor reads (e.g., 0:3 for first 3 elements)
        #[arg(long)]
        slice: Option<String>,
    },
    /// Model architecture tree view
    Tree {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter by component pattern
        #[arg(long)]
        filter: Option<String>,
        /// Output format: ascii, dot, mermaid, json
        #[arg(long, default_value = "ascii")]
        format: String,
        /// Show tensor sizes
        #[arg(long)]
        sizes: bool,
        /// Maximum tree depth
        #[arg(long)]
        depth: Option<usize>,
    },
    /// Data flow visualization
    Flow {
        /// Path to .apr model file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Filter by layer pattern
        #[arg(long)]
        layer: Option<String>,
        /// Component to visualize: full, encoder, decoder, etc.
        #[arg(long, default_value = "full")]
        component: String,
        /// Verbose output with statistics
        #[arg(short, long)]
        verbose: bool,
        /// Output as JSON
        #[arg(long)]
        json: bool,
    },
    /// Cross-subcommand smoke test (does every tool handle this model?)
    Qualify {
        /// Path to model file (APR, GGUF, or SafeTensors)
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Testing tier: smoke (Phase 1), standard (+contracts), full (+playbook)
        #[arg(long, default_value = "smoke")]
        tier: String,
        /// Timeout per gate in seconds
        #[arg(long, default_value = "120")]
        timeout: u64,
        /// Output as JSON
        #[arg(long)]
        json: bool,
        /// Show subcommand output (disable stdout suppression)
        #[arg(short, long)]
        verbose: bool,
        /// Skip specific gates (comma-separated)
        #[arg(long, value_delimiter = ',')]
        skip: Option<Vec<String>>,
    },
    /// Publishing, conversion, and analysis tools
    #[command(flatten)]
    Tools(ToolCommands),
}