// apr_cli/extended_commands.rs
2/// Extended CLI commands (analysis, profiling, QA, benchmarks, and advanced tools).
3///
4/// Flattened into `Commands` via `#[command(flatten)]` so all subcommands remain
5/// top-level from the user's perspective (e.g., `apr chat`, `apr profile`).
6#[derive(Subcommand, Debug)]
7pub enum ExtendedCommands {
8    /// Interactive chat with language model
9    Chat {
10        /// Path to .apr model file
11        #[arg(value_name = "FILE")]
12        file: PathBuf,
13        /// Sampling temperature (0 = greedy, higher = more random)
14        #[arg(long, default_value = "0.7")]
15        temperature: f32,
16        /// Nucleus sampling threshold
17        #[arg(long, default_value = "0.9")]
18        top_p: f32,
19        /// Maximum tokens to generate per response
20        #[arg(long, default_value = "512")]
21        max_tokens: usize,
22        /// System prompt to set model behavior
23        #[arg(long)]
24        system: Option<String>,
25        /// Show inspection info (top-k probs, tokens/sec)
26        #[arg(long)]
27        inspect: bool,
28        /// Disable GPU acceleration (use CPU)
29        #[arg(long)]
30        no_gpu: bool,
31        /// Force GPU acceleration (requires CUDA)
32        #[arg(long)]
33        gpu: bool,
34        /// Enable inference tracing (APR-TRACE-001)
35        #[arg(long)]
36        trace: bool,
37        /// Trace specific steps only (comma-separated)
38        #[arg(long, value_delimiter = ',')]
39        trace_steps: Option<Vec<String>>,
40        /// Verbose tracing
41        #[arg(long)]
42        trace_verbose: bool,
43        /// Save trace output to JSON file
44        #[arg(long, value_name = "FILE")]
45        trace_output: Option<PathBuf>,
46        /// Trace detail level (none, basic, layer, payload)
47        #[arg(long, value_name = "LEVEL", default_value = "basic")]
48        trace_level: String,
49        /// Enable inline Roofline profiling (PMAT-SHOWCASE-METHODOLOGY-001)
50        #[arg(long)]
51        profile: bool,
52    },
53    /// Benchmark throughput (spec H12: >= 10 tok/s)
54    Bench {
55        /// Path to model file
56        #[arg(value_name = "FILE")]
57        file: PathBuf,
58        /// Number of warmup iterations
59        #[arg(long, default_value = "3")]
60        warmup: usize,
61        /// Number of measurement iterations
62        #[arg(long, default_value = "5")]
63        iterations: usize,
64        /// Max tokens to generate per iteration
65        #[arg(long, default_value = "32")]
66        max_tokens: usize,
67        /// Test prompt
68        #[arg(long)]
69        prompt: Option<String>,
70        /// Use realizar for fast inference (vs aprender baseline)
71        #[arg(long)]
72        fast: bool,
73        /// Benchmark specific brick
74        #[arg(long)]
75        brick: Option<String>,
76    },
77    /// Evaluate model perplexity (spec H13: PPL <= 20)
78    Eval {
79        /// Path to model file
80        #[arg(value_name = "FILE")]
81        file: PathBuf,
82        /// Dataset: wikitext-2, lambada, or custom
83        #[arg(long, default_value = "wikitext-2")]
84        dataset: String,
85        /// Custom text (when dataset=custom)
86        #[arg(long)]
87        text: Option<String>,
88        /// Maximum tokens to evaluate
89        #[arg(long, default_value = "512")]
90        max_tokens: usize,
91        /// Perplexity threshold for pass/fail
92        #[arg(long, default_value = "20.0")]
93        threshold: f32,
94    },
95    /// Deep profiling with Roofline analysis
96    Profile {
97        /// Path to model file
98        #[arg(value_name = "FILE")]
99        file: PathBuf,
100        /// Layer-by-layer granular analysis
101        #[arg(long)]
102        granular: bool,
103        /// Output format (human, json, flamegraph)
104        #[arg(long, default_value = "human")]
105        format: String,
106        /// Focus on specific operation
107        #[arg(long)]
108        focus: Option<String>,
109        /// Detect naive implementations
110        #[arg(long)]
111        detect_naive: bool,
112        /// GFLOPS threshold for naive detection
113        #[arg(long, default_value = "10.0")]
114        threshold: f64,
115        /// Compare against HuggingFace baseline
116        #[arg(long)]
117        compare_hf: Option<String>,
118        /// Measure energy consumption (requires RAPL)
119        #[arg(long)]
120        energy: bool,
121        /// Compute performance grade (vs Ollama baseline)
122        #[arg(long)]
123        perf_grade: bool,
124        /// Show call graph
125        #[arg(long)]
126        callgraph: bool,
127        /// Exit non-zero if naive implementation detected
128        #[arg(long)]
129        fail_on_naive: bool,
130        /// Output file path for flamegraph SVG (GH-174, PMAT-182)
131        #[arg(long, short = 'o')]
132        output: Option<PathBuf>,
133
134        // PMAT-192: CI Assertion Mode (GH-180)
135        /// Enable CI mode with assertion checks (exits 1 on failure)
136        #[arg(long)]
137        ci: bool,
138        /// Minimum throughput in tok/s (CI assertion, exits 1 if below)
139        #[arg(long)]
140        assert_throughput: Option<f64>,
141        /// Maximum p99 latency in ms (CI assertion, exits 1 if above)
142        #[arg(long)]
143        assert_p99: Option<f64>,
144        /// Maximum p50 latency in ms (CI assertion, exits 1 if above)
145        #[arg(long)]
146        assert_p50: Option<f64>,
147        /// Warmup passes before measurement (default: 3)
148        #[arg(long, default_value = "3")]
149        warmup: usize,
150        /// Measurement passes (default: 10)
151        #[arg(long, default_value = "10")]
152        measure: usize,
153        /// Number of tokens to generate per measurement pass (default: 32)
154        #[arg(long, default_value = "32")]
155        tokens: usize,
156        /// Compare against Ollama baseline (runs ollama for comparison)
157        #[arg(long)]
158        ollama: bool,
159        /// Disable GPU (force CPU-only profiling)
160        #[arg(long)]
161        no_gpu: bool,
162        /// Compare against another model format (F-PROFILE-011)
163        #[arg(long, value_name = "FILE")]
164        compare: Option<PathBuf>,
165    },
166    /// Falsifiable QA checklist for model releases
167    Qa {
168        /// Path to model file
169        #[arg(value_name = "FILE")]
170        file: PathBuf,
171        /// Minimum throughput threshold in tok/s
172        #[arg(long, value_name = "TPS")]
173        assert_tps: Option<f64>,
174        /// Minimum speedup vs Ollama
175        #[arg(long, value_name = "SPEEDUP")]
176        assert_speedup: Option<f64>,
177        /// Minimum GPU vs CPU speedup (F-PERF-042)
178        #[arg(long, value_name = "SPEEDUP")]
179        assert_gpu_speedup: Option<f64>,
180        /// Skip golden output test
181        #[arg(long)]
182        skip_golden: bool,
183        /// Skip throughput benchmark
184        #[arg(long)]
185        skip_throughput: bool,
186        /// Skip Ollama parity comparison
187        #[arg(long)]
188        skip_ollama: bool,
189        /// Skip GPU vs CPU speedup test (F-PERF-042)
190        #[arg(long)]
191        skip_gpu_speedup: bool,
192        /// Skip tensor contract validation (PMAT-235)
193        #[arg(long)]
194        skip_contract: bool,
195        /// Skip cross-format parity test (F-QUAL-032)
196        #[arg(long)]
197        skip_format_parity: bool,
198        /// Skip PTX parity validation (GH-219)
199        #[arg(long)]
200        skip_ptx_parity: bool,
201        /// SafeTensors model path for cross-format parity test (F-QUAL-032)
202        #[arg(long, value_name = "PATH")]
203        safetensors_path: Option<PathBuf>,
204        /// Number of benchmark iterations
205        #[arg(long, default_value = "10")]
206        iterations: usize,
207        /// Number of warmup iterations
208        #[arg(long, default_value = "3")]
209        warmup: usize,
210        /// Maximum tokens to generate
211        #[arg(long, default_value = "32")]
212        max_tokens: usize,
213        /// Output as JSON (for CI integration)
214        #[arg(long)]
215        json: bool,
216        /// Verbose output
217        #[arg(short, long)]
218        verbose: bool,
219        /// Minimum number of gates that must execute (fail if fewer)
220        #[arg(long, value_name = "N")]
221        min_executed: Option<usize>,
222        /// Previous QA report for regression detection
223        #[arg(long, value_name = "FILE")]
224        previous_report: Option<PathBuf>,
225        /// Maximum allowed performance regression ratio (default: 0.10 = 10%)
226        #[arg(long, value_name = "RATIO")]
227        regression_threshold: Option<f64>,
228        /// Skip GPU state isolation test
229        #[arg(long)]
230        skip_gpu_state: bool,
231        /// Skip metadata plausibility validation (Bug 210, GH-222)
232        #[arg(long)]
233        skip_metadata: bool,
234        /// Skip GPU capability match gate (GH-280)
235        #[arg(long)]
236        skip_capability: bool,
237    },
238    /// GPU/CPU parity check (PMAT-232: genchi genbutsu — see where GPU diverges)
239    Parity {
240        /// Path to GGUF model file
241        #[arg(value_name = "FILE")]
242        file: PathBuf,
243        /// Prompt text (default: "What is 2+2?")
244        #[arg(short, long, default_value = "What is 2+2?")]
245        prompt: String,
246        /// Assert parity (exit non-zero on divergence)
247        #[arg(long)]
248        assert: bool,
249    },
250    /// Model-to-PTX source mapping (Mieruka: make GPU kernel dispatch visible)
251    #[command(name = "ptx-map")]
252    PtxMap {
253        /// Path to GGUF model file
254        #[arg(value_name = "FILE")]
255        file: PathBuf,
256        /// Filter to specific kernel (e.g., --kernel Q4KGemv)
257        #[arg(long)]
258        kernel: Option<String>,
259        /// Reverse lookup: kernel name -> which layers/steps use it
260        #[arg(long)]
261        reverse: Option<String>,
262        /// Output as JSON
263        #[arg(long)]
264        json: bool,
265        /// Full PTX snippets and detailed analysis
266        #[arg(short, long)]
267        verbose: bool,
268        /// Show batched prefill kernel variants instead of decode
269        #[arg(long)]
270        prefill: bool,
271    },
272    /// PTX analysis and bug detection (trueno-explain: register pressure, roofline, 15+ bug detectors)
273    #[command(name = "ptx")]
274    Ptx {
275        /// Path to a PTX source file
276        #[arg(value_name = "FILE")]
277        file: Option<PathBuf>,
278        /// Analyze a named kernel from trueno-gpu
279        #[arg(long, short)]
280        kernel: Option<String>,
281        /// Strict mode (no performance whitelist)
282        #[arg(long)]
283        strict: bool,
284        /// Show only bug analysis (skip register/memory/roofline)
285        #[arg(long)]
286        bugs: bool,
287        /// Output as JSON
288        #[arg(long)]
289        json: bool,
290        /// Verbose output (include PTX source listing)
291        #[arg(short, long)]
292        verbose: bool,
293    },
294    /// ML tuning: LoRA/QLoRA configuration and memory planning (GH-176)
295    Tune {
296        /// Path to model file (optional if using --model)
297        #[arg(value_name = "FILE")]
298        file: Option<PathBuf>,
299        /// Tuning method: auto, full, lora, qlora
300        #[arg(long, short = 'm', default_value = "auto")]
301        method: String,
302        /// LoRA rank (default: auto-selected)
303        #[arg(long, short = 'r')]
304        rank: Option<u32>,
305        /// Available VRAM in GB
306        #[arg(long, default_value = "16.0")]
307        vram: f64,
308        /// Only plan configuration, don't train
309        #[arg(long)]
310        plan: bool,
311        /// Model size for planning (e.g., "7B", "1.5B")
312        #[arg(long, value_name = "SIZE")]
313        model: Option<String>,
314        /// Freeze base model weights
315        #[arg(long)]
316        freeze_base: bool,
317        /// Training data file (JSONL format)
318        #[arg(long, value_name = "FILE")]
319        train_data: Option<PathBuf>,
320        /// Output as JSON (for CI integration)
321        #[arg(long)]
322        json: bool,
323    },
324    /// ComputeBrick pipeline monitor (cbtop)
325    Cbtop {
326        /// Model name (e.g., qwen2.5-coder-1.5b)
327        #[arg(long)]
328        model: Option<String>,
329        /// Attach to running realizar process
330        #[arg(long)]
331        attach: Option<String>,
332        /// Path to GGUF model file for real profiling
333        #[arg(long, value_name = "MODEL")]
334        model_path: Option<PathBuf>,
335        /// Run in headless mode (no TUI, for CI/automation)
336        #[arg(long)]
337        headless: bool,
338        /// Output JSON format (requires --headless)
339        #[arg(long)]
340        json: bool,
341        /// Output file path (requires --headless)
342        #[arg(long, value_name = "FILE")]
343        output: Option<PathBuf>,
344        /// CI mode: exit with code 1 if thresholds not met
345        #[arg(long)]
346        ci: bool,
347        /// Minimum throughput threshold in tok/s (for --ci)
348        #[arg(long, value_name = "TOK_S")]
349        throughput: Option<f64>,
350        /// Minimum brick score threshold 0-100 (for --ci)
351        #[arg(long, value_name = "SCORE")]
352        brick_score: Option<u32>,
353        /// Number of warmup iterations before measurement
354        #[arg(long, default_value = "10")]
355        warmup: usize,
356        /// Number of measurement iterations
357        #[arg(long, default_value = "100")]
358        iterations: usize,
359        /// PAR-100: Enable speculative decoding benchmark
360        #[arg(long)]
361        speculative: bool,
362        /// PAR-100: Number of tokens to draft speculatively (default: 4)
363        #[arg(long, default_value = "4")]
364        speculation_k: usize,
365        /// PAR-099: Path to draft model for speculative decoding
366        #[arg(long, value_name = "DRAFT_MODEL")]
367        draft_model: Option<PathBuf>,
368        /// PAR-102: Number of concurrent requests
369        #[arg(long, default_value = "1")]
370        concurrent: usize,
371        /// Use simulated data (for CI testing only)
372        #[arg(long)]
373        simulated: bool,
374    },
375    /// Export for probar visual testing
376    Probar {
377        /// Path to .apr model file
378        #[arg(value_name = "FILE")]
379        file: PathBuf,
380        /// Output directory for test artifacts
381        #[arg(short, long, default_value = "./probar-export")]
382        output: PathBuf,
383        /// Export format: json, png, or both
384        #[arg(long, default_value = "both")]
385        format: String,
386        /// Golden reference directory for comparison
387        #[arg(long)]
388        golden: Option<PathBuf>,
389        /// Filter layers by name pattern
390        #[arg(long)]
391        layer: Option<String>,
392    },
393    /// Compare APR model against HuggingFace source
394    #[command(name = "compare-hf")]
395    CompareHf {
396        /// Path to .apr model file
397        #[arg(value_name = "FILE")]
398        file: PathBuf,
399        /// HuggingFace repo ID (e.g., openai/whisper-tiny)
400        #[arg(long)]
401        hf: String,
402        /// Filter tensors by name pattern
403        #[arg(long)]
404        tensor: Option<String>,
405        /// Comparison threshold (default: 1e-5)
406        #[arg(long, default_value = "1e-5")]
407        threshold: f64,
408        /// Output as JSON
409        #[arg(long)]
410        json: bool,
411    },
412    /// Format-aware binary forensics (10X better than xxd)
413    Hex {
414        /// Path to model file (APR, GGUF, or SafeTensors)
415        #[arg(value_name = "FILE")]
416        file: PathBuf,
417        /// Filter tensors by name pattern
418        #[arg(long)]
419        tensor: Option<String>,
420        /// Limit bytes/values to display
421        #[arg(long, default_value = "64")]
422        limit: usize,
423        /// Show tensor statistics
424        #[arg(long)]
425        stats: bool,
426        /// List tensor names only
427        #[arg(long)]
428        list: bool,
429        /// Output as JSON
430        #[arg(long)]
431        json: bool,
432        /// Annotated file header (magic, version, tensor count, metadata)
433        #[arg(long)]
434        header: bool,
435        /// Q4K/Q6K/Q8_0 super-block structure with field annotations
436        #[arg(long)]
437        blocks: bool,
438        /// Value histogram + entropy + kurtosis analysis
439        #[arg(long)]
440        distribution: bool,
441        /// Layout contract verification overlay per tensor
442        #[arg(long)]
443        contract: bool,
444        /// Per-region byte entropy analysis
445        #[arg(long)]
446        entropy: bool,
447        /// Raw bytes (like xxd but format-aware, with ASCII column)
448        #[arg(long)]
449        raw: bool,
450        /// Start at byte offset (supports 0x prefix for hex)
451        #[arg(long, default_value = "0")]
452        offset: String,
453        /// Bytes per row for raw output (default: 16)
454        #[arg(long, default_value = "16")]
455        width: usize,
456        /// Slice range for partial tensor reads (e.g., 0:3 for first 3 elements)
457        #[arg(long)]
458        slice: Option<String>,
459    },
460    /// Model architecture tree view
461    Tree {
462        /// Path to .apr model file
463        #[arg(value_name = "FILE")]
464        file: PathBuf,
465        /// Filter by component pattern
466        #[arg(long)]
467        filter: Option<String>,
468        /// Output format: ascii, dot, mermaid, json
469        #[arg(long, default_value = "ascii")]
470        format: String,
471        /// Show tensor sizes
472        #[arg(long)]
473        sizes: bool,
474        /// Maximum tree depth
475        #[arg(long)]
476        depth: Option<usize>,
477    },
478    /// Data flow visualization
479    Flow {
480        /// Path to .apr model file
481        #[arg(value_name = "FILE")]
482        file: PathBuf,
483        /// Filter by layer pattern
484        #[arg(long)]
485        layer: Option<String>,
486        /// Component to visualize: full, encoder, decoder, etc.
487        #[arg(long, default_value = "full")]
488        component: String,
489        /// Verbose output with statistics
490        #[arg(short, long)]
491        verbose: bool,
492        /// Output as JSON
493        #[arg(long)]
494        json: bool,
495    },
496    /// Publishing, conversion, and analysis tools
497    #[command(flatten)]
498    Tools(ToolCommands),
499}