apr-cli 0.4.17

CLI tool for APR model inspection, debugging, and operations
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
//! cbtop - ComputeBrick Top (TUI for brick pipeline visualization)
//!
//! Per spec: docs/specifications/qwen2.5-coder-showcase-demo.md §6 + §12.11
//!
//! Toyota Way Principles:
//! - Mieruka: Make status visible at a glance
//! - Jidoka: Highlight budget violations immediately
//! - Genchi Genbutsu: Show real metrics, not estimates
//!
//! Usage:
//!   cbtop --model qwen2.5-coder-1.5b
//!   apr cbtop --attach realizar
//!   apr cbtop --model-path /path/to/model.gguf --headless --json  # GGUF profiling
//!   apr cbtop --model-path /path/to/model.safetensors --headless --json  # SafeTensors
//!   apr cbtop --model-path /path/to/model.apr --headless --json  # APR profiling
//!
//! Headless mode for CI (simulated data; without --simulated a --model-path is required):
//!   apr cbtop --headless --simulated --json --output results.json
//!   apr cbtop --headless --simulated --ci --throughput 400 --brick-score 90
//!
//! Real profiling mode (§12.11 Unified BrickProfiler):
//!   apr cbtop --model-path model.{gguf,safetensors,apr} --headless --json
//!   - Uses realizar for actual inference (CUDA or CPU)
//!   - Unified BrickProfiler timing for ALL formats
//!   - Reports real hardware info from CUDA context

use crate::error::{CliError, Result};
use crossterm::{
    event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode, KeyEventKind},
    execute,
    terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
};
use ratatui::{
    backend::CrosstermBackend,
    layout::{Constraint, Direction, Layout, Rect},
    style::{Color, Modifier, Style},
    text::{Line, Span},
    widgets::{Block, Borders, List, ListItem, Paragraph, Sparkline, Tabs},
    Frame, Terminal,
};
use std::io;
use std::path::PathBuf;
use std::time::Instant;

/// Model file formats understood by the unified BrickProfiler (§12.11).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelFormat {
    /// `.gguf` - llama.cpp compatible quantized models
    Gguf,
    /// `.safetensors` - HuggingFace f16/bf16 models
    SafeTensors,
    /// `.apr` - our native optimized format
    Apr,
}

#[allow(clippy::trivially_copy_pass_by_ref)] // Idiomatic &self for enum methods
impl ModelFormat {
    /// Maps a path to its model format by looking at the file extension only.
    ///
    /// The extension is compared case-insensitively. Returns `None` when the
    /// path has no extension, the extension is not valid UTF-8, or it is not
    /// one of `gguf`, `safetensors`, `apr`.
    pub fn from_path(path: &std::path::Path) -> Option<Self> {
        let ext = path.extension().and_then(|raw| raw.to_str())?;

        if ext.eq_ignore_ascii_case("gguf") {
            Some(Self::Gguf)
        } else if ext.eq_ignore_ascii_case("safetensors") {
            Some(Self::SafeTensors)
        } else if ext.eq_ignore_ascii_case("apr") {
            Some(Self::Apr)
        } else {
            None
        }
    }

    /// Short prefix used in brick names for this format (per §12.11.1).
    pub fn brick_prefix(&self) -> &'static str {
        match *self {
            ModelFormat::Gguf => "gguf",
            ModelFormat::SafeTensors => "st",
            ModelFormat::Apr => "apr",
        }
    }
}

/// Options controlling a `cbtop` invocation.
///
/// `run` inspects `headless` to choose between the headless path (which
/// receives the whole config) and the TUI (which receives only `model` and
/// `attach`).
#[derive(Debug, Clone)]
pub struct CbtopConfig {
    /// Model name; forwarded to the TUI.
    pub model: Option<String>,
    /// Attach target; forwarded to the TUI.
    pub attach: Option<String>,
    /// Path to model file for real profiling (§12.11 Unified BrickProfiler)
    /// Supports: .gguf, .safetensors, .apr
    pub model_path: Option<PathBuf>,
    /// When true, `run` takes the headless path instead of starting the TUI.
    pub headless: bool,
    // NOTE(review): the consumers of `json`, `output`, `ci`, the two thresholds,
    // `warmup` and `iterations` are not in this part of the file; they presumably
    // correspond to the --json / --output / --ci / --throughput / --brick-score
    // options shown in the module docs — confirm.
    pub json: bool,
    pub output: Option<PathBuf>,
    pub ci: bool,
    pub throughput_threshold: Option<f64>,
    pub brick_score_threshold: Option<u32>,
    pub warmup: usize,
    pub iterations: usize,
    /// PAR-100: Enable speculative decoding benchmark
    pub speculative: bool,
    /// PAR-100: Number of tokens to draft speculatively (default: 4)
    pub speculation_k: usize,
    /// PAR-099: Path to draft model for speculative decoding
    pub draft_model_path: Option<PathBuf>,
    /// PAR-102: Number of concurrent requests for aggregate throughput measurement
    pub concurrent: usize,
    /// Use simulated data (for CI testing only - explicitly opts out of real profiling)
    pub simulated: bool,
}

impl Default for CbtopConfig {
    /// Interactive defaults: nothing selected, every flag off, 10 warmup and
    /// 100 measured iterations, speculation depth 4, one concurrent request.
    fn default() -> Self {
        Self {
            // Targets: nothing chosen until the caller says so.
            model: None,
            attach: None,
            model_path: None,
            draft_model_path: None,
            output: None,

            // Mode flags: all off.
            headless: false,
            json: false,
            ci: false,
            speculative: false,
            simulated: false,

            // CI gates: disabled unless thresholds are supplied.
            throughput_threshold: None,
            brick_score_threshold: None,

            // Measurement sizing.
            warmup: 10,
            iterations: 100,
            speculation_k: 4,
            concurrent: 1, // PAR-102: Default to single request
        }
    }
}

/// Headless report output per spec section 7.0.1
///
/// Plain data container aggregating the model identity, hardware description,
/// throughput metrics, per-brick scores, PMAT scores and falsification counts.
// NOTE(review): the code that fills and emits this report is not in this part
// of the file; the expected contents of `timestamp`, `status` and `ci_result`
// (format / allowed values) should be confirmed at the construction site.
#[derive(Debug, Clone)]
pub struct HeadlessReport {
    pub model: String,
    pub timestamp: String,
    pub hardware: HardwareInfo,
    pub throughput: ThroughputMetrics,
    /// One entry per brick.
    pub brick_scores: Vec<BrickScore>,
    pub pmat_scores: PmatScores,
    pub falsification: FalsificationSummary,
    pub status: String,
    pub ci_result: String,
}

/// PMAT quality scores per spec section 7.0.1
///
/// Carried inside `HeadlessReport::pmat_scores`.
// NOTE(review): score ranges and the grade scale are not established anywhere
// in this part of the file — confirm against the spec before relying on them.
#[derive(Debug, Clone)]
pub struct PmatScores {
    pub rust_project_score: f64,
    pub tdg_score: f64,
    pub cuda_tdg_score: f64,
    pub brick_score: u32,
    pub grade: String,
}

/// Hardware description carried inside `HeadlessReport::hardware`.
#[derive(Debug, Clone)]
pub struct HardwareInfo {
    /// GPU description string.
    pub gpu: String,
    /// CPU description string.
    pub cpu: String,
    /// Memory size in gigabytes (per the `_gb` suffix).
    // NOTE(review): whether this is system RAM or GPU memory is not visible here — confirm.
    pub memory_gb: u32,
}

/// Throughput and latency figures carried inside `HeadlessReport::throughput`.
// NOTE(review): how these are measured is not visible in this part of the file.
// Units below are read off the field suffixes; `ttft` presumably means
// time-to-first-token and `cv` coefficient of variation — confirm.
#[derive(Debug, Clone)]
pub struct ThroughputMetrics {
    /// Tokens per second.
    pub tokens_per_sec: f64,
    /// Milliseconds (per the `_ms` suffix).
    pub ttft_ms: f64,
    /// Percentage (per the `_percent` suffix).
    pub cv_percent: f64,
    /// Microseconds (per the `_us` suffix); presumably the 50th percentile.
    pub p50_us: f64,
    /// Microseconds (per the `_us` suffix); presumably the 99th percentile.
    pub p99_us: f64,
}

/// Per-brick entry of `HeadlessReport::brick_scores`.
// NOTE(review): scoring and grading rules are not visible in this part of the
// file. `gap_factor` presumably mirrors `BrickTiming::gap_factor`
// (actual_us / budget_us) — confirm at the construction site.
#[derive(Debug, Clone)]
pub struct BrickScore {
    pub name: String,
    pub score: u32,
    pub grade: String,
    /// Budget in microseconds (per the `_us` suffix).
    pub budget_us: f64,
    /// Measured time in microseconds (per the `_us` suffix).
    pub actual_us: f64,
    pub gap_factor: f64,
}

/// Falsification counters carried inside `HeadlessReport::falsification`.
// NOTE(review): presumably `passed + failed + blocked == total_points`, but
// nothing in this part of the file enforces or establishes that — confirm.
#[derive(Debug, Clone)]
pub struct FalsificationSummary {
    pub total_points: u32,
    pub passed: u32,
    pub failed: u32,
    pub blocked: u32,
}

/// Rolling timing record for one brick, measured against its time budget.
#[derive(Debug, Clone)]
struct BrickTiming {
    /// Display name of the brick.
    name: &'static str,
    /// Time budget in microseconds.
    budget_us: f64,
    /// Mean of the retained samples in microseconds; 0.0 until the first sample.
    actual_us: f64,
    /// Retained samples in microseconds, oldest first, at most `MAX_SAMPLES` long.
    samples: Vec<f64>,
}

impl BrickTiming {
    /// Size of the moving-average window.
    const MAX_SAMPLES: usize = 100;

    /// Creates an empty record for `name` with the given budget in microseconds.
    fn new(name: &'static str, budget_us: f64) -> Self {
        Self {
            name,
            budget_us,
            actual_us: 0.0,
            samples: Vec::with_capacity(Self::MAX_SAMPLES),
        }
    }

    /// Ratio of actual to budgeted time (above 1.0 means over budget).
    ///
    /// A non-positive budget cannot be divided by, so it is reported as 1.0.
    fn gap_factor(&self) -> f64 {
        if self.budget_us > 0.0 {
            self.actual_us / self.budget_us
        } else {
            1.0
        }
    }

    /// Marker for the budget check: one glyph when the brick is within its
    /// budget, a different one when it exceeds it, so a violation is visible
    /// at a glance.
    fn status(&self) -> &'static str {
        if self.actual_us <= self.budget_us {
            "✅"
        } else {
            "❌"
        }
    }

    /// Actual time as a percentage of the budget, capped at 200.
    ///
    /// A non-positive budget is reported as 100.
    fn percent_of_budget(&self) -> u16 {
        if self.budget_us > 0.0 {
            // Capped before the cast so the value always fits in u16.
            ((self.actual_us / self.budget_us) * 100.0).min(200.0) as u16
        } else {
            100
        }
    }

    /// Records a new sample (microseconds) and refreshes `actual_us`.
    ///
    /// Only the most recent `MAX_SAMPLES` samples are kept.
    fn add_sample(&mut self, us: f64) {
        // Evict before pushing so the buffer never exceeds its preallocated
        // capacity and does not reallocate once the window is full.
        if self.samples.len() >= Self::MAX_SAMPLES {
            self.samples.remove(0);
        }
        self.samples.push(us);
        // Update actual as moving average
        self.actual_us = self.samples.iter().sum::<f64>() / self.samples.len() as f64;
    }

    /// Samples scaled by 10 and capped at 255, as integers for a sparkline.
    fn sparkline_data(&self) -> Vec<u64> {
        self.samples
            .iter()
            .map(|&x| (x * 10.0).min(255.0) as u64)
            .collect()
    }
}

/// State of the brick pipeline: per-brick timings plus derived throughput.
#[derive(Debug, Clone)]
struct PipelineState {
    /// Bricks of one layer, in pipeline order.
    bricks: Vec<BrickTiming>,
    /// Starts at 0; not modified anywhere in this block.
    layer_idx: usize,
    /// Number of layers the per-layer time is multiplied by.
    total_layers: usize,
    /// Incremented once per `update_demo` call.
    tokens_generated: usize,
    /// Per-token time in microseconds: `total_actual() * total_layers`.
    total_us: f64,
    /// Throughput target in tokens per second.
    target_tok_s: f64,
    /// Throughput derived from `total_us` in tokens per second.
    current_tok_s: f64,
}

impl PipelineState {
    /// Builds the default pipeline: seven bricks with the budgets from spec
    /// §3.1, 28 layers and a 976 tok/s target.
    fn new() -> Self {
        // Default budgets from spec §3.1 (name, budget in microseconds)
        let budgets = [
            ("RmsNorm", 1.5),
            ("QkvBrick", 6.0),
            ("RoPE", 1.0),
            ("Attention", 10.0),
            ("OProj", 3.5),
            ("RmsNorm", 1.5),
            ("FfnBrick", 12.2),
        ];
        let bricks = budgets
            .iter()
            .map(|&(name, budget_us)| BrickTiming::new(name, budget_us))
            .collect();

        Self {
            bricks,
            layer_idx: 0,
            total_layers: 28, // Default for 1.5B
            tokens_generated: 0,
            total_us: 0.0,
            target_tok_s: 976.0, // 2x llama.cpp for 1.5B
            current_tok_s: 0.0,
        }
    }

    /// Sum of all brick budgets in microseconds.
    fn total_budget(&self) -> f64 {
        self.bricks.iter().map(|b| b.budget_us).sum()
    }

    /// Sum of all brick actual times in microseconds.
    fn total_actual(&self) -> f64 {
        self.bricks.iter().map(|b| b.actual_us).sum()
    }

    /// Brick with the largest gap factor, or `None` when there are no bricks.
    ///
    /// Incomparable gap factors (NaN) are treated as equal; among equal
    /// maxima the last brick wins.
    fn bottleneck(&self) -> Option<&BrickTiming> {
        self.bricks.iter().max_by(|a, b| {
            a.gap_factor()
                .partial_cmp(&b.gap_factor())
                .unwrap_or(std::cmp::Ordering::Equal)
        })
    }

    /// Demo mode: feeds every brick one simulated sample and refreshes the
    /// derived totals.
    ///
    /// Each sample is the brick's budget plus a jitter of at most ±20% taken
    /// from the current wall-clock milliseconds.
    fn update_demo(&mut self) {
        use std::time::{SystemTime, UNIX_EPOCH};

        // The seed only drives cosmetic jitter, so a clock set before the
        // epoch falls back to 0 rather than panicking.
        let seed = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map_or(0, |since_epoch| since_epoch.as_millis() as u64);

        for (i, brick) in self.bricks.iter_mut().enumerate() {
            // Add some variance around the budget. Each brick reads its own
            // byte of the seed; the shift is kept below 64 bits so pipelines
            // with more than 16 bricks reuse bytes instead of overflowing.
            let shift = (i % 16) * 4;
            let jitter_byte = (seed >> shift) & 0xFF;
            let base = brick.budget_us;
            let variance = (jitter_byte as f64 / 255.0 - 0.5) * base * 0.4;
            brick.add_sample(base + variance);
        }

        self.tokens_generated += 1;
        self.total_us = self.total_actual() * self.total_layers as f64;
        // Leave the previous throughput in place when there is no time to divide by.
        if self.total_us > 0.0 {
            self.current_tok_s = 1_000_000.0 / self.total_us;
        }
    }
}

/// Views the TUI can show; the declaration order is the display order.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum View {
    Pipeline,
    Budget,
    Histogram,
    Gpu,
    Memory,
}

impl View {
    /// Display titles, one per view, in declaration order.
    fn titles() -> Vec<&'static str> {
        const TITLES: [&'static str; 5] = [
            "Pipeline [p]",
            "Budget [b]",
            "Histogram [h]",
            "GPU [g]",
            "Memory [m]",
        ];
        TITLES.to_vec()
    }

    /// Zero-based position of this view within `titles()`.
    fn index(self) -> usize {
        // Variants carry no explicit discriminants, so the cast yields the
        // declaration position (Pipeline = 0 ... Memory = 4).
        self as usize
    }
}

/// State of the interactive application.
struct App {
    /// Name of the model being displayed.
    model_name: String,
    /// Brick timings and derived throughput.
    pipeline: PipelineState,
    /// View currently shown.
    current_view: View,
    /// Index into `pipeline.bricks` of the highlighted brick.
    selected_brick: usize,
    /// Quit flag; starts out false.
    should_quit: bool,
    /// When true, `tick` feeds the pipeline with simulated samples.
    demo_mode: bool,
}

impl App {
    /// Creates the initial state, falling back to `"qwen2.5-coder-1.5b"` when
    /// no model name is given. The pipeline view and first brick are selected.
    fn new(model: Option<&str>) -> Self {
        let model_name = match model {
            Some(name) => name,
            None => "qwen2.5-coder-1.5b",
        };

        Self {
            model_name: String::from(model_name),
            pipeline: PipelineState::new(),
            current_view: View::Pipeline,
            selected_brick: 0,
            should_quit: false,
            demo_mode: true, // Start in demo mode if no live connection
        }
    }

    /// Moves the selection to the next brick, wrapping to the first after the
    /// last. Does nothing when there are no bricks.
    fn next_brick(&mut self) {
        let count = self.pipeline.bricks.len();
        if count == 0 {
            return;
        }
        self.selected_brick = (self.selected_brick + 1) % count;
    }

    /// Moves the selection to the previous brick, wrapping to the last from
    /// the first. Does nothing when there are no bricks.
    fn prev_brick(&mut self) {
        let count = self.pipeline.bricks.len();
        if count == 0 {
            return;
        }
        self.selected_brick = match self.selected_brick {
            0 => count - 1,
            current => current - 1,
        };
    }

    /// Advances the application by one step; only demo mode produces data.
    fn tick(&mut self) {
        if !self.demo_mode {
            return;
        }
        self.pipeline.update_demo();
    }
}

/// Entry point of the cbtop command.
///
/// Starts the TUI with the configured model name and attach target unless
/// `config.headless` is set, in which case the whole config is handed to the
/// headless runner. The chosen runner's result is returned unchanged.
pub fn run(config: CbtopConfig) -> Result<()> {
    if !config.headless {
        return run_tui(config.model.as_deref(), config.attach.as_deref());
    }
    run_headless(config)
}

/// Headless entry point for CI/automation.
///
/// Toyota Way: Genchi Genbutsu - real data is the default. Simulated data is
/// used only when `config.simulated` is set, and a warning is printed to
/// stderr in that case.
///
/// # Errors
///
/// Returns `CliError::ValidationFailed` when simulation was not requested and
/// either no `model_path` is configured or the crate was built without the
/// `inference` feature. Results of the simulated/real runners are returned
/// unchanged.
fn run_headless(config: CbtopConfig) -> Result<()> {
    // Explicit opt-out of real profiling (for CI testing).
    if config.simulated {
        eprintln!("cbtop: WARNING - Using simulated data (--simulated flag set)");
        eprintln!("       For real profiling: apr cbtop --model-path <FILE> --headless");
        return run_headless_simulated(config);
    }

    // Real profiling is only compiled in with the `inference` feature; exactly
    // one of the two cfg blocks below survives and ends the function.
    #[cfg(feature = "inference")]
    {
        if config.model_path.is_none() {
            return Err(CliError::ValidationFailed(String::from(
                "Headless mode requires --model-path for real profiling.\n\
                 For CI testing with simulated data, use: apr cbtop --headless --simulated\n\
                 For real profiling, use: apr cbtop --model-path <FILE> --headless",
            )));
        }
        run_headless_real(config)
    }

    #[cfg(not(feature = "inference"))]
    {
        return Err(CliError::ValidationFailed(String::from(
            "Headless mode requires --model-path and the 'inference' feature.\n\
             For CI testing with simulated data, use: apr cbtop --headless --simulated\n\
             Rebuild with: cargo build -p apr-cli --features inference",
        )));
    }
}

// The rest of this module is spliced in textually from sibling files, so the
// items declared above and the items in these files share a single scope.
// NOTE(review): `run_tui`, `run_headless_simulated` and `run_headless_real` are
// called above but not defined above — presumably they live in these files; confirm.
include!("cbtop_get_cpu_memory.rs");
include!("cbtop_report_tui.rs");
include!("gguf.rs");
include!("cbtop_measure_batch.rs");
include!("cbtop_render_title.rs");
include!("cbtop_06.rs");