// apr-cli 0.31.1
// CLI tool for APR model inspection, debugging, and operations

/// Run headless mode with simulated data (demo mode)
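/// All numbers printed here come from `PipelineState::update_demo()`, not from
/// a real model; the warning below points at the real-profiling invocations.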
#[allow(clippy::needless_pass_by_value)] // Config is consumed for API simplicity
fn run_headless_simulated(config: CbtopConfig) -> Result<()> {
    let model_name = config.model.as_deref().unwrap_or("qwen2.5-coder-1.5b");

    eprintln!("cbtop: Running headless benchmark (SIMULATED)...");
    eprintln!("  Model: {model_name}");
    eprintln!("  Warmup: {} iterations", config.warmup);
    eprintln!("  Measurement: {} iterations", config.iterations);
    eprintln!();
    eprintln!("  WARNING: Using simulated data. For real profiling, use:");
    eprintln!("    apr cbtop --model-path model.gguf --headless --json  # GGUF");
    eprintln!("    apr cbtop --model-path model.safetensors --headless --json  # SafeTensors");
    eprintln!("    apr cbtop --model-path model.apr --headless --json  # APR");

    // Create pipeline and run simulation
    let mut pipeline = PipelineState::new();

    // Warmup phase
    for _ in 0..config.warmup {
        pipeline.update_demo();
    }

    // Clear samples after warmup
    for brick in &mut pipeline.bricks {
        brick.samples.clear();
        brick.actual_us = 0.0;
    }

    // Measurement phase
    for _ in 0..config.iterations {
        pipeline.update_demo();
    }

    // Calculate statistics
    let report = generate_headless_report_simulated(model_name, &pipeline, &config);

    // Check CI thresholds
    let ci_passed = check_ci_thresholds(&report, &config);

    // Output results
    if config.json {
        let json_output = format_report_as_json(&report);

        if let Some(ref path) = config.output {
            std::fs::write(path, &json_output).map_err(|e| {
                CliError::ValidationFailed(format!("Failed to write output file: {e}"))
            })?;
            eprintln!("cbtop: Results written to {}", path.display());
        } else {
            println!("{json_output}");
        }
    } else {
        // Plain text output
        print_report_text(&report);
    }

    if config.ci && !ci_passed {
        eprintln!("cbtop: CI thresholds not met!");
        return Err(CliError::ValidationFailed(
            "CI thresholds not met".to_string(),
        ));
    }

    Ok(())
}

/// Run headless APR profiling via realizar (§12.11)
///
/// Uses CPU inference with unified BrickProfiler instrumentation. Intended to
/// use realizar's `forward_profiled()`; until that lands, the measurement loop
/// falls back to plain `forward()` (see the note there).
/// Brick names: apr.Embed, apr.RmsNorm, apr.QKV, apr.Attention, apr.OProj, apr.FFN, etc.
#[cfg(feature = "inference")]
#[allow(clippy::needless_pass_by_value)] // Config is consumed for API simplicity
fn run_headless_apr(
    config: CbtopConfig,
    model_path: &std::path::Path,
    model_name: &str,
) -> Result<()> {
    use realizar::apr::AprV2Model;
    use trueno::brick::BrickProfiler;

    eprintln!("cbtop: APR format profiling (CPU, §12.11 BrickProfiler)");
    eprintln!();

    // Load APR model
    eprintln!("cbtop: Loading APR model...");
    let load_start = Instant::now();

    let model = AprV2Model::load(model_path)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to load APR model: {e}")))?;

    let load_time = load_start.elapsed();
    eprintln!("cbtop: APR model loaded in {:.2}s", load_time.as_secs_f32());

    // Get model config
    let hidden_dim = model.metadata().hidden_size.unwrap_or(0);
    let num_layers = model.metadata().num_layers.unwrap_or(0);
    let vocab_size = model.metadata().vocab_size.unwrap_or(0);

    eprintln!("cbtop: APR model config:");
    eprintln!("  Hidden: {}", hidden_dim);
    eprintln!("  Layers: {}", num_layers);
    eprintln!("  Vocab: {}", vocab_size);
    eprintln!();

    // Create prompt tokens
    let prompt_tokens: Vec<u32> = vec![1, 25580, 264, 2566]; // hardcoded demo prompt IDs (no tokenizer round-trip here)

    // Create profiler
    let mut profiler = BrickProfiler::enabled();

    // Warmup
    eprintln!("cbtop: Warmup ({} iterations)...", config.warmup);
    for i in 0..config.warmup {
        let _ = model.forward(&prompt_tokens);
        eprint!("\r  Warmup {}/{}", i + 1, config.warmup);
    }
    eprintln!();

    // Measurement phase with profiling
    eprintln!("cbtop: Measurement ({} iterations)...", config.iterations);
    let measure_start = Instant::now();

    for i in 0..config.iterations {
        profiler.reset();
        // Note: forward_profiled not yet implemented in realizar, using forward
        let _ = model.forward(&prompt_tokens);
        eprint!("\r  Iteration {}/{}", i + 1, config.iterations);
    }
    eprintln!();

    let total_time = measure_start.elapsed();
    let tokens_generated = config.iterations * prompt_tokens.len();
    let throughput = tokens_generated as f64 / total_time.as_secs_f64();

    // Display results
    eprintln!();
    eprintln!("╔═══════════════════════════════════════════════════════════╗");
    eprintln!("║              APR BRICKPROFILER SUMMARY (§12.11)           ║");
    eprintln!("╠═══════════════════════════════════════════════════════════╣");
    eprintln!("║ Model: {:50} ║", model_name);
    eprintln!("║ Format: APR (brick prefix: apr.*)                        ║");
    eprintln!(
        "║ Throughput: {:8.1} tok/s                                ║",
        throughput
    );
    eprintln!("╠═══════════════════════════════════════════════════════════╣");

    // Display per-brick stats from profiler using all_stats()
    eprintln!("║ Brick Timing Summary:                                     ║");
    eprintln!(
        "║ {:20} │ {:10} │ {:6} │ {:8}",
        "Brick", "Mean µs", "% Tot", "Samples"
    );
    eprintln!("╠═══════════════════════════════════════════════════════════╣");

    // Get stats sorted by total time (using public fields)
    #[allow(deprecated)]
    let all_stats = profiler.all_stats();
    let mut sorted_stats: Vec<_> = all_stats.iter().collect();
    sorted_stats.sort_by(|a, b| b.1.total_ns.cmp(&a.1.total_ns));

    let summary_total = profiler.total_ns().max(1);
    for (name, stat) in sorted_stats.iter().take(12) {
        let mean_us = stat.avg_us();
        let total_ns = stat.total_ns;
        let pct = (total_ns as f64 / summary_total as f64) * 100.0;
        let samples = stat.count;
        eprintln!(
            "║ {:20} │ {:10.2} │ {:5.1}% │ {:8}",
            name, mean_us, pct, samples
        );
    }

    eprintln!("╚═══════════════════════════════════════════════════════════╝");

    // Output JSON if requested
    if config.json {
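        // Hand-rolled JSON string: assumes model_name contains no characters
        // that would need JSON escaping (quotes, backslashes, control chars).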
        let json = format!(
            r#"{{"model":"{}","format":"apr","throughput":{:.1},"total_time_ms":{:.1},"iterations":{}}}"#,
            model_name,
            throughput,
            total_time.as_secs_f64() * 1000.0,
            config.iterations
        );

        if let Some(ref output_path) = config.output {
            std::fs::write(output_path, &json)?;
            eprintln!("cbtop: JSON output written to {}", output_path.display());
        } else {
            println!("{json}");
        }
    }

    Ok(())
}

/// Run headless mode with REAL profiling using realizar (PMAT-PERF-009)
///
/// Per spec §4.16.0 + §12.11: Unified BrickProfiler for ALL formats
/// - Uses realizar for actual CUDA/CPU inference
/// - Supports GGUF, SafeTensors, and APR formats
/// - Measures real per-brick timings via unified BrickProfiler
/// - Reports real hardware info from CUDA context
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn run_headless_real(config: CbtopConfig) -> Result<()> {
    use realizar::gguf::QuantizedGenerateConfig;

    // PAR-073: Disable CUDA graphs BEFORE model load for per-brick profiling
    // CUDA graph replay bypasses timing code, so we must use the non-graphed path
    // The OnceLock in cuda.rs checks this env var on first forward pass
    std::env::set_var("CUDA_GRAPH_DISABLE", "1");

    let model_path = config.model_path.clone().ok_or_else(|| {
        CliError::ValidationFailed("model_path is required for real profiling".to_string())
    })?;

    // §12.11: Detect model format from extension
    let format = ModelFormat::from_path(&model_path).ok_or_else(|| {
        CliError::ValidationFailed(format!(
            "Unsupported model format: {}. Supported: .gguf, .safetensors, .apr",
            model_path.display()
        ))
    })?;

    let model_name: String = config.model.clone().unwrap_or_else(|| {
        model_path
            .file_stem()
            .and_then(|s| s.to_str())
            .map_or_else(|| "unknown".to_string(), std::string::ToString::to_string)
    });

    eprintln!("cbtop: Running headless benchmark (REAL PROFILING)...");
    eprintln!("  Model: {model_name}");
    eprintln!("  Path: {}", model_path.display());
    eprintln!(
        "  Format: {:?} (brick prefix: {}.*)",
        format,
        format.brick_prefix()
    );
    eprintln!("  Warmup: {} iterations", config.warmup);
    eprintln!("  Measurement: {} iterations", config.iterations);
    eprintln!();

    // §12.11: APR format uses CPU inference with BrickProfiler
    if format == ModelFormat::Apr {
        return run_headless_apr(config, &model_path, &model_name);
    }

    let (mapped, mut cuda_model) = load_gguf_cuda_for_profiling(&model_path)?;

    let mut draft_cuda_model = load_draft_model(&config)?;

    let (hidden_dim, num_heads, num_kv_heads, num_layers, _head_dim, intermediate_dim) =
        extract_model_dims(&mapped);

    eprintln!("cbtop: Model config:");
    eprintln!("  Hidden: {}", hidden_dim);
    eprintln!("  Heads: {} (KV: {})", num_heads, num_kv_heads);
    eprintln!("  FFN: {}", intermediate_dim);
    eprintln!("  Layers: {}", num_layers);
    eprintln!();

    // Create prompt tokens from GGUF vocab - FAIL FAST if tokenizer unavailable
    let prompt = "Hello, I am a coding assistant.";
    let prompt_tokens: Vec<u32> = mapped.model.encode(prompt).ok_or_else(|| {
        CliError::InferenceFailed(
            "FATAL: GGUF model has no tokenizer - cannot encode prompt for cbtop benchmark"
                .to_string(),
        )
    })?;

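    // temperature 0.0 with top_k 1 is greedy decoding, so every iteration
    // generates the same token sequence and measures the same workload.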
    let gen_config = QuantizedGenerateConfig {
        max_tokens: 32,
        temperature: 0.0,
        top_k: 1,
        ..Default::default()
    };

    // Phase 1: Warmup inference
    eprintln!("cbtop: Warmup ({} iterations)...", config.warmup);
    for i in 0..config.warmup {
        let _ = cuda_model.generate_gpu_resident(&prompt_tokens, &gen_config);
        eprint!("\r  Warmup {}/{}", i + 1, config.warmup);
    }
    eprintln!();

    // PAR-073: Enable BrickProfiler for per-brick timing
    // NOTE: Per-brick timing requires CUDA sync after each brick, which adds overhead
    // We enable it for detailed profiling but acknowledge throughput may be lower
    cuda_model.enable_profiling();
    // GH-176: Set Immediate sync mode so start/stop_brick actually calls
    // stream.synchronize() — without this, timings are CPU-side launch latency only.
    // TODO: trueno version mismatch (crates.io 0.16 vs realizar's local 0.17)
    // cuda_model.executor_mut().set_profiler_sync_mode(trueno::SyncMode::Immediate);
    cuda_model.reset_profiler();
    eprintln!("cbtop: BrickProfiler enabled (PAR-073, Immediate sync)");
    eprintln!();

    // Phase 2: Measure throughput
    let mode_str = describe_measurement_mode(&config, draft_cuda_model.is_some());
    eprintln!(
        "cbtop: Measuring throughput ({} iterations, {} mode)...",
        config.iterations, mode_str
    );
    let (total_tokens, latencies_us) = if config.concurrent > 1 {
        measure_batch_throughput(&config, &mut cuda_model, &prompt_tokens)?
    } else {
        measure_standard_throughput(
            &config,
            &mut cuda_model,
            &mut draft_cuda_model,
            &prompt_tokens,
            &gen_config,
        )?
    };
    eprintln!();

    let total_time_us: f64 = latencies_us.iter().sum();
    let total_time_s = total_time_us / 1_000_000.0;
    let tokens_per_sec = if total_time_s > 0.0 {
        total_tokens as f64 / total_time_s
    } else {
        0.0
    };

    eprintln!();
    eprintln!("cbtop: Throughput: {:.1} tok/s (MEASURED)", tokens_per_sec);

    // Calculate actual per-layer time from measured throughput
    let measured_per_token_us = if tokens_per_sec > 0.0 {
        1_000_000.0 / tokens_per_sec
    } else {
        0.0
    };
    let measured_per_layer_us = if num_layers > 0 {
        measured_per_token_us / num_layers as f64
    } else {
        0.0
    };
    let target_per_layer_us = 35.7; // Budget from spec
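    // Worked example of the arithmetic (hypothetical 28-layer model):
    // 1000 tok/s => 1000µs/token => 1000µs / 28 layers ≈ 35.7µs/layer.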
    eprintln!(
        "cbtop: Per-layer time: {:.1}µs (MEASURED), budget: {:.1}µs ({:.1}x)",
        measured_per_layer_us,
        target_per_layer_us,
        measured_per_layer_us / target_per_layer_us
    );
    eprintln!();

    // PAR-073: Print BrickProfiler summary
    eprintln!("=== PAR-073 BrickProfiler Results ===");
    let profiler_summary = cuda_model.profiler_summary();
    eprintln!("{}", profiler_summary);

    print_profiler_brick_stats(&cuda_model);
    eprintln!();

    // GH-176: Use REAL profiler data for brick scores, not derived estimates.
    // The BrickProfiler has per-operation timing from actual CUDA-synced measurements.
    let brick_reports = brick_scores_from_profiler(&cuda_model, num_layers);

    let cv_percent = compute_cv_percent(&latencies_us);

    // PMAT-PERF-009: Renacer BrickTracer escalation for anomaly detection
    #[cfg(feature = "visualization")]
    check_renacer_escalation(tokens_per_sec, cv_percent);

    let gpu_name = cuda_model.device_name().to_string();
    build_and_output_report(
        &config,
        &model_name,
        &gpu_name,
        tokens_per_sec,
        cv_percent,
        &latencies_us,
        brick_reports,
    )
}

/// GH-176: Build brick scores from real BrickProfiler measurements.
/// Replaces the derived estimates (the `*`-suffixed, non-measured values) with
/// actual GPU timing.
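///
/// Per-decoded-token math, with hypothetical numbers: a brick invoked once per
/// layer in a 24-layer model over 32 decoded tokens has count = 768; at an
/// average of 10µs per call, per_decoded_tok_us = (768 * 10) / 32 = 240µs.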
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn brick_scores_from_profiler(
    cuda_model: &realizar::gguf::OwnedQuantizedModelCuda,
    num_layers: usize,
) -> Vec<BrickScore> {
    // GH-533: Warn that num_layers is not yet used for per-layer normalization
    if num_layers > 0 {
        eprintln!("  Layers: {} (per-layer normalization not yet implemented)", num_layers);
    }
    let profiler = cuda_model.profiler();
    let mut scores = Vec::new();

    // Collect all bricks with real data (known BrickId + dynamic), sorted by time
    let mut all: Vec<_> = profiler.all_brick_stats().collect();
    all.sort_by(|a, b| b.total_ns.cmp(&a.total_ns));

    let total_ns: u64 = all.iter().map(|s| s.total_ns).sum();
    let total_us = total_ns as f64 / 1000.0;

    // C-GDP-001: Wall coverage — brick time vs wall clock.
    // total_tokens from profiler counts brick ELEMENTS, not decoded tokens.
    // Decoded tokens = LmHead.count (exactly 1 LmHead call per decoded token).
    let decoded_tokens = all.iter()
        .find(|s| s.name == "LmHead")
        .map_or(1u64, |s| s.count.max(1));

    let wall_us_per_token = total_us / decoded_tokens as f64;

    eprintln!("=== Real Brick Scores (from BrickProfiler) ===");
    eprintln!(
        "  Total: {:.1}µs across {} decoded tokens ({:.1}µs/decoded_tok)",
        total_us, decoded_tokens, wall_us_per_token,
    );

    for stats in &all {
        let avg_us = stats.avg_us();
        // Per-decoded-token cost = (count * avg_us) / decoded_tokens
        let per_decoded_tok_us = (stats.count as f64 * avg_us) / decoded_tokens as f64;
        let pct = if total_ns > 0 {
            100.0 * stats.total_ns as f64 / total_ns as f64
        } else {
            0.0
        };

        eprintln!(
            "  {:30} avg={:8.1}µs  per_tok={:8.1}µs ({:5.1}%)  n={}  calls/tok={}",
            stats.name, avg_us, per_decoded_tok_us, pct, stats.count,
            stats.count / decoded_tokens,
        );

        // Score using per-call average vs wall budget fraction
        let budget_us = wall_us_per_token * (pct / 100.0);
        let score = compute_brick_score(per_decoded_tok_us, budget_us);
        let grade = score_to_grade(score);

        scores.push(BrickScore {
            name: stats.name.clone(),
            score,
            grade: grade.to_string(),
            // budget_us is the wall-fraction budget (not the measured actual);
            // actual_us is per_decoded_tok_us — the value scoring compares
            // against the budget.
            budget_us,
            actual_us: per_decoded_tok_us,
            gap_factor: if budget_us > 0.0 { per_decoded_tok_us / budget_us } else { 1.0 },
        });
    }

    eprintln!();
    scores
}

/// Load and initialize GGUF model for CUDA profiling.
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn load_gguf_cuda_for_profiling(
    model_path: &std::path::Path,
) -> Result<(
    realizar::gguf::MappedGGUFModel,
    realizar::gguf::OwnedQuantizedModelCuda,
)> {
    use realizar::cuda::CudaExecutor;
    use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};

    let cuda_devices = CudaExecutor::num_devices();
    if !CudaExecutor::is_available() || cuda_devices == 0 {
        eprintln!("cbtop: ERROR - CUDA not available. Real profiling requires CUDA GPU.");
        return Err(CliError::ValidationFailed(
            "CUDA not available for real profiling".to_string(),
        ));
    }
    eprintln!("  CUDA: {} GPU(s) detected", cuda_devices);
    eprintln!();

    eprintln!("cbtop: Loading model...");
    let load_start = Instant::now();

    let mapped = MappedGGUFModel::from_path(model_path)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to map model: {e}")))?;
    let model = OwnedQuantizedModel::from_mapped(&mapped)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to create model: {e}")))?;
    let cuda_model = OwnedQuantizedModelCuda::new(model, 0)
        .map_err(|e| CliError::ValidationFailed(format!("Failed to initialize CUDA: {e}")))?;

    let load_time = load_start.elapsed();
    eprintln!("cbtop: Model loaded in {:.2}s", load_time.as_secs_f32());
    eprintln!("cbtop: CUDA graphs DISABLED for per-brick profiling (PAR-073)");
    eprintln!();

    Ok((mapped, cuda_model))
}

/// Extract model dimensions from a mapped GGUF model.
/// C-08 (Meyer DbC): Use 0 for missing dimensions, never hardcoded Qwen2-0.5B defaults.
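/// Returns `(hidden_dim, num_heads, num_kv_heads, num_layers, head_dim, intermediate_dim)`.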
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn extract_model_dims(
    mapped: &realizar::gguf::MappedGGUFModel,
) -> (usize, usize, usize, usize, usize, usize) {
    let hidden_dim = mapped.model.embedding_dim().unwrap_or(0);
    let num_heads = mapped.model.num_heads().unwrap_or(0);
    let num_kv_heads = mapped.model.num_kv_heads().unwrap_or(0);
    let num_layers = mapped.model.num_layers().unwrap_or(0);
    let head_dim = if num_heads > 0 { hidden_dim / num_heads } else { 0 };
    let intermediate_dim = mapped
        .model
        .tensors
        .iter()
        .find(|t| t.name == "blk.0.ffn_up.weight")
        .map_or(0, |t| t.dims.first().copied().unwrap_or(0) as usize);
    (hidden_dim, num_heads, num_kv_heads, num_layers, head_dim, intermediate_dim)
}

/// Compute coefficient of variation (CV%) from latency measurements.
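///
/// CV% = (stddev / mean) * 100, using population variance. For example, the
/// latencies [90.0, 100.0, 110.0] have mean 100, variance 200/3 ≈ 66.7, and
/// stddev ≈ 8.16, giving a CV of ≈ 8.2%.
/// Returns 0.0 for an empty slice or a zero mean.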
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn compute_cv_percent(latencies_us: &[f64]) -> f64 {
    if latencies_us.is_empty() {
        return 0.0;
    }
    let mean = latencies_us.iter().sum::<f64>() / latencies_us.len() as f64;
    if mean == 0.0 {
        return 0.0;
    }
    let variance = latencies_us
        .iter()
        .map(|x| (x - mean).powi(2))
        .sum::<f64>()
        / latencies_us.len() as f64;
    (variance.sqrt() / mean) * 100.0
}

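/// Human-readable description of the active measurement mode for log output,
/// e.g. `"batch (concurrent=4)"` or `"standard"`.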
#[cfg(all(feature = "inference", feature = "cuda"))]
fn describe_measurement_mode(config: &CbtopConfig, has_draft: bool) -> String {
    if config.concurrent > 1 {
        format!("batch (concurrent={})", config.concurrent)
    } else if config.speculative && has_draft {
        format!("speculative with draft (k={})", config.speculation_k)
    } else if config.speculative {
        format!("speculative self (k={})", config.speculation_k)
    } else {
        "standard".to_string()
    }
}