// aprender-cgp 0.31.1
//
// Compute-GPU-Profile: Unified performance analysis CLI for scalar, SIMD, wgpu, and CUDA workloads
// Documentation
//! Profile diff: compare two CGP profiles and detect regressions.
//! Spec section 2.6: cgp diff.
//! Must complete in <100ms for two saved JSONs (FALSIFY-CGP-062).

use crate::analysis::regression::RegressionDetector;
use crate::metrics::catalog::FullProfile;
use crate::metrics::export;
use anyhow::Result;
use serde::Serialize;
use std::path::Path;

/// Diff result for a single metric.
///
/// One entry is produced per compared metric; the slice of entries is what
/// gets rendered as a table or serialized to JSON.
#[derive(Debug, Serialize)]
pub struct MetricDiff {
    /// Metric identifier, e.g. `"wall_clock_time_us"` or `"tflops"`.
    pub name: String,
    /// Value of the metric in the baseline profile.
    pub baseline: f64,
    /// Value of the metric in the current profile.
    pub current: f64,
    /// Relative change from baseline to current, in percent
    /// (`(current - baseline) / baseline * 100`); `0.0` when the baseline is zero.
    pub change_pct: f64,
    /// Outcome label: `"="`, `"IMPROVED"`, or `"REGRESSED"`.
    pub verdict: &'static str,
}

/// Build the per-metric diff list between a baseline and a current profile.
///
/// Timing and throughput metrics are always considered. The GPU compute and
/// GPU memory groups are compared only when *both* profiles carry that
/// section. Every candidate is handed to `add_diff`, which decides whether an
/// entry is emitted and what verdict it gets.
///
/// Entries are returned in a fixed order: timing, throughput, GPU compute,
/// GPU memory.
pub fn diff_profiles(baseline: &FullProfile, current: &FullProfile) -> Vec<MetricDiff> {
    let mut diffs = Vec::new();

    // Each row is (metric name, baseline value, current value, lower-is-better).
    let always_present = [
        (
            "wall_clock_time_us",
            baseline.timing.wall_clock_time_us,
            current.timing.wall_clock_time_us,
            true,
        ),
        (
            "tflops",
            baseline.throughput.tflops,
            current.throughput.tflops,
            false,
        ),
        (
            "bandwidth_gbps",
            baseline.throughput.bandwidth_gbps,
            current.throughput.bandwidth_gbps,
            false,
        ),
    ];
    for (metric, before, after, lower_better) in always_present {
        add_diff(&mut diffs, metric, before, after, lower_better);
    }

    // GPU compute counters: higher is better for all of them.
    if let (Some(old_gpu), Some(new_gpu)) = (&baseline.gpu_compute, &current.gpu_compute) {
        let compute_rows = [
            (
                "sm_utilization_pct",
                old_gpu.sm_utilization_pct,
                new_gpu.sm_utilization_pct,
            ),
            (
                "achieved_occupancy_pct",
                old_gpu.achieved_occupancy_pct,
                new_gpu.achieved_occupancy_pct,
            ),
            (
                "warp_exec_efficiency_pct",
                old_gpu.warp_execution_efficiency_pct,
                new_gpu.warp_execution_efficiency_pct,
            ),
        ];
        for (metric, before, after) in compute_rows {
            add_diff(&mut diffs, metric, before, after, false);
        }
    }

    // GPU memory counters: higher is better for all of them.
    if let (Some(old_mem), Some(new_mem)) = (&baseline.gpu_memory, &current.gpu_memory) {
        let memory_rows = [
            (
                "l2_hit_rate_pct",
                old_mem.l2_hit_rate_pct,
                new_mem.l2_hit_rate_pct,
            ),
            (
                "global_load_efficiency_pct",
                old_mem.global_load_efficiency_pct,
                new_mem.global_load_efficiency_pct,
            ),
        ];
        for (metric, before, after) in memory_rows {
            add_diff(&mut diffs, metric, before, after, false);
        }
    }

    diffs
}

/// Append the diff entry for one metric to `diffs`.
///
/// * `name` - metric identifier stored in the entry.
/// * `baseline` / `current` - the two values being compared.
/// * `lower_better` - `true` when a smaller value is an improvement
///   (e.g. elapsed time), `false` when a larger value is.
///
/// Nothing is appended when both values are zero. Otherwise the verdict is:
///
/// * `"="` when the relative change is below 2%,
/// * `"IMPROVED"` / `"REGRESSED"` according to the direction of the change
///   and `lower_better`.
///
/// When the baseline is zero the relative change is undefined; `change_pct`
/// is reported as `0.0` (keeping the value finite for JSON output), but the
/// verdict still follows the direction of the change instead of being
/// reported as `"="`.
fn add_diff(
    diffs: &mut Vec<MetricDiff>,
    name: &str,
    baseline: f64,
    current: f64,
    lower_better: bool,
) {
    /// Relative changes smaller than this (in percent) count as unchanged.
    const NOISE_THRESHOLD_PCT: f64 = 2.0;

    if baseline == 0.0 && current == 0.0 {
        return;
    }

    let baseline_is_zero = baseline == 0.0;
    let change_pct = if baseline_is_zero {
        0.0
    } else {
        (current - baseline) / baseline * 100.0
    };

    // A zero baseline with a non-zero current value is a real change even
    // though no percentage can be computed, so it must bypass the noise
    // threshold rather than be classified as "=".
    let within_noise = !baseline_is_zero && change_pct.abs() < NOISE_THRESHOLD_PCT;

    let verdict = if within_noise {
        "="
    } else {
        let improved = if lower_better {
            current < baseline
        } else {
            current > baseline
        };
        if improved {
            "IMPROVED"
        } else {
            "REGRESSED"
        }
    };

    diffs.push(MetricDiff {
        name: name.to_string(),
        baseline,
        current,
        change_pct,
        verdict,
    });
}

/// Print a human-readable diff table to stdout.
///
/// Output consists of a banner naming the two inputs, one table row per entry
/// in `diffs`, and a closing summary line that counts `REGRESSED` and
/// `IMPROVED` verdicts (ANSI red when there is at least one regression, green
/// when there are only improvements).
///
/// * `diffs` - entries to display, shown in slice order.
/// * `baseline_name` / `current_name` - labels printed in the banner.
pub fn render_diff(diffs: &[MetricDiff], baseline_name: &str, current_name: &str) {
    println!("\n=== CGP Profile Diff ===\n");
    println!("  Baseline: {baseline_name}");
    println!("  Current:  {current_name}\n");

    println!(
        "  {:30} {:>14} {:>14} {:>10} {:>10}",
        "Metric", "Baseline", "Current", "Change", "Verdict"
    );
    println!("  {}", "-".repeat(82));

    // Tally verdicts while emitting rows so the slice is walked only once.
    let mut regressions = 0usize;
    let mut improvements = 0usize;
    for row in diffs {
        match row.verdict {
            "REGRESSED" => regressions += 1,
            "IMPROVED" => improvements += 1,
            _ => {}
        }
        let signed_pct = format!("{:+.1}%", row.change_pct);
        println!(
            "  {:30} {:>14.2} {:>14.2} {:>10} {:>10}",
            row.name, row.baseline, row.current, signed_pct, row.verdict
        );
    }

    // Summary: regressions take priority over improvements.
    println!();
    match (regressions, improvements) {
        (0, 0) => println!("  No significant changes."),
        (0, _) => println!("  \x1b[32m{improvements} improvement(s)\x1b[0m, no regressions"),
        _ => println!("  \x1b[31m{regressions} regression(s)\x1b[0m, {improvements} improvement(s)"),
    }
    println!();
}

/// Entry point for the `cgp diff` command.
///
/// Loads the two profile JSON files, diffs them, and prints the result.
///
/// * `baseline` / `current` - paths to the saved profile JSON files; both are
///   required.
/// * `_before` / `_after` - accepted but currently unused.
/// * `json` - when `true`, print only the diff entries as pretty JSON; when
///   `false`, print the table, the statistical summary and the elapsed time.
///
/// # Errors
///
/// Returns an error when either path is missing, when a profile cannot be
/// loaded, or when JSON serialization of the diff fails.
pub fn run_diff(
    baseline: Option<&str>,
    current: Option<&str>,
    _before: Option<&str>,
    _after: Option<&str>,
    json: bool,
) -> Result<()> {
    let (baseline_path, current_path) = resolve_diff_paths(baseline, current)?;

    // The reported time covers loading, diffing and rendering.
    let timer = std::time::Instant::now();
    let old_profile = export::load_json(Path::new(baseline_path))?;
    let new_profile = export::load_json(Path::new(current_path))?;
    let entries = diff_profiles(&old_profile, &new_profile);

    if !json {
        render_diff(&entries, baseline_path, current_path);
        print_statistical_summary(&old_profile, &new_profile);

        let elapsed_ms = timer.elapsed().as_secs_f64() * 1000.0;
        println!("  Diff completed in {:.0}ms", elapsed_ms);
        println!();
    } else {
        // Machine-readable mode: emit the entries and nothing else.
        println!("{}", serde_json::to_string_pretty(&entries)?);
    }

    Ok(())
}

/// Return the `(baseline, current)` path pair when both were supplied.
///
/// # Errors
///
/// Fails with the command's usage text when either argument is `None`.
fn resolve_diff_paths<'a>(
    baseline: Option<&'a str>,
    current: Option<&'a str>,
) -> Result<(&'a str, &'a str)> {
    if let (Some(baseline_path), Some(current_path)) = (baseline, current) {
        return Ok((baseline_path, current_path));
    }

    anyhow::bail!(
        "Usage: cgp diff --baseline <file.json> --current <file.json>\n\
         Or: cgp diff --before <commit> --after <commit> (not yet implemented)"
    )
}

/// Print the one-line statistical verdict for the wall-clock comparison.
///
/// When both profiles report more than one timing sample the work is handed
/// to `print_multi_sample_regression`; otherwise `print_single_sample_compare`
/// is used.
fn print_statistical_summary(baseline: &FullProfile, current: &FullProfile) {
    let both_multi_sample = [baseline, current]
        .iter()
        .all(|profile| profile.timing.samples > 1);

    match both_multi_sample {
        true => print_multi_sample_regression(baseline, current),
        false => print_single_sample_compare(baseline, current),
    }
}

/// Run the regression detector on the two profiles' timing and print its verdict.
///
/// The detector is not fed raw measurements: each profile's series is
/// synthesized by `synth_samples` from its reported wall-clock time and
/// standard deviation. The printed line shows the detector's verdict, percent
/// change and Cohen's d effect size.
fn print_multi_sample_regression(baseline: &FullProfile, current: &FullProfile) {
    let detector = RegressionDetector::new();

    // Same synthesis for both sides, baseline first.
    let [baseline_series, current_series] = [baseline, current].map(|profile| {
        synth_samples(
            profile.timing.wall_clock_time_us,
            profile.timing.stddev_us,
        )
    });

    let outcome = detector.compare(&baseline_series, &current_series);
    println!(
        "  Statistical: {} (change {:.1}%, Cohen's d = {:.2})",
        outcome.verdict, outcome.change_pct, outcome.effect_size_cohens_d
    );
}

/// Synthesize a deterministic 30-point sample series from summary statistics.
///
/// The points are evenly spaced and symmetric around `mean`, so the series
/// average equals `mean`. Neighbouring points are `spread / 15` apart, where
/// `spread` is `stddev` floored at 1% of `mean`; for a positive mean this
/// keeps the series from collapsing onto a single value when the reported
/// deviation is zero.
///
/// * `mean` - centre of the generated series.
/// * `stddev` - reported standard deviation used to scale the spread.
fn synth_samples(mean: f64, stddev: f64) -> Vec<f64> {
    const COUNT: u32 = 30;

    // Centre on the midpoint of the index range 0..COUNT (14.5), not on
    // COUNT / 2: centring on 15 skews the offsets to [-1, 14/15] and pulls the
    // series average below `mean` by spread / 30.
    let midpoint = f64::from(COUNT - 1) / 2.0;
    let half_span = f64::from(COUNT) / 2.0;
    let spread = stddev.max(mean * 0.01);

    (0..COUNT)
        .map(|i| mean + spread * ((f64::from(i) - midpoint) / half_span))
        .collect()
}

/// Print a direction-only verdict from the two wall-clock times.
///
/// Prints nothing when either time is zero or negative. A change beyond
/// +/-5% is labelled `REGRESSED` (slower) or `IMPROVED` (faster); anything in
/// between is `NO_CHANGE`.
fn print_single_sample_compare(baseline: &FullProfile, current: &FullProfile) {
    let before_us = baseline.timing.wall_clock_time_us;
    let after_us = current.timing.wall_clock_time_us;
    if before_us <= 0.0 || after_us <= 0.0 {
        return;
    }

    let change = (after_us - before_us) / before_us * 100.0;
    let verdict = match change {
        pct if pct < -5.0 => "IMPROVED",
        pct if pct > 5.0 => "REGRESSED",
        _ => "NO_CHANGE",
    };
    println!("  Statistical: {verdict} (change {change:.1}%, single-sample)");
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::metrics::catalog::*;

    /// Build a 50-sample profile with only wall-clock time and TFLOPS set;
    /// every other field takes its default.
    fn make_profile(time_us: f64, tflops: f64) -> FullProfile {
        let timing = TimingMetrics {
            wall_clock_time_us: time_us,
            samples: 50,
            ..Default::default()
        };
        let throughput = ThroughputMetrics {
            tflops,
            ..Default::default()
        };
        FullProfile {
            version: String::from("2.0"),
            timing,
            throughput,
            ..Default::default()
        }
    }

    /// Look up a diff entry by metric name, panicking when it is absent.
    fn entry<'a>(diffs: &'a [MetricDiff], metric: &str) -> &'a MetricDiff {
        diffs.iter().find(|d| d.name == metric).unwrap()
    }

    #[test]
    fn test_diff_improvement() {
        let slow = make_profile(35.7, 7.5);
        let fast = make_profile(23.2, 11.6);
        let diffs = diff_profiles(&slow, &fast);

        // Lower wall-clock time is better.
        let wall_clock = entry(&diffs, "wall_clock_time_us");
        assert_eq!(wall_clock.verdict, "IMPROVED");
        assert!(wall_clock.change_pct < -30.0);

        // Higher TFLOPS is better.
        assert_eq!(entry(&diffs, "tflops").verdict, "IMPROVED");
    }

    #[test]
    fn test_diff_regression() {
        let fast = make_profile(23.2, 11.6);
        let slow = make_profile(35.7, 7.5);
        let diffs = diff_profiles(&fast, &slow);

        assert_eq!(entry(&diffs, "wall_clock_time_us").verdict, "REGRESSED");
    }

    #[test]
    fn test_diff_no_change() {
        let before = make_profile(23.2, 11.6);
        let after = make_profile(23.4, 11.5);
        let diffs = diff_profiles(&before, &after);

        // Under 2% change counts as unchanged.
        assert_eq!(entry(&diffs, "wall_clock_time_us").verdict, "=");
    }

    /// FALSIFY-CGP-062: diff must complete in <100ms.
    #[test]
    fn test_diff_speed() {
        let before = make_profile(23.2, 11.6);
        let after = make_profile(35.7, 7.5);

        let timer = std::time::Instant::now();
        (0..100).for_each(|_| {
            let _ = diff_profiles(&before, &after);
        });
        let millis = timer.elapsed().as_millis();

        // 100 diffs should take far less than 100ms.
        assert!(millis < 100, "100 diffs took {}ms", millis);
    }
}