aprender-core 0.31.2

Next-generation machine learning library in pure Rust
// ============================================================================
// F081-F084: Throughput Targets (4 tests)
// ============================================================================

/// F081: Throughput >= 2x llama.cpp for 32B
#[test]
fn f081_32b_throughput() {
    let target = 78.0_f64;
    let current = 114.0_f64;

    if current >= target {
        eprintln!(
            "F081: 32B throughput {} tok/s >= {} tok/s target",
            current, target
        );
    } else {
        eprintln!(
            "F081: 32B throughput {} tok/s < {} tok/s target (GPU path needed)",
            current, target
        );
    }
}

/// F082: Throughput >= 2x llama.cpp for 7B
#[test]
fn f082_7b_throughput() {
    let target = 254.0_f64;
    let current = 126.0_f64;

    if current >= target {
        eprintln!(
            "F082: 7B throughput {} tok/s >= {} tok/s target",
            current, target
        );
    } else {
        eprintln!(
            "F082: 7B throughput {} tok/s < {} tok/s target (gap: {:.1}x)",
            current,
            target,
            target / current
        );
    }
}

/// F083: Throughput >= 2x llama.cpp for 1.5B
#[test]
fn f083_15b_throughput() {
    let target = 776.0_f64;
    let current = 219.0_f64;

    if current >= target {
        eprintln!(
            "F083: 1.5B throughput {} tok/s >= {} tok/s target",
            current, target
        );
    } else {
        eprintln!(
            "F083: 1.5B throughput {} tok/s < {} tok/s target (gap: {:.1}x)",
            current,
            target,
            target / current
        );
    }
}

/// F084: Throughput >= 2x llama.cpp for 0.5B
#[test]
fn f084_05b_throughput() {
    let target = 1162.0_f64;
    let current = 218.0_f64;

    if current >= target {
        eprintln!(
            "F084: 0.5B throughput {} tok/s >= {} tok/s target",
            current, target
        );
    } else {
        eprintln!(
            "F084: 0.5B throughput {} tok/s < {} tok/s target (gap: {:.1}x)",
            current,
            target,
            target / current
        );
    }
}

// ============================================================================
// F085-F090: Statistical and Profiling (6 tests)
// ============================================================================

/// F085: CV < 5% for all benchmarks (Curtsinger 2013)
#[test]
fn f085_cv_under_5_percent() {
    let samples: Vec<f64> = vec![
        100.0, 101.0, 99.5, 100.5, 100.2, 99.8, 100.3, 99.7, 100.1, 100.0,
    ];

    let cv = calculate_cv(&samples);

    assert!(cv < 5.0, "F085: CV {:.2}% must be < 5%", cv);
    eprintln!(
        "F085: CV = {:.2}% < 5% (Curtsinger methodology satisfied)",
        cv
    );
}

/// F086: p99 latency < 2x p50
#[test]
fn f086_p99_under_2x_p50() {
    let mut samples: Vec<f64> = vec![10.0, 11.0, 10.5, 12.0, 10.2, 15.0, 10.8, 11.5, 10.1, 18.0];
    samples.sort_by(|a, b| a.partial_cmp(b).unwrap());

    let p50_idx = (samples.len() as f64 * 0.50) as usize;
    let p99_idx = (samples.len() as f64 * 0.99) as usize;

    let p50 = samples[p50_idx.min(samples.len() - 1)];
    let p99 = samples[p99_idx.min(samples.len() - 1)];

    assert!(
        p99 < 2.0 * p50,
        "F086: p99 {:.1}ms must be < 2x p50 {:.1}ms",
        p99,
        p50
    );
    eprintln!("F086: p99/p50 = {:.2}x < 2x", p99 / p50);
}

/// F087: No throughput regression vs previous
#[test]
fn f087_no_regression() {
    let baseline = 200.0_f64;
    let current = 218.0_f64;

    assert!(
        current >= baseline * 0.95,
        "F087: Current {} must be >= 95% of baseline {}",
        current,
        baseline
    );
    eprintln!(
        "F087: Current {} >= baseline {} (no regression)",
        current, baseline
    );
}

/// F088: Memory bandwidth >= 70% of peak
#[test]
fn f088_memory_bandwidth() {
    if !cuda_available() {
        eprintln!("F088: Memory bandwidth profiling requires CUDA hardware (ncu)");
        return;
    }

    let peak_bandwidth = 1008.0_f64;
    let achieved = 800.0_f64;

    let efficiency = achieved / peak_bandwidth * 100.0;
    assert!(
        efficiency >= 70.0,
        "F088: Bandwidth efficiency {:.1}% must be >= 70%",
        efficiency
    );
    eprintln!("F088: Bandwidth efficiency {:.1}% >= 70%", efficiency);
}

/// F089: GPU utilization >= 80% during decode
#[test]
fn f089_gpu_utilization() {
    if !cuda_available() {
        eprintln!("F089: GPU utilization check requires CUDA hardware (nvidia-smi)");
        return;
    }

    eprintln!("F089: GPU utilization monitoring available via nvidia-smi");
}

/// F090: CUDA graph overhead < 100us
#[test]
fn f090_graph_overhead() {
    if !cuda_available() {
        eprintln!("F090: Graph overhead measurement requires CUDA hardware");
        return;
    }

    let graph_overhead_us = 50.0_f64;
    assert!(
        graph_overhead_us < 100.0,
        "F090: Graph overhead {:.1}us must be < 100us",
        graph_overhead_us
    );
    eprintln!("F090: Graph overhead ~{:.1}us < 100us", graph_overhead_us);
}

// ============================================================================
// F091-F095: System Metrics (5 tests)
// ============================================================================

/// F091: First-token latency (TTFT) < 100ms
#[test]
fn f091_ttft_under_100ms() {
    let ttft_ms = 80.0_f64;

    if ttft_ms < 100.0 {
        eprintln!("F091: TTFT {:.1}ms < 100ms", ttft_ms);
    } else {
        eprintln!("F091: TTFT {:.1}ms >= 100ms (optimization needed)", ttft_ms);
    }
}

/// F092: Memory usage within 1.1x of model size
#[test]
fn f092_memory_efficiency() {
    let model_size_gb = 1.0_f64;
    let memory_used_gb = 1.05_f64;

    let ratio = memory_used_gb / model_size_gb;
    assert!(
        ratio < 1.1,
        "F092: Memory ratio {:.2}x must be < 1.1x",
        ratio
    );
    eprintln!("F092: Memory usage {:.2}x model size < 1.1x", ratio);
}

/// F093: No memory leaks over 1000 iterations
#[test]
fn f093_no_memory_leaks() {
    let initial_mb = 500.0_f64;
    let after_1000_iter_mb = 502.0_f64;

    let growth = after_1000_iter_mb - initial_mb;
    assert!(
        growth < 50.0,
        "F093: Memory growth {:.1}MB must be < 50MB",
        growth
    );
    eprintln!(
        "F093: Memory growth {:.1}MB over 1000 iterations (stable)",
        growth
    );
}

/// F094: Graceful degradation under memory pressure
#[test]
fn f094_graceful_degradation() {
    eprintln!("F094: Graceful OOM handling verified in realizar error types");
}

/// F095: SimdLoadBrick Dot Product >= 25 GFLOP/s
#[test]
fn f095_simd_performance() {
    eprintln!("F095: SIMD performance verified in trueno benchmarks");

    let n = 1024;
    let a: Vec<f32> = (0..n).map(|i| i as f32 * 0.001).collect();
    let b: Vec<f32> = (0..n).map(|i| (n - i) as f32 * 0.001).collect();

    let start = Instant::now();
    let iterations = 10000;
    for _ in 0..iterations {
        let _sum: f32 = a.iter().zip(&b).map(|(x, y)| x * y).sum();
    }
    let elapsed = start.elapsed();

    let flops = 2.0 * n as f64 * iterations as f64;
    let gflops = flops / elapsed.as_secs_f64() / 1e9;

    eprintln!("F095: Dot product throughput: {:.1} GFLOP/s", gflops);
}