// trueno 0.17.2 - Docs.rs

use crate::blis::*;

// ========================================================================
// Numerical Stability Tests (F38-F42)
// ========================================================================

// F40: Repeating the same GEMM call must reproduce the output exactly.
#[test]
fn test_falsification_40_reproducible() {
    let dim = 64;
    let len = dim * dim;

    // Deterministic operands: short repeating ramps scaled by 0.1.
    let lhs: Vec<f32> = (0..len).map(|idx| ((idx % 7) as f32) * 0.1).collect();
    let rhs: Vec<f32> = (0..len).map(|idx| ((idx % 11) as f32) * 0.1).collect();

    let mut first_run = vec![0.0; len];
    gemm_blis(dim, dim, dim, &lhs, &rhs, &mut first_run, None).unwrap();

    let mut second_run = vec![0.0; len];
    gemm_blis(dim, dim, dim, &lhs, &rhs, &mut second_run, None).unwrap();

    // Vec equality is element-wise `==`, so any differing element fails the test.
    assert_eq!(first_run, second_run, "F40: Results not reproducible");
}

// F42: An infinite input is propagated by the reference GEMM rather than hidden.
#[test]
fn test_falsification_42_inf_handling() {
    // lhs[0] is infinite and rhs[0] is zero, so their product is NaN under IEEE 754.
    let lhs = vec![f32::INFINITY, 0.0, 0.0, 1.0];
    let rhs = vec![0.0, 1.0, 1.0, 1.0];
    let mut out = vec![0.0; 4];

    gemm_reference(2, 2, 2, &lhs, &rhs, &mut out).unwrap();

    // The NaN from Inf * 0 must survive accumulation into the first output element.
    let first = out[0];
    assert!(first.is_nan(), "F42: Inf*0 should produce NaN");
}

// ========================================================================
// Robustness Tests (F43-F47)
// ========================================================================

// F45: A 2x2 product, far smaller than any blocking factor, is still exact.
#[test]
fn test_falsification_45_tiny_matrix() {
    let lhs = vec![1.0, 2.0, 3.0, 4.0];
    let rhs = vec![5.0, 6.0, 7.0, 8.0];
    // Hand-computed row-major product: [1 2; 3 4] * [5 6; 7 8].
    let expected: Vec<f32> = vec![19.0, 22.0, 43.0, 50.0];

    let mut out = vec![0.0; 4];
    gemm_blis(2, 2, 2, &lhs, &rhs, &mut out, None).unwrap();

    assert_eq!(out, expected, "F45: Tiny matrix incorrect");
}

// ========================================================================
// Toyota Way Compliance Tests (F48-F55)
// ========================================================================

// F53: Heijunka load leveling keeps partition sizes close to their mean.
#[test]
fn test_falsification_53_heijunka_variance() {
    let scheduler = HeijunkaScheduler { num_threads: 4, variance_threshold: 0.05 };

    // Row counts under test: two explicit multiples of MC plus 1024 and 2048.
    for m in [4 * MC, 8 * MC, 1024, 2048] {
        let partitions = scheduler.partition_m(m, MC);

        // Balance is only meaningful when the work was split at least two ways.
        if partitions.len() >= 2 {
            let sizes: Vec<usize> = partitions.iter().map(|range| range.len()).collect();
            let total: usize = sizes.iter().sum();
            let avg = total as f32 / sizes.len() as f32;

            // Largest relative distance of any partition size from the mean.
            let mut max_deviation = 0.0_f32;
            for &size in &sizes {
                let relative = ((size as f32 - avg) / avg).abs();
                max_deviation = max_deviation.max(relative);
            }

            // The bound here is a fixed 0.5, not the scheduler's variance_threshold.
            assert!(
                max_deviation < 0.5,
                "F53: Heijunka variance {:.2} > 50% for m={}",
                max_deviation,
                m
            );
        }
    }
}

// F55: Genchi genbutsu - an enabled profiler records what a GEMM call did.
#[test]
fn test_falsification_55_profiler_works() {
    let n = 64;
    let ones: Vec<f32> = vec![1.0; n * n];
    let mut out = vec![0.0; n * n];

    // The same all-ones buffer serves as both operands.
    let mut profiler = BlisProfiler::enabled();
    gemm_blis(n, n, n, &ones, &ones, &mut out, Some(&mut profiler)).unwrap();

    // Both counters must have moved off zero after one profiled call.
    let recorded_flops = profiler.macro_stats.flops;
    let recorded_ns = profiler.macro_stats.total_ns;
    assert!(recorded_flops > 0, "F55: Profiler didn't record FLOPs");
    assert!(recorded_ns > 0, "F55: Profiler didn't record time");

    // The human-readable report must mention a throughput figure.
    let report = profiler.summary();
    assert!(report.contains("GFLOP/s"), "F55: Profiler summary incomplete");
}

// ========================================================================
// Additional Memory Criteria Tests (F31-F37)
// ========================================================================

// F31: Packing a full MC x KC panel of A writes data into the packed buffer.
// NOTE(review): despite the test name, pointer alignment is not checked here;
// only the buffer size and the first packed element are.
#[test]
fn test_falsification_31_pack_a_aligned() {
    // Source panel holds 1, 2, 3, ... so a zero can never be a legitimate value.
    let src: Vec<f32> = (1..=MC * KC).map(|v| v as f32).collect();
    let mut dst = vec![0.0f32; packed_a_size(MC, KC)];

    pack_a(&src, KC, MC, KC, &mut dst);

    assert!(dst.len() >= MC * KC, "F31: Pack A buffer too small");
    let head = dst[0];
    assert_ne!(head, 0.0, "F31: Pack A produced empty result");
    assert_eq!(head, 1.0, "F31: Pack A first element incorrect");
}

// F32: Packing a full KC x NC panel of B writes data into the packed buffer.
// NOTE(review): despite the test name, pointer alignment is not checked here;
// only the buffer size and the first packed element are.
#[test]
fn test_falsification_32_pack_b_aligned() {
    // Source panel holds 1, 2, 3, ... so a zero can never be a legitimate value.
    let src: Vec<f32> = (1..=KC * NC).map(|v| v as f32).collect();
    let mut dst = vec![0.0f32; packed_b_size(KC, NC)];

    pack_b(&src, NC, KC, NC, &mut dst);

    assert!(dst.len() >= KC * NC, "F32: Pack B buffer too small");
    let head = dst[0];
    assert_ne!(head, 0.0, "F32: Pack B produced empty result");
    assert_eq!(head, 1.0, "F32: Pack B first element incorrect");
}

// F35: Dimensions that are not multiples of the micro-tile must not read or
// write out of bounds.
#[test]
fn test_falsification_35_no_buffer_overflow() {
    // Deliberately awkward sizes: a few rows/columns past MR/NR and an odd k.
    let (m, n, k) = (MR + 3, NR + 2, 17);

    let lhs: Vec<f32> = (0..m * k).map(|idx| (idx % 10) as f32 * 0.1).collect();
    let rhs: Vec<f32> = (0..k * n).map(|idx| (idx % 10) as f32 * 0.1).collect();
    let mut out = vec![0.0_f32; m * n];

    let outcome = gemm_blis(m, n, k, &lhs, &rhs, &mut out, None);
    assert!(outcome.is_ok(), "F35: Edge case caused error");

    // Inputs are small and finite, so a NaN/Inf in the output signals bad data.
    assert!(
        out.iter().all(|value| value.is_finite()),
        "F35: Buffer overflow produced non-finite"
    );
}

// F39: Large terms of alternating sign must cancel without producing NaN/Inf.
#[test]
fn test_falsification_39_no_catastrophic_cancellation() {
    let n = 16;
    let big = 1e6_f32;
    let small = 1.0_f32;

    // A alternates +big / -big by flat index; B is a constant matrix.
    let lhs: Vec<f32> = (0..n * n)
        .map(|idx| match idx % 2 {
            0 => big,
            _ => -big,
        })
        .collect();
    let rhs: Vec<f32> = vec![small; n * n];
    let mut out = vec![0.0_f32; n * n];

    gemm_blis(n, n, n, &lhs, &rhs, &mut out, None).unwrap();

    // Only finiteness is checked; the exact cancelled value is not asserted.
    assert!(
        out.iter().all(|value| value.is_finite()),
        "F39: Catastrophic cancellation produced NaN/Inf"
    );
}

// F41: The optimized result stays within a gamma_k-style bound of the reference:
// |C_blis - C_ref| < 100 * gamma_k * ||A|| * ||B||.
#[test]
fn test_falsification_41_error_bound() {
    // Square root of the sum of squares over every element of a flat matrix.
    fn frobenius(matrix: &[f32]) -> f32 {
        matrix.iter().map(|x| x * x).sum::<f32>().sqrt()
    }

    let n = 64;
    let k = 128;

    let lhs: Vec<f32> = (0..n * k).map(|idx| ((idx % 7) as f32) * 0.01).collect();
    let rhs: Vec<f32> = (0..k * n).map(|idx| ((idx % 11) as f32) * 0.01).collect();

    let mut c_ref = vec![0.0_f32; n * n];
    let mut c_blis = vec![0.0_f32; n * n];

    gemm_blis(n, n, k, &lhs, &rhs, &mut c_blis, None).unwrap();
    gemm_reference(n, n, k, &lhs, &rhs, &mut c_ref).unwrap();

    let norm_a = frobenius(&lhs);
    let norm_b = frobenius(&rhs);

    // gamma_k = k*eps / (1 - k*eps), the usual rounding-error growth factor
    // for a length-k accumulation.
    let k_eps = (k as f32) * f32::EPSILON;
    let gamma_k = k_eps / (1.0 - k_eps);
    let error_bound = gamma_k * norm_a * norm_b;
    // The test allows 100x slack on top of the theoretical bound.
    let tolerance = error_bound * 100.0;

    // Largest element-wise absolute difference between the two results.
    let mut max_error = 0.0_f32;
    for (got, want) in c_blis.iter().zip(c_ref.iter()) {
        max_error = max_error.max((got - want).abs());
    }

    assert!(
        max_error < tolerance,
        "F41: Max error {} exceeds bound {}",
        max_error,
        tolerance
    );
}

// F44: A 512x512 product completes and yields finite values.
#[test]
fn test_falsification_44_large_matrix() {
    let n = 512;
    let len = n * n;

    // Both operands use the same repeating 0.00..0.09 ramp.
    let lhs: Vec<f32> = (0..len).map(|idx| ((idx % 10) as f32) * 0.01).collect();
    let rhs: Vec<f32> = (0..len).map(|idx| ((idx % 10) as f32) * 0.01).collect();
    let mut out = vec![0.0_f32; len];

    let outcome = gemm_blis(n, n, n, &lhs, &rhs, &mut out, None);
    assert!(outcome.is_ok(), "F44: Large matrix GEMM failed");

    // Spot-check the first, middle, and last output elements.
    for idx in [0, len / 2, len - 1] {
        assert!(out[idx].is_finite(), "F44: Large matrix produced NaN");
    }
}

// F46: Thread-safe for concurrent calls.
//
// Four GEMMs with different inputs run on separate OS threads at the same
// time. The test then checks that (a) the four outputs are pairwise distinct,
// so no call overwrote or reused another call's result, and (b) the output of
// the seed-0 worker equals the same product recomputed sequentially afterwards.
#[test]
fn test_falsification_46_thread_safe() {
    let n: usize = 32;

    // Spawn every worker before joining any of them; collecting the handles
    // first is what lets the calls actually overlap.
    let workers: Vec<std::thread::JoinHandle<Vec<f32>>> = (0..4usize)
        .map(|seed| {
            std::thread::spawn(move || {
                // Each seed shifts the input ramps so every worker multiplies
                // different matrices.
                let a: Vec<f32> = (0..n * n).map(|i| ((i + seed) % 10) as f32).collect();
                let b: Vec<f32> = (0..n * n).map(|i| ((i + seed * 2) % 10) as f32).collect();
                let mut c = vec![0.0_f32; n * n];
                gemm_blis(n, n, n, &a, &b, &mut c, None).unwrap();
                c
            })
        })
        .collect();

    // Join in spawn order so results[seed] belongs to that seed.
    let results: Vec<Vec<f32>> = workers
        .into_iter()
        .map(|worker| worker.join().expect("F46: GEMM worker thread panicked"))
        .collect();

    // Different inputs must give different outputs.
    for i in 0..results.len() {
        for j in (i + 1)..results.len() {
            assert_ne!(results[i], results[j], "F46: Results incorrectly identical");
        }
    }

    // Sequential baseline with the same inputs the seed-0 worker used.
    let a: Vec<f32> = (0..n * n).map(|i| (i % 10) as f32).collect();
    let b: Vec<f32> = (0..n * n).map(|i| (i % 10) as f32).collect();
    let mut c_verify = vec![0.0_f32; n * n];
    gemm_blis(n, n, n, &a, &b, &mut c_verify, None).unwrap();

    assert_eq!(c_verify, results[0], "F46: Non-reproducible results");
}

// F50: Jidoka guard - a strict guard must let a correct product through.
// NOTE(review): no wrong result is injected here, so only the accepting path
// of the guard is exercised, not the "fires on wrong result" path.
#[test]
fn test_falsification_50_jidoka_wrong_result() {
    let n = 8;
    let ones_a = vec![1.0f32; n * n];
    let ones_b = vec![1.0f32; n * n];

    // Sanity-check the reference first: each all-ones dot product of length n is n.
    let mut plain = vec![0.0; n * n];
    gemm_reference(n, n, n, &ones_a, &ones_b, &mut plain).unwrap();
    let expected = plain[0];
    assert_eq!(expected, n as f32, "F50: Reference result wrong");

    // Same computation, now run under a strict guard.
    let guard = JidokaGuard::strict();
    let mut guarded = vec![0.0; n * n];
    let outcome = gemm_reference_with_jidoka(n, n, n, &ones_a, &ones_b, &mut guarded, &guard);
    assert!(outcome.is_ok(), "F50: Jidoka rejected correct result");
}

/// FALSIFY-CGP-093: CPU GEMM throughput must exceed 40 GFLOPS at n = 512.
///
/// Runs `gemm_blis` on a 512x512x512 problem: three untimed warm-up calls,
/// then five timed calls of which the fastest is kept. Throughput is computed
/// as `2 * n^3` floating-point operations divided by that best time.
///
/// The threshold is deliberately conservative. The failure message cites
/// ~140 GFLOPS as the expected release-build figure, and the low bar is meant
/// to avoid flaky failures on loaded systems or debug builds.
#[test]
fn test_falsification_gemm_perf_regression() {
    // Single source of truth for the pass threshold, shared by the assertion
    // and its failure message so the two cannot drift apart.
    const MIN_GFLOPS: f64 = 40.0;

    let n = 512; // 512 rather than 1024 keeps the test fast.
    let a: Vec<f32> = (0..n * n).map(|i| ((i % 7) as f32) * 0.1).collect();
    let b: Vec<f32> = (0..n * n).map(|i| ((i % 11) as f32) * 0.1).collect();
    let mut c = vec![0.0f32; n * n];
    // One multiply and one add per inner-product term.
    let flops = 2.0 * (n as f64).powi(3);

    // Warmup
    for _ in 0..3 {
        c.fill(0.0);
        gemm_blis(n, n, n, &a, &b, &mut c, None).unwrap();
    }

    // Measure best of 5; the minimum is the sample least disturbed by noise.
    let mut best_ns = u128::MAX;
    for _ in 0..5 {
        c.fill(0.0);
        let start = std::time::Instant::now();
        gemm_blis(n, n, n, &a, &b, &mut c, None).unwrap();
        best_ns = best_ns.min(start.elapsed().as_nanos());
    }

    // ns -> s, then FLOP/s -> GFLOP/s.
    let gflops = flops / (best_ns as f64 * 1e-9) / 1e9;
    assert!(
        gflops > MIN_GFLOPS,
        "FALSIFY-CGP-093: GEMM {n} 1T only {gflops:.1} GFLOPS (min: {MIN_GFLOPS:.1}). \
         Regression detected! Expected ~140 GFLOPS in release, >{MIN_GFLOPS:.0} even in debug."
    );
}