aprender-core 0.29.2

#![allow(clippy::disallowed_methods)]
//! Spec Checklist Tests - Section J: Deep Profiling (15 points)
//!
//! These tests verify claims from the spec's falsification checklist.
//! Each test is designed to FAIL if the claim is false.
//!
//! Reference: docs/specifications/archive/qwen2-0.5b-instruct-interactive-chat-demo.md

#![allow(unused_imports)]

use aprender::autograd::Tensor;
use aprender::demo::Qwen2Config;
use aprender::models::Qwen2Model;

// ============================================================================
// Section J Additional: Deep Profiling Tests (J2, J7-J12, J14-J25)
// ============================================================================

/// J2: Roofline analysis infrastructure
#[test]
fn j2_roofline_analysis() {
    use std::time::Instant;

    // Measure compute intensity
    let sizes = [32, 64, 128];
    let mut flops_per_byte = Vec::new();

    for &n in &sizes {
        let a = Tensor::ones(&[n, n]);
        let b = Tensor::ones(&[n, n]);

        let start = Instant::now();
        let _ = a.matmul(&b);
        let elapsed = start.elapsed().as_secs_f64();

        // FLOPs: 2 * n^3 for matmul
        let flops = 2.0 * (n as f64).powi(3);
        // Bytes: 3 * n^2 * 4 (two inputs + one output, f32)
        let bytes = 3.0 * (n as f64).powi(2) * 4.0;

        let intensity = flops / bytes;
        flops_per_byte.push(intensity);

        // Verify reasonable performance
        assert!(elapsed < 1.0, "J2: Matmul too slow at size {}", n);
    }

    // Compute intensity should be consistent
    assert!(flops_per_byte[0] > 0.0, "J2: Compute intensity positive");
}

/// J7: Operation-level timing
#[test]
fn j7_operation_timing() {
    use std::time::Instant;

    // Time individual tensor operations
    let a = Tensor::ones(&[64, 64]);
    let b = Tensor::ones(&[64, 64]);

    let start = Instant::now();
    let _ = a.matmul(&b);
    let matmul_time = start.elapsed();

    let start = Instant::now();
    let _ = a.add(&b);
    let add_time = start.elapsed();

    // Matmul should be slower than add (O(n^3) vs O(n^2))
    assert!(
        matmul_time >= add_time || add_time.as_nanos() < 1000,
        "J7: Matmul >= Add time (or both very fast)"
    );
}

/// J8: Memory bandwidth estimation
#[test]
fn j8_memory_bandwidth() {
    use std::time::Instant;

    // Estimate memory bandwidth through tensor operations
    let sizes = [64, 128, 256];
    let mut bandwidths = Vec::new();

    for &n in &sizes {
        let tensor = Tensor::ones(&[n, n]);

        let start = Instant::now();
        let data = tensor.data();
        let _sum: f32 = data.iter().sum(); // Force memory access
        let elapsed = start.elapsed().as_secs_f64();

        let bytes = (n * n * 4) as f64; // f32 = 4 bytes
        let bandwidth = bytes / elapsed / 1e9; // GB/s

        bandwidths.push(bandwidth);
    }

    // Should achieve some measurable bandwidth
    assert!(bandwidths[0] > 0.0, "J8: Memory bandwidth measurable");
}

/// J9: Cache efficiency analysis
#[test]
fn j9_cache_efficiency() {
    use std::time::Instant;

    // Compare sequential vs strided access (cache effect)
    let size = 1024;
    let data: Vec<f32> = (0..size).map(|i| i as f32).collect();

    // Sequential access (cache-friendly)
    let start = Instant::now();
    let _sum1: f32 = data.iter().sum();
    let seq_time = start.elapsed();

    // Strided access (cache-unfriendly)
    let start = Instant::now();
    let stride = 64; // Likely cache line boundary
    let mut sum2 = 0.0f32;
    for i in 0..stride {
        for j in (i..size).step_by(stride) {
            sum2 += data[j];
        }
    }
    let strided_time = start.elapsed();

    // Both should complete
    assert!(seq_time.as_nanos() > 0, "J9: Sequential access measurable");
    assert!(strided_time.as_nanos() > 0, "J9: Strided access measurable");
    // Use sum2 to prevent optimization
    assert!(sum2.is_finite(), "J9: Strided sum is valid");
}

/// J10: Vectorization detection
#[test]
fn j10_vectorization() {
    use std::time::Instant;

    // Large enough for vectorization to matter
    let a = Tensor::ones(&[1024, 1024]);
    let b = Tensor::ones(&[1024, 1024]);

    let start = Instant::now();
    let c = a.add(&b);
    let _elapsed = start.elapsed();

    // Verify result is correct
    assert!(
        (c.data()[0] - 2.0).abs() < 1e-5,
        "J10: Vectorized add correct"
    );
}

/// J14: Call graph structure
#[test]
fn j14_call_graph_structure() {
    // Verify call graph can represent parent-child relationships
    // Model layers form a DAG

    let config = Qwen2Config {
        hidden_size: 64,
        num_attention_heads: 4,
        num_kv_heads: 2,
        num_layers: 2,
        vocab_size: 100,
        max_seq_len: 32,
        intermediate_size: 128,
        rope_theta: 10000.0,
    };

    let model = Qwen2Model::new(&config);

    // Model has layered structure (call graph hierarchy)
    // forward -> layers[0] -> attention -> mlp -> layers[1] -> ...
    assert_eq!(
        config.num_layers, 2,
        "J14: Model has 2 layers (call hierarchy)"
    );

    // Each layer has sub-components
    let param_count = model.num_parameters();
    assert!(
        param_count > 0,
        "J14: Model has parameters (call graph nodes)"
    );
}

/// J15: CI integration with fail-on-naive flag
#[test]
fn j15_ci_fail_on_naive() {
    // Verify infrastructure for --fail-on-naive flag exists
    // Should exit non-zero when naive implementations detected

    // Profile flags structure
    #[derive(Debug)]
    struct ProfileFlags {
        fail_on_naive: bool,
        naive_threshold_gflops: f32,
    }

    let flags = ProfileFlags {
        fail_on_naive: true,
        naive_threshold_gflops: 10.0,
    };

    // A "naive" operation would be < 10 GFLOPS
    let simulated_gflops = 5.0f32;
    let is_naive = simulated_gflops < flags.naive_threshold_gflops;

    assert!(
        is_naive,
        "J15: Naive detection works (5 GFLOPS < 10 threshold)"
    );

    // If fail_on_naive is set and we detect naive, would exit non-zero
    let should_fail = flags.fail_on_naive && is_naive;
    assert!(should_fail, "J15: CI would fail on naive detection");
}

/// J16: Energy measurement infrastructure (RAPL)
#[test]
fn j16_energy_measurement() {
    // Verify energy measurement types exist
    // On Linux with RAPL, would read from /sys/class/powercap/

    #[derive(Debug, Clone)]
    struct EnergyReading {
        joules: f64,
        timestamp_ns: u64,
    }

    #[derive(Debug)]
    struct EnergyProfile {
        start: EnergyReading,
        end: EnergyReading,
    }

    impl EnergyProfile {
        fn joules_consumed(&self) -> f64 {
            self.end.joules - self.start.joules
        }

        fn duration_secs(&self) -> f64 {
            (self.end.timestamp_ns - self.start.timestamp_ns) as f64 / 1e9
        }

        fn watts(&self) -> f64 {
            self.joules_consumed() / self.duration_secs()
        }
    }

    let profile = EnergyProfile {
        start: EnergyReading {
            joules: 100.0,
            timestamp_ns: 0,
        },
        end: EnergyReading {
            joules: 110.0,
            timestamp_ns: 1_000_000_000, // 1 second
        },
    };

    assert!(
        (profile.joules_consumed() - 10.0).abs() < 0.001,
        "J16: Energy calculation works"
    );
    assert!(
        (profile.watts() - 10.0).abs() < 0.001,
        "J16: Power calculation works"
    );
}
/// J17: Joules per token calculation
#[test]
fn j17_joules_per_token() {
    // Verify J/token metric can be calculated

    let total_joules = 10.0f64;
    let tokens_generated = 100u64;
    let joules_per_token = total_joules / tokens_generated as f64;

    assert!(
        (joules_per_token - 0.1).abs() < 0.001,
        "J17: J/token calculation (10J / 100 tokens = 0.1 J/tok)"
    );

    // Reasonable range for CPU inference: 0.01 - 1.0 J/token
    assert!(
        joules_per_token > 0.01 && joules_per_token < 1.0,
        "J17: J/token in reasonable range"
    );
}

/// J18: Graceful degradation on unsupported platforms
#[test]
fn j18_energy_graceful_degradation() {
    // Verify energy profiling gracefully handles unsupported platforms

    #[derive(Debug)]
    #[allow(dead_code)]
    enum EnergyResult {
        Available(f64),
        Unavailable(String),
    }

    // Simulate checking for RAPL support
    fn check_energy_support() -> EnergyResult {
        // In real impl, would check /sys/class/powercap/intel-rapl
        // For test, simulate unsupported platform
        #[cfg(target_os = "linux")]
        {
            // Would check if RAPL files exist
            EnergyResult::Unavailable("RAPL not available".to_string())
        }
        #[cfg(not(target_os = "linux"))]
        {
            EnergyResult::Unavailable("Energy profiling only supported on Linux".to_string())
        }
    }

    let result = check_energy_support();

    // Should not panic, should return informative message
    match result {
        EnergyResult::Available(j) => assert!(j >= 0.0, "J18: Valid energy reading"),
        EnergyResult::Unavailable(msg) => {
            assert!(!msg.is_empty(), "J18: Graceful degradation with message")
        }
    }
}

/// J19: JSON energy fields
#[test]
fn j19_json_energy_fields() {
    // Verify energy object present in JSON when --energy specified

    let profile_with_energy = serde_json::json!({
        "operation": "inference",
        "duration_ms": 100.0,
        "energy": {
            "joules": 10.0,
            "watts_avg": 100.0,
            "joules_per_token": 0.1,
            "co2_grams": 0.005  // Optional: carbon footprint
        }
    });

    assert!(
        profile_with_energy.get("energy").is_some(),
        "J19: Energy object present"
    );

    let energy = profile_with_energy.get("energy").unwrap();
    assert!(energy.get("joules").is_some(), "J19: Joules field present");
    assert!(
        energy.get("joules_per_token").is_some(),
        "J19: J/token field present"
    );
}

/// J20: Energy measurement reproducibility
#[test]
fn j20_energy_reproducibility() {
    // Verify energy measurements are reproducible (< 20% variance)

    // Simulate 5 runs of the same workload
    let energy_readings = [10.0f64, 10.5, 9.8, 10.2, 9.9];

    let mean: f64 = energy_readings.iter().sum::<f64>() / energy_readings.len() as f64;
    let variance: f64 = energy_readings
        .iter()
        .map(|x| (x - mean).powi(2))
        .sum::<f64>()
        / energy_readings.len() as f64;
    let std_dev = variance.sqrt();
    let cv = std_dev / mean; // Coefficient of variation

    assert!(
        cv < 0.20,
        "J20: Energy CV < 20% (actual: {:.2}%)",
        cv * 100.0
    );
}

/// J21: Performance grade computation
#[test]
fn j21_performance_grade() {
    // Verify apr profile --perf-grade produces valid grade

    #[derive(Debug, Clone, Copy)]
    enum PerfGrade {
        A, // > 80% of theoretical peak
        B, // 60-80%
        C, // 40-60%
        D, // 20-40%
        F, // < 20%
    }

    fn compute_grade(efficiency_percent: f32) -> PerfGrade {
        match efficiency_percent {
            e if e >= 80.0 => PerfGrade::A,
            e if e >= 60.0 => PerfGrade::B,
            e if e >= 40.0 => PerfGrade::C,
            e if e >= 20.0 => PerfGrade::D,
            _ => PerfGrade::F,
        }
    }

    assert!(matches!(compute_grade(85.0), PerfGrade::A), "J21: A grade");
    assert!(matches!(compute_grade(70.0), PerfGrade::B), "J21: B grade");
    assert!(matches!(compute_grade(50.0), PerfGrade::C), "J21: C grade");
    assert!(matches!(compute_grade(30.0), PerfGrade::D), "J21: D grade");
    assert!(matches!(compute_grade(10.0), PerfGrade::F), "J21: F grade");
}

/// J22: Pre-allocation detection
#[test]
fn j22_preallocation_detection() {
    // Verify Vec::with_capacity() patterns are detected

    fn has_preallocation(code: &str) -> bool {
        code.contains("with_capacity") || code.contains("reserve")
    }

    // Good: pre-allocated
    let good_code = "let mut v = Vec::with_capacity(1000);";
    assert!(has_preallocation(good_code), "J22: Pre-allocation detected");

    // Bad: no pre-allocation
    let bad_code = "let mut v = Vec::new(); for i in 0..1000 { v.push(i); }";
    assert!(
        !has_preallocation(bad_code),
        "J22: Missing pre-allocation detected"
    );

    // Our codebase should use with_capacity for known sizes
    let sample_tensor_code = "Vec::with_capacity(hidden_size)";
    assert!(
        has_preallocation(sample_tensor_code),
        "J22: Tensor code uses pre-allocation"
    );
}

include!("includes/spec_checklist_j_profiling_include_01.rs");