realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
#[cfg(test)]
mod tests {
    use crate::apr::*;

    /// PMAT-107: Falsification test for GQA dimensions in CUDA executor
    ///
    /// This test loads a real APR file and creates AprV2ModelCuda, then verifies
    /// that the CUDA executor has the correct GQA dimensions. This catches bugs
    /// where num_kv_heads is parsed correctly but not propagated to the executor.
    #[test]
    #[cfg(feature = "cuda")]
    fn test_falsification_apr_cuda_gqa_dimensions() {
        let home = std::env::var("HOME").expect("HOME env var not set");
        let apr_path_str = format!(
            "{}/.cache/huggingface/models/qwen2.5-coder-1.5b-apr/qwen2.5-coder-1.5b-q4k.apr",
            home
        );
        let apr_path = std::path::Path::new(&apr_path_str);

        if !apr_path.exists() {
            println!("⚠️ Test model not available at {:?}, skipping", apr_path);
            return;
        }

        // Load the model
        let model = AprV2Model::load(apr_path).expect("Should load APR file");

        // Verify metadata first
        assert_eq!(model.metadata.num_heads, Some(12), "num_heads should be 12");
        assert_eq!(
            model.metadata.num_kv_heads,
            Some(2),
            "num_kv_heads should be 2 (GQA)"
        );

        // Create CUDA model
        use crate::apr::AprV2ModelCuda;

        let cuda_model = match AprV2ModelCuda::new(model, 0) {
            Ok(m) => m,
            Err(e) => {
                eprintln!("⚠️ CUDA not available: {e}");
                return;
            },
        };

        // Access the executor's GQA dimensions
        // We need to verify kv_num_heads and kv_num_kv_heads are set correctly
        // The executor is private but we can check via the metadata pass-through

        println!("=== CUDA EXECUTOR GQA CONFIG ===");
        println!(
            "  model.metadata.num_heads: {:?}",
            cuda_model.inner().metadata.num_heads
        );
        println!(
            "  model.metadata.num_kv_heads: {:?}",
            cuda_model.inner().metadata.num_kv_heads
        );

        // The critical check: if CUDA model was initialized correctly, the GQA ratio should be 6:1
        // (12 Q heads / 2 KV heads = 6x repeat factor for GQA)
        let num_heads = cuda_model.inner().metadata.num_heads.unwrap_or(1);
        let num_kv_heads = cuda_model
            .inner()
            .metadata
            .num_kv_heads
            .unwrap_or(num_heads);
        let gqa_ratio = num_heads / num_kv_heads;

        assert_eq!(
            gqa_ratio, 6,
            "FALSIFICATION FAILED: GQA ratio wrong!\n\
             Expected: 6 (12 Q heads / 2 KV heads), Got: {} ({} / {})",
            gqa_ratio, num_heads, num_kv_heads
        );

        println!(
            "✅ CUDA model has correct GQA ratio: {} ({}:{} heads:kv_heads)",
            gqa_ratio, num_heads, num_kv_heads
        );
    }
include!("tests_header_parsing.rs");
include!("tests_apr_flags_metadata.rs");
include!("tests_f16.rs");
include!("tests_apr_header.rs");
include!("tests_byte.rs");
include!("tests_simd_dot.rs");
include!("tests_tensor_entry_dequant.rs");
include!("tests_q8_f16_and_trait_impls.rs");
include!("transform.rs");
include!("tests_apr.rs");
include!("tests_bpe_tokenizer_apr.rs");
include!("tests_mapped_model.rs");
include!("tests_tensor_entry.rs");
}