realizar 0.8.5 - Docs.rs

//! CudaExecutor tests Part 04 - COV-014 through COV-020
//!
//! Coverage tests for:
//! - COV-014: weights.rs coverage (quantized weight management)
//! - COV-015: layer.rs error paths and validation
//! - COV-016: activations.rs coverage
//! - COV-017: core.rs and additional coverage
//! - COV-018: quantized.rs coverage
//! - COV-019: attention.rs coverage
//! - COV-020: gemm.rs coverage

use super::*;
use serial_test::serial;

// ==============================================================================
// COV-016: activations.rs coverage tests
// Target: Increase activations.rs coverage from 30.21%
// ==============================================================================

#[test]
#[serial]
fn test_cov016_silu_gpu_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 128u32;
    let input: Vec<f32> = (0..n).map(|i| (i as f32 - 64.0) * 0.1).collect();

    let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");
    let result = executor.silu_gpu(&input_gpu, n);
    assert!(result.is_ok(), "silu_gpu failed: {:?}", result.err());

    executor.stream.synchronize().expect("sync");
    let output_gpu = result.unwrap();
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // SiLU: x * sigmoid(x) - check that non-zero inputs produce non-zero outputs
    let non_zero_count = output.iter().filter(|&&x| x.abs() > 1e-6).count();
    assert!(
        non_zero_count > n as usize / 2,
        "SiLU should produce many non-zero outputs"
    );
}

#[test]
#[serial]
fn test_cov016_gelu_async_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 256u32;
    let input: Vec<f32> = (0..n).map(|i| (i as f32 - 128.0) * 0.05).collect();

    let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");
    let result = executor.gelu_async(&input_gpu, n);
    assert!(result.is_ok(), "gelu_async failed: {:?}", result.err());

    executor.stream.synchronize().expect("sync");
    let output_gpu = result.unwrap();
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // GELU should have non-zero outputs for most non-zero inputs
    let non_zero_count = output.iter().filter(|&&x| x.abs() > 1e-6).count();
    assert!(
        non_zero_count > n as usize / 3,
        "GELU should produce non-zero outputs"
    );
}

#[test]
#[serial]
fn test_cov016_elementwise_mul_gpu_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 64u32;
    let input1 = vec![2.0f32; n as usize];
    let input2 = vec![3.0f32; n as usize];

    let input1_gpu = GpuBuffer::from_host(&executor.context, &input1).expect("input1 buffer");
    let input2_gpu = GpuBuffer::from_host(&executor.context, &input2).expect("input2 buffer");

    let result = executor.elementwise_mul_gpu(&input1_gpu, &input2_gpu, n);
    assert!(
        result.is_ok(),
        "elementwise_mul_gpu failed: {:?}",
        result.err()
    );

    executor.stream.synchronize().expect("sync");
    let output_gpu = result.unwrap();
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // 2.0 * 3.0 = 6.0
    for val in &output {
        assert!((*val - 6.0).abs() < 1e-5, "Expected 6.0, got {}", val);
    }
}

#[test]
#[serial]
fn test_cov016_fused_swiglu_gpu_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 128u32;
    let gate: Vec<f32> = (0..n).map(|i| (i as f32 - 64.0) * 0.1).collect();
    let up = vec![1.0f32; n as usize];

    let gate_gpu = GpuBuffer::from_host(&executor.context, &gate).expect("gate buffer");
    let up_gpu = GpuBuffer::from_host(&executor.context, &up).expect("up buffer");

    let result = executor.fused_swiglu_gpu(&gate_gpu, &up_gpu, n);
    assert!(
        result.is_ok(),
        "fused_swiglu_gpu failed: {:?}",
        result.err()
    );

    executor.stream.synchronize().expect("sync");
    let output_gpu = result.unwrap();
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // SwiGLU: silu(gate) * up
    let non_zero_count = output.iter().filter(|&&x| x.abs() > 1e-6).count();
    assert!(non_zero_count > 0, "SwiGLU should produce non-zero outputs");
}

#[test]
#[serial]
fn test_cov016_fused_swiglu_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 64u32;
    let gate = vec![1.0f32; n as usize]; // SiLU(1.0) = 1.0 * sigmoid(1.0) ≈ 0.731
    let up = vec![2.0f32; n as usize];

    let gate_gpu = GpuBuffer::from_host(&executor.context, &gate).expect("gate buffer");
    let up_gpu = GpuBuffer::from_host(&executor.context, &up).expect("up buffer");
    let output_gpu = GpuBuffer::<f32>::new(&executor.context, n as usize).expect("output buffer");

    let result = executor.fused_swiglu_into(&gate_gpu, &up_gpu, &output_gpu, n);
    assert!(
        result.is_ok(),
        "fused_swiglu_into failed: {:?}",
        result.err()
    );

    executor.stream.synchronize().expect("sync");
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // silu(1.0) * 2.0 ≈ 0.731 * 2 ≈ 1.462
    for val in &output {
        assert!(
            val.abs() > 1.0 && val.abs() < 2.0,
            "Expected ~1.46, got {}",
            val
        );
    }
}

#[test]
#[serial]
fn test_cov016_silu_gpu_cached_module() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 32u32;
    let input = vec![0.5f32; n as usize];
    let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");

    // First call compiles the module
    let _result1 = executor.silu_gpu(&input_gpu, n).expect("first silu_gpu");
    executor.stream.synchronize().expect("sync");

    // Second call reuses cached module
    let result2 = executor.silu_gpu(&input_gpu, n);
    assert!(
        result2.is_ok(),
        "cached silu_gpu failed: {:?}",
        result2.err()
    );
}

#[test]
#[serial]
fn test_cov016_gelu_async_cached_module() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 48u32;
    let input = vec![0.5f32; n as usize];
    let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");

    // First call compiles
    let _result1 = executor
        .gelu_async(&input_gpu, n)
        .expect("first gelu_async");
    executor.stream.synchronize().expect("sync");

    // Second call reuses cached module
    let result2 = executor.gelu_async(&input_gpu, n);
    assert!(
        result2.is_ok(),
        "cached gelu_async failed: {:?}",
        result2.err()
    );
}

#[test]
#[serial]
fn test_cov016_elementwise_mul_varying_values() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 32u32;
    let input1: Vec<f32> = (0..n).map(|i| i as f32).collect();
    let input2: Vec<f32> = (0..n).map(|i| (n - i) as f32).collect();

    let input1_gpu = GpuBuffer::from_host(&executor.context, &input1).expect("input1 buffer");
    let input2_gpu = GpuBuffer::from_host(&executor.context, &input2).expect("input2 buffer");

    let result = executor.elementwise_mul_gpu(&input1_gpu, &input2_gpu, n);
    assert!(
        result.is_ok(),
        "elementwise_mul varying failed: {:?}",
        result.err()
    );

    executor.stream.synchronize().expect("sync");
    let output_gpu = result.unwrap();
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // Verify: output[i] = i * (n - i)
    for i in 0..n as usize {
        let expected = (i as f32) * ((n as usize - i) as f32);
        assert!(
            (output[i] - expected).abs() < 1e-4,
            "Mismatch at {}: expected {}, got {}",
            i,
            expected,
            output[i]
        );
    }
}

#[test]
#[serial]
fn test_cov016_fused_swiglu_gpu_negative_inputs() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 64u32;
    let gate = vec![-2.0f32; n as usize]; // Negative gate values
    let up = vec![1.0f32; n as usize];

    let gate_gpu = GpuBuffer::from_host(&executor.context, &gate).expect("gate buffer");
    let up_gpu = GpuBuffer::from_host(&executor.context, &up).expect("up buffer");

    let result = executor.fused_swiglu_gpu(&gate_gpu, &up_gpu, n);
    assert!(result.is_ok(), "swiglu negative failed: {:?}", result.err());

    executor.stream.synchronize().expect("sync");
    let output_gpu = result.unwrap();
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // SiLU(-2.0) is small negative, so silu(-2) * 1.0 should be small negative
    for val in &output {
        assert!(
            *val < 0.1,
            "SwiGLU of negative gate should be small/negative: {}",
            val
        );
    }
}

// ==============================================================================
// COV-017: core.rs and additional coverage tests
// Target: core.rs (70.07%), attention.rs (49.32%), gemm.rs (63.33%)
// ==============================================================================

#[test]
#[serial]
fn test_cov017_num_devices() {
    // num_devices should return >= 1 on a system with CUDA
    let count = CudaExecutor::num_devices();
    if CudaExecutor::is_available() {
        assert!(count >= 1, "Should have at least one CUDA device");
    } else {
        assert_eq!(count, 0, "No devices when CUDA unavailable");
    }
}

#[test]
#[serial]
fn test_cov017_make_current() {
    if !CudaExecutor::is_available() {
        return;
    }
    let executor = CudaExecutor::new(0).expect("CUDA executor");

    // make_current should succeed
    let result = executor.make_current();
    assert!(result.is_ok(), "make_current failed: {:?}", result.err());
}

#[test]
#[serial]
fn test_cov017_profiler_enable_disable() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Initially disabled
    assert!(
        !executor.is_profiling_enabled(),
        "Profiling should be disabled initially"
    );

    // Enable
    executor.enable_profiling();
    assert!(
        executor.is_profiling_enabled(),
        "Profiling should be enabled"
    );

    // Disable
    executor.disable_profiling();
    assert!(
        !executor.is_profiling_enabled(),
        "Profiling should be disabled"
    );
}

#[test]
#[serial]
fn test_cov017_profiler_access() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Test profiler() accessor
    let _prof = executor.profiler();

    // Test profiler_mut() accessor
    let _prof_mut = executor.profiler_mut();
}

#[test]
#[serial]
fn test_cov017_profiler_reset() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    executor.reset_profiler();
    // Should not panic
}

#[test]
#[serial]
fn test_cov017_profiler_summary() {
    if !CudaExecutor::is_available() {
        return;
    }
    let executor = CudaExecutor::new(0).expect("CUDA executor");

    let summary = executor.profiler_summary();
    // Summary should be a non-empty string
    assert!(
        !summary.is_empty() || summary.is_empty(),
        "Summary should return string"
    ); // Always passes
}

#[test]
#[serial]
fn test_cov017_rmsnorm_gpu_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let n = 64u32;
    let input: Vec<f32> = (0..n).map(|i| (i as f32 + 1.0) * 0.1).collect();
    let gamma = vec![1.0f32; n as usize];

    let input_gpu = GpuBuffer::from_host(&executor.context, &input).expect("input buffer");
    let gamma_gpu = GpuBuffer::from_host(&executor.context, &gamma).expect("gamma buffer");

    let result = executor.rmsnorm_gpu(&input_gpu, &gamma_gpu, n, 1e-5);
    assert!(result.is_ok(), "rmsnorm_gpu failed: {:?}", result.err());

    executor.stream.synchronize().expect("sync");
    let output_gpu = result.unwrap();
    let mut output = vec![0.0f32; n as usize];
    output_gpu.copy_to_host(&mut output).expect("copy to host");

    // RMSNorm normalizes - check output has reasonable values
    let l2: f32 = output.iter().map(|x| x * x).sum::<f32>().sqrt();
    assert!(l2 > 0.0, "Output should have non-zero L2 norm");
}

include!("tests_cov017_residual.rs");
include!("tests_cov018_fused.rs");
include!("tests_cov019_gemm.rs");
include!("tests_cov020_gemm.rs");