realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors

#[cfg(test)]
#[cfg(feature = "cuda")]
mod tests {
    use super::*;
    use crate::cuda::executor::test_fixtures::{
        generate_q4_0_weights, generate_q5_0_weights, generate_q8_0_weights,
    };

    /// Helper to create CudaExecutor for tests
    fn create_executor() -> Option<CudaExecutor> {
        CudaExecutor::new(0).ok()
    }

    // ========================================================================
    // Q8_0 GEMV Tests
    // ========================================================================

    #[test]
    fn test_q8_0_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        // K=256, N=64: 64 output rows, 8 blocks per row (32 elements/block)
        let k = 256u32;
        let n = 64u32;
        let blocks = (n as usize) * (k as usize / 32);
        let weights = generate_q8_0_weights(blocks);

        // Upload weights to GPU
        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let weight_ptr = weight_buf.as_ptr();

        // Create input/output buffers
        let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        // Execute (may fail on PTX generation but exercises path)
        let result = exec.q8_0_gemv_into(weight_ptr, &input_buf, &output_buf, n, k);
        let _ = result;
    }

    #[test]
    fn test_q8_0_gemv_into_large() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let k = 512u32;
        let n = 128u32;
        let blocks = (n as usize) * (k as usize / 32);
        let weights = generate_q8_0_weights(blocks);

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = vec![0.5f32; k as usize];
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q8_0_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    // ========================================================================
    // Q5_0 GEMV Tests
    // ========================================================================

    #[test]
    fn test_q5_0_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let k = 256u32;
        let n = 64u32;
        let blocks = (n as usize) * (k as usize / 32);
        let weights = generate_q5_0_weights(blocks);

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q5_0_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    #[test]
    fn test_q5_0_gemv_into_large() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let k = 512u32;
        let n = 128u32;
        let blocks = (n as usize) * (k as usize / 32);
        let weights = generate_q5_0_weights(blocks);

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = vec![0.5f32; k as usize];
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q5_0_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    // ========================================================================
    // Q4_0 GEMV Tests
    // ========================================================================

    #[test]
    fn test_q4_0_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let k = 256u32;
        let n = 64u32;
        let blocks = (n as usize) * (k as usize / 32);
        let weights = generate_q4_0_weights(blocks);

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q4_0_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    #[test]
    fn test_q4_0_gemv_into_single_row() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let k = 256u32;
        let n = 1u32;
        let blocks = k as usize / 32;
        let weights = generate_q4_0_weights(blocks);

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = vec![1.0f32; k as usize];
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q4_0_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    // ========================================================================
    // Q4_1 GEMV Tests
    // ========================================================================

    #[test]
    fn test_q4_1_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        // Q4_1 has same block count as Q4_0 but 20 bytes per block instead of 18
        let k = 256u32;
        let n = 64u32;
        let blocks = (n as usize) * (k as usize / 32);
        // Use Q4_0 weights (18 bytes/block), Q4_1 expects 20 bytes/block
        // The kernel will interpret incorrectly, but we're testing the path
        let weights = generate_q4_0_weights(blocks);

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q4_1_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    // ========================================================================
    // Q5_K GEMV Tests
    // ========================================================================

    #[test]
    fn test_q5k_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        // Q5_K: 176 bytes per 256 elements (super-block format)
        let k = 256u32;
        let n = 64u32;
        // Q5_K needs k to be multiple of 256
        let superblocks = (n as usize) * (k as usize / 256);
        // Simulate Q5K weight data
        let weights = vec![0u8; superblocks * 176];

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q5k_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    // ========================================================================
    // Q6_K GEMV Tests
    // ========================================================================

    #[test]
    fn test_q6k_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        // Q6_K: 210 bytes per 256 elements
        let k = 256u32;
        let n = 64u32;
        let superblocks = (n as usize) * (k as usize / 256);
        let weights = vec![0u8; superblocks * 210];

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result = exec.q6k_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    #[test]
    fn test_coalesced_q6k_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let k = 256u32;
        let n = 64u32;
        let superblocks = (n as usize) * (k as usize / 256);
        let weights = vec![0u8; superblocks * 210];

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = (0..k as usize).map(|i| (i as f32) * 0.01).collect();
        let output = vec![0.0f32; n as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result =
            exec.coalesced_q6k_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, n, k);
        let _ = result;
    }

    // ========================================================================
    // Batched Q6K GEMV Tests
    // ========================================================================

    #[test]
    fn test_batched_q6k_gemv_into_basic() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let m = 4u32; // batch size
        let k = 256u32;
        let n = 64u32;
        let superblocks = (n as usize) * (k as usize / 256);
        let weights = vec![0u8; superblocks * 210];

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        // Batched input: m * k elements
        let input: Vec<f32> = (0..(m * k) as usize).map(|i| (i as f32) * 0.001).collect();
        // Batched output: m * n elements
        let output = vec![0.0f32; (m * n) as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result =
            exec.batched_q6k_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, m, n, k);
        let _ = result;
    }

    #[test]
    fn test_batched_q6k_gemv_into_m8() {
        let Some(mut exec) = create_executor() else {
            return;
        };

        let m = 8u32;
        let k = 256u32;
        let n = 32u32;
        let superblocks = (n as usize) * (k as usize / 256);
        let weights = vec![0u8; superblocks * 210];

        let weight_buf = GpuBuffer::from_host(&exec.context, &weights).unwrap();
        let input: Vec<f32> = vec![0.5f32; (m * k) as usize];
        let output = vec![0.0f32; (m * n) as usize];
        let input_buf = GpuBuffer::from_host(&exec.context, &input).unwrap();
        let output_buf = GpuBuffer::from_host(&exec.context, &output).unwrap();

        let result =
            exec.batched_q6k_gemv_into(weight_buf.as_ptr(), &input_buf, &output_buf, m, n, k);
        let _ = result;
    }
}