//! realizar 0.8.5
//!
//! Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors

/// Test transformer_layer_batched positions mismatch
#[test]
#[serial]
fn test_cov030_transformer_layer_batched_positions_mismatch() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Init workspace for batch_size=4
    executor.init_workspace(512, 256).expect("init workspace");
    executor
        .init_batched_workspace(512, 256, 4)
        .expect("init batched");

    let input_data = vec![0.1f32; 512 * 4];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");
    let layer_weights = test_zeroed_layer_weights();

    // Positions length != m
    let result = executor.transformer_layer_batched(
        &input,
        0,
        &layer_weights,
        4,
        &[0, 1], // only 2 positions but m=4
        512,
        256,
        1e-5,
    );
    assert!(result.is_err(), "Should error when positions.len() != m");
}

// =============================================================================
// COV-031: Additional Activation & Attention Coverage Tests
// Target: Improve activations.rs and attention.rs coverage
// =============================================================================
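
// Hedged CPU reference for what rope_into presumably computes, assuming
// GPT-style interleaved pairing: within each head, consecutive pairs
// (x[2i], x[2i+1]) are rotated by the angle position * theta^(-2i/head_dim).
// This helper is a hypothetical sketch for readers, not part of the
// realizar API, and is not called by the GPU tests.
#[allow(dead_code)]
fn rope_reference_interleaved(
    x: &mut [f32],
    position: u32,
    num_heads: u32,
    head_dim: u32,
    theta: f32,
) {
    let hd = head_dim as usize;
    for h in 0..num_heads as usize {
        let head = &mut x[h * hd..(h + 1) * hd];
        for i in 0..hd / 2 {
            // Per-pair rotation frequency decreases with pair index i.
            let freq = theta.powf(-2.0 * i as f32 / hd as f32);
            let (sin, cos) = (position as f32 * freq).sin_cos();
            let (a, b) = (head[2 * i], head[2 * i + 1]);
            head[2 * i] = a * cos - b * sin;
            head[2 * i + 1] = a * sin + b * cos;
        }
    }
}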

/// Test rope_into basic operation
#[test]
#[serial]
fn test_cov031_rope_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 8u32;
    let head_dim = 64u32;
    let n = num_heads * head_dim;
    let theta = 10000.0f32;
    let position = 5u32;

    // Create input/output buffers
    let input_data = vec![1.0f32; n as usize];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");
    let output = GpuBuffer::new(executor.context(), n as usize).expect("output");

    let result = executor.rope_into(&input, &output, position, num_heads, head_dim, theta);
    assert!(
        result.is_ok(),
        "rope_into should succeed: {:?}",
        result.err()
    );
}

/// Test rope_into with different positions
#[test]
#[serial]
fn test_cov031_rope_into_positions() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 8u32;
    let head_dim = 64u32;
    let n = num_heads * head_dim;
    let theta = 10000.0f32;

    let input_data = vec![1.0f32; n as usize];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");
    let output = GpuBuffer::new(executor.context(), n as usize).expect("output");

    // Test various positions
    for position in [0u32, 1, 10, 100, 1000] {
        let result = executor.rope_into(&input, &output, position, num_heads, head_dim, theta);
        assert!(
            result.is_ok(),
            "rope_into at position {} failed: {:?}",
            position,
            result.err()
        );
    }
}

/// Test rope_indirect_into basic operation
#[test]
#[serial]
fn test_cov031_rope_indirect_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 8u32;
    let head_dim = 64u32;
    let n = num_heads * head_dim;
    let theta = 10000.0f32;

    let input_data = vec![1.0f32; n as usize];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");
    let output = GpuBuffer::new(executor.context(), n as usize).expect("output");

    // Position in device buffer (for CUDA graph compatibility)
    let position_buf = GpuBuffer::from_host(executor.context(), &[5u32]).expect("position buf");

    let result =
        executor.rope_indirect_into(&input, &output, &position_buf, num_heads, head_dim, theta);
    assert!(
        result.is_ok(),
        "rope_indirect_into should succeed: {:?}",
        result.err()
    );
}

/// Test rope_neox_into basic operation
#[test]
#[serial]
fn test_cov031_rope_neox_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 8u32;
    let head_dim = 64u32;
    let n = num_heads * head_dim;
    let theta = 10000.0f32;
    let position = 5u32;

    let input_data = vec![1.0f32; n as usize];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");
    let output = GpuBuffer::new(executor.context(), n as usize).expect("output");

    let result = executor.rope_neox_into(&input, &output, position, num_heads, head_dim, theta);
    assert!(
        result.is_ok(),
        "rope_neox_into should succeed: {:?}",
        result.err()
    );
}
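
// Hedged CPU sketch of the NeoX-style rotation that rope_neox_into is
// presumably named after: instead of interleaved pairs, element i in the
// first half of each head is paired with element i + head_dim/2 in the
// second half. Hypothetical reference only; not part of the realizar API.
#[allow(dead_code)]
fn rope_reference_neox(x: &mut [f32], position: u32, num_heads: u32, head_dim: u32, theta: f32) {
    let hd = head_dim as usize;
    let half = hd / 2;
    for h in 0..num_heads as usize {
        let head = &mut x[h * hd..(h + 1) * hd];
        for i in 0..half {
            let freq = theta.powf(-2.0 * i as f32 / hd as f32);
            let (sin, cos) = (position as f32 * freq).sin_cos();
            let (a, b) = (head[i], head[i + half]);
            head[i] = a * cos - b * sin;
            head[i + half] = a * sin + b * cos;
        }
    }
}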

/// Test rope_neox_indirect_into basic operation
#[test]
#[serial]
fn test_cov031_rope_neox_indirect_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 8u32;
    let head_dim = 64u32;
    let n = num_heads * head_dim;
    let theta = 10000.0f32;

    let input_data = vec![1.0f32; n as usize];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");
    let output = GpuBuffer::new(executor.context(), n as usize).expect("output");
    let position_buf = GpuBuffer::from_host(executor.context(), &[10u32]).expect("position buf");

    let result = executor.rope_neox_indirect_into(
        &input,
        &output,
        &position_buf,
        num_heads,
        head_dim,
        theta,
    );
    assert!(
        result.is_ok(),
        "rope_neox_indirect_into should succeed: {:?}",
        result.err()
    );
}

/// Test fused_qkv_into basic operation
#[test]
#[serial]
fn test_cov031_fused_qkv_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Use small dimensions to avoid memory issues.
    // fused_qkv_into expects: x[hidden_dim], w_q[hidden_dim, hidden_dim],
    // w_k[hidden_dim, kv_dim], w_v[hidden_dim, kv_dim],
    // out_q[hidden_dim], out_k[kv_dim], out_v[kv_dim].
    let hidden_dim = 64u32;
    let kv_dim = 32u32; // GQA with fewer KV heads

    // Create weight matrices as f32 GpuBuffers
    let w_q_data = vec![0.01f32; (hidden_dim * hidden_dim) as usize]; // Q output is hidden_dim
    let w_k_data = vec![0.01f32; (hidden_dim * kv_dim) as usize];
    let w_v_data = vec![0.01f32; (hidden_dim * kv_dim) as usize];

    let w_q = GpuBuffer::from_host(executor.context(), &w_q_data).expect("w_q");
    let w_k = GpuBuffer::from_host(executor.context(), &w_k_data).expect("w_k");
    let w_v = GpuBuffer::from_host(executor.context(), &w_v_data).expect("w_v");

    let input_data = vec![0.1f32; hidden_dim as usize];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");

    // Q output has hidden_dim elements, K/V have kv_dim elements
    let q_out = GpuBuffer::new(executor.context(), hidden_dim as usize).expect("q_out");
    let k_out = GpuBuffer::new(executor.context(), kv_dim as usize).expect("k_out");
    let v_out = GpuBuffer::new(executor.context(), kv_dim as usize).expect("v_out");

    let result = executor.fused_qkv_into(
        &input, &w_q, &w_k, &w_v, &q_out, &k_out, &v_out, hidden_dim, kv_dim,
    );
    assert!(
        result.is_ok(),
        "fused_qkv_into should succeed: {:?}",
        result.err()
    );

    // Synchronize to catch any kernel errors before test ends
    executor.synchronize().expect("sync");
}
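
// Hedged CPU sketch of the math behind fused_qkv_into: three independent
// matrix-vector products (q = W_q x, k = W_k x, v = W_v x) that a fused
// kernel can launch together so `x` is read only once. Assumes a row-major
// [rows, hidden_dim] weight layout; the actual kernel layout may differ.
// Hypothetical helper, not part of the realizar API.
#[allow(dead_code)]
fn matvec_reference(w: &[f32], x: &[f32], rows: usize) -> Vec<f32> {
    let cols = x.len();
    (0..rows)
        .map(|r| (0..cols).map(|c| w[r * cols + c] * x[c]).sum())
        .collect()
}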

/// Test fused_gate_up_into basic operation
#[test]
#[serial]
fn test_cov031_fused_gate_up_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Use smaller dimensions to avoid memory issues
    let hidden_dim = 64u32;
    let intermediate_dim = 128u32;

    // Create weight matrices as f32 GpuBuffers
    let w_gate_data = vec![0.01f32; (hidden_dim * intermediate_dim) as usize];
    let w_up_data = vec![0.01f32; (hidden_dim * intermediate_dim) as usize];

    let w_gate = GpuBuffer::from_host(executor.context(), &w_gate_data).expect("w_gate");
    let w_up = GpuBuffer::from_host(executor.context(), &w_up_data).expect("w_up");

    let input_data = vec![0.1f32; hidden_dim as usize];
    let input = GpuBuffer::from_host(executor.context(), &input_data).expect("input");

    let output = GpuBuffer::new(executor.context(), intermediate_dim as usize).expect("output");

    let result = executor.fused_gate_up_into(
        &input,
        &w_gate,
        &w_up,
        &output,
        hidden_dim,
        intermediate_dim,
    );
    assert!(
        result.is_ok(),
        "fused_gate_up_into should succeed: {:?}",
        result.err()
    );

    // Synchronize to catch any kernel errors before test ends
    executor.synchronize().expect("sync");
}
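
// Hedged CPU sketch of the fused gate/up (SwiGLU-style) activation that
// fused_gate_up_into presumably computes: out = silu(W_gate x) * (W_up x),
// where silu(z) = z / (1 + e^{-z}). Reuses the hypothetical
// matvec_reference sketch above; not part of the realizar API.
#[allow(dead_code)]
fn swiglu_reference(w_gate: &[f32], w_up: &[f32], x: &[f32], intermediate_dim: usize) -> Vec<f32> {
    let gate = matvec_reference(w_gate, x, intermediate_dim);
    let up = matvec_reference(w_up, x, intermediate_dim);
    gate.iter()
        .zip(&up)
        .map(|(&g, &u)| (g / (1.0 + (-g).exp())) * u)
        .collect()
}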

/// Test incremental_attention_into basic operation
#[test]
#[serial]
fn test_cov031_incremental_attention_into_basic() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 4usize;
    let num_kv_heads = 4usize;
    let head_dim = 8usize;
    let q_dim = num_heads * head_dim;
    let kv_dim = num_kv_heads * head_dim;

    // Initialize KV cache first (required for incremental attention)
    executor
        .init_kv_cache_gpu(1, num_heads, num_kv_heads, head_dim, 16)
        .expect("init kv");

    let q_data = vec![0.1f32; q_dim];
    let k_data = vec![0.1f32; kv_dim];
    let v_data = vec![0.1f32; kv_dim];

    let q_buf = GpuBuffer::from_host(executor.context(), &q_data).expect("q_buf");
    let k_buf = GpuBuffer::from_host(executor.context(), &k_data).expect("k_buf");
    let v_buf = GpuBuffer::from_host(executor.context(), &v_data).expect("v_buf");
    let out_buf = GpuBuffer::new(executor.context(), q_dim).expect("out_buf");

    let result = executor.incremental_attention_into(0, &q_buf, &k_buf, &v_buf, &out_buf);
    assert!(
        result.is_ok(),
        "incremental_attention_into should succeed: {:?}",
        result.err()
    );
}
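
// Hedged CPU sketch of single-query attention over a cached sequence, which
// is roughly what incremental_attention_into computes per head at decode
// time: scores = softmax(q . K^T / sqrt(head_dim)), out = scores . V.
// k_cache/v_cache are flattened [seq_len, head_dim]. Hypothetical reference
// only; the GPU kernel's cache layout may differ.
#[allow(dead_code)]
fn single_query_attention_reference(
    q: &[f32],
    k_cache: &[f32],
    v_cache: &[f32],
    head_dim: usize,
) -> Vec<f32> {
    let seq_len = k_cache.len() / head_dim;
    let scale = 1.0 / (head_dim as f32).sqrt();
    let mut scores: Vec<f32> = (0..seq_len)
        .map(|t| {
            let k = &k_cache[t * head_dim..(t + 1) * head_dim];
            q.iter().zip(k).map(|(a, b)| a * b).sum::<f32>() * scale
        })
        .collect();
    // Numerically stable softmax over the attention scores.
    let max = scores.iter().fold(f32::NEG_INFINITY, |m, &s| m.max(s));
    let sum: f32 = scores
        .iter_mut()
        .map(|s| {
            *s = (*s - max).exp();
            *s
        })
        .sum();
    scores.iter_mut().for_each(|s| *s /= sum);
    // Weighted sum of cached values.
    let mut out = vec![0.0f32; head_dim];
    for (t, &w) in scores.iter().enumerate() {
        for d in 0..head_dim {
            out[d] += w * v_cache[t * head_dim + d];
        }
    }
    out
}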

/// Test batched_incremental_attention_into with batch_size=2
#[test]
#[serial]
fn test_cov031_batched_incremental_attention_into() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 4usize;
    let num_kv_heads = 4usize;
    let head_dim = 8usize;
    let batch_size = 2usize;
    let q_dim = num_heads * head_dim;
    let kv_dim = num_kv_heads * head_dim;

    // Initialize KV cache first (required for batched attention)
    executor
        .init_kv_cache_gpu(1, num_heads, num_kv_heads, head_dim, 16)
        .expect("init kv");
    executor
        .init_batched_kv_cache_gpu(1, batch_size)
        .expect("init batched kv");

    let q_data = vec![0.1f32; q_dim * batch_size];
    let k_data = vec![0.1f32; kv_dim * batch_size];
    let v_data = vec![0.1f32; kv_dim * batch_size];

    let q_buf = GpuBuffer::from_host(executor.context(), &q_data).expect("q_buf");
    let k_buf = GpuBuffer::from_host(executor.context(), &k_data).expect("k_buf");
    let v_buf = GpuBuffer::from_host(executor.context(), &v_data).expect("v_buf");
    let out_buf = GpuBuffer::new(executor.context(), q_dim * batch_size).expect("out_buf");

    let positions = vec![0u32; batch_size];

    let result = executor.batched_incremental_attention_into(
        0, &q_buf, &k_buf, &v_buf, &out_buf, batch_size, &positions,
    );
    assert!(
        result.is_ok(),
        "batched_incremental_attention_into should succeed: {:?}",
        result.err()
    );
}

/// Test flash_decoding_attention_into without init
#[test]
#[serial]
fn test_cov031_flash_decoding_not_init() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let seq_len = 16usize;
    let head_dim = 64usize;
    let n = seq_len * head_dim;

    let q_data = vec![0.1f32; n];
    let k_data = vec![0.1f32; n];
    let v_data = vec![0.1f32; n];

    let q_buf = GpuBuffer::from_host(executor.context(), &q_data).expect("q_buf");
    let k_buf = GpuBuffer::from_host(executor.context(), &k_data).expect("k_buf");
    let v_buf = GpuBuffer::from_host(executor.context(), &v_data).expect("v_buf");
    let out_buf = GpuBuffer::new(executor.context(), n).expect("out_buf");

    // Without init_flash_decoding, this call should return an error.
    let positions = vec![0u32; 1];
    let result =
        executor.flash_decoding_attention_into(0, &q_buf, &k_buf, &v_buf, &out_buf, 1, &positions);
    assert!(result.is_err(), "flash_decoding should error without init");
}

/// Test init_flash_decoding and flash_decoding_attention_into
#[test]
#[serial]
fn test_cov031_flash_decoding_with_init() {
    if !CudaExecutor::is_available() {
        return;
    }
    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let num_heads = 8usize;
    let head_dim = 64usize;
    let max_seq_len = 128usize;
    let batch_size = 1usize;

    // Initialize flash decoding (num_heads, head_dim, max_seq_len, batch_size)
    let init_result = executor.init_flash_decoding(num_heads, head_dim, max_seq_len, batch_size);
    assert!(
        init_result.is_ok(),
        "init_flash_decoding should succeed: {:?}",
        init_result.err()
    );

    // Now try flash decoding attention
    let q_dim = num_heads * head_dim;

    let q_data = vec![0.1f32; q_dim];
    let k_data = vec![0.1f32; q_dim];
    let v_data = vec![0.1f32; q_dim];

    let q_buf = GpuBuffer::from_host(executor.context(), &q_data).expect("q_buf");
    let k_buf = GpuBuffer::from_host(executor.context(), &k_data).expect("k_buf");
    let v_buf = GpuBuffer::from_host(executor.context(), &v_data).expect("v_buf");
    let out_buf = GpuBuffer::new(executor.context(), q_dim).expect("out_buf");

    let positions = vec![0u32; batch_size];
    let result = executor
        .flash_decoding_attention_into(0, &q_buf, &k_buf, &v_buf, &out_buf, batch_size, &positions);
    // Note: flash decoding may still fail if the KV cache is not
    // initialized, but this at least covers the init path.
    let _ = result;
}
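
// Hedged sketch of the split-K merge at the heart of flash decoding: the KV
// sequence is processed in independent chunks, each producing a partial
// weighted-value sum plus its local softmax max and denominator, and the
// partials are combined with a log-sum-exp rescale. Hypothetical helper
// (assumes a non-empty `partials` slice); the real kernel's data layout is
// not shown here.
#[allow(dead_code)]
fn merge_flash_decoding_partials(partials: &[(f32, f32, Vec<f32>)]) -> Vec<f32> {
    // Each partial is (local_max, local_denominator, weighted_value_sum).
    let global_max = partials.iter().fold(f32::NEG_INFINITY, |m, p| m.max(p.0));
    let dim = partials[0].2.len();
    let mut denom = 0.0f32;
    let mut out = vec![0.0f32; dim];
    for (local_max, local_denom, values) in partials {
        // Rescale each chunk into the global softmax frame.
        let scale = (local_max - global_max).exp();
        denom += local_denom * scale;
        for i in 0..dim {
            out[i] += values[i] * scale;
        }
    }
    out.iter_mut().for_each(|o| *o /= denom);
    out
}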