singe-cuda 0.1.0-alpha.8

#![cfg(feature = "testing")]

use singe_cuda::{
    cuda_module, error::Error, memory::DeviceMemory, module::LaunchConfig,
    stream::StreamCaptureMode,
};

cuda_module! {
    pub mod scale_kernel {
        source: r#"
            extern "C" __global__ void scale_add(
                const float* input,
                float* output,
                float alpha,
                int len
            ) {
                int i = static_cast<int>(
                    blockIdx.x * blockDim.x + threadIdx.x
                );
                if (i < len) {
                    output[i] = input[i] * alpha + 1.0f;
                }
            }
        "#,
    }
}

cuda_module! {
    pub mod header_kernel {
        source: r#"
        #include "common.cuh"

        extern "C" __global__ void add_bias(float* values, int len) {
            int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
            if (i < len) {
                values[i] += CUDA_MODULE_BIAS;
            }
        }
        "#,
        headers: {
            "common.cuh" => r#"
            #pragma once
            #define CUDA_MODULE_BIAS 3.0f
            "#,
        },
        compile: {
            nvcc_args: ["--std=c++20"],
            nvrtc_args: ["--std=c++20"],
        },
    }
}

cuda_module! {
    pub mod exported_kernel {
        source: r#"
        extern "C" __global__ void keep_me(float* values, int len) {
            int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
            if (i < len) {
                values[i] *= 2.0f;
            }
        }

        extern "C" __global__ void skip_me(float* values, int len) {
            int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
            if (i < len) {
                values[i] = -999.0f;
            }
        }
        "#,
        exports: {
            keep_me as double_values,
        },
    }
}

cuda_module! {
    pub mod typed_kernel {
        source: r#"
        extern "C" __global__ void typed_params(
            const float* input,
            float* output,
            size_t len,
            unsigned int flags
        ) {
            size_t i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < len && flags != 0) {
                output[i] = input[i];
            }
        }
        "#,
    }
}

#[test]
fn launches_kernel_from_generated_module() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();

    let input = vec![1.0f32, 2.0, 3.5, -4.0, 8.25];
    let mut output = vec![0.0f32; input.len()];
    let alpha = 2.5f32;
    let length = input.len() as i32;

    let input_device = DeviceMemory::from_slice(&input).unwrap();
    let mut output_device = DeviceMemory::<f32>::zeroes(output.len()).unwrap();

    let module = scale_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(input.len(), 128);
    unsafe {
        module
            .scale_add_with_memory(&config, &input_device, &mut output_device, alpha, length)
            .unwrap();
    }

    output_device.copy_to_host(&mut output).unwrap();

    let expected = input
        .iter()
        .map(|value| value * alpha + 1.0)
        .collect::<Vec<_>>();
    assert_eq!(output, expected);
}

#[test]
fn launches_raw_pointer_kernel_from_generated_module() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();

    let input = vec![1.0f32, 2.0, 3.5, -4.0, 8.25];
    let mut output = vec![0.0f32; input.len()];
    let alpha = 2.5f32;
    let length = input.len() as i32;

    let input_device = DeviceMemory::from_slice(&input).unwrap();
    let output_device = DeviceMemory::<f32>::zeroes(output.len()).unwrap();

    let module = scale_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(input.len(), 128);
    unsafe {
        module
            .scale_add(
                &config,
                input_device.as_ptr(),
                output_device.as_mut_ptr(),
                alpha,
                length,
            )
            .unwrap();
    }

    output_device.copy_to_host(&mut output).unwrap();

    let expected = input
        .iter()
        .map(|value| value * alpha + 1.0)
        .collect::<Vec<_>>();
    assert_eq!(output, expected);
}

#[test]
fn launches_kernel_on_stream_from_generated_module() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();
    let stream = ctx.create_stream().unwrap();

    let input = vec![1.0f32, 2.0, 3.5, -4.0, 8.25];
    let mut output = vec![0.0f32; input.len()];
    let alpha = 2.5f32;
    let length = input.len() as i32;

    let input_device = DeviceMemory::from_slice(&input).unwrap();
    let mut output_device = DeviceMemory::<f32>::zeroes(output.len()).unwrap();

    let module = scale_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(input.len(), 128);
    unsafe {
        module
            .scale_add_with_memory_on(
                &config,
                &stream,
                &input_device,
                &mut output_device,
                alpha,
                length,
            )
            .unwrap();
    }
    stream.synchronize().unwrap();

    output_device.copy_to_host(&mut output).unwrap();

    let expected = input
        .iter()
        .map(|value| value * alpha + 1.0)
        .collect::<Vec<_>>();
    assert_eq!(output, expected);
}

#[test]
fn updates_kernel_node_params_from_generated_module() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();
    let stream = ctx.create_stream().unwrap();

    let input = vec![1.0f32, 2.0, 3.5, -4.0, 8.25];
    let mut output = vec![0.0f32; input.len()];
    let alpha = 2.5f32;
    let length = input.len() as i32;

    let input_device = DeviceMemory::from_slice(&input).unwrap();
    let mut output_device = DeviceMemory::<f32>::zeroes(output.len()).unwrap();

    let module = scale_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(input.len(), 128);
    let mut graph = ctx.create_graph().unwrap();
    let node = unsafe {
        module
            .scale_add_with_memory_node(
                &mut graph,
                &[],
                &config,
                &input_device,
                &mut output_device,
                0.0,
                length,
            )
            .unwrap()
    };
    let topology = graph.topology_summary().unwrap();
    assert_eq!(topology.nodes, 1);
    assert_eq!(topology.root_nodes, 1);
    assert_eq!(topology.edges, 0);
    assert_eq!(topology.kernel_nodes, 1);
    assert_eq!(topology.child_graph_nodes, 0);

    let mut executable = graph.instantiate().unwrap();
    drop(graph);
    unsafe {
        module
            .scale_add_with_memory_set_node_params(
                &mut executable,
                node,
                &config,
                &input_device,
                &mut output_device,
                alpha,
                length,
            )
            .unwrap();
    }
    executable.launch(&stream).unwrap();
    stream.synchronize().unwrap();

    output_device.copy_to_host(&mut output).unwrap();

    let expected = input
        .iter()
        .map(|value| value * alpha + 1.0)
        .collect::<Vec<_>>();
    assert_eq!(output, expected);
}

#[test]
fn rejects_node_from_different_graph() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();

    let mut graph = ctx.create_graph().unwrap();
    let node = graph.add_empty_node(&[]).unwrap();
    let mut other_graph = ctx.create_graph().unwrap();

    assert!(matches!(
        other_graph.add_empty_node(&[node]),
        Err(Error::GraphNodeMismatch)
    ));

    let mut executable = graph.instantiate().unwrap();
    let other_node = other_graph.add_empty_node(&[]).unwrap();
    assert!(matches!(
        executable.disable_node(other_node),
        Err(Error::GraphNodeMismatch)
    ));
}

#[test]
fn child_graph_returns_borrowed_graph() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();

    let mut child = ctx.create_graph().unwrap();
    child.add_empty_node(&[]).unwrap();

    let mut parent = ctx.create_graph().unwrap();
    let child_node = parent.add_child_graph_node(&[], &child).unwrap();
    let borrowed_child = child_node.child_graph().unwrap();

    assert_eq!(borrowed_child.topology_summary().unwrap().nodes, 1);
    assert_eq!(borrowed_child.context(), Some(ctx.as_ref()));
}

#[test]
fn graph_kernel_node_copies_borrowed_scalar_arguments() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();
    let stream = ctx.create_stream().unwrap();

    let input = vec![1.0f32, 2.0, 3.5, -4.0, 8.25];
    let mut output = vec![0.0f32; input.len()];
    let mut alpha = 2.5f32;
    let mut length = input.len() as i32;

    let input_device = DeviceMemory::from_slice(&input).unwrap();
    let mut output_device = DeviceMemory::<f32>::zeroes(output.len()).unwrap();

    let module = scale_kernel::Module::create(&ctx).unwrap();
    let function = module.raw().function("scale_add").unwrap();
    let config = LaunchConfig::for_1d_grid(input.len(), 128);
    let mut params = singe_cuda::module::KernelParameters::new();
    params
        .device_slice(&input_device)
        .device_slice_mut(&mut output_device)
        .arg(&alpha)
        .arg(&length);

    let mut graph = ctx.create_graph().unwrap();
    unsafe {
        function
            .add_to_graph(&mut graph, &[], &config, &mut params)
            .unwrap();
    }
    drop(params);
    alpha = -100.0;
    length = 0;

    let executable = graph.instantiate().unwrap();
    executable.launch(&stream).unwrap();
    stream.synchronize().unwrap();

    output_device.copy_to_host(&mut output).unwrap();

    let expected = input
        .iter()
        .map(|value| value * 2.5 + 1.0)
        .collect::<Vec<_>>();
    assert_eq!(output, expected);
    assert_eq!(alpha, -100.0);
    assert_eq!(length, 0);
}

#[test]
fn records_kernel_operation_during_capture() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();
    let stream = ctx.create_stream().unwrap();

    let input = vec![1.0f32, 2.0, 3.5, -4.0, 8.25];
    let mut output = vec![0.0f32; input.len()];
    let alpha = 2.5f32;
    let length = input.len() as i32;

    let input_device = DeviceMemory::from_slice(&input).unwrap();
    let mut output_device = DeviceMemory::<f32>::zeroes(output.len()).unwrap();

    let module = scale_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(input.len(), 128);

    let executable = stream
        .capture_executable(StreamCaptureMode::Relaxed, |scope| unsafe {
            module.scale_add_with_memory_record(
                scope,
                &config,
                &input_device,
                &mut output_device,
                alpha,
                length,
            )
        })
        .unwrap();

    executable.launch(&stream).unwrap();
    stream.synchronize().unwrap();

    output_device.copy_to_host(&mut output).unwrap();

    let expected = input
        .iter()
        .map(|value| value * alpha + 1.0)
        .collect::<Vec<_>>();
    assert_eq!(output, expected);
}

#[test]
fn adds_kernel_node_from_generated_module() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();
    let stream = ctx.create_stream().unwrap();

    let input = vec![1.0f32, 2.0, 3.5, -4.0, 8.25];
    let mut output = vec![0.0f32; input.len()];
    let alpha = 2.5f32;
    let length = input.len() as i32;

    let input_device = DeviceMemory::from_slice(&input).unwrap();
    let mut output_device = DeviceMemory::<f32>::zeroes(output.len()).unwrap();

    let module = scale_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(input.len(), 128);
    let mut graph = ctx.create_graph().unwrap();
    unsafe {
        module
            .scale_add_with_memory_node(
                &mut graph,
                &[],
                &config,
                &input_device,
                &mut output_device,
                alpha,
                length,
            )
            .unwrap();
    }

    let executable = graph.instantiate().unwrap();
    executable.launch(&stream).unwrap();
    stream.synchronize().unwrap();

    output_device.copy_to_host(&mut output).unwrap();
    let expected = input
        .iter()
        .map(|value| value * alpha + 1.0)
        .collect::<Vec<_>>();
    assert_eq!(output, expected);
}

#[test]
fn launches_kernel_with_headers() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();

    let mut values = vec![1.0f32, 2.5, -3.0, 0.25];
    let length = values.len() as i32;
    let mut values_device = DeviceMemory::from_slice(&values).unwrap();

    let module = header_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(values.len(), 128);
    unsafe {
        module
            .add_bias_with_memory(&config, &mut values_device, length)
            .unwrap();
    }

    values_device.copy_to_host(&mut values).unwrap();
    assert_eq!(values, vec![4.0f32, 5.5, 0.0, 3.25]);
}

#[test]
fn exports_selected_kernel_with_renamed_method() {
    let (_lock, ctx) = singe_cuda::testing::bootstrap().unwrap();

    let mut values = vec![1.0f32, 2.5, -3.0, 0.25];
    let length = values.len() as i32;
    let mut values_device = DeviceMemory::from_slice(&values).unwrap();

    let module = exported_kernel::Module::create(&ctx).unwrap();
    let config = LaunchConfig::for_1d_grid(values.len(), 128);
    unsafe {
        module
            .double_values_with_memory(&config, &mut values_device, length)
            .unwrap();
    }

    values_device.copy_to_host(&mut values).unwrap();
    assert_eq!(values, vec![2.0f32, 5.0, -6.0, 0.5]);
}

#[test]
fn generated_module_uses_source_pointer_and_scalar_types() {
    let _raw_method: unsafe fn(
        &typed_kernel::Module,
        &LaunchConfig,
        *const f32,
        *mut f32,
        usize,
        u32,
    ) -> singe_cuda::error::Result<()> = typed_kernel::Module::typed_params;
    let _memory_method: unsafe fn(
        &typed_kernel::Module,
        &LaunchConfig,
        &DeviceMemory<f32>,
        &mut DeviceMemory<f32>,
        usize,
        u32,
    ) -> singe_cuda::error::Result<()> = typed_kernel::Module::typed_params_with_memory;
    let _memory_record_method: unsafe fn(
        &typed_kernel::Module,
        &singe_cuda::stream::StreamCaptureScope<'_>,
        &LaunchConfig,
        &DeviceMemory<f32>,
        &mut DeviceMemory<f32>,
        usize,
        u32,
    ) -> singe_cuda::error::Result<()> = typed_kernel::Module::typed_params_with_memory_record;
}