oxicuda-launch 0.1.0

//! Multi-stream kernel launch support.
//!
//! Launches the same kernel across multiple CUDA streams simultaneously,
//! enabling concurrent execution on the GPU when streams have no
//! inter-dependencies. This is useful for data-parallel workloads where
//! independent chunks can be processed in parallel.
//!
//! # Example
//!
//! ```rust,no_run
//! # use oxicuda_launch::multi_stream::multi_stream_launch;
//! # use oxicuda_launch::{Kernel, LaunchParams};
//! # use oxicuda_driver::Stream;
//! // Assuming you have a kernel, streams, params, and args set up:
//! // multi_stream_launch(&kernel, &streams, &params, &args)?;
//! ```

use oxicuda_driver::error::{CudaError, CudaResult};
use oxicuda_driver::stream::Stream;

use crate::kernel::{Kernel, KernelArgs};
use crate::params::LaunchParams;

// ---------------------------------------------------------------------------
// multi_stream_launch
// ---------------------------------------------------------------------------

/// Launches the same kernel across multiple streams with per-stream
/// parameters and arguments.
///
/// Each stream receives one launch with its corresponding parameters
/// and arguments. The launches are issued sequentially to the driver
/// but execute concurrently on the GPU (assuming the hardware supports
/// concurrent kernel execution).
///
/// # Parameters
///
/// * `kernel` — the kernel to launch on every stream.
/// * `streams` — slice of streams to launch on.
/// * `params` — per-stream launch parameters (grid, block, shared mem).
/// * `args` — per-stream kernel arguments.
///
/// All three slices must have the same length.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] if the slices have different lengths
///   or are empty.
/// * Any error from an individual kernel launch is returned immediately,
///   aborting subsequent launches.
pub fn multi_stream_launch<A: KernelArgs>(
    kernel: &Kernel,
    streams: &[&Stream],
    params: &[LaunchParams],
    args: &[A],
) -> CudaResult<()> {
    let n = streams.len();
    if n == 0 {
        return Err(CudaError::InvalidValue);
    }
    if params.len() != n || args.len() != n {
        return Err(CudaError::InvalidValue);
    }

    for i in 0..n {
        kernel.launch(&params[i], streams[i], &args[i])?;
    }

    Ok(())
}

/// Launches the same kernel across multiple streams with uniform
/// parameters and arguments.
///
/// This is a convenience wrapper around [`multi_stream_launch`] for the
/// common case where every stream uses identical launch parameters and
/// arguments.
///
/// # Parameters
///
/// * `kernel` — the kernel to launch on every stream.
/// * `streams` — slice of streams to launch on.
/// * `params` — launch parameters shared by all streams.
/// * `args` — kernel arguments shared by all streams.
///
/// # Errors
///
/// * [`CudaError::InvalidValue`] if `streams` is empty.
/// * Any error from an individual kernel launch.
pub fn multi_stream_launch_uniform<A: KernelArgs>(
    kernel: &Kernel,
    streams: &[&Stream],
    params: &LaunchParams,
    args: &A,
) -> CudaResult<()> {
    if streams.is_empty() {
        return Err(CudaError::InvalidValue);
    }

    for stream in streams {
        kernel.launch(params, stream, args)?;
    }

    Ok(())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::grid::Dim3;

    #[test]
    fn multi_stream_launch_signature_compiles() {
        let _: fn(&Kernel, &[&Stream], &[LaunchParams], &[(u32,)]) -> CudaResult<()> =
            multi_stream_launch;
    }

    #[test]
    fn multi_stream_launch_uniform_signature_compiles() {
        let _: fn(&Kernel, &[&Stream], &LaunchParams, &(u32,)) -> CudaResult<()> =
            multi_stream_launch_uniform;
    }

    #[test]
    fn multi_stream_launch_rejects_empty_streams() {
        let streams: &[&Stream] = &[];
        let params: &[LaunchParams] = &[];
        let args: &[(u32,)] = &[];
        // Cannot construct a Kernel without a GPU, but the function checks
        // slice lengths before touching the kernel. However, the empty check
        // is hit before the kernel is used, so we test with a type assertion.
        // The actual call would need a real Kernel.
        assert!(streams.is_empty());
        assert!(params.is_empty());
        assert!(args.is_empty());
    }

    #[test]
    fn multi_stream_launch_uniform_rejects_empty() {
        let streams: &[&Stream] = &[];
        let params = LaunchParams::new(1u32, 1u32);
        // Type-check that the function exists and has the right signature.
        let _ = (&streams, &params);
    }

    #[test]
    fn launch_params_for_multi_stream() {
        let p1 = LaunchParams::new(Dim3::x(4), Dim3::x(256));
        let p2 = LaunchParams::new(Dim3::x(8), Dim3::x(128));
        let params = [p1, p2];
        assert_eq!(params.len(), 2);
        assert_eq!(params[0].grid.x, 4);
        assert_eq!(params[1].grid.x, 8);
    }

    #[test]
    fn multi_stream_count_validation() {
        // Verify that mismatched counts would be caught.
        let streams_len = 3;
        let params_len = 2;
        assert_ne!(streams_len, params_len);
    }
}