Skip to main content

oxicuda_launch/
lib.rs

1//! # OxiCUDA Launch
2//!
3//! **Type-safe GPU kernel launch infrastructure for the OxiCUDA ecosystem.**
4//!
5//! This crate provides ergonomic, type-safe abstractions for launching CUDA
6//! GPU kernels. It builds on top of [`oxicuda_driver`] to offer:
7//!
8//! - **[`Dim3`]** — 3-dimensional grid and block size specification with
9//!   convenient conversions from `u32`, `(u32, u32)`, and `(u32, u32, u32)`.
10//!
11//! - **[`LaunchParams`]** — kernel launch configuration (grid, block, shared
12//!   memory) with a builder pattern via [`LaunchParamsBuilder`].
13//!
14//! - **[`Kernel`]** — a launchable kernel wrapper that manages module lifetime
15//!   via `Arc<Module>` and provides occupancy query delegation.
16//!
17//! - **[`KernelArgs`]** — a trait for type-safe kernel argument passing,
18//!   implemented for tuples of `Copy` types up to 24 elements.
19//!
20//! - **[`launch!`]** — a convenience macro for concise kernel launches.
21//!
22//! - **[`grid_size_for`]** — a helper to compute the minimum grid size
23//!   needed to cover a given number of work items.
24//!
25//! ## Quick start
26//!
27//! ```rust,no_run
28//! use std::sync::Arc;
29//! use oxicuda_driver::{init, Device, Context, Module, Stream};
30//! use oxicuda_launch::{Kernel, LaunchParams, Dim3, grid_size_for, launch};
31//!
32//! # fn main() -> oxicuda_driver::CudaResult<()> {
33//! init()?;
34//! let dev = Device::get(0)?;
35//! let ctx = Arc::new(Context::new(&dev)?);
36//!
37//! // Load PTX and create a kernel.
38//! let ptx = ""; // In practice, use include_str! or load from file.
39//! let module = Arc::new(Module::from_ptx(ptx)?);
40//! let kernel = Kernel::from_module(module, "vector_add")?;
41//!
42//! // Configure launch dimensions.
43//! let n: u32 = 1024;
44//! let block_size = 256u32;
45//! let grid = grid_size_for(n, block_size);
46//!
47//! let stream = Stream::new(&ctx)?;
48//!
49//! // Launch with the macro.
50//! let (a_ptr, b_ptr, c_ptr) = (0u64, 0u64, 0u64);
51//! launch!(kernel, grid(grid), block(block_size), &stream, &(a_ptr, b_ptr, c_ptr, n))?;
52//!
53//! stream.synchronize()?;
54//! # Ok(())
55//! # }
56//! ```
57//!
58//! ## Crate features
59//!
60//! | Feature     | Description                              |
61//! |-------------|------------------------------------------|
62//! | `gpu-tests` | Enable tests that require a physical GPU |
63
// Crate-wide lint configuration.
// Undocumented public items and anything in the `clippy::all` lint group are
// reported as warnings.
64#![warn(missing_docs)]
65#![warn(clippy::all)]
// The four clippy lints below are silenced for the whole crate.
// NOTE(review): the rationale for each `allow` is not recorded in this file —
// confirm it and document it next to the attribute.
66#![allow(clippy::module_name_repetitions)]
67#![allow(clippy::missing_safety_doc)]
68#![allow(clippy::too_many_arguments)]
69#![allow(clippy::type_complexity)]
70
71// ---------------------------------------------------------------------------
72// Module declarations
73// ---------------------------------------------------------------------------
74
// All fifteen modules are public, so their items can be reached by full path
// (e.g. `oxicuda_launch::grid::Dim3`) as well as through the crate-root
// re-exports. `macros` is the only module in this list that has no matching
// crate-root `pub use` in this file.
75pub mod arg_serialize;
76pub mod async_launch;
77pub mod cluster;
78pub mod cooperative;
79pub mod dynamic_parallelism;
80pub mod error;
81pub mod graph_launch;
82pub mod grid;
83pub mod kernel;
84pub mod macros;
85pub mod multi_stream;
86pub mod named_args;
87pub mod params;
88pub mod telemetry;
89pub mod trace;
90
91// ---------------------------------------------------------------------------
92// Re-exports
93// ---------------------------------------------------------------------------
94
95pub use arg_serialize::{
96    ArgType, LaunchLog, LaunchLogger, LaunchSummary, SerializableKernelArgs, SerializedArg,
97};
98pub use async_launch::{
99    AsyncKernel, AsyncLaunchConfig, CompletionStatus, ErasedKernelArgs, LaunchCompletion,
100    LaunchTiming, PollStrategy, TimedLaunchCompletion, multi_launch_async,
101};
102pub use cluster::{ClusterDim, ClusterLaunchParams, cluster_launch};
103pub use cooperative::CooperativeLaunch;
104pub use dynamic_parallelism::{
105    ChildKernelSpec, DynamicLaunchPlan, DynamicParallelismConfig, GridSpec,
106};
107pub use error::LaunchError;
108pub use graph_launch::{GraphLaunchCapture, LaunchRecord};
109pub use grid::{Dim3, auto_grid_2d, auto_grid_for, grid_size_for};
110pub use kernel::{Kernel, KernelArgs};
111pub use multi_stream::{multi_stream_launch, multi_stream_launch_uniform};
112pub use named_args::{ArgBuilder, NamedKernelArgs};
113pub use params::{LaunchParams, LaunchParamsBuilder};
114pub use telemetry::{
115    KernelStats, LaunchTelemetry, TelemetryCollector, TelemetryExporter, TelemetrySummary,
116    estimate_occupancy,
117};
118pub use trace::KernelSpanGuard;
119
120// ---------------------------------------------------------------------------
121// Prelude
122// ---------------------------------------------------------------------------
123
124/// Convenient glob import for the commonly used OxiCUDA Launch types and functions.
125///
126/// ```rust
127/// use oxicuda_launch::prelude::*;
128/// ```
///
/// Every name in the prelude is a re-export of an item that is also available
/// at the crate root.
///
/// The following crate-root re-exports are *not* part of the prelude and have
/// to be imported explicitly: `ErasedKernelArgs`, `KernelSpanGuard`,
/// `estimate_occupancy`, and the telemetry types `KernelStats`,
/// `LaunchTelemetry`, `TelemetryCollector`, `TelemetryExporter`, and
/// `TelemetrySummary`.
///
/// The `launch!` macro is not listed here either; the crate-level quick start
/// imports it from the crate root (`use oxicuda_launch::launch;`).
129pub mod prelude {
130    pub use crate::{
131        ArgBuilder, ArgType, AsyncKernel, AsyncLaunchConfig, ChildKernelSpec, ClusterDim,
132        ClusterLaunchParams, CompletionStatus, CooperativeLaunch, Dim3, DynamicLaunchPlan,
133        DynamicParallelismConfig, GraphLaunchCapture, GridSpec, Kernel, KernelArgs,
134        LaunchCompletion, LaunchError, LaunchLog, LaunchLogger, LaunchParams, LaunchParamsBuilder,
135        LaunchRecord, LaunchSummary, LaunchTiming, NamedKernelArgs, PollStrategy,
136        SerializableKernelArgs, SerializedArg, TimedLaunchCompletion, auto_grid_2d, auto_grid_for,
137        cluster_launch, grid_size_for, multi_launch_async, multi_stream_launch,
138        multi_stream_launch_uniform,
139    };
140}