Skip to main content

baracuda_nccl_sys/
lib.rs

//! Raw FFI + dynamic loader for NVIDIA NCCL (multi-GPU collective communication).
//!
//! `baracuda-nccl` wraps this with a safe, typed API. Use this crate
//! directly only if you need a function that the safe layer hasn't
//! wrapped yet (in which case please file a bug).
//!
//! NCCL is primarily a Linux library; Windows support landed in later NCCL
//! versions but is uncommon. This crate compiles everywhere and defers the
//! "is NCCL actually installed?" question to runtime — [`nccl()`] returns
//! `LoaderError::LibraryNotFound` on hosts without NCCL.
11
12#![allow(non_camel_case_types, non_snake_case, non_upper_case_globals)]
13#![warn(missing_debug_implementations)]
14
15use core::ffi::{c_int, c_void};
16use std::sync::OnceLock;
17
18use baracuda_core::{Library, LoaderError};
19use baracuda_cuda_sys::runtime::cudaStream_t;
20use baracuda_types::CudaStatus;
21
/// Opaque NCCL communicator handle.
///
/// Declared as an untyped raw pointer: nothing in this file dereferences it,
/// it is only threaded through the `PFN_nccl*` function-pointer types.
pub type ncclComm_t = *mut c_void;
24
/// A 128-byte unique identifier for multi-process NCCL initialization.
///
/// Filled in through a pointer by [`PFN_ncclGetUniqueId`] and passed by value
/// to [`PFN_ncclCommInitRank`] / [`PFN_ncclCommInitRankConfig`].
#[repr(C)]
#[derive(Copy, Clone)]
pub struct ncclUniqueId {
    /// Raw identifier bytes; this file never interprets their contents.
    pub internal: [i8; 128],
}

impl core::fmt::Debug for ncclUniqueId {
    /// Formats as `ncclUniqueId { .. }`; the 128 raw bytes are not printed.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("ncclUniqueId").finish_non_exhaustive()
    }
}

impl Default for ncclUniqueId {
    /// Returns an identifier whose 128 bytes are all zero.
    fn default() -> Self {
        // Written by hand: `Default` cannot be derived here because the
        // standard library only implements it for arrays of up to 32 elements.
        Self { internal: [0; 128] }
    }
}
44
/// NCCL element data type.
///
/// `#[repr(i32)]` with explicit discriminants, so every variant has a fixed
/// 32-bit integer value when it is passed through the `PFN_nccl*`
/// function-pointer types.
#[repr(i32)]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ncclDataType_t {
    /// 8-bit signed integer element (value `0`).
    Int8 = 0,
    /// 8-bit unsigned integer element (value `1`).
    Uint8 = 1,
    /// 32-bit signed integer element (value `2`).
    Int32 = 2,
    /// 32-bit unsigned integer element (value `3`).
    Uint32 = 3,
    /// 64-bit signed integer element (value `4`).
    Int64 = 4,
    /// 64-bit unsigned integer element (value `5`).
    Uint64 = 5,
    /// IEEE-754 binary16 (fp16) element (value `6`).
    Float16 = 6,
    /// IEEE-754 binary32 (fp32) element (value `7`).
    Float32 = 7,
    /// IEEE-754 binary64 (fp64) element (value `8`).
    Float64 = 8,
    /// bfloat16 element (value `9`).
    BFloat16 = 9,
}
70
/// NCCL reduction operation. Modeled as a transparent newtype rather
/// than a closed enum because [`PFN_ncclRedOpCreatePreMulSum`] returns
/// custom op IDs (≥ 5) that don't fit a closed Rust enum.
///
/// The built-in operations are exposed as associated constants; any other
/// value can be carried as `ncclRedOp_t(raw)`.
#[repr(transparent)]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[allow(non_camel_case_types)]
pub struct ncclRedOp_t(pub i32);

// The constants keep variant-like names so call sites read like an enum.
#[allow(non_upper_case_globals)]
impl ncclRedOp_t {
    /// `ncclSum` — element-wise sum reduction.
    pub const Sum: Self = Self(0);
    /// `ncclProd` — element-wise product reduction.
    pub const Prod: Self = Self(1);
    /// `ncclMax` — element-wise max reduction.
    pub const Max: Self = Self(2);
    /// `ncclMin` — element-wise min reduction.
    pub const Min: Self = Self(3);
    /// `ncclAvg` — element-wise average reduction (NCCL 2.10+).
    pub const Avg: Self = Self(4);
}
92
// ---- status ---------------------------------------------------------------

/// Return code from an NCCL call.
///
/// A transparent wrapper over the raw `i32`, so codes without a named
/// constant below can still be carried and compared.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
#[repr(transparent)]
pub struct ncclResult_t(pub i32);

impl ncclResult_t {
    /// `ncclSuccess` — operation succeeded.
    pub const Success: Self = Self(0);
    /// `ncclUnhandledCudaError` — an underlying CUDA call failed.
    pub const UnhandledCudaError: Self = Self(1);
    /// `ncclSystemError` — a system-level error occurred (sockets, files, ...).
    pub const SystemError: Self = Self(2);
    /// `ncclInternalError` — an internal NCCL error occurred.
    pub const InternalError: Self = Self(3);
    /// `ncclInvalidArgument` — an argument was invalid.
    pub const InvalidArgument: Self = Self(4);
    /// `ncclInvalidUsage` — the call is invalid in the current state.
    pub const InvalidUsage: Self = Self(5);
    /// `ncclRemoteError` — another rank in the communicator failed.
    pub const RemoteError: Self = Self(6);
    /// `ncclInProgress` — non-blocking operation still in progress.
    pub const InProgress: Self = Self(7);

    /// Return `true` if the status code denotes success.
    ///
    /// Only [`Self::Success`] counts; every other code, including
    /// [`Self::InProgress`], returns `false`.
    pub const fn is_success(self) -> bool {
        self.0 == Self::Success.0
    }
}
123
124impl CudaStatus for ncclResult_t {
125    fn code(self) -> i32 {
126        self.0
127    }
128    fn name(self) -> &'static str {
129        match self.0 {
130            0 => "ncclSuccess",
131            1 => "ncclUnhandledCudaError",
132            2 => "ncclSystemError",
133            3 => "ncclInternalError",
134            4 => "ncclInvalidArgument",
135            5 => "ncclInvalidUsage",
136            6 => "ncclRemoteError",
137            7 => "ncclInProgress",
138            _ => "ncclUnrecognizedResult",
139        }
140    }
141    fn description(self) -> &'static str {
142        match self.0 {
143            0 => "success",
144            1 => "unhandled CUDA error",
145            2 => "system error",
146            3 => "internal NCCL error",
147            4 => "invalid argument",
148            5 => "invalid usage",
149            6 => "remote error (another rank failed)",
150            7 => "operation in progress (non-blocking comm)",
151            _ => "unrecognized NCCL status code",
152        }
153    }
154    fn is_success(self) -> bool {
155        ncclResult_t::is_success(self)
156    }
157    fn library(self) -> &'static str {
158        "nccl"
159    }
160}
161
162// ---- function-pointer types ----------------------------------------------
163
164/// Function-pointer type for `ncclGetVersion` (query NCCL library version). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
165pub type PFN_ncclGetVersion = unsafe extern "C" fn(version: *mut c_int) -> ncclResult_t;
166/// Function-pointer type for `ncclGetUniqueId` (generate a unique multi-rank initialization ID). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
167pub type PFN_ncclGetUniqueId = unsafe extern "C" fn(id: *mut ncclUniqueId) -> ncclResult_t;
168/// Function-pointer type for `ncclCommInitRank` (initialize a communicator rank). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
169pub type PFN_ncclCommInitRank = unsafe extern "C" fn(
170    comm: *mut ncclComm_t,
171    nranks: c_int,
172    comm_id: ncclUniqueId,
173    rank: c_int,
174) -> ncclResult_t;
175/// Function-pointer type for `ncclCommInitAll` (initialize all-local-GPU communicators in one call). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
176pub type PFN_ncclCommInitAll = unsafe extern "C" fn(
177    comms: *mut ncclComm_t,
178    ndev: c_int,
179    dev_list: *const c_int,
180) -> ncclResult_t;
181/// Function-pointer type for `ncclCommDestroy` (destroy a communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
182pub type PFN_ncclCommDestroy = unsafe extern "C" fn(comm: ncclComm_t) -> ncclResult_t;
183/// Function-pointer type for `ncclCommCount` (query rank count on a communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
184pub type PFN_ncclCommCount =
185    unsafe extern "C" fn(comm: ncclComm_t, count: *mut c_int) -> ncclResult_t;
186/// Function-pointer type for `ncclCommUserRank` (query this rank's index on a communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
187pub type PFN_ncclCommUserRank =
188    unsafe extern "C" fn(comm: ncclComm_t, rank: *mut c_int) -> ncclResult_t;
189
190/// Function-pointer type for `ncclAllReduce` (all-reduce collective). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
191pub type PFN_ncclAllReduce = unsafe extern "C" fn(
192    sendbuff: *const c_void,
193    recvbuff: *mut c_void,
194    count: usize,
195    datatype: ncclDataType_t,
196    op: ncclRedOp_t,
197    comm: ncclComm_t,
198    stream: cudaStream_t,
199) -> ncclResult_t;
200
201/// Function-pointer type for `ncclBroadcast` (broadcast-from-root collective). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
202pub type PFN_ncclBroadcast = unsafe extern "C" fn(
203    sendbuff: *const c_void,
204    recvbuff: *mut c_void,
205    count: usize,
206    datatype: ncclDataType_t,
207    root: c_int,
208    comm: ncclComm_t,
209    stream: cudaStream_t,
210) -> ncclResult_t;
211
212/// Function-pointer type for `ncclGroupStart` (start grouped collective ops). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
213pub type PFN_ncclGroupStart = unsafe extern "C" fn() -> ncclResult_t;
214/// Function-pointer type for `ncclGroupEnd` (end grouped collective ops and commit them). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
215pub type PFN_ncclGroupEnd = unsafe extern "C" fn() -> ncclResult_t;
216
217// ---- Full collective surface ----
218
219/// Function-pointer type for `ncclReduce` (reduce-to-root collective). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
220pub type PFN_ncclReduce = unsafe extern "C" fn(
221    sendbuff: *const c_void,
222    recvbuff: *mut c_void,
223    count: usize,
224    datatype: ncclDataType_t,
225    op: ncclRedOp_t,
226    root: c_int,
227    comm: ncclComm_t,
228    stream: cudaStream_t,
229) -> ncclResult_t;
230
231/// Function-pointer type for `ncclAllGather` (all-gather collective). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
232pub type PFN_ncclAllGather = unsafe extern "C" fn(
233    sendbuff: *const c_void,
234    recvbuff: *mut c_void,
235    sendcount: usize,
236    datatype: ncclDataType_t,
237    comm: ncclComm_t,
238    stream: cudaStream_t,
239) -> ncclResult_t;
240
241/// Function-pointer type for `ncclReduceScatter` (reduce-scatter collective). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
242pub type PFN_ncclReduceScatter = unsafe extern "C" fn(
243    sendbuff: *const c_void,
244    recvbuff: *mut c_void,
245    recvcount: usize,
246    datatype: ncclDataType_t,
247    op: ncclRedOp_t,
248    comm: ncclComm_t,
249    stream: cudaStream_t,
250) -> ncclResult_t;
251
252/// Function-pointer type for `ncclSend` (point-to-point send). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
253pub type PFN_ncclSend = unsafe extern "C" fn(
254    sendbuff: *const c_void,
255    count: usize,
256    datatype: ncclDataType_t,
257    peer: c_int,
258    comm: ncclComm_t,
259    stream: cudaStream_t,
260) -> ncclResult_t;
261
262/// Function-pointer type for `ncclRecv` (point-to-point receive). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
263pub type PFN_ncclRecv = unsafe extern "C" fn(
264    recvbuff: *mut c_void,
265    count: usize,
266    datatype: ncclDataType_t,
267    peer: c_int,
268    comm: ncclComm_t,
269    stream: cudaStream_t,
270) -> ncclResult_t;
271
272// ---- Communicator lifecycle extras ----
273
274/// Function-pointer type for `ncclCommAbort` (abort outstanding ops on a communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
275pub type PFN_ncclCommAbort = unsafe extern "C" fn(comm: ncclComm_t) -> ncclResult_t;
276/// Function-pointer type for `ncclCommFinalize` (finalize a non-blocking communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
277pub type PFN_ncclCommFinalize = unsafe extern "C" fn(comm: ncclComm_t) -> ncclResult_t;
278/// Function-pointer type for `ncclCommGetAsyncError` (fetch a communicator's last async error). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
279pub type PFN_ncclCommGetAsyncError =
280    unsafe extern "C" fn(comm: ncclComm_t, async_error: *mut ncclResult_t) -> ncclResult_t;
281/// Function-pointer type for `ncclCommCuDevice` (query CUDA device backing a communicator rank). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
282pub type PFN_ncclCommCuDevice =
283    unsafe extern "C" fn(comm: ncclComm_t, device: *mut c_int) -> ncclResult_t;
284/// Function-pointer type for `ncclCommSplit` (split a communicator by color/key). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
285pub type PFN_ncclCommSplit = unsafe extern "C" fn(
286    comm: ncclComm_t,
287    color: c_int,
288    key: c_int,
289    new_comm: *mut ncclComm_t,
290    config: *mut c_void, // ncclConfig_t
291) -> ncclResult_t;
292
293/// Function-pointer type for `ncclCommInitRankConfig` (initialize a communicator rank with config). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
294pub type PFN_ncclCommInitRankConfig = unsafe extern "C" fn(
295    comm: *mut ncclComm_t,
296    nranks: c_int,
297    comm_id: ncclUniqueId,
298    rank: c_int,
299    config: *mut c_void, // ncclConfig_t
300) -> ncclResult_t;
301
302// ---- Memory helpers (NCCL 2.19+) ----
303
304/// Function-pointer type for `ncclMemAlloc` (allocate NCCL-registered device memory). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
305pub type PFN_ncclMemAlloc =
306    unsafe extern "C" fn(ptr: *mut *mut c_void, size: usize) -> ncclResult_t;
307/// Function-pointer type for `ncclMemFree` (free NCCL-registered device memory). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
308pub type PFN_ncclMemFree = unsafe extern "C" fn(ptr: *mut c_void) -> ncclResult_t;
309
310/// Function-pointer type for `ncclCommRegister` (register a user buffer with a communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
311pub type PFN_ncclCommRegister = unsafe extern "C" fn(
312    comm: ncclComm_t,
313    buff: *mut c_void,
314    size: usize,
315    handle: *mut *mut c_void,
316) -> ncclResult_t;
317
318/// Function-pointer type for `ncclCommDeregister` (deregister a user buffer from a communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
319pub type PFN_ncclCommDeregister =
320    unsafe extern "C" fn(comm: ncclComm_t, handle: *mut c_void) -> ncclResult_t;
321
322// ---- Custom reduction ops ----
323
324/// Function-pointer type for `ncclRedOpCreatePreMulSum` (create a custom pre-multiplied-sum reduction op). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
325pub type PFN_ncclRedOpCreatePreMulSum = unsafe extern "C" fn(
326    op: *mut ncclRedOp_t,
327    scalar: *mut c_void,
328    datatype: ncclDataType_t,
329    residence: i32, // ncclScalarResidence_t
330    comm: ncclComm_t,
331) -> ncclResult_t;
332
333/// Function-pointer type for `ncclRedOpDestroy` (destroy a custom reduction op). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
334pub type PFN_ncclRedOpDestroy =
335    unsafe extern "C" fn(op: ncclRedOp_t, comm: ncclComm_t) -> ncclResult_t;
336
337// ---- Error strings ----
338
339/// Function-pointer type for `ncclGetErrorString` (decode an ncclResult_t into a static C string). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
340pub type PFN_ncclGetErrorString =
341    unsafe extern "C" fn(result: ncclResult_t) -> *const core::ffi::c_char;
342/// Function-pointer type for `ncclGetLastError` (fetch the last error string on a communicator). See <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html>.
343pub type PFN_ncclGetLastError =
344    unsafe extern "C" fn(comm: ncclComm_t) -> *const core::ffi::c_char;
345
// ---- loader --------------------------------------------------------------

/// Shared-library file names handed to `Library::open` when loading NCCL.
///
/// Exactly one of the `cfg`-gated blocks survives compilation and becomes the
/// function's value: the `.so` names on Linux, the `.dll` names on Windows,
/// and an empty list on every other target.
fn nccl_candidates() -> &'static [&'static str] {
    #[cfg(target_os = "linux")]
    {
        &["libnccl.so.2", "libnccl.so"]
    }
    #[cfg(target_os = "windows")]
    {
        &["nccl.dll", "libnccl.dll"]
    }
    #[cfg(not(any(target_os = "linux", target_os = "windows")))]
    {
        &[]
    }
}
362
/// Generates the `Nccl` function-pointer table from a list of
/// `accessor as "symbol": PfnType;` entries.
///
/// For every entry the expansion adds one cache slot to the struct and one
/// public accessor method that resolves the symbol on first use.
macro_rules! nccl_fns {
    ($($name:ident as $sym:literal : $pfn:ty);* $(;)?) => {
        /// Lazily-resolved NCCL function-pointer table.
        ///
        /// Holds the loaded library plus one [`OnceLock`] slot per entry
        /// point; a symbol is looked up the first time its accessor is called
        /// and served from the slot afterwards.
        pub struct Nccl {
            lib: Library,
            $($name: OnceLock<$pfn>,)*
        }
        impl core::fmt::Debug for Nccl {
            fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
                // Only the library handle is shown; the per-symbol slots are elided.
                f.debug_struct("Nccl").field("lib", &self.lib).finish_non_exhaustive()
            }
        }
        impl Nccl {
            $(
                #[doc = concat!(
                    "Resolve the `", $sym, "` symbol as a [`", stringify!($pfn),
                    "`], caching it after the first successful lookup."
                )]
                ///
                /// # Errors
                ///
                /// Propagates the error from the library's symbol lookup.
                /// A failed lookup is not cached, so a later call retries.
                pub fn $name(&self) -> Result<$pfn, LoaderError> {
                    if let Some(&cached) = self.$name.get() {
                        return Ok(cached);
                    }
                    // SAFETY: NOTE(review): `raw_symbol`'s contract lives in
                    // `baracuda_core` and is not visible here. This relies on
                    // the library staying loaded while the pointer is in use,
                    // which holds as long as `self` (the owner of `lib`) lives.
                    let raw: *mut () = unsafe { self.lib.raw_symbol($sym)? };
                    // SAFETY: assumes the exported symbol really has the
                    // signature named by the `$pfn` alias and that a successful
                    // lookup never yields a null pointer; neither is checked.
                    // `transmute` (unlike `transmute_copy`) at least rejects a
                    // size mismatch between the two types at compile time.
                    let resolved: $pfn =
                        unsafe { core::mem::transmute::<*mut (), $pfn>(raw) };
                    // Another thread may have filled the slot in the meantime;
                    // losing that race is harmless, so the result is ignored.
                    let _ = self.$name.set(resolved);
                    Ok(resolved)
                }
            )*
            /// Wrap `lib` in a table whose slots are all still unresolved.
            fn empty(lib: Library) -> Self {
                Self { lib, $($name: OnceLock::new(),)* }
            }
        }
    };
}
392
393nccl_fns! {
394    nccl_get_version as "ncclGetVersion": PFN_ncclGetVersion;
395    nccl_get_unique_id as "ncclGetUniqueId": PFN_ncclGetUniqueId;
396    nccl_comm_init_rank as "ncclCommInitRank": PFN_ncclCommInitRank;
397    nccl_comm_init_rank_config as "ncclCommInitRankConfig": PFN_ncclCommInitRankConfig;
398    nccl_comm_init_all as "ncclCommInitAll": PFN_ncclCommInitAll;
399    nccl_comm_destroy as "ncclCommDestroy": PFN_ncclCommDestroy;
400    nccl_comm_abort as "ncclCommAbort": PFN_ncclCommAbort;
401    nccl_comm_finalize as "ncclCommFinalize": PFN_ncclCommFinalize;
402    nccl_comm_get_async_error as "ncclCommGetAsyncError": PFN_ncclCommGetAsyncError;
403    nccl_comm_count as "ncclCommCount": PFN_ncclCommCount;
404    nccl_comm_user_rank as "ncclCommUserRank": PFN_ncclCommUserRank;
405    nccl_comm_cu_device as "ncclCommCuDevice": PFN_ncclCommCuDevice;
406    nccl_comm_split as "ncclCommSplit": PFN_ncclCommSplit;
407    nccl_all_reduce as "ncclAllReduce": PFN_ncclAllReduce;
408    nccl_reduce as "ncclReduce": PFN_ncclReduce;
409    nccl_broadcast as "ncclBroadcast": PFN_ncclBroadcast;
410    nccl_all_gather as "ncclAllGather": PFN_ncclAllGather;
411    nccl_reduce_scatter as "ncclReduceScatter": PFN_ncclReduceScatter;
412    nccl_send as "ncclSend": PFN_ncclSend;
413    nccl_recv as "ncclRecv": PFN_ncclRecv;
414    nccl_group_start as "ncclGroupStart": PFN_ncclGroupStart;
415    nccl_group_end as "ncclGroupEnd": PFN_ncclGroupEnd;
416    nccl_mem_alloc as "ncclMemAlloc": PFN_ncclMemAlloc;
417    nccl_mem_free as "ncclMemFree": PFN_ncclMemFree;
418    nccl_comm_register as "ncclCommRegister": PFN_ncclCommRegister;
419    nccl_comm_deregister as "ncclCommDeregister": PFN_ncclCommDeregister;
420    nccl_red_op_create_pre_mul_sum as "ncclRedOpCreatePreMulSum": PFN_ncclRedOpCreatePreMulSum;
421    nccl_red_op_destroy as "ncclRedOpDestroy": PFN_ncclRedOpDestroy;
422    nccl_get_error_string as "ncclGetErrorString": PFN_ncclGetErrorString;
423    nccl_get_last_error as "ncclGetLastError": PFN_ncclGetLastError;
424}
425
426/// Return the lazily-loaded NCCL library accessor.
427pub fn nccl() -> Result<&'static Nccl, LoaderError> {
428    static NCCL: OnceLock<Nccl> = OnceLock::new();
429    if let Some(n) = NCCL.get() {
430        return Ok(n);
431    }
432    let lib = Library::open("nccl", nccl_candidates())?;
433    let n = Nccl::empty(lib);
434    let _ = NCCL.set(n);
435    Ok(NCCL.get().expect("OnceLock set or lost race"))
436}