rlx_cuda/lib.rs
1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! RLX CUDA backend — NVIDIA GPUs via the pure-Rust `cudarc` crate.
17#![allow(clippy::too_many_arguments)]
18//
19// FFI shim helpers (cudnn_conv*, cublaslt_matmul_fused, etc.) inherently
20// take many arguments — they mirror the underlying C API surface.
// Suppressing the lint at crate scope keeps it from burying the warnings that carry real signal.
22//!
23//! Same overall shape as rlx-wgpu (device singleton, arena buffer, per-op
24//! kernels, command-stream-per-forward-pass) but targeting CUDA via
25//! `cudarc::driver` for memory + dispatch and `cudarc::cublas` for matmul.
26//! Element-wise / reduction / shape kernels are CUDA C++ source strings
27//! compiled at init time via NVRTC — same pattern as rlx-wgpu's WGSL
28//! kernels.
29//!
30//! The crate uses `cudarc`'s `fallback-dynamic-loading` feature so it
31//! compiles on Mac (and any other host without a CUDA SDK). `is_available()`
32//! returns false when libcuda can't be dlopen()'d — every other entry
33//! point checks this and degrades cleanly.
34//!
35//! Layout:
36//! - `device` — `CudaContext` singleton (per-process), driver init
37//! - `arena` — device buffer + per-node offsets
38//! - `kernels` — CUDA C++ source strings + NVRTC compile + cuModule cache
39//! - `backend` — `CudaExecutable`: IR lowering, schedule, run
40
41pub mod arena;
42pub mod backend;
43pub mod calibrate;
44pub mod device;
45pub mod fft_dispatch;
46pub mod fft_host;
47pub mod gdn_host;
48pub mod gguf_gpu;
49pub mod gguf_host;
50pub mod host_staging;
51pub mod im2col_host;
52pub mod kernels;
53pub mod llada2_gate_host;
54pub mod log_mel_backward_host;
55pub mod log_mel_host;
56pub mod sam_ops_host;
57pub mod splat_host;
58#[cfg(feature = "native-splat")]
59pub mod splat_native;
60pub mod training_bwd_host;
61pub mod umap_knn_host;
62pub mod unfuse;
63pub mod welch_peaks_dispatch;
64pub mod welch_peaks_host;
65
66pub use backend::{CompileMode, CudaExecutable, ExecMode};
67
68/// HIP-CPU validation path — runs `.cu` kernels on CPU threads so we
69/// can numerically validate them on Mac/Docker without renting a CUDA
70/// box. Strictly a dev feature; never enabled in production.
71#[cfg(feature = "hip-cpu-validate")]
72pub mod cpu_dispatch;
73
74/// True if a CUDA driver is reachable. With `dynamic-loading`, this
75/// returns false on hosts without `libcuda` (Mac, headless boxes, CI
76/// runners without GPUs) — the crate still compiled, but no kernel
77/// dispatch is possible.
78pub fn is_available() -> bool {
79 device::cuda_context().is_some()
80}