atomr_accel_cutlass/lib.rs
1//! # atomr-accel-cutlass
2//!
3//! CUTLASS kernel-template instantiation for `atomr-accel-cuda`.
4//!
5//! Phase 6 of the CUDA-coverage roadmap. This crate exposes a thin
6//! actor-friendly facade over CUTLASS GEMM, grouped-GEMM, and
7//! implicit-GEMM convolution templates. Two compilation strategies are
8//! supported:
9//!
10//! ## Strategy A (default — NVRTC at runtime)
11//!
12//! User code constructs a [`gemm::GemmRequest<T>`] (or grouped /
13//! conv equivalent) and forwards it to a [`actor::CutlassActor`].
14//! The actor builds a small `.cu` translation unit that
15//! `#include`s the vendored CUTLASS headers (under
16//! `crates/atomr-accel-cutlass/cutlass/include/`) and instantiates
17//! the requested template, then hands the source to
18//! `atomr_accel_cuda::kernel::NvrtcActor` for compilation.
19//! Compilation cost is amortized via the per-arch disk cache that
20//! `NvrtcActor` already maintains and via the in-process
21//! [`plan_cache::PlanCache`] keyed by
22//! `(template_id, shape, dtype, arch)`.
23//!
24//! ## Strategy B (`cutlass-prebuilt` feature — nvcc at build time)
25//!
26//! `build.rs` walks a generator that emits a static library of
27//! pre-instantiated kernels for a fixed `(op × dtype × arch)` matrix.
28//! When the feature is OFF the `build.rs` is a no-op probe so the
29//! crate still builds on hosts without `nvcc`. The contract for
30//! Strategy B is documented in [`build.rs`] and in the crate
31//! README — full implementation is a follow-up; we wire the toggle
32//! and the empty hooks here.
33//!
34//! ## Coverage
35//!
36//! | op | dtype | arch |
37//! |-------------------|-------------------|---------------------------|
38//! | GEMM | fp32, fp16, bf16 | sm_80, sm_86, sm_89 |
39//! | GEMM | fp8 e4m3 / e5m2 | sm_89, sm_90a, sm_100 |
40//! | GEMM | fp4 e2m1 | sm_100, sm_120 |
41//! | grouped GEMM | fp16, bf16, fp8 | sm_90a, sm_100 |
42//! | conv2d fwd / dgrad / wgrad (implicit-GEMM) | fp16, bf16, fp32, fp8 | sm_80, sm_86, sm_89, sm_90a |
43//!
44//! Compute targets: Ampere (sm_80, fp16/bf16/fp32), Hopper (sm_90a,
45//! fp8 e4m3/e5m2 + fp16/bf16, persistent kernels), Blackwell
46//! (sm_100, fp8/fp4 + EVT).
47//!
48//! ## Crate layout
49//!
50//! ```text
51//! crates/atomr-accel-cutlass/
52//! ├── Cargo.toml
53//! ├── cutlass/ # vendored CUTLASS headers (BSD-3-Clause)
54//! ├── include/ # local template adapters
55//! ├── build.rs # gated on cutlass-prebuilt feature
56//! ├── examples/
57//! │ ├── cutlass_gemm_fp8.rs
58//! │ └── cutlass_grouped_gemm.rs
59//! └── src/
60//! ├── lib.rs # CutlassActor, CutlassMsg, props
61//! ├── actor.rs # actor surface
62//! ├── gemm.rs # GemmRequest<T>, CutlassGemmDispatch
63//! ├── grouped_gemm.rs # GroupedGemmRequest<T>
64//! ├── conv.rs # ConvFwd / Dgrad / Wgrad requests
65//! ├── evt.rs # EpilogueVisitorTree builder
66//! ├── plan_cache.rs # template plan cache
67//! └── kernels/ # generated .cu sources at runtime
68//! ```
69
70#![deny(rust_2018_idioms)]
71
72pub mod actor;
73pub mod conv;
74pub mod dtype;
75pub mod gemm;
76pub mod plan_cache;
77
78#[cfg(feature = "evt")]
79pub mod evt;
80
81#[cfg(feature = "grouped")]
82pub mod grouped_gemm;
83
84mod kernels;
85
86pub use actor::{CutlassActor, CutlassInner, CutlassMsg};
87pub use conv::{
88 ConvDgradRequest, ConvFwdRequest, ConvLayout, ConvShape, ConvWgradRequest, CutlassConvDispatch,
89};
90pub use dtype::{
91 is_fp4_supported, is_fp8_supported, is_supported_for, CutlassDtype, GemmSupported, SmArch,
92};
93pub use gemm::{CutlassGemmDispatch, GemmEpilogue, GemmLayout, GemmRequest, GemmShape, RefitMsg};
94pub use plan_cache::{PlanCache, PlanKey};
95
96#[cfg(feature = "evt")]
97pub use evt::{EpilogueOp, EpilogueVisitorTree, EvtBuilder};
98
99#[cfg(feature = "grouped")]
100pub use grouped_gemm::{
101 CutlassGroupedGemmDispatch, GroupedGemmRequest, GroupedGemmShape, GroupedLayout,
102};
103
104/// Entry point used by the `cutlass` cargo feature on
105/// `atomr-accel-cuda`. Returns the human-readable crate version
106/// string. Exposed for completeness so that downstream re-exports can
107/// version-pin against the cutlass crate without an extra cargo
108/// metadata call.
109pub const fn version() -> &'static str {
110 env!("CARGO_PKG_VERSION")
111}
112
113/// Convenience: build a [`CutlassActor`] props value. Mirrors the
114/// `props` constructors used by the rest of `atomr-accel-cuda` so
115/// downstream code can write
116/// `system.actor_of(atomr_accel_cutlass::props(...), "cutlass")`.
117pub fn props(plan_cache_capacity: usize) -> actor::CutlassProps {
118 actor::CutlassProps::new(plan_cache_capacity)
119}