trueno-gpu 0.4.17

Pure Rust PTX generation for NVIDIA CUDA - no LLVM, no nvcc
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
//! GEMM (General Matrix Multiply) Kernels
//!
//! Implements `C = alpha * A @ B + beta * C` in several variants:
//!
//! - **Basic**: Standard 2D GEMM (naive, tiled, tensor core)
//! - **Batched**: 3D batched GEMM for independent matrix multiplications
//! - **Batched4D**: 4D batched GEMM for multi-head attention

// Private submodules. Their names mirror the variants listed in the module
// docs (Basic / Batched / Batched4D); the implementations live in those files.
mod basic;
mod batched;
mod batched_4d;

// Public re-exports: the submodules themselves are not `pub`, so these `use`
// lines are the only path through which the config and kernel items below are
// reachable from outside this module.
pub use basic::{GemmConfig, GemmKernel};
pub use batched::{BatchedGemmConfig, BatchedGemmKernel};
pub use batched_4d::{Batched4DGemmConfig, Batched4DGemmKernel};