trueno-gpu 0.4.29

Pure Rust PTX generation for NVIDIA CUDA - no LLVM, no nvcc
Documentation
1
2
3
4
5
6
7
8
9
10
//! Tiled Q4_K GEMV Kernels with Shared Memory Input Caching
//!
//! - `TiledQ4KGemvKernel`: Input vector cached in shared memory
//! - `ChunkedTiledQ4KGemvKernel`: Handles K > 8K with fixed 32KB chunks

// Private submodules (declared without `pub`); the kernel types they provide
// are only reachable from outside through the re-exports below.
mod chunked;
mod shared_memory;

// Re-export the kernel types at this module's level so callers can name them
// directly instead of going through the private submodule paths.
pub use chunked::ChunkedTiledQ4KGemvKernel;
pub use shared_memory::TiledQ4KGemvKernel;