/// Returns the warp count configured through the `MWV_WARPS` environment variable.
///
/// The variable is read and parsed once, on the first call. The result is cached in a
/// process-wide [`std::sync::OnceLock`], so later changes to the environment have no
/// effect on the returned value.
///
/// Surrounding whitespace in the value is ignored. If the variable is unset, is not
/// valid Unicode, or does not parse as a `u32`, the default of `3` is returned.
///
/// No range check is applied: a value of `0` is returned as-is.
pub fn mwv_warp_count() -> u32 {
    // Fallback used when `MWV_WARPS` is absent or malformed.
    const DEFAULT_WARPS: u32 = 3;

    static MWV_WARPS: std::sync::OnceLock<u32> = std::sync::OnceLock::new();

    *MWV_WARPS.get_or_init(|| {
        std::env::var("MWV_WARPS")
            .ok()
            // Tolerate stray spaces/newlines (e.g. `MWV_WARPS="4 "`); `u32::from_str`
            // rejects them, which would otherwise silently select the default.
            .and_then(|raw| raw.trim().parse().ok())
            .unwrap_or(DEFAULT_WARPS)
    })
}
#[allow(unused_imports)]
use trueno_gpu::kernels::{
Activation, ArgMaxFinalKernel, ArgMaxKernel, AttentionKernel, BatchedHwDp4aQ4KGemvKernel,
BatchedIncrementalAttentionKernel, BatchedQ4KGemvKernel, BatchedQ6KGemvKernel,
BatchedResidualAddKernel, BatchedRopeKernel, BatchedSwigluKernel,
BatchedVectorizedRmsNormKernel, BiasActivationKernel, ChunkedTiledQ4KGemvKernel,
CoalescedGemvKernel, CoalescedQ4KGemvKernel, CoalescedQ6KGemvKernel, Dp4aQ4KGemmKernel,
Dp4aQ4KGemvKernel, Dp4aQ6KGemvKernel, ElementwiseMulKernel, Fp16Q4KGemvKernel,
FusedFp32Q4KGemvKernel, FusedGateUpKernel, FusedGateUpQ4KGemvKernel,
FusedGateUpSwigluHwDp4aQ4KGemvKernel, FusedQKVHwDp4aQ4KGemvKernel, FusedQKVKernel,
FusedResidualRmsNormKernel, FusedRmsNormGateUpSwigluQ4KKernel, FusedRmsNormQ4KGemvKernel,
FusedSwigluKernel, GeluKernel, GemmKernel, GemvKernel, HalfWarpDp4aQ4KGemvKernel,
HalfWarpDp4aQ6KGemvKernel, IncrementalAttentionKernel, InlineQ8Dp4aQ4KGemvKernel, Kernel,
KvCacheScatterIndirectKernel, KvCacheScatterKernel, LayerNormKernel,
MultiWarpIncrementalAttentionKernel, MultiWarpQ6KGemvKernel, MultiWarpTensorCoreQ4KGemmKernel,
MultiWarpVectorizedQ4KGemvKernel, MwvDp4aQ4KGemvKernel, PackedDp4aQ4KQ8Kernel,
PerHeadRmsNormKernel, PreciseRmsNormKernel, PreciseRopeIndirectKernel, Q4KDequantFp16Kernel,
Q4KDequantKernel, Q4KGemvKernel, Q4KQ8DotKernel, Q4_0GemvKernel, Q4_1GemvKernel, Q5KGemvKernel,
Q5KKernel, Q5_0GemvKernel, Q6KDequantKernel, Q6KGemvKernel, Q6KKernel, Q8QuantizeKernel,
Q8_0GemvKernel, QuantizeKernel, ResidualAddKernel, RmsNormKernel, RopeIndirectKernel,
RopeKernel, RopeNeoxIndirectKernel, RopeNeoxKernel, SiluKernel, SoftmaxKernel,
TensorCoreQ4KGemmKernel, TiledQ4KGemvKernel, TrueDp4aQ4KGemvKernel, VectorizedQ4KGemvKernel,
VectorizedRmsNormKernel, WideQ4KGemvKernel,
};
#[cfg(target_arch = "x86_64")]
#[allow(unused_imports)]
use trueno_gpu::kernels::{
BatchedFusedResidualRmsNormKernel, InterleavedWmmaQ4KGemmKernel, W4a16WmmaQ4KGemmKernel,
};
// The files below are textually spliced into this module by `include!`, so their
// contents are compiled in this file's scope and see the `use` declarations above.
// Paths are resolved relative to this source file.
include!("kernel_type.rs");
include!("kernel_generator.rs");
include!("kernel.rs");
include!("layout.rs");
include!("kernels_generate.rs");