Skip to main content

ferrum_kernels/
lib.rs

1//! Ferrum unified compute kernels for high-performance inference.
2//!
3//! Provides the `Backend` trait and implementations for CUDA, Metal, and CPU.
4//! On CUDA builds, kernels are compiled to PTX during `cargo build` and loaded
5//! on demand at runtime.
6
7pub fn configure_native_profile_sink(
8    config: &ferrum_bench_core::ProfileSinkConfig,
9) -> std::io::Result<()> {
10    #[cfg(all(feature = "cuda", feature = "vllm-moe-marlin"))]
11    backend::cuda::marlin::configure_vllm_moe_profile_sink(config)?;
12    #[cfg(not(all(feature = "cuda", feature = "vllm-moe-marlin")))]
13    let _ = config;
14    Ok(())
15}
16
17pub mod backend;
18
19pub mod linear;
20pub use linear::Linear;
21
22pub mod stacked_expert;
23pub use stacked_expert::StackedExpertGgufLinear;
24
25pub mod marlin_expert_stack;
26pub use marlin_expert_stack::MarlinExpertStack;
27
28pub mod quant_linear;
29
30pub mod attention;
31
32pub mod moe_host;
33
34// Audit #9: Metal GGUF k-quant kernels (q4_k_*, q6_k_*, moe_*) physically
35// live in `backend/metal/` now. Re-exported here so external callers'
36// `ferrum_kernels::q4_k_gemm::*` paths + internal `crate::q4_k_*::*` paths
37// keep working unchanged. (`moe_host` stays top-level — it's the CPU
38// reference impl used from `ferrum-models`, not a Metal kernel.)
39#[cfg(all(target_os = "macos", feature = "metal"))]
40pub use backend::metal::{
41    moe_post_ops, moe_post_ops_batched, moe_router, q4_k, q4_k_gemm, q4_k_gemv, q4_k_gemv_v2,
42    q4_k_moe_id_gate_up_silu, q4_k_moe_id_gate_up_silu_batched, q4_k_moe_id_gemm, q4_k_moe_id_gemv,
43    q4_k_moe_id_gemv_batched, q6_k_gemm, q6_k_gemv, q6_k_moe_id_gemm, q6_k_moe_id_gemv,
44    q6_k_moe_id_gemv_batched,
45};
46
47#[cfg(feature = "cuda")]
48pub(crate) mod ptx {
49    // Generated by build.rs from all .cu sources. Some kernels (e.g.
50    // SOFTMAX, BATCHED_FLASH_DECODE_ATTENTION) are emitted unconditionally
51    // but only loaded behind specific code paths, so dead_code fires in
52    // configs that don't hit them.
53    #![allow(dead_code)]
54    include!(concat!(env!("OUT_DIR"), "/ptx.rs"));
55}
56
57// Audit #9: CUDA kernels physically live under `backend/cuda/` now.
58// Re-exports preserve the historical `ferrum_kernels::foo::*` public
59// surface + internal `crate::foo::*` paths.
60//
61// Two files stay at the crate root for now because they would otherwise
62// collide with same-named files already under `backend/cuda/` (which
63// host the Backend-trait impls, not the kernel launchers):
64//   - `int8_kv.rs` (top-level launchers `launch_int8_paged_decode_*`)
65//   - `quant.rs`   (top-level `dequant_int4` legacy path)
66
67#[cfg(feature = "cuda")]
68pub mod int8_kv;
69#[cfg(feature = "cuda")]
70pub mod quant;
71
72#[cfg(feature = "cuda")]
73pub use backend::cuda::{
74    cublas, cuda_decode, cuda_graph, decode_buffers, gpu_paged_kv, marlin, nccl_comm, tp_decode,
75    weight_store,
76};
77
78#[cfg(feature = "cuda")]
79pub use backend::cuda::decode_attention::decode_attention;
80#[cfg(feature = "cuda")]
81pub use backend::cuda::fused_add_rms_norm::fused_add_rms_norm;
82#[cfg(feature = "cuda")]
83pub use backend::cuda::fused_silu_mul::fused_silu_mul;
84#[cfg(feature = "cuda")]
85pub use backend::cuda::residual_add::residual_add;
86#[cfg(feature = "cuda")]
87pub use backend::cuda::rms_norm::rms_norm;
88#[cfg(feature = "cuda")]
89pub use backend::cuda::rope::rope;
90
91// Preserve `crate::triton_ptx` / `crate::triton_meta` paths for in-crate
92// callers (e.g. `quant_linear::cuda_marlin::CudaMarlinLinear::forward`).
93// These are NOT part of the kernels-crate public API.
94#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
95pub(crate) use backend::cuda::{triton_meta, triton_ptx};
96
97#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
98pub use backend::cuda::triton_add_bias::add_bias_triton;
99#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
100pub use backend::cuda::triton_fused_add_rms_norm::fused_add_rms_norm_triton;
101#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
102pub use backend::cuda::triton_fused_silu_mul::fused_silu_mul_triton;
103#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
104pub use backend::cuda::triton_gelu::gelu_triton;
105#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
106pub use backend::cuda::triton_layer_norm::layer_norm_triton;
107#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
108pub use backend::cuda::triton_residual_add::residual_add_triton;
109#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
110pub use backend::cuda::triton_residual_add_inplace::residual_add_inplace_triton;
111#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
112pub use backend::cuda::triton_rms_norm::rms_norm_triton;
113#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
114pub use backend::cuda::triton_softmax::softmax_triton;
115#[cfg(all(feature = "cuda", feature = "triton-kernels"))]
116pub use backend::cuda::{triton_fused_moe, triton_w4a16};
117
118// vLLM gptq_marlin port (Phase 12). Behind its own feature for opt-in
119// while we validate correctness + perf vs ferrum's existing IST-DASLab Marlin.
120#[cfg(feature = "vllm-marlin")]
121pub use backend::cuda::vllm_marlin;