ferrum_kernels/
quant_linear.rs

1//! Concrete `Linear<B>` impls for quantized weights.
2//!
3//! Phase 3e moves the per-backend kernel-dispatch logic out of the
4//! `BackendQuantMarlin` / `BackendQuantGguf` trait method bodies and
5//! into these concrete `Linear<B>` types. Each impl owns the
6//! cudarc / metal / cpu kernel call directly — no more `B::gemm_gptq`
7//! indirection.
8//!
9//! Why these live in `ferrum-kernels` rather than `ferrum-quantization`:
10//! the `forward()` body needs cudarc / metal-rs types, and pulling
11//! those into ferrum-quantization would create a dep cycle (kernels
12//! → quantization → kernels). ferrum-quantization stays as the
13//! weight-format parser layer; backend-specific Linear impls live here.
14
// CPU submodules: declared without any `cfg` gate, so they are part of every build.
15pub mod cpu_dequant;
16pub mod cpu_gguf;
17pub mod cpu_marlin_stack;
18
// CUDA submodules: each is compiled only when the `cuda` feature is enabled.
19#[cfg(feature = "cuda")]
20pub mod cuda_marlin;
21
22#[cfg(feature = "cuda")]
23pub mod cuda_marlin_stack;
24
// Metal submodules: each requires BOTH `target_os = "macos"` and the `metal`
// feature; enabling `metal` on another OS leaves them out of the build.
25#[cfg(all(target_os = "macos", feature = "metal"))]
26pub mod metal_gguf;
27
28#[cfg(all(target_os = "macos", feature = "metal"))]
29pub mod metal_gguf_moe;
ferrum_kernels/quant_linear.rs

ferrum_kernels/
quant_linear.rs