Skip to main content

ferrum_quantization/
quant_linear.rs

1//! `QuantLinear<B>` — thin wrapper that delegates to the boxed
2//! `Linear<B>` returned by `B::load_quant` / `B::load_quant_fused`.
3//!
4//! Phase 3e/3: backend-specific kernel dispatch (Metal Q4_K/Q6_K
5//! mul_mm, CPU dequant + gemm) lives inside the boxed Linear's
6//! `forward()` body, not in a Backend trait method. The historical
7//! `QuantLinear<B>` constructors (`from_gguf_bytes`, `from_gguf_fused`)
8//! stay so callers don't have to change shape — they just route through
9//! the new factory.
10
11use ferrum_kernels::backend::{Backend, BackendQuantGguf, GgufQuantType};
12use ferrum_kernels::Linear;
13use ferrum_types::Result;
14
15/// Linear projection backed by a GGUF k-quant weight.
16///
17/// `forward()` is a tail-call to the inner backend-specific Linear
18/// (Metal: `MetalGgufLinear`, CPU: `CpuGgufLinear`). LTO inlines through
19/// the dispatch.
20pub struct QuantLinear<B: Backend + BackendQuantGguf> {
21    inner: Box<dyn Linear<B> + Send + Sync>,
22}
23
24impl<B: Backend + BackendQuantGguf> QuantLinear<B> {
25    /// Build from raw GGUF block bytes.
26    ///
27    /// `kind`: which k-quant flavour the bytes encode (Q4_K, Q5_K, …).
28    /// `bytes`: the on-disk payload, sized by the kind's block layout.
29    pub fn from_gguf_bytes(
30        kind: GgufQuantType,
31        bytes: &[u8],
32        out_features: usize,
33        in_features: usize,
34    ) -> Result<Self> {
35        let inner = B::load_quant(kind, bytes, out_features, in_features)?;
36        Ok(Self { inner })
37    }
38
39    /// Build a fused projection from multiple `(kind, bytes, rows)`
40    /// parts that share `in_features`. Each part stays in its own
41    /// QuantStore (no byte-concat); forward dispatches one matvec per
42    /// part. Used for Qwen3 `qkv_proj` when q+k are Q4_K and v is Q6_K
43    /// — the homogeneous fused-Q4 fast path would have to fall back
44    /// to eager-fp32, blowing 100 MB per layer.
45    pub fn from_gguf_fused(
46        parts: &[(GgufQuantType, &[u8], usize)],
47        in_features: usize,
48    ) -> Result<Self> {
49        let inner = B::load_quant_fused(parts, in_features)?;
50        Ok(Self { inner })
51    }
52}
53
54impl<B: Backend + BackendQuantGguf> Linear<B> for QuantLinear<B> {
55    fn in_features(&self) -> usize {
56        self.inner.in_features()
57    }
58
59    fn out_features(&self) -> usize {
60        self.inner.out_features()
61    }
62
63    fn forward(&self, ctx: &mut B::Context, input: &B::Buffer, out: &mut B::Buffer, m: usize) {
64        self.inner.forward(ctx, input, out, m);
65    }
66}