Skip to main content

ferrum_quantization/
lib.rs

1//! Weight-format abstraction for Ferrum models.
2//!
3//! Separates "what does the weight matrix look like" (dense f32, GPTQ int4,
4//! AWQ, GGUF, ...) from "which device does the math" (Backend) and "how does
5//! the model wire things together" (model code).
6//!
7//! Usage in model code:
8//! ```ignore
9//! let qkv: Box<dyn Linear<B>> = loader.load_linear("model.layers.0.self_attn.qkv_proj")?;
10//! qkv.forward(ctx, &input, &mut out, m);
11//! ```
12//!
13//! The `Linear` trait dispatches to the appropriate backend kernel
14//! (`B::gemm` for Dense, `B::gemm_gptq` for GPTQ, etc.) without the model
15//! having to branch on quantization type.
16
17#![forbid(unsafe_op_in_unsafe_fn)]
18
19pub mod dense;
20pub mod gguf;
21pub mod gptq;
22pub mod loader;
23pub mod native_safetensors;
24pub mod quant_linear;
25pub mod traits;
26
27pub use dense::DenseLinear;
28pub use gguf::{GgufFile, GgufLinear, GgufLoader};
29pub use gptq::{GptqLinear, StackedExpertLinear};
30pub use loader::{PrefixedLoader, WeightLoader};
31pub use native_safetensors::NativeSafetensorsLoader;
32pub use quant_linear::QuantLinear;
33pub use traits::Linear;
34
35// Quant config types — populated from the safetensors metadata or the GGUF header.
36pub mod config;
37pub use config::{QuantConfig, QuantMethod};