Skip to main content

ferrum_quantization/
lib.rs

1//! Weight-format abstraction for Ferrum models.
2//!
3//! Separates "what is the weight matrix like" (dense f32, GPTQ int4, AWQ,
4//! GGUF, ...) from "what device does the math" (Backend) and "how does the
5//! model wire things together" (model code).
6//!
7//! Usage in model code:
8//! ```ignore
9//! let qkv: Box<dyn Linear<B>> = loader.load_linear("model.layers.0.self_attn.qkv_proj")?;
10//! qkv.forward(ctx, &input, &mut out, m);
11//! ```
12//!
13//! The `Linear` trait dispatches to the appropriate backend kernel
14//! (`B::gemm` for Dense, `B::gemm_gptq` for GPTQ, etc.) without the model
15//! having to branch on quantization type.
16
// Crate-wide lint: an unsafe operation inside an `unsafe fn` must still be
// wrapped in its own explicit `unsafe { ... }` block. Using `forbid` (rather
// than `deny`) means the level cannot be relaxed by an `#[allow]` further
// down in the crate.
17#![forbid(unsafe_op_in_unsafe_fn)]
18
// Public submodules of the crate. The types re-exported further down come
// from these modules.
19pub mod dense;
20pub mod gguf;
21pub mod gptq;
22pub mod loader;
23pub mod lora;
24pub mod native_safetensors;
25pub mod quant_linear;
26pub mod traits;
27
// Re-exports at the crate root, so callers can name these types directly
// from the crate instead of spelling out the submodule path.
28pub use dense::DenseLinear;
29pub use gguf::{GgufFile, GgufLinear, GgufLoader};
30pub use gptq::{GptqLinear, StackedExpertLinear};
31pub use loader::{PrefixedLoader, WeightLoader};
32pub use lora::LoraLinearRef;
33pub use native_safetensors::NativeSafetensorsLoader;
34pub use quant_linear::QuantLinear;
35pub use traits::Linear;
36
37// Quant config types — populated from safetensors metadata or GGUF header.
// NOTE(review): `config` is declared here, after the re-exports, rather than
// alongside the other `pub mod` lines. Item order has no effect on
// compilation, so this looks like a grouping choice — confirm it is
// intentional before moving it.
38pub mod config;
39pub use config::{QuantConfig, QuantMethod};