Skip to main content

ferrum_quantization/
lib.rs

1//! Weight-format abstraction for Ferrum models.
2//!
3//! Separates "what is the weight matrix like" (dense f32, GPTQ int4, AWQ,
4//! GGUF, ...) from "what device does the math" (Backend) and "how does the
5//! model wire things together" (model code).
6//!
7//! Usage in model code:
8//! ```ignore
9//! let qkv: Box<dyn Linear<B>> = loader.load_linear("model.layers.0.self_attn.qkv_proj")?;
10//! qkv.forward(ctx, &input, &mut out, m);
11//! ```
12//!
13//! The `Linear` trait dispatches to the appropriate backend kernel
14//! (`B::gemm` for Dense, `B::gemm_gptq` for GPTQ, etc.) without the model
15//! having to branch on quantization type.
16
17#![forbid(unsafe_op_in_unsafe_fn)]
18
19pub mod dense;
20pub mod factory;
21pub mod gptq;
22pub mod loader;
23pub mod native_safetensors;
24pub mod traits;
25
26pub use dense::DenseLinear;
27pub use factory::DefaultLinearFactory;
28pub use gptq::GptqLinear;
29pub use loader::{PrefixedLoader, WeightLoader};
30pub use native_safetensors::NativeSafetensorsLoader;
31pub use traits::{Linear, LinearFactory};
32
33// Quant config types — populated from safetensors metadata or GGUF header.
34pub mod config;
35pub use config::{QuantConfig, QuantMethod};