Skip to main content

tensorlogic_trustformers/quantization/
mod.rs

1//! Post-Training Quantization (PTQ) for transformer weight matrices.
2//!
3//! This module provides a CPU-first implementation of INT8 quantization for
4//! linear layers, following the same "Paradigm B — numerical layers" design
5//! used by the `moe` module.
6//!
7//! ## Architecture
8//!
9//! - [`QuantizedLinear`]: A weight matrix stored as `Array2<i8>` with per-channel
10//!   or per-tensor scale/zero_point. The forward pass dequantizes on the fly and
11//!   then performs an f64 matmul (an integer-matmul kernel is a future follow-up).
12//! - [`calibrate_linear`]: Wraps `tensorlogic-scirs-backend`'s
13//!   `calibrate_quantization` to produce `QuantizationParams` from a weight
14//!   matrix, including per-channel calibration.
15//!
16//! ## Example
17//!
18//! ```rust,ignore
19//! use ndarray::Array2;
20//! use tensorlogic_trustformers::quantization::{calibrate_linear, QuantizedLinear};
21//! use tensorlogic_scirs_backend::quantization::{QuantizationGranularity, QuantizationType};
22//!
23//! let weight = Array2::from_shape_fn((4, 8), |(i, j)| (i * 8 + j) as f64);
24//! let params = calibrate_linear(&weight, QuantizationType::Int8,
25//!                               QuantizationGranularity::PerChannel);
26//! let qlinear = QuantizedLinear::from_fp(&weight, &params).expect("quantize");
27//!
28//! let x = Array2::ones((2, 8));
29//! let out = qlinear.forward(&x);
30//! assert_eq!(out.shape(), &[2, 4]);
31//! ```
32
33pub mod calibration;
34pub mod linear;
35
36#[cfg(test)]
37mod tests;
38
39pub use calibration::calibrate_linear;
40pub use linear::{QuantizationError, QuantizedLinear};