use std::collections::HashMap;
use trustformers_core::{
errors::{Result, TrustformersError},
tensor::Tensor,
};
#[derive(Debug, Clone)]
/// Configuration for the advanced (4/8-bit, block-wise) quantization schemes.
pub struct AdvancedQuantizationConfig {
    /// Which quantization scheme to apply.
    pub method: QuantizationMethod,
    /// Number of elements that share one scale (and zero point, if any).
    pub block_size: usize,
    /// When true, the per-block scales are themselves 8-bit quantized
    /// (see `double_quantize_scales`).
    pub double_quantization: bool,
    /// Requested compute dtype; recorded in the metadata of each result.
    pub compute_dtype: ComputeDataType,
    /// Threshold used by NF4 quantization to split off outlier values
    /// that are stored exactly instead of being quantized.
    pub outlier_threshold: f32,
    // NOTE(review): not read anywhere in this file — presumably reserved
    // for a future nested/double-double quantization path.
    pub nested_quantization: bool,
}
impl Default for AdvancedQuantizationConfig {
    /// QLoRA-style defaults: NF4 codes, 64-element blocks, double-quantized
    /// scales, bf16 compute, and an outlier threshold of 6.0.
    fn default() -> Self {
        Self {
            method: QuantizationMethod::NF4,
            block_size: 64,
            double_quantization: true,
            compute_dtype: ComputeDataType::BFloat16,
            outlier_threshold: 6.0,
            nested_quantization: false,
        }
    }
}
#[derive(Debug, Clone, PartialEq)]
/// The quantization schemes implemented by `AdvancedQuantizer`.
pub enum QuantizationMethod {
    /// 4-bit NormalFloat: nearest level in `NF4_QUANT_TABLE`, scaled per block.
    NF4,
    /// 4-bit float-like code (1 sign bit + 3 magnitude-code bits).
    FP4,
    /// 4-bit integers with a per-block scale and zero point.
    Int4Asymmetric,
    /// Symmetric 8-bit integers with a per-block scale.
    Int8BlockWise,
    /// Mixed bit widths for regular values vs. outliers.
    /// NOTE(review): currently delegates to NF4; the bit widths are ignored.
    MixedPrecision { primary_bits: u8, outlier_bits: u8 },
}
#[derive(Debug, Clone, PartialEq)]
/// Dtype in which dequantized values are meant to be consumed.
/// NOTE(review): only recorded in metadata here; no conversion is performed
/// in this file (all dequantization produces f32 tensors).
pub enum ComputeDataType {
    Float32,
    Float16,
    BFloat16,
}
/// The 16 NF4 (4-bit NormalFloat) quantization levels, spanning [-1.0, 1.0].
/// The table is strictly increasing and index 7 is exactly 0.0 — both
/// properties are asserted by tests in this file. Codes index directly into
/// this table during dequantization.
const NF4_QUANT_TABLE: [f32; 16] = [
    -1.0,
    -0.696_192_8,
    -0.525_073_05,
    -0.394_917_5,
    -0.284_441_38,
    -0.184_773_43,
    -0.091_050_036,
    0.0,
    0.079_580_3,
    0.160_930_2,
    0.246_112_3,
    0.337_915_24,
    0.440_709_83,
    0.562_617,
    0.722_956_84,
    1.0,
];
/// Bit layout descriptor for a 4-bit float format.
/// NOTE(review): not consumed by any routine in this file — the FP4 code path
/// hard-codes its own sign-plus-magnitude mapping. Kept as public API.
pub struct FP4Format {
    pub sign_bits: u8,
    pub exponent_bits: u8,
    pub mantissa_bits: u8,
}
impl Default for FP4Format {
    /// Default 1-2-1 (sign/exponent/mantissa) split of the 4 bits.
    fn default() -> Self {
        Self {
            sign_bits: 1,
            exponent_bits: 2,
            mantissa_bits: 1,
        }
    }
}
#[derive(Debug, Clone)]
/// A quantized tensor: packed codes plus the side information needed to
/// reconstruct an f32 tensor.
pub struct QuantizedTensor {
    /// Packed payload. 4-bit methods pack two codes per byte (low nibble
    /// first); Int8BlockWise stores one `i8` per element reinterpreted as `u8`.
    pub data: Vec<u8>,
    /// One scale per block (possibly double-quantized, see metadata).
    pub scales: Vec<f32>,
    /// Per-block zero points; only present for asymmetric quantization.
    pub zero_points: Option<Vec<f32>>,
    /// Logical shape of the original tensor.
    pub shape: Vec<usize>,
    /// Method, block size, outliers, and other reconstruction parameters.
    pub metadata: QuantizationMetadata,
}
#[derive(Debug, Clone)]
/// Reconstruction parameters recorded at quantization time.
pub struct QuantizationMetadata {
    /// Scheme used to produce the payload; dequantization dispatches on this.
    pub method: QuantizationMethod,
    /// Elements per block (last block may be short).
    pub block_size: usize,
    /// Flat indices of values stored exactly instead of quantized.
    pub outlier_indices: Option<Vec<usize>>,
    /// The exact f32 values corresponding to `outlier_indices`.
    pub outlier_values: Option<Vec<f32>>,
    /// Whether `scales` went through a second 8-bit quantization pass.
    pub double_quantized: bool,
    /// Compute dtype requested in the config (informational).
    pub compute_dtype: ComputeDataType,
}
/// Stateful quantizer implementing all `QuantizationMethod` variants.
pub struct AdvancedQuantizer {
    config: AdvancedQuantizationConfig,
    // Code -> NF4 level table, built in `new`. Not read by the current
    // scan-based implementation, hence the dead_code allowance.
    #[allow(dead_code)]
    nf4_lookup: HashMap<u8, f32>,
    // NF4 level (as f32 bit pattern) -> code; also currently unused.
    #[allow(dead_code)]
    inv_nf4_lookup: HashMap<u32, u8>,
}
impl AdvancedQuantizer {
/// Builds a quantizer, precomputing the forward and inverse NF4 code tables.
/// (The tables are currently unused fast-path material; see the `dead_code`
/// allowances on the struct fields.)
pub fn new(config: AdvancedQuantizationConfig) -> Self {
    let nf4_lookup: HashMap<u8, f32> = NF4_QUANT_TABLE
        .iter()
        .enumerate()
        .map(|(code, &level)| (code as u8, level))
        .collect();
    let inv_nf4_lookup: HashMap<u32, u8> = NF4_QUANT_TABLE
        .iter()
        .enumerate()
        .map(|(code, &level)| (level.to_bits(), code as u8))
        .collect();
    Self {
        config,
        nf4_lookup,
        inv_nf4_lookup,
    }
}
/// Quantizes `tensor` using the method selected in the configuration.
pub fn quantize(&self, tensor: &Tensor) -> Result<QuantizedTensor> {
    // The bit-width bindings are Copy, so matching the place directly is fine.
    match self.config.method {
        QuantizationMethod::NF4 => self.quantize_nf4(tensor),
        QuantizationMethod::FP4 => self.quantize_fp4(tensor),
        QuantizationMethod::Int4Asymmetric => self.quantize_int4_asymmetric(tensor),
        QuantizationMethod::Int8BlockWise => self.quantize_int8_blockwise(tensor),
        QuantizationMethod::MixedPrecision {
            primary_bits,
            outlier_bits,
        } => self.quantize_mixed_precision(tensor, primary_bits, outlier_bits),
    }
}
/// Reconstructs an f32 tensor, dispatching on the method recorded in the
/// tensor's metadata (not on the quantizer's current configuration).
pub fn dequantize(&self, quantized: &QuantizedTensor) -> Result<Tensor> {
    match quantized.metadata.method {
        QuantizationMethod::NF4 => self.dequantize_nf4(quantized),
        QuantizationMethod::FP4 => self.dequantize_fp4(quantized),
        QuantizationMethod::Int4Asymmetric => self.dequantize_int4_asymmetric(quantized),
        QuantizationMethod::Int8BlockWise => self.dequantize_int8_blockwise(quantized),
        QuantizationMethod::MixedPrecision { .. } => self.dequantize_mixed_precision(quantized),
    }
}
fn quantize_nf4(&self, tensor: &Tensor) -> Result<QuantizedTensor> {
let tensor_data = tensor.data_f32()?;
let total_elements = tensor_data.len();
let num_blocks = total_elements.div_ceil(self.config.block_size);
let mut quantized_data = Vec::with_capacity(total_elements.div_ceil(2)); let mut scales = Vec::with_capacity(num_blocks);
let mut outlier_indices = Vec::new();
let mut outlier_values = Vec::new();
for block_idx in 0..num_blocks {
let start_idx = block_idx * self.config.block_size;
let end_idx = (start_idx + self.config.block_size).min(total_elements);
let block_data = &tensor_data[start_idx..end_idx];
let abs_max = block_data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
let mut processed_block = Vec::with_capacity(block_data.len());
for (local_idx, &value) in block_data.iter().enumerate() {
if value.abs() > self.config.outlier_threshold * abs_max {
outlier_indices.push(start_idx + local_idx);
outlier_values.push(value);
processed_block.push(0.0); } else {
processed_block.push(value);
}
}
let scale = abs_max;
scales.push(scale);
for chunk in processed_block.chunks(2) {
let val1 = self.quantize_nf4_value(chunk[0] / scale);
let val2 =
if chunk.len() > 1 { self.quantize_nf4_value(chunk[1] / scale) } else { 0 };
let packed = (val1 & 0xF) | ((val2 & 0xF) << 4);
quantized_data.push(packed);
}
}
let final_scales = if self.config.double_quantization {
self.double_quantize_scales(&scales)?
} else {
scales
};
let outliers = if outlier_indices.is_empty() {
(None, None)
} else {
(Some(outlier_indices), Some(outlier_values))
};
Ok(QuantizedTensor {
data: quantized_data,
scales: final_scales,
zero_points: None,
shape: tensor.shape().to_vec(),
metadata: QuantizationMetadata {
method: QuantizationMethod::NF4,
block_size: self.config.block_size,
outlier_indices: outliers.0,
outlier_values: outliers.1,
double_quantized: self.config.double_quantization,
compute_dtype: self.config.compute_dtype.clone(),
},
})
}
/// Maps a normalized value (expected in [-1, 1]) to the index of the nearest
/// NF4 level. Ties keep the lower index, matching a strict `<` comparison.
fn quantize_nf4_value(&self, value: f32) -> u8 {
    let target = value.clamp(-1.0, 1.0);
    let seed = (0usize, (target - NF4_QUANT_TABLE[0]).abs());
    let (best, _) = NF4_QUANT_TABLE
        .iter()
        .enumerate()
        .skip(1)
        .fold(seed, |(best_idx, best_err), (idx, &level)| {
            let err = (target - level).abs();
            if err < best_err {
                (idx, err)
            } else {
                (best_idx, best_err)
            }
        });
    best as u8
}
/// Quantizes `tensor` to 4-bit float codes (1 sign bit + 3 magnitude-code
/// bits), per-block scaled, two codes packed per byte (low nibble first).
fn quantize_fp4(&self, tensor: &Tensor) -> Result<QuantizedTensor> {
    let tensor_data = tensor.data_f32()?;
    let total_elements = tensor_data.len();
    let num_blocks = total_elements.div_ceil(self.config.block_size);
    let mut quantized_data = Vec::with_capacity(total_elements.div_ceil(2));
    let mut scales = Vec::with_capacity(num_blocks);
    for block_data in tensor_data.chunks(self.config.block_size) {
        let abs_max = block_data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
        // BUG FIX: the largest representable FP4 magnitude in this scheme is
        // 6.0 (see `dequantize_fp4_value`), not 7.0. Dividing by 7.0 made the
        // block maximum reconstruct at only 6/7 of its value. Also guard the
        // all-zero block so we never divide by a zero scale (0/0 -> NaN).
        let scale = if abs_max > 0.0 { abs_max / 6.0 } else { 1.0 };
        scales.push(scale);
        for chunk in block_data.chunks(2) {
            let val1 = self.quantize_fp4_value(chunk[0] / scale);
            let val2 = if chunk.len() > 1 {
                self.quantize_fp4_value(chunk[1] / scale)
            } else {
                0
            };
            let packed = (val1 & 0xF) | ((val2 & 0xF) << 4);
            quantized_data.push(packed);
        }
    }
    Ok(QuantizedTensor {
        data: quantized_data,
        scales,
        zero_points: None,
        shape: tensor.shape().to_vec(),
        metadata: QuantizationMetadata {
            method: QuantizationMethod::FP4,
            block_size: self.config.block_size,
            outlier_indices: None,
            outlier_values: None,
            double_quantized: false,
            compute_dtype: self.config.compute_dtype.clone(),
        },
    })
}
/// Encodes one (already scaled) value as a 4-bit code: bit 3 is the sign,
/// bits 0..2 select a magnitude from {0, 0.5, 1, 1.5, 2, 3, 4, 6}.
fn quantize_fp4_value(&self, value: f32) -> u8 {
    if value == 0.0 {
        return 0;
    }
    // Decision boundaries are the midpoints between adjacent representable
    // magnitudes; a value above every midpoint takes the top code (7).
    const MIDPOINTS: [f32; 7] = [0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5.0];
    let sign = u8::from(value < 0.0);
    let abs_val = value.abs();
    let code = MIDPOINTS
        .iter()
        .position(|&upper| abs_val <= upper)
        .map_or(7u8, |i| i as u8);
    (sign << 3) | (code & 0x7)
}
/// Quantizes `tensor` to asymmetric 4-bit integers: per block,
/// `q = round(v / scale + zero_point)` clamped to [0, 15], with
/// `scale = (max - min) / 15` and `zero_point = -min / scale`.
fn quantize_int4_asymmetric(&self, tensor: &Tensor) -> Result<QuantizedTensor> {
    let tensor_data = tensor.data_f32()?;
    let total_elements = tensor_data.len();
    let num_blocks = total_elements.div_ceil(self.config.block_size);
    let mut quantized_data = Vec::with_capacity(total_elements.div_ceil(2));
    let mut scales = Vec::with_capacity(num_blocks);
    let mut zero_points = Vec::with_capacity(num_blocks);
    for block_data in tensor_data.chunks(self.config.block_size) {
        let min_val = block_data.iter().fold(f32::INFINITY, |a, &b| a.min(b));
        let max_val = block_data.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
        // BUG FIX: a constant block has max == min, which previously produced
        // scale = 0 and zero_point = ±inf/NaN. With scale = 1.0 every element
        // maps to code 0 and dequantizes back to exactly `min_val`.
        let range = max_val - min_val;
        let scale = if range > 0.0 { range / 15.0 } else { 1.0 };
        let zero_point = -min_val / scale;
        scales.push(scale);
        zero_points.push(zero_point);
        for chunk in block_data.chunks(2) {
            let val1 = ((chunk[0] / scale + zero_point).round() as i32).clamp(0, 15) as u8;
            let val2 = if chunk.len() > 1 {
                ((chunk[1] / scale + zero_point).round() as i32).clamp(0, 15) as u8
            } else {
                0
            };
            let packed = (val1 & 0xF) | ((val2 & 0xF) << 4);
            quantized_data.push(packed);
        }
    }
    Ok(QuantizedTensor {
        data: quantized_data,
        scales,
        zero_points: Some(zero_points),
        shape: tensor.shape().to_vec(),
        metadata: QuantizationMetadata {
            method: QuantizationMethod::Int4Asymmetric,
            block_size: self.config.block_size,
            outlier_indices: None,
            outlier_values: None,
            double_quantized: false,
            compute_dtype: self.config.compute_dtype.clone(),
        },
    })
}
/// Quantizes `tensor` to symmetric 8-bit integers, one byte per element,
/// with a per-block absmax scale (`scale = abs_max / 127`).
fn quantize_int8_blockwise(&self, tensor: &Tensor) -> Result<QuantizedTensor> {
    let tensor_data = tensor.data_f32()?;
    let total_elements = tensor_data.len();
    let num_blocks = total_elements.div_ceil(self.config.block_size);
    let mut quantized_data = Vec::with_capacity(total_elements);
    let mut scales = Vec::with_capacity(num_blocks);
    for block in tensor_data.chunks(self.config.block_size) {
        let abs_max = block.iter().fold(0.0f32, |acc, &v| acc.max(v.abs()));
        let scale = abs_max / 127.0;
        scales.push(scale);
        // i8 codes are stored reinterpreted as u8; dequantization casts back.
        quantized_data.extend(
            block
                .iter()
                .map(|&v| ((v / scale).round() as i32).clamp(-127, 127) as i8 as u8),
        );
    }
    Ok(QuantizedTensor {
        data: quantized_data,
        scales,
        zero_points: None,
        shape: tensor.shape().to_vec(),
        metadata: QuantizationMetadata {
            method: QuantizationMethod::Int8BlockWise,
            block_size: self.config.block_size,
            outlier_indices: None,
            outlier_values: None,
            double_quantized: false,
            compute_dtype: self.config.compute_dtype.clone(),
        },
    })
}
/// Mixed-precision quantization entry point.
///
/// NOTE(review): the primary/outlier bit widths are currently ignored and
/// this simply delegates to the NF4 path (which already separates outliers
/// into exact storage); a true mixed-precision implementation is TODO.
fn quantize_mixed_precision(
    &self,
    tensor: &Tensor,
    _primary_bits: u8,
    _outlier_bits: u8,
) -> Result<QuantizedTensor> {
    self.quantize_nf4(tensor)
}
/// Reconstructs an f32 tensor from NF4-quantized data: unpacks two codes per
/// byte, looks each up in `NF4_QUANT_TABLE`, applies the per-block scale, and
/// finally restores exactly-stored outliers.
fn dequantize_nf4(&self, quantized: &QuantizedTensor) -> Result<Tensor> {
    let total_elements: usize = quantized.shape.iter().product();
    let mut dequantized_data = vec![0.0f32; total_elements];
    // One scale per block; the block count is implied by the scales vector.
    let num_blocks = quantized.scales.len();
    let mut data_idx = 0;
    let mut elem_idx = 0;
    for block_idx in 0..num_blocks {
        let scale = quantized.scales[block_idx];
        // The final block may be short when the element count is not a
        // multiple of the block size.
        let block_size = if block_idx == num_blocks - 1 {
            total_elements - block_idx * quantized.metadata.block_size
        } else {
            quantized.metadata.block_size
        };
        let mut block_elem_count = 0;
        while block_elem_count < block_size && data_idx < quantized.data.len() {
            let packed = quantized.data[data_idx];
            data_idx += 1;
            // Two 4-bit codes per byte: low nibble first, then high nibble.
            let val1_idx = (packed & 0xF) as usize;
            let val2_idx = ((packed >> 4) & 0xF) as usize;
            if elem_idx < total_elements {
                dequantized_data[elem_idx] = NF4_QUANT_TABLE[val1_idx] * scale;
                elem_idx += 1;
                block_elem_count += 1;
            }
            // The high nibble is padding when the block had an odd length;
            // the count guard skips it so the next block starts cleanly.
            if block_elem_count < block_size && elem_idx < total_elements {
                dequantized_data[elem_idx] = NF4_QUANT_TABLE[val2_idx] * scale;
                elem_idx += 1;
                block_elem_count += 1;
            }
        }
    }
    // Outliers were stored verbatim at quantization time; restore them exactly.
    if let (Some(outlier_indices), Some(outlier_values)) = (
        &quantized.metadata.outlier_indices,
        &quantized.metadata.outlier_values,
    ) {
        for (&idx, &value) in outlier_indices.iter().zip(outlier_values.iter()) {
            if idx < dequantized_data.len() {
                dequantized_data[idx] = value;
            }
        }
    }
    Tensor::from_vec(dequantized_data, &quantized.shape)
}
/// Reconstructs an f32 tensor from FP4-quantized data: unpacks two codes per
/// byte, decodes each via `dequantize_fp4_value`, and applies the per-block
/// scale. Mirrors the block/packing walk of `dequantize_nf4`.
fn dequantize_fp4(&self, quantized: &QuantizedTensor) -> Result<Tensor> {
    let total_elements: usize = quantized.shape.iter().product();
    let mut dequantized_data = vec![0.0f32; total_elements];
    let num_blocks = quantized.scales.len();
    let mut data_idx = 0;
    let mut elem_idx = 0;
    for block_idx in 0..num_blocks {
        let scale = quantized.scales[block_idx];
        // Final block may be short.
        let block_size = if block_idx == num_blocks - 1 {
            total_elements - block_idx * quantized.metadata.block_size
        } else {
            quantized.metadata.block_size
        };
        let mut block_elem_count = 0;
        while block_elem_count < block_size && data_idx < quantized.data.len() {
            let packed = quantized.data[data_idx];
            data_idx += 1;
            // Low nibble first, then high nibble.
            let val1_bits = packed & 0xF;
            let val2_bits = (packed >> 4) & 0xF;
            if elem_idx < total_elements {
                dequantized_data[elem_idx] = self.dequantize_fp4_value(val1_bits) * scale;
                elem_idx += 1;
                block_elem_count += 1;
            }
            // Skip the padding nibble at the end of an odd-length block.
            if block_elem_count < block_size && elem_idx < total_elements {
                dequantized_data[elem_idx] = self.dequantize_fp4_value(val2_bits) * scale;
                elem_idx += 1;
                block_elem_count += 1;
            }
        }
    }
    Tensor::from_vec(dequantized_data, &quantized.shape)
}
/// Decodes one 4-bit FP4 code: bit 3 is the sign, bits 0..2 index a fixed
/// magnitude table {0, 0.5, 1, 1.5, 2, 3, 4, 6}. Code 0 is exact zero.
fn dequantize_fp4_value(&self, bits: u8) -> f32 {
    if bits == 0 {
        return 0.0;
    }
    const MAGNITUDES: [f32; 8] = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0];
    let magnitude = MAGNITUDES[(bits & 0x7) as usize];
    if (bits >> 3) & 1 == 1 {
        -magnitude
    } else {
        magnitude
    }
}
/// Reconstructs an f32 tensor from asymmetric 4-bit data via
/// `v = (q - zero_point) * scale`, walking blocks and packed nibbles the same
/// way as `dequantize_nf4`. Fails if the zero points are missing.
fn dequantize_int4_asymmetric(&self, quantized: &QuantizedTensor) -> Result<Tensor> {
    let total_elements: usize = quantized.shape.iter().product();
    let mut dequantized_data = vec![0.0f32; total_elements];
    let num_blocks = quantized.scales.len();
    let zero_points = quantized.zero_points.as_ref().ok_or_else(|| {
        TrustformersError::config_error(
            "Zero points required for asymmetric quantization",
            "dequantize_int4",
        )
    })?;
    let mut data_idx = 0;
    let mut elem_idx = 0;
    for block_idx in 0..num_blocks {
        let scale = quantized.scales[block_idx];
        let zero_point = zero_points[block_idx];
        // Final block may be short.
        let block_size = if block_idx == num_blocks - 1 {
            total_elements - block_idx * quantized.metadata.block_size
        } else {
            quantized.metadata.block_size
        };
        let mut block_elem_count = 0;
        while block_elem_count < block_size && data_idx < quantized.data.len() {
            let packed = quantized.data[data_idx];
            data_idx += 1;
            // Low nibble first, then high nibble.
            let val1 = (packed & 0xF) as f32;
            let val2 = ((packed >> 4) & 0xF) as f32;
            if elem_idx < total_elements {
                dequantized_data[elem_idx] = (val1 - zero_point) * scale;
                elem_idx += 1;
                block_elem_count += 1;
            }
            // Skip the padding nibble at the end of an odd-length block.
            if block_elem_count < block_size && elem_idx < total_elements {
                dequantized_data[elem_idx] = (val2 - zero_point) * scale;
                elem_idx += 1;
                block_elem_count += 1;
            }
        }
    }
    Tensor::from_vec(dequantized_data, &quantized.shape)
}
/// Reconstructs an f32 tensor from symmetric 8-bit data: each byte is
/// reinterpreted as `i8` and multiplied by its block's scale.
fn dequantize_int8_blockwise(&self, quantized: &QuantizedTensor) -> Result<Tensor> {
    let total_elements: usize = quantized.shape.iter().product();
    let mut dequantized_data = vec![0.0f32; total_elements];
    let num_blocks = quantized.scales.len();
    // One byte per element, so the payload cursor and the output cursor
    // always advance together and can be a single index.
    let mut cursor = 0usize;
    for (block_idx, &scale) in quantized.scales.iter().enumerate() {
        // Final block may be short.
        let block_len = if block_idx + 1 == num_blocks {
            total_elements - block_idx * quantized.metadata.block_size
        } else {
            quantized.metadata.block_size
        };
        for _ in 0..block_len {
            if cursor < quantized.data.len() && cursor < total_elements {
                let code = quantized.data[cursor] as i8;
                dequantized_data[cursor] = code as f32 * scale;
                cursor += 1;
            }
        }
    }
    Tensor::from_vec(dequantized_data, &quantized.shape)
}
/// Mixed-precision dequantization; delegates to the NF4 path, matching the
/// delegation in `quantize_mixed_precision`.
fn dequantize_mixed_precision(&self, quantized: &QuantizedTensor) -> Result<Tensor> {
    self.dequantize_nf4(quantized)
}
/// Second-level quantization of the per-block scales: symmetric 8-bit
/// round-trip against the global max, returning the reconstructed (lossy)
/// scales. The output length always equals the input length.
fn double_quantize_scales(&self, scales: &[f32]) -> Result<Vec<f32>> {
    if scales.is_empty() {
        return Ok(Vec::new());
    }
    let max_scale = scales.iter().fold(0.0f32, |acc, &s| acc.max(s.abs()));
    let scale_scale = max_scale / 127.0;
    let requantized = scales
        .iter()
        .map(|&s| {
            let code = ((s / scale_scale).round() as i32).clamp(-127, 127) as i8;
            code as f32 * scale_scale
        })
        .collect();
    Ok(requantized)
}
/// Computes size and compression statistics for a quantized tensor.
///
/// Original size assumes 4 bytes (f32) per element. Compressed size counts
/// the packed payload plus all side tables. BUG FIX: zero points and outlier
/// storage were previously ignored, overstating the compression ratio for
/// asymmetric and outlier-bearing tensors.
pub fn get_stats(&self, quantized: &QuantizedTensor) -> QuantizationStats {
    let original_size = quantized.shape.iter().product::<usize>() * 4;
    let outlier_count = quantized
        .metadata
        .outlier_indices
        .as_ref()
        .map(|indices| indices.len())
        .unwrap_or(0);
    // Per-block zero points are stored as f32 (4 bytes each) when present.
    let zero_point_bytes = quantized.zero_points.as_ref().map_or(0, |zp| zp.len() * 4);
    // Each outlier costs one stored index plus one exact f32 value.
    let outlier_bytes = outlier_count * (std::mem::size_of::<usize>() + 4);
    let compressed_size =
        quantized.data.len() + quantized.scales.len() * 4 + zero_point_bytes + outlier_bytes;
    let compression_ratio = original_size as f32 / compressed_size as f32;
    QuantizationStats {
        original_size_bytes: original_size,
        compressed_size_bytes: compressed_size,
        compression_ratio,
        method: quantized.metadata.method.clone(),
        block_size: quantized.metadata.block_size,
        outlier_count,
    }
}
}
#[derive(Debug, Clone)]
/// Size/compression summary produced by `AdvancedQuantizer::get_stats`.
pub struct QuantizationStats {
    /// Logical size of the original tensor, assuming f32 elements.
    pub original_size_bytes: usize,
    /// Size of the quantized payload plus side tables.
    pub compressed_size_bytes: usize,
    /// `original_size_bytes / compressed_size_bytes`.
    pub compression_ratio: f32,
    /// Method recorded in the tensor's metadata.
    pub method: QuantizationMethod,
    /// Block size recorded in the tensor's metadata.
    pub block_size: usize,
    /// Number of exactly-stored outlier values (0 when none).
    pub outlier_count: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    // NF4 round-trip: quantize, dequantize, and check element-wise error
    // against a generous (4-bit-appropriate) tolerance.
    #[test]
    fn test_nf4_quantization() -> Result<()> {
        let config = AdvancedQuantizationConfig::default();
        let quantizer = AdvancedQuantizer::new(config);
        let data = vec![0.1, -0.5, 0.8, -1.2, 0.0, 0.3, -0.7, 2.1];
        let tensor = Tensor::from_vec(data.clone(), &[8])?;
        let quantized = quantizer.quantize(&tensor)?;
        assert_eq!(quantized.metadata.method, QuantizationMethod::NF4);
        let dequantized = quantizer.dequantize(&quantized)?;
        let dequant_data = dequantized.data_f32()?;
        for (orig, dequant) in data.iter().zip(dequant_data.iter()) {
            let abs_error = (orig - dequant).abs();
            let rel_error = abs_error / orig.abs().max(1e-6);
            // Accept either bound: relative for large values, absolute near zero.
            let tolerance_met = rel_error < 1.0 || abs_error < 0.5;
            assert!(
                tolerance_met,
                "Quantization error too large: {} vs {} (abs_error: {:.4}, rel_error: {:.4})",
                orig, dequant, abs_error, rel_error
            );
        }
        Ok(())
    }

    // FP4 round-trip: only checks that the element count is preserved.
    #[test]
    fn test_fp4_quantization() -> Result<()> {
        let config = AdvancedQuantizationConfig {
            method: QuantizationMethod::FP4,
            ..AdvancedQuantizationConfig::default()
        };
        let quantizer = AdvancedQuantizer::new(config);
        let data = vec![0.5, -1.0, 2.0, -3.5, 0.0, 1.5, -0.25, 4.0];
        let tensor = Tensor::from_vec(data.clone(), &[8])?;
        let quantized = quantizer.quantize(&tensor)?;
        let dequantized = quantizer.dequantize(&quantized)?;
        let dequant_data = dequantized.data_f32()?;
        assert_eq!(dequant_data.len(), data.len());
        Ok(())
    }

    // NF4 with 1024 f32 elements should comfortably beat 4x compression.
    #[test]
    fn test_quantization_stats() -> Result<()> {
        let config = AdvancedQuantizationConfig::default();
        let quantizer = AdvancedQuantizer::new(config);
        let data = vec![1.0; 1024];
        let tensor = Tensor::from_vec(data, &[1024])?;
        let quantized = quantizer.quantize(&tensor)?;
        let stats = quantizer.get_stats(&quantized);
        assert!(stats.compression_ratio > 4.0);
        assert_eq!(stats.original_size_bytes, 1024 * 4);
        Ok(())
    }

    // Asymmetric int4 must carry zero points and preserve element count.
    #[test]
    fn test_int4_asymmetric_quantization() -> Result<()> {
        let config = AdvancedQuantizationConfig {
            method: QuantizationMethod::Int4Asymmetric,
            ..AdvancedQuantizationConfig::default()
        };
        let quantizer = AdvancedQuantizer::new(config);
        let data = vec![0.1, 0.3, 0.7, 0.9, 1.1, 1.3, 1.7, 1.9];
        let tensor = Tensor::from_vec(data.clone(), &[8])?;
        let quantized = quantizer.quantize(&tensor)?;
        assert!(quantized.zero_points.is_some());
        let dequantized = quantizer.dequantize(&quantized)?;
        let dequant_data = dequantized.data_f32()?;
        assert_eq!(dequant_data.len(), data.len());
        Ok(())
    }

    // The precomputed code->level lookup must mirror NF4_QUANT_TABLE endpoints.
    #[test]
    fn test_nf4_lookup_table() {
        let config = AdvancedQuantizationConfig::default();
        let quantizer = AdvancedQuantizer::new(config);
        assert_eq!(quantizer.nf4_lookup.len(), 16);
        assert_eq!(quantizer.nf4_lookup[&0], -1.0);
        assert_eq!(quantizer.nf4_lookup[&7], 0.0);
        assert_eq!(quantizer.nf4_lookup[&15], 1.0);
    }

    // Structural invariant of the NF4 level table.
    #[test]
    fn test_nf4_table_monotone_increasing() {
        for i in 1..NF4_QUANT_TABLE.len() {
            assert!(
                NF4_QUANT_TABLE[i] > NF4_QUANT_TABLE[i - 1],
                "NF4 table must be strictly monotone: table[{}]={} <= table[{}]={}",
                i,
                NF4_QUANT_TABLE[i],
                i - 1,
                NF4_QUANT_TABLE[i - 1]
            );
        }
    }

    #[test]
    fn test_nf4_table_zero_at_index_7() {
        assert_eq!(
            NF4_QUANT_TABLE[7], 0.0,
            "NF4 table index 7 must be 0.0 (zero is always a quantization level)"
        );
    }

    #[test]
    fn test_nf4_table_values_in_unit_range() {
        for &v in NF4_QUANT_TABLE.iter() {
            assert!(
                (-1.0..=1.0).contains(&v),
                "NF4 quantile {} must be in [-1, 1]",
                v
            );
        }
    }

    // Pure-math check of the SmoothQuant migration formula at alpha = 0.5:
    // scaled activation and weight magnitudes must coincide.
    #[test]
    fn test_smoothquant_migration_scale_balance() {
        let alpha = 0.5f64;
        let act_max = [10.0f64, 5.0, 20.0];
        let weight_max = [1.0f64, 2.0, 0.5];
        for i in 0..act_max.len() {
            let scale = weight_max[i].powf(alpha) / act_max[i].powf(1.0 - alpha);
            let new_act = act_max[i] * scale;
            let new_weight = weight_max[i] / scale;
            assert!(
                (new_act - new_weight).abs() < 1e-9,
                "SmoothQuant α=0.5 must equalise act/weight magnitudes; {} != {}",
                new_act,
                new_weight
            );
        }
    }

    // Higher alpha shifts more of the dynamic range onto the weights,
    // leaving a larger post-scale activation.
    #[test]
    fn test_smoothquant_higher_alpha_smoothes_activations_more() {
        let act_max = 16.0f64;
        let weight_max = 1.0f64;
        let scale_low_alpha = weight_max.powf(0.25) / act_max.powf(0.75);
        let scale_high_alpha = weight_max.powf(0.75) / act_max.powf(0.25);
        let new_act_low = act_max * scale_low_alpha;
        let new_act_high = act_max * scale_high_alpha;
        assert!(
            new_act_high > new_act_low,
            "Higher α must give larger activation smoothing; got {} <= {}",
            new_act_high,
            new_act_low
        );
    }

    // Standalone outlier-selection arithmetic on a toy block.
    // NOTE(review): the first filter's threshold cancels to plain abs_max,
    // so `_outliers` is intentionally unused; only the strict 0.9*abs_max
    // filter is asserted.
    #[test]
    fn test_outlier_detection_threshold() {
        let outlier_threshold = 6.0f32;
        let block = [0.1f32, 0.3, 50.0, -0.2, 0.8];
        let abs_max = block.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
        let _outliers: Vec<usize> = block
            .iter()
            .enumerate()
            .filter(|(_, &v)| v.abs() > outlier_threshold * abs_max / outlier_threshold)
            .map(|(i, _)| i)
            .collect();
        let strict_outliers: Vec<usize> = block
            .iter()
            .enumerate()
            .filter(|(_, &v)| v.abs() > abs_max * 0.9)
            .map(|(i, _)| i)
            .collect();
        assert_eq!(
            strict_outliers,
            vec![2],
            "Only index 2 (50.0) should be an outlier"
        );
    }

    // The stats outlier count must agree with the metadata's outlier list.
    #[test]
    fn test_quantization_stats_tracks_outliers() -> Result<()> {
        let config = AdvancedQuantizationConfig {
            method: QuantizationMethod::NF4,
            outlier_threshold: 0.01,
            ..Default::default()
        };
        let quantizer = AdvancedQuantizer::new(config);
        let data: Vec<f32> = (0..64).map(|i| (i as f32) * 0.1).collect();
        let tensor = Tensor::from_vec(data, &[64])?;
        let quantized = quantizer.quantize(&tensor)?;
        let stats = quantizer.get_stats(&quantized);
        assert_eq!(
            stats.outlier_count,
            quantized.metadata.outlier_indices.as_ref().map(|v| v.len()).unwrap_or(0)
        );
        Ok(())
    }

    // AWQ-style argument: scaling a weight up before quantization shrinks the
    // worst-case rounding error bound by the same factor.
    #[test]
    fn test_awq_scaling_reduces_quantization_error() {
        let delta = 0.25f64;
        let quantize = |x: f64| (x / delta).round() * delta;
        let w = 0.137f64;
        let act_scale = 2.0f64;
        let _err_without = (w - quantize(w)).abs();
        let _err_with = (w - quantize(w * act_scale) / act_scale).abs();
        let bound_with = delta / act_scale;
        let bound_without = delta;
        assert!(
            bound_with < bound_without,
            "AWQ scaling must reduce quantization error bound: {} < {}",
            bound_with,
            bound_without
        );
    }

    // Uniform 4-bit quantization over [-1, 1] must keep every error within
    // half a step; inputs come from a deterministic LCG.
    #[test]
    fn test_per_group_quantization_error_bound() {
        let bits = 4u32;
        let levels = (1u32 << bits) - 1;
        let min_val = -1.0f64;
        let max_val = 1.0f64;
        let scale = (max_val - min_val) / levels as f64;
        let max_error = scale / 2.0;
        let a: u64 = 6364136223846793005;
        let c_lcg: u64 = 1442695040888963407;
        let mut lcg: u64 = 0x1122_3344_5566_7788;
        for _ in 0..32 {
            lcg = lcg.wrapping_mul(a).wrapping_add(c_lcg);
            let x = min_val + (lcg as f64 / u64::MAX as f64) * (max_val - min_val);
            let q = ((x - min_val) / scale).round().clamp(0.0, levels as f64);
            let x_hat = q * scale + min_val;
            let error = (x - x_hat).abs();
            assert!(
                error <= max_error + 1e-10,
                "Quantization error {} must be <= bound {}",
                error,
                max_error
            );
        }
    }

    // Double quantization is lossy on scale values but must not change the
    // number of per-block scales.
    #[test]
    fn test_double_quantization_preserves_scale_count() -> Result<()> {
        let config = AdvancedQuantizationConfig {
            double_quantization: true,
            ..Default::default()
        };
        let quantizer = AdvancedQuantizer::new(config);
        let data = vec![0.5f32; 128];
        let tensor = Tensor::from_vec(data, &[128])?;
        let quantized = quantizer.quantize(&tensor)?;
        let expected_blocks = 128usize.div_ceil(quantizer.config.block_size);
        assert_eq!(quantized.scales.len(), expected_blocks);
        Ok(())
    }

    // Mixed precision (currently an NF4 delegate) round-trips with the same
    // element count.
    #[test]
    fn test_mixed_precision_config() -> Result<()> {
        let config = AdvancedQuantizationConfig {
            method: QuantizationMethod::MixedPrecision {
                primary_bits: 4,
                outlier_bits: 8,
            },
            ..Default::default()
        };
        let quantizer = AdvancedQuantizer::new(config);
        let data = vec![0.1f32, 0.2, 0.3, 0.4, 10.0, 0.5, 0.6, 0.7];
        let tensor = Tensor::from_vec(data.clone(), &[8])?;
        let quantized = quantizer.quantize(&tensor)?;
        let dequantized = quantizer.dequantize(&quantized)?;
        let dequant_data = dequantized.data_f32()?;
        assert_eq!(
            dequant_data.len(),
            data.len(),
            "Mixed-precision must preserve element count"
        );
        Ok(())
    }

    // Int8 (1 byte/elem + scales) against f32 must exceed 2x compression.
    #[test]
    fn test_int8_blockwise_compression_ratio() -> Result<()> {
        let config = AdvancedQuantizationConfig {
            method: QuantizationMethod::Int8BlockWise,
            block_size: 32,
            double_quantization: false,
            ..Default::default()
        };
        let quantizer = AdvancedQuantizer::new(config);
        let data = vec![1.0f32; 128];
        let tensor = Tensor::from_vec(data, &[128])?;
        let quantized = quantizer.quantize(&tensor)?;
        let stats = quantizer.get_stats(&quantized);
        assert!(
            stats.compression_ratio > 2.0,
            "Int8 block-wise must achieve > 2x compression; got {}",
            stats.compression_ratio
        );
        Ok(())
    }
}