use crate::errors::{Result, TrustformersError};
use crate::tensor::Tensor;
use serde::{Deserialize, Serialize};
use std::f32;
/// K-quant block formats. Each format packs 256 weights into one
/// "super-block" of sub-blocks that share a quantized per-sub-block scale.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[allow(non_camel_case_types)]
pub enum KQuantType {
/// 2-bit weight codes, 4-bit sub-block scales (2.5625 bits/weight effective).
Q2_K,
/// 3-bit weight codes, 6-bit sub-block scales (3.4375 bits/weight effective).
Q3_K,
/// 4-bit weight codes, 6-bit sub-block scales (4.5 bits/weight effective).
Q4_K,
}
impl KQuantType {
    /// Number of weights held by one super-block (common to all formats).
    pub fn superblock_size(&self) -> usize {
        256
    }

    /// How many sub-blocks each super-block is divided into.
    pub fn num_subblocks(&self) -> usize {
        match self {
            KQuantType::Q2_K | KQuantType::Q3_K => 16,
            KQuantType::Q4_K => 8,
        }
    }

    /// Number of weights per sub-block.
    pub fn subblock_size(&self) -> usize {
        self.superblock_size() / self.num_subblocks()
    }

    /// Effective storage cost per weight, including scale/min overhead.
    pub fn bits_per_weight(&self) -> f32 {
        match self {
            KQuantType::Q2_K => 2.5625,
            KQuantType::Q3_K => 3.4375,
            KQuantType::Q4_K => 4.5,
        }
    }

    /// Bits used by each quantized weight code.
    pub fn weight_bits(&self) -> u8 {
        match self {
            KQuantType::Q2_K => 2,
            KQuantType::Q3_K => 3,
            KQuantType::Q4_K => 4,
        }
    }

    /// Bits used by each per-sub-block scale.
    pub fn scale_bits(&self) -> u8 {
        match self {
            KQuantType::Q2_K => 4,
            KQuantType::Q3_K | KQuantType::Q4_K => 6,
        }
    }

    /// Serialized size of one super-block in bytes.
    pub fn bytes_per_superblock(&self) -> usize {
        match self {
            KQuantType::Q2_K => 84,
            KQuantType::Q3_K => 112,
            KQuantType::Q4_K => 144,
        }
    }
}
// Raw IEEE 754 binary16 value stored as its bit pattern.
type F16 = u16;
/// Structured view of a Q2_K super-block.
///
/// NOTE(review): these `Block*` structs are not referenced by `KQuantizer`
/// in this file (it packs/unpacks raw bytes directly) — confirm external
/// users before relying on or removing them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlockQ2K {
// Super-block scale (f16 bits).
pub d: F16,
// Super-block minimum scale (f16 bits).
pub dmin: F16,
// Packed 4-bit sub-block scales.
pub scales: Vec<u8>,
// Packed 4-bit sub-block minimums.
pub mins: Vec<u8>,
// Packed 2-bit weight codes.
pub qs: Vec<u8>,
}
/// Structured view of a Q3_K super-block (see NOTE on [`BlockQ2K`] usage).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlockQ3K {
// High (7th) bits of the sub-block scales, one bit per sub-block.
pub hmask: Vec<u8>,
// Packed 6-bit sub-block scales.
pub scales: Vec<u8>,
// Super-block scale (f16 bits).
pub d: F16,
// Packed 3-bit weight codes.
pub qs: Vec<u8>,
}
/// Structured view of a Q4_K super-block (see NOTE on [`BlockQ2K`] usage).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BlockQ4K {
// Super-block scale (f16 bits).
pub d: F16,
// Super-block minimum scale (f16 bits).
pub dmin: F16,
// Packed 6-bit sub-block scales.
pub scales: Vec<u8>,
// Packed 6-bit sub-block minimums.
pub mins: Vec<u8>,
// Packed 4-bit weight codes.
pub qs: Vec<u8>,
}
/// Configuration for K-quantization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KQuantConfig {
// Which K-quant block format to emit.
pub quant_type: KQuantType,
// NOTE(review): the three fields below are not read by the code in this
// file — confirm whether other modules consume them.
pub importance_based: bool,
pub outlier_percentile: f32,
pub scale_optimization_iters: usize,
}
impl Default for KQuantConfig {
fn default() -> Self {
Self {
quant_type: KQuantType::Q4_K,
importance_based: true,
outlier_percentile: 0.99,
scale_optimization_iters: 10,
}
}
}
/// A tensor quantized into packed K-quant super-blocks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KQuantTensor {
// Block format used to encode `blocks`.
pub quant_type: KQuantType,
// Original tensor shape.
pub shape: Vec<usize>,
// Serialized super-blocks: `num_blocks * quant_type.bytes_per_superblock()` bytes.
pub blocks: Vec<u8>,
// Number of 256-weight super-blocks stored (last one may be zero-padded).
pub num_blocks: usize,
// Number of real (unpadded) weights in the original tensor.
pub num_weights: usize,
}
/// Encoder/decoder for the K-quant formats, parameterized by [`KQuantConfig`].
pub struct KQuantizer {
config: KQuantConfig,
}
impl KQuantizer {
pub fn new(config: KQuantConfig) -> Result<Self> {
Ok(Self { config })
}
/// Quantizes `tensor` into packed super-blocks of 256 weights each.
///
/// The flattened data is split into super-blocks; a trailing partial block
/// is zero-padded to full size. `dequantize` drops the padding again via
/// `num_weights`.
///
/// # Errors
/// Propagates failures from `Tensor::to_vec_f32` and the per-block encoders.
pub fn quantize(&self, tensor: &Tensor) -> Result<KQuantTensor> {
    let data = tensor.to_vec_f32()?;
    let shape = tensor.shape().to_vec();
    let superblock_size = self.config.quant_type.superblock_size();
    let num_blocks = data.len().div_ceil(superblock_size);
    let bytes_per_block = self.config.quant_type.bytes_per_superblock();
    let mut blocks = Vec::with_capacity(num_blocks * bytes_per_block);
    // Scratch buffer, only populated for the final (padded) partial block;
    // full blocks are encoded straight from `data` without copying.
    let mut padded: Vec<f32> = Vec::new();
    for block_idx in 0..num_blocks {
        let start = block_idx * superblock_size;
        let end = (start + superblock_size).min(data.len());
        let block_data: &[f32] = if end - start == superblock_size {
            &data[start..end]
        } else {
            padded.clear();
            padded.extend_from_slice(&data[start..end]);
            padded.resize(superblock_size, 0.0);
            &padded
        };
        let block_bytes = match self.config.quant_type {
            KQuantType::Q2_K => self.quantize_q2k(block_data)?,
            KQuantType::Q3_K => self.quantize_q3k(block_data)?,
            KQuantType::Q4_K => self.quantize_q4k(block_data)?,
        };
        blocks.extend(block_bytes);
    }
    Ok(KQuantTensor {
        quant_type: self.config.quant_type,
        shape,
        blocks,
        num_blocks,
        num_weights: data.len(),
    })
}
/// Quantizes one 256-weight super-block into the 84-byte Q2_K layout:
/// d (f16) | dmin (f16) | 8 scale bytes | 8 min bytes | 64 weight bytes.
///
/// Each of the 16 sub-blocks stores a 4-bit scale; weights are 2-bit codes
/// decoded as `(q - 1.5) * sub_scale`, i.e. levels {-1.5, -0.5, 0.5, 1.5}.
fn quantize_q2k(&self, data: &[f32]) -> Result<Vec<u8>> {
    assert_eq!(data.len(), 256);
    let num_subblocks = 16;
    let subblock_size = 16;
    let max_abs = data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
    let min_val = data.iter().copied().fold(f32::INFINITY, f32::min);
    // The largest representable magnitude is 1.5 * sub_scale and sub_scale
    // tops out at d (scale_q == 15), so d = max_abs / 1.5 makes every value
    // in the super-block reachable. (The previous max_abs / 3.0 saturated
    // the 4-bit scale and halved the representable range.)
    let d = f32_to_f16(max_abs / 1.5);
    let dmin = f32_to_f16(min_val.abs());
    let d_f32 = f16_to_f32(d);
    let dmin_f32 = f16_to_f32(dmin);
    let mut scales = vec![0u8; 8];
    let mut mins = vec![0u8; 8];
    let mut qs = vec![0u8; 64];
    for sb in 0..num_subblocks {
        let sb_start = sb * subblock_size;
        let sb_data = &data[sb_start..sb_start + subblock_size];
        let sb_max = sb_data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
        let sb_min = sb_data.iter().copied().fold(f32::INFINITY, f32::min);
        // Ideal sub-block scale is sb_max / 1.5; encode it in 4 bits relative
        // to d. Guard d == 0 (all-zero block) instead of relying on NaN
        // casting to 0.
        let scale_q = if d_f32 > 0.0 {
            ((sb_max / (1.5 * d_f32)) * 15.0).round().clamp(0.0, 15.0) as u8
        } else {
            0
        };
        // NOTE(review): dmin/mins are serialized but ignored by the decoder's
        // symmetric reconstruction — confirm before building on them.
        let min_q = if dmin_f32 > 0.0 {
            ((sb_min.abs() / dmin_f32) * 15.0).round().clamp(0.0, 15.0) as u8
        } else {
            0
        };
        // Two 4-bit fields per byte: even sub-block in the low nibble.
        if sb % 2 == 0 {
            scales[sb / 2] = scale_q;
            mins[sb / 2] = min_q;
        } else {
            scales[sb / 2] |= scale_q << 4;
            mins[sb / 2] |= min_q << 4;
        }
        let sb_scale = d_f32 * (scale_q as f32 / 15.0);
        for (i, &weight) in sb_data.iter().enumerate() {
            // Zero scale means the whole sub-block is (numerically) zero;
            // code 0 is fine because the decoder multiplies by that scale.
            let quant = if sb_scale > 0.0 {
                ((weight / sb_scale) + 1.5).round().clamp(0.0, 3.0) as u8
            } else {
                0
            };
            // Four 2-bit codes per byte, LSB-first.
            let pos = sb_start + i;
            qs[pos / 4] |= quant << ((pos % 4) * 2);
        }
    }
    let mut bytes = Vec::with_capacity(84);
    bytes.extend(&d.to_le_bytes());
    bytes.extend(&dmin.to_le_bytes());
    bytes.extend(&scales);
    bytes.extend(&mins);
    bytes.extend(&qs);
    Ok(bytes)
}
/// Quantizes one 256-weight super-block into the 112-byte Q3_K layout:
/// 2 hmask bytes | 12 scale bytes | d (f16) | 96 weight bytes.
///
/// Each of the 16 sub-blocks stores a 6-bit scale (7th bit in `hmask`);
/// weights are 3-bit codes decoded as `(q - 4) * sub_scale`.
fn quantize_q3k(&self, data: &[f32]) -> Result<Vec<u8>> {
    assert_eq!(data.len(), 256);
    let num_subblocks = 16;
    let subblock_size = 16;
    let max_abs = data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
    // Codes span [-4, 3] * sub_scale, so d = max_abs / 3.0 keeps the
    // strongest sub-block's scale inside the 6-bit range. (The previous
    // max_abs / 7.0 clamped most scales hard.)
    let d = f32_to_f16(max_abs / 3.0);
    let d_f32 = f16_to_f32(d);
    let mut scales = vec![0u8; 12];
    let mut hmask = [0u8; 2];
    let mut qs = vec![0u8; 96];
    for sb in 0..num_subblocks {
        let sb_start = sb * subblock_size;
        let sb_data = &data[sb_start..sb_start + subblock_size];
        let sb_max = sb_data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
        // Guard d == 0 (all-zero block) instead of relying on NaN casts.
        let scale_q = if d_f32 > 0.0 {
            ((sb_max / (3.0 * d_f32)) * 63.0).round().clamp(0.0, 127.0) as u8
        } else {
            0
        };
        let scale_6bit = scale_q & 0x3F;
        // The 7th scale bit is stored out-of-band, one bit per sub-block.
        if scale_q & 0x40 != 0 {
            hmask[sb / 8] |= 1 << (sb % 8);
        }
        // Pack the 6-bit scale LSB-first; a field starting past bit 2 spills
        // into the next byte. byte_idx tops out at 11, always in bounds.
        let byte_idx = (sb * 6) / 8;
        let bit_offset = (sb * 6) % 8;
        scales[byte_idx] |= scale_6bit << bit_offset;
        if bit_offset > 2 && byte_idx + 1 < scales.len() {
            scales[byte_idx + 1] |= scale_6bit >> (8 - bit_offset);
        }
        // Quantize against the FULL 7-bit scale: the decoder reconstructs
        // scale_q including the hmask bit, so using only the low 6 bits here
        // (as before) made scales >= 64 round-trip at twice their value.
        let sb_scale = d_f32 * (scale_q as f32 / 63.0);
        for (i, &weight) in sb_data.iter().enumerate() {
            let quant = if sb_scale > 0.0 {
                ((weight / sb_scale) + 4.0).round().clamp(0.0, 7.0) as u8
            } else {
                0
            };
            // 3-bit codes packed LSB-first; codes starting past bit 5 spill
            // into the following byte.
            let bit_idx = (sb_start + i) * 3;
            let b = bit_idx / 8;
            let off = bit_idx % 8;
            qs[b] |= (quant & 0x07) << off;
            if off > 5 && b + 1 < qs.len() {
                qs[b + 1] |= (quant & 0x07) >> (8 - off);
            }
        }
    }
    let mut bytes = Vec::with_capacity(112);
    bytes.extend(&hmask);
    bytes.extend(&scales);
    bytes.extend(&d.to_le_bytes());
    bytes.extend(&qs);
    Ok(bytes)
}
/// Quantizes one 256-weight super-block into the 144-byte Q4_K layout:
/// d (f16) | dmin (f16) | 6 scale bytes | 6 min bytes | 128 weight bytes.
///
/// Each of the 8 sub-blocks stores a 6-bit scale; weights are 4-bit codes
/// decoded as `(q - 8) * sub_scale`.
fn quantize_q4k(&self, data: &[f32]) -> Result<Vec<u8>> {
    assert_eq!(data.len(), 256);
    let num_subblocks = 8;
    let subblock_size = 32;
    let max_abs = data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
    let min_val = data.iter().copied().fold(f32::INFINITY, f32::min);
    // Codes span [-8, 7] * sub_scale, so d = max_abs / 7.0 uses the full
    // 6-bit scale range. (The previous max_abs / 15.0 saturated the scale
    // and halved the representable range.)
    let d = f32_to_f16(max_abs / 7.0);
    let dmin = f32_to_f16(min_val.abs());
    let d_f32 = f16_to_f32(d);
    let dmin_f32 = f16_to_f32(dmin);
    let mut scales = vec![0u8; 6];
    let mut mins = vec![0u8; 6];
    let mut qs = vec![0u8; 128];
    for sb in 0..num_subblocks {
        let sb_start = sb * subblock_size;
        let sb_data = &data[sb_start..sb_start + subblock_size];
        let sb_max = sb_data.iter().map(|x| x.abs()).fold(0.0f32, f32::max);
        let sb_min = sb_data.iter().copied().fold(f32::INFINITY, f32::min);
        // Guard d == 0 (all-zero block) instead of relying on NaN casts.
        let scale_q = if d_f32 > 0.0 {
            ((sb_max / (7.0 * d_f32)) * 63.0).round().clamp(0.0, 63.0) as u8
        } else {
            0
        };
        // NOTE(review): dmin/mins are serialized but ignored by the decoder's
        // symmetric reconstruction — confirm before building on them.
        let min_q = if dmin_f32 > 0.0 {
            ((sb_min.abs() / dmin_f32) * 63.0).round().clamp(0.0, 63.0) as u8
        } else {
            0
        };
        // Pack 6-bit fields LSB-first; fields starting past bit 2 spill into
        // the next byte. byte_idx tops out at 5, always in bounds.
        let byte_idx = (sb * 6) / 8;
        let bit_offset = (sb * 6) % 8;
        scales[byte_idx] |= scale_q << bit_offset;
        if bit_offset > 2 && byte_idx + 1 < scales.len() {
            scales[byte_idx + 1] |= scale_q >> (8 - bit_offset);
        }
        mins[byte_idx] |= min_q << bit_offset;
        if bit_offset > 2 && byte_idx + 1 < mins.len() {
            mins[byte_idx + 1] |= min_q >> (8 - bit_offset);
        }
        let sb_scale = d_f32 * (scale_q as f32 / 63.0);
        for (i, &weight) in sb_data.iter().enumerate() {
            let quant = if sb_scale > 0.0 {
                ((weight / sb_scale) + 8.0).round().clamp(0.0, 15.0) as u8
            } else {
                0
            };
            // Two 4-bit codes per byte: even index in the low nibble.
            let pos = sb_start + i;
            if pos % 2 == 0 {
                qs[pos / 2] = quant;
            } else {
                qs[pos / 2] |= quant << 4;
            }
        }
    }
    let mut bytes = Vec::with_capacity(144);
    bytes.extend(&d.to_le_bytes());
    bytes.extend(&dmin.to_le_bytes());
    bytes.extend(&scales);
    bytes.extend(&mins);
    bytes.extend(&qs);
    Ok(bytes)
}
/// Reconstructs an f32 tensor from packed super-blocks, truncating the
/// zero-padding that was added to fill the final block.
pub fn dequantize(&self, kquant: &KQuantTensor) -> Result<Tensor> {
    let bytes_per_block = kquant.quant_type.bytes_per_superblock();
    let superblock_size = kquant.quant_type.superblock_size();
    let mut values = Vec::with_capacity(kquant.num_weights);
    // `chunks_exact` silently stops at a trailing short chunk, matching the
    // bounds check of the original index-based loop.
    for block_bytes in kquant
        .blocks
        .chunks_exact(bytes_per_block)
        .take(kquant.num_blocks)
    {
        let decoded = match kquant.quant_type {
            KQuantType::Q2_K => self.dequantize_q2k(block_bytes),
            KQuantType::Q3_K => self.dequantize_q3k(block_bytes),
            KQuantType::Q4_K => self.dequantize_q4k(block_bytes),
        }?;
        // The last block may contain padding beyond `num_weights`.
        let take = superblock_size.min(kquant.num_weights - values.len());
        values.extend_from_slice(&decoded[..take]);
    }
    Tensor::from_vec(values, &kquant.shape)
}
/// Decodes one 84-byte Q2_K super-block back to 256 f32 weights.
fn dequantize_q2k(&self, bytes: &[u8]) -> Result<Vec<f32>> {
    if bytes.len() < 84 {
        return Err(TrustformersError::quantization_error(
            "Invalid Q2_K block size".to_string(),
        ));
    }
    let d = f16_to_f32(u16::from_le_bytes([bytes[0], bytes[1]]));
    // dmin/mins are part of the wire format but unused by the symmetric
    // reconstruction below.
    let _dmin = f16_to_f32(u16::from_le_bytes([bytes[2], bytes[3]]));
    let scales = &bytes[4..12];
    let _mins = &bytes[12..20];
    let qs = &bytes[20..84];
    let mut out = Vec::with_capacity(256);
    for sb in 0..16usize {
        // Even sub-blocks live in the low nibble, odd in the high nibble.
        let scale_q = (scales[sb / 2] >> ((sb % 2) * 4)) & 0x0F;
        let sb_scale = d * (scale_q as f32 / 15.0);
        for i in 0..16usize {
            // Four 2-bit codes per byte, LSB-first.
            let pos = sb * 16 + i;
            let quant = (qs[pos / 4] >> ((pos % 4) * 2)) & 0x03;
            out.push(sb_scale * (quant as f32 - 1.5));
        }
    }
    Ok(out)
}
/// Decodes one 112-byte Q3_K super-block back to 256 f32 weights.
fn dequantize_q3k(&self, bytes: &[u8]) -> Result<Vec<f32>> {
    if bytes.len() < 112 {
        return Err(TrustformersError::quantization_error(
            "Invalid Q3_K block size".to_string(),
        ));
    }
    let hmask = &bytes[0..2];
    let scales = &bytes[2..14];
    let d = f16_to_f32(u16::from_le_bytes([bytes[14], bytes[15]]));
    let qs = &bytes[16..112];
    let mut weights = Vec::with_capacity(256);
    for sb in 0..16 {
        let byte_idx = (sb * 6) / 8;
        let bit_offset = (sb * 6) % 8;
        // Reassemble the 6-bit scale, including the bits the encoder spilled
        // into the following byte when bit_offset > 2. (Previously the
        // spilled high bits were dropped, corrupting misaligned scales.)
        let mut scale_q = (scales[byte_idx] >> bit_offset) & 0x3F;
        if bit_offset > 2 && byte_idx + 1 < scales.len() {
            scale_q |= (scales[byte_idx + 1] << (8 - bit_offset)) & 0x3F;
        }
        // The 7th scale bit is stored out-of-band in hmask.
        if hmask[sb / 8] & (1 << (sb % 8)) != 0 {
            scale_q |= 0x40;
        }
        let sb_scale = d * (scale_q as f32 / 63.0);
        for i in 0..16 {
            // 3-bit codes packed LSB-first, spilling past bit 5 into the
            // next byte.
            let bit_idx = (sb * 16 + i) * 3;
            let b = bit_idx / 8;
            let off = bit_idx % 8;
            let mut quant = (qs[b] >> off) & 0x07;
            if off > 5 && b + 1 < qs.len() {
                quant |= (qs[b + 1] << (8 - off)) & 0x07;
            }
            weights.push(sb_scale * (quant as f32 - 4.0));
        }
    }
    Ok(weights)
}
/// Decodes one 144-byte Q4_K super-block back to 256 f32 weights.
fn dequantize_q4k(&self, bytes: &[u8]) -> Result<Vec<f32>> {
    if bytes.len() < 144 {
        return Err(TrustformersError::quantization_error(
            "Invalid Q4_K block size".to_string(),
        ));
    }
    let d = f16_to_f32(u16::from_le_bytes([bytes[0], bytes[1]]));
    // dmin/mins are part of the wire format but unused by the symmetric
    // reconstruction below.
    let _dmin = f16_to_f32(u16::from_le_bytes([bytes[2], bytes[3]]));
    let scales = &bytes[4..10];
    let _mins = &bytes[10..16];
    let qs = &bytes[16..144];
    let mut weights = Vec::with_capacity(256);
    for sb in 0..8 {
        let byte_idx = (sb * 6) / 8;
        let bit_offset = (sb * 6) % 8;
        // Reassemble the 6-bit scale, including the bits the encoder spilled
        // into the following byte when bit_offset > 2. (Previously the
        // spilled high bits were dropped, corrupting misaligned scales.)
        let mut scale_q = (scales[byte_idx] >> bit_offset) & 0x3F;
        if bit_offset > 2 && byte_idx + 1 < scales.len() {
            scale_q |= (scales[byte_idx + 1] << (8 - bit_offset)) & 0x3F;
        }
        let sb_scale = d * (scale_q as f32 / 63.0);
        for i in 0..32 {
            // Two 4-bit codes per byte: even index in the low nibble.
            let pos = sb * 32 + i;
            let byte = qs[pos / 2];
            let quant = if pos % 2 == 0 { byte & 0x0F } else { byte >> 4 };
            weights.push(sb_scale * (quant as f32 - 8.0));
        }
    }
    Ok(weights)
}
}
/// Converts an `f32` to IEEE 754 binary16 bits (stored in a `u16`).
///
/// Mantissa is truncated (round-toward-zero); results below the f16 normal
/// range flush to signed zero; overflow saturates to signed infinity; NaN
/// maps to a canonical quiet NaN (it previously collapsed to infinity,
/// losing NaN-ness).
fn f32_to_f16(val: f32) -> u16 {
    let bits = val.to_bits();
    let sign = ((bits >> 31) as u16) << 15;
    let exp = ((bits >> 23) & 0xFF) as i32;
    let mant = bits & 0x7F_FFFF;
    // All-ones exponent: infinity (mant == 0) or NaN (mant != 0).
    if exp == 0xFF {
        return if mant != 0 { sign | 0x7E00 } else { sign | 0x7C00 };
    }
    // f32 zero/subnormal: far below f16 range, flush to signed zero.
    if exp == 0 {
        return sign;
    }
    let exp_f16 = exp - 127 + 15;
    // Underflow (would need an f16 subnormal): flush to signed zero.
    if exp_f16 <= 0 {
        return sign;
    }
    // Overflow: saturate to signed infinity.
    if exp_f16 >= 31 {
        return sign | 0x7C00;
    }
    sign | ((exp_f16 as u16) << 10) | ((mant >> 13) as u16 & 0x3FF)
}
/// Converts IEEE 754 binary16 bits (stored in a `u16`) to an `f32`.
///
/// Handles signed zero, subnormals (previously flushed to zero), infinities,
/// and NaN (previously collapsed to infinity).
fn f16_to_f32(val: u16) -> f32 {
    let sign = ((val >> 15) & 1) as u32;
    let exp = ((val >> 10) & 0x1F) as i32;
    let mant = (val & 0x3FF) as u32;
    // Zero exponent: signed zero (mant == 0) or subnormal, mant * 2^-24.
    if exp == 0 {
        let magnitude = mant as f32 * 2.0f32.powi(-24);
        return if sign == 1 { -magnitude } else { magnitude };
    }
    // All-ones exponent: NaN when the mantissa is non-zero, else infinity.
    if exp == 31 {
        if mant != 0 {
            return f32::NAN;
        }
        return if sign == 1 { f32::NEG_INFINITY } else { f32::INFINITY };
    }
    let exp_f32 = (exp - 15 + 127) as u32;
    f32::from_bits((sign << 31) | (exp_f32 << 23) | (mant << 13))
}
#[cfg(test)]
mod tests {
use super::*;
// Per-format constants: sub-block counts, bit widths, serialized sizes.
#[test]
fn test_kquant_types() {
let q2k = KQuantType::Q2_K;
assert_eq!(q2k.superblock_size(), 256);
assert_eq!(q2k.num_subblocks(), 16);
assert_eq!(q2k.weight_bits(), 2);
assert_eq!(q2k.bytes_per_superblock(), 84);
let q3k = KQuantType::Q3_K;
assert_eq!(q3k.weight_bits(), 3);
assert_eq!(q3k.bytes_per_superblock(), 112);
let q4k = KQuantType::Q4_K;
assert_eq!(q4k.weight_bits(), 4);
assert_eq!(q4k.bytes_per_superblock(), 144);
}
// 512 weights -> exactly two Q4_K super-blocks; dequantize restores shape.
#[test]
fn test_q4k_quantization() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q4_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..512).map(|i| (i as f32) * 0.01).collect();
let tensor = Tensor::from_vec(data.clone(), &[512])?;
let quantized = quantizer.quantize(&tensor)?;
assert_eq!(quantized.quant_type, KQuantType::Q4_K);
assert_eq!(quantized.num_blocks, 2); assert_eq!(quantized.blocks.len(), 2 * 144);
let dequantized = quantizer.dequantize(&quantized)?;
assert_eq!(dequantized.shape(), &[512]);
Ok(())
}
// Loose round-trip error bound for the coarsest (2-bit) format.
#[test]
fn test_q2k_roundtrip() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q2_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..256).map(|i| ((i as f32) - 128.0) * 0.01).collect();
let tensor = Tensor::from_vec(data.clone(), &[256])?;
let quantized = quantizer.quantize(&tensor)?;
let dequantized = quantizer.dequantize(&quantized)?;
let deq_data = dequantized.to_vec_f32()?;
for (orig, deq) in data.iter().zip(deq_data.iter()) {
let abs_error = (orig - deq).abs();
assert!(abs_error < 1.5, "Error too large: {} vs {}", orig, deq);
}
Ok(())
}
// f32 -> f16 -> f32 relative error stays within f16 precision for
// ordinary magnitudes.
#[test]
fn test_f16_conversion() {
let values = vec![0.0, 1.0, -1.0, 10.5, -10.5, 0.123, -0.123];
for &val in &values {
let f16 = f32_to_f16(val);
let recovered = f16_to_f32(f16);
let rel_error = (val - recovered).abs() / (val.abs() + 1e-6);
assert!(
rel_error < 0.001,
"FP16 conversion error: {} vs {}",
val,
recovered
);
}
}
// All formats share the 256-weight super-block.
#[test]
fn test_kquant_type_superblock_size_all() {
assert_eq!(KQuantType::Q2_K.superblock_size(), 256);
assert_eq!(KQuantType::Q3_K.superblock_size(), 256);
assert_eq!(KQuantType::Q4_K.superblock_size(), 256);
}
// superblock_size / num_subblocks per format.
#[test]
fn test_kquant_type_subblock_sizes() {
assert_eq!(KQuantType::Q2_K.subblock_size(), 16);
assert_eq!(KQuantType::Q3_K.subblock_size(), 16);
assert_eq!(KQuantType::Q4_K.subblock_size(), 32);
}
// Effective bits/weight including scale overhead.
#[test]
fn test_kquant_type_bits_per_weight() {
assert!((KQuantType::Q2_K.bits_per_weight() - 2.5625).abs() < 1e-4);
assert!((KQuantType::Q3_K.bits_per_weight() - 3.4375).abs() < 1e-4);
assert!((KQuantType::Q4_K.bits_per_weight() - 4.5).abs() < 1e-4);
}
// Per-sub-block scale widths.
#[test]
fn test_kquant_type_scale_bits() {
assert_eq!(KQuantType::Q2_K.scale_bits(), 4);
assert_eq!(KQuantType::Q3_K.scale_bits(), 6);
assert_eq!(KQuantType::Q4_K.scale_bits(), 6);
}
// Default config values.
#[test]
fn test_kquant_config_default() {
let config = KQuantConfig::default();
assert_eq!(config.quant_type, KQuantType::Q4_K);
assert!(config.importance_based);
assert!((config.outlier_percentile - 0.99).abs() < 1e-6);
assert_eq!(config.scale_optimization_iters, 10);
}
// Clone preserves config fields.
#[test]
fn test_kquant_config_clone() {
let config = KQuantConfig {
quant_type: KQuantType::Q2_K,
importance_based: false,
outlier_percentile: 0.95,
scale_optimization_iters: 5,
};
let cloned = config.clone();
assert_eq!(cloned.quant_type, KQuantType::Q2_K);
assert!(!cloned.importance_based);
}
// Constructor smoke test.
#[test]
fn test_quantizer_creation() -> Result<()> {
let quantizer = KQuantizer::new(KQuantConfig::default())?;
let _ = quantizer;
Ok(())
}
// One Q3_K super-block: serialized size and shape round-trip.
#[test]
fn test_q3k_quantization() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q3_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..256).map(|i| (i as f32) * 0.01).collect();
let tensor = Tensor::from_vec(data, &[256])?;
let quantized = quantizer.quantize(&tensor)?;
assert_eq!(quantized.quant_type, KQuantType::Q3_K);
assert_eq!(quantized.num_blocks, 1);
assert_eq!(quantized.blocks.len(), 112);
let dequantized = quantizer.dequantize(&quantized)?;
assert_eq!(dequantized.shape(), &[256]);
Ok(())
}
// Block count scales with input length (1024 / 256 = 4 blocks).
#[test]
fn test_q4k_multiple_blocks() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q4_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..1024).map(|i| (i as f32) * 0.001).collect();
let tensor = Tensor::from_vec(data, &[1024])?;
let quantized = quantizer.quantize(&tensor)?;
assert_eq!(quantized.num_blocks, 4); assert_eq!(quantized.blocks.len(), 4 * 144);
Ok(())
}
// Degenerate all-zero input must round-trip to (near-)zero, exercising the
// zero-scale path.
#[test]
fn test_q2k_all_zeros() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q2_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data = vec![0.0f32; 256];
let tensor = Tensor::from_vec(data, &[256])?;
let quantized = quantizer.quantize(&tensor)?;
let dequantized = quantizer.dequantize(&quantized)?;
let deq_data = dequantized.to_vec_f32()?;
for val in &deq_data {
assert!(val.abs() < 1e-3, "Expected ~0.0, got {}", val);
}
Ok(())
}
// Mixed-sign input round-trips through the symmetric Q4_K codes.
#[test]
fn test_q4k_negative_values() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q4_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..256).map(|i| (i as f32 - 128.0) * 0.01).collect();
let tensor = Tensor::from_vec(data, &[256])?;
let quantized = quantizer.quantize(&tensor)?;
let dequantized = quantizer.dequantize(&quantized)?;
assert_eq!(dequantized.shape(), &[256]);
Ok(())
}
// Clone preserves the quantized tensor's metadata and payload size.
#[test]
fn test_kquant_tensor_clone() -> Result<()> {
let config = KQuantConfig::default();
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..256).map(|i| i as f32 * 0.01).collect();
let tensor = Tensor::from_vec(data, &[256])?;
let quantized = quantizer.quantize(&tensor)?;
let cloned = quantized.clone();
assert_eq!(cloned.quant_type, quantized.quant_type);
assert_eq!(cloned.num_blocks, quantized.num_blocks);
assert_eq!(cloned.blocks.len(), quantized.blocks.len());
Ok(())
}
// Shape and true weight count are recorded on the quantized tensor.
#[test]
fn test_kquant_tensor_shape_preserved() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q4_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..512).map(|i| i as f32 * 0.01).collect();
let tensor = Tensor::from_vec(data, &[512])?;
let quantized = quantizer.quantize(&tensor)?;
assert_eq!(quantized.shape, vec![512]);
assert_eq!(quantized.num_weights, 512);
Ok(())
}
// Zero survives the f16 round trip.
#[test]
fn test_f16_zero() {
let f16 = f32_to_f16(0.0);
let recovered = f16_to_f32(f16);
assert!((recovered).abs() < 1e-6);
}
// Small magnitudes stay within absolute tolerance.
#[test]
fn test_f16_small_values() {
let val = 0.001;
let f16 = f32_to_f16(val);
let recovered = f16_to_f32(f16);
assert!((val - recovered).abs() < 0.001);
}
// Near the top of the f16 normal range (max ~65504).
#[test]
fn test_f16_large_values() {
let val = 65000.0;
let f16 = f32_to_f16(val);
let recovered = f16_to_f32(f16);
let rel_error = (val - recovered).abs() / val;
assert!(rel_error < 0.01);
}
// Sign bit survives the round trip.
#[test]
fn test_f16_negative() {
let val = -42.5;
let f16 = f32_to_f16(val);
let recovered = f16_to_f32(f16);
assert!((val - recovered).abs() < 0.1);
}
// Quantized payload must be smaller than 256 f32 weights (1024 bytes).
#[test]
fn test_q2k_compression_ratio() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q2_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..256).map(|i| i as f32 * 0.01).collect();
let tensor = Tensor::from_vec(data, &[256])?;
let quantized = quantizer.quantize(&tensor)?;
assert!(quantized.blocks.len() < 256 * 4);
Ok(())
}
// Q4_K: one super-block is exactly 144 bytes vs 1024 raw bytes.
#[test]
fn test_q4k_compression_ratio() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q4_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..256).map(|i| i as f32 * 0.01).collect();
let tensor = Tensor::from_vec(data, &[256])?;
let quantized = quantizer.quantize(&tensor)?;
assert!(quantized.blocks.len() < 256 * 4);
assert_eq!(quantized.blocks.len(), 144);
Ok(())
}
// 300 weights -> 2 blocks (last zero-padded); num_weights stays 300.
#[test]
fn test_quantize_non_multiple_of_superblock() -> Result<()> {
let config = KQuantConfig {
quant_type: KQuantType::Q4_K,
..Default::default()
};
let quantizer = KQuantizer::new(config)?;
let data: Vec<f32> = (0..300).map(|i| i as f32 * 0.01).collect();
let tensor = Tensor::from_vec(data, &[300])?;
let quantized = quantizer.quantize(&tensor)?;
assert_eq!(quantized.num_blocks, 2); assert_eq!(quantized.num_weights, 300);
Ok(())
}
}