#![allow(dead_code)]
pub mod core;
pub mod ops;
pub mod params;
pub mod tensor;
pub mod types;
pub mod accelerator;
pub mod hardware;
pub mod specialized;
pub mod benchmarks;
pub mod calibration;
pub use core::{dequantize_from_int8, quantize_to_int8};
pub use ops::{CpuQuantizationOps, QuantizationOps};
pub use params::QuantizationParams;
pub use tensor::QuantizedTensor;
pub use types::{QuantizationScheme, QuantizedDType};
pub use hardware::{
QuantizationHardwareFeatures, QuantizationPerformanceHints, QuantizedMemoryLayout,
SimdQuantizationOps,
};
pub use specialized::{
Dp4aQuantizationOps, SpecializedQuantizationOps, TensorCoreFormat, TensorCoreQuantizationOps,
VnniQuantizationOps,
};
pub use accelerator::{
AdvancedQuantizationAccelerator, AutoTuningConfig, BenchmarkResults, OptimalQuantizationConfig,
PerformanceRequirements, QuantizationOperationType, QuantizationRecommendations,
QuantizationWorkload,
};
pub use calibration::{
CalibrationFunction, CalibrationMethod, CalibrationStatistics, PercentileCalibrator,
QuantizationCalibrator,
};
pub use benchmarks::{
BenchmarkConfig, BenchmarkResult, BenchmarkSummary, ComparativeBenchmarkResult,
MemoryBenchmarkResults, MemoryUsage, QuantizationBenchmarkSuite,
};
use crate::{BackendResult, Device};
use std::sync::Arc;
#[cfg(not(feature = "std"))]
use alloc::{boxed::Box, string::String, sync::Arc, vec::Vec};
/// Convenience constructor: builds a fully wired [`QuantizationSystem`]
/// (hardware detection, CPU ops, optional accelerator, calibrator,
/// benchmark suite) for the given `device`.
pub fn create_quantization_system(device: Device) -> BackendResult<QuantizationSystem> {
    QuantizationSystem::new(device)
}
/// Builds quantization parameters for `dtype`, biased toward either accuracy
/// or speed.
///
/// With `accuracy_priority == true`, the well-known presets are used for the
/// common dtypes (int8/uint8/int4); any other dtype falls back to the default
/// parameters with only the dtype overridden.
///
/// With `accuracy_priority == false`, a simple single-scale configuration is
/// returned: asymmetric for the unsigned dtypes (uint8/uint4/uint16),
/// symmetric otherwise.
pub fn create_optimal_params(dtype: QuantizedDType, accuracy_priority: bool) -> QuantizationParams {
    if accuracy_priority {
        match dtype {
            QuantizedDType::Int8 => QuantizationParams::int8_symmetric(),
            QuantizedDType::UInt8 => QuantizationParams::uint8_asymmetric(),
            QuantizedDType::Int4 => QuantizationParams::int4_symmetric(),
            // Struct-update syntax instead of default-then-reassign
            // (clippy::field_reassign_with_default).
            _ => QuantizationParams {
                dtype,
                ..QuantizationParams::default()
            },
        }
    } else {
        // Decide the scheme before moving `dtype` into the struct so no clone
        // is needed (the Mixed variant may carry a non-trivial payload).
        let scheme = if matches!(
            dtype,
            QuantizedDType::UInt8 | QuantizedDType::UInt4 | QuantizedDType::UInt16
        ) {
            QuantizationScheme::Asymmetric
        } else {
            QuantizationScheme::Symmetric
        };
        QuantizationParams {
            dtype,
            scheme,
            scale: vec![1.0],
            zero_point: vec![0],
            block_size: None,
            min_val: None,
            max_val: None,
        }
    }
}
/// Central facade for the quantization subsystem: ties together hardware
/// detection, baseline CPU quantization ops, an optional accelerator,
/// calibration, and benchmarking for a single device.
#[derive(Debug)]
pub struct QuantizationSystem {
    // Device this system was built for; cloned into sub-components on `new`.
    device: Device,
    // Capabilities detected for `device` (int8 SIMD, tensor cores, ...).
    hw_features: QuantizationHardwareFeatures,
    // Baseline CPU implementation; always available as the fallback path.
    base_ops: CpuQuantizationOps,
    // Present only when the hardware reports int8 SIMD or tensor-core support.
    accelerator: Option<AdvancedQuantizationAccelerator>,
    // Collects sample data and derives quantization parameters from it.
    calibrator: QuantizationCalibrator,
    // Runs performance benchmarks over the quantization ops.
    benchmark_suite: QuantizationBenchmarkSuite,
}
impl QuantizationSystem {
pub fn new(device: Device) -> BackendResult<Self> {
let hw_features = QuantizationHardwareFeatures::detect_for_device(&device);
let base_ops = CpuQuantizationOps::new();
let accelerator = if hw_features.supports_int8_simd || hw_features.supports_tensor_cores {
Some(AdvancedQuantizationAccelerator::new(
device.clone(),
Arc::new(base_ops.clone()),
))
} else {
None
};
let calibrator = QuantizationCalibrator::new(CalibrationMethod::Adaptive, device.clone());
let benchmark_suite =
QuantizationBenchmarkSuite::new(device.clone(), BenchmarkConfig::default());
Ok(Self {
device,
hw_features,
base_ops,
accelerator,
calibrator,
benchmark_suite,
})
}
pub fn hardware_features(&self) -> &QuantizationHardwareFeatures {
&self.hw_features
}
pub fn device(&self) -> &Device {
&self.device
}
pub fn has_acceleration(&self) -> bool {
self.accelerator.is_some()
}
pub fn get_recommendations(
&self,
workload: &QuantizationWorkload,
) -> QuantizationRecommendations {
if let Some(ref accelerator) = self.accelerator {
accelerator.get_recommendations(workload)
} else {
QuantizationRecommendations::default()
}
}
pub fn calibrate_from_samples(
&mut self,
samples: Vec<Vec<f32>>,
dtype: QuantizedDType,
method: CalibrationMethod,
) -> BackendResult<QuantizationParams> {
self.calibrator.set_method(method);
self.calibrator.clear_samples();
self.calibrator.add_samples(samples);
self.calibrator.calibrate(dtype)
}
pub fn auto_tune(
&mut self,
workload: &QuantizationWorkload,
) -> BackendResult<OptimalQuantizationConfig> {
if let Some(ref mut accelerator) = self.accelerator {
accelerator.auto_tune(workload)
} else {
Ok(OptimalQuantizationConfig::default())
}
}
pub fn benchmark_operations(&mut self) -> BackendResult<BenchmarkSummary> {
self.benchmark_suite
.benchmark_quantization_ops(&self.base_ops)
}
pub fn quantize_f32(
&self,
input: &[f32],
params: &QuantizationParams,
) -> BackendResult<Vec<u8>> {
if self.should_use_acceleration(¶ms.dtype) {
if let Some(ref _accelerator) = self.accelerator {
return self.base_ops.quantize_f32(input, params);
}
}
self.base_ops.quantize_f32(input, params)
}
pub fn dequantize_f32(
&self,
input: &[u8],
params: &QuantizationParams,
) -> BackendResult<Vec<f32>> {
if self.should_use_acceleration(¶ms.dtype) {
if let Some(ref _accelerator) = self.accelerator {
return self.base_ops.dequantize_f32(input, params);
}
}
self.base_ops.dequantize_f32(input, params)
}
pub fn qmatmul(
&self,
a: &QuantizedTensor,
b: &QuantizedTensor,
) -> BackendResult<QuantizedTensor> {
self.base_ops.qmatmul(a, b)
}
fn should_use_acceleration(&self, dtype: &QuantizedDType) -> bool {
self.hw_features.supports_dtype_efficiently(dtype)
}
pub fn create_quantized_tensor(
&self,
shape: Vec<usize>,
params: QuantizationParams,
) -> QuantizedTensor {
QuantizedTensor::new(shape, params, self.device.clone())
}
pub fn optimal_block_size(&self) -> usize {
self.hw_features.optimal_block_size()
}
pub fn performance_hints(&self) -> QuantizationPerformanceHints {
QuantizationPerformanceHints::for_hardware(&self.hw_features)
}
}
/// One-shot helper: calibrates parameters from `data` (adaptive method),
/// quantizes it, and wraps the result in a [`QuantizedTensor`] with `shape`.
///
/// # Errors
/// Propagates calibration, system-construction, and quantization failures.
///
/// (Fixes a corrupted `&params` token in the original.)
pub fn auto_quantize_tensor(
    data: &[f32],
    shape: Vec<usize>,
    device: Device,
    target_dtype: QuantizedDType,
) -> BackendResult<QuantizedTensor> {
    let mut calibrator = QuantizationCalibrator::new(CalibrationMethod::Adaptive, device.clone());
    calibrator.add_sample(data.to_vec());
    let params = calibrator.calibrate(target_dtype)?;
    let system = QuantizationSystem::new(device)?;
    let quantized_data = system.quantize_f32(data, &params)?;
    Ok(QuantizedTensor {
        data: quantized_data,
        shape,
        params,
        device: system.device.clone(),
    })
}
/// Fraction of memory saved relative to f32 storage for `dtype`
/// (e.g. an 8-bit dtype yields 0.75, a 4-bit dtype 0.875).
pub fn estimate_memory_savings(dtype: &QuantizedDType) -> f64 {
    const FP32_BITS: f64 = 32.0;
    let quantized_bits = dtype.bits() as f64;
    1.0 - quantized_bits / FP32_BITS
}
/// Rough accuracy-retention estimate (1.0 = lossless) for a dtype/scheme pair.
///
/// The base retention is set by the dtype's bit width; the scheme applies a
/// multiplicative factor. Channel-wise and block-wise factors exceed 1.0, so
/// the product is clamped to 1.0.
pub fn estimate_accuracy_impact(dtype: &QuantizedDType, scheme: QuantizationScheme) -> f64 {
    let base_accuracy: f64 = match dtype {
        QuantizedDType::Int16 | QuantizedDType::UInt16 => 0.99,
        QuantizedDType::Int8 | QuantizedDType::UInt8 => 0.95,
        QuantizedDType::Int4 | QuantizedDType::UInt4 => 0.85,
        QuantizedDType::Binary => 0.70,
        QuantizedDType::Mixed(_) => 0.90,
    };
    let scheme_factor: f64 = match scheme {
        QuantizationScheme::Symmetric => 1.0,
        QuantizationScheme::Linear => 0.98,
        QuantizationScheme::Asymmetric => 0.96,
        QuantizationScheme::ChannelWise => 1.02,
        QuantizationScheme::BlockWise => 1.01,
        QuantizationScheme::Logarithmic => 0.90,
    };
    (base_accuracy * scheme_factor).min(1.0)
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_create_quantization_system() {
        let device = Device::cpu().expect("Device should succeed");
        let system = create_quantization_system(device);
        assert!(system.is_ok());
        let system = system.expect("operation should succeed");
        assert!(!system.device().device_type().to_string().is_empty());
    }
    #[test]
    fn test_create_optimal_params() {
        let params_acc = create_optimal_params(QuantizedDType::Int8, true);
        assert_eq!(params_acc.dtype, QuantizedDType::Int8);
        assert_eq!(params_acc.scheme, QuantizationScheme::Symmetric);
        let params_speed = create_optimal_params(QuantizedDType::UInt8, false);
        assert_eq!(params_speed.dtype, QuantizedDType::UInt8);
        assert_eq!(params_speed.scheme, QuantizationScheme::Asymmetric);
    }
    #[test]
    fn test_quantization_system_creation() {
        let device = Device::cpu().expect("Device should succeed");
        let system = QuantizationSystem::new(device);
        assert!(system.is_ok());
        let system = system.expect("operation should succeed");
        assert!(system.hardware_features().max_parallel_ops >= 1);
    }
    #[test]
    fn test_quantization_system_operations() {
        let device = Device::cpu().expect("Device should succeed");
        let system = QuantizationSystem::new(device).expect("Quantization System should succeed");
        let data = vec![1.0, 2.0, 3.0, 4.0];
        let params = QuantizationParams::int8_symmetric();
        // Round-trip: quantize then dequantize should preserve length.
        // (Fixes corrupted `&params` tokens in the original.)
        let quantized = system.quantize_f32(&data, &params);
        assert!(quantized.is_ok());
        let quantized_data = quantized.expect("operation should succeed");
        let dequantized = system.dequantize_f32(&quantized_data, &params);
        assert!(dequantized.is_ok());
        let dequantized_data = dequantized.expect("operation should succeed");
        assert_eq!(dequantized_data.len(), data.len());
    }
    #[test]
    fn test_auto_quantize_tensor() {
        let device = Device::cpu().expect("Device should succeed");
        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
        let shape = vec![2, 3];
        let result = auto_quantize_tensor(&data, shape.clone(), device, QuantizedDType::Int8);
        assert!(result.is_ok());
        let tensor = result.expect("operation should succeed");
        assert_eq!(tensor.shape, shape);
        assert_eq!(tensor.params.dtype, QuantizedDType::Int8);
    }
    #[test]
    fn test_memory_savings_estimation() {
        let savings_int8 = estimate_memory_savings(&QuantizedDType::Int8);
        assert!((savings_int8 - 0.75).abs() < 0.01);
        let savings_int4 = estimate_memory_savings(&QuantizedDType::Int4);
        assert!((savings_int4 - 0.875).abs() < 0.01);
        let savings_binary = estimate_memory_savings(&QuantizedDType::Binary);
        assert!((savings_binary - 0.96875).abs() < 0.01);
    }
    #[test]
    fn test_accuracy_impact_estimation() {
        let accuracy_int8 =
            estimate_accuracy_impact(&QuantizedDType::Int8, QuantizationScheme::Symmetric);
        assert!(accuracy_int8 >= 0.90);
        let accuracy_int4 =
            estimate_accuracy_impact(&QuantizedDType::Int4, QuantizationScheme::Symmetric);
        assert!(accuracy_int4 < accuracy_int8);
        let accuracy_channelwise =
            estimate_accuracy_impact(&QuantizedDType::Int8, QuantizationScheme::ChannelWise);
        let accuracy_linear =
            estimate_accuracy_impact(&QuantizedDType::Int8, QuantizationScheme::Linear);
        assert!(accuracy_channelwise >= accuracy_linear);
    }
    #[test]
    fn test_quantization_system_calibration() {
        let device = Device::cpu().expect("Device should succeed");
        let mut system =
            QuantizationSystem::new(device).expect("Quantization System should succeed");
        let samples = vec![vec![1.0, 2.0, 3.0], vec![4.0, 5.0, 6.0]];
        let result =
            system.calibrate_from_samples(samples, QuantizedDType::Int8, CalibrationMethod::MinMax);
        assert!(result.is_ok());
        let params = result.expect("operation should succeed");
        assert_eq!(params.dtype, QuantizedDType::Int8);
    }
    #[test]
    fn test_quantization_system_benchmarking() {
        let device = Device::cpu().expect("Device should succeed");
        let mut system =
            QuantizationSystem::new(device).expect("Quantization System should succeed");
        let result = system.benchmark_operations();
        assert!(result.is_ok());
        let summary = result.expect("operation should succeed");
        assert!(!summary.results.is_empty());
    }
    #[test]
    fn test_quantization_system_tensor_creation() {
        let device = Device::cpu().expect("Device should succeed");
        let system = QuantizationSystem::new(device).expect("Quantization System should succeed");
        let shape = vec![2, 3, 4];
        let params = QuantizationParams::int8_symmetric();
        let tensor = system.create_quantized_tensor(shape.clone(), params.clone());
        assert_eq!(tensor.shape, shape);
        assert_eq!(tensor.params.dtype, params.dtype);
    }
    #[test]
    fn test_quantization_system_performance_hints() {
        let device = Device::cpu().expect("Device should succeed");
        let system = QuantizationSystem::new(device).expect("Quantization System should succeed");
        let hints = system.performance_hints();
        assert!(!hints.preferred_dtypes.is_empty());
        assert!(!hints.preferred_schemes.is_empty());
        assert!(hints.optimal_batch_size > 0);
    }
}