/// Multiplier applied to an `f32` component before it is rounded to an `i8`.
const QUANTIZATION_SCALE: f32 = 127.0;
/// Multiplier applied to an `i8` component to map it back to `f32`; the
/// reciprocal of [`QUANTIZATION_SCALE`].
const DEQUANTIZATION_SCALE: f32 = 1.0 / QUANTIZATION_SCALE;
/// Quantizes an `f32` embedding into signed 8-bit components.
///
/// Every component is multiplied by [`QUANTIZATION_SCALE`], rounded to the
/// nearest integer (ties away from zero) and limited to `-127..=127`, so the
/// output range is symmetric and `i8::MIN` is never produced. Components
/// outside `[-1.0, 1.0]` therefore saturate, and `NaN` components become `0`.
///
/// The returned vector has the same length as `embedding`.
pub fn quantize_embedding(embedding: &[f32]) -> Vec<i8> {
    let mut quantized = Vec::with_capacity(embedding.len());
    for &component in embedding {
        let level = (component * QUANTIZATION_SCALE).round();
        // `clamp` passes NaN through unchanged, and the float-to-int `as`
        // cast then turns NaN into 0.
        quantized.push(level.clamp(-127.0, 127.0) as i8);
    }
    quantized
}
/// Maps signed 8-bit components back to `f32`.
///
/// Each component is converted to `f32` and multiplied by
/// [`DEQUANTIZATION_SCALE`]. The returned vector has the same length as
/// `quantized`.
pub fn dequantize_embedding(quantized: &[i8]) -> Vec<f32> {
    let mut restored = Vec::with_capacity(quantized.len());
    for &level in quantized {
        // `i8` -> `f32` is lossless, so `From` expresses it without a cast.
        restored.push(f32::from(level) * DEQUANTIZATION_SCALE);
    }
    restored
}
/// Computes the cosine similarity of two quantized embeddings.
///
/// The dot product and both squared norms are accumulated exactly in `i64`
/// and the final division is carried out in `f64`, so long vectors neither
/// overflow nor lose precision before the square root is taken. The result
/// is clamped to `[-1.0, 1.0]` to absorb floating-point rounding.
///
/// Returns `0.0` when the inputs are empty or when either vector is all
/// zeros (the similarity is undefined in those cases).
///
/// # Panics
///
/// Panics if `a` and `b` have different lengths.
pub fn cosine_similarity_quantized(a: &[i8], b: &[i8]) -> f32 {
    assert_eq!(a.len(), b.len(), "Embeddings must have the same dimension");

    // Each product is at most 128 * 128 = 16_384, so an `i32` accumulator
    // overflows once a vector exceeds roughly 131k full-scale components;
    // `i64` leaves ample headroom for any slice length.
    let (dot, norm_a_sq, norm_b_sq) = a.iter().zip(b).fold(
        (0_i64, 0_i64, 0_i64),
        |(dot, norm_a_sq, norm_b_sq), (&x, &y)| {
            let (x, y) = (i64::from(x), i64::from(y));
            (dot + x * y, norm_a_sq + x * x, norm_b_sq + y * y)
        },
    );

    // Covers empty inputs as well: their norms are zero.
    if norm_a_sq == 0 || norm_b_sq == 0 {
        return 0.0;
    }

    // The product is formed in `f64` because it can exceed the `i64` range.
    let denominator = (norm_a_sq as f64 * norm_b_sq as f64).sqrt();
    (dot as f64 / denominator).clamp(-1.0, 1.0) as f32
}
/// Returns the memory saved by storing `i8_bytes` instead of `f32_bytes`,
/// as a percentage of `f32_bytes`.
///
/// The result is `0.0` when `f32_bytes` is zero, and negative when
/// `i8_bytes` is larger than `f32_bytes` (the "quantized" form costs more).
pub fn quantization_memory_savings(f32_bytes: usize, i8_bytes: usize) -> f64 {
    if f32_bytes == 0 {
        return 0.0;
    }
    // Subtract in whichever direction cannot underflow `usize`, then restore
    // the sign, so a larger `i8_bytes` yields a negative saving instead of a
    // panic (debug) or a wrapped-around value (release).
    let fraction = if i8_bytes <= f32_bytes {
        (f32_bytes - i8_bytes) as f64 / f32_bytes as f64
    } else {
        -((i8_bytes - f32_bytes) as f64 / f32_bytes as f64)
    };
    fraction * 100.0
}
/// Measures how far `restored` deviates from `original`, component by
/// component.
///
/// Reports the largest absolute difference, the mean absolute difference and
/// the root-mean-square difference. For empty inputs all three statistics
/// are `0.0`.
///
/// # Panics
///
/// Panics if `original` and `restored` have different lengths.
pub fn quantization_error(original: &[f32], restored: &[f32]) -> QuantizationError {
    assert_eq!(
        original.len(),
        restored.len(),
        "Vectors must have same length"
    );

    // Without this guard the averages below would be 0.0 / 0.0 = NaN.
    if original.is_empty() {
        return QuantizationError {
            max_error: 0.0,
            mean_error: 0.0,
            rmse: 0.0,
        };
    }

    let mut max_error = 0.0_f32;
    let mut sum_error = 0.0_f32;
    let mut sum_squared_error = 0.0_f32;
    for (&orig, &rest) in original.iter().zip(restored) {
        let error = (orig - rest).abs();
        max_error = max_error.max(error);
        sum_error += error;
        sum_squared_error += error * error;
    }

    let n = original.len() as f32;
    QuantizationError {
        max_error,
        mean_error: sum_error / n,
        rmse: (sum_squared_error / n).sqrt(),
    }
}
/// Error statistics produced by `quantization_error` when comparing an
/// original vector with its restored counterpart.
#[derive(Debug, Clone, Copy)]
pub struct QuantizationError {
/// Largest absolute per-component difference.
pub max_error: f32,
/// Mean of the absolute per-component differences.
pub mean_error: f32,
/// Root-mean-square of the per-component differences.
pub rmse: f32,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Quantizes `values` and immediately maps the result back to `f32`.
    fn roundtrip(values: &[f32]) -> Vec<f32> {
        dequantize_embedding(&quantize_embedding(values))
    }

    /// Quantizes both inputs and returns their quantized cosine similarity.
    fn quantized_similarity(a: &[f32], b: &[f32]) -> f32 {
        cosine_similarity_quantized(&quantize_embedding(a), &quantize_embedding(b))
    }

    #[test]
    fn test_quantize_basic() {
        let codes = quantize_embedding(&[0.5, -0.5, 1.0, -1.0, 0.0]);
        assert_eq!(codes, vec![64, -64, 127, -127, 0]);
    }

    #[test]
    fn test_quantize_clamps_out_of_range() {
        let codes = quantize_embedding(&[2.0, -2.0, 1.5, -1.5]);
        assert_eq!(codes, vec![127, -127, 127, -127]);
    }

    #[test]
    fn test_dequantize() {
        let values = dequantize_embedding(&[64, -64, 127, -127, 0]);
        let expected: [f32; 5] = [0.504, -0.504, 1.0, -1.0, 0.0];
        assert_eq!(values.len(), expected.len());
        for (value, target) in values.iter().zip(expected.iter()) {
            assert!((value - target).abs() < 0.01);
        }
    }

    #[test]
    fn test_roundtrip_error() {
        let original: Vec<f32> = vec![0.1, 0.2, 0.3, 0.4, 0.5, -0.1, -0.2, -0.3];
        let restored = roundtrip(&original);
        for (before, after) in original.iter().zip(&restored) {
            let error = (before - after).abs();
            assert!(error < 0.01, "Error too large: {}", error);
        }
    }

    #[test]
    fn test_cosine_similarity_quantized() {
        let vector = [0.5, -0.3, 0.8];
        let similarity = quantized_similarity(&vector, &vector);
        assert!((similarity - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let similarity = quantized_similarity(&[1.0, 0.0], &[0.0, 1.0]);
        assert!(similarity.abs() < 0.01);
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let similarity = quantized_similarity(&[1.0, 0.0], &[-1.0, 0.0]);
        assert!((similarity + 1.0).abs() < 0.01);
    }

    #[test]
    #[should_panic(expected = "Embeddings must have the same dimension")]
    fn test_cosine_similarity_different_dimensions() {
        cosine_similarity_quantized(&[1, 2, 3], &[1, 2]);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let nothing: [i8; 0] = [];
        assert_eq!(cosine_similarity_quantized(&nothing, &nothing), 0.0);
    }

    #[test]
    fn test_cosine_similarity_zero_vector() {
        let similarity = cosine_similarity_quantized(&[0, 0, 0], &[1, 2, 3]);
        assert_eq!(similarity, 0.0);
    }

    #[test]
    fn test_memory_savings() {
        // 100_000 embeddings of 1536 components each.
        let components: usize = 1536 * 100_000;
        let savings = quantization_memory_savings(components * 4, components);
        assert_eq!(savings, 75.0);
    }

    #[test]
    fn test_quantization_error_stats() {
        let original = [0.1, 0.2, 0.3, 0.4, 0.5];
        let stats = quantization_error(&original, &roundtrip(&original));
        assert!(stats.max_error < 0.01);
        assert!(stats.mean_error < 0.01);
        assert!(stats.rmse < 0.01);
    }

    #[test]
    fn test_quantization_error_perfect() {
        // Values that sit exactly on quantization levels.
        let original = [0.0, 1.0 / 127.0, -1.0 / 127.0];
        let stats = quantization_error(&original, &roundtrip(&original));
        assert!(stats.max_error < 0.0001);
        assert!(stats.mean_error < 0.0001);
        assert!(stats.rmse < 0.0001);
    }

    #[test]
    fn test_quantize_normalized_embedding() {
        let raw: [f32; 10] = [
            0.12, -0.34, 0.56, 0.78, -0.23, 0.45, -0.67, 0.89, -0.12, 0.34,
        ];
        let norm = raw.iter().map(|x| x * x).sum::<f32>().sqrt();
        let normalized: Vec<f32> = raw.iter().map(|x| x / norm).collect();
        let stats = quantization_error(&normalized, &roundtrip(&normalized));
        assert!(stats.max_error < 0.02);
        assert!(stats.mean_error < 0.01);
    }

    #[test]
    fn test_large_embedding() {
        const DIM: usize = 1536;
        // Linear ramp from -1.0 towards 1.0.
        let embedding: Vec<f32> = (0..DIM)
            .map(|i| (i as f32 / DIM as f32) * 2.0 - 1.0)
            .collect();

        let quantized = quantize_embedding(&embedding);
        assert_eq!(quantized.len(), DIM);

        let restored = dequantize_embedding(&quantized);
        assert_eq!(restored.len(), DIM);

        let savings = quantization_memory_savings(DIM * 4, DIM);
        assert_eq!(savings, 75.0);
    }
}