use std::collections::HashMap;
use vecstore::store::quantization::{PQConfig, ProductQuantizer};
use vecstore::{Metadata, Query, VecStore};
/// Checks the three field values produced by `PQConfig::default()`.
#[test]
fn test_pq_config_default() {
    // Destructure once so each field is compared by name.
    let PQConfig {
        num_subvectors,
        num_centroids,
        training_iterations,
    } = PQConfig::default();

    assert_eq!(num_subvectors, 16);
    assert_eq!(num_centroids, 256);
    assert_eq!(training_iterations, 20);
}
/// A hand-built `PQConfig` reports the subvector and centroid counts it was
/// constructed with.
#[test]
fn test_pq_config_custom() {
    let custom = PQConfig {
        training_iterations: 10,
        num_centroids: 256,
        num_subvectors: 8,
    };

    assert_eq!(custom.num_subvectors, 8);
    assert_eq!(custom.num_centroids, 256);
}
/// Constructing a quantizer for dimension 128 with 8 subvectors must succeed.
#[test]
fn test_pq_creation_valid_dimension() {
    let outcome = ProductQuantizer::new(
        128,
        PQConfig {
            num_subvectors: 8,
            num_centroids: 256,
            training_iterations: 10,
        },
    );

    assert!(outcome.is_ok(), "Should create PQ with valid dimensions");
}
/// Constructing a quantizer for dimension 100 with 8 subvectors must fail.
///
/// NOTE(review): presumably rejected because 100 is not a multiple of 8 —
/// confirm against `ProductQuantizer::new`.
#[test]
fn test_pq_creation_invalid_dimension() {
    let outcome = ProductQuantizer::new(
        100,
        PQConfig {
            num_subvectors: 8,
            num_centroids: 256,
            training_iterations: 10,
        },
    );

    assert!(outcome.is_err(), "Should fail with invalid dimensions");
}
/// Training an 8-dimensional quantizer (4 subvectors, 16 centroids) on 300
/// synthetic rows must succeed.
#[test]
fn test_pq_train_basic() {
    let mut quantizer = ProductQuantizer::new(
        8,
        PQConfig {
            num_subvectors: 4,
            num_centroids: 16,
            training_iterations: 5,
        },
    )
    .unwrap();

    // Row `i` is `[1*i, 2*i, ..., 8*i]`, each component scaled by 0.1.
    let samples: Vec<Vec<f32>> = (0..300)
        .map(|i| (1..=8).map(|k| (i * k) as f32 * 0.1).collect())
        .collect();

    let outcome = quantizer.train(&samples);
    assert!(outcome.is_ok(), "Training should succeed");
}
/// Training on an empty sample set must be reported as an error.
#[test]
fn test_pq_train_empty_vectors() {
    let mut quantizer = ProductQuantizer::new(128, PQConfig::default()).unwrap();
    let no_samples: Vec<Vec<f32>> = Vec::new();

    let outcome = quantizer.train(&no_samples);
    assert!(outcome.is_err(), "Should fail with empty training set");
}
/// Trains with far fewer samples (10) than requested centroids (256).
///
/// This test does not pin whether `train` accepts or rejects such a set; it
/// only requires that the call does not panic and that a quantizer which
/// *reports* successful training is actually usable afterwards.
#[test]
fn test_pq_train_insufficient_vectors() {
    let config = PQConfig {
        num_subvectors: 4,
        num_centroids: 256,
        training_iterations: 5,
    };
    let mut pq = ProductQuantizer::new(8, config).unwrap();
    let training_vectors: Vec<Vec<f32>> = (0..10).map(|i| vec![i as f32; 8]).collect();

    // The previous `is_ok() || is_err()` assertion was a tautology and could
    // never fail. Both outcomes remain acceptable, but on success the
    // quantizer must be able to encode a vector of the right dimension.
    if pq.train(&training_vectors).is_ok() {
        let probe = vec![3.0; 8];
        assert!(
            pq.encode(&probe).is_ok(),
            "Quantizer that reported successful training should encode"
        );
    }
}
/// Encoding with a freshly constructed (never trained) quantizer must fail.
#[test]
fn test_pq_encode_before_training() {
    let untrained = ProductQuantizer::new(128, PQConfig::default()).unwrap();
    let probe = vec![0.1; 128];

    let outcome = untrained.encode(&probe);
    assert!(outcome.is_err(), "Should fail to encode before training");
}
/// After training, encoding an 8-dimensional vector succeeds and yields one
/// code per configured subvector (4).
#[test]
fn test_pq_encode_after_training() {
    let mut quantizer = ProductQuantizer::new(
        8,
        PQConfig {
            num_subvectors: 4,
            num_centroids: 16,
            training_iterations: 5,
        },
    )
    .unwrap();

    // 50 constant-valued rows: 0.0, 0.1, 0.2, ...
    let mut samples: Vec<Vec<f32>> = Vec::with_capacity(50);
    for step in 0..50 {
        samples.push(vec![step as f32 * 0.1; 8]);
    }
    quantizer.train(&samples).unwrap();

    let probe = vec![1.0; 8];
    let encoded = quantizer.encode(&probe);
    assert!(encoded.is_ok(), "Should encode after training");
    assert_eq!(
        encoded.unwrap().len(),
        4,
        "Should have code for each subvector"
    );
}
/// A trained 8-dimensional quantizer must reject a 12-dimensional input.
#[test]
fn test_pq_encode_wrong_dimension() {
    let mut quantizer = ProductQuantizer::new(
        8,
        PQConfig {
            num_subvectors: 4,
            num_centroids: 16,
            training_iterations: 5,
        },
    )
    .unwrap();

    let samples: Vec<Vec<f32>> = (0..50).map(|step| vec![step as f32 * 0.1; 8]).collect();
    quantizer.train(&samples).unwrap();

    // Four components more than the quantizer was built for.
    let oversized = vec![1.0; 12];
    let outcome = quantizer.encode(&oversized);
    assert!(outcome.is_err(), "Should fail with wrong dimension");
}
/// Decoding the codes of an 8-dimensional vector yields 8 components again.
#[test]
fn test_pq_decode_basic() {
    let mut quantizer = ProductQuantizer::new(
        8,
        PQConfig {
            num_subvectors: 4,
            num_centroids: 16,
            training_iterations: 5,
        },
    )
    .unwrap();

    let samples: Vec<Vec<f32>> = (0..50).map(|step| vec![step as f32 * 0.1; 8]).collect();
    quantizer.train(&samples).unwrap();

    let input = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let encoded = quantizer.encode(&input).unwrap();
    let reconstructed = quantizer.decode(&encoded).unwrap();

    assert_eq!(reconstructed.len(), 8, "Decoded should have same dimension");
}
/// Round-trips a vector through encode/decode and requires every component of
/// the reconstruction to lie within 5.0 of the input.
#[test]
fn test_pq_encode_decode_preservation() {
    let mut quantizer = ProductQuantizer::new(
        64,
        PQConfig {
            num_subvectors: 8,
            num_centroids: 256,
            training_iterations: 10,
        },
    )
    .unwrap();

    // 300 rows of sine samples; row `i` is shifted by `i` steps of 0.1.
    let samples: Vec<Vec<f32>> = (0..300)
        .map(|i| (0..64).map(|j| ((i + j) as f32 * 0.1).sin()).collect())
        .collect();
    quantizer.train(&samples).unwrap();

    let original = vec![1.0; 64];
    let encoded = quantizer.encode(&original).unwrap();
    let decoded = quantizer.decode(&encoded).unwrap();

    // A NaN difference compares false and therefore fails the check.
    let within_tolerance = original
        .iter()
        .zip(decoded.iter())
        .all(|(o, d)| (o - d).abs() < 5.0);
    assert!(within_tolerance, "Decoded value should be reasonably close");
}
/// The encoded form of a 128-dimensional `f32` vector must occupy fewer bytes
/// than the raw vector.
#[test]
fn test_pq_compression_ratio() {
    let config = PQConfig {
        num_subvectors: 16,
        num_centroids: 256,
        training_iterations: 10,
    };
    let dimension = 128;
    let mut pq = ProductQuantizer::new(dimension, config).unwrap();

    let training_vectors: Vec<Vec<f32>> =
        (0..300).map(|i| vec![i as f32 * 0.01; dimension]).collect();
    pq.train(&training_vectors).unwrap();

    let vector = vec![1.0; dimension];
    let codes = pq.encode(&vector).unwrap();

    let original_size = dimension * std::mem::size_of::<f32>();
    // Measure the code buffer that was actually returned instead of assuming
    // a fixed `u16` width per code, so the comparison stays correct whatever
    // element type `encode` produces.
    let compressed_size = std::mem::size_of_val(&codes[..]);

    assert!(
        original_size > compressed_size,
        "Compressed should be smaller than original"
    );
}
/// Encodes a target vector with a trained quantizer and checks the shape of
/// the resulting codes.
///
/// NOTE(review): despite the name, no distance is computed here — this file
/// shows no distance method on `ProductQuantizer`. Once the intended API is
/// confirmed, extend this test to compare a raw query against the codes.
#[test]
fn test_pq_asymmetric_distance() {
    let config = PQConfig {
        num_subvectors: 4,
        num_centroids: 16,
        training_iterations: 5,
    };
    let mut pq = ProductQuantizer::new(8, config).unwrap();

    let training_vectors: Vec<Vec<f32>> = (0..50).map(|i| vec![i as f32 * 0.1; 8]).collect();
    pq.train(&training_vectors).unwrap();

    // The previously unused `query` local and the `_codes` rebind were dead
    // code; assert something observable about the encoding instead.
    let target = vec![1.1; 8];
    let codes = pq.encode(&target).unwrap();
    assert_eq!(codes.len(), 4, "Should have code for each subvector");
}
/// Encoding the same vector twice must produce non-empty, identical codes, so
/// two identical inputs are indistinguishable after quantization.
#[test]
fn test_pq_identical_vectors_zero_distance() {
    let config = PQConfig {
        num_subvectors: 4,
        num_centroids: 16,
        training_iterations: 5,
    };
    let mut pq = ProductQuantizer::new(8, config).unwrap();

    let training_vectors: Vec<Vec<f32>> = (0..50).map(|i| vec![i as f32 * 0.1; 8]).collect();
    pq.train(&training_vectors).unwrap();

    let vector = vec![1.0; 8];
    let codes = pq.encode(&vector).unwrap();
    assert!(!codes.is_empty(), "Should have encoded codes");

    // The original test never compared two encodings of the same input,
    // which is what its name promises.
    let codes_again = pq.encode(&vector).unwrap();
    assert_eq!(
        codes, codes_again,
        "Identical vectors should encode identically"
    );
}
/// Training must succeed for 4, 8, 16 and 32 subvectors on 64-dimensional
/// data.
#[test]
fn test_pq_different_subvector_counts() {
    let dimension = 64;
    // The training set does not depend on the subvector count, so build it once.
    let samples: Vec<Vec<f32>> = (0..50)
        .map(|step| vec![step as f32 * 0.1; dimension])
        .collect();

    for num_subvectors in [4, 8, 16, 32] {
        let mut quantizer = ProductQuantizer::new(
            dimension,
            PQConfig {
                num_subvectors,
                num_centroids: 16,
                training_iterations: 5,
            },
        )
        .unwrap();

        let outcome = quantizer.train(&samples);
        assert!(
            outcome.is_ok(),
            "Should train with {} subvectors",
            num_subvectors
        );
    }
}
/// Training must succeed for 16, 64 and 256 centroids on 64-dimensional data.
#[test]
fn test_pq_different_centroid_counts() {
    // The training set does not depend on the centroid count, so build it once.
    let samples: Vec<Vec<f32>> = (0..300).map(|step| vec![step as f32 * 0.1; 64]).collect();

    for num_centroids in [16, 64, 256] {
        let mut quantizer = ProductQuantizer::new(
            64,
            PQConfig {
                num_subvectors: 8,
                num_centroids,
                training_iterations: 5,
            },
        )
        .unwrap();

        let outcome = quantizer.train(&samples);
        assert!(
            outcome.is_ok(),
            "Should train with {} centroids",
            num_centroids
        );
    }
}
/// Three consecutive encodings of the same vector by one trained quantizer
/// must all be equal.
#[test]
fn test_pq_deterministic_encoding() {
    let mut quantizer = ProductQuantizer::new(
        8,
        PQConfig {
            num_subvectors: 4,
            num_centroids: 16,
            training_iterations: 5,
        },
    )
    .unwrap();

    let samples: Vec<Vec<f32>> = (0..50).map(|step| vec![step as f32 * 0.1; 8]).collect();
    quantizer.train(&samples).unwrap();

    let probe = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let first = quantizer.encode(&probe).unwrap();
    let second = quantizer.encode(&probe).unwrap();
    let third = quantizer.encode(&probe).unwrap();

    assert_eq!(first, second, "Encoding should be deterministic");
    assert_eq!(second, third, "Encoding should be deterministic");
}
/// A trained quantizer can be serialized with `bincode` and the resulting
/// bytes deserialize back into a `ProductQuantizer`.
#[test]
fn test_pq_serialization() {
    let mut quantizer = ProductQuantizer::new(
        8,
        PQConfig {
            num_subvectors: 4,
            num_centroids: 16,
            training_iterations: 5,
        },
    )
    .unwrap();

    let samples: Vec<Vec<f32>> = (0..50).map(|step| vec![step as f32 * 0.1; 8]).collect();
    quantizer.train(&samples).unwrap();

    let encoded = bincode::serialize(&quantizer);
    assert!(encoded.is_ok(), "Should serialize PQ");
    let bytes = encoded.unwrap();

    let restored: Result<ProductQuantizer, _> = bincode::deserialize(&bytes);
    assert!(restored.is_ok(), "Should deserialize PQ");
}
/// Opens a `VecStore` in a temporary directory, upserts 50 documents with
/// 128-dimensional vectors, and builds a `PQConfig`.
///
/// NOTE(review): the config is never handed to the store, so this currently
/// only verifies that opening and upserting do not fail.
#[test]
fn test_pq_with_vecstore() {
    let scratch_dir = tempfile::tempdir().unwrap();
    let mut store = VecStore::open(scratch_dir.path()).unwrap();

    let empty_meta = Metadata {
        fields: HashMap::new(),
    };

    for doc in 0..50 {
        // Component `k` of document `doc` is `(doc + k) * 0.01`.
        let embedding: Vec<f32> = (0..128)
            .map(|offset| (doc + offset) as f32 * 0.01)
            .collect();
        let id = format!("doc{}", doc);
        store.upsert(id, embedding, empty_meta.clone()).unwrap();
    }

    let pq_config = PQConfig {
        num_subvectors: 16,
        num_centroids: 256,
        training_iterations: 10,
    };

    // Mark both values as intentionally unused beyond this point.
    let _ = store;
    let _ = pq_config;
}
/// Compares the top-10 nearest neighbours of a query computed on raw vectors
/// with the top-10 computed on PQ reconstructions, and requires at least half
/// of them to coincide.
///
/// The approximate distance is the Euclidean distance between the raw query
/// and the decoded codes of each candidate. The earlier version summed the
/// raw code indices, which is not a distance and ranks candidates arbitrarily.
///
/// NOTE(review): the test stays `#[ignore]`d as in the original; consider
/// enabling it once it has been run against the real quantizer.
#[test]
#[ignore]
fn test_pq_search_accuracy() {
    /// Sorts `(index, distance)` pairs by ascending distance and returns the
    /// indices of the first `k` entries.
    fn nearest_indices(mut scored: Vec<(usize, f32)>, k: usize) -> Vec<usize> {
        scored.sort_by(|a, b| a.1.partial_cmp(&b.1).expect("distances must not be NaN"));
        scored.into_iter().take(k).map(|(idx, _)| idx).collect()
    }

    let config = PQConfig {
        num_subvectors: 8,
        num_centroids: 256,
        training_iterations: 15,
    };
    let dimension = 64;
    let mut pq = ProductQuantizer::new(dimension, config).unwrap();

    // Row `i` samples a sine wave shifted by `i` steps of 0.1.
    let sine_row = |i: usize| -> Vec<f32> {
        (0..dimension)
            .map(|j| ((i + j) as f32 * 0.1).sin())
            .collect()
    };

    let training_vectors: Vec<Vec<f32>> = (0..300).map(sine_row).collect();
    pq.train(&training_vectors).unwrap();

    let test_vectors: Vec<Vec<f32>> = (200..250).map(sine_row).collect();
    let query = vec![0.5; dimension];

    // Ground truth: exact distances on the raw vectors.
    let exact_distances: Vec<(usize, f32)> = test_vectors
        .iter()
        .enumerate()
        .map(|(i, v)| (i, euclidean_distance(&query, v)))
        .collect();

    // Approximation: distance from the raw query to each reconstruction.
    let approx_distances: Vec<(usize, f32)> = test_vectors
        .iter()
        .enumerate()
        .map(|(i, v)| {
            let codes = pq.encode(v).unwrap();
            let reconstructed = pq.decode(&codes).unwrap();
            (i, euclidean_distance(&query, &reconstructed))
        })
        .collect();

    let top_k = 10;
    let original_top = nearest_indices(exact_distances, top_k);
    let quantized_top = nearest_indices(approx_distances, top_k);

    let overlap = original_top
        .iter()
        .filter(|i| quantized_top.contains(i))
        .count();
    assert!(
        overlap >= top_k / 2,
        "PQ search should maintain reasonable accuracy"
    );
}
/// Every one of 20 vectors must encode successfully with a trained quantizer.
#[test]
fn test_pq_batch_encoding() {
    let mut quantizer = ProductQuantizer::new(
        8,
        PQConfig {
            num_subvectors: 4,
            num_centroids: 16,
            training_iterations: 5,
        },
    )
    .unwrap();

    let samples: Vec<Vec<f32>> = (0..50).map(|step| vec![step as f32 * 0.1; 8]).collect();
    quantizer.train(&samples).unwrap();

    let batch: Vec<Vec<f32>> = (0..20).map(|step| vec![step as f32 * 0.2; 8]).collect();
    batch.iter().for_each(|item| {
        let outcome = quantizer.encode(item);
        assert!(outcome.is_ok(), "Should encode batch vectors");
    });
}
/// Training and encoding must both succeed for 1536-dimensional vectors split
/// into 32 subvectors.
#[test]
fn test_pq_high_dimensional_vectors() {
    let dimension = 1536;
    let mut quantizer = ProductQuantizer::new(
        dimension,
        PQConfig {
            num_subvectors: 32,
            num_centroids: 256,
            training_iterations: 10,
        },
    )
    .unwrap();

    // 300 slowly varying sine rows; row `i` is shifted by `i` steps of 0.001.
    let samples: Vec<Vec<f32>> = (0..300)
        .map(|i| {
            (0..dimension)
                .map(|j| ((i + j) as f32 * 0.001).sin())
                .collect()
        })
        .collect();

    let trained = quantizer.train(&samples);
    assert!(trained.is_ok(), "Should handle high-dimensional vectors");

    let probe = vec![0.5; dimension];
    let encoded = quantizer.encode(&probe);
    assert!(encoded.is_ok(), "Should encode high-dimensional vector");
}
/// Returns the Euclidean (L2) distance between `a` and `b`.
///
/// Components are paired positionally; if the slices differ in length, the
/// surplus elements of the longer one are ignored.
fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
    let squared_total: f32 = a
        .iter()
        .zip(b)
        .map(|(&lhs, &rhs)| {
            let delta = lhs - rhs;
            delta.powi(2)
        })
        .sum();
    squared_total.sqrt()
}