use std::collections::HashMap;
use std::sync::{Arc, OnceLock, RwLock};
use std::time::Instant;
#[derive(Debug, Clone)]
pub struct LazyConfig {
pub dim: usize,
pub enable_bps: bool,
pub enable_rdf: bool,
pub enable_graph: bool,
pub bps_projections: usize,
pub rdf_sample_dims: usize,
pub graph_m: usize,
pub background_prewarm: bool,
}
impl Default for LazyConfig {
fn default() -> Self {
Self {
dim: 768,
enable_bps: true,
enable_rdf: true,
enable_graph: false, bps_projections: 64,
rdf_sample_dims: 32,
graph_m: 16,
background_prewarm: false,
}
}
}
pub type VectorKey = u64;
struct BpsIndex {
projections: Vec<Vec<f32>>,
codes: Vec<u64>,
build_time_ns: u64,
}
impl BpsIndex {
fn build(vectors: &[f32], dim: usize, num_projections: usize) -> Self {
let start = Instant::now();
let num_vectors = vectors.len() / dim;
let projections: Vec<Vec<f32>> = (0..num_projections)
.map(|i| {
(0..dim)
.map(|j| {
let seed = (i * 1000 + j) as f32;
(seed.sin() * 100.0) % 2.0 - 1.0
})
.collect()
})
.collect();
let codes: Vec<u64> = (0..num_vectors)
.map(|i| {
let vec_start = i * dim;
let vector = &vectors[vec_start..vec_start + dim];
let mut code = 0u64;
for (p, proj) in projections.iter().enumerate().take(64) {
let dot: f32 = vector.iter().zip(proj.iter()).map(|(a, b)| a * b).sum();
if dot >= 0.0 {
code |= 1 << p;
}
}
code
})
.collect();
Self {
projections,
codes,
build_time_ns: start.elapsed().as_nanos() as u64,
}
}
fn query(&self, vector: &[f32]) -> u64 {
let mut code = 0u64;
for (p, proj) in self.projections.iter().enumerate().take(64) {
let dot: f32 = vector.iter().zip(proj.iter()).map(|(a, b)| a * b).sum();
if dot >= 0.0 {
code |= 1 << p;
}
}
code
}
fn hamming_distance(a: u64, b: u64) -> u32 {
(a ^ b).count_ones()
}
}
struct RdfIndex {
sample_dims: Vec<usize>,
quantized: Vec<Vec<u8>>,
build_time_ns: u64,
}
impl RdfIndex {
fn build(vectors: &[f32], dim: usize, num_sample_dims: usize) -> Self {
let start = Instant::now();
let num_vectors = vectors.len() / dim;
let sample_dims: Vec<usize> = (0..num_sample_dims)
.map(|i| (i * 7919) % dim) .collect();
let quantized: Vec<Vec<u8>> = (0..num_vectors)
.map(|i| {
let vec_start = i * dim;
sample_dims
.iter()
.map(|&d| {
let val = vectors[vec_start + d];
((val.clamp(-1.0, 1.0) + 1.0) * 127.5) as u8
})
.collect()
})
.collect();
Self {
sample_dims,
quantized,
build_time_ns: start.elapsed().as_nanos() as u64,
}
}
fn query(&self, vector: &[f32]) -> Vec<u8> {
self.sample_dims
.iter()
.map(|&d| {
let val = vector.get(d).copied().unwrap_or(0.0);
((val.clamp(-1.0, 1.0) + 1.0) * 127.5) as u8
})
.collect()
}
fn l1_distance(a: &[u8], b: &[u8]) -> u32 {
a.iter()
.zip(b.iter())
.map(|(&x, &y)| (x as i32 - y as i32).unsigned_abs())
.sum()
}
}
struct MiniGraph {
#[allow(dead_code)]
neighbors: Vec<Vec<u32>>,
#[allow(dead_code)]
entry_point: u32,
build_time_ns: u64,
}
impl MiniGraph {
fn build(vectors: &[f32], dim: usize, m: usize) -> Self {
let start = Instant::now();
let num_vectors = vectors.len() / dim;
if num_vectors == 0 {
return Self {
neighbors: Vec::new(),
entry_point: 0,
build_time_ns: 0,
};
}
let mut neighbors: Vec<Vec<u32>> = vec![Vec::with_capacity(m); num_vectors];
for i in 1..num_vectors {
let vec_i = &vectors[i * dim..(i + 1) * dim];
let mut distances: Vec<(u32, f32)> = (0..i)
.map(|j| {
let vec_j = &vectors[j * dim..(j + 1) * dim];
let dist = l2_squared(vec_i, vec_j);
(j as u32, dist)
})
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
for &(neighbor, _) in distances.iter().take(m) {
neighbors[i].push(neighbor);
if neighbors[neighbor as usize].len() < m {
neighbors[neighbor as usize].push(i as u32);
}
}
}
Self {
neighbors,
entry_point: 0,
build_time_ns: start.elapsed().as_nanos() as u64,
}
}
}
struct LazyIndexes {
bps: OnceLock<BpsIndex>,
rdf: OnceLock<RdfIndex>,
graph: OnceLock<MiniGraph>,
}
impl LazyIndexes {
fn new() -> Self {
Self {
bps: OnceLock::new(),
rdf: OnceLock::new(),
graph: OnceLock::new(),
}
}
}
pub struct LazySegment {
config: LazyConfig,
data: Vec<f32>,
key_to_index: HashMap<VectorKey, u32>,
index_to_key: Vec<VectorKey>,
indexes: LazyIndexes,
#[allow(dead_code)]
stats: SegmentStats,
}
impl LazySegment {
pub fn new(vectors: Vec<(VectorKey, Vec<f32>)>, config: LazyConfig) -> Self {
let dim = config.dim;
let num_vectors = vectors.len();
let mut data = Vec::with_capacity(num_vectors * dim);
let mut key_to_index = HashMap::with_capacity(num_vectors);
let mut index_to_key = Vec::with_capacity(num_vectors);
for (i, (key, vector)) in vectors.into_iter().enumerate() {
data.extend_from_slice(&vector);
key_to_index.insert(key, i as u32);
index_to_key.push(key);
}
Self {
config,
data,
key_to_index,
index_to_key,
indexes: LazyIndexes::new(),
stats: SegmentStats::default(),
}
}
pub fn from_flat(flat_data: Vec<f32>, keys: Vec<VectorKey>, config: LazyConfig) -> Self {
let key_to_index: HashMap<_, _> = keys
.iter()
.enumerate()
.map(|(i, &k)| (k, i as u32))
.collect();
Self {
config,
data: flat_data,
key_to_index,
index_to_key: keys,
indexes: LazyIndexes::new(),
stats: SegmentStats::default(),
}
}
pub fn len(&self) -> usize {
self.index_to_key.len()
}
pub fn is_empty(&self) -> bool {
self.index_to_key.is_empty()
}
pub fn get(&self, key: VectorKey) -> Option<&[f32]> {
self.key_to_index.get(&key).map(|&idx| {
let start = idx as usize * self.config.dim;
&self.data[start..start + self.config.dim]
})
}
pub fn get_by_index(&self, index: u32) -> Option<&[f32]> {
if (index as usize) < self.index_to_key.len() {
let start = index as usize * self.config.dim;
Some(&self.data[start..start + self.config.dim])
} else {
None
}
}
pub fn search(&self, query: &[f32], k: usize) -> Vec<(VectorKey, f32)> {
if self.is_empty() {
return Vec::new();
}
let num_vectors = self.len();
let _dim = self.config.dim;
if num_vectors <= 100 {
return self.brute_force_search(query, k);
}
let candidates: Vec<u32> = if self.config.enable_bps {
self.bps_candidates(query, k * 10)
} else if self.config.enable_rdf {
self.rdf_candidates(query, k * 10)
} else {
(0..num_vectors as u32).collect()
};
let mut results: Vec<(VectorKey, f32)> = candidates
.into_iter()
.filter_map(|idx| {
let vec = self.get_by_index(idx)?;
let dist = l2_squared(query, vec);
let key = self.index_to_key[idx as usize];
Some((key, dist))
})
.collect();
results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
results.truncate(k);
results
}
fn brute_force_search(&self, query: &[f32], k: usize) -> Vec<(VectorKey, f32)> {
let dim = self.config.dim;
let mut results: Vec<(VectorKey, f32)> = self
.index_to_key
.iter()
.enumerate()
.map(|(i, &key)| {
let start = i * dim;
let vec = &self.data[start..start + dim];
let dist = l2_squared(query, vec);
(key, dist)
})
.collect();
results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
results.truncate(k);
results
}
fn bps_candidates(&self, query: &[f32], limit: usize) -> Vec<u32> {
let bps = self.indexes.bps.get_or_init(|| {
BpsIndex::build(&self.data, self.config.dim, self.config.bps_projections)
});
let query_code = bps.query(query);
let mut scored: Vec<(u32, u32)> = bps
.codes
.iter()
.enumerate()
.map(|(i, &code)| (i as u32, BpsIndex::hamming_distance(query_code, code)))
.collect();
scored.sort_by_key(|&(_, dist)| dist);
scored.into_iter().take(limit).map(|(idx, _)| idx).collect()
}
fn rdf_candidates(&self, query: &[f32], limit: usize) -> Vec<u32> {
let rdf = self.indexes.rdf.get_or_init(|| {
RdfIndex::build(&self.data, self.config.dim, self.config.rdf_sample_dims)
});
let query_quantized = rdf.query(query);
let mut scored: Vec<(u32, u32)> = rdf
.quantized
.iter()
.enumerate()
.map(|(i, quantized)| (i as u32, RdfIndex::l1_distance(&query_quantized, quantized)))
.collect();
scored.sort_by_key(|&(_, dist)| dist);
scored.into_iter().take(limit).map(|(idx, _)| idx).collect()
}
pub fn prewarm(&self) {
if self.config.enable_bps {
let _ = self.indexes.bps.get_or_init(|| {
BpsIndex::build(&self.data, self.config.dim, self.config.bps_projections)
});
}
if self.config.enable_rdf {
let _ = self.indexes.rdf.get_or_init(|| {
RdfIndex::build(&self.data, self.config.dim, self.config.rdf_sample_dims)
});
}
if self.config.enable_graph {
let _ = self
.indexes
.graph
.get_or_init(|| MiniGraph::build(&self.data, self.config.dim, self.config.graph_m));
}
}
pub fn indexes_built(&self) -> IndexStatus {
IndexStatus {
bps_built: self.indexes.bps.get().is_some(),
rdf_built: self.indexes.rdf.get().is_some(),
graph_built: self.indexes.graph.get().is_some(),
}
}
pub fn build_stats(&self) -> BuildStats {
BuildStats {
bps_time_ns: self.indexes.bps.get().map(|b| b.build_time_ns),
rdf_time_ns: self.indexes.rdf.get().map(|r| r.build_time_ns),
graph_time_ns: self.indexes.graph.get().map(|g| g.build_time_ns),
}
}
}
#[derive(Debug, Clone)]
pub struct IndexStatus {
pub bps_built: bool,
pub rdf_built: bool,
pub graph_built: bool,
}
#[derive(Debug, Clone)]
pub struct BuildStats {
pub bps_time_ns: Option<u64>,
pub rdf_time_ns: Option<u64>,
pub graph_time_ns: Option<u64>,
}
#[derive(Debug, Clone, Default)]
struct SegmentStats {
#[allow(dead_code)]
queries: u64,
#[allow(dead_code)]
index_builds: u64,
}
fn l2_squared(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(x, y)| {
let d = x - y;
d * d
})
.sum()
}
pub struct LazySegmentManager {
segments: RwLock<Vec<Arc<LazySegment>>>,
#[allow(dead_code)]
config: LazyConfig,
}
impl LazySegmentManager {
pub fn new(config: LazyConfig) -> Self {
Self {
segments: RwLock::new(Vec::new()),
config,
}
}
pub fn add_segment(&self, segment: LazySegment) {
let segment = Arc::new(segment);
let mut segments = self.segments.write().unwrap();
segments.push(segment);
}
pub fn search(&self, query: &[f32], k: usize) -> Vec<(VectorKey, f32)> {
let segments = self.segments.read().unwrap();
let mut results: Vec<(VectorKey, f32)> =
segments.iter().flat_map(|s| s.search(query, k)).collect();
results.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
results.truncate(k);
results
}
pub fn prewarm_all(&self) {
let segments = self.segments.read().unwrap();
for segment in segments.iter() {
segment.prewarm();
}
}
pub fn num_segments(&self) -> usize {
self.segments.read().unwrap().len()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn random_vectors(n: usize, dim: usize, seed: u64) -> Vec<(VectorKey, Vec<f32>)> {
(0..n)
.map(|i| {
let key = (seed * 1000 + i as u64) as VectorKey;
let vector: Vec<f32> = (0..dim)
.map(|j| ((i * dim + j) as f32 * 0.001).sin())
.collect();
(key, vector)
})
.collect()
}
#[test]
fn test_lazy_segment_basic() {
let config = LazyConfig {
dim: 4,
enable_bps: false,
enable_rdf: false,
..Default::default()
};
let vectors = vec![
(1, vec![1.0, 0.0, 0.0, 0.0]),
(2, vec![0.0, 1.0, 0.0, 0.0]),
(3, vec![0.0, 0.0, 1.0, 0.0]),
];
let segment = LazySegment::new(vectors, config);
assert_eq!(segment.len(), 3);
assert_eq!(segment.get(1).unwrap(), &[1.0, 0.0, 0.0, 0.0]);
}
#[test]
fn test_lazy_segment_search() {
let config = LazyConfig {
dim: 4,
enable_bps: false,
enable_rdf: false,
..Default::default()
};
let vectors = vec![
(1, vec![1.0, 0.0, 0.0, 0.0]),
(2, vec![0.0, 1.0, 0.0, 0.0]),
(3, vec![0.5, 0.5, 0.0, 0.0]),
];
let segment = LazySegment::new(vectors, config);
let query = vec![1.0, 0.0, 0.0, 0.0];
let results = segment.search(&query, 2);
assert_eq!(results.len(), 2);
assert_eq!(results[0].0, 1); assert!(results[0].1 < 0.01);
}
#[test]
fn test_lazy_index_build() {
let config = LazyConfig {
dim: 8,
enable_bps: true,
enable_rdf: true,
enable_graph: false,
..Default::default()
};
let vectors = random_vectors(200, 8, 42);
let segment = LazySegment::new(vectors, config);
let status = segment.indexes_built();
assert!(!status.bps_built);
assert!(!status.rdf_built);
let query = vec![0.1; 8];
let _ = segment.search(&query, 5);
let status = segment.indexes_built();
assert!(status.bps_built);
}
#[test]
fn test_prewarm() {
let config = LazyConfig {
dim: 8,
enable_bps: true,
enable_rdf: true,
enable_graph: true,
..Default::default()
};
let vectors = random_vectors(50, 8, 42);
let segment = LazySegment::new(vectors, config);
segment.prewarm();
let status = segment.indexes_built();
assert!(status.bps_built);
assert!(status.rdf_built);
assert!(status.graph_built);
let stats = segment.build_stats();
assert!(stats.bps_time_ns.is_some());
assert!(stats.rdf_time_ns.is_some());
assert!(stats.graph_time_ns.is_some());
}
#[test]
fn test_segment_manager() {
let config = LazyConfig {
dim: 4,
enable_bps: false,
enable_rdf: false,
..Default::default()
};
let manager = LazySegmentManager::new(config.clone());
let seg1 = LazySegment::new(vec![(1, vec![1.0, 0.0, 0.0, 0.0])], config.clone());
let seg2 = LazySegment::new(vec![(2, vec![0.0, 1.0, 0.0, 0.0])], config.clone());
manager.add_segment(seg1);
manager.add_segment(seg2);
assert_eq!(manager.num_segments(), 2);
let query = vec![0.5, 0.5, 0.0, 0.0];
let results = manager.search(&query, 2);
assert_eq!(results.len(), 2);
}
#[test]
fn test_bps_index() {
let dim = 8;
let vectors: Vec<f32> = (0..100 * dim).map(|i| (i as f32 * 0.01).sin()).collect();
let bps = BpsIndex::build(&vectors, dim, 32);
assert_eq!(bps.codes.len(), 100);
assert!(bps.build_time_ns > 0);
let first_vec = vectors[0..dim].to_vec();
let code = bps.query(&first_vec);
assert_eq!(
code, bps.codes[0],
"query of a stored vector must match its build-time code"
);
assert_eq!(code, bps.query(&first_vec), "query must be deterministic");
assert_eq!(
code & !((1u64 << 32) - 1),
0,
"no bits beyond num_projections"
);
assert_eq!(BpsIndex::hamming_distance(code, code), 0);
}
#[test]
fn test_rdf_index() {
let dim = 8;
let vectors: Vec<f32> = (0..100 * dim).map(|i| (i as f32 * 0.01).sin()).collect();
let rdf = RdfIndex::build(&vectors, dim, 4);
assert_eq!(rdf.quantized.len(), 100);
assert_eq!(rdf.quantized[0].len(), 4);
}
}