use lancedb::Table;
use lancedb::{query::VectorQuery, DistanceType};
/// Kind of vector index selected by `VectorOptimizer::calculate_index_params`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndexType {
    /// Selected when quantization is not requested; logged as `IVF_HNSW_SQ`.
    IvfHnswSq,
    /// Selected when quantization is requested; logged as `IVF_RQ`.
    IvfRq,
}
/// Index-build settings produced by `VectorOptimizer::calculate_index_params`.
#[derive(Debug, Clone)]
pub struct VectorIndexParams {
    /// `false` when no index should be built (the calculator returns this
    /// for datasets below 1000 rows).
    pub should_create_index: bool,
    /// Index family to build.
    pub index_type: IndexType,
    /// Number of partitions; the calculator emits `0` when no index should
    /// be created and a value in `2..=1024` otherwise.
    pub num_partitions: u32,
    /// Presumably the HNSW graph degree (TODO confirm); the calculator
    /// always emits `20`.
    pub num_edges: u32,
    /// Presumably the HNSW build-time candidate list size (TODO confirm);
    /// the calculator always emits `300`.
    pub ef_construction: u32,
    /// Distance metric for the index; the calculator always emits
    /// `DistanceType::Cosine`.
    pub distance_type: DistanceType,
}
/// Query-time parameters returned by `VectorOptimizer::calculate_search_params`.
#[derive(Debug, Clone)]
pub struct SearchParams {
    /// Optional `ef` value; `calculate_search_params` currently always
    /// returns `None` here.
    pub ef: Option<u32>,
}
/// Stateless namespace type: every operation is an associated function that
/// takes no `self`, so the unit struct carries no data.
pub struct VectorOptimizer;
impl VectorOptimizer {
/// Logs whether an index covering exactly the `embedding` column exists and
/// returns `query` unchanged.
///
/// The row count and index list are fetched solely to build the debug
/// message; both lookups are awaited on every call.
///
/// # Errors
///
/// Propagates any error returned by `Table::count_rows` or
/// `Table::list_indices`.
pub async fn optimize_query(
    query: VectorQuery,
    table: &Table,
    table_name: &str,
) -> Result<VectorQuery, lancedb::Error> {
    let total_rows = table.count_rows(None).await?;
    // An index counts only if its column list is exactly `["embedding"]`.
    let embedding_indexed = table
        .list_indices()
        .await?
        .iter()
        .any(|index| index.columns == ["embedding"]);

    if !embedding_indexed {
        tracing::debug!(
            "Vector search on {} with {} rows, no index (brute force)",
            table_name,
            total_rows
        );
    } else {
        tracing::debug!(
            "Vector search on {} with {} rows, index exists (LanceDB auto-tunes HNSW)",
            table_name,
            total_rows
        );
    }

    Ok(query)
}
/// Derives index-build parameters from the dataset size.
///
/// * Fewer than 1000 rows: no index is recommended
///   (`should_create_index == false`, `num_partitions == 0`).
/// * 1000 up to (excluding) 1,048,576 rows: the partition count is
///   `floor(sqrt(row_count))`.
/// * 1,048,576 rows and above: the partition count is
///   `row_count / 1_048_576`.
///
/// When an index is recommended the partition count is limited to
/// `2..=1024`. `num_edges`, `ef_construction` and `distance_type` are fixed
/// at `20`, `300` and cosine distance.
///
/// `use_quantization` selects `IndexType::IvfRq` when `true` and
/// `IndexType::IvfHnswSq` otherwise. `_vector_dimension` is currently unused.
pub fn calculate_index_params(
    row_count: usize,
    _vector_dimension: usize,
    use_quantization: bool,
) -> VectorIndexParams {
    // Chosen once; both the "skip" and the "build" result report it.
    let index_type = if use_quantization {
        IndexType::IvfRq
    } else {
        IndexType::IvfHnswSq
    };

    if row_count < 1000 {
        tracing::debug!(
            "Dataset size {} is small, skipping index creation (brute force will be faster)",
            row_count
        );
        return VectorIndexParams {
            should_create_index: false,
            index_type,
            num_partitions: 0,
            num_edges: 20,
            ef_construction: 300,
            distance_type: DistanceType::Cosine,
        };
    }

    let raw_partitions = if row_count >= 1_048_576 {
        row_count / 1_048_576
    } else {
        // `row_count` is in 1000..1_048_576 here, so the root is in 31..=1023.
        (row_count as f64).sqrt() as usize
    };
    // Clamp while still in `usize`: narrowing first would wrap quotients of
    // 2^32 or more and could turn a huge dataset into the minimum partition
    // count. After the clamp the value is at most 1024, so the cast is lossless.
    let num_partitions = raw_partitions.clamp(2, 1024) as u32;

    let index_type_name = match index_type {
        IndexType::IvfHnswSq => "IVF_HNSW_SQ",
        IndexType::IvfRq => "IVF_RQ",
    };
    tracing::debug!(
        "Calculated {} index params for {} rows: partitions={}, num_edges=20, ef_construction=300",
        index_type_name,
        row_count,
        num_partitions
    );

    VectorIndexParams {
        should_create_index: true,
        index_type,
        num_partitions,
        num_edges: 20,
        ef_construction: 300,
        distance_type: DistanceType::Cosine,
    }
}
/// Returns query-time parameters.
///
/// Both arguments are currently ignored and the resulting `ef` is always
/// `None`.
pub fn calculate_search_params(_num_partitions: u32, _row_count: usize) -> SearchParams {
    let ef = None;
    SearchParams { ef }
}
/// Reports whether an existing index should be rebuilt because its partition
/// count has drifted too far from the optimal one.
///
/// Returns `false` whenever `optimal.should_create_index` is `false`.
/// Otherwise returns `true` when `|current_partitions - optimal.num_partitions|`
/// exceeds 50% of `optimal.num_partitions`.
pub fn should_recreate_index(current_partitions: u32, optimal: &VectorIndexParams) -> bool {
    if optimal.should_create_index {
        let target = optimal.num_partitions as f32;
        // Deviation relative to the optimal count, not the current one.
        let relative_drift = (current_partitions as f32 - target).abs() / target;
        relative_drift > 0.5
    } else {
        false
    }
}
/// Reports whether the table has grown enough since it was indexed to
/// warrant re-indexing.
///
/// Returns `true` only when `current_rows` exceeds `indexed_rows` by more
/// than 50% of `indexed_rows`. Returns `false` when `indexed_rows` is `0`
/// and when the table has not grown or has shrunk.
pub fn needs_reindex(current_rows: usize, indexed_rows: usize) -> bool {
    match indexed_rows {
        // No baseline to compare against.
        0 => false,
        baseline => {
            let baseline = baseline as f64;
            let relative_growth = (current_rows as f64 - baseline) / baseline;
            relative_growth > 0.5
        }
    }
}
/// Placeholder for a growth-based optimisation decision.
///
/// Currently always returns `false`: the result is the same whether or not
/// `check_growth` is set, and `_current_rows` / `_vector_dim` are unused.
pub fn should_optimize_for_growth(
    _current_rows: usize,
    _vector_dim: usize,
    check_growth: bool,
) -> bool {
    // No growth heuristic exists yet, so the flag does not influence the answer.
    let _growth_check_requested = check_growth;
    false
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Vector dimension passed to the calculator in every test.
    const DIM: usize = 768;

    /// Runs the calculator with quantization enabled.
    fn quantized(rows: usize) -> VectorIndexParams {
        VectorOptimizer::calculate_index_params(rows, DIM, true)
    }

    #[test]
    fn test_small_dataset_no_index() {
        assert!(!quantized(500).should_create_index);
    }

    #[test]
    fn test_medium_dataset_creates_index() {
        let VectorIndexParams {
            should_create_index,
            num_partitions,
            ..
        } = quantized(5000);
        assert!(should_create_index);
        assert!(num_partitions >= 2);
    }

    #[test]
    fn test_large_dataset_more_partitions() {
        let fewer = quantized(5000).num_partitions;
        let more = quantized(50000).num_partitions;
        assert!(more > fewer);
    }

    #[test]
    fn test_very_large_dataset_formula() {
        // 2_000_000 / 1_048_576 == 1, which is raised to the minimum of 2.
        let outcome = quantized(2_000_000);
        assert!(outcome.should_create_index);
        assert_eq!(outcome.num_partitions, 2);
    }

    #[test]
    fn test_minimum_partitions() {
        assert!(quantized(1000).num_partitions >= 2);
    }

    #[test]
    fn test_should_recreate_index() {
        let optimal = VectorIndexParams {
            should_create_index: true,
            index_type: IndexType::IvfHnswSq,
            num_partitions: 100,
            num_edges: 20,
            ef_construction: 300,
            distance_type: DistanceType::Cosine,
        };
        // 80 vs 100 deviates by 20%; 10 vs 100 deviates by 90%.
        let mild_drift = VectorOptimizer::should_recreate_index(80, &optimal);
        let severe_drift = VectorOptimizer::should_recreate_index(10, &optimal);
        assert!(!mild_drift);
        assert!(severe_drift);
    }

    #[test]
    fn test_quantization_false_uses_ivf_hnsw_sq() {
        let outcome = VectorOptimizer::calculate_index_params(5000, DIM, false);
        assert!(outcome.should_create_index);
        assert_eq!(outcome.index_type, IndexType::IvfHnswSq);
    }

    #[test]
    fn test_quantization_true_uses_ivf_rq() {
        let outcome = quantized(5000);
        assert!(outcome.should_create_index);
        assert_eq!(outcome.index_type, IndexType::IvfRq);
    }

    #[test]
    fn test_needs_reindex() {
        // (current rows, indexed rows, expected verdict)
        let cases = [
            (1500, 1000, false),
            (2000, 1000, true),
            (1000, 1000, false),
        ];
        for &(current, indexed, expected) in &cases {
            assert!(VectorOptimizer::needs_reindex(current, indexed) == expected);
        }
    }
}