use lance::table::format::{IndexMetadata, Manifest};
use serde::Serialize;
use crate::DistanceType;
pub struct VectorIndex {
pub columns: Vec<String>,
pub index_name: String,
pub index_uuid: String,
}
impl VectorIndex {
pub fn new_from_format(manifest: &Manifest, index: &IndexMetadata) -> Self {
let fields = index
.fields
.iter()
.map(|field_id| {
manifest
.schema
.field_by_id(*field_id)
.unwrap_or_else(|| {
panic!(
"field {field_id} of index {} must exist in schema",
index.name
)
})
.name
.clone()
})
.collect();
Self {
columns: fields,
index_name: index.name.clone(),
index_uuid: index.uuid.to_string(),
}
}
}
macro_rules! impl_distance_type_setter {
() => {
pub fn distance_type(mut self, distance_type: DistanceType) -> Self {
self.distance_type = distance_type;
self
}
};
}
macro_rules! impl_ivf_params_setter {
() => {
pub fn num_partitions(mut self, num_partitions: u32) -> Self {
self.num_partitions = Some(num_partitions);
self
}
pub fn sample_rate(mut self, sample_rate: u32) -> Self {
self.sample_rate = sample_rate;
self
}
pub fn max_iterations(mut self, max_iterations: u32) -> Self {
self.max_iterations = max_iterations;
self
}
pub fn target_partition_size(mut self, target_partition_size: u32) -> Self {
self.target_partition_size = Some(target_partition_size);
self
}
};
}
macro_rules! impl_pq_params_setter {
() => {
pub fn num_sub_vectors(mut self, num_sub_vectors: u32) -> Self {
self.num_sub_vectors = Some(num_sub_vectors);
self
}
pub fn num_bits(mut self, num_bits: u32) -> Self {
self.num_bits = Some(num_bits);
self
}
};
}
macro_rules! impl_hnsw_params_setter {
() => {
pub fn num_edges(mut self, m: u32) -> Self {
self.m = m;
self
}
pub fn ef_construction(mut self, ef_construction: u32) -> Self {
self.ef_construction = ef_construction;
self
}
};
}
#[derive(Debug, Clone, Serialize)]
pub struct IvfFlatIndexBuilder {
#[serde(rename = "metric_type")]
pub(crate) distance_type: DistanceType,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_partitions: Option<u32>,
pub(crate) sample_rate: u32,
pub(crate) max_iterations: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) target_partition_size: Option<u32>,
}
impl Default for IvfFlatIndexBuilder {
fn default() -> Self {
Self {
distance_type: DistanceType::L2,
num_partitions: None,
sample_rate: 256,
max_iterations: 50,
target_partition_size: None,
}
}
}
impl IvfFlatIndexBuilder {
impl_distance_type_setter!();
impl_ivf_params_setter!();
}
#[derive(Debug, Clone, Serialize)]
pub struct IvfSqIndexBuilder {
#[serde(rename = "metric_type")]
pub(crate) distance_type: DistanceType,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_partitions: Option<u32>,
pub(crate) sample_rate: u32,
pub(crate) max_iterations: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) target_partition_size: Option<u32>,
}
impl Default for IvfSqIndexBuilder {
fn default() -> Self {
Self {
distance_type: DistanceType::L2,
num_partitions: None,
sample_rate: 256,
max_iterations: 50,
target_partition_size: None,
}
}
}
impl IvfSqIndexBuilder {
impl_distance_type_setter!();
impl_ivf_params_setter!();
}
#[derive(Debug, Clone, Serialize)]
pub struct IvfPqIndexBuilder {
#[serde(rename = "metric_type")]
pub(crate) distance_type: DistanceType,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_partitions: Option<u32>,
pub(crate) sample_rate: u32,
pub(crate) max_iterations: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) target_partition_size: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_sub_vectors: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_bits: Option<u32>,
}
impl Default for IvfPqIndexBuilder {
fn default() -> Self {
Self {
distance_type: DistanceType::L2,
num_partitions: None,
num_sub_vectors: None,
num_bits: None,
sample_rate: 256,
max_iterations: 50,
target_partition_size: None,
}
}
}
impl IvfPqIndexBuilder {
impl_distance_type_setter!();
impl_ivf_params_setter!();
impl_pq_params_setter!();
}
pub(crate) fn suggested_num_sub_vectors(dim: u32) -> u32 {
if dim.is_multiple_of(16) {
dim / 16
} else if dim.is_multiple_of(8) {
dim / 8
} else {
log::warn!(
"The dimension of the vector is not divisible by 8 or 16, \
which may cause performance degradation in PQ"
);
1
}
}
#[derive(Debug, Clone, Serialize)]
pub struct IvfRqIndexBuilder {
#[serde(rename = "metric_type")]
pub(crate) distance_type: DistanceType,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_partitions: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_bits: Option<u32>,
pub(crate) sample_rate: u32,
pub(crate) max_iterations: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) target_partition_size: Option<u32>,
}
impl Default for IvfRqIndexBuilder {
fn default() -> Self {
Self {
distance_type: DistanceType::L2,
num_partitions: None,
num_bits: None,
sample_rate: 256,
max_iterations: 50,
target_partition_size: None,
}
}
}
impl IvfRqIndexBuilder {
impl_distance_type_setter!();
impl_ivf_params_setter!();
pub fn num_bits(mut self, num_bits: u32) -> Self {
self.num_bits = Some(num_bits);
self
}
}
#[derive(Debug, Clone, Serialize)]
pub struct IvfHnswPqIndexBuilder {
#[serde(rename = "metric_type")]
pub(crate) distance_type: DistanceType,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_partitions: Option<u32>,
pub(crate) sample_rate: u32,
pub(crate) max_iterations: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) target_partition_size: Option<u32>,
pub(crate) m: u32,
pub(crate) ef_construction: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_sub_vectors: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_bits: Option<u32>,
}
impl Default for IvfHnswPqIndexBuilder {
fn default() -> Self {
Self {
distance_type: DistanceType::L2,
num_partitions: None,
num_sub_vectors: None,
num_bits: None,
sample_rate: 256,
max_iterations: 50,
m: 20,
ef_construction: 300,
target_partition_size: None,
}
}
}
impl IvfHnswPqIndexBuilder {
impl_distance_type_setter!();
impl_ivf_params_setter!();
impl_hnsw_params_setter!();
impl_pq_params_setter!();
}
#[derive(Debug, Clone, Serialize)]
pub struct IvfHnswSqIndexBuilder {
#[serde(rename = "metric_type")]
pub(crate) distance_type: DistanceType,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) num_partitions: Option<u32>,
pub(crate) sample_rate: u32,
pub(crate) max_iterations: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) target_partition_size: Option<u32>,
pub(crate) m: u32,
pub(crate) ef_construction: u32,
}
impl Default for IvfHnswSqIndexBuilder {
fn default() -> Self {
Self {
distance_type: DistanceType::L2,
num_partitions: None,
sample_rate: 256,
max_iterations: 50,
m: 20,
ef_construction: 300,
target_partition_size: None,
}
}
}
impl IvfHnswSqIndexBuilder {
impl_distance_type_setter!();
impl_ivf_params_setter!();
impl_hnsw_params_setter!();
}