mod builder;
pub(crate) mod row_vertex;
mod search;
use super::{
graph::{Vertex, VertexSerDe},
MetricType,
};
pub(crate) use builder::build_diskann_index;
use lance_index::vector::pq::PQBuildParams;
pub(crate) use row_vertex::RowVertex;
pub(crate) use search::DiskANNIndex;
#[derive(Clone, Debug)]
pub struct DiskANNParams {
pub r: usize,
pub alpha: f32,
pub l: usize,
pub pq_params: PQBuildParams,
pub metric_type: MetricType,
}
impl Default for DiskANNParams {
fn default() -> Self {
Self {
r: 50,
alpha: 1.2,
l: 70,
pq_params: PQBuildParams::default(),
metric_type: MetricType::L2,
}
}
}
impl DiskANNParams {
pub fn new(r: usize, alpha: f32, l: usize) -> Self {
Self {
r,
alpha,
l,
pq_params: PQBuildParams::default(),
metric_type: MetricType::L2,
}
}
pub fn r(&mut self, r: usize) -> &mut Self {
self.r = r;
self
}
pub fn alpha(&mut self, alpha: f32) -> &mut Self {
self.alpha = alpha;
self
}
pub fn l(&mut self, l: usize) -> &mut Self {
self.l = l;
self
}
pub fn pq_num_bits(&mut self, nbits: usize) -> &mut Self {
self.pq_params.num_bits = nbits;
self
}
pub fn pq_num_sub_vectors(&mut self, m: usize) -> &mut Self {
self.pq_params.num_sub_vectors = m;
self
}
pub fn use_opq(&mut self, use_opq: bool) -> &mut Self {
self.pq_params.use_opq = use_opq;
self
}
pub fn metric_type(&mut self, metric_type: MetricType) -> &mut Self {
self.metric_type = metric_type;
self
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchIterator};
use arrow_schema::{DataType, Field, Schema as ArrowSchema};
use lance_testing::datagen::generate_random_array;
use tempfile::tempdir;
use super::*;
use crate::{
arrow::*,
dataset::Dataset,
index::{
DatasetIndexExt,
{vector::VectorIndexParams, IndexType},
},
};
#[tokio::test]
async fn test_create_index() {
let test_dir = tempdir().unwrap();
let dimension = 16;
let schema = Arc::new(ArrowSchema::new(vec![Field::new(
"embeddings",
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Float32, true)),
dimension,
),
false,
)]));
let float_arr = generate_random_array(512 * dimension as usize);
let vectors =
Arc::new(FixedSizeListArray::try_new_from_values(float_arr, dimension).unwrap());
let batches = vec![RecordBatch::try_new(schema.clone(), vec![vectors.clone()]).unwrap()];
let test_uri = test_dir.path().to_str().unwrap();
let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap();
let params =
VectorIndexParams::with_diskann_params(MetricType::L2, DiskANNParams::default());
dataset
.create_index(&["embeddings"], IndexType::Vector, None, ¶ms, false)
.await
.unwrap();
let indices = dataset.load_indices().await.unwrap();
let actual = indices.first().unwrap().dataset_version;
let expected = dataset.manifest.version - 1;
assert_eq!(actual, expected);
}
}