use crate::directories::RamDirectory;
use crate::dsl::{Document, SchemaBuilder};
use crate::index::{Index, IndexConfig, IndexWriter};
#[tokio::test]
async fn test_vector_index_threshold_switch() {
    use crate::dsl::{DenseVectorConfig, DenseVectorQuantization, VectorIndexType};

    // An IVF-RaBitQ field with `build_threshold: Some(50)`:
    //  * below 50 committed vectors no centroids are trained and searches
    //    fall back to a flat scan,
    //  * once the doc count crosses the threshold, `build_vector_index`
    //    trains centroids and they are visible after reopening the index.
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", true, true);
    let embedding = schema_builder.add_dense_vector_field_with_config(
        "embedding",
        true,
        true,
        DenseVectorConfig {
            dim: 8,
            index_type: VectorIndexType::IvfRaBitQ,
            quantization: DenseVectorQuantization::F32,
            num_clusters: Some(4),
            nprobe: 2,
            build_threshold: Some(50),
            unit_norm: false,
        },
    );
    let schema = schema_builder.build();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // 30 docs < threshold (50): the index must stay untrained.
    for i in 0..30 {
        let mut doc = Document::new();
        doc.add_text(title, format!("Document {}", i));
        let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 30.0).collect();
        doc.add_dense_vector(embedding, vec);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    let index = Index::open(dir.clone(), config.clone()).await.unwrap();
    assert!(
        index.segment_manager.trained().is_none(),
        "Should not have trained centroids below threshold"
    );

    // Flat (brute-force) search must still work below the threshold.
    let query_vec: Vec<f32> = vec![0.5; 8];
    let segments = index.segment_readers().await.unwrap();
    assert!(!segments.is_empty());
    let results = segments[0]
        .search_dense_vector(
            embedding,
            &query_vec,
            5,
            0,
            1.0,
            crate::query::MultiValueCombiner::Max,
        )
        .await
        .unwrap();
    assert!(!results.is_empty(), "Flat search should return results");

    // Add 30 more docs (total 60 > threshold) and build the vector index.
    let mut writer = IndexWriter::open(dir.clone(), config.clone())
        .await
        .unwrap();
    for i in 30..60 {
        let mut doc = Document::new();
        doc.add_text(title, format!("Document {}", i));
        let vec: Vec<f32> = (0..8).map(|_| (i as f32) / 60.0).collect();
        doc.add_dense_vector(embedding, vec);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    writer.build_vector_index().await.unwrap();

    let index = Index::open(dir.clone(), config.clone()).await.unwrap();
    assert!(
        index.segment_manager.trained().is_some(),
        "Should have loaded trained centroids for embedding field"
    );
    let segments = index.segment_readers().await.unwrap();
    // Guard the indexing below: a missing segment would otherwise fail
    // with an opaque out-of-bounds panic instead of a clear assertion.
    assert!(!segments.is_empty());
    let results = segments[0]
        .search_dense_vector(
            embedding,
            &query_vec,
            5,
            0,
            1.0,
            crate::query::MultiValueCombiner::Max,
        )
        .await
        .unwrap();
    assert!(
        !results.is_empty(),
        "Search should return results after build"
    );

    // Re-running build on an already-trained index must keep it trained.
    let writer = IndexWriter::open(dir.clone(), config.clone())
        .await
        .unwrap();
    writer.build_vector_index().await.unwrap();
    assert!(writer.segment_manager.trained().is_some());
}
#[tokio::test]
async fn test_needle_sparse_vector() {
    use crate::query::SparseVectorQuery;

    // Schema: a stored text field plus an indexed sparse-vector field.
    let mut builder = SchemaBuilder::default();
    let title = builder.add_text_field("title", true, true);
    let sparse = builder.add_sparse_vector_field("sparse", true, true);
    let schema = builder.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // 100 hay docs whose sparse entries live only in dimensions 0..10.
    for i in 0..100 {
        let entries: Vec<(u32, f32)> = (0..10)
            .map(|d| (d, 0.1 + (i as f32 * 0.001) + (d as f32 * 0.01)))
            .collect();
        let mut doc = Document::new();
        doc.add_text(title, format!("Hay sparse doc {}", i));
        doc.add_sparse_vector(sparse, entries);
        writer.add_document(doc).unwrap();
    }

    // The needle is the only document touching dims 1000..=1002; it also
    // shares dim 5 with the hay.
    let mut needle = Document::new();
    needle.add_text(title, "Needle sparse document");
    needle.add_sparse_vector(
        sparse,
        vec![(1000, 0.9), (1001, 0.8), (1002, 0.7), (5, 0.3)],
    );
    writer.add_document(needle).unwrap();

    // Another 50 hay docs, again confined to dims 0..10.
    for i in 100..150 {
        let entries: Vec<(u32, f32)> = (0..10).map(|d| (d, 0.2 + (d as f32 * 0.02))).collect();
        let mut doc = Document::new();
        doc.add_text(title, format!("More hay sparse doc {}", i));
        doc.add_sparse_vector(sparse, entries);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    let index = Index::open(dir.clone(), config.clone()).await.unwrap();
    assert_eq!(index.num_docs().await.unwrap(), 151);
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();

    // Querying the needle-only dimensions must match exactly one document.
    let query = SparseVectorQuery::new(sparse, vec![(1000, 1.0), (1001, 1.0), (1002, 1.0)]);
    let results = searcher.search(&query, 10).await.unwrap();
    assert_eq!(results.len(), 1, "Only needle has dims 1000-1002");
    assert!(results[0].score > 0.0, "Needle score should be positive");
    let doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    let title_val = doc.get_first(title).unwrap().as_text().unwrap();
    assert_eq!(title_val, "Needle sparse document");

    // Dimension 5 is shared across the hay batches (and the needle).
    let query_shared = SparseVectorQuery::new(sparse, vec![(5, 1.0)]);
    let results = searcher.search(&query_shared, 200).await.unwrap();
    assert!(
        results.len() >= 100,
        "Shared dim 5 should match many docs, got {}",
        results.len()
    );

    // A dimension nobody indexed matches nothing.
    let query_missing = SparseVectorQuery::new(sparse, vec![(99999, 1.0)]);
    let results = searcher.search(&query_missing, 10).await.unwrap();
    assert_eq!(
        results.len(),
        0,
        "Non-existent dimension should match nothing"
    );
}
#[tokio::test]
async fn test_needle_sparse_vector_multi_segment_merge() {
    use crate::query::SparseVectorQuery;

    let mut builder = SchemaBuilder::default();
    let title = builder.add_text_field("title", true, true);
    let sparse = builder.add_sparse_vector_field("sparse", true, true);
    let schema = builder.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Segment 1: 30 hay docs.
    for i in 0..30 {
        let mut doc = Document::new();
        doc.add_text(title, format!("seg1 hay {}", i));
        doc.add_sparse_vector(sparse, vec![(0, 0.5), (1, 0.3)]);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    // Segment 2: the needle (unique dims 500/501) plus 29 more hay docs.
    let mut needle = Document::new();
    needle.add_text(title, "seg2 needle");
    needle.add_sparse_vector(sparse, vec![(500, 0.95), (501, 0.85)]);
    writer.add_document(needle).unwrap();
    for i in 0..29 {
        let mut doc = Document::new();
        doc.add_text(title, format!("seg2 hay {}", i));
        doc.add_sparse_vector(sparse, vec![(0, 0.4), (2, 0.6)]);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    // Pre-merge: the needle must be retrievable from its own segment.
    let index = Index::open(dir.clone(), config.clone()).await.unwrap();
    assert_eq!(index.num_docs().await.unwrap(), 60);
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();
    let query = SparseVectorQuery::new(sparse, vec![(500, 1.0), (501, 1.0)]);
    let results = searcher.search(&query, 10).await.unwrap();
    assert_eq!(results.len(), 1, "Pre-merge: needle should be found");
    let doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        doc.get_first(title).unwrap().as_text().unwrap(),
        "seg2 needle"
    );

    // Force-merge down to a single segment and repeat the lookup.
    let mut writer = IndexWriter::open(dir.clone(), config.clone())
        .await
        .unwrap();
    writer.force_merge().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    assert_eq!(index.segment_readers().await.unwrap().len(), 1);
    assert_eq!(index.num_docs().await.unwrap(), 60);
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();
    let query = SparseVectorQuery::new(sparse, vec![(500, 1.0), (501, 1.0)]);
    let results = searcher.search(&query, 10).await.unwrap();
    assert_eq!(results.len(), 1, "Post-merge: needle should still be found");
    let doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        doc.get_first(title).unwrap().as_text().unwrap(),
        "seg2 needle"
    );
}
#[tokio::test]
async fn test_needle_dense_vector_flat() {
    use crate::dsl::{DenseVectorConfig, VectorIndexType};
    use crate::query::DenseVectorQuery;

    // Flat (exhaustive) dense index: one all-ones needle hidden among 100
    // low-magnitude hay vectors must surface as the top-1 match.
    let dim = 16;
    let mut builder = SchemaBuilder::default();
    let title = builder.add_text_field("title", true, true);
    let embedding = builder.add_dense_vector_field_with_config(
        "embedding",
        true,
        true,
        DenseVectorConfig {
            dim,
            index_type: VectorIndexType::Flat,
            quantization: crate::dsl::DenseVectorQuantization::F32,
            num_clusters: None,
            nprobe: 0,
            build_threshold: None,
            unit_norm: false,
        },
    );
    let schema = builder.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Hay vectors: pseudo-random components, all well below 0.1.
    for i in 0..100 {
        let vec: Vec<f32> = (0..dim)
            .map(|d| ((i * 7 + d * 13) % 100) as f32 / 1000.0)
            .collect();
        let mut doc = Document::new();
        doc.add_text(title, format!("Hay dense doc {}", i));
        doc.add_dense_vector(embedding, vec);
        writer.add_document(doc).unwrap();
    }

    let mut needle = Document::new();
    needle.add_text(title, "Needle dense document");
    let needle_vec: Vec<f32> = vec![1.0; dim];
    needle.add_dense_vector(embedding, needle_vec.clone());
    writer.add_document(needle).unwrap();
    writer.commit().await.unwrap();

    let index = Index::open(dir, config).await.unwrap();
    assert_eq!(index.num_docs().await.unwrap(), 101);
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();

    // Query with the needle's own vector: the exact match must rank first.
    let query = DenseVectorQuery::new(embedding, needle_vec);
    let results = searcher.search(&query, 5).await.unwrap();
    assert!(!results.is_empty(), "Should find at least 1 result");
    let top_doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    let top_title = top_doc.get_first(title).unwrap().as_text().unwrap();
    assert_eq!(
        top_title, "Needle dense document",
        "Top result should be the needle (exact vector match)"
    );
    assert!(
        results[0].score > 0.9,
        "Exact match should have very high cosine similarity, got {}",
        results[0].score
    );
}
#[tokio::test]
async fn test_binary_dense_vector_rerank() {
    use crate::dsl::BinaryDenseVectorConfig;
    use crate::query::{BinaryDenseVectorQuery, RerankerConfig, TermQuery};

    // 64-bit binary vectors (8 bytes). Expected scores below assume
    // normalized Hamming similarity: 1.0 for an exact match and
    // 1 - k/dim_bits for k flipped bits.
    let dim_bits = 64;
    let byte_len = dim_bits / 8;
    let mut sb = SchemaBuilder::default();
    let title = sb.add_text_field("title", true, true);
    let body = sb.add_text_field("body", true, true);
    let bvec = sb.add_binary_dense_vector_field_with_config(
        "bvec",
        true,
        true,
        BinaryDenseVectorConfig::new(dim_bits),
    );
    let schema = sb.build();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Segment 1: the needle (all bits set) plus 25 hay docs whose vectors
    // are masked with 0x55, guaranteeing a large Hamming distance.
    let needle_vec = vec![0xFF_u8; byte_len];
    let mut needle = Document::new();
    needle.add_text(title, "Needle binary document");
    needle.add_text(body, "searchterm unique content");
    needle.add_binary_dense_vector(bvec, needle_vec.clone());
    writer.add_document(needle).unwrap();
    for i in 0u8..25 {
        let mut doc = Document::new();
        doc.add_text(title, format!("Hay binary doc {}", i));
        doc.add_text(body, "searchterm common filler");
        let v: Vec<u8> = (0..byte_len)
            .map(|d| i.wrapping_add(d as u8) & 0x55)
            .collect();
        doc.add_binary_dense_vector(bvec, v);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    // Segment 2: a near-needle (exactly one bit flipped) plus 25 more hay.
    let mut near_vec = vec![0xFF_u8; byte_len];
    near_vec[0] = 0xFE;
    let mut near = Document::new();
    near.add_text(title, "Near-needle binary document");
    near.add_text(body, "searchterm close match");
    near.add_binary_dense_vector(bvec, near_vec.clone());
    writer.add_document(near).unwrap();
    for i in 25u8..50 {
        let mut doc = Document::new();
        doc.add_text(title, format!("Hay binary doc {}", i));
        doc.add_text(body, "searchterm common filler");
        let v: Vec<u8> = (0..byte_len)
            .map(|d| i.wrapping_add(d as u8) & 0x55)
            .collect();
        doc.add_binary_dense_vector(bvec, v);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    let index = Index::open(dir.clone(), config.clone()).await.unwrap();
    assert_eq!(index.num_docs().await.unwrap(), 52);
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();

    // Direct L1 binary search: exact match first, near-needle second.
    let query = BinaryDenseVectorQuery::new(bvec, needle_vec.clone());
    let results = searcher.search(&query, 5).await.unwrap();
    assert!(
        !results.is_empty(),
        "L1 binary search should return results"
    );
    let top_doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        top_doc.get_first(title).unwrap().as_text().unwrap(),
        "Needle binary document",
        "L1: exact match should be top result"
    );
    assert!(
        (results[0].score - 1.0).abs() < 1e-6,
        "Exact match score should be 1.0, got {}",
        results[0].score
    );
    assert!(results.len() >= 2);
    let second_doc = searcher
        .doc(results[1].segment_id, results[1].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        second_doc.get_first(title).unwrap().as_text().unwrap(),
        "Near-needle binary document",
        "L1: near-needle should be second"
    );
    // One flipped bit out of dim_bits.
    let expected_near = 1.0 - 1.0 / dim_bits as f32;
    assert!(
        (results[1].score - expected_near).abs() < 1e-6,
        "Near-needle score should be {}, got {}",
        expected_near,
        results[1].score
    );

    // Reranking the same binary query must preserve the ordering.
    let reranker_config = RerankerConfig {
        field: bvec,
        vector: Vec::new(),
        binary_vector: needle_vec.clone(),
        combiner: crate::query::MultiValueCombiner::Max,
        unit_norm: false,
        matryoshka_dims: None,
        rrf_k: 0.0,
    };
    let query = BinaryDenseVectorQuery::new(bvec, needle_vec.clone());
    let (reranked, _total) = searcher
        .search_and_rerank(&query, 52, 5, &reranker_config)
        .await
        .unwrap();
    assert!(!reranked.is_empty(), "Reranker should return results");
    let top_doc = searcher
        .doc(reranked[0].segment_id, reranked[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        top_doc.get_first(title).unwrap().as_text().unwrap(),
        "Needle binary document",
        "Reranked: exact match should be top"
    );
    assert!(
        (reranked[0].score - 1.0).abs() < 1e-6,
        "Reranked exact match score should be 1.0, got {}",
        reranked[0].score
    );
    assert!(reranked.len() >= 2);
    let second_doc = searcher
        .doc(reranked[1].segment_id, reranked[1].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        second_doc.get_first(title).unwrap().as_text().unwrap(),
        "Near-needle binary document",
        "Reranked: near-needle should be second"
    );
    assert!(
        (reranked[1].score - expected_near).abs() < 1e-6,
        "Reranked near-needle score should be {}, got {}",
        expected_near,
        reranked[1].score
    );

    // Text recall (L1) + binary rerank (L2): every doc contains
    // "searchterm", so ordering is decided entirely by binary distance.
    let text_query = TermQuery::text(body, "searchterm");
    let (reranked, _total) = searcher
        .search_and_rerank(&text_query, 52, 5, &reranker_config)
        .await
        .unwrap();
    assert!(!reranked.is_empty(), "Text+rerank should return results");
    let top_doc = searcher
        .doc(reranked[0].segment_id, reranked[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        top_doc.get_first(title).unwrap().as_text().unwrap(),
        "Needle binary document",
        "Text L1 + Binary L2: exact match should be top after reranking"
    );
    assert!(
        (reranked[0].score - 1.0).abs() < 1e-6,
        "Text+rerank: needle score should be 1.0, got {}",
        reranked[0].score
    );
    // Guard before indexing [1]: a short result list should fail with a
    // clear message instead of an out-of-bounds panic.
    assert!(reranked.len() >= 2, "Text+rerank should return at least 2 results");
    let second_doc = searcher
        .doc(reranked[1].segment_id, reranked[1].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        second_doc.get_first(title).unwrap().as_text().unwrap(),
        "Near-needle binary document",
        "Text L1 + Binary L2: near-needle should be second after reranking"
    );
    // Reranked output must be sorted by descending score.
    for w in reranked.windows(2) {
        assert!(
            w[0].score >= w[1].score,
            "Reranked scores should be non-increasing: {} < {}",
            w[0].score,
            w[1].score
        );
    }

    // Force-merge to a single segment; reranking must survive the merge.
    let mut writer = IndexWriter::open(dir.clone(), config.clone())
        .await
        .unwrap();
    writer.force_merge().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    assert_eq!(index.segment_readers().await.unwrap().len(), 1);
    assert_eq!(index.num_docs().await.unwrap(), 52);
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();
    let text_query = TermQuery::text(body, "searchterm");
    let (reranked, _total) = searcher
        .search_and_rerank(&text_query, 52, 5, &reranker_config)
        .await
        .unwrap();
    assert!(
        !reranked.is_empty(),
        "Post-merge: reranker should return results"
    );
    let top_doc = searcher
        .doc(reranked[0].segment_id, reranked[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        top_doc.get_first(title).unwrap().as_text().unwrap(),
        "Needle binary document",
        "Post-merge: needle should be top after reranking"
    );
    assert!(
        (reranked[0].score - 1.0).abs() < 1e-6,
        "Post-merge: needle score should be 1.0, got {}",
        reranked[0].score
    );
    // Guard before indexing [1] (same rationale as above).
    assert!(reranked.len() >= 2, "Post-merge should return at least 2 results");
    let second_doc = searcher
        .doc(reranked[1].segment_id, reranked[1].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        second_doc.get_first(title).unwrap().as_text().unwrap(),
        "Near-needle binary document",
        "Post-merge: near-needle should be second after reranking"
    );
}
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn test_needle_combined_all_modalities() {
    use crate::directories::MmapDirectory;
    use crate::dsl::{DenseVectorConfig, VectorIndexType};
    use crate::query::{DenseVectorQuery, SparseVectorQuery, TermQuery};

    // On-disk (mmap) index exercising full-text, sparse-vector and
    // dense-vector retrieval against one corpus: a single needle among
    // 80 hay docs must be found through all three modalities.
    let tmp_dir = tempfile::tempdir().unwrap();
    let dir = MmapDirectory::new(tmp_dir.path());
    let dim = 8;
    let mut builder = SchemaBuilder::default();
    let title = builder.add_text_field("title", true, true);
    let body = builder.add_text_field("body", true, true);
    let sparse = builder.add_sparse_vector_field("sparse", true, true);
    let embedding = builder.add_dense_vector_field_with_config(
        "embedding",
        true,
        true,
        DenseVectorConfig {
            dim,
            index_type: VectorIndexType::Flat,
            quantization: crate::dsl::DenseVectorQuantization::F32,
            num_clusters: None,
            nprobe: 0,
            build_threshold: None,
            unit_norm: false,
        },
    );
    let schema = builder.build();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Hay: generic text, sparse dims confined to 0..20, small dense values.
    for i in 0..80u32 {
        let mut doc = Document::new();
        doc.add_text(title, format!("Hay doc {}", i));
        doc.add_text(body, "general filler text about nothing special");
        doc.add_sparse_vector(sparse, vec![(0, 0.3), (1, 0.2), ((i % 10) + 10, 0.5)]);
        let vec: Vec<f32> = (0..dim)
            .map(|d| ((i as usize * 3 + d * 7) % 50) as f32 / 100.0)
            .collect();
        doc.add_dense_vector(embedding, vec);
        writer.add_document(doc).unwrap();
    }

    // Needle: unique term "rhinoceros", unique sparse dims 9998/9999 and a
    // dense vector far from the hay cluster.
    let mut needle = Document::new();
    needle.add_text(title, "The extraordinary rhinoceros");
    needle.add_text(
        body,
        "This document about rhinoceros is the only one with this word",
    );
    needle.add_sparse_vector(sparse, vec![(9999, 0.99), (9998, 0.88)]);
    let needle_vec = vec![0.9; dim];
    needle.add_dense_vector(embedding, needle_vec.clone());
    writer.add_document(needle).unwrap();
    writer.commit().await.unwrap();

    let index = Index::open(dir, config).await.unwrap();
    assert_eq!(index.num_docs().await.unwrap(), 81);
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();

    // Full-text: the unique term matches only the needle.
    let tq = TermQuery::text(body, "rhinoceros");
    let results = searcher.search(&tq, 10).await.unwrap();
    assert_eq!(
        results.len(),
        1,
        "Full-text: should find exactly the needle"
    );
    let doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert!(
        doc.get_first(title)
            .unwrap()
            .as_text()
            .unwrap()
            .contains("rhinoceros")
    );

    // Sparse: the unique dimensions match only the needle.
    let sq = SparseVectorQuery::new(sparse, vec![(9999, 1.0), (9998, 1.0)]);
    let results = searcher.search(&sq, 10).await.unwrap();
    assert_eq!(results.len(), 1, "Sparse: should find exactly the needle");
    let doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert!(
        doc.get_first(title)
            .unwrap()
            .as_text()
            .unwrap()
            .contains("rhinoceros")
    );

    // Dense: the nearest neighbor of the needle vector is the needle itself.
    let dq = DenseVectorQuery::new(embedding, needle_vec);
    let results = searcher.search(&dq, 1).await.unwrap();
    assert!(!results.is_empty(), "Dense: should find at least 1 result");
    let doc = searcher
        .doc(results[0].segment_id, results[0].doc_id)
        .await
        .unwrap()
        .unwrap();
    assert_eq!(
        doc.get_first(title).unwrap().as_text().unwrap(),
        "The extraordinary rhinoceros",
        "Dense: top-1 should be the needle"
    );

    // All three modalities must converge on the same document id.
    let ft_doc_id = {
        let tq = TermQuery::text(body, "rhinoceros");
        let r = searcher.search(&tq, 1).await.unwrap();
        r[0].doc_id
    };
    let sp_doc_id = {
        let sq = SparseVectorQuery::new(sparse, vec![(9999, 1.0)]);
        let r = searcher.search(&sq, 1).await.unwrap();
        r[0].doc_id
    };
    let dn_doc_id = {
        let dq = DenseVectorQuery::new(embedding, vec![0.9; dim]);
        let r = searcher.search(&dq, 1).await.unwrap();
        r[0].doc_id
    };
    assert_eq!(
        ft_doc_id, sp_doc_id,
        "Full-text and sparse should find same doc"
    );
    assert_eq!(
        sp_doc_id, dn_doc_id,
        "Sparse and dense should find same doc"
    );
}