use crate::directories::RamDirectory;
use crate::dsl::{Document, SchemaBuilder};
use crate::index::{Index, IndexConfig, IndexWriter};
use crate::query::SparseVectorQuery;
use crate::structures::{SparseFormat, SparseVectorConfig, WeightQuantization};
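
/// BMP-format sparse vector config: u8 weight quantization with 64-document blocks.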
fn bmp_config() -> SparseVectorConfig {
SparseVectorConfig {
format: SparseFormat::Bmp,
weight_quantization: WeightQuantization::UInt8,
bmp_block_size: 64,
..SparseVectorConfig::default()
}
}
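
/// MaxScore-format sparse vector config with u8 weight quantization, used as the
/// comparison baseline in the BMP-vs-MaxScore equivalence tests below.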
fn maxscore_config() -> SparseVectorConfig {
SparseVectorConfig {
format: SparseFormat::MaxScore,
weight_quantization: WeightQuantization::UInt8,
..SparseVectorConfig::default()
}
}
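
/// Builds a schema with a text field and a BMP-configured sparse vector field.
/// Returns `(schema, title_field, sparse_field)`.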
fn bmp_schema() -> (crate::dsl::Schema, crate::dsl::Field, crate::dsl::Field) {
let mut sb = SchemaBuilder::default();
let title = sb.add_text_field("title", true, true);
let sparse = sb.add_sparse_vector_field_with_config("sparse", true, true, bmp_config());
(sb.build(), title, sparse)
}
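
/// Same as `bmp_schema`, but the sparse field uses the MaxScore format.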
fn maxscore_schema() -> (crate::dsl::Schema, crate::dsl::Field, crate::dsl::Field) {
let mut sb = SchemaBuilder::default();
let title = sb.add_text_field("title", true, true);
let sparse = sb.add_sparse_vector_field_with_config("sparse", true, true, maxscore_config());
(sb.build(), title, sparse)
}
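
/// Indexes 100 "hay" docs that share dims 0..10 plus one needle doc on dims 1000-1002,
/// then checks that a query on the needle dims returns exactly that doc, a shared-dim
/// query matches many docs, and a query on an absent dim (99999) matches nothing.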
#[tokio::test]
async fn test_bmp_needle_in_haystack() {
let (schema, title, sparse) = bmp_schema();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
for i in 0..100 {
let mut doc = Document::new();
doc.add_text(title, format!("Hay doc {}", i));
// Every hay doc carries low-weight dims 0..10, none of which overlap the needle dims.
let entries: Vec<(u32, f32)> = (0..10)
.map(|d| (d, 0.1 + (i as f32 * 0.001) + (d as f32 * 0.01)))
.collect();
doc.add_sparse_vector(sparse, entries);
writer.add_document(doc).unwrap();
}
let mut needle = Document::new();
needle.add_text(title, "Needle BMP document");
needle.add_sparse_vector(sparse, vec![(1000, 0.9), (1001, 0.8), (1002, 0.7)]);
writer.add_document(needle).unwrap();
writer.commit().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.num_docs().await.unwrap(), 101);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let query = SparseVectorQuery::new(sparse, vec![(1000, 1.0), (1001, 1.0), (1002, 1.0)]);
let results = searcher.search(&query, 10).await.unwrap();
assert_eq!(results.len(), 1, "Only the needle has dims 1000-1002");
assert!(results[0].score > 0.0);
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
assert_eq!(
doc.get_first(title).unwrap().as_text().unwrap(),
"Needle BMP document"
);
let query_shared = SparseVectorQuery::new(sparse, vec![(5, 1.0)]);
let results = searcher.search(&query_shared, 200).await.unwrap();
assert!(
results.len() >= 50,
"Shared dim 5 should match many docs, got {}",
results.len()
);
let query_missing = SparseVectorQuery::new(sparse, vec![(99999, 1.0)]);
let results = searcher.search(&query_missing, 10).await.unwrap();
assert_eq!(results.len(), 0);
}
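
/// Builds two segments (30 hay docs, then a needle plus 29 hay docs), verifies the needle
/// is found both before and after force_merge, and that a common dim still matches most docs.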
#[tokio::test]
async fn test_bmp_merge() {
let (schema, title, sparse) = bmp_schema();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
for i in 0..30 {
let mut doc = Document::new();
doc.add_text(title, format!("seg1 hay {}", i));
doc.add_sparse_vector(sparse, vec![(0, 0.5), (1, 0.3), (2, 0.2)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
let mut needle = Document::new();
needle.add_text(title, "seg2 needle");
needle.add_sparse_vector(sparse, vec![(500, 0.95), (501, 0.85)]);
writer.add_document(needle).unwrap();
for i in 0..29 {
let mut doc = Document::new();
doc.add_text(title, format!("seg2 hay {}", i));
doc.add_sparse_vector(sparse, vec![(0, 0.4), (3, 0.6)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.num_docs().await.unwrap(), 60);
let segments = index.segment_readers().await.unwrap();
assert!(segments.len() >= 2, "Should have at least 2 segments");
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let query = SparseVectorQuery::new(sparse, vec![(500, 1.0), (501, 1.0)]);
let results = searcher.search(&query, 10).await.unwrap();
assert_eq!(results.len(), 1, "Pre-merge: needle should be found");
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(index.num_docs().await.unwrap(), 60);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let results = searcher.search(&query, 10).await.unwrap();
assert_eq!(results.len(), 1, "Post-merge: needle should still be found");
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
assert_eq!(
doc.get_first(title).unwrap().as_text().unwrap(),
"seg2 needle"
);
let query_hay = SparseVectorQuery::new(sparse, vec![(0, 1.0)]);
let results = searcher.search(&query_hay, 100).await.unwrap();
assert!(
results.len() >= 50,
"Post-merge: dim 0 should match >=50 docs, got {}",
results.len()
);
}
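
/// Indexes 50 docs whose single-dim weight increases with the doc number and checks that
/// results come back sorted by descending score with the heaviest-weighted doc on top.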
#[tokio::test]
async fn test_bmp_score_ranking() {
let (schema, title, sparse) = bmp_schema();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
for i in 0..50 {
let mut doc = Document::new();
doc.add_text(title, format!("Doc weight {}", i));
let weight = (i + 1) as f32 / 50.0;
doc.add_sparse_vector(sparse, vec![(0, weight)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let query = SparseVectorQuery::new(sparse, vec![(0, 1.0)]);
let results = searcher.search(&query, 10).await.unwrap();
assert_eq!(results.len(), 10);
for i in 1..results.len() {
assert!(
results[i - 1].score >= results[i].score,
"Results should be sorted descending: score[{}]={} < score[{}]={}",
i - 1,
results[i - 1].score,
i,
results[i].score
);
}
let top_doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
assert_eq!(
top_doc.get_first(title).unwrap().as_text().unwrap(),
"Doc weight 49"
);
}
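
/// Indexes the same pseudo-random vectors under both BMP and MaxScore and checks that the
/// two formats return the same number of results and top scores within a relative tolerance.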
#[tokio::test]
async fn test_bmp_vs_maxscore_equivalence() {
let (schema_bmp, _title_bmp, sparse_bmp) = bmp_schema();
let dir_bmp = RamDirectory::new();
let config = IndexConfig::default();
let mut writer_bmp = IndexWriter::create(dir_bmp.clone(), schema_bmp.clone(), config.clone())
.await
.unwrap();
let (schema_ms, _title_ms, sparse_ms) = maxscore_schema();
let dir_ms = RamDirectory::new();
let mut writer_ms = IndexWriter::create(dir_ms.clone(), schema_ms.clone(), config.clone())
.await
.unwrap();
// Simple deterministic LCG so both the BMP and MaxScore indexes receive identical vectors.
let mut rng_state: u32 = 42;
for i in 0..200 {
let mut entries = Vec::new();
let num_dims = 5 + (rng_state % 11);
for _ in 0..num_dims {
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
let dim = rng_state % 1000;
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
let weight = (rng_state % 100) as f32 / 100.0;
if weight > 0.01 {
entries.push((dim, weight));
}
}
let mut doc_bmp = Document::new();
doc_bmp.add_text(_title_bmp, format!("doc {}", i));
doc_bmp.add_sparse_vector(sparse_bmp, entries.clone());
writer_bmp.add_document(doc_bmp).unwrap();
let mut doc_ms = Document::new();
doc_ms.add_text(_title_ms, format!("doc {}", i));
doc_ms.add_sparse_vector(sparse_ms, entries);
writer_ms.add_document(doc_ms).unwrap();
}
writer_bmp.commit().await.unwrap();
writer_ms.commit().await.unwrap();
let index_bmp = Index::open(dir_bmp, config.clone()).await.unwrap();
let index_ms = Index::open(dir_ms, config.clone()).await.unwrap();
let query_dims = vec![(42, 0.8), (100, 0.6), (200, 0.4), (500, 0.9)];
let reader_bmp = index_bmp.reader().await.unwrap();
let searcher_bmp = reader_bmp.searcher().await.unwrap();
let query_bmp = SparseVectorQuery::new(sparse_bmp, query_dims.clone());
let results_bmp = searcher_bmp.search(&query_bmp, 20).await.unwrap();
let reader_ms = index_ms.reader().await.unwrap();
let searcher_ms = reader_ms.searcher().await.unwrap();
let query_ms = SparseVectorQuery::new(sparse_ms, query_dims);
let results_ms = searcher_ms.search(&query_ms, 20).await.unwrap();
assert_eq!(
results_bmp.len(),
results_ms.len(),
"BMP and MaxScore should return same number of results: BMP={}, MS={}",
results_bmp.len(),
results_ms.len()
);
if !results_bmp.is_empty() {
let bmp_top = results_bmp[0].score;
let ms_top = results_ms[0].score;
let diff = (bmp_top - ms_top).abs();
assert!(
diff < 0.2 * ms_top.max(0.01),
"Top scores should be close: BMP={:.4}, MS={:.4}, diff={:.4}",
bmp_top,
ms_top,
diff
);
}
}
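
/// Same BMP-vs-MaxScore comparison, but each document carries several sparse vectors
/// (multiple ordinals per field); the top results are compared with a relative score tolerance.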
#[tokio::test]
async fn test_bmp_vs_maxscore_multi_ordinal() {
let (schema_bmp, _title_bmp, sparse_bmp) = bmp_schema();
let dir_bmp = RamDirectory::new();
let config = IndexConfig::default();
let mut writer_bmp = IndexWriter::create(dir_bmp.clone(), schema_bmp.clone(), config.clone())
.await
.unwrap();
let (schema_ms, _title_ms, sparse_ms) = maxscore_schema();
let dir_ms = RamDirectory::new();
let mut writer_ms = IndexWriter::create(dir_ms.clone(), schema_ms.clone(), config.clone())
.await
.unwrap();
let vectors_per_doc: Vec<Vec<Vec<(u32, f32)>>> = vec![
vec![
vec![(10, 0.9), (20, 0.3)],
vec![(10, 0.7), (30, 0.5)],
vec![(20, 0.8), (40, 0.2)],
],
vec![
vec![(10, 0.4), (30, 0.9)],
vec![(30, 0.8), (50, 0.3)],
vec![(10, 0.2), (60, 0.5)],
],
vec![
vec![(20, 0.95), (10, 0.1)],
vec![(20, 0.85)],
vec![(20, 0.6), (70, 0.4)],
],
];
let mut all_docs = vectors_per_doc.clone();
// Deterministic LCG for filler docs with random single-dim vectors.
let mut rng: u32 = 777;
for _ in 0..50 {
let mut doc_vecs = Vec::new();
for _ in 0..3 {
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let dim = 100 + (rng % 200);
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let w = (rng % 50) as f32 / 100.0 + 0.01;
doc_vecs.push(vec![(dim, w)]);
}
all_docs.push(doc_vecs);
}
for vectors in &all_docs {
let mut doc_bmp = Document::new();
doc_bmp.add_text(_title_bmp, "doc");
for v in vectors {
doc_bmp.add_sparse_vector(sparse_bmp, v.clone());
}
writer_bmp.add_document(doc_bmp).unwrap();
let mut doc_ms = Document::new();
doc_ms.add_text(_title_ms, "doc");
for v in vectors {
doc_ms.add_sparse_vector(sparse_ms, v.clone());
}
writer_ms.add_document(doc_ms).unwrap();
}
writer_bmp.commit().await.unwrap();
writer_ms.commit().await.unwrap();
let index_bmp = Index::open(dir_bmp, config.clone()).await.unwrap();
let index_ms = Index::open(dir_ms, config.clone()).await.unwrap();
let query_dims = vec![(10, 1.0), (20, 0.5)];
let reader_bmp = index_bmp.reader().await.unwrap();
let searcher_bmp = reader_bmp.searcher().await.unwrap();
let q_bmp = SparseVectorQuery::new(sparse_bmp, query_dims.clone());
let results_bmp = searcher_bmp.search(&q_bmp, 10).await.unwrap();
let reader_ms = index_ms.reader().await.unwrap();
let searcher_ms = reader_ms.searcher().await.unwrap();
let q_ms = SparseVectorQuery::new(sparse_ms, query_dims);
let results_ms = searcher_ms.search(&q_ms, 10).await.unwrap();
assert_eq!(
results_bmp.len(),
results_ms.len(),
"Multi-ordinal: BMP returned {} results, MaxScore returned {}",
results_bmp.len(),
results_ms.len()
);
let n = results_bmp.len().min(5);
for i in 0..n {
let bmp_doc = results_bmp[i].doc_id;
let ms_doc = results_ms[i].doc_id;
let bmp_score = results_bmp[i].score;
let ms_score = results_ms[i].score;
let diff = (bmp_score - ms_score).abs();
assert!(
diff < 0.25 * ms_score.max(0.01),
"Multi-ordinal rank {}: BMP doc_id={} score={:.4}, MS doc_id={} score={:.4}, diff={:.4}",
i,
bmp_doc,
bmp_score,
ms_doc,
ms_score,
diff
);
}
}
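
/// Indexes 500 docs (several BMP blocks at the configured block size of 64) and checks that
/// a dim present on every doc and a dim shared by ~25 docs both return sensible result counts.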
#[tokio::test]
async fn test_bmp_many_blocks() {
let (schema, title, sparse) = bmp_schema();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
for i in 0..500 {
let mut doc = Document::new();
doc.add_text(title, format!("Doc {}", i));
let dim = (i % 20) as u32;
let weight = 0.1 + (i as f32 / 500.0);
doc.add_sparse_vector(sparse, vec![(dim, weight), (100, 0.05)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
let index = Index::open(dir, config).await.unwrap();
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let query = SparseVectorQuery::new(sparse, vec![(100, 1.0)]);
let results = searcher.search(&query, 600).await.unwrap();
assert!(
results.len() >= 400,
"Dim 100 should match most docs, got {}",
results.len()
);
let query = SparseVectorQuery::new(sparse, vec![(5, 1.0)]);
let results = searcher.search(&query, 100).await.unwrap();
assert!(
results.len() >= 20,
"Dim 5 should match ~25 docs, got {}",
results.len()
);
}
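
/// Writes 5 segments of 100 docs, each doc tagged with a unique dim, force-merges to a single
/// segment, and verifies every unique dim still resolves to exactly its own document.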
#[tokio::test]
async fn test_bmp_merge_exact_doc_ids() {
let (schema, title, sparse) = bmp_schema();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
const DOCS_PER_SEG: usize = 100;
const NUM_SEGS: usize = 5;
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let mut doc = Document::new();
doc.add_text(title, format!("seg{} doc{}", seg, i));
doc.add_sparse_vector(sparse, vec![(unique_dim, 1.0), (9999, 0.1)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
}
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(
index.num_docs().await.unwrap() as usize,
DOCS_PER_SEG * NUM_SEGS
);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let expected_title = format!("seg{} doc{}", seg, i);
let query = SparseVectorQuery::new(sparse, vec![(unique_dim, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
assert_eq!(
results.len(),
1,
"dim {} should match exactly 1 doc, got {}",
unique_dim,
results.len()
);
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got_title = doc.get_first(title).unwrap().as_text().unwrap().to_string();
assert_eq!(
got_title, expected_title,
"dim {} returned wrong doc: got '{}', expected '{}'",
unique_dim, got_title, expected_title
);
}
}
}
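
/// MaxScore counterpart of the merge test: 3 segments of 50 uniquely-tagged docs are merged
/// into one segment, then every unique dim plus a dim shared by all docs is re-verified.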
#[tokio::test]
async fn test_maxscore_merge() {
let (schema, title, sparse) = maxscore_schema();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
const DOCS_PER_SEG: usize = 50;
const NUM_SEGS: usize = 3;
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let mut doc = Document::new();
doc.add_text(title, format!("seg{} doc{}", seg, i));
doc.add_sparse_vector(sparse, vec![(unique_dim, 1.0), (9999, 0.1)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
}
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(
index.num_docs().await.unwrap() as usize,
DOCS_PER_SEG * NUM_SEGS
);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let expected_title = format!("seg{} doc{}", seg, i);
let query = SparseVectorQuery::new(sparse, vec![(unique_dim, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
assert_eq!(
results.len(),
1,
"MaxScore merge: dim {} should match exactly 1 doc, got {}",
unique_dim,
results.len()
);
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got_title = doc.get_first(title).unwrap().as_text().unwrap().to_string();
assert_eq!(
got_title, expected_title,
"MaxScore merge: dim {} returned wrong doc: got '{}', expected '{}'",
unique_dim, got_title, expected_title
);
}
}
let query = SparseVectorQuery::new(sparse, vec![(9999, 1.0)]);
let results = searcher.search(&query, 200).await.unwrap();
assert_eq!(
results.len(),
DOCS_PER_SEG * NUM_SEGS,
"MaxScore merge: dim 9999 should match all {} docs, got {}",
DOCS_PER_SEG * NUM_SEGS,
results.len()
);
}
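
/// Runs two rounds of commits followed by force_merge on the same index and checks doc counts,
/// segment counts, and dim match counts after each round.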
#[tokio::test]
async fn test_bmp_multi_round_merge() {
let (schema, title, sparse) = bmp_schema();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
for batch in 0..3 {
for i in 0..20 {
let mut doc = Document::new();
doc.add_text(title, format!("r1 b{} d{}", batch, i));
doc.add_sparse_vector(
sparse,
vec![(0, 0.5), ((batch * 10 + i % 5 + 1) as u32, 0.8)],
);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
}
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.num_docs().await.unwrap(), 60);
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
for batch in 0..2 {
for i in 0..20 {
let mut doc = Document::new();
doc.add_text(title, format!("r2 b{} d{}", batch, i));
doc.add_sparse_vector(sparse, vec![(0, 0.3), (999, 0.9)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
}
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.num_docs().await.unwrap(), 100);
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let query = SparseVectorQuery::new(sparse, vec![(0, 1.0)]);
let results = searcher.search(&query, 200).await.unwrap();
assert!(
results.len() >= 90,
"Dim 0 should match most docs after 2 merges, got {}",
results.len()
);
let query = SparseVectorQuery::new(sparse, vec![(999, 1.0)]);
let results = searcher.search(&query, 100).await.unwrap();
assert!(
results.len() >= 35,
"Dim 999 should match ~40 docs, got {}",
results.len()
);
}
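
/// Exhaustive merge check: 5 segments of 100 docs with unique, shared, and per-segment topic
/// dims are force-merged; every unique dim must return exactly its own doc and the shared dim
/// must match all docs. Failures are collected and reported together.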
#[tokio::test]
async fn test_bmp_merge_correctness() {
let mut sb = SchemaBuilder::default();
let title = sb.add_text_field("title", true, true);
let sparse = sb.add_sparse_vector_field_with_config("sparse", true, true, bmp_config());
let schema = sb.build();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
const DOCS_PER_SEG: usize = 100;
const NUM_SEGS: usize = 5;
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let mut doc = Document::new();
doc.add_text(title, format!("seg{} doc{}", seg, i));
let topic_dim = 10000 + (seg as u32 * 100);
doc.add_sparse_vector(
sparse,
vec![(unique_dim, 1.0), (9999, 0.1), (topic_dim, 0.5)],
);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
}
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
let segments = index.segment_readers().await.unwrap();
assert!(
segments.len() >= 5,
"Should have >= 5 segments before merge"
);
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(
index.num_docs().await.unwrap() as usize,
DOCS_PER_SEG * NUM_SEGS
);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let mut failures = Vec::new();
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let expected_title = format!("seg{} doc{}", seg, i);
let query = SparseVectorQuery::new(sparse, vec![(unique_dim, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"dim {}: expected 1 result, got {}",
unique_dim,
results.len()
));
continue;
}
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got_title = doc.get_first(title).unwrap().as_text().unwrap().to_string();
if got_title != expected_title {
failures.push(format!(
"dim {}: got '{}', expected '{}'",
unique_dim, got_title, expected_title
));
}
}
}
assert!(
failures.is_empty(),
"Merge correctness: {} failures:\n{}",
failures.len(),
failures[..failures.len().min(20)].join("\n")
);
let query = SparseVectorQuery::new(sparse, vec![(9999, 1.0)]);
let results = searcher.search(&query, 600).await.unwrap();
assert_eq!(
results.len(),
DOCS_PER_SEG * NUM_SEGS,
"Merge: dim 9999 should match all {} docs, got {}",
DOCS_PER_SEG * NUM_SEGS,
results.len()
);
}
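
/// Larger merge check (3 segments of 500 docs, four dims per doc) verifying per-doc unique
/// dims after force_merge, plus a topic query that should match a sizeable subset of docs.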
#[tokio::test]
async fn test_bmp_merge_large() {
let mut sb = SchemaBuilder::default();
let title = sb.add_text_field("title", true, true);
let sparse = sb.add_sparse_vector_field_with_config("sparse", true, true, bmp_config());
let schema = sb.build();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
const DOCS_PER_SEG: usize = 500;
const NUM_SEGS: usize = 3;
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let topic = i / 50;
let topic_dim = 20000 + (topic as u32 * 10);
let topic_dim2 = 20001 + (topic as u32 * 10);
let mut doc = Document::new();
doc.add_text(title, format!("s{}d{}", seg, i));
doc.add_sparse_vector(
sparse,
vec![
(unique_dim, 1.0),
(9999, 0.1),
(topic_dim, 0.8),
(topic_dim2, 0.5),
],
);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
}
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let mut failures = Vec::new();
for seg in 0..NUM_SEGS {
for i in 0..DOCS_PER_SEG {
let unique_dim = (seg * DOCS_PER_SEG + i) as u32;
let expected = format!("s{}d{}", seg, i);
let query = SparseVectorQuery::new(sparse, vec![(unique_dim, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"dim {}: expected 1 result, got {}",
unique_dim,
results.len()
));
continue;
}
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got = doc.get_first(title).unwrap().as_text().unwrap().to_string();
if got != expected {
failures.push(format!(
"dim {}: got '{}', expected '{}'",
unique_dim, got, expected
));
}
}
}
assert!(
failures.is_empty(),
"Merge large: {} failures (of {}):\n{}",
failures.len(),
DOCS_PER_SEG * NUM_SEGS,
failures[..failures.len().min(30)].join("\n")
);
let query = SparseVectorQuery::new(sparse, vec![(20000, 1.0), (20001, 0.5)]);
let results = searcher.search(&query, 200).await.unwrap();
assert!(
results.len() >= 100,
"Topic query should match >=100 docs, got {}",
results.len()
);
}
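
/// Builds a single segment, runs the writer's reorder pass, and verifies that lookups by
/// per-doc unique dim and by a dim shared across all docs are unaffected by the reordering.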
#[tokio::test]
async fn test_bmp_reorder_standalone() {
let mut sb = SchemaBuilder::default();
let title = sb.add_text_field("title", true, true);
let sparse = sb.add_sparse_vector_field_with_config("sparse", true, true, bmp_config());
let schema = sb.build();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
const NUM_DOCS: usize = 200;
for i in 0..NUM_DOCS {
let unique_dim = i as u32;
let topic = i / 50;
let topic_dim = 10000 + (topic as u32 * 10);
let mut doc = Document::new();
doc.add_text(title, format!("doc{}", i));
doc.add_sparse_vector(
sparse,
vec![(unique_dim, 1.0), (9999, 0.1), (topic_dim, 0.5)],
);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(index.num_docs().await.unwrap() as usize, NUM_DOCS);
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.reorder().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(index.num_docs().await.unwrap() as usize, NUM_DOCS);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let mut failures = Vec::new();
for i in 0..NUM_DOCS {
let unique_dim = i as u32;
let expected_title = format!("doc{}", i);
let query = SparseVectorQuery::new(sparse, vec![(unique_dim, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"dim {}: expected 1 result, got {}",
unique_dim,
results.len()
));
continue;
}
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got_title = doc.get_first(title).unwrap().as_text().unwrap().to_string();
if got_title != expected_title {
failures.push(format!(
"dim {}: got '{}', expected '{}'",
unique_dim, got_title, expected_title
));
}
}
assert!(
failures.is_empty(),
"Standalone reorder: {} failures:\n{}",
failures.len(),
failures[..failures.len().min(20)].join("\n")
);
let query = SparseVectorQuery::new(sparse, vec![(9999, 1.0)]);
let results = searcher.search(&query, 300).await.unwrap();
assert_eq!(
results.len(),
NUM_DOCS,
"Reorder: dim 9999 should match all {} docs, got {}",
NUM_DOCS,
results.len()
);
}
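
/// Reorder test with two BMP sparse fields on the same documents; after reordering, unique
/// and shared dims on both fields must still resolve to the correct documents.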
#[tokio::test]
async fn test_bmp_reorder_multi_field() {
let mut sb = SchemaBuilder::default();
let title = sb.add_text_field("title", true, true);
let sparse_a = sb.add_sparse_vector_field_with_config("sparse_a", true, true, bmp_config());
let sparse_b = sb.add_sparse_vector_field_with_config("sparse_b", true, true, bmp_config());
let schema = sb.build();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
const NUM_DOCS: usize = 100;
for i in 0..NUM_DOCS {
let mut doc = Document::new();
doc.add_text(title, format!("doc{}", i));
doc.add_sparse_vector(sparse_a, vec![(i as u32, 1.0), (9999, 0.1)]);
doc.add_sparse_vector(sparse_b, vec![(1000 + i as u32, 1.0), (19999, 0.1)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.reorder().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(index.num_docs().await.unwrap() as usize, NUM_DOCS);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let mut failures = Vec::new();
for i in 0..NUM_DOCS {
let query = SparseVectorQuery::new(sparse_a, vec![(i as u32, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"field_a dim {}: expected 1 result, got {}",
i,
results.len()
));
continue;
}
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got = doc.get_first(title).unwrap().as_text().unwrap();
if got != format!("doc{}", i) {
failures.push(format!("field_a dim {}: got '{}'", i, got));
}
}
for i in 0..NUM_DOCS {
let query = SparseVectorQuery::new(sparse_b, vec![(1000 + i as u32, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"field_b dim {}: expected 1 result, got {}",
1000 + i,
results.len()
));
continue;
}
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got = doc.get_first(title).unwrap().as_text().unwrap();
if got != format!("doc{}", i) {
failures.push(format!("field_b dim {}: got '{}'", 1000 + i, got));
}
}
assert!(
failures.is_empty(),
"Multi-field reorder: {} failures:\n{}",
failures.len(),
failures[..failures.len().min(20)].join("\n")
);
let query = SparseVectorQuery::new(sparse_a, vec![(9999, 1.0)]);
let results = searcher.search(&query, 200).await.unwrap();
assert_eq!(results.len(), NUM_DOCS);
let query = SparseVectorQuery::new(sparse_b, vec![(19999, 1.0)]);
let results = searcher.search(&query, 200).await.unwrap();
assert_eq!(results.len(), NUM_DOCS);
}
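
/// Multi-ordinal clustering test: each doc gets three sparse vectors (topic anchors 1000,
/// 2000, 3000) plus a per-doc unique dim per ordinal, written as two segments and force-merged;
/// every unique dim must still map back to its document, and the anchor dims must match all docs.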
#[tokio::test]
async fn test_bmp_multi_ordinal_clustering() {
let mut sb = SchemaBuilder::default();
let title = sb.add_text_field("title", true, true);
let sparse = sb.add_sparse_vector_field_with_config("sparse", true, true, bmp_config());
let schema = sb.build();
let dir = RamDirectory::new();
let config = IndexConfig::default();
let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
.await
.unwrap();
const NUM_DOCS: usize = 200;
// Deterministic LCG for per-topic dims and weights.
let mut rng: u32 = 42;
for i in 0..NUM_DOCS / 2 {
let mut doc = Document::new();
doc.add_text(title, format!("doc{}", i));
let unique_a = 5000 + i as u32;
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let topic_a_dim = 1001 + (rng % 99);
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let w = 0.3 + (rng % 70) as f32 / 100.0;
doc.add_sparse_vector(sparse, vec![(1000, 0.5), (topic_a_dim, w), (unique_a, 1.0)]);
let unique_b = 6000 + i as u32;
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let topic_b_dim = 2001 + (rng % 99);
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let w = 0.3 + (rng % 70) as f32 / 100.0;
doc.add_sparse_vector(sparse, vec![(2000, 0.5), (topic_b_dim, w), (unique_b, 1.0)]);
let unique_c = 7000 + i as u32;
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let topic_c_dim = 3001 + (rng % 99);
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let w = 0.3 + (rng % 70) as f32 / 100.0;
doc.add_sparse_vector(sparse, vec![(3000, 0.5), (topic_c_dim, w), (unique_c, 1.0)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
for i in NUM_DOCS / 2..NUM_DOCS {
let mut doc = Document::new();
doc.add_text(title, format!("doc{}", i));
let unique_a = 5000 + i as u32;
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let topic_a_dim = 1001 + (rng % 99);
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let w = 0.3 + (rng % 70) as f32 / 100.0;
doc.add_sparse_vector(sparse, vec![(1000, 0.5), (topic_a_dim, w), (unique_a, 1.0)]);
let unique_b = 6000 + i as u32;
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let topic_b_dim = 2001 + (rng % 99);
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let w = 0.3 + (rng % 70) as f32 / 100.0;
doc.add_sparse_vector(sparse, vec![(2000, 0.5), (topic_b_dim, w), (unique_b, 1.0)]);
let unique_c = 7000 + i as u32;
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let topic_c_dim = 3001 + (rng % 99);
rng = rng.wrapping_mul(1103515245).wrapping_add(12345);
let w = 0.3 + (rng % 70) as f32 / 100.0;
doc.add_sparse_vector(sparse, vec![(3000, 0.5), (topic_c_dim, w), (unique_c, 1.0)]);
writer.add_document(doc).unwrap();
}
writer.commit().await.unwrap();
let mut writer = IndexWriter::open(dir.clone(), config.clone())
.await
.unwrap();
writer.force_merge().await.unwrap();
let index = Index::open(dir.clone(), config.clone()).await.unwrap();
assert_eq!(index.segment_readers().await.unwrap().len(), 1);
assert_eq!(index.num_docs().await.unwrap() as usize, NUM_DOCS);
let reader = index.reader().await.unwrap();
let searcher = reader.searcher().await.unwrap();
let mut failures = Vec::new();
for i in 0..NUM_DOCS {
let expected = format!("doc{}", i);
let query = SparseVectorQuery::new(sparse, vec![(5000 + i as u32, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"doc{} ord0 unique_dim={}: got {} results",
i,
5000 + i,
results.len()
));
} else {
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got = doc.get_first(title).unwrap().as_text().unwrap();
if got != expected {
failures.push(format!(
"doc{} ord0: got '{}', expected '{}'",
i, got, expected
));
}
}
let query = SparseVectorQuery::new(sparse, vec![(6000 + i as u32, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"doc{} ord1 unique_dim={}: got {} results",
i,
6000 + i,
results.len()
));
} else {
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got = doc.get_first(title).unwrap().as_text().unwrap();
if got != expected {
failures.push(format!(
"doc{} ord1: got '{}', expected '{}'",
i, got, expected
));
}
}
let query = SparseVectorQuery::new(sparse, vec![(7000 + i as u32, 1.0)]);
let results = searcher.search(&query, 5).await.unwrap();
if results.len() != 1 {
failures.push(format!(
"doc{} ord2 unique_dim={}: got {} results",
i,
7000 + i,
results.len()
));
} else {
let doc = searcher
.doc(results[0].segment_id, results[0].doc_id)
.await
.unwrap()
.unwrap();
let got = doc.get_first(title).unwrap().as_text().unwrap();
if got != expected {
failures.push(format!(
"doc{} ord2: got '{}', expected '{}'",
i, got, expected
));
}
}
}
assert!(
failures.is_empty(),
"Multi-ordinal clustering: {} failures:\n{}",
failures.len(),
failures[..failures.len().min(20)].join("\n")
);
let query = SparseVectorQuery::new(sparse, vec![(1000, 1.0)]);
let results_a = searcher.search(&query, 300).await.unwrap();
assert_eq!(
results_a.len(),
NUM_DOCS,
"Topic A anchor dim 1000 should match all {} docs, got {}",
NUM_DOCS,
results_a.len()
);
let query = SparseVectorQuery::new(sparse, vec![(1000, 0.8), (2000, 0.8)]);
let results_cross = searcher.search(&query, 300).await.unwrap();
assert_eq!(
results_cross.len(),
NUM_DOCS,
"Cross-topic query should match all {} docs, got {}",
NUM_DOCS,
results_cross.len()
);
}