use std::fs::File;
use std::io::BufReader;
use std::path::Path;
use std::sync::{Arc, Barrier};
use std::thread;
use ndarray::Array2;
use ndarray_rand::rand_distr::Uniform;
use ndarray_rand::RandomExt;
use next_plaid::index::MmapIndex;
use next_plaid::{IndexConfig, UpdateConfig};
use tempfile::TempDir;
/// Creates a new temporary directory for a test to build its index in.
///
/// The returned [`TempDir`] is handed to the caller, which keeps it bound for
/// as long as it uses the directory's path.
///
/// # Panics
///
/// Panics if the temporary directory cannot be created.
fn setup_test_dir() -> TempDir {
    // `expect` instead of a bare `unwrap` so an environment problem (e.g. an
    // unwritable temp location) is identifiable from the failure output.
    TempDir::new().expect("failed to create temporary directory for test index")
}
/// Builds `num_docs` random embedding matrices, each of shape
/// `(tokens_per_doc, dim)`.
///
/// Entries are sampled with `Uniform::new(-1.0, 1.0)`. Every row whose L2 norm
/// is strictly positive is then divided by that norm; rows with a zero norm
/// are left untouched.
fn random_embeddings(num_docs: usize, tokens_per_doc: usize, dim: usize) -> Vec<Array2<f32>> {
    let mut docs = Vec::with_capacity(num_docs);
    for _ in 0..num_docs {
        let mut matrix =
            Array2::<f32>::random((tokens_per_doc, dim), Uniform::new(-1.0f32, 1.0f32));
        // Walk the matrix one token (row) at a time and normalise in place.
        for mut token in matrix.outer_iter_mut() {
            let length: f32 = token.dot(&token).sqrt();
            if length > 0.0 {
                token.mapv_inplace(|value| value / length);
            }
        }
        docs.push(matrix);
    }
    docs
}
/// Reads the `num_documents` field from `metadata.json` inside `index_path`.
///
/// Returns `None` when the file cannot be opened, cannot be parsed as JSON,
/// the `num_documents` entry is absent or not an unsigned integer, or the
/// value does not fit in `usize` on the current target.
fn read_num_documents_from_file(index_path: &str) -> Option<usize> {
    // Scoped import keeps this helper self-contained regardless of edition prelude.
    use std::convert::TryFrom;

    let metadata_path = Path::new(index_path).join("metadata.json");
    // A missing file simply fails to open, so no separate (and racy)
    // existence check is performed first.
    let file = File::open(&metadata_path).ok()?;
    let metadata: serde_json::Value = serde_json::from_reader(BufReader::new(file)).ok()?;
    // Checked conversion: an out-of-range count yields `None` instead of a
    // silently truncated number.
    metadata["num_documents"]
        .as_u64()
        .and_then(|n| usize::try_from(n).ok())
}
/// After `MmapIndex::create_with_kmeans` on 10 documents, both the
/// `num_documents` value read back from `metadata.json` and the in-memory
/// `index.metadata.num_documents` must equal 10.
#[test]
fn test_metadata_sync_after_create() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    let index_config = IndexConfig {
        nbits: 4,
        batch_size: 100,
        seed: Some(42),
        ..Default::default()
    };
    let docs = random_embeddings(10, 8, 64);
    let index = MmapIndex::create_with_kmeans(&docs, index_path, &index_config).unwrap();

    // On-disk view first, then the in-memory view.
    let on_disk = read_num_documents_from_file(index_path);
    assert_eq!(
        on_disk.expect("metadata.json should exist"),
        10,
        "metadata.json should reflect 10 documents"
    );
    assert_eq!(index.metadata.num_documents, 10);
}
/// Creates an index with 5 documents, adds 5 more through `index.update`, and
/// checks that `metadata.json` and `index.metadata.num_documents` both report
/// 10 afterwards.
#[test]
fn test_metadata_sync_after_update() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    let index_config = IndexConfig {
        nbits: 4,
        batch_size: 100,
        seed: Some(42),
        ..Default::default()
    };
    let seed_batch = random_embeddings(5, 8, 64);
    let mut index = MmapIndex::create_with_kmeans(&seed_batch, index_path, &index_config).unwrap();
    // Baseline: the file already reports the initial batch.
    assert_eq!(read_num_documents_from_file(index_path).unwrap(), 5);

    let extra_batch = random_embeddings(5, 8, 64);
    let update_cfg = UpdateConfig::default();
    let _new_ids = index.update(&extra_batch, &update_cfg).unwrap();

    let on_disk = read_num_documents_from_file(index_path);
    assert_eq!(
        on_disk.expect("metadata.json should exist"),
        10,
        "metadata.json should reflect 10 documents after update"
    );
    assert_eq!(index.metadata.num_documents, 10);
}
/// Creates an index with 3 documents and then applies three consecutive
/// updates (2, 3 and 4 documents), checking after each one that
/// `metadata.json` reports the running total (5, 8, 12).
///
/// Both configs set `start_from_scratch: 5`; the update config additionally
/// sets `buffer_size: 1000`.
#[test]
fn test_metadata_sync_sequential_updates() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    let index_config = IndexConfig {
        nbits: 4,
        batch_size: 100,
        seed: Some(42),
        start_from_scratch: 5,
        ..Default::default()
    };
    let update_cfg = UpdateConfig {
        start_from_scratch: 5,
        buffer_size: 1000,
        ..Default::default()
    };

    let first_batch = random_embeddings(3, 8, 64);
    let mut index = MmapIndex::create_with_kmeans(&first_batch, index_path, &index_config).unwrap();
    assert_eq!(read_num_documents_from_file(index_path).unwrap(), 3);

    // (documents to add, expected total on disk afterwards, failure message)
    let steps = [
        (2, 5, "After first update: expected 5 docs"),
        (3, 8, "After second update: expected 8 docs"),
        (4, 12, "After third update: expected 12 docs"),
    ];
    for &(batch_docs, expected_total, failure_msg) in &steps {
        let batch = random_embeddings(batch_docs, 8, 64);
        let _new_ids = index.update(&batch, &update_cfg).unwrap();
        assert_eq!(
            read_num_documents_from_file(index_path).unwrap(),
            expected_total,
            "{}",
            failure_msg
        );
    }
}
/// Calls `MmapIndex::update_or_create` on an empty directory with 8 documents
/// and checks that `metadata.json`, `index.metadata.num_documents` and the
/// number of returned document ids all equal 8.
#[test]
fn test_metadata_sync_update_or_create_new() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    let index_cfg = IndexConfig {
        nbits: 4,
        batch_size: 100,
        seed: Some(42),
        ..Default::default()
    };
    let update_cfg = UpdateConfig::default();
    let docs = random_embeddings(8, 8, 64);

    let (index, assigned_ids) =
        MmapIndex::update_or_create(&docs, index_path, &index_cfg, &update_cfg).unwrap();

    let on_disk = read_num_documents_from_file(index_path);
    assert_eq!(
        on_disk.expect("metadata.json should exist"),
        8,
        "metadata.json should reflect 8 documents"
    );
    assert_eq!(index.metadata.num_documents, 8);
    assert_eq!(assigned_ids.len(), 8);
}
/// Calls `MmapIndex::update_or_create` twice on the same directory with 5
/// documents each time. The first call must return ids `0..=4` and leave 5
/// documents in `metadata.json`; after the first index value is dropped, the
/// second call must return ids `5..=9` and both the file and the in-memory
/// metadata must report 10.
#[test]
fn test_metadata_sync_update_or_create_existing() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    let index_cfg = IndexConfig {
        nbits: 4,
        batch_size: 100,
        seed: Some(42),
        ..Default::default()
    };
    let update_cfg = UpdateConfig::default();

    {
        let first_batch = random_embeddings(5, 8, 64);
        let (_first_index, first_ids) =
            MmapIndex::update_or_create(&first_batch, index_path, &index_cfg, &update_cfg)
                .unwrap();
        assert_eq!(read_num_documents_from_file(index_path).unwrap(), 5);
        assert_eq!(first_ids, vec![0, 1, 2, 3, 4]);
        // `_first_index` goes out of scope here, before the second call.
    }

    let second_batch = random_embeddings(5, 8, 64);
    let (index, second_ids) =
        MmapIndex::update_or_create(&second_batch, index_path, &index_cfg, &update_cfg).unwrap();

    let on_disk = read_num_documents_from_file(index_path);
    assert_eq!(
        on_disk.expect("metadata.json should exist"),
        10,
        "metadata.json should reflect 10 documents"
    );
    assert_eq!(index.metadata.num_documents, 10);
    assert_eq!(second_ids, vec![5, 6, 7, 8, 9]);
}
/// Checks that a second thread reading `metadata.json` sees 10 documents once
/// the main thread has finished an `index.update` that adds 5 documents to a
/// 5-document index.
///
/// A two-party `Barrier` orders the threads: the reader blocks on it before
/// touching the file, and the main thread only reaches it after `update`
/// has returned.
#[test]
fn test_metadata_sync_cross_thread_visibility() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap().to_string();

    let index_config = IndexConfig {
        nbits: 4,
        batch_size: 100,
        seed: Some(42),
        ..Default::default()
    };
    let seed_batch = random_embeddings(5, 8, 64);
    let mut index =
        MmapIndex::create_with_kmeans(&seed_batch, &index_path, &index_config).unwrap();

    let rendezvous = Arc::new(Barrier::new(2));
    let reader = {
        // Shadowed clones are moved into the reader thread; the originals
        // stay usable on the main thread.
        let rendezvous = Arc::clone(&rendezvous);
        let index_path = index_path.clone();
        thread::spawn(move || {
            rendezvous.wait();
            let seen = read_num_documents_from_file(&index_path);
            assert_eq!(
                seen.expect("metadata.json should exist"),
                10,
                "Reader thread should see 10 documents immediately after update"
            );
        })
    };

    let extra_batch = random_embeddings(5, 8, 64);
    let update_cfg = UpdateConfig::default();
    let _new_ids = index.update(&extra_batch, &update_cfg).unwrap();

    // Release the reader only after the update has completed.
    rendezvous.wait();
    reader
        .join()
        .expect("Reader thread should complete successfully");
}
/// Creates a 10-document index, deletes ids 0, 2 and 4, calls
/// `index.reload()`, and checks that `metadata.json` and
/// `index.metadata.num_documents` both report 7.
#[test]
fn test_metadata_sync_after_delete() {
    let tmp = setup_test_dir();
    let index_path = tmp.path().to_str().unwrap();

    let index_config = IndexConfig {
        nbits: 4,
        batch_size: 100,
        seed: Some(42),
        ..Default::default()
    };
    let docs = random_embeddings(10, 8, 64);
    let mut index = MmapIndex::create_with_kmeans(&docs, index_path, &index_config).unwrap();
    assert_eq!(read_num_documents_from_file(index_path).unwrap(), 10);

    let removed = index.delete(&[0, 2, 4]).unwrap();
    assert_eq!(removed, 3);

    // The in-memory state is reloaded before either view is inspected.
    index.reload().unwrap();

    let on_disk = read_num_documents_from_file(index_path);
    assert_eq!(
        on_disk.expect("metadata.json should exist"),
        7,
        "metadata.json should reflect 7 documents after deletion"
    );
    assert_eq!(index.metadata.num_documents, 7);
}