use luci::analysis::AnalyzerRegistry;
use luci::core::{DocId, FieldId, SegmentId};
use luci::deletion::DeletionMap;
use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use luci::merge::merge_segments;
use luci::search::bm25::bm25_idf;
use luci::search::expression::parse_search;
use luci::search::reader::Reader;
use luci::search::segment_store::SegmentStore;
use luci::segment::reader::SegmentReader;
use luci::storage::SingleFileDirectory;
use luci::writer::IndexWriter;
use serde_json::json;
/// Returns a scratch path for the integration test identified by `name`.
///
/// The path is placed under the system temp directory and embeds the current
/// process id together with `name`. Anything already present at that path is
/// removed on a best-effort basis; the directory itself is NOT created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let pid = std::process::id();
    let mut scratch = std::env::temp_dir();
    scratch.push(format!("luci_integration_{}_{name}", pid));
    // Clear leftovers from an earlier run. Failure (typically "path does not
    // exist") is intentionally ignored.
    std::fs::remove_dir_all(&scratch).ok();
    scratch
}
/// Recursively deletes `path`, ignoring any error.
///
/// This is best-effort teardown for test scratch directories: a missing path
/// or any other removal failure is silently discarded.
fn cleanup(path: &std::path::Path) {
    std::fs::remove_dir_all(path).ok();
}
/// Exit criterion 1: bulk-index a synthetic corpus and run term searches.
///
/// Indexes 100,000 documents by default, or 1,000,000 when the
/// `LUCI_LARGE_TEST` environment variable can be read. Each document has a
/// `title` (text) and a `category` (keyword) that cycles through five values.
/// The test then checks that:
/// * a term search for "tech" on `title` (limit 10) has hits, returns at most
///   10 results, and every hit carries a source with both fields as strings
///   and a title containing "tech";
/// * a term search for "science" on `category` (limit 5) has hits, and every
///   hit's source has `category == "science"`.
#[test]
fn exit_criterion_1_indexing_and_term_search() {
    // Presence of the variable is the switch; its value is not inspected.
    let doc_count: usize = if std::env::var("LUCI_LARGE_TEST").is_ok() {
        1_000_000
    } else {
        100_000
    };

    let path = test_dir("ec1");
    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("category", FieldType::Keyword)
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    // Build the corpus: document `i` is assigned category `i mod 5`.
    let categories = ["tech", "science", "sports", "politics", "entertainment"];
    let mut corpus: Vec<serde_json::Value> = Vec::with_capacity(doc_count);
    for i in 0..doc_count {
        let cat = categories[i % categories.len()];
        corpus.push(json!({
            "title": format!("document number {i} about {cat} topics"),
            "category": cat,
        }));
    }
    index.bulk(corpus).unwrap();

    // Text-field search.
    let title_query = parse_search(json!({"term": {"title": "tech"}}), 10).unwrap();
    let title_results = index.search(&title_query).unwrap();
    assert!(title_results.total_hits().value > 0);
    assert!(title_results.len() <= 10);
    for hit in title_results.iter() {
        let source = hit.source().expect("_source should be present");
        assert!(source["title"].is_string());
        assert!(source["category"].is_string());
        let title = source["title"].as_str().unwrap();
        assert!(
            title.contains("tech"),
            "title should contain search term: {title}"
        );
    }

    // Keyword-field search.
    let category_query = parse_search(json!({"term": {"category": "science"}}), 5).unwrap();
    let category_results = index.search(&category_query).unwrap();
    assert!(category_results.total_hits().value > 0);
    for hit in category_results.iter() {
        let source = hit.source().unwrap();
        assert_eq!(source["category"], "science");
    }

    cleanup(&path);
}
/// Exit criterion 2: sanity-check BM25 scoring on a small, controlled corpus.
///
/// Indexes 100 single-field documents. Documents 0..10 contain the word
/// "search" `i + 1` times followed by `10 - i` occurrences of "filler" (so
/// each of them has 11 words); the remaining 90 documents do not contain
/// "search". A term search for "search" (limit 10) must then:
/// * report exactly 10 total hits;
/// * rank a document with id below 10 first;
/// * return scores in non-increasing order;
/// * return only positive, finite scores.
///
/// `bm25_idf(100, 10)` is additionally required to be positive.
#[test]
fn exit_criterion_2_bm25_correctness() {
    let path = test_dir("ec2");
    let schema = Mapping::builder().field("body", FieldType::Text).build();
    let index = Index::create_with_mapping(&path, schema).unwrap();

    let docs: Vec<serde_json::Value> = (0..100_usize)
        .map(|i| {
            let body = if i < 10 {
                // `i + 1` copies of the query term, padded with filler words.
                let mut words = vec!["search"; i + 1];
                words.extend(std::iter::repeat("filler").take(10 - i));
                words.join(" ")
            } else {
                format!("document {i} with various other words and content")
            };
            json!({"body": body})
        })
        .collect();
    index.bulk(docs).unwrap();

    let expr = parse_search(json!({"term": {"body": "search"}}), 10).unwrap();
    let results = index.search(&expr).unwrap();
    assert_eq!(
        results.total_hits().value,
        10,
        "exactly 10 docs contain 'search'"
    );

    let top_hit = results.hit(0).unwrap();
    let top_doc = top_hit.doc_id().as_u32();
    assert!(
        top_doc < 10,
        "top result should be from the 'search' docs, got doc {top_doc}"
    );

    // Compare adjacent scores pairwise. `windows(2)` yields nothing for fewer
    // than two scores, so there is no `len() - 1` underflow on an empty list.
    let scores: Vec<f32> = results.iter().map(|h| h.score()).collect();
    for (position, pair) in scores.windows(2).enumerate() {
        assert!(
            pair[0] >= pair[1],
            "results not in descending score order at position {position}"
        );
    }

    let idf = bm25_idf(100, 10);
    assert!(idf > 0.0, "IDF should be positive");
    for hit in results.iter() {
        assert!(hit.score() > 0.0, "score should be positive");
        assert!(hit.score().is_finite(), "score should be finite");
    }

    // Remove the on-disk index, as the other tests in this file do.
    cleanup(&path);
}
/// Exit criterion 3: merge two committed segments while honouring deletions.
///
/// Writes two commits of five documents each, reopens the storage and expects
/// two segments of five documents. Three documents are then marked deleted
/// (ids 1 and 3 in the first segment, id 2 in the second) and the segments
/// are merged into segment id 100. The merged segment must:
/// * hold 7 documents;
/// * report doc frequencies on field 0 of 7 for "segment", 3 for "one" and
///   4 for "two";
/// * yield a source for every doc id in `0..doc_count` when read through a
///   `SegmentStore`-backed `Reader`.
#[test]
fn exit_criterion_3_segment_merge() {
    let path = test_dir("ec3");
    let storage = SingleFileDirectory::create(&path).unwrap();
    let schema = Mapping::builder()
        .field("body", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .build();
    let mut writer = IndexWriter::new(storage, schema.clone(), AnalyzerRegistry::new());

    // First commit -> first segment.
    (0..5).for_each(|i| {
        let doc = json!({
            "body": format!("segment one document {i}"),
            "tag": "batch1",
        });
        writer.add(doc).unwrap();
    });
    writer.commit().unwrap();

    // Second commit -> second segment.
    (0..5).for_each(|i| {
        let doc = json!({
            "body": format!("segment two document {i}"),
            "tag": "batch2",
        });
        writer.add(doc).unwrap();
    });
    writer.commit().unwrap();

    // Reopen from disk and load both segments.
    let reopened = SingleFileDirectory::open(&path).unwrap();
    assert_eq!(reopened.segments().len(), 2);
    let first_id = reopened.segments()[0].segment_id;
    let second_id = reopened.segments()[1].segment_id;
    let first_reader = SegmentReader::open(reopened.read_segment(first_id).unwrap()).unwrap();
    let second_reader = SegmentReader::open(reopened.read_segment(second_id).unwrap()).unwrap();
    assert_eq!(first_reader.doc_count(), 5);
    assert_eq!(second_reader.doc_count(), 5);

    // Two deletions in the first segment, one in the second.
    let mut deletions = DeletionMap::new();
    for (segment, doc) in [(first_id, 1), (first_id, 3), (second_id, 2)] {
        deletions.mark_deleted(segment, DocId::new(doc));
    }

    let merge_output = merge_segments(
        SegmentId::new(100),
        &[&first_reader, &second_reader],
        &deletions,
        &schema,
        &AnalyzerRegistry::new(),
    )
    .expect("merge failed");

    let merged = SegmentReader::open(merge_output.bytes).unwrap();
    assert_eq!(merged.doc_count(), 7);

    // Doc frequencies on field 0 after dropping the deleted documents.
    let body_doc_freq = |term: &str| merged.doc_freq(FieldId::new(0), term);
    assert_eq!(body_doc_freq("segment"), 7);
    assert_eq!(body_doc_freq("one"), 3);
    assert_eq!(body_doc_freq("two"), 4);

    // Capture what is needed from `merged` before it is moved into the store.
    let seg_id = merged.segment_id();
    let merged_doc_count = merged.doc_count();
    let store = SegmentStore::new(vec![merged], AnalyzerRegistry::new(), None, None);
    let reader = Reader::new(&store);
    assert!((0..merged_doc_count).all(|doc| reader.get_source(seg_id, DocId::new(doc)).is_some()));

    cleanup(&path);
}
/// End-to-end smoke test: add documents one at a time, then search them.
///
/// Five book records (`title` as text, `year` as keyword) are added
/// individually. The test expects:
/// * a term search for "rust" on `title` to return exactly one hit whose
///   source title is "The Rust Programming Language";
/// * a term search for "the" on `title` to return at least two hits;
/// * a term search for "2015" on `year` to return exactly two hits.
#[test]
fn end_to_end_write_search() {
    let path = test_dir("e2e");
    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("year", FieldType::Keyword)
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    let books = vec![
        json!({"title": "The Rust Programming Language", "year": "2015"}),
        json!({"title": "Elasticsearch: The Definitive Guide", "year": "2015"}),
        json!({"title": "Database Internals", "year": "2019"}),
        json!({"title": "Designing Data-Intensive Applications", "year": "2017"}),
        json!({"title": "Information Retrieval: Implementing and Evaluating Search Engines", "year": "2010"}),
    ];
    for book in books.iter().cloned() {
        index.add(book).unwrap();
    }

    // Single match on an analyzed text field.
    let rust_query = parse_search(json!({"term": {"title": "rust"}}), 10).unwrap();
    let rust_results = index.search(&rust_query).unwrap();
    assert_eq!(rust_results.total_hits().value, 1);
    let only_hit = rust_results.hit(0).unwrap();
    let only_source = only_hit.source().unwrap();
    assert_eq!(only_source["title"], "The Rust Programming Language");

    // A common word that appears in several titles.
    let the_query = parse_search(json!({"term": {"title": "the"}}), 10).unwrap();
    let the_results = index.search(&the_query).unwrap();
    assert!(the_results.total_hits().value >= 2);

    // Exact match on the keyword field.
    let year_query = parse_search(json!({"term": {"year": "2015"}}), 10).unwrap();
    let year_results = index.search(&year_query).unwrap();
    assert_eq!(year_results.total_hits().value, 2);

    cleanup(&path);
}