use std::fs;
use std::path::Path;
use std::sync::Arc;
use assertables::*;
use bearing::document::DocumentBuilder;
use bearing::index::config::IndexWriterConfig;
use bearing::index::directory_reader::DirectoryReader;
use bearing::index::field::text;
use bearing::index::writer::IndexWriter;
use bearing::search::*;
use bearing::store::{MemoryDirectory, SharedDirectory};
/// Builds an in-memory index from every `.txt` file in `testdata/golden-docs`.
///
/// Files are added in sorted path order, one document per file, with the file
/// text stored in the `contents` field. The writer is configured with a single
/// thread (presumably so that document ids follow insertion order — confirm
/// against the writer implementation).
///
/// Returns the backing directory together with a reader opened on it, so the
/// caller can keep the directory alive for as long as the reader is in use.
///
/// # Panics
///
/// Panics if the corpus directory or any of its entries cannot be read, or if
/// indexing, committing, or opening the reader fails.
fn build_golden_docs_index() -> (SharedDirectory, DirectoryReader) {
    let config = IndexWriterConfig::default().num_threads(1);
    let directory: SharedDirectory = MemoryDirectory::create();
    let writer = IndexWriter::new(config, Arc::clone(&directory));

    let docs_dir = Path::new("testdata/golden-docs");
    let mut paths: Vec<_> = fs::read_dir(docs_dir)
        .unwrap_or_else(|err| panic!("failed to read {}: {err}", docs_dir.display()))
        // Fail loudly on an unreadable entry: silently skipping one would index
        // a partial corpus and surface later as confusing doc-id mismatches.
        .map(|entry| {
            entry
                .unwrap_or_else(|err| {
                    panic!("failed to list an entry of {}: {err}", docs_dir.display())
                })
                .path()
        })
        .filter(|path| path.extension().is_some_and(|ext| ext == "txt"))
        .collect();
    // `read_dir` order is platform-dependent; sort for a stable insertion order.
    paths.sort();

    for path in &paths {
        let contents = fs::read_to_string(path)
            .unwrap_or_else(|err| panic!("failed to read {}: {err}", path.display()));
        let doc = DocumentBuilder::new()
            .add_field(text("contents").value(contents.as_str()))
            .build();
        writer.add_document(doc).unwrap();
    }
    writer.commit().unwrap();

    let reader = DirectoryReader::open(&*directory).unwrap();
    (directory, reader)
}
/// Builds a boolean query in which both `term1` and `term2` are added as
/// `Occur::Must` term clauses on `field`.
fn must_must_query(field: &str, term1: &[u8], term2: &[u8]) -> BooleanQuery {
    let mut clauses = BooleanQuery::builder();
    // Clauses are added in argument order: `term1` first, then `term2`.
    for term in [term1, term2] {
        clauses.add_query(Box::new(TermQuery::new(field, term)), Occur::Must);
    }
    clauses.build()
}
/// MUST `algorithms` + MUST `data` on the golden corpus: expects four hits in
/// the listed rank order, each score within `1e-5` of the recorded value.
#[test]
fn test_boolean_must_algorithms_data() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_must_query("contents", b"algorithms", b"data");
    let top_docs = searcher.search(&query, 10).unwrap();

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (11, 0.9744643_f32),
        (0, 0.802067),
        (14, 0.8000477),
        (3, 0.6710463),
    ];

    assert_eq!(top_docs.total_hits.value, 4);
    assert_eq!(top_docs.score_docs.len(), 4);
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// MUST `distributed` + MUST `systems` on the golden corpus: expects six hits
/// in the listed rank order, each score within `1e-5` of the recorded value.
#[test]
fn test_boolean_must_distributed_systems() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_must_query("contents", b"distributed", b"systems");
    let top_docs = searcher.search(&query, 10).unwrap();

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (13, 0.6466637_f32),
        (2, 0.5788049),
        (7, 0.5369343),
        (8, 0.5369343),
        (6, 0.5074845),
        (0, 0.4957356),
    ];

    assert_eq!(top_docs.total_hits.value, 6);
    assert_eq!(top_docs.score_docs.len(), 6);
    for (rank, hit) in top_docs.score_docs.iter().enumerate() {
        let (doc, score) = expected[rank];
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// MUST `memory` + MUST `performance` on the golden corpus: expects exactly
/// one hit, document 0, with a score within `1e-5` of the recorded value.
#[test]
fn test_boolean_must_memory_performance() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_must_query("contents", b"memory", b"performance");
    let top_docs = searcher.search(&query, 10).unwrap();

    assert_eq!(top_docs.total_hits.value, 1);
    assert_eq!(top_docs.score_docs.len(), 1);

    // The length check above guarantees index 0 exists.
    let only_hit = &top_docs.score_docs[0];
    assert_eq!(only_hit.doc, 0);
    assert_in_delta!(only_hit.score, 1.1177642_f32, 1e-5);
}
/// MUST `quantum` + MUST `computing` on the golden corpus: expects exactly
/// one hit, document 9, with a score within `1e-5` of the recorded value.
#[test]
fn test_boolean_must_quantum_computing() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_must_query("contents", b"quantum", b"computing");
    let top_docs = searcher.search(&query, 10).unwrap();

    assert_eq!(top_docs.total_hits.value, 1);
    assert_eq!(top_docs.score_docs.len(), 1);

    // The length check above guarantees index 0 exists.
    let only_hit = &top_docs.score_docs[0];
    assert_eq!(only_hit.doc, 9);
    assert_in_delta!(only_hit.score, 1.7729266_f32, 1e-5);
}
/// A conjunction of two terms that the test expects to be absent from the
/// golden corpus must report zero total hits and return no scored documents.
#[test]
fn test_boolean_must_nonexistent_terms() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);

    let query = must_must_query("contents", b"nonexistent1", b"nonexistent2");
    let top_docs = searcher.search(&query, 10).unwrap();

    // Both the reported count and the returned hit list must be empty.
    assert_eq!(top_docs.total_hits.value, 0);
    assert_is_empty!(top_docs.score_docs);
}
/// Builds a boolean query in which both `term1` and `term2` are added as
/// `Occur::Should` term clauses on `field`.
fn should_should_query(field: &str, term1: &[u8], term2: &[u8]) -> BooleanQuery {
    let first = Box::new(TermQuery::new(field, term1));
    let second = Box::new(TermQuery::new(field, term2));

    let mut clauses = BooleanQuery::builder();
    clauses.add_query(first, Occur::Should);
    clauses.add_query(second, Occur::Should);
    clauses.build()
}
/// SHOULD `algorithms` + SHOULD `data` on the golden corpus: with a limit of
/// 15, expects ten hits in the listed rank order, each score within `1e-5` of
/// the recorded value.
#[test]
fn test_boolean_should_algorithms_data() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = should_should_query("contents", b"algorithms", b"data");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 10);
    assert_eq!(top_docs.score_docs.len(), 10);

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (11, 0.9744643_f32),
        (0, 0.802067),
        (14, 0.8000477),
        (3, 0.6710463),
        (5, 0.2621497),
        (1, 0.250627),
        (10, 0.2121821),
        (12, 0.1924969),
        (6, 0.1707188),
        (9, 0.1577401),
    ];
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// SHOULD `distributed` + SHOULD `systems` on the golden corpus: with a limit
/// of 15, expects twelve hits in the listed rank order, each score within
/// `1e-5` of the recorded value.
#[test]
fn test_boolean_should_distributed_systems() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = should_should_query("contents", b"distributed", b"systems");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 12);
    assert_eq!(top_docs.score_docs.len(), 12);

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (13, 0.6466637_f32),
        (2, 0.5788049),
        (7, 0.5369343),
        (8, 0.5369343),
        (6, 0.5074845),
        (0, 0.4957356),
        (12, 0.1548606),
        (3, 0.1403393),
        (4, 0.1385187),
        (9, 0.1345176),
        (10, 0.1243533),
        (11, 0.0942376),
    ];
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// SHOULD `memory` + SHOULD `performance` on the golden corpus: with a limit
/// of 15, expects seven hits in the listed rank order, each score within
/// `1e-5` of the recorded value.
#[test]
fn test_boolean_should_memory_performance() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = should_should_query("contents", b"memory", b"performance");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 7);
    assert_eq!(top_docs.score_docs.len(), 7);

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (0, 1.1177642_f32),
        (8, 0.7110608),
        (1, 0.6353776),
        (9, 0.569159),
        (14, 0.5055991),
        (2, 0.4995965),
        (12, 0.4880089),
    ];
    for (rank, hit) in top_docs.score_docs.iter().enumerate() {
        let (doc, score) = expected[rank];
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// Builds a boolean query with `must_term` as an `Occur::Must` clause and
/// `not_term` as an `Occur::MustNot` clause, both on `field`.
fn must_must_not_query(field: &str, must_term: &[u8], not_term: &[u8]) -> BooleanQuery {
    let mut clauses = BooleanQuery::builder();
    // The required clause is added first, then the excluded one.
    for (term, occur) in [(must_term, Occur::Must), (not_term, Occur::MustNot)] {
        clauses.add_query(Box::new(TermQuery::new(field, term)), occur);
    }
    clauses.build()
}
/// MUST `distributed` + MUST_NOT `security` on the golden corpus: expects six
/// hits in the listed rank order, each score within `1e-5` of the recorded
/// value.
#[test]
fn test_boolean_must_not_distributed_security() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_must_not_query("contents", b"distributed", b"security");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 6);
    assert_eq!(top_docs.score_docs.len(), 6);

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (13, 0.4596379_f32),
        (2, 0.4214391),
        (7, 0.4214391),
        (8, 0.4214391),
        (0, 0.3891023),
        (6, 0.365091),
    ];
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// MUST `memory` + MUST_NOT `quantum` on the golden corpus: expects two hits
/// (documents 8 then 0), each score within `1e-5` of the recorded value.
#[test]
fn test_boolean_must_not_memory_quantum() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_must_not_query("contents", b"memory", b"quantum");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 2);
    assert_eq!(top_docs.score_docs.len(), 2);

    // Expected hits in rank order: (doc id, score).
    let expected = [(8, 0.7110608_f32), (0, 0.6565015)];
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// SHOULD `distributed` + MUST_NOT `security` on the golden corpus: expects
/// six hits in the listed rank order, each score within `1e-5` of the recorded
/// value.
#[test]
fn test_boolean_should_must_not_distributed_security() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);

    // One optional clause plus one exclusion clause, added in that order.
    let wanted = Box::new(TermQuery::new("contents", b"distributed"));
    let excluded = Box::new(TermQuery::new("contents", b"security"));
    let mut clauses = BooleanQuery::builder();
    clauses.add_query(wanted, Occur::Should);
    clauses.add_query(excluded, Occur::MustNot);
    let query = clauses.build();

    let top_docs = searcher.search(&query, 15).unwrap();
    assert_eq!(top_docs.total_hits.value, 6);
    assert_eq!(top_docs.score_docs.len(), 6);

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (13, 0.4596379_f32),
        (2, 0.4214391),
        (7, 0.4214391),
        (8, 0.4214391),
        (0, 0.3891023),
        (6, 0.365091),
    ];
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// SHOULD `memory` + MUST_NOT `quantum` on the golden corpus: expects two hits
/// (documents 8 then 0), each score within `1e-5` of the recorded value.
#[test]
fn test_boolean_should_must_not_memory_quantum() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);

    // One optional clause plus one exclusion clause, added in that order.
    let wanted = Box::new(TermQuery::new("contents", b"memory"));
    let excluded = Box::new(TermQuery::new("contents", b"quantum"));
    let mut clauses = BooleanQuery::builder();
    clauses.add_query(wanted, Occur::Should);
    clauses.add_query(excluded, Occur::MustNot);
    let query = clauses.build();

    let top_docs = searcher.search(&query, 15).unwrap();
    assert_eq!(top_docs.total_hits.value, 2);
    assert_eq!(top_docs.score_docs.len(), 2);

    // Expected hits in rank order: (doc id, score).
    let expected = [(8, 0.7110608_f32), (0, 0.6565015)];
    for (rank, hit) in top_docs.score_docs.iter().enumerate() {
        let (doc, score) = expected[rank];
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// Builds a boolean query with `must_term` as an `Occur::Must` clause and
/// `should_term` as an `Occur::Should` clause, both on `field`.
fn must_should_query(field: &str, must_term: &[u8], should_term: &[u8]) -> BooleanQuery {
    let required = Box::new(TermQuery::new(field, must_term));
    let optional = Box::new(TermQuery::new(field, should_term));

    let mut clauses = BooleanQuery::builder();
    clauses.add_query(required, Occur::Must);
    clauses.add_query(optional, Occur::Should);
    clauses.build()
}
/// MUST `algorithms` + SHOULD `data` on the golden corpus: expects four hits
/// in the listed rank order, each score within `1e-5` of the recorded value.
#[test]
fn test_boolean_mixed_algorithms_data() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_should_query("contents", b"algorithms", b"data");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 4);
    assert_eq!(top_docs.score_docs.len(), 4);

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (11, 0.9744643_f32),
        (0, 0.802067),
        (14, 0.8000477),
        (3, 0.6710463),
    ];
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// MUST `distributed` + SHOULD `systems` on the golden corpus: expects six
/// hits in the listed rank order, each score within `1e-5` of the recorded
/// value.
#[test]
fn test_boolean_mixed_distributed_systems() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_should_query("contents", b"distributed", b"systems");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 6);
    assert_eq!(top_docs.score_docs.len(), 6);

    // Expected hits in rank order: (doc id, score).
    let expected = [
        (13, 0.6466637_f32),
        (2, 0.5788049),
        (7, 0.5369343),
        (8, 0.5369343),
        (6, 0.5074845),
        (0, 0.4957356),
    ];
    for (hit, &(doc, score)) in top_docs.score_docs.iter().zip(expected.iter()) {
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// MUST `memory` + SHOULD `performance` on the golden corpus: expects three
/// hits (documents 0, 8, 9 in that order), each score within `1e-5` of the
/// recorded value.
#[test]
fn test_boolean_mixed_memory_performance() {
    let (_directory, reader) = build_golden_docs_index();
    let searcher = IndexSearcher::new(&reader);
    let query = must_should_query("contents", b"memory", b"performance");
    let top_docs = searcher.search(&query, 15).unwrap();

    assert_eq!(top_docs.total_hits.value, 3);
    assert_eq!(top_docs.score_docs.len(), 3);

    // Expected hits in rank order: (doc id, score).
    let expected = [(0, 1.1177642_f32), (8, 0.7110608), (9, 0.569159)];
    for (rank, hit) in top_docs.score_docs.iter().enumerate() {
        let (doc, score) = expected[rank];
        assert_eq!(hit.doc, doc);
        assert_in_delta!(hit.score, score, 1e-5);
    }
}
/// Builds an in-memory index of `doc_count` synthetic documents.
///
/// Every document has a single `content` text field made of all `terms`
/// joined by spaces, followed by between 1 and 20 filler words (`word0`,
/// `word1`, ...). The filler count cycles with the document's position
/// (`i % 20 + 1`), so documents vary in length while all containing every
/// term.
///
/// Returns the backing directory together with a reader opened on it.
///
/// # Panics
///
/// Panics if adding a document, committing, or opening the reader fails.
fn build_large_index(doc_count: usize, terms: &[&str]) -> (SharedDirectory, DirectoryReader) {
    let config = IndexWriterConfig::default().num_threads(1);
    let directory: SharedDirectory = MemoryDirectory::create();
    let writer = IndexWriter::new(config, Arc::clone(&directory));

    // The term prefix is identical for every document; join it once rather
    // than once per iteration.
    let term_prefix = terms.join(" ");

    for i in 0..doc_count {
        let filler_count = (i % 20) + 1;
        let filler = (0..filler_count)
            .map(|j| format!("word{j}"))
            .collect::<Vec<_>>()
            .join(" ");
        let content = format!("{term_prefix} {filler}");
        let doc = DocumentBuilder::new()
            .add_field(text("content").value(content.as_str()))
            .build();
        writer.add_document(doc).unwrap();
    }
    writer.commit().unwrap();

    let reader = DirectoryReader::open(&*directory).unwrap();
    (directory, reader)
}
/// Indexes 1500 synthetic documents that all contain `alpha` and `beta`, then
/// checks that both a single-term query and a MUST/MUST conjunction report a
/// total hit count strictly below the number of indexed documents when asked
/// for the top 10. Per the assertion messages, this is the expected effect of
/// pruning non-competitive documents.
#[test]
fn test_boolean_conjunction_pruning_matches_term_query() {
    let doc_count: usize = 1500;
    let (_directory, reader) = build_large_index(doc_count, &["alpha", "beta"]);
    let searcher = IndexSearcher::new(&reader);

    // Baseline: a plain term query over a term present in every document.
    let single_term = TermQuery::new("content", b"alpha");
    let single_term_top = searcher.search(&single_term, 10).unwrap();
    assert_lt!(
        single_term_top.total_hits.value,
        doc_count as i64,
        "TermQuery should prune non-competitive docs when totalHits > threshold"
    );

    // The conjunction must show the same strict upper bound on counted hits.
    let conjunction = must_must_query("content", b"alpha", b"beta");
    let conjunction_top = searcher.search(&conjunction, 10).unwrap();
    assert_lt!(
        conjunction_top.total_hits.value,
        doc_count as i64,
        "BooleanQuery conjunction should prune non-competitive docs like TermQuery does"
    );
}