use crate::directories::RamDirectory;
use crate::dsl::{Document, SchemaBuilder};
use crate::index::{Index, IndexConfig, IndexWriter};
/// Checks `BooleanQuery` result sets on a small five-document index.
///
/// Covers, in order: a two-term SHOULD (OR) query, a single-term SHOULD
/// query, a MUST + SHOULD combination, a SHOULD query filtered by MUST_NOT,
/// and a top-k limit smaller than the number of matching documents.
///
/// The doc-id assertions rely on ids being handed out in insertion order
/// within the single committed batch.
#[tokio::test]
async fn test_maxscore_optimization_for_or_queries() {
    use crate::query::{BooleanQuery, TermQuery};

    let mut schema_builder = SchemaBuilder::default();
    let content = schema_builder.add_text_field("content", true, true);
    let schema = schema_builder.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Position in this list is the doc id the assertions below expect.
    let corpus = [
        "rust programming language is fast",         // doc 0
        "rust is a systems language",                // doc 1
        "programming is fun",                        // doc 2
        "python is easy to learn",                   // doc 3
        "rust rust programming programming systems", // doc 4
    ];
    for text in corpus {
        let mut doc = Document::new();
        doc.add_text(content, text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    // Last use of `dir` and `config`, so they are moved rather than cloned.
    let index = Index::open(dir, config).await.unwrap();

    // rust OR programming: every document except the python-only one.
    let or_query = BooleanQuery::new()
        .should(TermQuery::text(content, "rust"))
        .should(TermQuery::text(content, "programming"));
    let results = index.search(&or_query, 10).await.unwrap();
    assert_eq!(results.hits.len(), 4, "Should find exactly 4 documents");
    let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    assert!(doc_ids.contains(&0), "Should find doc 0");
    assert!(doc_ids.contains(&1), "Should find doc 1");
    assert!(doc_ids.contains(&2), "Should find doc 2");
    assert!(doc_ids.contains(&4), "Should find doc 4");
    assert!(
        !doc_ids.contains(&3),
        "Should NOT find doc 3 (only has 'python')"
    );

    // A boolean query with a single SHOULD clause.
    let single_query = BooleanQuery::new().should(TermQuery::text(content, "rust"));
    let results = index.search(&single_query, 10).await.unwrap();
    assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

    // MUST rust + SHOULD programming: the SHOULD clause must not widen the
    // match set beyond the documents containing 'rust'.
    let must_query = BooleanQuery::new()
        .must(TermQuery::text(content, "rust"))
        .should(TermQuery::text(content, "programming"));
    let results = index.search(&must_query, 10).await.unwrap();
    assert_eq!(results.hits.len(), 3, "Should find 3 documents with 'rust'");

    // (rust OR programming) AND NOT systems: only docs 0 and 2 remain.
    let must_not_query = BooleanQuery::new()
        .should(TermQuery::text(content, "rust"))
        .should(TermQuery::text(content, "programming"))
        .must_not(TermQuery::text(content, "systems"));
    let results = index.search(&must_not_query, 10).await.unwrap();
    let doc_ids: Vec<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    // Positive checks: without them an empty result set would satisfy the
    // exclusion assertions vacuously.
    assert_eq!(
        doc_ids.len(),
        2,
        "Should find exactly 2 documents without 'systems'"
    );
    assert!(doc_ids.contains(&0), "Should find doc 0 (no 'systems')");
    assert!(doc_ids.contains(&2), "Should find doc 2 (no 'systems')");
    assert!(
        !doc_ids.contains(&1),
        "Should NOT find doc 1 (has 'systems')"
    );
    assert!(
        !doc_ids.contains(&4),
        "Should NOT find doc 4 (has 'systems')"
    );

    // Four documents match, but the limit caps the returned hits at two.
    let or_query = BooleanQuery::new()
        .should(TermQuery::text(content, "rust"))
        .should(TermQuery::text(content, "programming"));
    let results = index.search(&or_query, 2).await.unwrap();
    assert_eq!(results.hits.len(), 2, "Should return only top 2 results");
}
/// Indexes ten documents drawn from a repeating four-text pattern and checks
/// that an `apple` OR `banana` query returns every document containing
/// either term.
#[tokio::test]
async fn test_boolean_or_maxscore_optimization() {
    use crate::query::{BooleanQuery, TermQuery};

    let mut schema_builder = SchemaBuilder::default();
    let content = schema_builder.add_text_field("content", true, true);
    let schema = schema_builder.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Cycling the pattern over ten documents gives 3 + 3 + 2 documents that
    // mention "apple" or "banana", plus 2 "cherry date" documents that don't.
    let pattern = [
        "apple banana cherry",
        "apple orange",
        "banana grape",
        "cherry date",
    ];
    for text in pattern.iter().cycle().take(10) {
        let mut doc = Document::new();
        doc.add_text(content, *text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    let index = Index::open(dir.clone(), config.clone()).await.unwrap();

    let apple = TermQuery::text(content, "apple");
    let banana = TermQuery::text(content, "banana");
    let either = BooleanQuery::new().should(apple).should(banana);

    let hit_count = index.search(&either, 10).await.unwrap().hits.len();
    assert_eq!(hit_count, 8, "Should find all matching docs");
}
/// Hides one "xylophone" document among 150 filler documents written in a
/// single commit, then checks that `Index::query` finds exactly that
/// document, that a shared filler term matches broadly, and that an unknown
/// term matches nothing.
#[tokio::test]
async fn test_needle_fulltext_single_segment() {
    let mut sb = SchemaBuilder::default();
    let title = sb.add_text_field("title", true, true);
    let body = sb.add_text_field("body", true, true);
    let schema = sb.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // 100 filler documents ahead of the needle.
    for n in 0..100 {
        let mut hay = Document::new();
        hay.add_text(title, format!("Hay document number {}", n));
        hay.add_text(
            body,
            "common words repeated across all hay documents filler text",
        );
        writer.add_document(hay).unwrap();
    }

    // The only document that mentions "xylophone" (in both fields).
    let mut needle = Document::new();
    needle.add_text(title, "The unique needle xylophone");
    needle.add_text(
        body,
        "This document contains the extraordinary term xylophone",
    );
    writer.add_document(needle).unwrap();

    // 50 more filler documents after the needle.
    for n in 100..150 {
        let mut hay = Document::new();
        hay.add_text(title, format!("More hay document {}", n));
        hay.add_text(body, "common words filler text again and again");
        writer.add_document(hay).unwrap();
    }
    writer.commit().await.unwrap();

    let index = Index::open(dir, config).await.unwrap();
    let doc_count = index.num_docs().await.unwrap();
    assert_eq!(doc_count, 151);

    // The rare term must hit the needle and nothing else.
    let needle_hits = index.query("xylophone", 10).await.unwrap();
    assert_eq!(needle_hits.hits.len(), 1, "Should find exactly the needle");
    let top = &needle_hits.hits[0];
    assert!(top.score > 0.0, "Score should be positive");

    // Load the matched document back and confirm it really is the needle.
    let fetched = index.get_document(&top.address).await.unwrap().unwrap();
    let title_val = fetched.get_first(title).unwrap().as_text().unwrap();
    assert!(
        title_val.contains("xylophone"),
        "Retrieved doc should be the needle"
    );

    // Every filler body contains "common".
    let common_hits = index.query("common", 200).await.unwrap();
    assert!(
        common_hits.hits.len() >= 100,
        "Common term should match many docs"
    );

    let missing_hits = index.query("nonexistentterm99999", 10).await.unwrap();
    assert_eq!(
        missing_hits.hits.len(),
        0,
        "Non-existent term should match nothing"
    );
}
/// Writes 150 documents across three separate commits, with a single
/// "quetzalcoatl" document in the second one, and checks that both
/// `Index::query` and a searcher-level `TermQuery` locate it.
#[tokio::test]
async fn test_needle_fulltext_multi_segment() {
    use crate::query::TermQuery;

    let mut sb = SchemaBuilder::default();
    let content = sb.add_text_field("content", true, true);
    let schema = sb.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Commit 1: 50 hay documents.
    for text in (0..50).map(|i| format!("segment one hay document {}", i)) {
        let mut hay = Document::new();
        hay.add_text(content, text);
        writer.add_document(hay).unwrap();
    }
    writer.commit().await.unwrap();

    // Commit 2: the needle followed by 49 hay documents.
    let mut needle = Document::new();
    needle.add_text(content, "the magnificent quetzalcoatl serpent deity");
    writer.add_document(needle).unwrap();
    for text in (0..49).map(|i| format!("segment two hay document {}", i)) {
        let mut hay = Document::new();
        hay.add_text(content, text);
        writer.add_document(hay).unwrap();
    }
    writer.commit().await.unwrap();

    // Commit 3: 50 hay documents.
    for text in (0..50).map(|i| format!("segment three hay document {}", i)) {
        let mut hay = Document::new();
        hay.add_text(content, text);
        writer.add_document(hay).unwrap();
    }
    writer.commit().await.unwrap();

    let index = Index::open(dir.clone(), config.clone()).await.unwrap();
    let doc_count = index.num_docs().await.unwrap();
    assert_eq!(doc_count, 150);

    // The rest of the test is only meaningful if more than one segment exists.
    let segment_count = index.segment_readers().await.unwrap().len();
    assert!(
        segment_count >= 2,
        "Should have multiple segments, got {}",
        segment_count
    );

    // Path 1: the index-level string query.
    let query_hits = index.query("quetzalcoatl", 10).await.unwrap();
    assert_eq!(
        query_hits.hits.len(),
        1,
        "Should find exactly 1 needle across segments"
    );

    // Path 2: an explicit TermQuery run through a searcher.
    let reader = index.reader().await.unwrap();
    let searcher = reader.searcher().await.unwrap();
    let needle_query = TermQuery::text(content, "quetzalcoatl");
    let term_hits = searcher.search(&needle_query, 10).await.unwrap();
    assert_eq!(term_hits.len(), 1, "TermQuery should also find the needle");

    // Fetch the hit by (segment, doc id) and verify its content.
    let hit = &term_hits[0];
    let fetched = searcher
        .doc(hit.segment_id, hit.doc_id)
        .await
        .unwrap()
        .unwrap();
    let text = fetched.get_first(content).unwrap().as_text().unwrap();
    assert!(
        text.contains("quetzalcoatl"),
        "Should retrieve needle content"
    );

    // "document" appears in all 149 hay texts, spread over the three commits.
    let hay_hits = index.query("document", 200).await.unwrap();
    assert!(
        hay_hits.hits.len() >= 149,
        "Should find hay docs across all segments"
    );
}
/// Spreads twenty uniquely-named needle documents over four commits of hay
/// and checks that every needle term matches exactly one document, while the
/// shared term "common" matches exactly the hay documents.
#[tokio::test]
async fn test_many_needles_all_found() {
    let mut sb = SchemaBuilder::default();
    let content = sb.add_text_field("content", true, true);
    let schema = sb.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    let num_batches = 4usize;
    let needles_per_batch = 5usize;
    let hay_per_batch = 50usize;
    let num_needles = num_batches * needles_per_batch;

    let needle_terms: Vec<String> = (0..num_needles)
        .map(|i| format!("uniqueneedle{:04}", i))
        .collect();

    // Each batch is committed on its own: hay first, then that batch's
    // slice of the needle terms.
    for (batch, batch_needles) in needle_terms.chunks(needles_per_batch).enumerate() {
        for i in 0..hay_per_batch {
            let mut hay = Document::new();
            hay.add_text(
                content,
                format!("hay batch {} item {} common filler", batch, i),
            );
            writer.add_document(hay).unwrap();
        }
        for needle_term in batch_needles {
            let mut needle = Document::new();
            needle.add_text(
                content,
                format!("this is {} among many documents", needle_term),
            );
            writer.add_document(needle).unwrap();
        }
        writer.commit().await.unwrap();
    }

    let index = Index::open(dir, config).await.unwrap();

    let hay_total = hay_per_batch * num_batches;
    let total = index.num_docs().await.unwrap();
    assert_eq!(total, (hay_total + num_needles) as u32);

    // Every needle term is unique, so each must resolve to a single document.
    for term in &needle_terms {
        let needle_hits = index.query(term, 10).await.unwrap();
        assert_eq!(
            needle_hits.hits.len(),
            1,
            "Should find exactly 1 doc for needle '{}'",
            term
        );
    }

    // Needle texts do not contain "common", so only hay documents may match.
    let common_hits = index.query("common", 500).await.unwrap();
    assert_eq!(
        common_hits.hits.len(),
        hay_total,
        "Common term should match all {} hay docs",
        hay_total
    );
}
/// Indexes two Russian titles through the tokenizer registered as `ru_stem`
/// and checks that different inflections of the same word both produce hits,
/// and that a field-qualified query (`title:...`) matches a single document.
#[tokio::test]
async fn test_russian_stemmer_search() {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field_with_tokenizer("title", true, true, "ru_stem");
    let schema = schema_builder.build();

    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let mut writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();

    // Two titles using different inflected forms of the same noun.
    for text in ["бегущие собаки", "маленькая собака"] {
        let mut doc = Document::new();
        doc.add_text(title, text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();

    let index = Index::open(dir, config).await.unwrap();

    // Plural form, exactly as it appears in the first title.
    let plural_hits = index.query("собаки", 10).await.unwrap();
    assert!(
        !plural_hits.hits.is_empty(),
        "Russian stemmer: 'собаки' should match documents"
    );

    // Singular form, exactly as it appears in the second title.
    let singular_hits = index.query("собака", 10).await.unwrap();
    assert!(
        !singular_hits.hits.is_empty(),
        "Russian stemmer: 'собака' should match (same stem as 'собаки')"
    );

    // Field-qualified syntax: only the first title contains this word.
    let field_hits = index.query("title:бегущие", 10).await.unwrap();
    assert_eq!(
        field_hits.hits.len(),
        1,
        "Russian stemmer: field-qualified search should find 1 doc"
    );
}