#[cfg(test)]
mod tests {
use std::collections::HashSet;
use crate::directories::RamDirectory;
use crate::dsl::{Document, Field, SchemaBuilder};
use crate::index::{Index, IndexConfig, IndexWriter};
use crate::query::{bm25_idf, bm25_score};
/// Builds a minimal schema containing a single text field named "content",
/// both indexed and stored, and returns the schema together with the field handle.
fn create_schema() -> (crate::dsl::Schema, Field) {
    let mut builder = SchemaBuilder::default();
    let content_field = builder.add_text_field("content", true, true);
    (builder.build(), content_field)
}
/// Generates `num_docs` synthetic documents over the given terms and returns
/// them together with the ground-truth posting sets.
///
/// Term at index `k` appears in every document whose id is divisible by
/// `k + 2`, repeated `1 + (id % 3)` times; every document additionally ends
/// with "filler content". The returned map records, per term, exactly which
/// document ids contain it.
fn generate_test_documents(
    num_docs: usize,
    terms: &[&str],
) -> (Vec<String>, std::collections::HashMap<String, HashSet<u32>>) {
    // Start every term with an empty posting set so absent terms still map to a set.
    let mut term_to_docs: std::collections::HashMap<String, HashSet<u32>> = terms
        .iter()
        .map(|t| (t.to_string(), HashSet::new()))
        .collect();
    let docs: Vec<String> = (0..num_docs)
        .map(|doc_id| {
            let mut words: Vec<&str> = Vec::new();
            for (idx, term) in terms.iter().enumerate() {
                // Term k lands in docs whose id is divisible by k + 2.
                if doc_id % (idx + 2) == 0 {
                    let occurrences = 1 + (doc_id % 3);
                    words.extend(std::iter::repeat(*term).take(occurrences));
                    term_to_docs.get_mut(*term).unwrap().insert(doc_id as u32);
                }
            }
            words.push("filler");
            words.push("content");
            words.join(" ")
        })
        .collect();
    (docs, term_to_docs)
}
#[allow(dead_code)]
/// Reference BM25 computation for hand-checking scores produced by the index.
/// Combines the IDF for (`doc_freq`, `total_docs`) with the TF component.
/// NOTE(review): `tf` is passed twice to `bm25_score` — once as the term
/// frequency and once in the position that presumably takes the document
/// length; confirm against `bm25_score`'s signature that this is intentional.
fn compute_expected_bm25(tf: f32, doc_freq: f32, total_docs: f32, avg_field_len: f32) -> f32 {
let idf = bm25_idf(doc_freq, total_docs);
bm25_score(tf, idf, tf, avg_field_len)
}
#[tokio::test]
async fn test_single_term_all_docs_retrieved() {
    // Docs 0, 2 and 4 contain "rust"; doc 2 repeats it three times and must
    // therefore rank first.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for text in [
        "rust programming",
        "python programming",
        "rust rust rust",
        "java code",
        "rust systems",
    ] {
        let mut doc = Document::new();
        doc.add_text(content, text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:rust", 10).await.unwrap();
    let retrieved: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    let expected_ids: HashSet<u32> = [0, 2, 4].into_iter().collect();
    assert_eq!(
        retrieved, expected_ids,
        "Expected docs {:?}, found {:?}",
        expected_ids, retrieved
    );
    assert_eq!(
        results.hits[0].address.doc_id, 2,
        "Doc with highest TF should be first"
    );
}
#[tokio::test]
async fn test_or_query_all_matching_docs_retrieved() {
    // An OR query must retrieve every doc containing either term; the doc
    // matching both terms (doc 2) should rank first.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for text in [
        "rust programming",
        "python programming",
        "rust python",
        "java code",
        "python only",
    ] {
        let mut doc = Document::new();
        doc.add_text(content, text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index
        .query("content:rust OR content:python", 10)
        .await
        .unwrap();
    let retrieved: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    let expected_ids: HashSet<u32> = [0, 1, 2, 4].into_iter().collect();
    assert_eq!(
        retrieved, expected_ids,
        "OR query should find all docs with either term. Expected {:?}, found {:?}",
        expected_ids, retrieved
    );
    assert_eq!(
        results.hits[0].address.doc_id, 2,
        "Doc matching both terms should be first"
    );
}
#[tokio::test]
async fn test_synthetic_documents_exhaustive() {
    // Index 50 generated documents and verify, for every term, that the
    // retrieved posting set matches the ground truth exactly (no misses, no extras).
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let terms = ["alpha", "beta", "gamma", "delta"];
    let (docs, expected_term_docs) = generate_test_documents(50, &terms);
    for text in docs {
        let mut doc = Document::new();
        doc.add_text(content, text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    for term in &terms {
        let expected = &expected_term_docs[*term];
        let results = index
            .query(&format!("content:{}", term), 100)
            .await
            .unwrap();
        let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
        let missed: Vec<_> = expected.difference(&found).collect();
        let extra: Vec<_> = found.difference(expected).collect();
        assert!(
            missed.is_empty() && extra.is_empty(),
            "Term '{}': missed {:?}, extra {:?}",
            term,
            missed,
            extra
        );
    }
}
#[tokio::test]
async fn test_no_missed_documents_large_corpus() {
    // 100 docs: every even id contains "alpha", every id divisible by 3
    // contains "beta". Retrieval must match the ground-truth sets.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let mut alpha_docs = HashSet::new();
    let mut beta_docs = HashSet::new();
    for doc_id in 0..100u32 {
        let mut words = vec!["filler"];
        if doc_id % 2 == 0 {
            words.push("alpha");
            alpha_docs.insert(doc_id);
        }
        if doc_id % 3 == 0 {
            words.push("beta");
            beta_docs.insert(doc_id);
        }
        let mut doc = Document::new();
        doc.add_text(content, words.join(" "));
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    // "alpha" must come back with no misses and no extras.
    let results = index.query("content:alpha", 200).await.unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    let missed: Vec<_> = alpha_docs.difference(&found).collect();
    assert!(
        missed.is_empty(),
        "Missed {} docs for 'alpha': {:?}",
        missed.len(),
        missed
    );
    let extra: Vec<_> = found.difference(&alpha_docs).collect();
    assert!(
        extra.is_empty(),
        "Extra {} docs for 'alpha': {:?}",
        extra.len(),
        extra
    );
    // "beta" must come back with no misses.
    let results = index.query("content:beta", 200).await.unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    let missed: Vec<_> = beta_docs.difference(&found).collect();
    assert!(
        missed.is_empty(),
        "Missed {} docs for 'beta': {:?}",
        missed.len(),
        missed
    );
}
#[tokio::test]
async fn test_bm25_score_ordering_by_tf() {
    // One doc per term frequency (1, 2, 3, 5, 10 repetitions of "test");
    // results must come back in non-increasing score order with the highest
    // TF doc (id 4) first.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for &tf in &[1, 2, 3, 5, 10] {
        let mut doc = Document::new();
        doc.add_text(content, vec!["test"; tf].join(" "));
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:test", 10).await.unwrap();
    assert_eq!(results.hits.len(), 5);
    for (idx, pair) in results.hits.windows(2).enumerate() {
        assert!(
            pair[0].score >= pair[1].score,
            "Scores not in descending order at position {}: {} < {}",
            idx + 1,
            pair[0].score,
            pair[1].score
        );
    }
    assert_eq!(
        results.hits[0].address.doc_id, 4,
        "Highest TF doc should be first"
    );
}
#[tokio::test]
async fn test_multi_term_score_accumulation() {
    // With an OR query, a doc matching both terms twice should outrank the
    // doc matching both terms once, which outranks the single-term docs.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for text in ["rust", "systems", "rust systems", "rust rust systems systems"] {
        let mut doc = Document::new();
        doc.add_text(content, text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index
        .query("content:rust OR content:systems", 10)
        .await
        .unwrap();
    assert_eq!(
        results.hits[0].address.doc_id, 3,
        "Doc with multiple term matches (higher TF) should be first"
    );
    assert_eq!(
        results.hits[1].address.doc_id, 2,
        "Doc matching both terms once should be second"
    );
}
#[tokio::test]
async fn test_multi_segment_retrieval() {
    // A tiny memory budget plus a commit per batch forces multiple segments;
    // retrieval must still find every matching doc across all of them.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig {
        max_indexing_memory_bytes: 1024,
        ..Default::default()
    };
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for batch in 0..3 {
        for offset in 0..5 {
            let doc_id = batch * 5 + offset;
            // Even ids get the query term, odd ids get a decoy.
            let text = if doc_id % 2 == 0 {
                format!("searchterm doc{}", doc_id)
            } else {
                format!("otherword doc{}", doc_id)
            };
            let mut doc = Document::new();
            doc.add_text(content, text);
            writer.add_document(doc).unwrap();
        }
        writer.commit().await.unwrap();
    }
    let index = Index::open(dir, config).await.unwrap();
    let seg_count = index.segment_readers().await.unwrap().len();
    assert!(
        seg_count >= 2,
        "Should have multiple segments, got {}",
        seg_count
    );
    // Even ids in 0..15 -> 8 matching documents.
    let results = index.query("content:searchterm", 50).await.unwrap();
    assert_eq!(
        results.hits.len(),
        8,
        "Should find 8 docs with searchterm across segments"
    );
    for hit in &results.hits {
        assert!(hit.score > 0.0, "All hits should have positive scores");
    }
}
#[tokio::test]
async fn test_multi_segment_score_consistency() {
    // The same document committed into two separate segments should receive
    // (nearly) identical scores regardless of which segment served it.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for _ in 0..2 {
        let mut doc = Document::new();
        doc.add_text(content, "identical content here");
        writer.add_document(doc).unwrap();
        // Commit after each doc so the two copies land in different segments.
        writer.commit().await.unwrap();
    }
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:identical", 10).await.unwrap();
    assert_eq!(results.hits.len(), 2, "Should find docs in both segments");
    let score_diff = (results.hits[0].score - results.hits[1].score).abs();
    assert!(
        score_diff < 0.1,
        "Identical docs should have similar scores, got diff={}",
        score_diff
    );
}
#[tokio::test]
async fn test_empty_results() {
    // Querying a term absent from the corpus must yield zero hits, not an error.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let mut only_doc = Document::new();
    only_doc.add_text(content, "hello world");
    writer.add_document(only_doc).unwrap();
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:nonexistent", 10).await.unwrap();
    assert_eq!(
        results.hits.len(),
        0,
        "Should return empty for non-matching term"
    );
}
#[tokio::test]
async fn test_limit_respected() {
    // 50 matching docs but a limit of 5: exactly 5 hits must come back,
    // each with a positive score.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for _ in 0..50 {
        let mut doc = Document::new();
        doc.add_text(content, "common term here");
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:common", 5).await.unwrap();
    assert_eq!(results.hits.len(), 5, "Should respect limit of 5");
    for hit in &results.hits {
        assert!(hit.score > 0.0, "Score should be positive");
    }
}
#[tokio::test]
async fn test_many_terms_or_query() {
    // Eight single-term docs plus one doc containing every term; an OR query
    // over all eight terms must retrieve all nine docs, ranking the
    // all-terms doc (id 8) first.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let terms = [
        "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
    ];
    // Fixed: removed a leftover vacuous `assert_eq!(i, i)` (and the unused
    // enumerate index it required) that asserted nothing.
    for term in terms {
        let mut doc = Document::new();
        doc.add_text(content, format!("{} content", term));
        writer.add_document(doc).unwrap();
    }
    let mut doc = Document::new();
    doc.add_text(content, terms.join(" "));
    writer.add_document(doc).unwrap();
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let query_str = terms
        .iter()
        .map(|t| format!("content:{}", t))
        .collect::<Vec<_>>()
        .join(" OR ");
    let results = index.query(&query_str, 20).await.unwrap();
    assert_eq!(results.hits.len(), 9, "Should find all 9 docs");
    assert_eq!(
        results.hits[0].address.doc_id, 8,
        "Doc matching all terms should have highest score"
    );
}
#[tokio::test]
async fn test_high_tf_document() {
    // One doc repeating the term 100 times vs one with a single occurrence:
    // the high-TF doc must rank first with a strictly larger score.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for text in [vec!["repeat"; 100].join(" "), "repeat once".to_string()] {
        let mut doc = Document::new();
        doc.add_text(content, text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:repeat", 10).await.unwrap();
    assert_eq!(results.hits.len(), 2);
    assert_eq!(
        results.hits[0].address.doc_id, 0,
        "High TF doc should be first"
    );
    assert!(
        results.hits[0].score > results.hits[1].score,
        "High TF should yield higher score"
    );
}
#[tokio::test]
async fn test_no_duplicate_doc_ids_in_results() {
    // Every doc matches all three OR branches; each doc id must still appear
    // at most once in the result list.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for i in 0..20 {
        let mut doc = Document::new();
        doc.add_text(content, format!("term1 term2 term3 doc{}", i));
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index
        .query("content:term1 OR content:term2 OR content:term3", 50)
        .await
        .unwrap();
    let mut seen_ids = HashSet::new();
    for hit in &results.hits {
        let id = hit.address.doc_id;
        // insert() returns false on a repeat -> duplicate in the hit list.
        assert!(seen_ids.insert(id), "Duplicate doc_id {} in results", id);
    }
}
#[tokio::test]
async fn test_scores_are_positive_and_finite() {
    // Sanity-check score values across identical docs: strictly positive,
    // finite, never NaN.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for _ in 0..10 {
        let mut doc = Document::new();
        doc.add_text(content, "test content here");
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:test", 20).await.unwrap();
    for hit in &results.hits {
        let score = hit.score;
        assert!(score > 0.0, "Score should be positive, got {}", score);
        assert!(score.is_finite(), "Score should be finite, got {}", score);
        assert!(!score.is_nan(), "Score should not be NaN");
    }
}
#[tokio::test]
async fn test_single_doc_single_term() {
    // Minimal corpus: one doc, one term; it must come back as doc 0 with a
    // positive score.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let mut only_doc = Document::new();
    only_doc.add_text(content, "unique");
    writer.add_document(only_doc).unwrap();
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("content:unique", 10).await.unwrap();
    assert_eq!(results.hits.len(), 1);
    assert_eq!(results.hits[0].address.doc_id, 0);
    assert!(results.hits[0].score > 0.0);
}
#[tokio::test]
async fn test_idf_impact_on_scoring() {
    // "rare" appears in 1 of 10 docs while "common" appears in all 10; the
    // rarer term must produce a higher top score.
    let (schema, content) = create_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    for i in 0..10 {
        let mut doc = Document::new();
        if i == 0 {
            doc.add_text(content, "common rare");
        } else {
            doc.add_text(content, "common");
        }
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let rare_results = index.query("content:rare", 10).await.unwrap();
    assert_eq!(rare_results.hits.len(), 1);
    let rare_score = rare_results.hits[0].score;
    let common_results = index.query("content:common", 10).await.unwrap();
    assert_eq!(common_results.hits.len(), 10);
    let common_score = common_results.hits[0].score;
    // Fixed: the failure message used to label these values "IDF" while the
    // format arguments are actually the top BM25 scores.
    assert!(
        rare_score > common_score,
        "Rare term (score={:.4}) should score higher than common (score={:.4})",
        rare_score,
        common_score
    );
}
/// Builds a schema with three indexed+stored text fields — title, body,
/// tags — and returns the schema plus the three field handles in that order.
fn create_multifield_schema() -> (crate::dsl::Schema, Field, Field, Field) {
    let mut builder = SchemaBuilder::default();
    let title_field = builder.add_text_field("title", true, true);
    let body_field = builder.add_text_field("body", true, true);
    let tags_field = builder.add_text_field("tags", true, true);
    (builder.build(), title_field, body_field, tags_field)
}
#[tokio::test]
async fn test_multifield_basic_retrieval() {
    // Field-scoped queries must only match the named field: "rust" occurs
    // in a different subset of docs for title, body, and tags.
    let (schema, title, body, tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let fixtures = [
        ("rust programming guide", "this is about software development", "tutorial beginner"),
        ("software guide", "learn rust programming here", "tutorial"),
        ("programming tutorial", "general software development", "rust systems"),
        ("rust mastery", "advanced rust programming", "rust expert"),
    ];
    for (title_text, body_text, tags_text) in fixtures {
        let mut doc = Document::new();
        doc.add_text(title, title_text);
        doc.add_text(body, body_text);
        doc.add_text(tags, tags_text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("title:rust", 10).await.unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    assert!(found.contains(&0), "Doc 0 should match title:rust");
    assert!(found.contains(&3), "Doc 3 should match title:rust");
    assert!(!found.contains(&1), "Doc 1 should not match title:rust");
    assert!(!found.contains(&2), "Doc 2 should not match title:rust");
    let results = index.query("body:rust", 10).await.unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    assert!(found.contains(&1), "Doc 1 should match body:rust");
    assert!(found.contains(&3), "Doc 3 should match body:rust");
    let results = index.query("tags:rust", 10).await.unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    assert!(found.contains(&2), "Doc 2 should match tags:rust");
    assert!(found.contains(&3), "Doc 3 should match tags:rust");
}
#[tokio::test]
async fn test_multifield_or_across_fields() {
    // OR across two fields: any doc with the term in either field matches,
    // and the doc holding it in both fields (doc 2) ranks first.
    let (schema, title, body, _tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let fixtures = [
        ("searchterm in title", "other content"),
        ("different title", "searchterm in body"),
        ("searchterm title", "searchterm body"),
        ("unrelated", "nothing here"),
    ];
    for (title_text, body_text) in fixtures {
        let mut doc = Document::new();
        doc.add_text(title, title_text);
        doc.add_text(body, body_text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index
        .query("title:searchterm OR body:searchterm", 10)
        .await
        .unwrap();
    assert_eq!(results.hits.len(), 3, "Should find 3 docs with searchterm");
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    assert!(found.contains(&0));
    assert!(found.contains(&1));
    assert!(found.contains(&2));
    assert!(!found.contains(&3));
    assert_eq!(
        results.hits[0].address.doc_id, 2,
        "Doc matching both fields should score highest"
    );
}
#[tokio::test]
async fn test_multifield_tf_accumulation() {
    // Ranking on title:rust must follow the title-field term frequency;
    // occurrences in body/tags must not inflate a title-scoped query.
    let (schema, title, body, tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let fixtures = [
        ("rust", "programming", "code"),
        ("rust rust rust rust rust", "programming", "code"),
        ("rust rust", "rust rust", "rust"),
    ];
    for (title_text, body_text, tags_text) in fixtures {
        let mut doc = Document::new();
        doc.add_text(title, title_text);
        doc.add_text(body, body_text);
        doc.add_text(tags, tags_text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("title:rust", 10).await.unwrap();
    assert_eq!(
        results.hits[0].address.doc_id, 1,
        "Highest TF should score highest"
    );
    for pair in results.hits.windows(2) {
        assert!(
            pair[0].score >= pair[1].score,
            "Scores should be in descending order"
        );
    }
}
#[tokio::test]
async fn test_multifield_different_field_lengths() {
    // Wildly different field lengths (long body, short body, long title)
    // must still produce positive, finite scores for a title-scoped query.
    let (schema, title, body, _tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let fixtures = [
        ("rust guide", "this is a very long body with lots of words that dilute the term frequency and should result in lower BM25 scores for terms that appear here because length normalization penalizes longer documents"),
        ("rust", "short body"),
        ("the comprehensive rust programming language tutorial guide for beginners", "content"),
    ];
    for (title_text, body_text) in fixtures {
        let mut doc = Document::new();
        doc.add_text(title, title_text);
        doc.add_text(body, body_text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("title:rust", 10).await.unwrap();
    assert_eq!(results.hits.len(), 3);
    for hit in &results.hits {
        assert!(hit.score > 0.0, "All scores should be positive");
        assert!(hit.score.is_finite(), "All scores should be finite");
    }
}
#[tokio::test]
async fn test_multifield_cross_field_or_query() {
    // OR query where each branch targets a different field; docs matching
    // any branch must be retrieved.
    let (schema, title, body, tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let fixtures = [
        ("alpha", "beta", "gamma"),
        ("beta", "gamma", "alpha"),
        ("gamma", "alpha", "beta"),
        ("alpha beta gamma", "other", "misc"),
    ];
    for (title_text, body_text, tags_text) in fixtures {
        let mut doc = Document::new();
        doc.add_text(title, title_text);
        doc.add_text(body, body_text);
        doc.add_text(tags, tags_text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index
        .query("title:alpha OR body:beta OR tags:gamma", 10)
        .await
        .unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    assert!(found.contains(&0), "Doc 0 should match (title:alpha)");
    assert!(found.contains(&3), "Doc 3 should match (title:alpha)");
}
#[tokio::test]
async fn test_multifield_no_cross_contamination() {
    // A term present in one field must not leak into matches on another:
    // "secret" lives in doc 0's body and doc 1's title only.
    let (schema, title, body, _tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let fixtures = [
        ("public information", "this contains secret data"),
        ("secret document", "public information here"),
    ];
    for (title_text, body_text) in fixtures {
        let mut doc = Document::new();
        doc.add_text(title, title_text);
        doc.add_text(body, body_text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("title:secret", 10).await.unwrap();
    assert_eq!(results.hits.len(), 1, "Only one doc has secret in title");
    assert_eq!(
        results.hits[0].address.doc_id, 1,
        "Only doc 1 has secret in title"
    );
    let results = index.query("body:secret", 10).await.unwrap();
    assert_eq!(results.hits.len(), 1, "Only one doc has secret in body");
    assert_eq!(
        results.hits[0].address.doc_id, 0,
        "Only doc 0 has secret in body"
    );
}
#[tokio::test]
async fn test_multifield_combined_scoring() {
    // OR across all three fields: docs 0 and 1 contain "rust" somewhere,
    // doc 2 nowhere; only the first two may match.
    let (schema, title, body, tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let fixtures = [
        ("rust", "rust", "rust"),
        ("rust rust rust", "other content", "misc"),
        ("python", "java", "go"),
    ];
    for (title_text, body_text, tags_text) in fixtures {
        let mut doc = Document::new();
        doc.add_text(title, title_text);
        doc.add_text(body, body_text);
        doc.add_text(tags, tags_text);
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index
        .query("title:rust OR body:rust OR tags:rust", 10)
        .await
        .unwrap();
    assert_eq!(results.hits.len(), 2, "Should find 2 docs with rust");
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    assert!(found.contains(&0));
    assert!(found.contains(&1));
    assert!(!found.contains(&2));
    for hit in &results.hits {
        assert!(hit.score > 0.0);
    }
}
#[tokio::test]
async fn test_multifield_large_document_set() {
    // 100 docs; "target" lands in the title of every 3rd doc and in the body
    // of every 5th. Field-scoped and OR queries must match the ground-truth
    // sets exactly.
    let (schema, title, body, tags) = create_multifield_schema();
    let dir = RamDirectory::new();
    let config = IndexConfig::default();
    let writer = IndexWriter::create(dir.clone(), schema.clone(), config.clone())
        .await
        .unwrap();
    let mut title_docs = HashSet::new();
    let mut body_docs = HashSet::new();
    let mut any_docs = HashSet::new();
    for i in 0..100u32 {
        let mut doc = Document::new();
        if i % 3 == 0 {
            doc.add_text(title, format!("target document {}", i));
            title_docs.insert(i);
            any_docs.insert(i);
        } else {
            doc.add_text(title, format!("other document {}", i));
        }
        if i % 5 == 0 {
            doc.add_text(body, format!("contains target word {}", i));
            body_docs.insert(i);
            any_docs.insert(i);
        } else {
            doc.add_text(body, format!("regular content {}", i));
        }
        doc.add_text(tags, format!("tag{}", i % 10));
        writer.add_document(doc).unwrap();
    }
    writer.commit().await.unwrap();
    let index = Index::open(dir, config).await.unwrap();
    let results = index.query("title:target", 200).await.unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    let missed: Vec<_> = title_docs.difference(&found).collect();
    assert!(
        missed.is_empty(),
        "Missed {} docs for title:target: {:?}",
        missed.len(),
        missed
    );
    let results = index.query("body:target", 200).await.unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    let missed: Vec<_> = body_docs.difference(&found).collect();
    assert!(
        missed.is_empty(),
        "Missed {} docs for body:target: {:?}",
        missed.len(),
        missed
    );
    let results = index
        .query("title:target OR body:target", 200)
        .await
        .unwrap();
    let found: HashSet<u32> = results.hits.iter().map(|h| h.address.doc_id).collect();
    let missed: Vec<_> = any_docs.difference(&found).collect();
    assert!(
        missed.is_empty(),
        "Missed {} docs for OR query: {:?}",
        missed.len(),
        missed
    );
    let extra: Vec<_> = found.difference(&any_docs).collect();
    assert!(
        extra.is_empty(),
        "Found {} extra docs for OR query: {:?}",
        extra.len(),
        extra
    );
}
}