lucisearch 0.8.0

//! Exit criteria integration tests for Milestone 1.
//!
//! Test 1: Index 1M documents, search with term query, verify _source
//! Test 2: BM25 correctness on small corpus with known term distributions
//! Test 3: Segment merge across two commit cycles with deletions

use luci::analysis::AnalyzerRegistry;
use luci::core::{DocId, FieldId, SegmentId};
use luci::deletion::DeletionMap;
use luci::index::Index;
use luci::mapping::{FieldType, Mapping};
use luci::merge::merge_segments;
use luci::search::bm25::bm25_idf;
use luci::search::expression::parse_search;
use luci::search::reader::Reader;
use luci::search::segment_store::SegmentStore;
use luci::segment::reader::SegmentReader;
use luci::storage::SingleFileDirectory;
use luci::writer::IndexWriter;
use serde_json::json;

/// Returns a fresh, per-process scratch path for the test called `name`.
///
/// The path lives under the system temp directory and embeds the current
/// process id so concurrent test runs do not collide. Anything left at that
/// path by an earlier run is removed first, on a best-effort basis; the path
/// itself is NOT created here.
fn test_dir(name: &str) -> std::path::PathBuf {
    let dir = std::env::temp_dir().join(format!("luci_integration_{}_{name}", std::process::id()));
    // `remove_dir_all` refuses to delete a regular file, and the same path is
    // also handed to `SingleFileDirectory::create`, so fall back to
    // `remove_file` to clear a stale file-backed index as well.
    if std::fs::remove_dir_all(&dir).is_err() {
        // Still best-effort: the path usually just does not exist yet.
        let _ = std::fs::remove_file(&dir);
    }
    dir
}

/// Best-effort removal of a test's on-disk index at `path`.
///
/// Handles both layouts: a directory tree is removed recursively, and a
/// plain file (the same kind of path is passed to
/// `SingleFileDirectory::create` in this file) is unlinked. Errors are
/// deliberately ignored so a failed cleanup never fails a test.
fn cleanup(path: &std::path::Path) {
    // `remove_dir_all` errors out on a regular file, so retry as a file.
    if std::fs::remove_dir_all(path).is_err() {
        let _ = std::fs::remove_file(path);
    }
}

/// Exit Criterion 1: bulk-index a large corpus, run term queries, and check
/// that every hit carries a usable `_source`.
///
/// The corpus is 100K documents by default to keep the test fast; when the
/// LUCI_LARGE_TEST environment variable is set (to any value) the full 1M
/// documents are indexed instead.
#[test]
fn exit_criterion_1_indexing_and_term_search() {
    let path = test_dir("ec1");

    let doc_count: usize = if std::env::var("LUCI_LARGE_TEST").is_ok() {
        1_000_000
    } else {
        100_000
    };

    let mapping = Mapping::builder()
        .field("title", FieldType::Text)
        .field("category", FieldType::Keyword)
        .build();
    let index = Index::create_with_mapping(&path, mapping).unwrap();

    // Categories are assigned round-robin, so each one covers a fifth of the
    // corpus and its name also shows up in that document's title.
    let categories = ["tech", "science", "sports", "politics", "entertainment"];
    let mut corpus: Vec<serde_json::Value> = Vec::with_capacity(doc_count);
    for i in 0..doc_count {
        let cat = categories[i % categories.len()];
        corpus.push(json!({
            "title": format!("document number {i} about {cat} topics"),
            "category": cat,
        }));
    }
    index.bulk(corpus).unwrap();

    // Term query on the analyzed title field, capped at 10 hits.
    let title_query = parse_search(json!({"term": {"title": "tech"}}), 10).unwrap();
    let title_hits = index.search(&title_query).unwrap();
    assert!(title_hits.total_hits().value > 0);
    assert!(title_hits.len() <= 10);

    // Every returned hit must expose a parseable _source that really
    // contains the term we searched for.
    for hit in title_hits.iter() {
        let source = hit.source().expect("_source should be present");
        assert!(source["title"].is_string());
        assert!(source["category"].is_string());
        let title = source["title"].as_str().unwrap();
        assert!(
            title.contains("tech"),
            "title should contain search term: {title}"
        );
    }

    // Term query on the keyword field: every hit must be in that category.
    let category_query = parse_search(json!({"term": {"category": "science"}}), 5).unwrap();
    let category_hits = index.search(&category_query).unwrap();
    assert!(category_hits.total_hits().value > 0);
    for hit in category_hits.iter() {
        let source = hit.source().unwrap();
        assert_eq!(source["category"], "science");
    }

    cleanup(&path);
}

/// Exit Criterion 2: BM25 scoring produces sane rankings.
///
/// Builds a small corpus with a known distribution of the term "search" and
/// checks the hit count, that hits are returned in descending score order,
/// and that every score (and the IDF for this distribution) is positive and
/// finite. Exact per-document scores are not compared here.
#[test]
fn exit_criterion_2_bm25_correctness() {
    let path = test_dir("ec2");
    let schema = Mapping::builder().field("body", FieldType::Text).build();
    let index = Index::create_with_mapping(&path, schema).unwrap();

    // Corpus: 100 documents with controlled term distributions.
    // "search" appears in docs 0-9 (10 docs): doc i holds (i + 1) copies of
    // "search" padded with (10 - i) copies of "filler", so each of those
    // bodies is exactly 11 words long and only the term frequency varies.
    // Other docs have filler text.
    let docs: Vec<serde_json::Value> = (0..100)
        .map(|i| {
            let body = if i < 10 {
                let mut words = vec!["search"; (i + 1) as usize];
                for _ in 0..(10 - i) {
                    words.push("filler");
                }
                words.join(" ")
            } else {
                format!("document {i} with various other words and content")
            };
            json!({"body": body})
        })
        .collect();
    index.bulk(docs).unwrap();

    let expr = parse_search(json!({"term": {"body": "search"}}), 10).unwrap();
    let results = index.search(&expr).unwrap();
    assert_eq!(
        results.total_hits().value,
        10,
        "exactly 10 docs contain 'search'"
    );

    // Doc 9 has tf=10, doc 8 has tf=9, ..., doc 0 has tf=1, all at the same
    // body length. The check below is deliberately loose: it only requires
    // the top hit to be one of the ten "search" documents.
    // NOTE(review): this assumes doc ids follow insertion order — confirm.
    let top_hit = results.hit(0).unwrap();
    let top_doc = top_hit.doc_id().as_u32();
    assert!(
        top_doc < 10,
        "top result should be from the 'search' docs, got doc {top_doc}"
    );

    // Verify scores are in descending order. `windows(2)` yields nothing for
    // zero or one score, so this cannot underflow the way `len() - 1` would.
    let scores: Vec<f32> = results.iter().map(|h| h.score()).collect();
    for (i, pair) in scores.windows(2).enumerate() {
        assert!(
            pair[0] >= pair[1],
            "results not in descending score order at position {i}"
        );
    }

    // Sanity-check the IDF component for this distribution: N=100 documents,
    // n=10 of them containing the term.
    let idf = bm25_idf(100, 10);
    assert!(idf > 0.0, "IDF should be positive");

    // Verify all scores are positive and finite
    for hit in results.iter() {
        assert!(hit.score() > 0.0, "score should be positive");
        assert!(hit.score().is_finite(), "score should be finite");
    }

    // Remove the on-disk index, matching the other tests in this file.
    cleanup(&path);
}

/// Exit Criterion 3: merging segments while honouring deletions.
///
/// Writes two batches of five documents with a commit after each, marks
/// three documents as deleted, merges both segments into a new one, and
/// checks the merged segment's document count, per-term document
/// frequencies, and `_source` availability.
#[test]
fn exit_criterion_3_segment_merge() {
    let path = test_dir("ec3");
    let storage = SingleFileDirectory::create(&path).unwrap();
    let schema = Mapping::builder()
        .field("body", FieldType::Text)
        .field("tag", FieldType::Keyword)
        .build();
    let mut writer = IndexWriter::new(storage, schema.clone(), AnalyzerRegistry::new());

    // First commit cycle: five "segment one" documents.
    for i in 0..5 {
        let doc = json!({
            "body": format!("segment one document {i}"),
            "tag": "batch1",
        });
        writer.add(doc).unwrap();
    }
    writer.commit().unwrap();

    // Second commit cycle: five "segment two" documents.
    for i in 0..5 {
        let doc = json!({
            "body": format!("segment two document {i}"),
            "tag": "batch2",
        });
        writer.add(doc).unwrap();
    }
    writer.commit().unwrap();

    // Re-open the storage and confirm each commit produced its own segment.
    let reopened = SingleFileDirectory::open(&path).unwrap();
    assert_eq!(reopened.segments().len(), 2);

    // Open a reader over each of the two segments.
    let first_id = reopened.segments()[0].segment_id;
    let second_id = reopened.segments()[1].segment_id;
    let first_bytes = reopened.read_segment(first_id).unwrap();
    let first = SegmentReader::open(first_bytes).unwrap();
    let second_bytes = reopened.read_segment(second_id).unwrap();
    let second = SegmentReader::open(second_bytes).unwrap();

    assert_eq!(first.doc_count(), 5);
    assert_eq!(second.doc_count(), 5);

    // Mark docs 1 and 3 of the first segment and doc 2 of the second as deleted.
    let mut deletions = DeletionMap::new();
    for (segment, doc) in [(first_id, 1), (first_id, 3), (second_id, 2)] {
        deletions.mark_deleted(segment, DocId::new(doc));
    }

    // Merge both segments into a new segment with id 100.
    let analyzers = AnalyzerRegistry::new();
    let merge_output = merge_segments(
        SegmentId::new(100),
        &[&first, &second],
        &deletions,
        &schema,
        &analyzers,
    )
    .expect("merge failed");

    let merged = SegmentReader::open(merge_output.bytes).unwrap();

    // 10 indexed - 3 deleted = 7 surviving documents.
    assert_eq!(merged.doc_count(), 7);

    // Expected document frequencies in field 0 after the merge:
    //   "segment" -> all 7 survivors
    //   "one"     -> 3 (5 - 2 deleted)
    //   "two"     -> 4 (5 - 1 deleted)
    for (term, expected) in [("segment", 7), ("one", 3), ("two", 4)] {
        assert_eq!(merged.doc_freq(FieldId::new(0), term), expected);
    }

    // Capture what we need from the merged reader before it moves into the store.
    let merged_id = merged.segment_id();
    let surviving = merged.doc_count();
    let store = SegmentStore::new(vec![merged], AnalyzerRegistry::new(), None, None);
    let reader = Reader::new(&store);

    // Every surviving document must still have a retrievable _source.
    for doc in 0..surviving {
        let source = reader.get_source(merged_id, DocId::new(doc));
        assert!(source.is_some());
    }

    cleanup(&path);
}

/// Supplementary: end-to-end write → search → verify source.
///
/// Adds five book titles one at a time through `Index::add`, then runs term
/// queries against the text and keyword fields and checks hit counts and the
/// returned `_source`.
///
/// NOTE(review): no explicit commit is issued before searching; presumably
/// `Index` makes added documents searchable on its own — confirm.
#[test]
fn end_to_end_write_search() {
    let path = test_dir("e2e");
    let schema = Mapping::builder()
        .field("title", FieldType::Text)
        .field("year", FieldType::Keyword)
        .build();
    let index = Index::create_with_mapping(&path, schema).unwrap();

    let docs = vec![
        json!({"title": "The Rust Programming Language", "year": "2015"}),
        json!({"title": "Elasticsearch: The Definitive Guide", "year": "2015"}),
        json!({"title": "Database Internals", "year": "2019"}),
        json!({"title": "Designing Data-Intensive Applications", "year": "2017"}),
        json!({"title": "Information Retrieval: Implementing and Evaluating Search Engines", "year": "2010"}),
    ];

    // The documents are not needed after indexing, so hand each one over by
    // value instead of cloning it.
    for doc in docs {
        index.add(doc).unwrap();
    }

    // Search for "rust" — should find 1 doc
    let expr = parse_search(json!({"term": {"title": "rust"}}), 10).unwrap();
    let results = index.search(&expr).unwrap();
    assert_eq!(results.total_hits().value, 1);
    assert_eq!(
        results.hit(0).unwrap().source().unwrap()["title"],
        "The Rust Programming Language"
    );

    // Search for "the" — two of the titles contain it, so expect at least 2
    let expr = parse_search(json!({"term": {"title": "the"}}), 10).unwrap();
    let results = index.search(&expr).unwrap();
    assert!(results.total_hits().value >= 2);

    // Year keyword search — exactly two documents are tagged "2015"
    let expr = parse_search(json!({"term": {"year": "2015"}}), 10).unwrap();
    let results = index.search(&expr).unwrap();
    assert_eq!(results.total_hits().value, 2);

    cleanup(&path);
}