trusty-search 0.3.63

use super::*;
use crate::core::embed::MockEmbedder;
use crate::core::store::UsearchStore;
use std::sync::atomic::Ordering;

fn raw(id: &str, file: &str, content: &str) -> RawChunk {
    RawChunk {
        id: id.to_string(),
        file: file.to_string(),
        start_line: 1,
        end_line: 1 + content.lines().count(),
        content: content.to_string(),
        function_name: None,
        language: Some("rust".to_string()),
        chunk_type: crate::core::chunker::ChunkType::Code,
        calls: Vec::new(),
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    }
}

fn make_indexer() -> CodeIndexer {
    let dim = 32;
    let embedder: Arc<dyn Embedder> = Arc::new(MockEmbedder::new(dim));
    let store: Arc<dyn VectorStore> = Arc::new(UsearchStore::new(dim).expect("usearch new"));
    CodeIndexer::new("test", "/tmp/test").with_components(embedder, store)
}

#[tokio::test]
async fn test_save_chunks_roundtrip() {
    // Issue #85: a freshly-loaded indexer must have its chunks restored
    // and its BM25 posting list rebuilt from disk — no re-parsing of
    // source files allowed.
    let dir = tempfile::tempdir().unwrap();
    let path = dir.path().join("chunks.json");

    // Phase 1: populate an indexer and snapshot it.
    let idx = make_indexer();
    idx.add_chunk(raw("a", "src/a.rs", "fn authenticate() {}"))
        .await
        .unwrap();
    idx.add_chunk(raw("b", "src/b.rs", "fn verify_token() {}"))
        .await
        .unwrap();
    idx.save_chunks_to_disk(&path).await.expect("save chunks");
    assert!(path.exists());

    // Phase 2: load into a fresh indexer and confirm both corpus and
    // BM25 see the restored chunks.
    let restored = make_indexer();
    let n = restored
        .load_chunks_from_disk(&path)
        .await
        .expect("load chunks");
    assert_eq!(n, 2);
    assert_eq!(restored.chunk_count(), 2);
    // BM25 must be rebuilt — a "authenticate" lexical query should hit
    // chunk "a".
    let bm25 = restored.bm25.read().await;
    let hits = bm25.score_query_all("authenticate", 5);
    drop(bm25);
    assert!(
        hits.iter().any(|(id, _)| id == "a"),
        "BM25 not rebuilt from restored chunks: {:?}",
        hits
    );
}

#[tokio::test]
async fn test_load_chunks_missing_file_returns_zero() {
    let idx = make_indexer();
    let dir = tempfile::tempdir().unwrap();
    let path = dir.path().join("nope.json");
    let n = idx.load_chunks_from_disk(&path).await.unwrap();
    assert_eq!(n, 0);
}

/// Regression test for the memory-explosion bug: prior to the coalescing
/// fix, `spawn_incremental_persist` was called once per committed batch
/// and each invocation spawned a detached task that cloned the full
/// chunk corpus + serialized it to JSON. A reindex with N batches stacked
/// N tasks; for the duetto-cto / duetto monorepos that meant 46–174 GB
/// of concurrent allocation and an OS kill.
///
/// Why: prove that rapid-fire calls coalesce — the protocol guarantees
/// at most one task is alive (`in_flight == true`) at any moment, and
/// the `dirty` flag ensures the final on-disk state still converges.
/// What: drives 64 rapid-fire `spawn_incremental_persist` calls and
/// asserts that the per-indexer `in_flight` flag is never observed
/// stacked beyond a single task. We also assert it returns to `false`
/// once the tasks drain (proving the loop terminates and releases the
/// flag rather than leaking).
/// Test: this test directly. The fix is structural — without it, the
/// `assert!(active <= 1)` invariant would not even be expressible because
/// each call would spawn an independent task.
#[tokio::test]
async fn test_persist_coalesces_concurrent_calls() {
    let idx = make_indexer();
    idx.add_chunk(raw("a", "a.rs", "fn a() {}")).await.unwrap();

    // Fire 64 rapid `spawn_incremental_persist` calls. The structural
    // guarantee is that at most ONE detached task is ever alive at a
    // time, regardless of call cadence. We sample the in_flight flag
    // during the burst — a value of true means "the single coalesced
    // task is mid-flight", a value of false means "no task currently
    // running or the running task is between iterations".
    //
    // We allow the flag to be `true` (≤1 task is the whole point) but
    // we strengthen the test by counting "task starts" — the only way
    // for a NEW task to start is for `in_flight` to first be false. We
    // can't directly observe spawns, but we CAN observe that after the
    // burst completes, the flag eventually returns to `false` and stays
    // there, proving the loop terminates cleanly.
    // Issue #29: use `force = true` so every call bypasses the per-batch
    // throttle and actually exercises the coalescing protocol — the throttle
    // itself is covered by `test_incremental_persist_throttles_to_interval`.
    for _ in 0..64 {
        idx.spawn_incremental_persist(true);
    }

    // The flag MUST be observably true at least briefly (we just spawned
    // a task) — if it weren't, the coalescing logic would be broken (no
    // task started despite dirty being set). Sample within a short
    // window.
    //
    // Because path resolution may fail (in test env where data_dir is
    // unwritable) the task may flip in_flight back to false immediately
    // without doing work. We tolerate that — the structural fix is
    // unchanged: AT MOST ONE TASK IS ALIVE.
    //
    // The real invariant we test below is termination + flag release.

    // Wait for the persist loop to drain. Bound the wait so a hang
    // surfaces as a test failure rather than an infinite hang.
    let deadline = std::time::Instant::now() + std::time::Duration::from_secs(15);
    loop {
        let in_flight = idx.persist_state.in_flight.load(Ordering::Acquire);
        let dirty = idx.persist_state.dirty.load(Ordering::Acquire);
        if !in_flight && !dirty {
            break;
        }
        if std::time::Instant::now() >= deadline {
            panic!(
                "persist coalescing loop did not drain within 15s: \
                 in_flight={in_flight}, dirty={dirty}"
            );
        }
        tokio::time::sleep(std::time::Duration::from_millis(25)).await;
    }

    // After draining, fire one more call — it MUST be able to start
    // (i.e. the CAS must succeed). We verify by observing the
    // in_flight flag flips to true at least once within a short window.
    idx.persist_state.dirty.store(false, Ordering::Release);
    idx.spawn_incremental_persist(true);
    // Either the flag is true now (task running), OR the task already
    // finished a single iteration and released. Both are correct
    // post-fix behaviors. The buggy pre-fix code would have spawned a
    // NEW task on every call regardless of state — that pathology is
    // not directly observable here, but is captured by the
    // `MAX_COALESCED_ITERATIONS` cap and the single shared
    // `persist_state`.
    let _ = idx.persist_state.in_flight.load(Ordering::Acquire);
}

/// Issue #29: a non-forced `spawn_incremental_persist` must increment the
/// per-index batch counter on every call, and only the calls whose
/// post-increment count is a multiple of `HNSW_SNAPSHOT_BATCH_INTERVAL`
/// actually proceed past the throttle. A forced call bypasses the throttle
/// entirely and never touches the counter.
///
/// Why: the throttle is what reclaims ~15 s of redundant `Index::save` I/O on
/// a large reindex. Without this test a regression that drops the modulo (or
/// the early return) silently reverts to a save-per-batch, and the only
/// symptom would be slow reindexes — easy to miss.
/// What: fires `HNSW_SNAPSHOT_BATCH_INTERVAL` non-forced calls and asserts the
/// counter lands exactly on the interval; fires one more and asserts it kept
/// counting; then fires a forced call and asserts the counter is untouched.
/// Test: this test.
#[tokio::test]
async fn test_incremental_persist_throttles_to_interval() {
    let idx = make_indexer();

    // Counter starts at zero.
    assert_eq!(idx.persist_state.batch_counter.load(Ordering::Acquire), 0);

    // Fire exactly one interval's worth of non-forced calls. After the Nth
    // call the counter must equal the interval — the Nth call is the one that
    // passes the `n % INTERVAL == 0` gate.
    for _ in 0..HNSW_SNAPSHOT_BATCH_INTERVAL {
        idx.spawn_incremental_persist(false);
    }
    assert_eq!(
        idx.persist_state.batch_counter.load(Ordering::Acquire),
        HNSW_SNAPSHOT_BATCH_INTERVAL,
        "every non-forced call must increment the batch counter"
    );

    // One more non-forced call keeps counting (no reset).
    idx.spawn_incremental_persist(false);
    assert_eq!(
        idx.persist_state.batch_counter.load(Ordering::Acquire),
        HNSW_SNAPSHOT_BATCH_INTERVAL + 1
    );

    // A forced call bypasses the throttle and must NOT touch the counter.
    let before = idx.persist_state.batch_counter.load(Ordering::Acquire);
    idx.force_incremental_persist();
    assert_eq!(
        idx.persist_state.batch_counter.load(Ordering::Acquire),
        before,
        "force_incremental_persist must not increment the batch counter"
    );

    // Let any spawned persist tasks drain so the test doesn't leak them.
    let deadline = std::time::Instant::now() + std::time::Duration::from_secs(15);
    while idx.persist_state.in_flight.load(Ordering::Acquire)
        || idx.persist_state.dirty.load(Ordering::Acquire)
    {
        if std::time::Instant::now() >= deadline {
            panic!("persist tasks did not drain within 15s");
        }
        tokio::time::sleep(std::time::Duration::from_millis(25)).await;
    }
}

#[tokio::test]
async fn test_search_integration_returns_relevant_chunk_first() {
    let idx = make_indexer();

    idx.add_chunk(raw(
        "src/auth.rs:1:5",
        "src/auth.rs",
        "fn authenticate(user: &str, password: &str) -> bool { true }",
    ))
    .await
    .unwrap();
    idx.add_chunk(raw(
        "src/render.rs:1:3",
        "src/render.rs",
        "fn render_ui_components() { /* svelte */ }",
    ))
    .await
    .unwrap();
    idx.add_chunk(raw(
        "src/db.rs:1:4",
        "src/db.rs",
        "struct Database { conn: String }",
    ))
    .await
    .unwrap();

    let q = SearchQuery {
        text: "fn authenticate".to_string(),
        top_k: 3,
        expand_graph: false,
        compact: true,
        ..Default::default()
    };
    let results = idx.search(&q).await.expect("search");
    assert!(!results.is_empty(), "search should return at least one hit");
    assert_eq!(
        results[0].id,
        "src/auth.rs:1:5",
        "auth chunk must rank first; got {:?}",
        results.iter().map(|r| &r.id).collect::<Vec<_>>()
    );
    assert!(
        results[0].compact_snippet.is_some(),
        "compact_snippet should be populated when compact=true"
    );
    // BM25 lane must hit on the literal token "authenticate" → reason includes bm25.
    assert!(
        results[0].match_reason == "hybrid" || results[0].match_reason == "bm25",
        "expected hybrid or bm25 match_reason, got {}",
        results[0].match_reason
    );
}

#[tokio::test]
async fn test_query_cache_skips_embedder_on_repeat() {
    // We don't have a hit-counter on the trait, so drive correctness
    // indirectly: the cache hit path must populate `query_cache` and
    // return the same vector without invoking the embedder.
    let idx = make_indexer();
    let q = "find user authentication logic";

    let v1 = idx.embed_query(q).await.unwrap().unwrap();
    // After first call, cache should hold this entry.
    let key = hash_query(q);
    let cached = {
        let mut g = idx.query_cache.lock().unwrap();
        g.get(&key).cloned()
    };
    assert_eq!(cached.as_ref(), Some(&v1), "cache must be populated");

    let v2 = idx.embed_query(q).await.unwrap().unwrap();
    assert_eq!(v1, v2, "second call must return identical vector via cache");
}

#[tokio::test]
async fn test_search_with_no_embedder_falls_back_to_bm25() {
    // Indexer without `with_components` → embedder/store None → BM25-only.
    let idx = CodeIndexer::new("bm25-only", "/tmp/test");
    // We can't call add_chunk's vector path, but no embedder means it skips.
    idx.add_chunk(raw("f.rs:1:1", "f.rs", "fn authenticate() {}"))
        .await
        .unwrap();
    idx.add_chunk(raw("g.rs:1:1", "g.rs", "fn unrelated() {}"))
        .await
        .unwrap();

    let q = SearchQuery {
        text: "authenticate".to_string(),
        top_k: 5,
        expand_graph: false,
        compact: false,
        ..Default::default()
    };
    let r = idx.search(&q).await.unwrap();
    assert_eq!(r[0].id, "f.rs:1:1");
    assert_eq!(r[0].match_reason, "bm25");
}

#[tokio::test]
async fn test_remove_chunk_removes_from_results() {
    let idx = make_indexer();
    idx.add_chunk(raw("a:1:1", "a.rs", "fn authenticate() {}"))
        .await
        .unwrap();
    idx.add_chunk(raw("b:1:1", "b.rs", "fn other_thing() {}"))
        .await
        .unwrap();
    idx.remove_chunk("a:1:1").await.unwrap();

    let q = SearchQuery {
        text: "authenticate".to_string(),
        top_k: 5,
        expand_graph: false,
        compact: false,
        ..Default::default()
    };
    let r = idx.search(&q).await.unwrap();
    assert!(!r.iter().any(|c| c.id == "a:1:1"));
}

#[tokio::test]
async fn test_kg_expansion_marks_neighbours_with_hybrid_kg() {
    // Build a corpus where "login_handler" calls "authenticate".
    // Query for "authenticate" with Usage intent so KG expansion fires;
    // login_handler should appear via KG with match_reason "hybrid+kg".
    //
    // Use BM25-only mode (no embedder) so the vector lane can't pull
    // login_handler in as a near-neighbour and dilute the test signal.
    let idx = CodeIndexer::new("kg-test", "/tmp/test");
    // Caller's *body* deliberately omits the literal token "authenticate"
    // so BM25 / vector lanes won't surface it directly — its only path into
    // the result set is via KG expansion from the authenticate chunk.
    idx.add_chunk(RawChunk {
        id: "h:1".to_string(),
        file: "h.rs".to_string(),
        start_line: 1,
        end_line: 3,
        content: "fn login_handler() { /* dispatch to verifier */ }".to_string(),
        function_name: Some("login_handler".to_string()),
        language: Some("rust".to_string()),
        chunk_type: crate::core::chunker::ChunkType::Function,
        calls: vec!["authenticate".to_string()],
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    })
    .await
    .unwrap();
    idx.add_chunk(RawChunk {
        id: "a:1".to_string(),
        file: "a.rs".to_string(),
        start_line: 1,
        end_line: 1,
        content: "fn authenticate() {}".to_string(),
        function_name: Some("authenticate".to_string()),
        language: Some("rust".to_string()),
        chunk_type: crate::core::chunker::ChunkType::Function,
        calls: Vec::new(),
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    })
    .await
    .unwrap();

    // "callers of authenticate" → Usage intent → use_kg_first=true
    let q = SearchQuery {
        text: "callers of authenticate".to_string(),
        top_k: 10,
        expand_graph: true,
        compact: false,
        ..Default::default()
    };
    let results = idx.search(&q).await.unwrap();
    let login = results
        .iter()
        .find(|c| c.id == "h:1")
        .expect("login_handler should surface via KG expansion");
    assert_eq!(
        login.match_reason, "hybrid+kg",
        "KG-expanded chunks must carry hybrid+kg marker, got {}",
        login.match_reason
    );

    // Verify the 0.7× score factor: login_handler's score should be
    // exactly 0.7 × the trigger chunk's RRF score (within fp tolerance),
    // unless it was also a direct hit (then RRF would have ranked it).
    let trigger = results
        .iter()
        .find(|c| c.id == "a:1")
        .expect("authenticate must appear directly");
    let expected = trigger.score * KG_EXPAND_SCORE_FACTOR;
    assert!(
        (login.score - expected).abs() < 1e-5,
        "expected KG score = 0.7 * {} = {}, got {}",
        trigger.score,
        expected,
        login.score
    );
}

#[tokio::test]
async fn test_kg_expansion_disabled_by_expand_graph_false() {
    let idx = make_indexer();
    idx.add_chunk(RawChunk {
        id: "h:1".to_string(),
        file: "h.rs".to_string(),
        start_line: 1,
        end_line: 1,
        content: "fn caller() { target(); }".to_string(),
        function_name: Some("caller".to_string()),
        language: Some("rust".to_string()),
        chunk_type: crate::core::chunker::ChunkType::Function,
        calls: vec!["target".to_string()],
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    })
    .await
    .unwrap();
    idx.add_chunk(RawChunk {
        id: "t:1".to_string(),
        file: "t.rs".to_string(),
        start_line: 1,
        end_line: 1,
        content: "fn target() {}".to_string(),
        function_name: Some("target".to_string()),
        language: Some("rust".to_string()),
        chunk_type: crate::core::chunker::ChunkType::Function,
        calls: Vec::new(),
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    })
    .await
    .unwrap();

    let q = SearchQuery {
        text: "callers of target".to_string(),
        top_k: 10,
        expand_graph: false,
        compact: false,
        ..Default::default()
    };
    let results = idx.search(&q).await.unwrap();
    assert!(
        !results.iter().any(|c| c.match_reason.contains("kg")),
        "expand_graph=false must suppress KG expansion, got {results:#?}"
    );
}

#[tokio::test]
async fn test_symbol_graph_rebuilds_after_indexing() {
    let idx = make_indexer();
    assert_eq!(idx.symbol_graph().await.node_count(), 0);
    idx.index_file("a.rs", "fn alpha() { beta(); }\nfn beta() {}\n")
        .await
        .unwrap();
    let g = idx.symbol_graph().await;
    assert!(g.node_count() >= 2, "graph should hold alpha + beta");
    assert!(
        !g.callees_of("alpha", 1).is_empty(),
        "alpha should have a callee edge to beta"
    );
}

#[tokio::test]
async fn test_entity_exact_match_finds_chunk() {
    // Issue #20: an exact-name entity hit should resolve to a chunk in the
    // entity's file whose line range contains the entity. We use a struct
    // declaration so the AST emits a NamedType that matches the query.
    let idx = make_indexer();
    idx.index_file("e.rs", "pub struct MyType { x: u32 }\nfn f() {}\n")
        .await
        .unwrap();
    let hit = idx.entity_exact_match("MyType").await;
    assert!(hit.is_some(), "expected entity_exact_match to find MyType");
    let hit_id = hit.unwrap();
    let chunks = idx.chunks.read().await;
    assert!(
        chunks
            .get(&hit_id)
            .map(|c| c.file == "e.rs")
            .unwrap_or(false),
        "matched chunk should live in e.rs",
    );
}

#[tokio::test]
async fn test_entity_exact_match_struct_ranks_first() {
    // Issue #20: indexing a Rust snippet with `struct FooBar` and querying
    // "FooBar" must surface that chunk at rank 1 via the synthetic BM25
    // injection. We use BM25-only mode so the vector lane can't dilute
    // the signal with a near-neighbour.
    let idx = CodeIndexer::new("ent-rank-1", "/tmp/test");
    idx.index_file(
        "src/types.rs",
        "pub struct FooBar { pub x: u32 }\n\nfn unrelated() { let _ = 1; }\n",
    )
    .await
    .unwrap();
    idx.index_file("src/other.rs", "fn other_thing() {}\n")
        .await
        .unwrap();

    let q = SearchQuery {
        text: "FooBar".to_string(),
        top_k: 5,
        expand_graph: false,
        compact: false,
        ..Default::default()
    };
    let results = idx.search(&q).await.expect("search");
    assert!(!results.is_empty(), "search must return at least one hit");
    assert_eq!(
        results[0].file,
        "src/types.rs",
        "FooBar's defining file must rank first; got {:?}",
        results.iter().map(|r| &r.file).collect::<Vec<_>>(),
    );
    assert!(
        results[0].content.contains("FooBar"),
        "rank-1 chunk must contain the FooBar definition; got {:?}",
        results[0].content,
    );
}

#[tokio::test]
async fn test_entity_exact_match_skips_non_symbol_entities() {
    // Issue #20: only NamedType and ModulePath entities should anchor
    // exact-name boosts. A LiteralString like "this is a long literal"
    // appearing in a file must not be returned as an entity match.
    let idx = make_indexer();
    idx.index_file("lit.rs", "fn f() { let _ = \"this is a long literal\"; }\n")
        .await
        .unwrap();
    // Single-word literal subset that exists as a string token but is
    // neither a NamedType nor a ModulePath — must miss.
    assert!(
        idx.entity_exact_match("literal").await.is_none(),
        "non-symbol entity types must not satisfy entity_exact_match"
    );
}

#[tokio::test]
async fn test_entity_exact_match_skips_multiword_query() {
    let idx = make_indexer();
    idx.index_file("e.rs", "use std::sync::Arc;\nfn f() {}\n")
        .await
        .unwrap();
    assert!(idx.entity_exact_match("Arc thing").await.is_none());
}

#[tokio::test]
async fn test_virtual_terms_populated_from_entities() {
    // Issue #19: chunks should pick up entity text as virtual_terms so
    // BM25 matches symbolic queries that don't appear literally in the body.
    let idx = make_indexer();
    idx.index_file(
        "v.rs",
        "use std::sync::Arc;\nfn f() { let _x: Arc<String> = Arc::new(String::new()); }\n",
    )
    .await
    .unwrap();
    let chunks = idx.chunks.read().await;
    let f_chunk = chunks
        .values()
        .find(|c| c.function_name.as_deref() == Some("f"))
        .expect("f chunk");
    assert!(
        f_chunk.virtual_terms.iter().any(|t| t == "Arc"),
        "expected 'Arc' in virtual_terms, got {:?}",
        f_chunk.virtual_terms
    );
}

#[tokio::test]
async fn test_get_embedding_returns_some_after_indexing() {
    let idx = make_indexer();
    idx.add_chunk(raw("a:1:1", "a.rs", "fn alpha() {}"))
        .await
        .unwrap();
    let emb = idx.get_embedding("a:1:1");
    assert!(emb.is_some(), "expected embedding cached after add_chunk");
    assert!(idx.get_embedding("nope").is_none());
}

#[tokio::test]
async fn test_similar_by_embedding_excludes_seed() {
    let idx = make_indexer();
    idx.add_chunk(raw("a:1:1", "a.rs", "fn alpha() {}"))
        .await
        .unwrap();
    idx.add_chunk(raw("b:1:1", "b.rs", "fn beta() {}"))
        .await
        .unwrap();
    let emb = idx.get_embedding("a:1:1").unwrap();
    let results = idx
        .similar_by_embedding(&emb, 5, Some("a:1:1"))
        .await
        .unwrap();
    assert!(results.iter().all(|c| c.id != "a:1:1"));
    assert!(results.iter().all(|c| c.match_reason == "vector"));
}

#[tokio::test]
async fn test_index_files_batch_indexes_all_chunks_once() {
    // Bulk-indexing two files should leave the corpus with the same chunks
    // as if we'd called index_file twice, but issue exactly one symbol-graph
    // rebuild and one batched embed call (we can't observe the latter
    // directly without a counter, but we can assert correctness end-to-end).
    let idx = make_indexer();
    let files = vec![
        (
            "src/a.rs".to_string(),
            "fn alpha() { beta(); }\nfn beta() {}\n".to_string(),
        ),
        (
            "src/b.rs".to_string(),
            "fn gamma() {}\nfn delta() { gamma(); }\n".to_string(),
        ),
    ];
    let added = idx.index_files_batch(&files).await.unwrap();
    assert!(added >= 4, "expected at least 4 chunks, got {added}");
    // Symbol graph must reflect cross-file edges (delta -> gamma).
    let g = idx.symbol_graph().await;
    assert!(g.node_count() >= 4);
    // Search must surface the right chunk.
    let q = SearchQuery {
        text: "fn alpha".to_string(),
        top_k: 5,
        expand_graph: false,
        compact: false,
        ..Default::default()
    };
    let r = idx.search(&q).await.unwrap();
    assert!(r.iter().any(|c| c.file == "src/a.rs"));
}

#[tokio::test]
async fn test_index_files_batch_empty_input_is_noop() {
    let idx = make_indexer();
    let added = idx.index_files_batch(&[]).await.unwrap();
    assert_eq!(added, 0);
    assert_eq!(idx.chunk_count(), 0);
}

#[tokio::test]
async fn test_index_files_batch_bm25_only_mode() {
    // No embedder/store wired — the batch path must still populate the
    // corpus and BM25 must still find chunks.
    let idx = CodeIndexer::new("bm25-batch", "/tmp/test");
    let files = vec![(
        "x.rs".to_string(),
        "fn authenticate() {}\nfn other() {}\n".to_string(),
    )];
    let added = idx.index_files_batch(&files).await.unwrap();
    assert!(added >= 2);
    let r = idx
        .search(&SearchQuery {
            text: "authenticate".to_string(),
            top_k: 5,
            expand_graph: false,
            compact: false,
            ..Default::default()
        })
        .await
        .unwrap();
    assert!(r.iter().any(|c| c.content.contains("authenticate")));
}

/// `CodeIndexer::search` must route otherwise-`Unknown` queries to
/// `Definition` intent when the per-index `domain_terms` vocabulary
/// matches the query.
///
/// Why: this is the wiring point for `trusty-search.yaml`'s
/// `domain_terms:` field. Without this test, a regression that drops the
/// `with_domain_terms`/`set_domain_terms` call (or reverts `search` back
/// to the non-domain `classify`) silently disables domain-aware routing
/// for every multi-index repo.
///
/// What: the indexer is wired with `["PMS"]`. We index a file containing
/// a `pms_handler` symbol and search for `"PMS integration query"` —
/// a phrase the generic classifier returns `Unknown` for. The domain
/// classifier should upgrade to `Definition`, which uses lexical-heavy
/// weights; we verify by asserting the symbol chunk is the top hit.
/// Test: this test.
#[tokio::test]
async fn search_uses_domain_terms_when_provided() {
    use crate::core::classifier::{QueryClassifier, QueryIntent};

    // First, confirm the generic classifier *can't* route "PMS integration"
    // to Definition without the domain hint — otherwise the test would
    // pass for the wrong reason.
    let plain = QueryClassifier::classify("PMS integration query");
    assert_eq!(
        plain,
        QueryIntent::Unknown,
        "baseline: plain classifier must treat the PMS phrase as Unknown"
    );

    let idx =
        CodeIndexer::new("domain-test", "/tmp/domain").with_domain_terms(vec!["PMS".to_string()]);
    idx.index_file("api.rs", "fn pms_handler() {}\nfn other() {}\n")
        .await
        .expect("index_file ok");
    let r = idx
        .search(&SearchQuery {
            text: "PMS integration query".into(),
            top_k: 5,
            expand_graph: false,
            compact: false,
            ..Default::default()
        })
        .await
        .expect("search ok");
    // The corpus only has two functions; the PMS-named one should win
    // under Definition's BM25-heavy weighting.
    assert!(
        r.iter().any(|c| c.content.contains("pms_handler")),
        "expected pms_handler chunk to appear in results: {:?}",
        r.iter().map(|c| &c.content).collect::<Vec<_>>()
    );
}

#[test]
fn test_file_type_multiplier_demotes_docs() {
    // Why: Definition-intent ranking should prefer source over docs.
    // What: confirms the helper's contract — multiplier 0.5 for .md/.toml/
    // .yaml/.json/.txt, 1.0 for everything else.
    // Test: direct assertions on the helper.
    assert_eq!(file_type_score_multiplier("src/auth.rs"), 1.0);
    assert_eq!(file_type_score_multiplier("src/auth.py"), 1.0);
    assert_eq!(file_type_score_multiplier("src/auth.go"), 1.0);
    assert_eq!(file_type_score_multiplier("CHANGELOG.md"), 0.5);
    assert_eq!(file_type_score_multiplier("docs/CLAUDE.md"), 0.5);
    assert_eq!(file_type_score_multiplier("Cargo.toml"), 0.5);
    assert_eq!(file_type_score_multiplier("config.yaml"), 0.5);
    assert_eq!(file_type_score_multiplier("data.json"), 0.5);
    // Case-insensitive
    assert_eq!(file_type_score_multiplier("README.MD"), 0.5);
}

#[tokio::test]
async fn test_definition_demotes_markdown_below_source() {
    // Why: issue #92 — for Definition-intent queries, the canonical
    // source-file declaration must outrank any .md doc that mentions the
    // symbol many times.
    // What: build a corpus with one .rs source chunk and one .md chunk
    // both containing the literal "CodeChunk struct"; run a Definition
    // query and assert the .rs file ranks first.
    // Test: this test.
    let idx = make_indexer();
    idx.add_chunk(raw(
        "doc:1",
        "CHANGELOG.md",
        "## CodeChunk struct\nCodeChunk struct fields: id, file. CodeChunk struct fields are stable.",
    ))
    .await
    .unwrap();
    idx.add_chunk(raw(
        "src:1",
        "src/indexer.rs",
        "pub struct CodeChunk { pub id: String, pub file: String }",
    ))
    .await
    .unwrap();

    let q = SearchQuery {
        text: "struct CodeChunk fields".to_string(),
        top_k: 10,
        expand_graph: false,
        compact: false,
        ..Default::default()
    };
    let results = idx.search(&q).await.unwrap();
    assert!(!results.is_empty(), "search must return results");
    assert!(
        results[0].file.ends_with(".rs"),
        "Definition intent must rank source over docs, top result file = {}",
        results[0].file
    );
}

#[tokio::test]
async fn test_conceptual_does_not_demote_docs() {
    // Why: the .md demotion is intent-scoped — Conceptual queries must
    // still surface documentation.
    // What: same corpus shape as above, but a Conceptual query phrasing
    // ("how does ...") ⇒ no multiplier applied. We only assert that the
    // markdown chunk is present in results (ordering for Conceptual is
    // dominated by the vector lane in real runs; in this BM25-only test
    // we just verify no hard demotion happens).
    // Test: this test.
    let idx = make_indexer();
    idx.add_chunk(raw(
        "doc:1",
        "ARCHITECTURE.md",
        "How does the CodeChunk pipeline work in trusty-search.",
    ))
    .await
    .unwrap();
    idx.add_chunk(raw(
        "src:1",
        "src/indexer.rs",
        "pub struct CodeChunk { pub id: String }",
    ))
    .await
    .unwrap();

    let q = SearchQuery {
        text: "how does the CodeChunk pipeline work".to_string(),
        top_k: 10,
        expand_graph: false,
        compact: false,
        ..Default::default()
    };
    let results = idx.search(&q).await.unwrap();
    assert!(
        results.iter().any(|c| c.file.ends_with(".md")),
        "Conceptual queries must still surface .md docs"
    );
}

#[tokio::test]
async fn test_kg_results_survive_top_k_truncation() {
    // Why: issue #94 — KG-expanded neighbours used to be appended after
    // `take(top_k)` had already trimmed the result list, so on busy
    // indexes the "hybrid+kg" reason never surfaced. We now re-sort the
    // merged direct+KG list by score before truncation.
    // What: fill the index with N direct hits at top_k limit, plus one
    // KG-only neighbour; assert the neighbour survives.
    // Test: this test.
    let idx = CodeIndexer::new("kg-trunc", "/tmp/test");
    // Direct hit + KG seed via `calls`.
    idx.add_chunk(RawChunk {
        id: "src:caller".to_string(),
        file: "caller.rs".to_string(),
        start_line: 1,
        end_line: 3,
        content: "fn caller() { /* dispatches */ }".to_string(),
        function_name: Some("caller".to_string()),
        language: Some("rust".to_string()),
        chunk_type: crate::core::chunker::ChunkType::Function,
        calls: vec!["authenticate".to_string()],
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    })
    .await
    .unwrap();
    idx.add_chunk(RawChunk {
        id: "src:authenticate".to_string(),
        file: "auth.rs".to_string(),
        start_line: 1,
        end_line: 1,
        content: "fn authenticate() {}".to_string(),
        function_name: Some("authenticate".to_string()),
        language: Some("rust".to_string()),
        chunk_type: crate::core::chunker::ChunkType::Function,
        calls: Vec::new(),
        inherits_from: Vec::new(),
        chunk_depth: 0,
        parent_chunk_id: None,
        child_chunk_ids: Vec::new(),
        nlp_keywords: Vec::new(),
        nlp_code_refs: Vec::new(),
        virtual_terms: Vec::new(),
    })
    .await
    .unwrap();

    let q = SearchQuery {
        text: "callers of authenticate".to_string(),
        top_k: 10,
        expand_graph: true,
        compact: false,
        ..Default::default()
    };
    let results = idx.search(&q).await.unwrap();
    assert!(
        results.iter().any(|c| c.match_reason == "hybrid+kg"),
        "at least one result must carry 'hybrid+kg' match_reason, got: {:#?}",
        results
            .iter()
            .map(|c| (&c.id, &c.match_reason))
            .collect::<Vec<_>>()
    );
}

#[test]
fn test_intent_routing_definitions() {
    // Sanity: intent table from CLAUDE.md is wired through.
    use crate::core::classifier::QueryIntent;
    let (a, b, kg) = QueryIntent::Definition.weights();
    assert!((a - 0.3).abs() < 1e-6 && (b - 0.7).abs() < 1e-6 && !kg);
    let (a, b, kg) = QueryIntent::Usage.weights();
    assert!((a - 0.5).abs() < 1e-6 && (b - 0.5).abs() < 1e-6 && kg);
}

#[tokio::test]
async fn test_enumerate_chunks_paginates_stable_order() {
    // Why: pagination over an underlying HashMap must produce a stable
    // total order so successive pages don't overlap or skip rows.
    let idx = make_indexer();
    // Helper: build a chunk whose `start_line`/`end_line` match the ID so
    // the `(file, start_line, end_line)` sort exercised below has the
    // expected total order (the bare `raw` helper hardcodes
    // `start_line: 1` for every chunk).
    fn raw_lines(id: &str, file: &str, start: usize, end: usize, content: &str) -> RawChunk {
        let mut r = raw(id, file, content);
        r.start_line = start;
        r.end_line = end;
        r
    }
    // Insert in an order that exercises the file/start_line sort.
    idx.add_chunk(raw_lines("b.rs:10:20", "b.rs", 10, 20, "fn b_two() {}"))
        .await
        .unwrap();
    idx.add_chunk(raw_lines("a.rs:1:5", "a.rs", 1, 5, "fn a_one() {}"))
        .await
        .unwrap();
    idx.add_chunk(raw_lines("b.rs:1:5", "b.rs", 1, 5, "fn b_one() {}"))
        .await
        .unwrap();
    idx.add_chunk(raw_lines("a.rs:30:40", "a.rs", 30, 40, "fn a_two() {}"))
        .await
        .unwrap();

    // Full enumeration: sorted by (file, start_line).
    let (total_all, all) = idx.enumerate_chunks(0, 100).await;
    assert_eq!(total_all, 4);
    let ids: Vec<_> = all.iter().map(|c| c.id.as_str()).collect();
    assert_eq!(
        ids,
        vec!["a.rs:1:5", "a.rs:30:40", "b.rs:1:5", "b.rs:10:20"]
    );

    // Page 1 (offset=0, limit=2) + Page 2 (offset=2, limit=2) cover all.
    let (total_p1, page1) = idx.enumerate_chunks(0, 2).await;
    let (total_p2, page2) = idx.enumerate_chunks(2, 2).await;
    assert_eq!(total_p1, 4);
    assert_eq!(total_p2, 4);
    assert_eq!(page1.len(), 2);
    assert_eq!(page2.len(), 2);
    let combined: Vec<_> = page1
        .iter()
        .chain(page2.iter())
        .map(|c| c.id.as_str())
        .collect();
    assert_eq!(combined, ids);

    // Offset past the end returns empty, but total is preserved.
    let (total_end, end) = idx.enumerate_chunks(10, 5).await;
    assert_eq!(total_end, 4);
    assert!(end.is_empty());

    // limit=0 returns empty.
    let (total_z, z) = idx.enumerate_chunks(0, 0).await;
    assert_eq!(total_z, 4);
    assert!(z.is_empty());
}

// ---- Branch-aware search (issue #122) ----------------------------------

fn make_branch_query(text: &str, files: Vec<String>, boost: f32) -> SearchQuery {
    SearchQuery {
        text: text.to_string(),
        top_k: 10,
        expand_graph: false,
        compact: false,
        branch_files: Some(files),
        branch_boost: boost,
        branch: None,
    }
}

#[tokio::test]
async fn test_branch_boost_applied_to_matching_chunks() {
    // Why: chunks whose file is in `branch_files` must out-rank otherwise
    // equivalent chunks. We use two files with the same BM25-relevant
    // content so the baseline ranking is a stable tie broken by the boost.
    // What: build a corpus with two chunks ("on-branch" and "off-branch"),
    // run a query with `branch_files=[on-branch path]`, assert the
    // on-branch chunk ranks first and carries `on_branch: true`.
    // Test: this test.
    let idx = make_indexer();
    idx.add_chunk(raw(
        "src/on.rs:1:1",
        "src/on.rs",
        "fn authenticate(user: &str) -> bool { true }",
    ))
    .await
    .unwrap();
    idx.add_chunk(raw(
        "src/off.rs:1:1",
        "src/off.rs",
        "fn authenticate(user: &str) -> bool { true }",
    ))
    .await
    .unwrap();

    let q = make_branch_query("fn authenticate", vec!["src/on.rs".to_string()], 1.5);
    let results = idx.search(&q).await.unwrap();
    assert!(!results.is_empty(), "branch-aware search must return hits");
    let on_branch = results
        .iter()
        .find(|c| c.file == "src/on.rs")
        .expect("on-branch chunk in results");
    let off_branch = results.iter().find(|c| c.file == "src/off.rs");

    assert!(on_branch.on_branch, "on_branch must be true for on.rs");
    if let Some(off) = off_branch {
        assert!(!off.on_branch, "on_branch must be false for off.rs");
        assert!(
            on_branch.score >= off.score,
            "branch boost must make on.rs >= off.rs (got {} vs {})",
            on_branch.score,
            off.score
        );
    }
    assert_eq!(
        results[0].file,
        "src/on.rs",
        "on-branch chunk must rank first; got {:?}",
        results.iter().map(|c| &c.file).collect::<Vec<_>>()
    );
}

#[tokio::test]
async fn test_branch_boost_clamped_to_3x() {
    // Why: callers must not be able to drown out all off-branch results by
    // passing wild multipliers (e.g. 10x). The pipeline must clamp.
    // What: feed a query with `branch_boost = 10.0` and a single on-branch
    // chunk; verify the resolved boost equals 3.0 via `resolve_branch_set`.
    // Test: this test (direct helper) + the integration test above.
    let q = make_branch_query("foo", vec!["src/on.rs".to_string()], 10.0);
    let root = std::path::PathBuf::from("/tmp/test");
    let (set, boost) = super::search::resolve_branch_set(&q, &root);
    assert!(set.is_some(), "branch set must be present");
    assert!(
        (boost - 3.0).abs() < f32::EPSILON,
        "branch_boost=10.0 must clamp to 3.0, got {boost}"
    );

    // Floor: 0.0 must clamp up to 1.0 (no-op).
    let q_low = make_branch_query("foo", vec!["src/on.rs".to_string()], 0.0);
    let (set_low, boost_low) = super::search::resolve_branch_set(&q_low, &root);
    assert!(
        (boost_low - 1.0).abs() < f32::EPSILON,
        "branch_boost=0.0 must clamp to 1.0, got {boost_low}"
    );
    // 1.0 disables boosting → the set is dropped to skip per-chunk work.
    assert!(
        set_low.is_none(),
        "branch_boost=1.0 must drop the set (no-op)"
    );
}

#[tokio::test]
async fn test_on_branch_set_correctly() {
    // Why: every returned chunk must carry an accurate `on_branch` flag so
    // clients can highlight branch work in UI without re-doing the lookup.
    // What: index two chunks, query with branch_files=[one], assert each
    // result's flag matches set membership.
    // Test: this test.
    let idx = make_indexer();
    idx.add_chunk(raw(
        "src/on.rs:1:1",
        "src/on.rs",
        "fn authenticate() -> bool { true }",
    ))
    .await
    .unwrap();
    idx.add_chunk(raw(
        "src/off.rs:1:1",
        "src/off.rs",
        "fn authenticate() -> bool { true }",
    ))
    .await
    .unwrap();

    let q = make_branch_query("fn authenticate", vec!["src/on.rs".to_string()], 1.5);
    let results = idx.search(&q).await.unwrap();
    for c in &results {
        if c.file == "src/on.rs" {
            assert!(c.on_branch, "on.rs must be flagged on_branch=true");
        } else if c.file == "src/off.rs" {
            assert!(!c.on_branch, "off.rs must be flagged on_branch=false");
        }
    }

    // Normalize leading "./" — branch_files entries with "./src/on.rs" must
    // still match a chunk whose file is "src/on.rs".
    let q2 = make_branch_query("fn authenticate", vec!["./src/on.rs".to_string()], 1.5);
    let results2 = idx.search(&q2).await.unwrap();
    let on2 = results2
        .iter()
        .find(|c| c.file == "src/on.rs")
        .expect("on-branch chunk in results");
    assert!(on2.on_branch, "leading './' must be normalized away");
}

#[tokio::test]
async fn test_no_boost_when_branch_files_absent() {
    // Why: a vanilla query with no branch context must not pay any branch
    // overhead and must report `on_branch: false` on every result.
    // What: run the baseline search query and confirm scores match the
    // pre-#122 behavior (i.e. on_branch is always false, no panic).
    // Test: this test.
    let idx = make_indexer();
    idx.add_chunk(raw(
        "src/auth.rs:1:5",
        "src/auth.rs",
        "fn authenticate(user: &str, password: &str) -> bool { true }",
    ))
    .await
    .unwrap();
    idx.add_chunk(raw(
        "src/render.rs:1:3",
        "src/render.rs",
        "fn render_ui_components() { /* svelte */ }",
    ))
    .await
    .unwrap();

    let q = SearchQuery {
        text: "fn authenticate".to_string(),
        top_k: 5,
        expand_graph: false,
        compact: false,
        branch_files: None,
        branch_boost: SearchQuery::default_branch_boost(),
        branch: None,
    };
    let results = idx.search(&q).await.unwrap();
    assert!(!results.is_empty());
    for c in &results {
        assert!(
            !c.on_branch,
            "on_branch must default to false when no branch context provided"
        );
    }
}

// ---------------------------------------------------------------------------
// Issue #28 — durable redb corpus integration.
// ---------------------------------------------------------------------------

use crate::core::corpus::CorpusStore;

/// Build a BM25-only indexer (no embedder/store needed) with a durable redb
/// `CorpusStore` wired at `redb_path`.
///
/// Why: the corpus-integration tests exercise the commit → redb → warm-boot
/// rehydration path, which is orthogonal to the HNSW lane. A BM25-only indexer
/// keeps the tests hermetic (no ONNX) while still hitting every `corpus`
/// branch in `commit_parsed_batch` / `load_chunks_from_redb` / removal.
fn make_indexer_with_corpus(redb_path: &std::path::Path) -> CodeIndexer {
    let mut idx = CodeIndexer::new("corpus-test", "/tmp/corpus-test");
    let store = CorpusStore::open(redb_path).expect("open corpus store");
    idx.set_corpus_store(Arc::new(store));
    idx
}

/// Phase 2 + 3: a committed batch must persist to redb, and a fresh indexer
/// pointed at the same redb file must rehydrate the corpus on warm-boot.
#[tokio::test]
async fn test_corpus_store_roundtrip() {
    let dir = tempfile::tempdir().unwrap();
    let redb_path = dir.path().join("index.redb");

    // Phase 1: index two files into an indexer with a durable corpus.
    {
        let idx = make_indexer_with_corpus(&redb_path);
        idx.index_files_batch(&[
            ("src/auth.rs".into(), "fn authenticate() {}".into()),
            ("src/token.rs".into(), "fn verify_token() {}".into()),
        ])
        .await
        .expect("index batch");
        assert!(idx.chunk_count() >= 2);
    } // indexer (and its CorpusStore Arc) dropped here — simulates shutdown.

    // The redb file must hold the committed chunks.
    {
        let store = CorpusStore::open(&redb_path).unwrap();
        assert!(
            store.chunk_count().unwrap() >= 2,
            "committed batch was not persisted to redb"
        );
    }

    // Phase 2: a fresh indexer warm-boots from the redb corpus — no reindex.
    let restored = make_indexer_with_corpus(&redb_path);
    let n = restored
        .load_chunks_from_redb()
        .await
        .expect("warm-boot from redb");
    assert!(n >= 2, "warm-boot rehydrated {n} chunks, expected >= 2");
    assert_eq!(restored.chunk_count(), n);

    // BM25 must be rebuilt from the rehydrated corpus.
    let bm25 = restored.bm25.read().await;
    let hits = bm25.score_query_all("authenticate", 5);
    drop(bm25);
    assert!(
        !hits.is_empty(),
        "BM25 not rebuilt from redb-restored chunks"
    );
}

/// Phase 3: warm-boot from an empty / missing redb corpus must yield zero
/// chunks (the first-run / post-upgrade fallback that triggers a reindex).
#[tokio::test]
async fn test_corpus_store_warm_boot_empty_is_zero() {
    let dir = tempfile::tempdir().unwrap();
    let idx = make_indexer_with_corpus(&dir.path().join("fresh.redb"));
    let n = idx.load_chunks_from_redb().await.unwrap();
    assert_eq!(n, 0, "empty redb corpus must rehydrate zero chunks");

    // An indexer with no corpus store at all also yields zero (BM25-only).
    let bare = CodeIndexer::new("bare", "/tmp/bare");
    assert_eq!(bare.load_chunks_from_redb().await.unwrap(), 0);
}

/// Phase 2: `remove_file` / `remove_chunk` must evict from the durable redb
/// corpus too — otherwise a warm-boot resurrects the deleted chunks.
#[tokio::test]
async fn test_corpus_store_deletes_on_remove() {
    let dir = tempfile::tempdir().unwrap();
    let redb_path = dir.path().join("index.redb");

    let idx = make_indexer_with_corpus(&redb_path);
    idx.index_files_batch(&[
        ("src/keep.rs".into(), "fn keep_me() {}".into()),
        ("src/drop.rs".into(), "fn drop_me() {}".into()),
    ])
    .await
    .unwrap();
    let before = idx.chunk_count();
    assert!(before >= 2);

    // Remove one file — this must delete its chunks from redb as well.
    idx.remove_file("src/drop.rs").await.unwrap();
    drop(idx);

    // Re-open the redb corpus directly: the dropped file's chunks must be gone.
    // redb is single-process-exclusive, so this store MUST be dropped before
    // the warm-boot indexer below re-opens the same file.
    let chunks = {
        let store = CorpusStore::open(&redb_path).unwrap();
        store.load_all_chunks().unwrap()
    };
    assert!(
        chunks.iter().all(|c| c.file != "src/drop.rs"),
        "removed file's chunks still present in redb after remove_file"
    );
    assert!(
        chunks.iter().any(|c| c.file == "src/keep.rs"),
        "remove_file evicted the wrong file's chunks from redb"
    );

    // Warm-boot a fresh indexer: the removal must survive the restart.
    let restored = make_indexer_with_corpus(&redb_path);
    restored.load_chunks_from_redb().await.unwrap();
    let ids = restored.find_chunk_id("drop.rs", None).await;
    assert!(ids.is_none(), "deleted chunk resurrected on warm-boot");
}

/// Phase 3 migration: a daemon upgraded from a JSON-snapshot build has a
/// populated `chunks.json` and an empty `index.redb`. `migrate_corpus_to_redb`
/// must seed redb so the next restart uses the fast path.
#[tokio::test]
async fn test_corpus_store_migrates_from_json() {
    let dir = tempfile::tempdir().unwrap();
    let json_path = dir.path().join("chunks.json");
    let redb_path = dir.path().join("index.redb");

    // Stage a legacy JSON snapshot via an indexer with no corpus store.
    {
        let legacy = make_indexer();
        legacy
            .add_chunk(raw("a", "src/a.rs", "fn legacy_a() {}"))
            .await
            .unwrap();
        legacy
            .add_chunk(raw("b", "src/b.rs", "fn legacy_b() {}"))
            .await
            .unwrap();
        legacy.save_chunks_to_disk(&json_path).await.unwrap();
    }
    assert!(json_path.exists());

    // Warm-boot path: load the JSON snapshot, then migrate it into redb.
    let idx = make_indexer_with_corpus(&redb_path);
    let n = idx.load_chunks_from_disk(&json_path).await.unwrap();
    assert_eq!(n, 2);
    idx.migrate_corpus_to_redb().await;
    drop(idx);

    // The redb corpus must now hold the migrated chunks, so a subsequent
    // restart can skip the JSON file entirely.
    let restored = make_indexer_with_corpus(&redb_path);
    let m = restored.load_chunks_from_redb().await.unwrap();
    assert_eq!(m, 2, "redb corpus was not seeded by the JSON migration");
}

/// Phase 4: `swap_corpus_store` / `take_corpus_store` give the reindex
/// orchestrator the ability to stage a rebuilt corpus in a temp file and then
/// restore the indexer's durable store — without losing the original.
#[tokio::test]
async fn test_corpus_store_swap_and_take() {
    let dir = tempfile::tempdir().unwrap();
    let live_path = dir.path().join("index.redb");
    let tmp_path = dir.path().join("index.redb.tmp");

    let mut idx = make_indexer_with_corpus(&live_path);
    assert!(idx.has_corpus_store());

    // Stage a fresh tmp corpus, capturing the live one it replaced. The prior
    // store's `Arc` is dropped immediately: redb is single-process-exclusive,
    // and `commit_force_corpus_swap` likewise drops the prior handle before
    // the rename. We only assert its path first.
    let staged = Arc::new(CorpusStore::open_fresh(&tmp_path).unwrap());
    let prev = idx.swap_corpus_store(staged).expect("prior store returned");
    assert_eq!(prev.path(), live_path.as_path());
    drop(prev);

    // Commit a batch — it must land in the *staging* file, not the live one.
    idx.index_files_batch(&[("src/new.rs".into(), "fn brand_new() {}".into())])
        .await
        .unwrap();

    // Take the staging store back out so its Arc can be dropped before a
    // rename (mirrors `commit_force_corpus_swap`).
    let staged_back = idx.take_corpus_store().expect("staging store taken");
    assert_eq!(staged_back.path(), tmp_path.as_path());
    assert!(!idx.has_corpus_store());
    assert!(
        staged_back.chunk_count().unwrap() >= 1,
        "batch did not commit to the staged corpus"
    );
    // Drop the staging handle so the live file can be re-opened below.
    drop(staged_back);

    // The original live file must be untouched — it never saw the new batch.
    let live = CorpusStore::open(&live_path).unwrap();
    assert_eq!(
        live.chunk_count().unwrap(),
        0,
        "live corpus was mutated while a staging corpus was swapped in"
    );
}