//! End-to-end coverage for the embedding-backed search paths.
//!
//! Every scenario uses [`MockEmbeddingBackend`] so the tests are deterministic
//! and do not require a running Ollama daemon. The mock is a bag-of-tokens
//! hasher — good enough to validate ranking behavior while keeping the tests
//! honest about what they actually check: the plumbing, not retrieval quality.

use std::fs;
use std::sync::Arc;

use rusqlite::Connection;

use lantern::embed::{
    DEFAULT_EMBED_MODEL, EmbedOptions, EmbeddingBackend, MockBackendFactory, MockEmbeddingBackend,
    VEC_MIRROR_DIM, embed_missing_with, embedding_stats, f32s_to_blob,
};
use lantern::ingest::ingest_path;
use lantern::mcp::{EmbedArgs, LanternServer, SearchArgs};
use lantern::search::{
    SemanticOptions, hybrid_search_with, semantic_search_with, vec_semantic_search_with,
};
use lantern::store::Store;
use tempfile::TempDir;

const MOCK_MODEL: &str = "mock-embed-test";

fn setup_with(files: &[(&str, &str)]) -> (TempDir, Store, std::path::PathBuf) {
    let root = tempfile::tempdir().unwrap();
    let store_dir = root.path().join("store");
    let mut store = Store::initialize(&store_dir).unwrap();
    let data = root.path().join("data");
    fs::create_dir_all(&data).unwrap();
    for (name, body) in files {
        fs::write(data.join(name), body).unwrap();
    }
    ingest_path(&mut store, &data).unwrap();
    (root, store, store_dir)
}

fn embed_all(store: &mut Store, backend: &MockEmbeddingBackend) {
    let opts = EmbedOptions {
        model: MOCK_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        limit: None,
    };
    embed_missing_with(store, &opts, backend).unwrap();
}

fn sem_opts(limit: usize) -> SemanticOptions {
    SemanticOptions {
        limit,
        kind: None,
        path_contains: None,
        model: MOCK_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    }
}
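
// The module doc above claims the mock hasher is deterministic; every ranking
// assertion below quietly depends on that. A minimal sanity check, using only
// the `embed` call the suite already exercises in the legacy-store test.
#[test]
fn mock_backend_embeds_deterministically() {
    let backend = MockEmbeddingBackend::new(64);
    let first = backend.embed("rust systems programming").unwrap();
    let second = backend.embed("rust systems programming").unwrap();
    assert_eq!(first.len(), 64, "mock must honor its configured dimension");
    assert_eq!(first, second, "same input must yield the same vector");
}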

#[test]
fn embed_missing_writes_vectors_into_db() {
    let (_root, mut store, _) = setup_with(&[
        ("a.md", "rust systems programming language"),
        ("b.md", "baking sourdough bread at home"),
    ]);

    let backend = MockEmbeddingBackend::new(64);
    let report = embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: MOCK_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();
    assert_eq!(report.embedded, 2, "one chunk per short file");
    assert_eq!(report.failed, 0);
    assert_eq!(report.dim, Some(64));
    assert_eq!(report.model, MOCK_MODEL);

    let stats = embedding_stats(&store).unwrap();
    assert_eq!(stats.len(), 1);
    assert_eq!(stats[0].model, MOCK_MODEL);
    assert_eq!(stats[0].dim, 64);
    assert_eq!(stats[0].count, 2);

    // Re-running is a no-op: nothing new to embed.
    let second = embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: MOCK_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();
    assert_eq!(second.embedded, 0);
    assert_eq!(second.already_had, 2);
}

#[test]
fn embed_missing_dual_writes_into_vec_mirror_for_default_model() {
    // The default-model vec0 mirror is declared with `float[768]`, so the mock
    // backend is sized to match. Every canonical embeddings row for the default
    // model must have a parallel row in `chunks_vec_nomic_768`, keyed by the
    // chunks table's rowid so the mirror can be joined back to provenance.
    let (_root, mut store, _) = setup_with(&[
        ("a.md", "rust systems programming language"),
        ("b.md", "baking sourdough bread at home"),
    ]);

    let backend = MockEmbeddingBackend::new(768);
    let report = embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();
    assert_eq!(report.embedded, 2);
    assert_eq!(report.dim, Some(768));

    let canonical: i64 = store
        .conn()
        .query_row(
            "SELECT COUNT(*) FROM embeddings WHERE model = ?1",
            [DEFAULT_EMBED_MODEL],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(
        canonical, 2,
        "canonical embeddings table must have both rows"
    );

    let mirror: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM chunks_vec_nomic_768", [], |row| {
            row.get(0)
        })
        .unwrap();
    assert_eq!(mirror, 2, "vec mirror must be dual-written");

    // Every mirror row's rowid must map back to a real chunk — this is what
    // lets a later search slice join the ANN result set to provenance.
    let orphans: i64 = store
        .conn()
        .query_row(
            "SELECT COUNT(*) FROM chunks_vec_nomic_768 v
             WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.rowid = v.rowid)",
            [],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(orphans, 0, "mirror rowids must all reference chunks");
}

#[test]
fn embed_missing_skips_vec_mirror_for_non_default_model() {
    // The vec mirror only covers the default model in this slice. A run with
    // a different model must leave the mirror untouched — no accidental writes,
    // no dimension-mismatch errors from vec0.
    let (_root, mut store, _) = setup_with(&[("a.md", "anything at all")]);

    let backend = MockEmbeddingBackend::new(768);
    embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: MOCK_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();

    let mirror: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM chunks_vec_nomic_768", [], |row| {
            row.get(0)
        })
        .unwrap();
    assert_eq!(mirror, 0);
}

#[test]
fn semantic_search_ranks_topically_related_chunks_first() {
    let (_root, mut store, _) = setup_with(&[
        (
            "rust.md",
            "rust is a systems programming language with memory safety",
        ),
        (
            "bread.md",
            "sourdough bread requires flour water salt and patience",
        ),
        (
            "cars.md",
            "electric cars have large batteries and regenerative braking",
        ),
    ]);
    let backend = MockEmbeddingBackend::new(128);
    embed_all(&mut store, &backend);

    let hits =
        semantic_search_with(&store, "systems programming memory", &sem_opts(3), &backend).unwrap();
    assert_eq!(hits.len(), 3, "all three candidates should be returned");
    assert!(
        hits[0].uri.ends_with("/rust.md"),
        "rust doc must rank first; got {}",
        hits[0].uri
    );
    // Scores must be in descending cosine-similarity order.
    assert!(hits[0].score >= hits[1].score);
    assert!(hits[1].score >= hits[2].score);
    assert!(hits[0].score > 0.0);
}
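
// The ranking assertion above leans on one property of the bag-of-tokens
// mock: overlapping tokens push cosine similarity up. A direct check of that
// property, with the cosine computed locally rather than through any lantern
// scoring API (the helper below is ours, not the library's).
#[test]
fn mock_embeddings_reward_token_overlap() {
    fn cosine(a: &[f32], b: &[f32]) -> f32 {
        let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
        let norm = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
        dot / (norm(a) * norm(b))
    }
    let backend = MockEmbeddingBackend::new(128);
    let query = backend.embed("systems programming memory").unwrap();
    let near = backend.embed("systems programming language").unwrap();
    let far = backend.embed("sourdough bread flour water").unwrap();
    assert!(
        cosine(&query, &near) > cosine(&query, &far),
        "shared tokens should raise cosine similarity under the mock"
    );
}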

#[test]
fn semantic_search_respects_limit() {
    let (_root, mut store, _) = setup_with(&[
        ("a.md", "needle one"),
        ("b.md", "needle two"),
        ("c.md", "needle three"),
    ]);
    let backend = MockEmbeddingBackend::new(64);
    embed_all(&mut store, &backend);

    let hits = semantic_search_with(&store, "needle", &sem_opts(2), &backend).unwrap();
    assert_eq!(hits.len(), 2);
}

#[test]
fn semantic_search_empty_query_returns_nothing() {
    let (_root, mut store, _) = setup_with(&[("a.md", "anything")]);
    let backend = MockEmbeddingBackend::new(64);
    embed_all(&mut store, &backend);

    let hits = semantic_search_with(&store, "   ", &sem_opts(10), &backend).unwrap();
    assert!(hits.is_empty());
}

#[test]
fn hybrid_search_blends_keyword_and_semantic_signals() {
    // `rust.md` contains the literal token and a topical match → should win
    // both sides of the blend. `programming.md` is a topical near-miss that
    // the semantic side should still surface over the unrelated `trees.md`.
    let (_root, mut store, _) = setup_with(&[
        (
            "rust.md",
            "rust is a systems programming language with lifetimes",
        ),
        (
            "programming.md",
            "systems programming languages balance safety and performance",
        ),
        (
            "trees.md",
            "redwood trees grow tall along the northern coast",
        ),
    ]);
    let backend = MockEmbeddingBackend::new(128);
    embed_all(&mut store, &backend);

    let hits = hybrid_search_with(&store, "rust programming", &sem_opts(3), &backend).unwrap();
    assert!(!hits.is_empty());
    assert!(
        hits[0].uri.ends_with("/rust.md"),
        "literal-token hit should lead hybrid ranking; got {}",
        hits[0].uri
    );
    // Blended scores should be monotonically non-increasing.
    for w in hits.windows(2) {
        assert!(w[0].score >= w[1].score);
    }
}

#[test]
fn hybrid_search_rrf_rewards_cross_list_matches() {
    // RRF replaces the old weighted blend: a chunk appearing in both the
    // keyword list and the semantic list must outrank one that appears in
    // only a single list, because it receives two rank contributions. The
    // test also verifies the two other RRF guarantees we care about: a pure
    // literal-token hit still ranks highly, and a semantic-only hit (no
    // literal token) still surfaces in the combined output rather than being
    // dropped by a zero keyword score.
    let (_root, mut store, _) = setup_with(&[
        (
            "both.md",
            "rust systems programming language with memory safety",
        ),
        (
            "literal.md",
            "rust unrelated gibberish lorem ipsum dolor sit amet",
        ),
        (
            "topical.md",
            "systems programming language tradeoffs at runtime",
        ),
    ]);
    let backend = MockEmbeddingBackend::new(128);
    embed_all(&mut store, &backend);

    let hits = hybrid_search_with(&store, "rust programming", &sem_opts(10), &backend).unwrap();
    let uri_of = |idx: usize| hits[idx].uri.clone();
    let rank_of = |needle: &str| {
        hits.iter()
            .position(|h| h.uri.ends_with(needle))
            .unwrap_or_else(|| {
                panic!(
                    "expected {needle} in hybrid hits; got {:?}",
                    hits.iter().map(|h| h.uri.clone()).collect::<Vec<_>>()
                )
            })
    };

    let both = rank_of("/both.md");
    let literal = rank_of("/literal.md");
    let topical = rank_of("/topical.md");

    assert!(
        both < literal && both < topical,
        "chunk matching both lists should outrank one-sided hits; got order: {}, {}, {}",
        uri_of(0),
        uri_of(1),
        uri_of(2),
    );
    assert!(
        hits.iter().any(|h| h.uri.ends_with("/topical.md")),
        "semantic-only hit must still appear in hybrid results"
    );
    // RRF scores are strictly positive and monotonically non-increasing.
    assert!(hits[0].score > 0.0);
    for w in hits.windows(2) {
        assert!(w[0].score >= w[1].score);
    }
}
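
// For reference, the arithmetic behind the cross-list guarantee tested above,
// as a standalone sketch: each list a document appears in contributes
// 1 / (k + rank), so two mid-rank appearances beat a single top-rank one.
// k = 60 is the conventional RRF constant; lantern's exact k is not asserted
// here, only the shape of the formula.
#[test]
fn rrf_arithmetic_favors_dual_list_hits() {
    // `ranks` holds a document's zero-based rank in each list it appears in.
    fn rrf_score(ranks: &[usize], k: f64) -> f64 {
        ranks.iter().map(|&r| 1.0 / (k + 1.0 + r as f64)).sum()
    }
    let k = 60.0;
    let both = rrf_score(&[1, 1], k); // second place in both lists
    let literal_only = rrf_score(&[0], k); // first place, keyword list only
    let topical_only = rrf_score(&[0], k); // first place, semantic list only
    assert!(both > literal_only && both > topical_only);
}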

#[test]
fn vec_semantic_search_matches_brute_force_order_for_default_model() {
    // Parity test: on a tiny store, the opt-in vec0-backed helper must return
    // the same chunks in the same order as the brute-force path, since they
    // read the same vectors and rank by the same metric. The mock backend
    // must be sized to the mirror's declared dim (768) so dual-writes into
    // `chunks_vec_nomic_768` actually happen.
    // Each doc shares a distinct number of query tokens so the ranking is not
    // tie-ridden — otherwise brute-force's insertion order and vec0's scan
    // order can legally disagree on a zero-cosine tail without either being
    // wrong.
    let (_root, mut store, _) = setup_with(&[
        (
            "rust.md",
            "rust is a systems programming language with memory safety",
        ),
        ("prog.md", "other systems programming concepts"),
        ("mem.md", "memory considerations at runtime only"),
    ]);
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();

    let opts = SemanticOptions {
        limit: 3,
        kind: None,
        path_contains: None,
        model: DEFAULT_EMBED_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    };
    let query = "systems programming memory safety";
    let brute = semantic_search_with(&store, query, &opts, &backend).unwrap();
    let via_vec = vec_semantic_search_with(&store, query, &opts, &backend).unwrap();

    assert!(!brute.is_empty(), "brute-force should rank something");
    assert_eq!(
        brute.len(),
        via_vec.len(),
        "vec helper must return the same number of hits"
    );
    let brute_ids: Vec<&str> = brute.iter().map(|h| h.chunk_id.as_str()).collect();
    let vec_ids: Vec<&str> = via_vec.iter().map(|h| h.chunk_id.as_str()).collect();
    assert_eq!(
        brute_ids, vec_ids,
        "vec-backed ordering must match brute-force for the default model"
    );
    assert!(
        via_vec[0].uri.ends_with("/rust.md"),
        "rust doc should rank first; got {}",
        via_vec[0].uri
    );
}

#[test]
fn vec_semantic_search_rejects_non_default_model() {
    // The vec mirror is defined for the default model only; calling the helper
    // with any other model must fail loudly rather than silently returning the
    // wrong result set (the mirror would be empty for that model anyway).
    let (_root, mut store, _) = setup_with(&[("a.md", "anything")]);
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: MOCK_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();

    let opts = SemanticOptions {
        limit: 3,
        kind: None,
        path_contains: None,
        model: MOCK_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    };
    let err = vec_semantic_search_with(&store, "anything", &opts, &backend).unwrap_err();
    let msg = format!("{err}");
    assert!(
        msg.contains(DEFAULT_EMBED_MODEL),
        "error should name the supported model; got {msg}"
    );
}

#[test]
fn semantic_search_reports_missing_model_with_available_models() {
    // The preflight check should fail before querying the backend when the
    // requested model has no stored embeddings, and it should list the models
    // that *are* present in the store.
    let (_root, mut store, _) = setup_with(&[("a.md", "anything")]);
    let backend = MockEmbeddingBackend::new(64);
    embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: MOCK_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();

    let opts = SemanticOptions {
        limit: 3,
        kind: None,
        path_contains: None,
        model: DEFAULT_EMBED_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    };
    let err = semantic_search_with(&store, "anything", &opts, &backend).unwrap_err();
    let msg = format!("{err}");
    assert!(
        msg.contains(&format!(
            "no stored embeddings for model '{DEFAULT_EMBED_MODEL}'"
        )),
        "error should name the missing model; got {msg}"
    );
    assert!(
        msg.contains(&format!("available models: {MOCK_MODEL}")),
        "error should list available models; got {msg}"
    );
}

#[test]
fn semantic_search_with_default_model_still_honors_filters() {
    // The auto-routing helper should only kick in for the unfiltered default
    // model. As soon as the caller asks for a kind filter, the search must stay
    // on the brute-force path so the filter is respected.
    let (_root, mut store, _) = setup_with(&[
        ("rust.md", "rust systems programming language with safety"),
        (
            "notes.json",
            "{\"note\":\"rust systems programming language with safety\"}",
        ),
    ]);
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    embed_missing_with(
        &mut store,
        &EmbedOptions {
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            limit: None,
        },
        &backend,
    )
    .unwrap();

    let opts = SemanticOptions {
        limit: 5,
        kind: Some("text/markdown".to_string()),
        path_contains: None,
        model: DEFAULT_EMBED_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        instruction: None,
    };
    let hits = semantic_search_with(&store, "rust systems programming", &opts, &backend).unwrap();
    assert!(!hits.is_empty());
    assert!(
        hits.iter().all(|hit| hit.kind == "text/markdown"),
        "filtered semantic search must not leak non-markdown hits: {:?}",
        hits.iter().map(|hit| &hit.kind).collect::<Vec<_>>()
    );
    assert!(
        hits.iter().any(|hit| hit.uri.ends_with("/rust.md")),
        "markdown doc should remain in the result set"
    );
}

#[test]
fn legacy_store_migrates_and_backfills_vec_mirror() {
    // Regression test for the release blocker found in review: existing stores
    // with default-model embeddings must populate the vec0 mirror on upgrade,
    // otherwise auto-routed semantic search would return nothing.
    let root = tempfile::tempdir().unwrap();
    let store_dir = root.path().join("store");
    fs::create_dir_all(&store_dir).unwrap();
    let db_path = store_dir.join(lantern::store::DB_FILENAME);
    let conn = Connection::open(&db_path).unwrap();
    conn.execute_batch(
        "CREATE TABLE sources (
            id TEXT PRIMARY KEY,
            uri TEXT NOT NULL,
            path TEXT,
            kind TEXT NOT NULL,
            bytes INTEGER NOT NULL,
            content_sha256 TEXT NOT NULL,
            mtime_unix INTEGER,
            ingested_at INTEGER NOT NULL
        );
         CREATE TABLE chunks (
            id TEXT PRIMARY KEY,
            source_id TEXT NOT NULL,
            ordinal INTEGER NOT NULL,
            byte_start INTEGER NOT NULL,
            byte_end INTEGER NOT NULL,
            char_count INTEGER NOT NULL,
            text TEXT NOT NULL,
            sha256 TEXT NOT NULL,
            created_at INTEGER NOT NULL,
            UNIQUE(source_id, ordinal)
        );
         CREATE TABLE embeddings (
            chunk_id TEXT PRIMARY KEY,
            model TEXT NOT NULL,
            dim INTEGER NOT NULL,
            embedding BLOB NOT NULL,
            created_at INTEGER NOT NULL
        );
         PRAGMA user_version = 3;",
    )
    .unwrap();
    conn.execute(
        "INSERT INTO sources (id, uri, path, kind, bytes, content_sha256, mtime_unix, ingested_at)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
        rusqlite::params![
            "s1",
            "file:///tmp/rust.md",
            Some("/tmp/rust.md"),
            "text/markdown",
            31_i64,
            "sha",
            Option::<i64>::None,
            1_i64,
        ],
    )
    .unwrap();
    conn.execute(
        "INSERT INTO chunks (id, source_id, ordinal, byte_start, byte_end, char_count, text, sha256, created_at)
         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)",
        rusqlite::params![
            "c1",
            "s1",
            0_i64,
            0_i64,
            31_i64,
            31_i64,
            "rust systems programming language",
            "sha",
            1_i64,
        ],
    )
    .unwrap();
    let backend = MockEmbeddingBackend::new(VEC_MIRROR_DIM);
    let embedding = backend.embed("rust systems programming language").unwrap();
    conn.execute(
        "INSERT INTO embeddings (chunk_id, model, dim, embedding, created_at)
         VALUES (?1, ?2, ?3, ?4, ?5)",
        rusqlite::params![
            "c1",
            DEFAULT_EMBED_MODEL,
            VEC_MIRROR_DIM as i64,
            f32s_to_blob(&embedding),
            1_i64,
        ],
    )
    .unwrap();
    drop(conn);

    let store = Store::open(&store_dir).unwrap();
    let mirror_count: i64 = store
        .conn()
        .query_row(
            &format!("SELECT count(*) FROM {}", lantern::store::VEC_MIRROR_TABLE),
            [],
            |row| row.get(0),
        )
        .unwrap();
    assert_eq!(mirror_count, 1);

    let hits = semantic_search_with(
        &store,
        "rust systems programming language",
        &SemanticOptions {
            limit: 5,
            kind: None,
            path_contains: None,
            model: DEFAULT_EMBED_MODEL.to_string(),
            ollama_url: "http://mock".to_string(),
            instruction: None,
        },
        &backend,
    )
    .unwrap();
    assert!(!hits.is_empty());
    assert!(hits[0].uri.ends_with("/rust.md"));
}

#[test]
fn mcp_server_drives_semantic_search_through_injected_factory() {
    // End-to-end MCP smoke test: build a server with the mock factory, then
    // use its synchronous entry points (the same bodies the #[tool] methods
    // call) to embed and search. Proves the factory is wired through the
    // MCP dispatch path, not just through direct library calls.
    let (_root, store, store_dir) = setup_with(&[
        ("a.md", "rust programming with borrow checker and traits"),
        ("b.md", "baking sourdough at home on the weekend"),
    ]);
    drop(store);

    let factory = Arc::new(MockBackendFactory::new(128));
    let server = LanternServer::with_factory(store_dir, factory);

    let report = server
        .embed_sync(EmbedArgs {
            model: Some(MOCK_MODEL.to_string()),
            ollama_url: Some("http://mock".to_string()),
            limit: None,
        })
        .unwrap();
    assert_eq!(report.embedded, 2);
    assert_eq!(report.dim, Some(128));

    let resp = server
        .search_sync(SearchArgs {
            query: "rust programming".to_string(),
            limit: Some(5),
            kind: None,
            path: None,
            mode: Some("semantic".to_string()),
            model: Some(MOCK_MODEL.to_string()),
            ollama_url: Some("http://mock".to_string()),
            instruction: None,
        })
        .unwrap();
    let results = resp
        .get("results")
        .and_then(|v| v.as_array())
        .expect("results array");
    assert!(!results.is_empty(), "semantic search returned no hits");
    let top_uri = results[0]
        .get("uri")
        .and_then(|v| v.as_str())
        .unwrap_or_default();
    assert!(
        top_uri.ends_with("/a.md"),
        "rust doc should rank first; got {top_uri}"
    );

    // Hybrid path through MCP must also work end-to-end.
    let hybrid = server
        .search_sync(SearchArgs {
            query: "rust programming".to_string(),
            limit: Some(5),
            kind: None,
            path: None,
            mode: Some("hybrid".to_string()),
            model: Some(MOCK_MODEL.to_string()),
            ollama_url: Some("http://mock".to_string()),
            instruction: None,
        })
        .unwrap();
    assert!(
        hybrid
            .get("results")
            .and_then(|v| v.as_array())
            .is_some_and(|arr| !arr.is_empty())
    );
}