lantern 0.3.0

Local-first, provenance-aware semantic search for agent activity
Documentation
use std::fs;

use lantern::embed::{EmbedOptions, MockEmbeddingBackend, embed_missing_with};
use lantern::ingest::ingest_path;
use lantern::search::{SearchOptions, SemanticOptions, hybrid_search_with, search};
use lantern::store::Store;
use rusqlite::params;
use tempfile::tempdir;

/// Creates a temp directory holding a freshly initialized store (`store/`)
/// and a `data/` directory populated from `files`, then ingests `data/`.
///
/// Each `(name, body)` pair becomes one file under `data/`. The `TempDir`
/// handle is returned next to the store so the caller can keep it bound for
/// the duration of the test. Any setup failure panics via `unwrap`.
fn setup_store_with(files: &[(&str, &str)]) -> (tempfile::TempDir, Store) {
    let root = tempdir().unwrap();
    let store_dir = root.path().join("store");
    let data_dir = root.path().join("data");

    let mut store = Store::initialize(&store_dir).unwrap();

    fs::create_dir_all(&data_dir).unwrap();
    for &(file_name, contents) in files {
        let target = data_dir.join(file_name);
        fs::write(target, contents).unwrap();
    }

    ingest_path(&mut store, &data_dir).unwrap();
    (root, store)
}

/// With one chunk freshly touched and the other aged deep into the decay
/// tail, a 0.5 confidence floor must keep the fresh chunk and drop the
/// ancient, never-accessed one.
#[test]
fn min_confidence_drops_low_confidence_hits() {
    let corpus = [
        ("fresh.md", "Lanterns glow in the dark forest."),
        ("stale.md", "Lanterns are useful in old mines as well."),
    ];
    let (_root, store) = setup_store_with(&corpus);

    // Sanity check: with no floor, both documents are hits for the query.
    let unfiltered = search(&store, "lantern", SearchOptions::default()).unwrap();
    assert_eq!(unfiltered.len(), 2, "both chunks should match on 'lantern'");

    // Open a wide confidence gap between the two chunks. The fresh row is
    // stamped with `unixepoch('now')`, which matches the clock
    // compute_confidence reads, so it really is fresh; the stale row is
    // pushed to the start of the epoch with no recorded access at all.
    let mark_fresh = "UPDATE chunks
             SET timestamp_unix = unixepoch('now'),
                 last_accessed_at = unixepoch('now'),
                 access_count = 0
             WHERE source_id = (SELECT id FROM sources WHERE uri LIKE '%fresh.md')";
    let mark_stale = "UPDATE chunks
             SET timestamp_unix = 1,
                 last_accessed_at = NULL,
                 access_count = 0
             WHERE source_id = (SELECT id FROM sources WHERE uri LIKE '%stale.md')";
    for statement in [mark_fresh, mark_stale] {
        store.conn().execute(statement, []).unwrap();
    }

    let floor_opts = SearchOptions {
        min_confidence: Some(0.5),
        ..SearchOptions::default()
    };
    let filtered = search(&store, "lantern", floor_opts).unwrap();

    assert_eq!(
        filtered.len(),
        1,
        "stale chunk should be dropped by a 0.5 floor; got {filtered:#?}"
    );
    let survivor = &filtered[0];
    assert!(survivor.uri.ends_with("/fresh.md"));
    assert!(survivor.confidence >= 0.5);
}

/// A filter-dropped chunk must not have its access metadata bumped — that is
/// the whole point of enforcing the floor before `bump_access_metadata`.
///
/// The test first shows, without a floor, that the query really matches the
/// chunk. It then resets the access metadata and repeats the search with a
/// 0.99 floor, so the empty result can only come from the confidence filter
/// and not from a query that never matched in the first place.
#[test]
fn filtered_hits_do_not_count_as_retrievals() {
    let (_root, store) = setup_store_with(&[("stale.md", "Lanterns are ancient history.")]);

    // Guard against a vacuous pass: the chunk must be a genuine hit when no
    // floor is applied. This unfloored search may bump access metadata, which
    // is why it runs *before* the reset below.
    let baseline = search(&store, "lantern", SearchOptions::default()).unwrap();
    assert_eq!(
        baseline.len(),
        1,
        "the chunk should match on 'lantern' when no floor is set"
    );

    // Age the chunk to the start of the epoch and wipe every trace of prior
    // access (including the baseline search above), so the assertions at the
    // end observe only what the floored search does.
    store
        .conn()
        .execute(
            "UPDATE chunks
             SET timestamp_unix = 1,
                 last_accessed_at = NULL,
                 access_count = 0,
                 access_decay_at = NULL",
            [],
        )
        .unwrap();

    let chunk_id: String = store
        .conn()
        .query_row("SELECT id FROM chunks LIMIT 1", [], |row| row.get(0))
        .unwrap();

    // The chunk matches the query, but the 0.99 floor is expected to drop it
    // because it is ancient and has never been accessed.
    let hits = search(
        &store,
        "lantern",
        SearchOptions {
            min_confidence: Some(0.99),
            ..SearchOptions::default()
        },
    )
    .unwrap();
    assert!(hits.is_empty(), "floor of 0.99 should drop the stale chunk");

    // The dropped chunk must look exactly as it did right after the reset.
    let (access_count, last_accessed_at): (i64, Option<i64>) = store
        .conn()
        .query_row(
            "SELECT access_count, last_accessed_at FROM chunks WHERE id = ?1",
            params![chunk_id],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .unwrap();
    assert_eq!(
        access_count, 0,
        "filtered-out chunks should not count as retrievals"
    );
    assert_eq!(
        last_accessed_at, None,
        "filtered-out chunks should not record a retrieval timestamp"
    );
}

/// Hybrid search has to apply the confidence floor *after* the keyword and
/// semantic scores are blended, and bump access metadata only for the hits
/// that survive. Earlier code routed both passes through the public bumping
/// helpers, so a chunk the floor later removed was still recorded as a
/// retrieval.
#[test]
fn hybrid_search_does_not_bump_filtered_hits() {
    const MOCK_MODEL: &str = "mock-embed-test";

    let corpus = [
        ("fresh.md", "Lanterns glow in the dark forest."),
        ("stale.md", "Lanterns are useful in old mines as well."),
    ];
    let (_root, mut store) = setup_store_with(&corpus);

    // Embed every chunk with the mock backend so the semantic pass has
    // vectors to work with.
    let backend = MockEmbeddingBackend::new(64);
    let embed_opts = EmbedOptions {
        model: MOCK_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        limit: None,
    };
    embed_missing_with(&mut store, &embed_opts, &backend).unwrap();

    // Make one chunk brand new and the other ancient, both with clean
    // access metadata.
    let mark_fresh = "UPDATE chunks
             SET timestamp_unix = unixepoch('now'),
                 last_accessed_at = unixepoch('now'),
                 access_count = 0,
                 access_decay_at = NULL
             WHERE source_id = (SELECT id FROM sources WHERE uri LIKE '%fresh.md')";
    let mark_stale = "UPDATE chunks
             SET timestamp_unix = 1,
                 last_accessed_at = NULL,
                 access_count = 0,
                 access_decay_at = NULL
             WHERE source_id = (SELECT id FROM sources WHERE uri LIKE '%stale.md')";
    for statement in [mark_fresh, mark_stale] {
        store.conn().execute(statement, []).unwrap();
    }

    // Remember the stale chunk's id so its metadata can be inspected after
    // the search.
    let stale_chunk_id: String = store
        .conn()
        .query_row(
            "SELECT id FROM chunks WHERE source_id = (SELECT id FROM sources WHERE uri LIKE '%stale.md')",
            [],
            |row| row.get(0),
        )
        .unwrap();

    let opts = SemanticOptions {
        model: MOCK_MODEL.to_string(),
        ollama_url: "http://mock".to_string(),
        limit: 10,
        min_confidence: Some(0.5),
        kind: None,
        path_contains: None,
        session_id: None,
        instruction: None,
    };
    let hits = hybrid_search_with(&store, "lantern", &opts, &backend).unwrap();

    let stale_present = hits.iter().any(|hit| hit.uri.ends_with("/stale.md"));
    assert!(
        !stale_present,
        "stale chunk must be dropped by the 0.5 floor; got {hits:#?}"
    );
    let below_floor = hits.iter().any(|hit| !(hit.confidence >= 0.5));
    assert!(
        !below_floor,
        "every survivor must clear the floor; got {hits:#?}"
    );

    // The dropped chunk must still carry the pristine metadata set above.
    let (access_count, last_accessed_at): (i64, Option<i64>) = store
        .conn()
        .query_row(
            "SELECT access_count, last_accessed_at FROM chunks WHERE id = ?1",
            params![stale_chunk_id],
            |row| {
                let count: i64 = row.get(0)?;
                let accessed: Option<i64> = row.get(1)?;
                Ok((count, accessed))
            },
        )
        .unwrap();
    assert_eq!(
        access_count, 0,
        "filtered-out chunk must not be bumped by the inner keyword/semantic passes"
    );
    assert_eq!(
        last_accessed_at, None,
        "filtered-out chunk must not record a retrieval timestamp"
    );
}

/// Control case: with `min_confidence` left at its `None` default, a matching
/// chunk is returned *and* its access metadata is bumped. This guards against
/// the confidence filter silently switching off the normal retrieval path.
#[test]
fn no_floor_preserves_existing_bump_behavior() {
    let corpus = [("a.md", "Lanterns glow in the dark.")];
    let (_root, store) = setup_store_with(&corpus);

    let hits = search(&store, "lantern", SearchOptions::default()).unwrap();
    assert_eq!(hits.len(), 1);
    let chunk_id = hits[0].chunk_id.clone();

    // Read back what the search recorded for the returned chunk.
    let (access_count, last_accessed_at): (i64, Option<i64>) = store
        .conn()
        .query_row(
            "SELECT access_count, last_accessed_at FROM chunks WHERE id = ?1",
            params![chunk_id],
            |row| {
                let count: i64 = row.get(0)?;
                let accessed: Option<i64> = row.get(1)?;
                Ok((count, accessed))
            },
        )
        .unwrap();

    assert_eq!(access_count, 1);
    assert!(last_accessed_at.is_some());
}