//! Integration tests for `lantern` 0.2.3 — stdin ingestion and search.
//!
//! lantern: local-first, provenance-aware semantic search for agent activity.
//! See the crate documentation for details.
use lantern::ingest::{StdinIngestOptions, ingest_stdin, ingest_stdin_with};
use lantern::search::{SearchOptions, search};
use lantern::store::Store;
use tempfile::tempdir;

/// Build a fresh `Store` rooted inside a throwaway temp directory.
///
/// The `TempDir` is returned alongside the store so the backing directory
/// stays alive for the duration of the test that owns it.
fn new_store() -> (tempfile::TempDir, Store) {
    let dir = tempdir().unwrap();
    let store_path = dir.path().join("store");
    let store = Store::initialize(&store_path).unwrap();
    (dir, store)
}

#[test]
fn empty_uri_is_rejected() {
    let (_root, mut store) = new_store();
    // Both a truly empty URI and a whitespace-only one must be refused
    // with the same actionable message.
    for bad_uri in ["", "   "] {
        let err = ingest_stdin(&mut store, bad_uri, None, b"hello").unwrap_err();
        assert!(err.to_string().contains("non-empty --uri"));
    }
}

#[test]
fn basic_plain_text_ingest_is_searchable() {
    let (_root, mut store) = new_store();
    let body = b"lantern notes arrive via stdin";
    let report = ingest_stdin(&mut store, "stdin://note-1", None, body).unwrap();

    // Exactly one source is recorded, with plain-text defaults and no path.
    assert_eq!(report.ingested.len(), 1);
    let source = &report.ingested[0];
    assert_eq!(source.uri, "stdin://note-1");
    assert_eq!(source.kind, "text/plain");
    assert!(source.path.is_none(), "stdin has no filesystem path");
    assert_eq!(source.chunks, 1);
    assert_eq!(source.bytes, 30);

    // The ingested text is immediately findable via full-text search.
    let hits = search(&store, "stdin", SearchOptions::default()).unwrap();
    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert_eq!(hit.uri, "stdin://note-1");
    assert_eq!(hit.kind, "text/plain");
}

#[test]
fn jsonl_kind_routes_through_transcript_extractor() {
    let (_root, mut store) = new_store();
    // Two transcript turns, one JSON object per line.
    let lines = [
        "{\"role\":\"user\",\"content\":\"ask about needles\"}\n",
        "{\"role\":\"assistant\",\"content\":\"needles are in the haystack\"}\n",
    ];
    let payload = lines.concat();

    let report = ingest_stdin(
        &mut store,
        "stdin://session-42",
        Some("application/jsonl"),
        payload.as_bytes(),
    )
    .unwrap();

    // The JSONL kind produces one chunk per transcript line.
    assert_eq!(report.ingested.len(), 1);
    let source = &report.ingested[0];
    assert_eq!(source.kind, "application/jsonl");
    assert_eq!(source.chunks, 2);

    // Kind-filtered search finds the assistant turn, tagged with its role.
    let jsonl_only = SearchOptions {
        kind: Some("application/jsonl".into()),
        ..SearchOptions::default()
    };
    let hits = search(&store, "haystack", jsonl_only).unwrap();
    assert_eq!(hits.len(), 1);
    assert!(hits[0].text.contains("[assistant]"));
}

#[test]
fn reingesting_same_content_is_a_noop() {
    let (_root, mut store) = new_store();
    let body = b"hello world";

    let first = ingest_stdin(&mut store, "stdin://foo", None, body).unwrap();
    assert_eq!(first.ingested.len(), 1);

    // Re-sending the identical payload for the same URI must be skipped,
    // not re-indexed, and the skip reason must say why.
    let second = ingest_stdin(&mut store, "stdin://foo", None, body).unwrap();
    assert!(second.ingested.is_empty());
    assert_eq!(second.skipped.len(), 1);
    let reason = &second.skipped[0].reason;
    assert!(reason.contains("unchanged since last ingest"));
}

#[test]
fn reingesting_changed_content_replaces_chunks() {
    let (_root, mut store) = new_store();

    let first = ingest_stdin(&mut store, "stdin://foo", None, b"old payload sentinel").unwrap();
    let original_id = first.ingested[0].source_id.clone();
    let before = search(&store, "sentinel", SearchOptions::default()).unwrap();
    assert_eq!(before.len(), 1);

    let second = ingest_stdin(
        &mut store,
        "stdin://foo",
        None,
        b"replacement payload marker",
    )
    .unwrap();
    assert_eq!(second.ingested.len(), 1);
    // The source id stays stable across re-ingestion of the same URI.
    assert_eq!(second.ingested[0].source_id, original_id);

    // Stale term gone; new term present.
    let stale = search(&store, "sentinel", SearchOptions::default()).unwrap();
    assert!(stale.is_empty());
    let fresh = search(&store, "marker", SearchOptions::default()).unwrap();
    assert_eq!(fresh.len(), 1);
}

#[test]
fn non_utf8_stdin_is_reported_as_skipped() {
    let (_root, mut store) = new_store();
    // These bytes cannot appear in any valid UTF-8 sequence.
    let binary = [0xff_u8, 0xfe, 0xfd];
    let report = ingest_stdin(&mut store, "stdin://binary", None, &binary).unwrap();
    assert!(report.ingested.is_empty());
    assert_eq!(report.skipped.len(), 1);
    assert!(report.skipped[0].reason.contains("UTF-8"));
}

#[test]
fn different_uris_produce_distinct_sources_even_for_same_payload() {
    let (_root, mut store) = new_store();
    // Same bytes, two URIs: dedup keys on URI, not content alone.
    for uri in ["stdin://a", "stdin://b"] {
        ingest_stdin(&mut store, uri, None, b"shared body").unwrap();
    }

    let sources: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM sources", [], |row| row.get(0))
        .unwrap();
    assert_eq!(sources, 2);
}

#[test]
fn append_mode_preserves_each_batch_as_its_own_source() {
    let (_root, mut store) = new_store();
    let opts = StdinIngestOptions { append: true };

    // Two batches sent under the same base URI.
    let first = ingest_stdin_with(
        &mut store,
        "stdin://session-7",
        None,
        b"first batch body",
        &opts,
    )
    .unwrap();
    let second = ingest_stdin_with(
        &mut store,
        "stdin://session-7",
        None,
        b"second batch body",
        &opts,
    )
    .unwrap();

    assert_eq!(first.ingested.len(), 1);
    assert_eq!(second.ingested.len(), 1);
    assert_ne!(
        first.ingested[0].source_id, second.ingested[0].source_id,
        "append mode must produce distinct source ids per batch"
    );

    // Each batch is stored under a unique URI derived from the base.
    let uris: Vec<String> = store
        .conn()
        .prepare("SELECT uri FROM sources ORDER BY uri")
        .unwrap()
        .query_map([], |row| row.get::<_, String>(0))
        .unwrap()
        .map(Result::unwrap)
        .collect();
    assert_eq!(uris.len(), 2);
    assert_ne!(uris[0], uris[1], "batch uris must be unique");
    for uri in &uris {
        assert!(
            uri.starts_with("stdin://session-7#"),
            "expected batch uri to derive from base, got {uri}"
        );
    }

    // Both batches remain independently searchable.
    for term in ["first", "second"] {
        let hits = search(&store, term, SearchOptions::default()).unwrap();
        assert_eq!(hits.len(), 1);
    }
}

#[test]
fn append_mode_preserves_jsonl_metadata_across_batches() {
    // Append mode + JSONL: per-turn transcript metadata (role, session,
    // turn, tool, timestamp) must survive being split across two batches.
    let (_root, mut store) = new_store();
    let opts = StdinIngestOptions { append: true };
    // NOTE: these raw byte-string literals span source lines on purpose —
    // the continuation lines start at column 0 so the payload contains
    // exactly one JSON object per line plus a trailing newline.
    let batch_one = br#"{"session_id":"sess-9","turn_id":"turn-1","role":"user","timestamp":1700000100,"content":"where are the notes"}
"#;
    let batch_two = br#"{"session_id":"sess-9","turn_id":"turn-2","role":"assistant","timestamp":1700000200,"content":"in the store"}
{"session_id":"sess-9","turn_id":"turn-3","tool_name":"grep","timestamp":1700000201,"content":"grep results"}
"#;

    // Ingest both batches under the same base URI with --append semantics.
    ingest_stdin_with(
        &mut store,
        "stdin://sess-9",
        Some("application/jsonl"),
        batch_one,
        &opts,
    )
    .unwrap();
    ingest_stdin_with(
        &mut store,
        "stdin://sess-9",
        Some("application/jsonl"),
        batch_two,
        &opts,
    )
    .unwrap();

    let source_count: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM sources", [], |r| r.get(0))
        .unwrap();
    assert_eq!(source_count, 2, "each batch must persist as its own source");

    // Pull back every chunk's metadata columns, oldest first, and check
    // them field-by-field against the JSON payloads above.
    let mut stmt = store
        .conn()
        .prepare(
            "SELECT role, session_id, turn_id, tool_name, timestamp_unix
             FROM chunks ORDER BY timestamp_unix",
        )
        .unwrap();
    // Tuple order mirrors the SELECT list:
    // (role, session_id, turn_id, tool_name, timestamp_unix).
    let rows: Vec<(
        Option<String>,
        Option<String>,
        Option<String>,
        Option<String>,
        Option<i64>,
    )> = stmt
        .query_map([], |row| {
            Ok((
                row.get(0)?,
                row.get(1)?,
                row.get(2)?,
                row.get(3)?,
                row.get(4)?,
            ))
        })
        .unwrap()
        .map(Result::unwrap)
        .collect();

    // One chunk per JSONL line across both batches.
    assert_eq!(rows.len(), 3);
    // turn-1: user message from batch one.
    assert_eq!(rows[0].0.as_deref(), Some("user"));
    assert_eq!(rows[0].1.as_deref(), Some("sess-9"));
    assert_eq!(rows[0].2.as_deref(), Some("turn-1"));
    assert_eq!(rows[0].4, Some(1_700_000_100));

    // turn-2: assistant message from batch two.
    assert_eq!(rows[1].0.as_deref(), Some("assistant"));
    assert_eq!(rows[1].2.as_deref(), Some("turn-2"));
    assert_eq!(rows[1].4, Some(1_700_000_200));

    // turn-3: tool output — no role, but tool_name is carried through.
    assert_eq!(rows[2].0, None);
    assert_eq!(rows[2].2.as_deref(), Some("turn-3"));
    assert_eq!(rows[2].3.as_deref(), Some("grep"));
    assert_eq!(rows[2].4, Some(1_700_000_201));
}

#[test]
fn append_mode_does_not_dedup_identical_payloads() {
    let (_root, mut store) = new_store();
    let opts = StdinIngestOptions { append: true };

    let first = ingest_stdin_with(&mut store, "stdin://replay", None, b"same body", &opts).unwrap();
    let second =
        ingest_stdin_with(&mut store, "stdin://replay", None, b"same body", &opts).unwrap();

    assert_eq!(first.ingested.len(), 1);
    assert_eq!(
        second.ingested.len(),
        1,
        "append mode must not treat identical content as unchanged"
    );

    // Both replays persist as separate sources.
    let sources: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM sources", [], |row| row.get(0))
        .unwrap();
    assert_eq!(sources, 2);
}

#[test]
fn default_mode_still_overwrites_same_uri() {
    // Regression guard: the existing default behavior must keep the same URI
    // mapped to a single source — only --append opts into accumulation.
    let (_root, mut store) = new_store();
    let defaults = StdinIngestOptions::default();
    for body in [&b"first"[..], &b"second"[..]] {
        ingest_stdin_with(&mut store, "stdin://once", None, body, &defaults).unwrap();
    }

    let sources: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM sources", [], |row| row.get(0))
        .unwrap();
    assert_eq!(sources, 1);
}