//! lantern 0.2.3
//!
//! Integration tests for local-first, provenance-aware semantic search over
//! agent activity: JSONL transcript ingestion, chunking, and search.
use std::fs;

use lantern::ingest::ingest_path;
use lantern::search::{SearchOptions, search};
use lantern::store::Store;
use rusqlite::params;
use tempfile::tempdir;

/// Sample JSONL transcript shared by the tests below.
///
/// Six physical lines, of which exactly three are expected to yield chunks
/// (see the chunk-count assertion in
/// `jsonl_ingest_produces_one_chunk_per_extractable_line`): the user message,
/// the assistant message (two text blocks joined into one chunk), and the
/// bare `{"text": ...}` line. The blank line, the malformed JSON line, and
/// the tool message with empty content yield nothing.
const TRANSCRIPT: &str = concat!(
    "{\"role\":\"user\",\"content\":\"Where should I keep the lantern?\"}\n",
    // Assistant content is an array of text blocks spread over several
    // source lines; it is still a single JSONL line once concatenated.
    "{\"role\":\"assistant\",\"content\":[",
    "{\"type\":\"text\",\"text\":\"Hang it near the door.\"},",
    "{\"type\":\"text\",\"text\":\"It should catch the evening breeze.\"}",
    "]}\n",
    // Blank line: nothing to extract.
    "\n",
    // Invalid JSON: expected to be skipped, not to abort ingestion.
    "{\"malformed line that should be skipped\n",
    // Valid JSON but empty content: no chunk.
    "{\"role\":\"tool\",\"content\":\"\"}\n",
    // Alternate `text` field shape, no role: still extractable.
    "{\"text\":\"rust is a systems language\"}\n",
);

/// Write [`TRANSCRIPT`] to a temp file and ingest it into a fresh store.
///
/// Returns the temp dir (must stay alive for the duration of the test, or
/// the backing files vanish), the populated store, and the transcript path.
fn ingest_transcript() -> (tempfile::TempDir, Store, std::path::PathBuf) {
    let dir = tempdir().unwrap();
    let transcript_path = dir.path().join("session.jsonl");
    fs::write(&transcript_path, TRANSCRIPT).unwrap();
    let mut store = Store::initialize(&dir.path().join("store")).unwrap();
    ingest_path(&mut store, &transcript_path).unwrap();
    (dir, store, transcript_path)
}

#[test]
fn jsonl_ingest_produces_one_chunk_per_extractable_line() {
    // One source row per ingested file; one chunk per extractable line.
    let (_root, store, _file) = ingest_transcript();
    let conn = store.conn();

    let sources: i64 = conn
        .query_row("SELECT COUNT(*) FROM sources", [], |r| r.get(0))
        .unwrap();
    assert_eq!(sources, 1);

    let chunks: i64 = conn
        .query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
        .unwrap();
    assert_eq!(chunks, 3, "3 extractable lines out of 6");

    let kind: String = conn
        .query_row("SELECT kind FROM sources", [], |r| r.get(0))
        .unwrap();
    assert_eq!(kind, "application/jsonl");
}

#[test]
fn role_prefix_is_preserved_and_searchable() {
    let (_root, store, _file) = ingest_transcript();

    // Assistant hit: both text blocks of the message live in one chunk,
    // prefixed with the speaker role.
    let hits = search(&store, "breeze", SearchOptions::default()).unwrap();
    assert_eq!(hits.len(), 1);
    let text = &hits[0].text;
    assert!(text.contains("[assistant]"));
    assert!(text.contains("Hang it near the door"));
    assert!(
        text.contains("evening breeze"),
        "multi-block content should be joined"
    );

    // User hit: role prefix leads the chunk text.
    let hits = search(&store, "lantern", SearchOptions::default()).unwrap();
    assert_eq!(hits.len(), 1);
    assert!(hits[0].text.starts_with("[user]"));
}

#[test]
fn alternate_text_field_lines_are_indexed() {
    // A bare `{"text": ...}` line (no role) is indexed verbatim.
    let (_root, store, _file) = ingest_transcript();
    let results = search(&store, "systems", SearchOptions::default()).unwrap();
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].text, "rust is a systems language");
}

#[test]
fn chunk_byte_ranges_match_their_source_line() {
    // Every chunk's (byte_start, byte_end) must slice TRANSCRIPT back to a
    // complete JSONL line, newline included.
    let (_root, store, _file) = ingest_transcript();
    let conn = store.conn();

    let source_id: String = conn
        .query_row("SELECT id FROM sources", [], |r| r.get(0))
        .unwrap();
    let mut stmt = conn
        .prepare(
            "SELECT byte_start, byte_end FROM chunks
             WHERE source_id = ?1 ORDER BY ordinal",
        )
        .unwrap();
    let rows = stmt
        .query_map(params![source_id], |row| {
            Ok((row.get::<_, i64>(0)?, row.get::<_, i64>(1)?))
        })
        .unwrap();

    for row in rows {
        let (start, end) = row.unwrap();
        assert!(end > start);
        let slice = &TRANSCRIPT[start as usize..end as usize];
        assert!(
            slice.ends_with('\n'),
            "each chunk maps to a full JSONL line"
        );
        assert!(slice.trim_start().starts_with('{'));
    }
}

#[test]
fn ingests_directory_mixing_jsonl_and_markdown() {
    // A directory ingest picks up both markdown and JSONL files, and search
    // hits carry the originating file's uri and kind.
    let root = tempdir().unwrap();
    let data = root.path().join("data");
    fs::create_dir_all(&data).unwrap();
    fs::write(data.join("note.md"), "# markdown\n\nSome body.\n").unwrap();
    fs::write(
        data.join("session.jsonl"),
        "{\"role\":\"user\",\"content\":\"needle in jsonl\"}\n",
    )
    .unwrap();

    let mut store = Store::initialize(&root.path().join("store")).unwrap();
    let report = ingest_path(&mut store, &data).unwrap();
    assert_eq!(report.ingested.len(), 2);

    let results = search(&store, "needle", SearchOptions::default()).unwrap();
    assert_eq!(results.len(), 1);
    assert!(results[0].uri.ends_with("/session.jsonl"));
    assert_eq!(results[0].kind, "application/jsonl");
}

#[test]
fn empty_or_non_transcript_jsonl_ingests_with_no_chunks() {
    // JSONL lines with no extractable text still register the source, but
    // produce zero chunks — both in the report and in the chunks table.
    let root = tempdir().unwrap();
    let path = root.path().join("empty.jsonl");
    fs::write(&path, "{\"timestamp\":1234}\n{\"foo\":\"bar\"}\n").unwrap();

    let mut store = Store::initialize(&root.path().join("store")).unwrap();
    let report = ingest_path(&mut store, &path).unwrap();
    assert_eq!(report.ingested.len(), 1);
    assert_eq!(report.ingested[0].chunks, 0);

    let chunks: i64 = store
        .conn()
        .query_row("SELECT COUNT(*) FROM chunks", [], |r| r.get(0))
        .unwrap();
    assert_eq!(chunks, 0);
}