lantern 0.2.2

Local-first, provenance-aware semantic search for agent activity
Documentation
//! Machine-readable dump of indexed content.
//!
//! `export` renders the full contents of the local store (or a filtered
//! subset) as a single JSON document. Every exported chunk carries its
//! provenance fields (byte range, char count, sha256, source id) so the dump
//! is a self-contained snapshot an agent can reason over offline.

use anyhow::{Context, Result};
use rusqlite::{Connection, params};
use serde::Serialize;
use std::path::Path;

use crate::search;
use crate::store::Store;

/// Top-level document produced by [`export`] and serialised by [`write_json`].
///
/// Fields are serialised in declaration order.
#[derive(Debug, Clone, Serialize)]
pub struct Export {
    /// Schema version reported by the store at export time.
    pub schema_version: i64,
    /// Wall-clock time the export was produced, in whole seconds since the
    /// Unix epoch.
    pub exported_at: i64,
    /// Copy of the filter that was used to select `sources`.
    pub filter: FilterSnapshot,
    /// Selected sources, each carrying its chunks.
    pub sources: Vec<ExportedSource>,
}

/// Serialisable record of the [`ExportFilter`] an export was produced with.
#[derive(Debug, Clone, Serialize)]
pub struct FilterSnapshot {
    /// The `path_contains` filter as supplied by the caller, if any.
    pub path: Option<String>,
    /// The `query` filter as supplied by the caller (the raw text, not the
    /// normalised FTS expression), if any.
    pub query: Option<String>,
}

/// One row of the `sources` table together with all of its chunks.
#[derive(Debug, Clone, Serialize)]
pub struct ExportedSource {
    /// Value of the `id` column of the source row.
    pub source_id: String,
    /// Value of the `uri` column.
    pub uri: String,
    /// Value of the `path` column; nullable in the store.
    pub path: Option<String>,
    /// Value of the `kind` column.
    pub kind: String,
    /// Value of the `bytes` column (presumably the source's size in bytes —
    /// confirm against the ingest code).
    pub bytes: i64,
    /// Value of the `content_sha256` column.
    pub content_sha256: String,
    /// Value of the `mtime_unix` column; nullable in the store.
    pub mtime_unix: Option<i64>,
    /// Value of the `ingested_at` column.
    pub ingested_at: i64,
    /// Chunks belonging to this source, ordered by `ordinal`.
    pub chunks: Vec<ExportedChunk>,
}

/// One row of the `chunks` table, including its provenance fields.
///
/// The optional metadata fields (`role`, `session_id`, `turn_id`,
/// `tool_name`, `timestamp_unix`) are left out of the JSON entirely when they
/// are `None` instead of being emitted as `null`.
#[derive(Debug, Clone, Serialize)]
pub struct ExportedChunk {
    /// Value of the `id` column of the chunk row.
    pub chunk_id: String,
    /// Position of the chunk within its source; chunks are exported in
    /// ascending `ordinal` order.
    pub ordinal: i64,
    /// Start of the chunk's byte range within the source.
    pub byte_start: i64,
    /// End of the chunk's byte range within the source (NOTE(review): whether
    /// this bound is inclusive is not visible here — confirm).
    pub byte_end: i64,
    /// Value of the `char_count` column.
    pub char_count: i64,
    /// Value of the `sha256` column for this chunk.
    pub sha256: String,
    /// The chunk text itself.
    pub text: String,
    /// Value of the nullable `role` column.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub role: Option<String>,
    /// Value of the nullable `session_id` column.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub session_id: Option<String>,
    /// Value of the nullable `turn_id` column.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub turn_id: Option<String>,
    /// Value of the nullable `tool_name` column.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_name: Option<String>,
    /// Value of the nullable `timestamp_unix` column.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timestamp_unix: Option<i64>,
}

/// Selection criteria for [`export`].
///
/// The default value (both fields `None`) selects every source. When both
/// fields are set a source must satisfy both to be exported.
#[derive(Debug, Clone, Default)]
pub struct ExportFilter {
    /// Match sources whose `uri` or `path` contains this substring.
    pub path_contains: Option<String>,
    /// Restrict to sources with at least one chunk matching this FTS query.
    ///
    /// A query that normalises to an empty FTS expression selects nothing.
    pub query: Option<String>,
}

/// Builds an [`Export`] of every source in `store` that passes `filter`.
///
/// Source ids are selected first, then each source is loaded together with
/// its chunks. The returned document also records the store's schema version,
/// the current Unix time and a copy of the filter.
///
/// # Errors
///
/// Fails if selecting the ids, loading any source, or reading the schema
/// version fails.
pub fn export(store: &Store, filter: &ExportFilter) -> Result<Export> {
    let conn = store.conn();

    // Resolve the matching ids, then hydrate each one; the first failure
    // aborts the whole export.
    let sources = select_source_ids(conn, filter)?
        .iter()
        .map(|source_id| load_source(conn, source_id))
        .collect::<Result<Vec<_>>>()?;

    let snapshot = FilterSnapshot {
        path: filter.path_contains.clone(),
        query: filter.query.clone(),
    };

    Ok(Export {
        schema_version: store.schema_version()?,
        exported_at: now_unix(),
        filter: snapshot,
        sources,
    })
}

pub fn write_json(export: &Export, output: Option<&Path>) -> Result<()> {
    let json = serde_json::to_string_pretty(export)?;
    match output {
        Some(path) => std::fs::write(path, format!("{json}\n"))
            .with_context(|| format!("writing export to {}", path.display()))?,
        None => println!("{json}"),
    }
    Ok(())
}

/// Returns the ids of all sources that pass `filter`, newest ingest first
/// (ties broken by id, descending).
///
/// * `path_contains` keeps sources whose `path` or `uri` contains the given
///   text. The text is matched literally: `%`, `_` and `\` in it are escaped
///   so they are not interpreted as `LIKE` wildcards.
/// * `query` keeps sources that have at least one chunk matching the FTS
///   expression built by `search::build_fts_query`.
///
/// When both are present a source must satisfy both. When neither is present
/// every source is returned.
///
/// # Errors
///
/// Fails if the underlying SQL statement cannot be prepared or executed.
fn select_source_ids(conn: &Connection, filter: &ExportFilter) -> Result<Vec<String>> {
    let fts_query = filter.query.as_deref().map(search::build_fts_query);
    if matches!(fts_query.as_deref(), Some("")) {
        // A query that normalises to empty cannot match anything; preserve
        // the "empty filter means no results" contract.
        return Ok(Vec::new());
    }

    let path_like = filter
        .path_contains
        .as_deref()
        .map(like_contains_pattern);

    // Each statement that uses LIKE declares `\` as the escape character so
    // the escaping done by `like_contains_pattern` takes effect.
    let ids = match (path_like.as_deref(), fts_query.as_deref()) {
        (None, None) => collect_ids(
            conn,
            "SELECT id FROM sources ORDER BY ingested_at DESC, id DESC",
            params![],
        )?,
        (Some(like), None) => collect_ids(
            conn,
            "SELECT id FROM sources
             WHERE (path LIKE ?1 ESCAPE '\\' OR uri LIKE ?1 ESCAPE '\\')
             ORDER BY ingested_at DESC, id DESC",
            params![like],
        )?,
        (None, Some(fts)) => collect_ids(
            conn,
            "SELECT s.id FROM sources s
             WHERE EXISTS (
                SELECT 1 FROM chunks c
                JOIN chunks_fts ON chunks_fts.rowid = c.rowid
                WHERE c.source_id = s.id AND chunks_fts MATCH ?1
             )
             ORDER BY s.ingested_at DESC, s.id DESC",
            params![fts],
        )?,
        (Some(like), Some(fts)) => collect_ids(
            conn,
            "SELECT s.id FROM sources s
             WHERE (s.path LIKE ?1 ESCAPE '\\' OR s.uri LIKE ?1 ESCAPE '\\')
               AND EXISTS (
                SELECT 1 FROM chunks c
                JOIN chunks_fts ON chunks_fts.rowid = c.rowid
                WHERE c.source_id = s.id AND chunks_fts MATCH ?2
             )
             ORDER BY s.ingested_at DESC, s.id DESC",
            params![like, fts],
        )?,
    };

    Ok(ids)
}

/// Builds a `LIKE` pattern (for use with `ESCAPE '\'`) that matches any value
/// containing `needle` as literal text.
fn like_contains_pattern(needle: &str) -> String {
    let mut pattern = String::with_capacity(needle.len() + 2);
    pattern.push('%');
    for ch in needle.chars() {
        // Neutralise the two LIKE wildcards and the escape character itself.
        if matches!(ch, '\\' | '%' | '_') {
            pattern.push('\\');
        }
        pattern.push(ch);
    }
    pattern.push('%');
    pattern
}

/// Runs `sql` with the given positional `params` and gathers the first column
/// of every result row as a `String`.
///
/// # Errors
///
/// Fails if the statement cannot be prepared, if stepping through the rows
/// fails, or if the first column of a row cannot be read as a `String`.
fn collect_ids(
    conn: &Connection,
    sql: &str,
    params: &[&dyn rusqlite::ToSql],
) -> Result<Vec<String>> {
    let mut statement = conn.prepare(sql)?;
    let mut cursor = statement.query(params)?;

    let mut ids = Vec::new();
    // Step the cursor by hand; the first error ends the scan.
    while let Some(row) = cursor.next()? {
        ids.push(row.get::<_, String>(0)?);
    }
    Ok(ids)
}

pub(crate) fn load_source(conn: &Connection, id: &str) -> Result<ExportedSource> {
    let mut source = conn.query_row(
        "SELECT id, uri, path, kind, bytes, content_sha256, mtime_unix, ingested_at
         FROM sources WHERE id = ?1",
        params![id],
        |row| {
            Ok(ExportedSource {
                source_id: row.get(0)?,
                uri: row.get(1)?,
                path: row.get(2)?,
                kind: row.get(3)?,
                bytes: row.get(4)?,
                content_sha256: row.get(5)?,
                mtime_unix: row.get(6)?,
                ingested_at: row.get(7)?,
                chunks: Vec::new(),
            })
        },
    )?;

    let mut stmt = conn.prepare(
        "SELECT id, ordinal, byte_start, byte_end, char_count, sha256, text,
                role, session_id, turn_id, tool_name, timestamp_unix
         FROM chunks WHERE source_id = ?1 ORDER BY ordinal",
    )?;
    let chunks = stmt.query_map(params![id], |row| {
        Ok(ExportedChunk {
            chunk_id: row.get(0)?,
            ordinal: row.get(1)?,
            byte_start: row.get(2)?,
            byte_end: row.get(3)?,
            char_count: row.get(4)?,
            sha256: row.get(5)?,
            text: row.get(6)?,
            role: row.get(7)?,
            session_id: row.get(8)?,
            turn_id: row.get(9)?,
            tool_name: row.get(10)?,
            timestamp_unix: row.get(11)?,
        })
    })?;
    source.chunks = chunks.collect::<Result<Vec<_>, _>>()?;
    Ok(source)
}

/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns `0` if the system clock reports a time before the epoch.
fn now_unix() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        Err(_) => 0,
    }
}