// lantern 0.2.3
//
// Local-first, provenance-aware semantic search for agent activity.
// Documentation
//! Store introspection for agents and humans.
//!
//! `inspect` answers "what is in my local Lantern store right now?" without
//! touching search ranking or external systems. It surfaces schema version,
//! on-disk paths, aggregate counts, and the most recently ingested sources
//! with per-source chunk counts — the fields needed to debug ingest state.

use std::fs;

use anyhow::Result;
use rusqlite::params;
use serde::Serialize;

use crate::embed::{EmbeddingStats, embedding_stats};
use crate::store::Store;

/// Snapshot of a store's contents, as assembled by [`inspect`].
///
/// The struct derives `Serialize`; [`print_json`] renders it with
/// `serde_json` and [`print_text`] renders it as plain text.
#[derive(Debug, Clone, Serialize)]
pub struct InspectReport {
    /// Schema version as returned by `Store::schema_version`.
    pub schema_version: i64,
    /// Store root (`Store::root`), lossily converted to a string.
    pub store_path: String,
    /// Database path (`Store::db_path`), lossily converted to a string.
    pub db_path: String,
    /// Size of the database file in bytes; `0` when its metadata cannot be read.
    pub db_bytes: u64,
    /// Number of rows in the `sources` table.
    pub source_count: i64,
    /// Number of rows in the `chunks` table.
    pub chunk_count: i64,
    /// Sum of the `bytes` column over all rows of `sources` (`0` when empty).
    pub indexed_bytes: i64,
    /// Embedding statistics as returned by `embedding_stats`.
    pub embeddings: Vec<EmbeddingStats>,
    /// Most recently ingested sources, newest first, capped by
    /// `InspectOptions::recent_limit`.
    pub recent_sources: Vec<RecentSource>,
}

/// One row of the "recent sources" listing built by [`inspect`].
///
/// Each field mirrors a column of the `sources` table, except `chunks`,
/// which is computed per source.
#[derive(Debug, Clone, Serialize)]
pub struct RecentSource {
    /// The source's `id` column.
    pub source_id: String,
    /// The source's `uri` column.
    pub uri: String,
    /// The source's `path` column; `None` when the column is NULL.
    pub path: Option<String>,
    /// The source's `kind` column.
    pub kind: String,
    /// The source's `bytes` column.
    pub bytes: i64,
    /// Number of rows in `chunks` whose `source_id` matches this source.
    pub chunks: i64,
    /// The source's `ingested_at` column.
    // NOTE(review): `print_text` compares this against `now_unix()`, so it is
    // presumably Unix seconds — confirm against the ingest code.
    pub ingested_at: i64,
}

/// Tuning knobs for [`inspect`].
#[derive(Debug, Clone, Copy)]
pub struct InspectOptions {
    /// Upper bound on how many recently ingested sources are listed.
    pub recent_limit: usize,
}

impl Default for InspectOptions {
    /// Lists the ten most recently ingested sources.
    fn default() -> Self {
        const DEFAULT_RECENT_LIMIT: usize = 10;

        InspectOptions {
            recent_limit: DEFAULT_RECENT_LIMIT,
        }
    }
}

/// Builds an [`InspectReport`] describing what `store` currently holds.
///
/// The report contains the schema version, the store and database paths, the
/// database file size, aggregate counts over the `sources` and `chunks`
/// tables, embedding statistics, and up to `opts.recent_limit` of the most
/// recently ingested sources (newest first, ties broken by descending id).
///
/// # Errors
///
/// Returns an error if reading the schema version, any of the SQL queries, or
/// `embedding_stats` fails. Failing to stat the database file is *not* an
/// error: `db_bytes` is reported as `0` in that case.
pub fn inspect(store: &Store, opts: InspectOptions) -> Result<InspectReport> {
    let conn = store.conn();
    let schema_version = store.schema_version()?;
    let db_path = store.db_path();
    // Best effort: a missing or unreadable file is reported as zero bytes
    // rather than failing the whole report.
    let db_bytes = fs::metadata(&db_path).map(|m| m.len()).unwrap_or(0);

    let source_count: i64 = conn.query_row("SELECT COUNT(*) FROM sources", [], |row| row.get(0))?;
    let chunk_count: i64 = conn.query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))?;
    // COALESCE keeps the result an integer (not NULL) when `sources` is empty.
    let indexed_bytes: i64 =
        conn.query_row("SELECT COALESCE(SUM(bytes), 0) FROM sources", [], |row| {
            row.get(0)
        })?;

    // A plain `as i64` cast wraps limits above `i64::MAX` to a negative
    // number, which only works because SQLite treats a negative LIMIT as
    // "unbounded". Saturate instead so the bound value is always the intended
    // non-negative cap.
    let recent_limit = i64::try_from(opts.recent_limit).unwrap_or(i64::MAX);

    let mut stmt = conn.prepare(
        "SELECT s.id, s.uri, s.path, s.kind, s.bytes, s.ingested_at,
                (SELECT COUNT(*) FROM chunks c WHERE c.source_id = s.id) AS chunks
         FROM sources s
         ORDER BY s.ingested_at DESC, s.id DESC
         LIMIT ?1",
    )?;
    let rows = stmt.query_map(params![recent_limit], |row| {
        // Column indexes follow the SELECT list above.
        Ok(RecentSource {
            source_id: row.get(0)?,
            uri: row.get(1)?,
            path: row.get(2)?,
            kind: row.get(3)?,
            bytes: row.get(4)?,
            ingested_at: row.get(5)?,
            chunks: row.get(6)?,
        })
    })?;
    // Stops at the first row that fails to decode.
    let recent_sources = rows.collect::<Result<Vec<_>, _>>()?;
    let embeddings = embedding_stats(store)?;

    Ok(InspectReport {
        schema_version,
        store_path: store.root().to_string_lossy().into_owned(),
        db_path: db_path.to_string_lossy().into_owned(),
        db_bytes,
        source_count,
        chunk_count,
        indexed_bytes,
        embeddings,
        recent_sources,
    })
}

/// Prints `report` to stdout in a human-readable layout.
///
/// The output is a header with paths, schema version, and aggregate counts,
/// then one `embeds:` line per embedding entry (or `none`), then the recent
/// sources with their age relative to the current time.
pub fn print_text(report: &InspectReport) {
    let db_size = format_bytes(report.db_bytes);

    println!("lantern store");
    println!("  path:     {}", report.store_path);
    println!("  database: {} ({})", report.db_path, db_size);
    println!("  schema:   v{}", report.schema_version);
    println!("  sources:  {}", report.source_count);
    println!("  chunks:   {}", report.chunk_count);
    println!("  indexed:  {} bytes of source text", report.indexed_bytes);

    match report.embeddings.as_slice() {
        [] => println!("  embeds:   none"),
        stats => {
            for entry in stats {
                let count = &entry.count;
                let model = &entry.model;
                let dim = &entry.dim;
                println!("  embeds:   {count} chunks  model={model}  dim={dim}");
            }
        }
    }

    match report.recent_sources.as_slice() {
        [] => {
            println!();
            println!("no sources ingested yet");
        }
        sources => {
            // Read the clock once so every row is measured from the same instant.
            let now = now_unix();
            println!();
            println!("recent sources:");
            for source in sources {
                let id = &source.source_id;
                let age = ago(now, source.ingested_at);
                let chunks = source.chunks;
                let bytes = source.bytes;
                let kind = &source.kind;
                let uri = &source.uri;
                println!(
                    "  {id} {ago:>8}  chunks={chunks:<3} bytes={bytes:<6} {kind:<14} {uri}",
                    ago = age,
                );
            }
        }
    }
}

/// Prints `report` to stdout as pretty-printed JSON.
///
/// # Errors
///
/// Returns an error if `serde_json` fails to serialize the report.
pub fn print_json(report: &InspectReport) -> Result<()> {
    let rendered = serde_json::to_string_pretty(report)?;
    println!("{rendered}");
    Ok(())
}

/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// Falls back to `0` if the system clock reports a time before the epoch.
pub(crate) fn now_unix() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}

/// Formats the time elapsed between `then` and `now` as a short "N<unit> ago"
/// string.
///
/// Both arguments are timestamps in seconds. The result uses the largest of
/// seconds (`s`), minutes (`m`), hours (`h`), or days (`d`) that fits,
/// truncating toward zero. A `then` that lies in the future is clamped to
/// `"0s ago"`.
///
/// The subtraction saturates, so extreme inputs (for example a corrupt
/// timestamp near `i64::MIN`) cannot overflow: they neither panic in debug
/// builds nor wrap to a bogus "0s ago" in release builds.
pub(crate) fn ago(now: i64, then: i64) -> String {
    const MINUTE: i64 = 60;
    const HOUR: i64 = 60 * MINUTE;
    const DAY: i64 = 24 * HOUR;

    let delta = now.saturating_sub(then).max(0);
    if delta < MINUTE {
        format!("{delta}s ago")
    } else if delta < HOUR {
        let m = delta / MINUTE;
        format!("{m}m ago")
    } else if delta < DAY {
        let h = delta / HOUR;
        format!("{h}h ago")
    } else {
        let d = delta / DAY;
        format!("{d}d ago")
    }
}

/// Renders a byte count as a short human-readable size.
///
/// Values below 1024 are printed exactly as `"<n> B"`. Larger values are
/// scaled by powers of 1024 and printed with one decimal place as `KB`, `MB`,
/// or `GB`; `GB` is the largest unit, so very large values stay in `GB`.
///
/// The unit is chosen from the *rounded* value, so a size just below a unit
/// boundary is promoted (`1_048_575` renders as `"1.0 MB"`, never
/// `"1024.0 KB"`).
fn format_bytes(n: u64) -> String {
    const STEP: f64 = 1024.0;
    // Smallest scaled value that one-decimal formatting rounds up to "1024.0".
    const PROMOTE_AT: f64 = 1023.95;
    const UNITS: [&str; 3] = ["KB", "MB", "GB"];

    if n < 1024 {
        return format!("{n} B");
    }

    // Dividing by a power of two is exact in floating point, so repeated
    // scaling yields the same value as a single division by the unit size.
    let mut value = n as f64 / STEP;
    let mut unit = 0;
    while unit + 1 < UNITS.len() && value >= PROMOTE_AT {
        value /= STEP;
        unit += 1;
    }
    format!("{value:.1} {}", UNITS[unit])
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Each `(now, then, expected)` row checks one bucket of `ago`, including
    /// the clamp for a `then` that lies in the future.
    #[test]
    fn ago_buckets() {
        let cases: [(i64, i64, &str); 6] = [
            (1_000, 1_000, "0s ago"),
            (1_050, 1_000, "50s ago"),
            (1_000 + 60 * 5, 1_000, "5m ago"),
            (1_000 + 3600 * 3, 1_000, "3h ago"),
            (1_000 + 86400 * 2, 1_000, "2d ago"),
            (500, 1_000, "0s ago"),
        ];
        for (now, then, expected) in cases {
            assert_eq!(ago(now, then), expected);
        }
    }

    /// Each `(bytes, expected)` row sits exactly on a unit threshold of
    /// `format_bytes`, plus two plain byte counts.
    #[test]
    fn format_bytes_thresholds() {
        let cases: [(u64, &str); 5] = [
            (0, "0 B"),
            (512, "512 B"),
            (1024, "1.0 KB"),
            (1024 * 1024, "1.0 MB"),
            (1024 * 1024 * 1024, "1.0 GB"),
        ];
        for (bytes, expected) in cases {
            assert_eq!(format_bytes(bytes), expected);
        }
    }
}