lantern 0.2.3

Local-first, provenance-aware semantic search for agent activity
Documentation
//! Compare indexed filesystem sources against the current filesystem.
//!
//! Only sources whose `uri` begins with `file://` are considered — stdin
//! and other synthetic sources are intentionally ignored. When a
//! scanned path is provided, the comparison is scoped to that subtree
//! (both for checking indexed sources and for detecting new files that
//! have not yet been ingested).

use std::collections::HashSet;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use serde::Serialize;

use crate::ingest::{is_supported_file, sha256_hex};
use crate::store::Store;

/// Result of comparing indexed `file://` sources with the filesystem.
///
/// Produced by [`diff`]; rendered by [`print_text`] or [`print_json`].
#[derive(Debug, Clone, Serialize)]
pub struct DiffReport {
    /// Canonicalized scan root, or `None` when the comparison was not
    /// restricted to a subtree.
    pub scanned_path: Option<String>,
    /// Indexed files whose current content hash equals the indexed hash.
    pub unchanged: Vec<FileState>,
    /// Indexed files whose current content hash differs from the indexed hash.
    pub changed: Vec<FileState>,
    /// Indexed files that could not be read from disk.
    pub missing: Vec<MissingFile>,
    /// Supported files found under the scan root that are not in the index.
    /// Only populated when a scan root was given.
    pub unindexed: Vec<UnindexedFile>,
}

/// An indexed file that was readable, with both its indexed and current hash.
#[derive(Debug, Clone, Serialize)]
pub struct FileState {
    /// `id` of the row in the `sources` table.
    pub source_id: String,
    /// Source URI as stored in the index (starts with `file://`).
    pub uri: String,
    /// Filesystem path: the URI with its `file://` prefix removed.
    pub path: String,
    /// Content hash recorded in the index (`content_sha256` column).
    pub indexed_sha256: String,
    /// Hash of the bytes currently on disk, as computed by `sha256_hex`.
    pub current_sha256: String,
}

/// An indexed file that could not be read from disk.
#[derive(Debug, Clone, Serialize)]
pub struct MissingFile {
    /// `id` of the row in the `sources` table.
    pub source_id: String,
    /// Source URI as stored in the index (starts with `file://`).
    pub uri: String,
    /// Filesystem path: the URI with its `file://` prefix removed.
    pub path: String,
    /// Content hash recorded in the index (`content_sha256` column).
    pub indexed_sha256: String,
    /// Why the read failed: `"not found"` for a missing file, otherwise the
    /// I/O error's display text.
    pub reason: String,
}

/// A supported file under the scan root that has no matching indexed source.
#[derive(Debug, Clone, Serialize)]
pub struct UnindexedFile {
    /// Canonicalized filesystem path of the file.
    pub path: String,
    /// File size in bytes; `0` if the metadata lookup failed.
    pub bytes: u64,
}

/// Compares every indexed `file://` source against the current filesystem.
///
/// Each indexed source is read in full and re-hashed with `sha256_hex`, then
/// classified as unchanged, changed, or missing (unreadable for any reason).
///
/// When `scanned_path` is `Some`, it is canonicalized and used as a scope:
/// indexed sources outside that subtree are skipped, and the subtree is
/// walked (sorted by file name) to collect supported files that are not in
/// the index. When it is `None`, all `file://` sources are checked and no
/// unindexed-file detection is performed.
///
/// # Errors
///
/// Returns an error if `scanned_path` cannot be canonicalized, if the
/// `sources` query fails, or if walking the scan root yields an error.
/// Failure to read an individual indexed file is not an error; the file is
/// reported in `missing` instead.
pub fn diff(store: &Store, scanned_path: Option<&Path>) -> Result<DiffReport> {
    let scan_root = scanned_path
        .map(|p| fs::canonicalize(p).with_context(|| format!("canonicalize {}", p.display())))
        .transpose()?;

    // (id, uri, content_sha256) for every filesystem-backed source. Collected
    // eagerly inside a block so the statement and connection borrow end
    // before any filesystem work starts.
    let indexed: Vec<(String, String, String)> = {
        let conn = store.conn();
        let mut stmt = conn.prepare(
            "SELECT id, uri, content_sha256 FROM sources
             WHERE uri LIKE 'file://%'
             ORDER BY id",
        )?;
        let rows = stmt.query_map([], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, String>(1)?,
                row.get::<_, String>(2)?,
            ))
        })?;
        rows.collect::<Result<Vec<_>, _>>()?
    };

    let mut unchanged = Vec::new();
    let mut changed = Vec::new();
    let mut missing = Vec::new();
    // Paths of all in-scope indexed sources, used below to decide which
    // walked files are new.
    let mut indexed_abs: HashSet<PathBuf> = HashSet::new();

    for (source_id, uri, indexed_sha256) in indexed {
        let abs = PathBuf::from(uri.strip_prefix("file://").unwrap_or(uri.as_str()));

        // Out-of-scope sources are neither checked nor recorded.
        if scan_root
            .as_deref()
            .is_some_and(|root| !abs.starts_with(root))
        {
            continue;
        }

        let path = abs.to_string_lossy().into_owned();
        let contents = fs::read(&abs);
        indexed_abs.insert(abs);

        match contents {
            Ok(bytes) => {
                let current_sha256 = sha256_hex(&bytes);
                // Compare before the strings are moved into the record.
                let is_unchanged = current_sha256 == indexed_sha256;
                let state = FileState {
                    source_id,
                    uri,
                    path,
                    indexed_sha256,
                    current_sha256,
                };
                if is_unchanged {
                    unchanged.push(state);
                } else {
                    changed.push(state);
                }
            }
            Err(err) => {
                let reason = if err.kind() == io::ErrorKind::NotFound {
                    "not found".to_string()
                } else {
                    err.to_string()
                };
                missing.push(MissingFile {
                    source_id,
                    uri,
                    path,
                    indexed_sha256,
                    reason,
                });
            }
        }
    }

    let mut unindexed = Vec::new();
    if let Some(root) = scan_root.as_deref() {
        // Canonical paths already reported as unindexed. The walk does not
        // follow symlinks, but `is_file` and `canonicalize` do, so a symlink
        // and its target (or two symlinks to one file) resolve to the same
        // canonical path; without this set that file would be listed twice.
        let mut reported: HashSet<PathBuf> = HashSet::new();

        for entry in walkdir::WalkDir::new(root).sort_by_file_name() {
            let entry = entry.with_context(|| format!("walk {}", root.display()))?;
            let p = entry.path();
            if !p.is_file() || !is_supported_file(p) {
                continue;
            }
            // Compare in canonical form so the walked path matches the form
            // of the scan root the indexed paths were filtered against.
            // Entries that cannot be canonicalized are skipped.
            let Ok(canonical) = fs::canonicalize(p) else {
                continue;
            };
            if indexed_abs.contains(&canonical) || reported.contains(&canonical) {
                continue;
            }
            // Size is informational only; fall back to 0 if it is unavailable.
            let bytes = fs::metadata(&canonical).map(|m| m.len()).unwrap_or(0);
            unindexed.push(UnindexedFile {
                path: canonical.to_string_lossy().into_owned(),
                bytes,
            });
            reported.insert(canonical);
        }
    }

    Ok(DiffReport {
        scanned_path: scan_root.map(|p| p.to_string_lossy().into_owned()),
        unchanged,
        changed,
        missing,
        unindexed,
    })
}

/// Prints a human-readable summary of `report` to standard output.
///
/// Output is the scan scope, one line of per-category counts, then a detail
/// section for each of `changed`, `missing`, and `unindexed` that is
/// non-empty. Unchanged files are only counted, never listed. Hashes in the
/// `changed` section are abbreviated to their first 12 characters.
pub fn print_text(report: &DiffReport) {
    /// Returns at most the first 12 characters of `hash`.
    ///
    /// Truncation is done on a character boundary, so a stored hash that is
    /// not plain ASCII cannot trigger a slicing panic. For ordinary hex
    /// digests this equals the first 12 bytes.
    fn short_hash(hash: &str) -> &str {
        hash.char_indices()
            .nth(12)
            .map_or(hash, |(end, _)| &hash[..end])
    }

    match &report.scanned_path {
        Some(p) => println!("scanned: {p}"),
        None => println!("scanned: <store-wide>"),
    }
    println!(
        "unchanged: {}  changed: {}  missing: {}  unindexed: {}",
        report.unchanged.len(),
        report.changed.len(),
        report.missing.len(),
        report.unindexed.len(),
    );

    if !report.changed.is_empty() {
        println!();
        println!("changed:");
        for f in &report.changed {
            println!(
                "  {id}  indexed={old} -> current={new}  {path}",
                id = f.source_id,
                old = short_hash(&f.indexed_sha256),
                new = short_hash(&f.current_sha256),
                path = f.path,
            );
        }
    }

    if !report.missing.is_empty() {
        println!();
        println!("missing:");
        for f in &report.missing {
            println!(
                "  {id}  {path}  ({reason})",
                id = f.source_id,
                path = f.path,
                reason = f.reason,
            );
        }
    }

    if !report.unindexed.is_empty() {
        println!();
        println!("unindexed:");
        for f in &report.unindexed {
            println!("  {path}  ({bytes} B)", path = f.path, bytes = f.bytes);
        }
    }
}

/// Writes `report` to standard output as pretty-printed JSON.
///
/// # Errors
///
/// Returns an error if the report cannot be serialized.
pub fn print_json(report: &DiffReport) -> Result<()> {
    let rendered = serde_json::to_string_pretty(report)?;
    println!("{rendered}");
    Ok(())
}