collet 0.1.1

Relentless agentic coding orchestrator with zero-drop agent loops
Documentation
//! Disk persistence for RepoMap data.
//!
//! Stores per-project cache under `~/.collet/cache/<hash>/` with:
//!   - `symbols.json`    — parsed symbol table
//!   - `hash_cache.json` — BLAKE3 file hashes for incremental rebuild
//!   - `bm25.json`       — BM25 search index
//!   - `ranker.json`     — symbol reference counts used by the ranker
//!   - `meta.json`       — cache version and build timestamp
//!
//! On load: if meta version matches and all files parse, the RepoMap skips
//! the expensive full rebuild and only does an incremental update for files
//! whose mtime changed since the cache was written.

use std::collections::HashMap;
use std::path::{Path, PathBuf};

use serde::{Deserialize, Serialize};

use super::bm25::Bm25Index;
use super::hasher::FileHashCache;
use super::parser::Symbol;
use super::ranker::SymbolRanker;

/// Cached data loaded from disk for a RepoMap.
///
/// This is the value returned by `load` on success. Tuple fields, in order:
///   0. symbols per file, keyed by the stored path joined onto the project root
///   1. the file hash cache
///   2. the BM25 index
///   3. the symbol ranker
pub type CachedRepoMap = (
    HashMap<PathBuf, Vec<Symbol>>,
    FileHashCache,
    Bm25Index,
    SymbolRanker,
);

/// Current cache format version. Bump when the on-disk format changes.
///
/// `save` records this value in `meta.json`; `load` returns `None` (forcing a
/// rebuild) whenever the recorded value differs from this constant.
const CACHE_VERSION: u32 = 1;

// ── Cache metadata ─────────────────────────────────────────────────────

/// Contents of `meta.json`.
///
/// `load` only acts on `version` (cache is rejected on mismatch) and uses
/// `written_at` to log the cache age; the two counters are informational.
#[derive(Serialize, Deserialize)]
struct CacheMeta {
    /// Value of `CACHE_VERSION` at the time the cache was written.
    version: u32,
    /// Unix timestamp of when the cache was written.
    written_at: u64,
    /// Number of files indexed.
    file_count: usize,
    /// Number of symbols indexed.
    symbol_count: usize,
}

// ── Serializable BM25 ─────────────────────────────────────────────────

/// On-disk representation of a BM25 document (uses relative paths only).
///
/// Mirrors `super::bm25::Document` minus its `abs_path`, which is rebuilt from
/// the project root when the cache is loaded.
#[derive(Serialize, Deserialize)]
struct DiskDocument {
    /// Copied verbatim from the in-memory document's `rel_path`.
    rel_path: String,
    /// Copied verbatim from the in-memory document's `tf`
    /// (presumably term -> occurrence count; confirm against `bm25`).
    tf: HashMap<String, u32>,
    /// Copied verbatim from the in-memory document's `token_count`.
    token_count: u32,
}

/// On-disk representation of the whole BM25 index (contents of `bm25.json`).
///
/// The in-memory index's `root` is not stored; it is supplied again at load time.
#[derive(Serialize, Deserialize)]
struct DiskBm25 {
    documents: HashMap<String, DiskDocument>, // keyed by the document path with the project root stripped
    // Copied verbatim from the in-memory index's `doc_freq`.
    doc_freq: HashMap<String, u32>,
    // Copied verbatim from the in-memory index's `total_tokens`.
    total_tokens: u64,
}

// ── Serializable Ranker ────────────────────────────────────────────────

/// On-disk representation of a `SymbolRanker` (contents of `ranker.json`).
#[derive(Serialize, Deserialize)]
struct DiskRanker {
    /// Snapshot of `SymbolRanker::reference_counts()`; fed back through
    /// `SymbolRanker::from_reference_counts` on load.
    reference_counts: HashMap<String, u32>,
}

// ── Public API ─────────────────────────────────────────────────────────

/// Locate the on-disk cache directory that belongs to `project_root`.
///
/// The directory is `~/.collet/cache/<id>/`, where `<id>` is the first 16 hex
/// characters of the BLAKE3 hash of the (lossily UTF-8 converted) root path.
/// The path is hashed exactly as given; it is not canonicalized here.
///
/// Returns `None` when the home directory cannot be determined.
pub fn cache_dir_for(project_root: &Path) -> Option<PathBuf> {
    let mut cache_dir = dirs::home_dir()?;

    let root_text = project_root.to_string_lossy();
    let digest_hex = blake3::hash(root_text.as_bytes()).to_hex();

    cache_dir.push(".collet");
    cache_dir.push("cache");
    // A 16-character prefix keeps the directory name short.
    cache_dir.push(&digest_hex[..16]);
    Some(cache_dir)
}

/// Save all RepoMap components to disk.
///
/// Writes `symbols.json`, `hash_cache.json`, `bm25.json`, `ranker.json` and
/// `meta.json` into the directory returned by [`cache_dir_for`], creating the
/// directory if needed. Symbol and BM25 paths are stored relative to
/// `project_root` (paths outside the root are stored unchanged).
///
/// `meta.json` is the first file [`load`] reads, so it doubles as the commit
/// marker for the whole cache: any previous `meta.json` is removed before the
/// data files are touched, and the new one is written only after every data
/// file has been written successfully. A save that is interrupted or fails
/// midway therefore leaves a cache that `load` rejects, instead of a mix of
/// old and new files that it would accept.
///
/// # Errors
///
/// Returns an error if no home directory can be determined, if the cache
/// directory cannot be created, or if serializing or writing any file fails.
pub fn save(
    project_root: &Path,
    symbols: &HashMap<PathBuf, Vec<Symbol>>,
    hash_cache: &FileHashCache,
    bm25: &Bm25Index,
    ranker: &SymbolRanker,
) -> anyhow::Result<()> {
    let dir = cache_dir_for(project_root).ok_or_else(|| anyhow::anyhow!("no home dir"))?;
    std::fs::create_dir_all(&dir)?;

    // Meta — serialized up front, but written last (see below).
    let meta = CacheMeta {
        version: CACHE_VERSION,
        written_at: std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs(),
        file_count: symbols.len(),
        symbol_count: symbols.values().map(|v| v.len()).sum(),
    };
    let meta_path = dir.join("meta.json");
    let meta_bytes = serde_json::to_vec(&meta)?;

    // Invalidate the previous cache before rewriting its data files, so a
    // failure partway through cannot leave a loadable half-updated cache.
    match std::fs::remove_file(&meta_path) {
        Ok(()) => {}
        // No previous cache: nothing to invalidate.
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
        Err(err) => return Err(err.into()),
    }

    // Symbols — key by relative path for portability
    let rel_symbols: HashMap<String, &Vec<Symbol>> = symbols
        .iter()
        .map(|(path, syms)| {
            let rel = path
                .strip_prefix(project_root)
                .unwrap_or(path)
                .to_string_lossy()
                .to_string();
            (rel, syms)
        })
        .collect();
    atomic_write(
        &dir.join("symbols.json"),
        &serde_json::to_vec(&rel_symbols)?,
    )?;

    // Hash cache
    atomic_write(&dir.join("hash_cache.json"), &hash_cache.to_json()?)?;

    // BM25
    let disk_bm25 = bm25_to_disk(bm25, project_root);
    atomic_write(&dir.join("bm25.json"), &serde_json::to_vec(&disk_bm25)?)?;

    // Ranker
    let disk_ranker = ranker_to_disk(ranker);
    atomic_write(&dir.join("ranker.json"), &serde_json::to_vec(&disk_ranker)?)?;

    // Commit: only now does the cache become visible to `load` again.
    atomic_write(&meta_path, &meta_bytes)?;

    tracing::info!(
        files = meta.file_count,
        symbols = meta.symbol_count,
        dir = %dir.display(),
        "RepoMap disk cache saved",
    );
    Ok(())
}

/// Try to restore a previously saved RepoMap for `project_root`.
///
/// Reads `meta.json`, `symbols.json`, `hash_cache.json`, `bm25.json` and
/// `ranker.json` (in that order) from the project's cache directory.
///
/// Returns `None` when the cache directory cannot be determined or does not
/// exist, when the recorded cache version differs from `CACHE_VERSION`, or
/// when any of the files is missing or fails to parse.
/// On success, returns `(symbols, hash_cache, bm25, ranker)` with all stored
/// relative paths joined back onto `project_root`.
pub fn load(project_root: &Path) -> Option<CachedRepoMap> {
    let dir = cache_dir_for(project_root)?;
    if !dir.exists() {
        return None;
    }
    // Any unreadable file simply means "no usable cache".
    let read_file = |name: &str| std::fs::read(dir.join(name)).ok();

    // Version gate: a cache written in another format is never parsed further.
    let meta_raw = read_file("meta.json")?;
    let meta: CacheMeta = serde_json::from_slice(&meta_raw).ok()?;
    if meta.version != CACHE_VERSION {
        tracing::info!(
            disk = meta.version,
            current = CACHE_VERSION,
            "RepoMap cache version mismatch — rebuilding",
        );
        return None;
    }

    // Symbol table: stored under relative keys, re-anchored at the project root.
    let symbols_raw = read_file("symbols.json")?;
    let stored_symbols: HashMap<String, Vec<Symbol>> =
        serde_json::from_slice(&symbols_raw).ok()?;
    let mut symbols: HashMap<PathBuf, Vec<Symbol>> = HashMap::with_capacity(stored_symbols.len());
    for (rel, syms) in stored_symbols {
        symbols.insert(project_root.join(&rel), syms);
    }

    // File hashes.
    let hashes_raw = read_file("hash_cache.json")?;
    let hash_cache = FileHashCache::from_json(&hashes_raw)?;

    // Search index.
    let index_raw = read_file("bm25.json")?;
    let stored_index: DiskBm25 = serde_json::from_slice(&index_raw).ok()?;
    let bm25 = bm25_from_disk(stored_index, project_root);

    // Ranker.
    let ranker_raw = read_file("ranker.json")?;
    let stored_ranker: DiskRanker = serde_json::from_slice(&ranker_raw).ok()?;
    let ranker = ranker_from_disk(stored_ranker);

    tracing::info!(
        files = symbols.len(),
        symbols = symbols.values().map(|v| v.len()).sum::<usize>(),
        age_secs = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs()
            .saturating_sub(meta.written_at),
        "RepoMap disk cache loaded",
    );

    Some((symbols, hash_cache, bm25, ranker))
}

// ── Conversion helpers ─────────────────────────────────────────────────

/// Convert an in-memory BM25 index into its serializable form.
///
/// Each document is re-keyed by its path with `root` stripped (paths that do
/// not live under `root` keep their original form), and its `abs_path` is
/// dropped. The index's own `root` is not stored either.
fn bm25_to_disk(bm25: &Bm25Index, root: &Path) -> DiskBm25 {
    let mut documents: HashMap<String, DiskDocument> =
        HashMap::with_capacity(bm25.documents.len());

    for (abs_path, doc) in bm25.documents.iter() {
        let relative = abs_path.strip_prefix(root).unwrap_or(abs_path);
        let key = relative.to_string_lossy().to_string();
        let stored = DiskDocument {
            rel_path: doc.rel_path.clone(),
            tf: doc.tf.clone(),
            token_count: doc.token_count,
        };
        documents.insert(key, stored);
    }

    let doc_freq = bm25.doc_freq.clone();
    let total_tokens = bm25.total_tokens;
    DiskBm25 {
        documents,
        doc_freq,
        total_tokens,
    }
}

fn bm25_from_disk(disk: DiskBm25, root: &Path) -> Bm25Index {
    let documents = disk
        .documents
        .into_iter()
        .map(|(rel, doc)| {
            let abs_path = root.join(&rel);
            (
                abs_path.clone(),
                super::bm25::Document {
                    rel_path: doc.rel_path,
                    abs_path,
                    tf: doc.tf,
                    token_count: doc.token_count,
                },
            )
        })
        .collect();
    Bm25Index {
        documents,
        doc_freq: disk.doc_freq,
        total_tokens: disk.total_tokens,
        root: root.to_path_buf(),
    }
}

/// Snapshot a ranker's reference counts into the serializable `DiskRanker`.
fn ranker_to_disk(ranker: &SymbolRanker) -> DiskRanker {
    let reference_counts = ranker.reference_counts().clone();
    DiskRanker { reference_counts }
}

fn ranker_from_disk(disk: DiskRanker) -> SymbolRanker {
    SymbolRanker::from_reference_counts(disk.reference_counts)
}

/// Write data to a temp file, then atomically rename.
///
/// The temp file sits next to `path`, with the extension replaced by `tmp`
/// (`meta.json` -> `meta.tmp`), so the rename stays within one directory.
/// Readers see either the previous contents of `path` or the complete new
/// contents, never a partially written file.
///
/// If the write or the rename fails, the temp file is removed (best effort)
/// so failed saves do not leave stray `.tmp` files in the cache directory.
///
/// # Errors
///
/// Returns the underlying I/O error if the temp file cannot be written or
/// cannot be renamed onto `path`.
fn atomic_write(path: &Path, data: &[u8]) -> anyhow::Result<()> {
    let tmp = path.with_extension("tmp");
    let outcome = std::fs::write(&tmp, data).and_then(|()| std::fs::rename(&tmp, path));
    if outcome.is_err() {
        // Cleanup failure is ignored on purpose: the original error is the
        // one worth reporting, and the temp file may not even exist.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome?;
    Ok(())
}