// goosedump 0.3.6
//
// Coding agent context data browser
// SPDX-License-Identifier: LGPL-2.1-or-later
// Copyright (C) Jarkko Sakkinen 2026

use crate::message::ConversationMessage;
use anyhow::{Context as _, anyhow};
use rusqlite::{Connection, OptionalExtension};
use std::path::{Path, PathBuf};

/// Bumped whenever the on-disk schema changes.
///
/// `IndexCache::migrate` compares this value with the database's
/// `PRAGMA user_version`. On any mismatch the `contexts` table and every
/// FTS5 table are dropped before the schema is recreated, and this value is
/// then written back as the new `user_version`.
const DB_SCHEMA_VERSION: i64 = 3;

/// The FTS5 tokenizer used for both the indexed content and the query.
///
/// The string is interpolated into the `tokenize = '...'` option of the
/// `CREATE VIRTUAL TABLE` statement issued by `IndexCache::reindex`.
const TOKENIZE: &str = "unicode61 remove_diacritics 2";

/// Per-context cache metadata used to decide whether the on-disk index is
/// still valid.
///
/// `client`, `context_id` and `filter_id` together identify one cached
/// context: they form the primary key of the `contexts` table and determine
/// the name of the FTS5 table that holds the indexed text. `source_path` and
/// `mtime` are the values `IndexCache::is_fresh` compares with the stored
/// row.
#[derive(Debug, Clone)]
pub struct CacheKey {
    /// First component of the cache identity.
    pub client: String,
    /// Second component of the cache identity.
    pub context_id: String,
    /// Third component of the cache identity.
    pub filter_id: String,
    /// Path of the source the context was read from. It is stored and
    /// compared through its `display()` string form.
    pub source_path: PathBuf,
    /// Modification time recorded for `source_path`.
    /// NOTE(review): presumably whole seconds since the Unix epoch as
    /// returned by `mtime_of` — confirm against callers.
    pub mtime: i64,
}

/// A single hit returned by the FTS5 query. The CLI promotes these into
/// `message::SearchHit` once it joins them back with the live
/// `ConversationMessage` list.
#[derive(Debug, Clone)]
pub struct CachedHit {
    /// Value of the `entry_id` column of the matching FTS5 row.
    pub entry_id: String,
    /// Negated `bm25()` rank of the row, so a larger value is a better match.
    pub score: f64,
    /// Excerpt produced by FTS5 `snippet()` over the `content` column, with
    /// empty highlight markers, `...` as the ellipsis text and a token-limit
    /// argument of 6.
    pub snippet: String,
}

/// Incremental on-disk FTS5 cache for one or more contexts.
///
/// Each cached key has one row in the `contexts` table plus its own FTS5
/// virtual table; both live in the database behind `conn`.
pub struct IndexCache {
    /// Open `SQLite` connection used for every cache operation.
    conn: Connection,
}

impl IndexCache {
    /// Open the cache database located in the platform's cache directory,
    /// creating it when it does not exist yet.
    ///
    /// # Errors
    /// Returns an error if the cache directory cannot be determined or
    /// created, or if the `SQLite` database cannot be opened or migrated.
    pub fn open() -> anyhow::Result<Self> {
        cache_db_path().and_then(|db_path| Self::open_at(&db_path))
    }

    /// Open a cache backed by the database file at `path`.
    ///
    /// The test suite uses this to hand every test its own database file,
    /// which lets tests run in parallel without interfering with each other.
    ///
    /// # Errors
    /// Returns an error if the parent directory cannot be created or the
    /// `SQLite` database cannot be opened or migrated.
    pub fn open_at(path: &Path) -> anyhow::Result<Self> {
        // Make sure the containing directory exists before SQLite touches it.
        let dir_ready = path.parent().map_or(Ok(()), |dir| {
            std::fs::create_dir_all(dir)
                .with_context(|| format!("failed to create cache dir {}", dir.display()))
        });
        dir_ready?;

        let mut cache = Connection::open(path)
            .map(|conn| Self { conn })
            .with_context(|| format!("failed to open cache at {}", path.display()))?;
        cache.migrate()?;
        Ok(cache)
    }

    /// Bring the database to the current schema version.
    ///
    /// Reads `PRAGMA user_version`; when it differs from
    /// `DB_SCHEMA_VERSION` all cached data is dropped first. The schema is
    /// then (re)created and the current version is written back.
    ///
    /// # Errors
    /// Propagates any `SQLite` error raised while reading the version,
    /// dropping old data, creating the schema, or storing the new version.
    fn migrate(&mut self) -> anyhow::Result<()> {
        let stored_version: i64 = self
            .conn
            .query_row("PRAGMA user_version", [], |row| row.get(0))?;

        let needs_reset = stored_version != DB_SCHEMA_VERSION;
        if needs_reset {
            // Any mismatch (older or newer) discards the cache rather than
            // attempting an in-place upgrade.
            self.drop_cached_data()?;
        }

        self.create_schema()?;
        let stamp_version = format!("PRAGMA user_version = {DB_SCHEMA_VERSION};");
        self.conn.execute_batch(&stamp_version)?;
        Ok(())
    }

    /// Create the `contexts` bookkeeping table and its `source_path` index
    /// when they are not present yet.
    ///
    /// # Errors
    /// Propagates the `SQLite` error if the statements cannot be executed.
    fn create_schema(&self) -> anyhow::Result<()> {
        let schema_sql =
            "CREATE TABLE IF NOT EXISTS contexts (
                client TEXT NOT NULL,
                context_id TEXT NOT NULL,
                filter_id TEXT NOT NULL,
                source_path TEXT NOT NULL,
                mtime INTEGER NOT NULL,
                entry_count INTEGER NOT NULL,
                indexed_at INTEGER NOT NULL,
                PRIMARY KEY (client, context_id, filter_id)
            );

            CREATE INDEX IF NOT EXISTS contexts_source_path_idx
                ON contexts(source_path);";
        Ok(self.conn.execute_batch(schema_sql)?)
    }
    /// Remove everything the cache has stored: the `contexts` table and
    /// every FTS5 virtual table found in `sqlite_master`.
    ///
    /// The FTS5 table names are collected first; all drops then run inside a
    /// single transaction.
    ///
    /// # Errors
    /// Propagates any `SQLite` error from listing or dropping the tables.
    fn drop_cached_data(&mut self) -> anyhow::Result<()> {
        let mut fts_tables: Vec<String> = Vec::new();
        {
            // The statement borrows the connection, so it is scoped to end
            // before the transaction takes a mutable borrow.
            let mut stmt = self.conn.prepare(
                "SELECT name FROM sqlite_master
                 WHERE type = 'table'
                   AND sql LIKE 'CREATE VIRTUAL TABLE%USING fts5%'",
            )?;
            let mut rows = stmt.query([])?;
            while let Some(row) = rows.next()? {
                fts_tables.push(row.get(0)?);
            }
        }

        let tx = self.conn.transaction()?;
        tx.execute("DROP TABLE IF EXISTS contexts", [])?;
        for name in &fts_tables {
            let drop_sql = format!("DROP TABLE IF EXISTS {}", quote_ident(name));
            tx.execute(&drop_sql, [])?;
        }
        tx.commit()?;
        Ok(())
    }

    /// Report whether the cache already holds an up-to-date index for `key`.
    ///
    /// The key counts as fresh when a `contexts` row exists for its
    /// `(client, context_id, filter_id)` and that row's `source_path` and
    /// `mtime` both equal the values in `key`.
    ///
    /// # Errors
    /// Propagates the `SQLite` error if the lookup fails.
    pub fn is_fresh(&self, key: &CacheKey) -> anyhow::Result<bool> {
        let stored = self
            .conn
            .query_row(
                "SELECT source_path, mtime \
                 FROM contexts WHERE client = ?1 AND context_id = ?2 AND filter_id = ?3",
                rusqlite::params![key.client, key.context_id, key.filter_id],
                |row| {
                    let path: String = row.get(0)?;
                    let mtime: i64 = row.get(1)?;
                    Ok((path, mtime))
                },
            )
            .optional()?;

        match stored {
            // No row at all: the key was never indexed.
            None => Ok(false),
            Some((stored_path, stored_mtime)) => {
                let wanted_path = key.source_path.display().to_string();
                Ok(stored_mtime == key.mtime && stored_path == wanted_path)
            }
        }
    }

    /// Make sure `messages` are indexed for `key`, rebuilding the index only
    /// when `is_fresh` reports that it is missing or stale.
    ///
    /// # Errors
    /// Propagates any error from the freshness check or from rebuilding the
    /// index.
    pub fn index(
        &mut self,
        key: &CacheKey,
        messages: &[ConversationMessage],
    ) -> anyhow::Result<()> {
        if !self.is_fresh(key)? {
            self.reindex(key, messages)?;
        }
        Ok(())
    }

    /// Rebuild the FTS5 table for `key` from `messages` and record the key's
    /// metadata in `contexts`, all within one transaction.
    ///
    /// Messages whose searchable text is empty are not inserted into the
    /// FTS5 table; `entry_count` still records the length of `messages`.
    ///
    /// # Errors
    /// Propagates any `SQLite` error; the transaction is not committed in
    /// that case.
    fn reindex(&mut self, key: &CacheKey, messages: &[ConversationMessage]) -> anyhow::Result<()> {
        let table_name = fts_table_for(key);
        let tx = self.conn.transaction()?;

        // Start from an empty table so its shape always matches what the
        // current code expects.
        let rebuild_sql = format!(
            "DROP TABLE IF EXISTS fts_{table_name};
             CREATE VIRTUAL TABLE fts_{table_name} USING fts5(
                entry_id UNINDEXED,
                content,
                tokenize = '{TOKENIZE}'
             );"
        );
        tx.execute_batch(&rebuild_sql)?;

        {
            // Scoped so the prepared statement is released before commit.
            let insert_sql = format!(
                "INSERT INTO fts_{table_name}(entry_id, content) VALUES (?1, ?2)"
            );
            let mut insert = tx.prepare(&insert_sql)?;
            for message in messages {
                let text = crate::display::searchable_text(message);
                if !text.is_empty() {
                    insert.execute(rusqlite::params![message.entry_id, text])?;
                }
            }
        }

        let indexed_at = unix_now();
        let entry_count = i64::try_from(messages.len()).unwrap_or(i64::MAX);
        let source_path = key.source_path.display().to_string();
        tx.execute(
            "INSERT OR REPLACE INTO contexts
             (client, context_id, filter_id, source_path, mtime, entry_count, indexed_at)
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            rusqlite::params![
                key.client,
                key.context_id,
                key.filter_id,
                source_path,
                key.mtime,
                entry_count,
                indexed_at,
            ],
        )?;
        tx.commit()?;
        Ok(())
    }

    /// Run `pattern` as an FTS5 `MATCH` query against the table cached for
    /// `key`.
    ///
    /// Hits come back ordered best first. Each carries the entry id, a score
    /// where higher is better (the negated `bm25()` rank), and a short
    /// snippet of the matching content.
    ///
    /// # Errors
    /// Returns an error if the statement cannot be prepared or executed,
    /// or if a result row cannot be read.
    /// NOTE(review): this presumably includes the case where `key` has
    /// never been indexed, so its table does not exist — confirm.
    pub fn query(&self, key: &CacheKey, pattern: &str) -> anyhow::Result<Vec<CachedHit>> {
        let table = fts_table_for(key);
        let select_sql = format!(
            "SELECT entry_id, -bm25(fts_{table}) AS score, \
                    snippet(fts_{table}, 1, '', '', '...', 6) AS snip \
             FROM fts_{table} \
             WHERE fts_{table} MATCH ?1 \
             ORDER BY score DESC"
        );
        let mut stmt = self.conn.prepare(&select_sql)?;

        let mut hits = Vec::new();
        let mut rows = stmt.query(rusqlite::params![pattern])?;
        while let Some(row) = rows.next()? {
            hits.push(CachedHit {
                entry_id: row.get(0)?,
                score: row.get(1)?,
                snippet: row.get(2)?,
            });
        }
        Ok(hits)
    }
}

/// Human-readable label for an `IndexCache`.
///
/// The output is the fixed string `IndexCache(<fts5>)`; nothing about the
/// underlying connection is shown.
impl std::fmt::Display for IndexCache {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The label previously lacked its closing parenthesis.
        f.write_str("IndexCache(<fts5>)")
    }
}

/// Build the table-name stem for `key`: the sanitized `client`,
/// `context_id` and `filter_id` followed by the key digest, joined with
/// underscores. Callers prepend `fts_` to form the actual table name.
///
/// Only `context_id` and `filter_id` use the `"empty"` fallback; an empty
/// `client` contributes an empty segment.
fn fts_table_for(key: &CacheKey) -> String {
    let segments = [
        sanitize_id(&key.client, false),
        sanitize_id(&key.context_id, true),
        sanitize_id(&key.filter_id, true),
        key_digest(key),
    ];
    segments.join("_")
}

/// 16-digit lowercase hex digest over the exact `client`, `context_id` and
/// `filter_id` of `key`.
///
/// It is appended to the sanitized table stem so that keys whose sanitized
/// forms coincide still map to different FTS5 tables, unless their 64-bit
/// digests also collide.
///
/// NOTE(review): the standard library does not guarantee that
/// `DefaultHasher` produces the same values across Rust releases, yet this
/// digest ends up in table names persisted on disk. A toolchain upgrade
/// could therefore orphan existing tables. Consider a fixed hash together
/// with a schema-version bump.
fn key_digest(key: &CacheKey) -> String {
    use std::hash::{Hash as _, Hasher as _};
    let mut state = std::collections::hash_map::DefaultHasher::new();
    // Field order matters: it is part of the digest.
    for field in [&key.client, &key.context_id, &key.filter_id] {
        field.hash(&mut state);
    }
    let digest = state.finish();
    format!("{:016x}", digest)
}

/// Turn `id` into text that is safe to embed in a table name by replacing
/// every character that is not an ASCII letter or digit with `_`.
///
/// When `use_empty_fallback` is set and `id` is empty, the literal
/// `"empty"` is returned instead of an empty string.
fn sanitize_id(id: &str, use_empty_fallback: bool) -> String {
    // Each input character yields exactly one output character, so the
    // result is empty only when the input is.
    if use_empty_fallback && id.is_empty() {
        return String::from("empty");
    }
    id.chars()
        .map(|ch| if ch.is_ascii_alphanumeric() { ch } else { '_' })
        .collect()
}

/// Wrap `ident` in double quotes for use as an SQL identifier, doubling any
/// double quote it contains.
fn quote_ident(ident: &str) -> String {
    let mut quoted = String::with_capacity(ident.len() + 2);
    quoted.push('"');
    for ch in ident.chars() {
        if ch == '"' {
            // An embedded quote is escaped by writing it twice.
            quoted.push('"');
        }
        quoted.push(ch);
    }
    quoted.push('"');
    quoted
}

/// Location of the cache database: `goosedump/index.db` beneath the
/// directory reported by `dirs::cache_dir()`.
///
/// # Errors
/// Returns an error when `dirs::cache_dir()` yields no directory.
fn cache_db_path() -> anyhow::Result<PathBuf> {
    dirs::cache_dir()
        .map(|base| base.join("goosedump").join("index.db"))
        .ok_or_else(|| anyhow!("cache directory not available on this platform"))
}

/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns 0 when the system clock reads earlier than the epoch or the
/// second count does not fit in an `i64`.
fn unix_now() -> i64 {
    std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .ok()
        .and_then(|elapsed| i64::try_from(elapsed.as_secs()).ok())
        .unwrap_or(0)
}

/// Look up the modification time of `path` as whole seconds since the Unix
/// epoch.
///
/// Returns 0 when the metadata or modification time cannot be read, when
/// the time lies before the epoch, or when the second count does not fit in
/// an `i64`.
pub fn mtime_of(path: &Path) -> i64 {
    std::fs::metadata(path)
        .and_then(|meta| meta.modified())
        .ok()
        .and_then(|stamp| stamp.duration_since(std::time::UNIX_EPOCH).ok())
        .and_then(|since_epoch| i64::try_from(since_epoch.as_secs()).ok())
        .unwrap_or(0)
}