// repotoire 0.7.1
//
// Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
// Documentation
//! File caching for detectors
//!
//! Provides a shared cache to avoid re-reading files across multiple detectors.
//!
//! Cache layers (FileCache, IncrementalCache) implement the `CacheLayer` trait
//! and are coordinated via `CacheCoordinator` for consistent invalidation.

pub mod masking;
pub mod paths;
pub mod traits;

pub use traits::{CacheCoordinator, CacheLayer};

use dashmap::DashMap;
use rayon::prelude::*;
use std::path::{Path, PathBuf};
use std::sync::{Arc, OnceLock};

pub use paths::{
    cache_dir, ensure_cache_dir, findings_cache_path, git_cache_path, graph_db_path,
    graph_stats_path,
};

/// Global file cache instance.
///
/// Starts out empty; the `FileCache` is built on the first call to
/// `global_cache()` and the same instance is returned from then on.
static GLOBAL_CACHE: OnceLock<FileCache> = OnceLock::new();

/// Shared, process-wide [`FileCache`].
///
/// The cache is constructed lazily on first use; every later call hands back
/// a reference to that same instance.
pub fn global_cache() -> &'static FileCache {
    GLOBAL_CACHE.get_or_init(FileCache::default)
}

/// Pre-populate the global cache from the directory tree under `root`.
///
/// Thin wrapper around `FileCache::warm` on the instance returned by
/// `global_cache()`. `extensions` is matched against `Path::extension()`,
/// so entries are written without a leading dot (e.g. `"rs"`, `"py"`).
pub fn warm_global_cache(root: &Path, extensions: &[&str]) {
    let cache = global_cache();
    cache.warm(root, extensions);
}

/// Thread-safe file content cache.
///
/// Every field is an `Arc`-wrapped `DashMap`, so the derived `Clone` produces
/// another handle onto the same underlying maps rather than a copy of the
/// cached data.
#[derive(Clone)]
pub struct FileCache {
    /// Cached file contents: path -> content
    contents: Arc<DashMap<PathBuf, Arc<String>>>,
    /// Cached file lines: path -> lines (derived from `contents` on demand)
    lines: Arc<DashMap<PathBuf, Arc<Vec<String>>>>,
    /// Cached masked content (comments/strings replaced with spaces): path -> masked
    masked: Arc<DashMap<PathBuf, Arc<String>>>,
    /// Cached tree-sitter parse trees: path -> tree.
    ///
    /// Populated by `parsers::parse_file_inner` (which already parses every
    /// file) so AST-using detectors can read the tree without a second parse.
    /// Trees are stored behind an `Arc`, so lookups hand out a shared handle
    /// instead of copying the tree.
    trees: Arc<DashMap<PathBuf, Arc<tree_sitter::Tree>>>,
}

impl FileCache {
    pub fn new() -> Self {
        Self {
            contents: Arc::new(DashMap::new()),
            lines: Arc::new(DashMap::new()),
            masked: Arc::new(DashMap::new()),
            trees: Arc::new(DashMap::new()),
        }
    }

    /// Pre-warm cache with files from a directory walk
    pub fn warm(&self, root: &Path, extensions: &[&str]) {
        let walker = ignore::WalkBuilder::new(root)
            .hidden(false)
            .git_ignore(true)
            .build();

        let paths: Vec<PathBuf> = walker
            .filter_map(|e| e.ok())
            .filter(|e| e.path().is_file())
            .filter(|e| {
                e.path()
                    .extension()
                    .and_then(|ext| ext.to_str())
                    .map(|ext| extensions.contains(&ext))
                    .unwrap_or(false)
            })
            .map(|e| e.path().to_path_buf())
            .collect();

        // Read files in parallel
        paths.par_iter().for_each(|path| {
            if let Ok(content) = std::fs::read_to_string(path) {
                self.contents.insert(path.clone(), Arc::new(content));
            }
        });

        // Pre-warm masked content in parallel (tree-sitter parse + string/comment stripping).
        // This front-loads the masking cost so detectors get cache hits instead of paying
        // the ~2-3ms tree-sitter parse on first access.
        paths.par_iter().for_each(|path| {
            let _ = self.masked_content(path);
        });
    }

    /// File content (cached, lazy-loading)
    pub fn content(&self, path: &Path) -> Option<Arc<String>> {
        // Check cache first
        if let Some(content) = self.contents.get(path) {
            return Some(Arc::clone(&content));
        }

        // Read and cache
        if let Ok(content) = std::fs::read_to_string(path) {
            let arc = Arc::new(content);
            self.contents.insert(path.to_path_buf(), Arc::clone(&arc));
            Some(arc)
        } else {
            None
        }
    }

    /// File lines (cached, lazy-loading)
    pub fn lines(&self, path: &Path) -> Option<Arc<Vec<String>>> {
        // Check cache first
        if let Some(lines) = self.lines.get(path) {
            return Some(Arc::clone(&lines));
        }

        // Get content and split into lines
        let content = self.content(path)?;
        let lines: Vec<String> = content.lines().map(String::from).collect();
        let arc = Arc::new(lines);
        self.lines.insert(path.to_path_buf(), Arc::clone(&arc));
        Some(arc)
    }

    /// Masked file content with comments, strings, and docstrings replaced by spaces.
    ///
    /// Returns cached masked content, or computes it on first access by parsing
    /// the file with tree-sitter and replacing non-code regions with spaces
    /// (preserving newlines for stable line numbers).
    pub fn masked_content(&self, path: &Path) -> Option<Arc<String>> {
        if let Some(masked) = self.masked.get(path) {
            return Some(Arc::clone(&masked));
        }

        let content = self.content(path)?;
        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        let masked = masking::mask_non_code(&content, ext);
        let arc = Arc::new(masked);
        self.masked.insert(path.to_path_buf(), Arc::clone(&arc));
        Some(arc)
    }

    /// Store pre-computed masked content directly (avoids re-parsing).
    ///
    /// Called from `parse_file()` when we already have a tree-sitter tree
    /// and can compute the masking without a second parse.
    pub fn store_masked(&self, path: &Path, masked: String) {
        self.masked.insert(path.to_path_buf(), Arc::new(masked));
    }

    /// Cached tree-sitter parse tree for a file.
    ///
    /// Returns `None` for files that haven't been parsed yet (i.e. not yet
    /// processed by `parsers::parse_file_inner`) or whose language
    /// has no tree-sitter grammar. Detectors should be tolerant of this and
    /// either skip the file or fall back to non-AST analysis.
    pub fn tree(&self, path: &Path) -> Option<Arc<tree_sitter::Tree>> {
        self.trees.get(path).map(|t| Arc::clone(&t))
    }

    /// Store a parsed tree-sitter tree.
    ///
    /// Called from `parse_file_inner()` after the main parse succeeds, so
    /// AST-consuming detectors can read the tree without re-parsing.
    pub fn store_tree(&self, path: &Path, tree: tree_sitter::Tree) {
        self.trees.insert(path.to_path_buf(), Arc::new(tree));
    }

    /// Get list of cached file paths
    pub fn cached_paths(&self) -> Vec<PathBuf> {
        self.contents.iter().map(|r| r.key().clone()).collect()
    }

    /// Get cached paths filtered by extension
    pub fn paths_with_ext(&self, extensions: &[&str]) -> Vec<PathBuf> {
        self.contents
            .iter()
            .filter(|r| {
                r.key()
                    .extension()
                    .and_then(|e| e.to_str())
                    .map(|e| extensions.contains(&e))
                    .unwrap_or(false)
            })
            .map(|r| r.key().clone())
            .collect()
    }

    /// Cache stats
    pub fn stats(&self) -> (usize, usize) {
        (self.contents.len(), self.lines.len())
    }

    /// Clear all cached data (#13 — prevent stale data in watch/server mode)
    pub fn clear(&self) {
        self.contents.clear();
        self.lines.clear();
        self.masked.clear();
        self.trees.clear();
    }

    /// Evict specific files from the cache.
    ///
    /// Used by `AnalysisEngine::update()` to invalidate stale entries for
    /// changed files before re-running detectors.
    pub fn evict(&self, paths: &[PathBuf]) {
        for path in paths {
            self.contents.remove(path);
            self.lines.remove(path);
            self.masked.remove(path);
            self.trees.remove(path);
        }
    }
}

impl Default for FileCache {
    fn default() -> Self {
        Self::new()
    }
}

impl CacheLayer for FileCache {
    fn name(&self) -> &str {
        "file-content"
    }

    fn is_populated(&self) -> bool {
        !self.contents.is_empty()
    }

    fn invalidate_files(&mut self, changed_files: &[&Path]) {
        for path in changed_files {
            self.contents.remove(*path);
            self.lines.remove(*path);
            self.masked.remove(*path);
            self.trees.remove(*path);
        }
    }

    fn invalidate_all(&mut self) {
        self.clear();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;
    use std::sync::Arc;

    /// Parse `source` as Python and return the resulting tree-sitter tree.
    ///
    /// Shared by the tests below so each one exercises the cache with a real
    /// tree rather than an empty wrapper.
    fn parse_python(source: &str) -> tree_sitter::Tree {
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&tree_sitter_python::LANGUAGE.into())
            .expect("set python language");
        parser.parse(source, None).expect("parse python")
    }

    /// Fill all four maps of `cache` for `path` so invalidation tests can
    /// check that every layer is dropped, not only contents and lines.
    fn populate_all_layers(cache: &FileCache, path: &PathBuf, text: &str) {
        cache.contents.insert(path.clone(), Arc::new(text.to_string()));
        cache
            .lines
            .insert(path.clone(), Arc::new(vec![text.to_string()]));
        cache.store_masked(path, text.to_string());
        cache.store_tree(path, parse_python("x = 1\n"));
    }

    #[test]
    fn test_file_cache_implements_cache_layer() {
        let mut cache = FileCache::new();

        // Verify name
        assert_eq!(cache.name(), "file-content");

        // Verify is_populated returns false when empty
        assert!(!cache.is_populated());

        // Insert entries for two files into every layer
        let path_a = PathBuf::from("/tmp/a.rs");
        let path_b = PathBuf::from("/tmp/b.rs");
        populate_all_layers(&cache, &path_a, "fn main() {}");
        populate_all_layers(&cache, &path_b, "fn helper() {}");

        // Verify is_populated returns true after adding content
        assert!(cache.is_populated());

        // Invalidate a specific file
        cache.invalidate_files(&[path_a.as_path()]);

        // path_a should be removed from every layer
        assert!(cache.contents.get(&path_a).is_none());
        assert!(cache.lines.get(&path_a).is_none());
        assert!(
            cache.masked.get(&path_a).is_none(),
            "invalidate_files() must drop masked content"
        );
        assert!(
            cache.tree(&path_a).is_none(),
            "invalidate_files() must drop the tree"
        );

        // path_b should still be present in every layer
        assert!(cache.contents.get(&path_b).is_some());
        assert!(cache.lines.get(&path_b).is_some());
        assert!(cache.masked.get(&path_b).is_some());
        assert!(cache.tree(&path_b).is_some());

        // Cache should still be populated
        assert!(cache.is_populated());
    }

    #[test]
    fn test_file_cache_invalidate_all() {
        let mut cache = FileCache::new();

        // Insert entries for two files into every layer
        let path_a = PathBuf::from("/tmp/a.rs");
        let path_b = PathBuf::from("/tmp/b.rs");
        populate_all_layers(&cache, &path_a, "content a");
        populate_all_layers(&cache, &path_b, "content b");

        assert!(cache.is_populated());

        // Invalidate all
        cache.invalidate_all();

        // Everything should be gone, in every layer
        assert!(!cache.is_populated());
        assert!(cache.contents.is_empty());
        assert!(cache.lines.is_empty());
        assert!(
            cache.masked.is_empty(),
            "invalidate_all() must drop masked content"
        );
        assert!(cache.trees.is_empty(), "invalidate_all() must drop trees");
    }

    #[test]
    fn test_tree_cache_round_trip() {
        // Build a real tree-sitter tree from a tiny Python source so we
        // exercise the actual store/retrieve path, not just an empty wrapper.
        let tree = parse_python("x = 1\n");

        let cache = FileCache::new();
        let path = PathBuf::from("/tmp/round_trip.py");

        // Miss before store.
        assert!(cache.tree(&path).is_none());

        cache.store_tree(&path, tree);

        // Hit after store.
        let got = cache.tree(&path).expect("tree should be cached");
        // Same Arc shared across calls (cheap clone semantics).
        let got2 = cache.tree(&path).expect("second lookup");
        assert!(Arc::ptr_eq(&got, &got2), "tree() should return same Arc");

        // The tree's root node corresponds to the source we parsed.
        assert_eq!(got.root_node().kind(), "module");
    }

    #[test]
    fn test_tree_cache_evict_drops_tree() {
        let tree = parse_python("y = 2\n");

        let cache = FileCache::new();
        let path = PathBuf::from("/tmp/evictable.py");
        cache.store_tree(&path, tree);
        assert!(cache.tree(&path).is_some());

        cache.evict(std::slice::from_ref(&path));
        assert!(
            cache.tree(&path).is_none(),
            "evict() must drop the tree alongside content/masked"
        );
    }

    #[test]
    fn test_tree_cache_clear_drops_all_trees() {
        let cache = FileCache::new();
        for i in 0..3 {
            let tree = parse_python(&format!("a{i} = {i}\n"));
            cache.store_tree(&PathBuf::from(format!("/tmp/c{i}.py")), tree);
        }
        assert_eq!(cache.trees.len(), 3);

        cache.clear();
        assert_eq!(cache.trees.len(), 0, "clear() must drop all trees");
    }
}