repotoire 0.9.0

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
//! Shared file content cache for cross-detector file access.
//!
//! Uses DashMap (a sharded concurrent map) so that concurrent readers rarely
//! contend with each other. `Arc<String>` avoids cloning file contents when
//! multiple detectors access the same file.
//!
//! In addition to raw content, the cache also memoizes parsed
//! `tree_sitter::Tree`s keyed by `(path, Language)`. This lets multiple
//! AST-using detectors and postprocess passes share a single parse per
//! file/language pair instead of re-parsing.

use crate::parsers::lightweight::Language;
use dashmap::DashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::{Parser, Tree};

/// Largest file, in bytes, that the cache will hold: 2 MiB (matches the
/// parser guardrail).
const MAX_CACHE_FILE_SIZE: u64 = 2 * (1 << 20);

/// Thread-safe shared file content + parse-tree cache.
///
/// Both maps are filled lazily by the `get_or_*` methods; nothing in this
/// file removes entries once they are inserted.
pub struct FileContentCache {
    /// Raw UTF-8 file contents, keyed by the path exactly as the caller
    /// passed it. No canonicalization happens in this file, so two different
    /// spellings of the same file are cached as separate entries.
    content: DashMap<PathBuf, Arc<String>>,
    /// Parsed tree-sitter trees, keyed by `(path, language)`. A given file
    /// can be parsed under multiple grammars (e.g. a `.h` header detected as
    /// both C and C++ in different code paths) so the language is part of the
    /// key. Wrapping the `Tree` in `Arc` lets the cache hand out shared
    /// handles to a single parse without exposing tree-sitter internals.
    trees: DashMap<(PathBuf, Language), Arc<Tree>>,
}

impl FileContentCache {
    /// Creates an empty cache: no file contents and no parse trees.
    pub fn new() -> Self {
        Self {
            content: DashMap::new(),
            trees: DashMap::new(),
        }
    }

    /// Get file content, reading from disk on cache miss.
    ///
    /// Returns `None` when the file cannot be opened or read, is not valid
    /// UTF-8, or is larger than [`MAX_CACHE_FILE_SIZE`]. Failures are not
    /// cached, so a later call retries the read.
    ///
    /// The size cap is enforced on the bytes actually read, not only on the
    /// length reported by the filesystem, so a file that grows while being
    /// read (or one whose reported length is 0, such as a pipe or device)
    /// cannot pull an unbounded amount of data into memory.
    ///
    /// Concurrency: if several threads miss on the same path at once, each
    /// reads the file, but only the first insert is kept and every caller
    /// receives that same `Arc<String>`.
    pub fn get_or_read(&self, path: &Path) -> Option<Arc<String>> {
        if let Some(entry) = self.content.get(path) {
            return Some(Arc::clone(entry.value()));
        }

        let file = std::fs::File::open(path).ok()?;

        // Cheap early rejection using the length reported for the open
        // handle. A metadata failure is not fatal: the bounded read below
        // still enforces the cap.
        let reported_len = file.metadata().map_or(0, |meta| meta.len());
        if reported_len > MAX_CACHE_FILE_SIZE {
            return None;
        }

        // Read at most one byte past the cap. If the reader's budget is
        // exhausted afterwards, the file is larger than the cap.
        let mut text = String::with_capacity(usize::try_from(reported_len).unwrap_or(0));
        let mut bounded = std::io::Read::take(file, MAX_CACHE_FILE_SIZE + 1);
        std::io::Read::read_to_string(&mut bounded, &mut text).ok()?;
        if bounded.limit() == 0 {
            return None;
        }

        // First insert wins, so all callers share one canonical `Arc` and the
        // cached content for a path never changes once it is stored.
        let cached = self
            .content
            .entry(path.to_path_buf())
            .or_insert_with(|| Arc::new(text));
        let shared = Arc::clone(cached.value());
        drop(cached);
        Some(shared)
    }

    /// Get file content and parsed AST for `language`, parsing on miss.
    ///
    /// Uses [`Self::get_or_read`] for content, then memoizes the parse tree
    /// under `(path, language)`. Returns `None` if the file can't be read,
    /// exceeds the size cap, has no grammar wired into `ts_language_for`, or
    /// fails to parse. Note that the content is read (and cached) before the
    /// language is checked, so an unsupported language still populates the
    /// content cache.
    ///
    /// Concurrency: the parse step is idempotent, so a brief race where two
    /// threads parse the same file in parallel only wastes one parse. The
    /// first insert is kept and every caller receives that same `Arc<Tree>`.
    /// Because the content cache is also first-insert-wins, a cached tree is
    /// always the parse of the cached content for that path.
    ///
    /// Languages currently supported: Python, JavaScript, TypeScript, Rust,
    /// Go, Java, C, C++. Other languages return `None` (no grammar wired in
    /// the cache layer).
    pub fn get_or_parse(
        &self,
        path: &Path,
        language: Language,
    ) -> Option<(Arc<String>, Arc<Tree>)> {
        let content = self.get_or_read(path)?;

        let key = (path.to_path_buf(), language);
        if let Some(entry) = self.trees.get(&key) {
            return Some((content, Arc::clone(entry.value())));
        }

        let ts_lang = ts_language_for(language)?;
        let mut parser = Parser::new();
        parser.set_language(&ts_lang).ok()?;
        let tree = parser.parse(content.as_bytes(), None)?;

        // Keep whichever tree reached the map first and hand that one out, so
        // all callers for this key share a single allocation.
        let cached = self.trees.entry(key).or_insert_with(|| Arc::new(tree));
        let shared_tree = Arc::clone(cached.value());
        drop(cached);
        Some((content, shared_tree))
    }

    /// Number of cached files (entries in the content map).
    // Exercised by the unit tests in this file.
    #[allow(dead_code)]
    pub fn len(&self) -> usize {
        self.content.len()
    }

    /// Returns true if no file content is cached.
    #[allow(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.content.is_empty()
    }

    /// Number of cached parse trees (not files — a single file may be parsed
    /// under multiple languages).
    // Exercised by the unit tests in this file.
    #[allow(dead_code)]
    pub fn tree_count(&self) -> usize {
        self.trees.len()
    }
}

impl Default for FileContentCache {
    fn default() -> Self {
        Self::new()
    }
}

/// Map a project-level `Language` onto the tree-sitter grammar used to parse it.
///
/// Yields `None` for every language that has no grammar wired in here:
/// C#, Kotlin, Ruby, PHP, Swift and `Unknown`. The tree-sitter-c-sharp crate
/// is wired in at the workspace level but exposes a different `Language`
/// constructor, so it is deliberately left out of this helper for now;
/// postprocess only needs Python, and the remaining grammars are available
/// for future detectors.
///
/// `TypeScript` resolves to the `LANGUAGE_TYPESCRIPT` grammar constant.
fn ts_language_for(language: Language) -> Option<tree_sitter::Language> {
    // The match stays exhaustive (no `_` arm) so that adding a `Language`
    // variant forces a decision here at compile time.
    let grammar: tree_sitter::Language = match language {
        Language::Python => tree_sitter_python::LANGUAGE.into(),
        Language::JavaScript => tree_sitter_javascript::LANGUAGE.into(),
        Language::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
        Language::Rust => tree_sitter_rust::LANGUAGE.into(),
        Language::Go => tree_sitter_go::LANGUAGE.into(),
        Language::Java => tree_sitter_java::LANGUAGE.into(),
        Language::C => tree_sitter_c::LANGUAGE.into(),
        Language::Cpp => tree_sitter_cpp::LANGUAGE.into(),
        Language::CSharp
        | Language::Kotlin
        | Language::Ruby
        | Language::Php
        | Language::Swift
        | Language::Unknown => return None,
    };
    Some(grammar)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The cache size cap as a `usize`, for sizing test payloads.
    fn size_limit() -> usize {
        usize::try_from(MAX_CACHE_FILE_SIZE).expect("cache size limit fits in usize")
    }

    #[test]
    fn test_file_cache_reads_and_caches() {
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("test.py");
        std::fs::write(&file_path, "print('hello')").unwrap();

        let cache = FileContentCache::new();

        let content1 = cache.get_or_read(&file_path).unwrap();
        assert_eq!(&*content1, "print('hello')");
        assert_eq!(cache.len(), 1);

        // A second lookup is served from the cache: same allocation.
        let content2 = cache.get_or_read(&file_path).unwrap();
        assert!(Arc::ptr_eq(&content1, &content2));
        assert_eq!(cache.len(), 1);
    }

    #[test]
    fn test_file_cache_skips_large_files() {
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("huge.py");
        std::fs::write(&file_path, vec![b'x'; 3 * 1024 * 1024]).unwrap();

        let cache = FileContentCache::new();
        assert!(cache.get_or_read(&file_path).is_none());
        // A rejected file must not leave an entry behind.
        assert!(cache.is_empty());
    }

    #[test]
    fn test_file_cache_size_limit_is_inclusive() {
        // A file of exactly the cap is accepted; one byte more is rejected.
        let tmp = tempfile::tempdir().unwrap();
        let limit = size_limit();

        let at_limit = tmp.path().join("at_limit.py");
        std::fs::write(&at_limit, vec![b'x'; limit]).unwrap();
        let over_limit = tmp.path().join("over_limit.py");
        std::fs::write(&over_limit, vec![b'x'; limit + 1]).unwrap();

        let cache = FileContentCache::new();
        let content = cache.get_or_read(&at_limit).unwrap();
        assert_eq!(content.len(), limit);
        assert!(cache.get_or_read(&over_limit).is_none());
        assert_eq!(cache.len(), 1);
    }

    #[test]
    fn test_file_cache_returns_none_for_missing_file() {
        let cache = FileContentCache::new();
        assert!(cache
            .get_or_read(Path::new("/nonexistent/file.py"))
            .is_none());
        assert!(cache.is_empty());
    }

    #[test]
    fn test_file_cache_returns_none_for_non_utf8_file() {
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("binary.py");
        // 0xFF can never appear in well-formed UTF-8.
        std::fs::write(&file_path, [0xff_u8, 0xfe, 0xfd]).unwrap();

        let cache = FileContentCache::new();
        assert!(cache.get_or_read(&file_path).is_none());
        assert!(cache.is_empty());
    }

    #[test]
    fn test_get_or_parse_caches_tree() {
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("t.py");
        std::fs::write(&file_path, "def f(x):\n    return x + 1\n").unwrap();

        let cache = FileContentCache::new();
        let (src1, tree1) = cache.get_or_parse(&file_path, Language::Python).unwrap();
        assert_eq!(cache.tree_count(), 1);

        let (src2, tree2) = cache.get_or_parse(&file_path, Language::Python).unwrap();
        assert!(Arc::ptr_eq(&src1, &src2));
        assert!(Arc::ptr_eq(&tree1, &tree2));
        assert_eq!(cache.tree_count(), 1);

        // Tree is well-formed.
        assert_eq!(tree1.root_node().kind(), "module");
    }

    #[test]
    fn test_get_or_parse_separates_languages() {
        // Same path under two languages keys two trees.
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("ambiguous.h");
        std::fs::write(&file_path, "int x;\n").unwrap();

        let cache = FileContentCache::new();
        let (_, tree_c) = cache.get_or_parse(&file_path, Language::C).unwrap();
        let (_, tree_cpp) = cache.get_or_parse(&file_path, Language::Cpp).unwrap();
        assert_eq!(cache.tree_count(), 2);
        assert!(!Arc::ptr_eq(&tree_c, &tree_cpp));
        // The content itself is stored once, regardless of language.
        assert_eq!(cache.len(), 1);
    }

    #[test]
    fn test_get_or_parse_returns_none_for_unsupported_language() {
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("script.rb");
        std::fs::write(&file_path, "puts 'hi'\n").unwrap();

        let cache = FileContentCache::new();
        assert!(cache.get_or_parse(&file_path, Language::Ruby).is_none());
        assert_eq!(cache.tree_count(), 0);
    }

    #[test]
    fn test_get_or_parse_concurrent_access_is_sound() {
        // Smoke test for concurrent use: 8 threads racing the same key must
        // each get a usable (source, tree) pair, no matter which thread's
        // insert ends up in the map.
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("concurrent.py");
        std::fs::write(&file_path, "x = 1\n").unwrap();

        let cache = FileContentCache::new();
        // Scoped threads borrow `cache` and `file_path` directly; the scope
        // joins every thread and propagates any panic from the assertions.
        std::thread::scope(|scope| {
            for _ in 0..8 {
                scope.spawn(|| {
                    let (src, tree) = cache.get_or_parse(&file_path, Language::Python).unwrap();
                    assert_eq!(&*src, "x = 1\n");
                    assert_eq!(tree.root_node().kind(), "module");
                });
            }
        });

        // After the race settles, exactly one entry remains per map.
        assert_eq!(cache.tree_count(), 1);
        assert_eq!(cache.len(), 1);
    }
}