use crate::parsers::lightweight::Language;
use dashmap::DashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use tree_sitter::{Parser, Tree};
/// Largest file, in bytes (2 MiB), that `FileContentCache::get_or_read` will load.
///
/// Files whose on-disk length exceeds this value are rejected instead of read.
const MAX_CACHE_FILE_SIZE: u64 = 2 << 20;
/// In-memory cache of file contents and the syntax trees parsed from them.
///
/// Both tables are `DashMap`s and every value is held in an `Arc`, so lookups
/// hand out cheap shared handles rather than copies of the text or tree.
pub struct FileContentCache {
/// File text, keyed by the path it was read from.
content: DashMap<PathBuf, Arc<String>>,
/// Parsed trees, keyed by `(path, language)` so the same file can hold one
/// tree per language it has been parsed as.
trees: DashMap<(PathBuf, Language), Arc<Tree>>,
}
impl FileContentCache {
    /// Creates an empty cache holding no file contents and no parsed trees.
    pub fn new() -> Self {
        Self {
            content: DashMap::new(),
            trees: DashMap::new(),
        }
    }

    /// Returns the text of `path`, reading it from disk on the first request.
    ///
    /// Returns `None` when the file is larger than `MAX_CACHE_FILE_SIZE` or
    /// when `std::fs::read_to_string` fails (for example the file is missing
    /// or is not valid UTF-8). Failures are not cached, so a later call
    /// retries the read.
    ///
    /// If several threads miss on the same path at once, the first insert wins
    /// and every caller receives that same `Arc`, so the cached text never
    /// changes once it has been handed out.
    pub fn get_or_read(&self, path: &Path) -> Option<Arc<String>> {
        if let Some(entry) = self.content.get(path) {
            return Some(Arc::clone(entry.value()));
        }

        // Cheap pre-check so obviously oversized files are never loaded. A
        // metadata failure is not fatal here: the read below reports the error.
        if let Ok(meta) = std::fs::metadata(path) {
            if meta.len() > MAX_CACHE_FILE_SIZE {
                return None;
            }
        }

        let text = std::fs::read_to_string(path).ok()?;

        // Re-check against what was actually read: the file may have grown
        // since the metadata call, or report a length that differs from its
        // content. `usize` to `u64` is lossless on every supported target.
        if text.len() as u64 > MAX_CACHE_FILE_SIZE {
            return None;
        }

        // First writer wins. Returning the stored value (instead of our own
        // copy) keeps all callers on one allocation and guarantees that trees
        // are always parsed from the text that is actually cached.
        let cached = Arc::clone(
            self.content
                .entry(path.to_path_buf())
                .or_insert_with(|| Arc::new(text))
                .value(),
        );
        Some(cached)
    }

    /// Returns the text of `path` together with its syntax tree for `language`.
    ///
    /// The tree is parsed on the first request for a given `(path, language)`
    /// pair and reused afterwards. Returns `None` when the file cannot be
    /// loaded by [`Self::get_or_read`], when `language` has no tree-sitter
    /// grammar in `ts_language_for`, or when the parser cannot be configured
    /// or yields no tree.
    ///
    /// As with the text, concurrent misses resolve to a single cached tree:
    /// the first insert wins and every caller receives that `Arc`.
    pub fn get_or_parse(
        &self,
        path: &Path,
        language: Language,
    ) -> Option<(Arc<String>, Arc<Tree>)> {
        let content = self.get_or_read(path)?;
        let key = (path.to_path_buf(), language);
        if let Some(entry) = self.trees.get(&key) {
            return Some((content, Arc::clone(entry.value())));
        }

        let ts_lang = ts_language_for(language)?;
        let mut parser = Parser::new();
        parser.set_language(&ts_lang).ok()?;
        // Parse before touching the map again so no shard lock is held while
        // the (potentially slow) parse runs.
        let tree = parser.parse(content.as_bytes(), None)?;

        // Keep whichever tree reached the map first; a losing duplicate is
        // simply dropped.
        let cached = Arc::clone(
            self.trees
                .entry(key)
                .or_insert_with(|| Arc::new(tree))
                .value(),
        );
        Some((content, cached))
    }

    /// Number of files whose text is currently cached.
    // Only the unit tests in this file call this; `allow` silences the
    // warning elsewhere.
    #[allow(dead_code)]
    pub fn len(&self) -> usize {
        self.content.len()
    }

    /// Returns `true` when no file text is cached.
    // Not called anywhere in this file; presumably kept as the companion to `len`.
    #[allow(dead_code)]
    pub fn is_empty(&self) -> bool {
        self.content.is_empty()
    }

    /// Number of cached syntax trees, counted per `(path, language)` pair.
    // Only the unit tests in this file call this; `allow` silences the
    // warning elsewhere.
    #[allow(dead_code)]
    pub fn tree_count(&self) -> usize {
        self.trees.len()
    }
}
impl Default for FileContentCache {
fn default() -> Self {
Self::new()
}
}
/// Maps a project `Language` to its tree-sitter grammar.
///
/// Returns `None` for the variants that have no grammar wired in here. The
/// match lists every variant explicitly (no wildcard) so that adding a new
/// `Language` forces a decision at this site.
fn ts_language_for(language: Language) -> Option<tree_sitter::Language> {
    let grammar: tree_sitter::Language = match language {
        // No grammar available: bail out before building anything.
        Language::CSharp
        | Language::Kotlin
        | Language::Ruby
        | Language::Php
        | Language::Swift
        | Language::Unknown => return None,

        Language::Python => tree_sitter_python::LANGUAGE.into(),
        Language::JavaScript => tree_sitter_javascript::LANGUAGE.into(),
        Language::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
        Language::Rust => tree_sitter_rust::LANGUAGE.into(),
        Language::Go => tree_sitter_go::LANGUAGE.into(),
        Language::Java => tree_sitter_java::LANGUAGE.into(),
        Language::C => tree_sitter_c::LANGUAGE.into(),
        Language::Cpp => tree_sitter_cpp::LANGUAGE.into(),
    };
    Some(grammar)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes `body` to a file called `name` inside a fresh temporary
    /// directory. The directory guard is returned alongside the path so the
    /// file stays on disk for as long as the caller holds it.
    fn fixture(name: &str, body: impl AsRef<[u8]>) -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(name);
        std::fs::write(&path, body).unwrap();
        (dir, path)
    }

    /// A second read of the same path returns the cached allocation.
    #[test]
    fn test_file_cache_reads_and_caches() {
        let (_dir, path) = fixture("test.py", "print('hello')");
        let cache = FileContentCache::new();

        let first = cache.get_or_read(&path).unwrap();
        assert_eq!(first.as_str(), "print('hello')");
        assert_eq!(cache.len(), 1);

        let second = cache.get_or_read(&path).unwrap();
        assert!(Arc::ptr_eq(&first, &second));
    }

    /// Files above the size limit are refused.
    #[test]
    fn test_file_cache_skips_large_files() {
        let oversized = vec![b'x'; 3 * 1024 * 1024];
        let (_dir, path) = fixture("huge.py", &oversized);

        let cache = FileContentCache::new();
        assert!(cache.get_or_read(&path).is_none());
    }

    /// A path that does not exist yields `None` rather than panicking.
    #[test]
    fn test_file_cache_returns_none_for_missing_file() {
        let missing = Path::new("/nonexistent/file.py");
        let cache = FileContentCache::new();
        assert!(cache.get_or_read(missing).is_none());
    }

    /// Parsing twice with the same language reuses both the text and the tree.
    #[test]
    fn test_get_or_parse_caches_tree() {
        let (_dir, path) = fixture("t.py", "def f(x):\n return x + 1\n");
        let cache = FileContentCache::new();

        let (first_src, first_tree) = cache.get_or_parse(&path, Language::Python).unwrap();
        assert_eq!(cache.tree_count(), 1);

        let (second_src, second_tree) = cache.get_or_parse(&path, Language::Python).unwrap();
        assert!(Arc::ptr_eq(&first_src, &second_src));
        assert!(Arc::ptr_eq(&first_tree, &second_tree));
        assert_eq!(cache.tree_count(), 1);
        assert_eq!(first_tree.root_node().kind(), "module");
    }

    /// The same file parsed as two languages gets two independent trees.
    #[test]
    fn test_get_or_parse_separates_languages() {
        let (_dir, path) = fixture("ambiguous.h", "int x;\n");
        let cache = FileContentCache::new();

        let (_, as_c) = cache.get_or_parse(&path, Language::C).unwrap();
        let (_, as_cpp) = cache.get_or_parse(&path, Language::Cpp).unwrap();

        assert_eq!(cache.tree_count(), 2);
        assert!(!Arc::ptr_eq(&as_c, &as_cpp));
    }

    /// Languages without a grammar produce `None`.
    #[test]
    fn test_get_or_parse_returns_none_for_unsupported_language() {
        let (_dir, path) = fixture("script.rb", "puts 'hi'\n");
        let cache = FileContentCache::new();
        assert!(cache.get_or_parse(&path, Language::Ruby).is_none());
    }

    /// Eight threads parsing the same file all succeed and leave one tree cached.
    #[test]
    fn test_get_or_parse_concurrent_access_is_sound() {
        let (_dir, path) = fixture("concurrent.py", "x = 1\n");
        let cache = Arc::new(FileContentCache::new());

        // Spawn every worker before joining any, so the calls can overlap.
        let mut workers = Vec::with_capacity(8);
        for _ in 0..8 {
            let cache = Arc::clone(&cache);
            let path = path.clone();
            workers.push(std::thread::spawn(move || {
                let (src, tree) = cache.get_or_parse(&path, Language::Python).unwrap();
                assert_eq!(&*src, "x = 1\n");
                assert_eq!(tree.root_node().kind(), "module");
            }));
        }
        for worker in workers {
            worker.join().unwrap();
        }

        assert_eq!(cache.tree_count(), 1);
    }
}