use std::collections::BTreeMap;
use std::collections::HashSet;
use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::Hash;
use std::hash::Hasher;
use std::io;
use std::path::Path;
use std::path::PathBuf;
use std::time::SystemTime;
use serde::Deserialize;
use serde::Serialize;
const MAX_CACHE_BYTES: u64 = 2 * 1024 * 1024 * 1024;
const STALE_REBUILD_RATIO: f64 = 0.10;
pub const INDEX_VERSION: u32 = 1;
#[derive(Serialize, Deserialize)]
pub struct TrigramIndex {
version: u32,
files: Vec<FileRecord>,
postings: BTreeMap<[u8; 3], Vec<u32>>,
root: PathBuf,
}
#[derive(Serialize, Deserialize)]
struct FileRecord {
path: PathBuf,
mtime_s: i64,
mtime_ns: u32,
size: u64,
}
fn index_dir(root: &Path) -> Option<PathBuf> {
let base = dirs::cache_dir()?;
let mut hasher = DefaultHasher::new();
root.hash(&mut hasher);
let hash = format!("{:016x}", hasher.finish());
Some(base.join("fastgrep").join("trigram").join(hash))
}
fn trigram_cache_root() -> Option<PathBuf> {
Some(dirs::cache_dir()?.join("fastgrep").join("trigram"))
}
fn file_mtime(path: &Path) -> io::Result<(i64, u32, u64)> {
let meta = fs::metadata(path)?;
let mtime = meta.modified()?.duration_since(SystemTime::UNIX_EPOCH).unwrap_or_default();
Ok((mtime.as_secs() as i64, mtime.subsec_nanos(), meta.len()))
}
impl TrigramIndex {
pub fn load(root: &Path) -> Option<Self> {
let dir = index_dir(root)?;
let index_path = dir.join("index.bin");
let data = fs::read(&index_path).ok()?;
let Ok(index) = bitcode::deserialize::<Self>(&data) else {
let _ = fs::remove_dir_all(&dir);
return None;
};
if index.version != INDEX_VERSION {
let _ = fs::remove_dir_all(&dir);
return None;
}
Some(index)
}
pub fn build(root: &Path, paths: &[PathBuf]) -> Self {
let mut files = Vec::new();
let mut postings: BTreeMap<[u8; 3], Vec<u32>> = BTreeMap::new();
for path in paths {
let Ok((mtime_s, mtime_ns, size)) = file_mtime(path) else { continue };
let Ok(data) = fs::read(path) else { continue };
let check = data.len().min(8192);
if memchr::memchr(0, &data[..check]).is_some() {
continue;
}
let file_id = files.len() as u32;
files.push(FileRecord { path: path.clone(), mtime_s, mtime_ns, size });
let mut seen = HashSet::new();
for tri in data.windows(3) {
let key = [tri[0], tri[1], tri[2]];
if seen.insert(key) {
postings.entry(key).or_default().push(file_id);
}
}
}
Self { version: INDEX_VERSION, files, postings, root: root.to_owned() }
}
pub fn candidate_files(&self, trigrams: &[[u8; 3]]) -> HashSet<PathBuf> {
if trigrams.is_empty() {
return self.files.iter().map(|f| f.path.clone()).collect();
}
let mut lists: Vec<&Vec<u32>> =
trigrams.iter().filter_map(|tri| self.postings.get(tri)).collect();
if lists.len() < trigrams.len() {
return HashSet::new();
}
lists.sort_by_key(|l| l.len());
let mut result: HashSet<u32> = lists[0].iter().copied().collect();
for list in &lists[1..] {
let set: HashSet<u32> = list.iter().copied().collect();
result.retain(|id| set.contains(id));
if result.is_empty() {
return HashSet::new();
}
}
result.iter().map(|&id| self.files[id as usize].path.clone()).collect()
}
pub fn stale_files(&self) -> Vec<PathBuf> {
self.files
.iter()
.filter(|f| {
match file_mtime(&f.path) {
Ok((s, ns, sz)) => s != f.mtime_s || ns != f.mtime_ns || sz != f.size,
Err(_) => true, }
})
.map(|f| f.path.clone())
.collect()
}
pub fn needs_rebuild(&self) -> bool {
if self.files.is_empty() {
return false;
}
let stale = self.stale_files().len();
stale as f64 > self.files.len() as f64 * STALE_REBUILD_RATIO
}
pub fn file_count(&self) -> usize {
self.files.len()
}
pub fn save(&self) -> io::Result<()> {
let Some(dir) = index_dir(&self.root) else {
return Err(io::Error::other("cannot determine cache dir"));
};
fs::create_dir_all(&dir)?;
let data = bitcode::serialize(self).map_err(io::Error::other)?;
let tmp = dir.join("index.bin.tmp");
fs::write(&tmp, &data)?;
fs::rename(&tmp, dir.join("index.bin"))?;
Ok(())
}
}
pub fn evict_if_needed() {
let Some(root) = trigram_cache_root() else { return };
let Ok(entries) = fs::read_dir(&root) else { return };
let mut dirs: Vec<(PathBuf, u64, SystemTime)> = Vec::new();
let mut total: u64 = 0;
for entry in entries {
let Ok(entry) = entry else { continue };
let path = entry.path();
if !path.is_dir() {
continue;
}
let index_file = path.join("index.bin");
if let Ok(meta) = fs::metadata(&index_file) {
let size = meta.len();
let mtime = meta.modified().unwrap_or(SystemTime::UNIX_EPOCH);
total += size;
dirs.push((path, size, mtime));
}
}
if total <= MAX_CACHE_BYTES {
return;
}
dirs.sort_by_key(|(_, _, mtime)| *mtime);
for (dir, size, _) in &dirs {
if total <= MAX_CACHE_BYTES {
break;
}
let _ = fs::remove_dir_all(dir);
total -= size;
}
}