use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::UNIX_EPOCH;
use anyhow::{Context, Result, bail};
use ignore::{WalkBuilder, WalkState};
use rayon::prelude::*;
use roaring::RoaringBitmap;
use rustc_hash::{FxHashMap, FxHashSet};
use crate::query::Query;
use crate::trigram::{self, Trigram};
const SNAPSHOT_MAGIC: &[u8; 8] = b"RGXIDX01";
const BINARY_SNIFF_BYTES: usize = 1024;
struct FileEntry {
path: PathBuf,
size: u64,
mtime_ns: u64,
live: bool,
}
#[derive(Default)]
pub struct Index {
entries: Vec<FileEntry>,
path_to_id: FxHashMap<PathBuf, u32>,
postings: FxHashMap<u32, RoaringBitmap>,
}
impl Index {
pub fn file_count(&self) -> usize {
self.entries.iter().filter(|e| e.live).count()
}
pub fn trigram_count(&self) -> usize {
self.postings.len()
}
pub fn build(root: impl AsRef<Path>) -> Index {
let paths = walk_files(root.as_ref());
Self::from_paths(&paths, &AtomicUsize::new(0))
}
pub fn from_paths(paths: &[PathBuf], progress: &AtomicUsize) -> Index {
let (metas, postings) = index_files(paths, progress);
let entries: Vec<FileEntry> = paths
.iter()
.cloned()
.zip(metas)
.map(|(path, (size, mtime_ns))| FileEntry {
path,
size,
mtime_ns,
live: true,
})
.collect();
let path_to_id = entries
.iter()
.enumerate()
.map(|(id, e)| (e.path.clone(), id as u32))
.collect();
Index {
entries,
path_to_id,
postings,
}
}
pub fn apply_changes(&mut self, changed: &[PathBuf], removed: &[PathBuf]) {
for path in removed {
if let Some(&id) = self.path_to_id.get(path) {
self.entries[id as usize].live = false;
}
}
let mut seen = FxHashSet::default();
for path in changed {
let (size, mtime_ns) = stat(path);
let Ok(bytes) = std::fs::read(path) else {
if let Some(&id) = self.path_to_id.get(path) {
self.entries[id as usize].live = false;
}
continue;
};
let id = self.intern(path, size, mtime_ns);
if collect_trigrams(&bytes, &mut seen) {
for &t in &seen {
self.postings
.entry(trigram::pack(t))
.or_default()
.insert(id);
}
}
}
}
pub fn reconcile(&mut self, root: impl AsRef<Path>) -> usize {
let walked = walk_files(root.as_ref());
let walked_set: rustc_hash::FxHashSet<&Path> = walked.iter().map(|p| p.as_path()).collect();
let mut changed = Vec::new();
for p in &walked {
match self.path_to_id.get(p) {
None => changed.push(p.clone()),
Some(&id) => {
let e = &self.entries[id as usize];
let (size, mtime_ns) = stat(p);
if !e.live || e.size != size || e.mtime_ns != mtime_ns {
changed.push(p.clone());
}
}
}
}
let removed: Vec<PathBuf> = self
.entries
.iter()
.filter(|e| e.live && !walked_set.contains(e.path.as_path()))
.map(|e| e.path.clone())
.collect();
let n = changed.len() + removed.len();
self.apply_changes(&changed, &removed);
n
}
fn intern(&mut self, path: &Path, size: u64, mtime_ns: u64) -> u32 {
if let Some(&id) = self.path_to_id.get(path) {
let e = &mut self.entries[id as usize];
e.size = size;
e.mtime_ns = mtime_ns;
e.live = true;
return id;
}
let id = self.entries.len() as u32;
self.entries.push(FileEntry {
path: path.to_path_buf(),
size,
mtime_ns,
live: true,
});
self.path_to_id.insert(path.to_path_buf(), id);
id
}
pub fn candidates(&self, query: &Query) -> Vec<&Path> {
let bitmap = self.eval(query);
bitmap
.iter()
.filter(|&id| self.entries[id as usize].live)
.map(|id| self.entries[id as usize].path.as_path())
.collect()
}
pub fn find(
&self,
needle: &str,
after: Option<&str>,
limit: usize,
) -> (Vec<&Path>, usize, usize) {
let mut hits: Vec<&Path> = self
.entries
.iter()
.filter(|e| e.live && e.path.to_string_lossy().contains(needle))
.map(|e| e.path.as_path())
.collect();
hits.sort_by_cached_key(|p| p.to_string_lossy().into_owned());
let total = hits.len();
let start = after.map_or(0, |a| {
hits.partition_point(|p| p.to_string_lossy().as_ref() <= a)
});
hits.drain(..start);
hits.truncate(limit);
(hits, total, start)
}
pub fn memory_bytes(&self) -> u64 {
self.postings
.values()
.map(|b| b.serialized_size() as u64)
.sum()
}
fn eval(&self, query: &Query) -> RoaringBitmap {
match query {
Query::All => self.all_ids(),
Query::Tri(t) => self
.postings
.get(&trigram::pack(*t))
.cloned()
.unwrap_or_default(),
Query::And(qs) => qs
.iter()
.map(|q| self.eval(q))
.reduce(|a, b| a & b)
.unwrap_or_else(|| self.all_ids()),
Query::Or(qs) => qs
.iter()
.map(|q| self.eval(q))
.reduce(|a, b| a | b)
.unwrap_or_default(),
}
}
fn all_ids(&self) -> RoaringBitmap {
(0..self.entries.len() as u32).collect()
}
pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
if let Some(dir) = path.parent() {
std::fs::create_dir_all(dir).ok();
}
let tmp = path.with_extension("tmp");
let mut w = std::io::BufWriter::new(std::fs::File::create(&tmp)?);
w.write_all(SNAPSHOT_MAGIC)?;
write_u64(&mut w, self.entries.len() as u64)?;
for e in &self.entries {
w.write_all(&[e.live as u8])?;
write_u64(&mut w, e.size)?;
write_u64(&mut w, e.mtime_ns)?;
let pb = path_to_bytes(&e.path);
write_u32(&mut w, pb.len() as u32)?;
w.write_all(&pb)?;
}
write_u64(&mut w, self.postings.len() as u64)?;
let mut buf = Vec::new();
for (&key, bm) in &self.postings {
write_u32(&mut w, key)?;
buf.clear();
bm.serialize_into(&mut buf)?;
write_u32(&mut w, buf.len() as u32)?;
w.write_all(&buf)?;
}
w.flush()?;
drop(w);
std::fs::rename(&tmp, path).context("rename snapshot into place")?;
Ok(())
}
pub fn load(path: impl AsRef<Path>) -> Result<Index> {
let mut r = std::io::BufReader::new(std::fs::File::open(path)?);
let mut magic = [0u8; 8];
r.read_exact(&mut magic)?;
if &magic != SNAPSHOT_MAGIC {
bail!("snapshot version mismatch");
}
let n = read_u64(&mut r)? as usize;
let mut entries = Vec::with_capacity(n);
let mut path_to_id = FxHashMap::default();
for id in 0..n {
let mut live = [0u8; 1];
r.read_exact(&mut live)?;
let size = read_u64(&mut r)?;
let mtime_ns = read_u64(&mut r)?;
let plen = read_u32(&mut r)? as usize;
let mut pb = vec![0u8; plen];
r.read_exact(&mut pb)?;
let path = path_from_bytes(&pb);
path_to_id.insert(path.clone(), id as u32);
entries.push(FileEntry {
path,
size,
mtime_ns,
live: live[0] != 0,
});
}
let np = read_u64(&mut r)? as usize;
let n_entries = entries.len() as u32;
let mut postings = FxHashMap::default();
for _ in 0..np {
let key = read_u32(&mut r)?;
let blen = read_u32(&mut r)? as usize;
let mut bb = vec![0u8; blen];
r.read_exact(&mut bb)?;
let bm = RoaringBitmap::deserialize_from(&bb[..])?;
if bm.max().is_some_and(|m| m >= n_entries) {
bail!("snapshot posting references out-of-range file id");
}
postings.insert(key, bm);
}
Ok(Index {
entries,
path_to_id,
postings,
})
}
}
#[cfg(unix)]
fn path_to_bytes(p: &Path) -> std::borrow::Cow<'_, [u8]> {
use std::os::unix::ffi::OsStrExt;
std::borrow::Cow::Borrowed(p.as_os_str().as_bytes())
}
#[cfg(not(unix))]
fn path_to_bytes(p: &Path) -> std::borrow::Cow<'_, [u8]> {
std::borrow::Cow::Owned(p.to_string_lossy().into_owned().into_bytes())
}
#[cfg(unix)]
fn path_from_bytes(b: &[u8]) -> PathBuf {
use std::os::unix::ffi::OsStrExt;
PathBuf::from(std::ffi::OsStr::from_bytes(b))
}
#[cfg(not(unix))]
fn path_from_bytes(b: &[u8]) -> PathBuf {
PathBuf::from(String::from_utf8_lossy(b).into_owned())
}
fn write_u32(w: &mut impl Write, v: u32) -> std::io::Result<()> {
w.write_all(&v.to_le_bytes())
}
fn write_u64(w: &mut impl Write, v: u64) -> std::io::Result<()> {
w.write_all(&v.to_le_bytes())
}
fn read_u32(r: &mut impl Read) -> std::io::Result<u32> {
let mut b = [0u8; 4];
r.read_exact(&mut b)?;
Ok(u32::from_le_bytes(b))
}
fn read_u64(r: &mut impl Read) -> std::io::Result<u64> {
let mut b = [0u8; 8];
r.read_exact(&mut b)?;
Ok(u64::from_le_bytes(b))
}
pub fn walk_builder(root: &Path) -> WalkBuilder {
let mut b = WalkBuilder::new(root);
b.add_custom_ignore_filename(".rgignore");
b
}
pub fn walk_files(root: &Path) -> Vec<PathBuf> {
let found = Mutex::new(Vec::<PathBuf>::new());
walk_builder(root).build_parallel().run(|| {
let found = &found;
Box::new(move |res| {
if let Ok(entry) = res
&& entry.file_type().is_some_and(|t| t.is_file())
{
found.lock().unwrap().push(entry.into_path());
}
WalkState::Continue
})
});
let mut paths = found.into_inner().unwrap();
paths.sort();
paths
}
fn index_files(
paths: &[PathBuf],
progress: &AtomicUsize,
) -> (Vec<(u64, u64)>, FxHashMap<u32, RoaringBitmap>) {
const SHARDS: usize = 256;
const TRIGRAM_WORDS: usize = (1 << 24) / 64; let shards: Vec<Mutex<FxHashMap<u32, RoaringBitmap>>> = (0..SHARDS)
.map(|_| Mutex::new(FxHashMap::default()))
.collect();
let init = || {
(
vec![0u64; TRIGRAM_WORDS],
Vec::<u32>::new(),
vec![Vec::<u32>::new(); SHARDS],
)
};
let metas: Vec<(u64, u64)> = paths
.par_iter()
.enumerate()
.map_init(init, |(bits, distinct, by_shard), (id, path)| {
progress.fetch_add(1, Ordering::Relaxed);
let id = id as u32;
let (size, mtime_ns) = stat(path);
if let Ok(bytes) = std::fs::read(path)
&& !is_binary_from_start(&bytes)
{
distinct.clear();
trigram::for_each(&bytes, |t| {
let key = trigram::pack(t);
let (w, b) = ((key >> 6) as usize, key & 63);
if bits[w] & (1u64 << b) == 0 {
bits[w] |= 1u64 << b;
distinct.push(key);
}
});
by_shard.iter_mut().for_each(Vec::clear);
for &key in distinct.iter() {
by_shard[(key as usize) & (SHARDS - 1)].push(key);
}
for (s, keys) in by_shard.iter().enumerate() {
if keys.is_empty() {
continue;
}
let mut g = shards[s].lock().unwrap();
for &key in keys {
g.entry(key).or_default().insert(id);
}
}
for &key in distinct.iter() {
bits[(key >> 6) as usize] &= !(1u64 << (key & 63));
}
}
(size, mtime_ns)
})
.collect();
let mut merged = FxHashMap::default();
for shard in shards {
merged.extend(shard.into_inner().unwrap());
}
(metas, merged)
}
fn stat(path: &Path) -> (u64, u64) {
match std::fs::metadata(path) {
Ok(m) => {
let mtime = m
.modified()
.ok()
.and_then(|t| t.duration_since(UNIX_EPOCH).ok())
.map(|d| d.as_nanos() as u64)
.unwrap_or(0);
(m.len(), mtime)
}
Err(_) => (0, 0),
}
}
fn is_binary_from_start(bytes: &[u8]) -> bool {
memchr::memchr(0, &bytes[..bytes.len().min(BINARY_SNIFF_BYTES)]).is_some()
}
fn collect_trigrams(bytes: &[u8], seen: &mut FxHashSet<Trigram>) -> bool {
if is_binary_from_start(bytes) {
return false;
}
seen.clear();
trigram::for_each(bytes, |t| {
seen.insert(t);
});
true
}
#[cfg(test)]
mod tests {
use super::*;
use crate::query::Options;
fn write(dir: &Path, name: &str, content: &[u8]) {
std::fs::write(dir.join(name), content).unwrap();
}
fn names(c: Vec<&Path>) -> Vec<String> {
let mut v: Vec<String> = c
.iter()
.map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
.collect();
v.sort();
v
}
#[test]
fn build_candidates_and_binary_skip() {
let tmp = std::env::temp_dir().join(format!("rgx_idx_{}", std::process::id()));
let _ = std::fs::remove_dir_all(&tmp);
std::fs::create_dir_all(&tmp).unwrap();
write(&tmp, "a.txt", b"the quick brown fox\nNEEDLE here\n");
write(&tmp, "b.txt", b"no match in this file at all\n");
write(&tmp, "c.txt", b"another NEEDLE appears\n");
write(&tmp, "bin.dat", b"\x00\x00NEEDLE inside binary\x00");
let idx = Index::build(&tmp);
assert_eq!(idx.file_count(), 4);
let q = Query::for_pattern("NEEDLE", Options::default());
let n = names(idx.candidates(&q));
assert!(n.contains(&"a.txt".to_string()) && n.contains(&"c.txt".to_string()));
assert!(!n.contains(&"b.txt".to_string()) && !n.contains(&"bin.dat".to_string()));
let fb = names(idx.candidates(&Query::for_pattern("a.", Options::default())));
assert_eq!(fb, vec!["a.txt", "b.txt", "bin.dat", "c.txt"]);
let _ = std::fs::remove_dir_all(&tmp);
}
#[test]
fn incremental_add_change_remove() {
let tmp = std::env::temp_dir().join(format!("rgx_inc_{}", std::process::id()));
let _ = std::fs::remove_dir_all(&tmp);
std::fs::create_dir_all(&tmp).unwrap();
write(&tmp, "a.txt", b"WIDGET alpha\n");
let mut idx = Index::build(&tmp);
let q = Query::for_pattern("WIDGET", Options::default());
assert_eq!(names(idx.candidates(&q)), vec!["a.txt"]);
write(&tmp, "b.txt", b"WIDGET beta\n");
idx.apply_changes(&[tmp.join("b.txt")], &[]);
let mut got = names(idx.candidates(&q));
got.sort();
assert_eq!(got, vec!["a.txt", "b.txt"]);
std::fs::remove_file(tmp.join("a.txt")).unwrap();
idx.apply_changes(&[], &[tmp.join("a.txt")]);
assert_eq!(names(idx.candidates(&q)), vec!["b.txt"]);
let _ = std::fs::remove_dir_all(&tmp);
}
#[test]
fn find_returns_total_and_keyset_page() {
let tmp = std::env::temp_dir().join(format!("rgx_find_{}", std::process::id()));
let _ = std::fs::remove_dir_all(&tmp);
std::fs::create_dir_all(&tmp).unwrap();
for n in ["conf_a.txt", "conf_b.txt", "conf_c.txt", "other.txt"] {
write(&tmp, n, b"x\n");
}
let idx = Index::build(&tmp);
let (page, total, start) = idx.find("conf", None, 2);
assert_eq!(total, 3);
assert_eq!(start, 0);
assert_eq!(names(page.clone()), vec!["conf_a.txt", "conf_b.txt"]);
let after = page.last().unwrap().to_string_lossy().into_owned();
let (page2, total2, start2) = idx.find("conf", Some(&after), 2);
assert_eq!(total2, 3);
assert_eq!(start2, 2);
assert_eq!(names(page2), vec!["conf_c.txt"]);
let _ = std::fs::remove_dir_all(&tmp);
}
#[test]
fn snapshot_roundtrip() {
let tmp = std::env::temp_dir().join(format!("rgx_snap_{}", std::process::id()));
let _ = std::fs::remove_dir_all(&tmp);
std::fs::create_dir_all(&tmp).unwrap();
write(&tmp, "a.txt", b"SNAPSHOT token here\n");
write(&tmp, "b.txt", b"other content\n");
let idx = Index::build(&tmp);
let snap = tmp.join("index.bin");
idx.save(&snap).unwrap();
let loaded = Index::load(&snap).unwrap();
assert_eq!(loaded.file_count(), idx.file_count());
let q = Query::for_pattern("SNAPSHOT", Options::default());
assert_eq!(names(loaded.candidates(&q)), vec!["a.txt"]);
let _ = std::fs::remove_dir_all(&tmp);
}
}