1use std::io::{Read, Write};
13use std::path::{Path, PathBuf};
14use std::sync::Mutex;
15use std::sync::atomic::{AtomicUsize, Ordering};
16use std::time::UNIX_EPOCH;
17
18use anyhow::{Context, Result, bail};
19use ignore::{WalkBuilder, WalkState};
20use rayon::prelude::*;
21use roaring::RoaringBitmap;
22use rustc_hash::{FxHashMap, FxHashSet};
23
24use crate::query::Query;
25use crate::trigram::{self, Trigram};
26
27const SNAPSHOT_MAGIC: &[u8; 8] = b"RGXIDX01";
29
30const BINARY_SNIFF_BYTES: usize = 1024;
35
36struct FileEntry {
38 path: PathBuf,
39 size: u64,
40 mtime_ns: u64,
41 live: bool,
42}
43
44#[derive(Default)]
45pub struct Index {
46 entries: Vec<FileEntry>,
47 path_to_id: FxHashMap<PathBuf, u32>,
48 postings: FxHashMap<u32, RoaringBitmap>,
49}
50
51impl Index {
52 pub fn file_count(&self) -> usize {
54 self.entries.iter().filter(|e| e.live).count()
55 }
56
57 pub fn trigram_count(&self) -> usize {
58 self.postings.len()
59 }
60
61 pub fn build(root: impl AsRef<Path>) -> Index {
63 let paths = walk_files(root.as_ref());
64 Self::from_paths(&paths, &AtomicUsize::new(0))
65 }
66
67 pub fn from_paths(paths: &[PathBuf], progress: &AtomicUsize) -> Index {
70 let (metas, postings) = index_files(paths, progress);
71 let entries: Vec<FileEntry> = paths
72 .iter()
73 .cloned()
74 .zip(metas)
75 .map(|(path, (size, mtime_ns))| FileEntry {
76 path,
77 size,
78 mtime_ns,
79 live: true,
80 })
81 .collect();
82 let path_to_id = entries
83 .iter()
84 .enumerate()
85 .map(|(id, e)| (e.path.clone(), id as u32))
86 .collect();
87 Index {
88 entries,
89 path_to_id,
90 postings,
91 }
92 }
93
94 pub fn apply_changes(&mut self, changed: &[PathBuf], removed: &[PathBuf]) {
96 for path in removed {
97 if let Some(&id) = self.path_to_id.get(path) {
98 self.entries[id as usize].live = false;
99 }
100 }
101 let mut seen = FxHashSet::default();
102 for path in changed {
103 let (size, mtime_ns) = stat(path);
107 let Ok(bytes) = std::fs::read(path) else {
108 if let Some(&id) = self.path_to_id.get(path) {
110 self.entries[id as usize].live = false;
111 }
112 continue;
113 };
114 let id = self.intern(path, size, mtime_ns);
115 if collect_trigrams(&bytes, &mut seen) {
116 for &t in &seen {
117 self.postings
118 .entry(trigram::pack(t))
119 .or_default()
120 .insert(id);
121 }
122 }
123 }
124 }
125
126 pub fn reconcile(&mut self, root: impl AsRef<Path>) -> usize {
130 let walked = walk_files(root.as_ref());
131 let walked_set: rustc_hash::FxHashSet<&Path> = walked.iter().map(|p| p.as_path()).collect();
132 let mut changed = Vec::new();
133 for p in &walked {
134 match self.path_to_id.get(p) {
135 None => changed.push(p.clone()),
136 Some(&id) => {
137 let e = &self.entries[id as usize];
138 let (size, mtime_ns) = stat(p);
139 if !e.live || e.size != size || e.mtime_ns != mtime_ns {
140 changed.push(p.clone());
141 }
142 }
143 }
144 }
145 let removed: Vec<PathBuf> = self
146 .entries
147 .iter()
148 .filter(|e| e.live && !walked_set.contains(e.path.as_path()))
149 .map(|e| e.path.clone())
150 .collect();
151 let n = changed.len() + removed.len();
152 self.apply_changes(&changed, &removed);
153 n
154 }
155
156 fn intern(&mut self, path: &Path, size: u64, mtime_ns: u64) -> u32 {
158 if let Some(&id) = self.path_to_id.get(path) {
159 let e = &mut self.entries[id as usize];
160 e.size = size;
161 e.mtime_ns = mtime_ns;
162 e.live = true;
163 return id;
164 }
165 let id = self.entries.len() as u32;
166 self.entries.push(FileEntry {
167 path: path.to_path_buf(),
168 size,
169 mtime_ns,
170 live: true,
171 });
172 self.path_to_id.insert(path.to_path_buf(), id);
173 id
174 }
175
176 pub fn candidates(&self, query: &Query) -> Vec<&Path> {
180 let bitmap = self.eval(query);
181 bitmap
182 .iter()
183 .filter(|&id| self.entries[id as usize].live)
184 .map(|id| self.entries[id as usize].path.as_path())
185 .collect()
186 }
187
188 pub fn find(
194 &self,
195 needle: &str,
196 after: Option<&str>,
197 limit: usize,
198 ) -> (Vec<&Path>, usize, usize) {
199 let mut hits: Vec<&Path> = self
200 .entries
201 .iter()
202 .filter(|e| e.live && e.path.to_string_lossy().contains(needle))
203 .map(|e| e.path.as_path())
204 .collect();
205 hits.sort_by_cached_key(|p| p.to_string_lossy().into_owned());
209 let total = hits.len();
210 let start = after.map_or(0, |a| {
211 hits.partition_point(|p| p.to_string_lossy().as_ref() <= a)
212 });
213 hits.drain(..start);
214 hits.truncate(limit);
215 (hits, total, start)
216 }
217
218 pub fn memory_bytes(&self) -> u64 {
220 self.postings
221 .values()
222 .map(|b| b.serialized_size() as u64)
223 .sum()
224 }
225
226 fn eval(&self, query: &Query) -> RoaringBitmap {
229 match query {
230 Query::All => self.all_ids(),
231 Query::Tri(t) => self
232 .postings
233 .get(&trigram::pack(*t))
234 .cloned()
235 .unwrap_or_default(),
236 Query::And(qs) => qs
239 .iter()
240 .map(|q| self.eval(q))
241 .reduce(|a, b| a & b)
242 .unwrap_or_else(|| self.all_ids()),
243 Query::Or(qs) => qs
244 .iter()
245 .map(|q| self.eval(q))
246 .reduce(|a, b| a | b)
247 .unwrap_or_default(),
248 }
249 }
250
251 fn all_ids(&self) -> RoaringBitmap {
252 (0..self.entries.len() as u32).collect()
253 }
254
255 pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
257 let path = path.as_ref();
258 if let Some(dir) = path.parent() {
259 std::fs::create_dir_all(dir).ok();
260 }
261 let tmp = path.with_extension("tmp");
262 let mut w = std::io::BufWriter::new(std::fs::File::create(&tmp)?);
263 w.write_all(SNAPSHOT_MAGIC)?;
264 write_u64(&mut w, self.entries.len() as u64)?;
265 for e in &self.entries {
266 w.write_all(&[e.live as u8])?;
267 write_u64(&mut w, e.size)?;
268 write_u64(&mut w, e.mtime_ns)?;
269 let pb = path_to_bytes(&e.path);
270 write_u32(&mut w, pb.len() as u32)?;
271 w.write_all(&pb)?;
272 }
273 write_u64(&mut w, self.postings.len() as u64)?;
274 let mut buf = Vec::new();
275 for (&key, bm) in &self.postings {
276 write_u32(&mut w, key)?;
277 buf.clear();
278 bm.serialize_into(&mut buf)?;
279 write_u32(&mut w, buf.len() as u32)?;
280 w.write_all(&buf)?;
281 }
282 w.flush()?;
283 drop(w);
284 std::fs::rename(&tmp, path).context("rename snapshot into place")?;
285 Ok(())
286 }
287
288 pub fn load(path: impl AsRef<Path>) -> Result<Index> {
290 let mut r = std::io::BufReader::new(std::fs::File::open(path)?);
291 let mut magic = [0u8; 8];
292 r.read_exact(&mut magic)?;
293 if &magic != SNAPSHOT_MAGIC {
294 bail!("snapshot version mismatch");
295 }
296 let n = read_u64(&mut r)? as usize;
297 let mut entries = Vec::with_capacity(n);
298 let mut path_to_id = FxHashMap::default();
299 for id in 0..n {
300 let mut live = [0u8; 1];
301 r.read_exact(&mut live)?;
302 let size = read_u64(&mut r)?;
303 let mtime_ns = read_u64(&mut r)?;
304 let plen = read_u32(&mut r)? as usize;
305 let mut pb = vec![0u8; plen];
306 r.read_exact(&mut pb)?;
307 let path = path_from_bytes(&pb);
308 path_to_id.insert(path.clone(), id as u32);
309 entries.push(FileEntry {
310 path,
311 size,
312 mtime_ns,
313 live: live[0] != 0,
314 });
315 }
316 let np = read_u64(&mut r)? as usize;
317 let n_entries = entries.len() as u32;
318 let mut postings = FxHashMap::default();
319 for _ in 0..np {
320 let key = read_u32(&mut r)?;
321 let blen = read_u32(&mut r)? as usize;
322 let mut bb = vec![0u8; blen];
323 r.read_exact(&mut bb)?;
324 let bm = RoaringBitmap::deserialize_from(&bb[..])?;
325 if bm.max().is_some_and(|m| m >= n_entries) {
328 bail!("snapshot posting references out-of-range file id");
329 }
330 postings.insert(key, bm);
331 }
332 Ok(Index {
333 entries,
334 path_to_id,
335 postings,
336 })
337 }
338}
339
340#[cfg(unix)]
344fn path_to_bytes(p: &Path) -> std::borrow::Cow<'_, [u8]> {
345 use std::os::unix::ffi::OsStrExt;
346 std::borrow::Cow::Borrowed(p.as_os_str().as_bytes())
347}
348#[cfg(not(unix))]
349fn path_to_bytes(p: &Path) -> std::borrow::Cow<'_, [u8]> {
350 std::borrow::Cow::Owned(p.to_string_lossy().into_owned().into_bytes())
351}
352
353#[cfg(unix)]
354fn path_from_bytes(b: &[u8]) -> PathBuf {
355 use std::os::unix::ffi::OsStrExt;
356 PathBuf::from(std::ffi::OsStr::from_bytes(b))
357}
358#[cfg(not(unix))]
359fn path_from_bytes(b: &[u8]) -> PathBuf {
360 PathBuf::from(String::from_utf8_lossy(b).into_owned())
361}
362
363fn write_u32(w: &mut impl Write, v: u32) -> std::io::Result<()> {
364 w.write_all(&v.to_le_bytes())
365}
366fn write_u64(w: &mut impl Write, v: u64) -> std::io::Result<()> {
367 w.write_all(&v.to_le_bytes())
368}
369fn read_u32(r: &mut impl Read) -> std::io::Result<u32> {
370 let mut b = [0u8; 4];
371 r.read_exact(&mut b)?;
372 Ok(u32::from_le_bytes(b))
373}
374fn read_u64(r: &mut impl Read) -> std::io::Result<u64> {
375 let mut b = [0u8; 8];
376 r.read_exact(&mut b)?;
377 Ok(u64::from_le_bytes(b))
378}
379
380pub fn walk_builder(root: &Path) -> WalkBuilder {
387 let mut b = WalkBuilder::new(root);
388 b.add_custom_ignore_filename(".rgignore");
389 b
390}
391
392pub fn walk_files(root: &Path) -> Vec<PathBuf> {
394 let found = Mutex::new(Vec::<PathBuf>::new());
395 walk_builder(root).build_parallel().run(|| {
396 let found = &found;
397 Box::new(move |res| {
398 if let Ok(entry) = res
399 && entry.file_type().is_some_and(|t| t.is_file())
400 {
401 found.lock().unwrap().push(entry.into_path());
402 }
403 WalkState::Continue
404 })
405 });
406 let mut paths = found.into_inner().unwrap();
407 paths.sort();
408 paths
409}
410
411fn index_files(
414 paths: &[PathBuf],
415 progress: &AtomicUsize,
416) -> (Vec<(u64, u64)>, FxHashMap<u32, RoaringBitmap>) {
417 const SHARDS: usize = 256;
418 const TRIGRAM_WORDS: usize = (1 << 24) / 64; let shards: Vec<Mutex<FxHashMap<u32, RoaringBitmap>>> = (0..SHARDS)
423 .map(|_| Mutex::new(FxHashMap::default()))
424 .collect();
425
426 let init = || {
430 (
431 vec![0u64; TRIGRAM_WORDS],
432 Vec::<u32>::new(),
433 vec![Vec::<u32>::new(); SHARDS],
434 )
435 };
436 let metas: Vec<(u64, u64)> = paths
437 .par_iter()
438 .enumerate()
439 .map_init(init, |(bits, distinct, by_shard), (id, path)| {
440 progress.fetch_add(1, Ordering::Relaxed);
441 let id = id as u32;
442 let (size, mtime_ns) = stat(path);
443 if let Ok(bytes) = std::fs::read(path)
444 && !is_binary_from_start(&bytes)
445 {
446 distinct.clear();
447 trigram::for_each(&bytes, |t| {
448 let key = trigram::pack(t);
449 let (w, b) = ((key >> 6) as usize, key & 63);
450 if bits[w] & (1u64 << b) == 0 {
451 bits[w] |= 1u64 << b;
452 distinct.push(key);
453 }
454 });
455 by_shard.iter_mut().for_each(Vec::clear);
456 for &key in distinct.iter() {
457 by_shard[(key as usize) & (SHARDS - 1)].push(key);
458 }
459 for (s, keys) in by_shard.iter().enumerate() {
460 if keys.is_empty() {
461 continue;
462 }
463 let mut g = shards[s].lock().unwrap();
464 for &key in keys {
465 g.entry(key).or_default().insert(id);
466 }
467 }
468 for &key in distinct.iter() {
469 bits[(key >> 6) as usize] &= !(1u64 << (key & 63));
470 }
471 }
472 (size, mtime_ns)
473 })
474 .collect();
475
476 let mut merged = FxHashMap::default();
477 for shard in shards {
478 merged.extend(shard.into_inner().unwrap());
479 }
480 (metas, merged)
481}
482
483fn stat(path: &Path) -> (u64, u64) {
484 match std::fs::metadata(path) {
485 Ok(m) => {
486 let mtime = m
487 .modified()
488 .ok()
489 .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
490 .map(|d| d.as_nanos() as u64)
491 .unwrap_or(0);
492 (m.len(), mtime)
493 }
494 Err(_) => (0, 0),
495 }
496}
497
498fn is_binary_from_start(bytes: &[u8]) -> bool {
501 memchr::memchr(0, &bytes[..bytes.len().min(BINARY_SNIFF_BYTES)]).is_some()
502}
503
504fn collect_trigrams(bytes: &[u8], seen: &mut FxHashSet<Trigram>) -> bool {
508 if is_binary_from_start(bytes) {
509 return false;
510 }
511 seen.clear();
512 trigram::for_each(bytes, |t| {
513 seen.insert(t);
514 });
515 true
516}
517
518#[cfg(test)]
519mod tests {
520 use super::*;
521 use crate::query::Options;
522
523 fn write(dir: &Path, name: &str, content: &[u8]) {
524 std::fs::write(dir.join(name), content).unwrap();
525 }
526
527 fn names(c: Vec<&Path>) -> Vec<String> {
528 let mut v: Vec<String> = c
529 .iter()
530 .map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
531 .collect();
532 v.sort();
533 v
534 }
535
536 #[test]
537 fn build_candidates_and_binary_skip() {
538 let tmp = std::env::temp_dir().join(format!("rgx_idx_{}", std::process::id()));
539 let _ = std::fs::remove_dir_all(&tmp);
540 std::fs::create_dir_all(&tmp).unwrap();
541 write(&tmp, "a.txt", b"the quick brown fox\nNEEDLE here\n");
542 write(&tmp, "b.txt", b"no match in this file at all\n");
543 write(&tmp, "c.txt", b"another NEEDLE appears\n");
544 write(&tmp, "bin.dat", b"\x00\x00NEEDLE inside binary\x00");
545
546 let idx = Index::build(&tmp);
547 assert_eq!(idx.file_count(), 4);
548 let q = Query::for_pattern("NEEDLE", Options::default());
549 let n = names(idx.candidates(&q));
550 assert!(n.contains(&"a.txt".to_string()) && n.contains(&"c.txt".to_string()));
551 assert!(!n.contains(&"b.txt".to_string()) && !n.contains(&"bin.dat".to_string()));
552
553 let fb = names(idx.candidates(&Query::for_pattern("a.", Options::default())));
555 assert_eq!(fb, vec!["a.txt", "b.txt", "bin.dat", "c.txt"]);
556 let _ = std::fs::remove_dir_all(&tmp);
557 }
558
559 #[test]
560 fn incremental_add_change_remove() {
561 let tmp = std::env::temp_dir().join(format!("rgx_inc_{}", std::process::id()));
562 let _ = std::fs::remove_dir_all(&tmp);
563 std::fs::create_dir_all(&tmp).unwrap();
564 write(&tmp, "a.txt", b"WIDGET alpha\n");
565 let mut idx = Index::build(&tmp);
566 let q = Query::for_pattern("WIDGET", Options::default());
567 assert_eq!(names(idx.candidates(&q)), vec!["a.txt"]);
568
569 write(&tmp, "b.txt", b"WIDGET beta\n");
571 idx.apply_changes(&[tmp.join("b.txt")], &[]);
572 let mut got = names(idx.candidates(&q));
573 got.sort();
574 assert_eq!(got, vec!["a.txt", "b.txt"]);
575
576 std::fs::remove_file(tmp.join("a.txt")).unwrap();
578 idx.apply_changes(&[], &[tmp.join("a.txt")]);
579 assert_eq!(names(idx.candidates(&q)), vec!["b.txt"]);
580 let _ = std::fs::remove_dir_all(&tmp);
581 }
582
583 #[test]
584 fn find_returns_total_and_keyset_page() {
585 let tmp = std::env::temp_dir().join(format!("rgx_find_{}", std::process::id()));
586 let _ = std::fs::remove_dir_all(&tmp);
587 std::fs::create_dir_all(&tmp).unwrap();
588 for n in ["conf_a.txt", "conf_b.txt", "conf_c.txt", "other.txt"] {
589 write(&tmp, n, b"x\n");
590 }
591 let idx = Index::build(&tmp);
592
593 let (page, total, start) = idx.find("conf", None, 2);
594 assert_eq!(total, 3);
595 assert_eq!(start, 0);
596 assert_eq!(names(page.clone()), vec!["conf_a.txt", "conf_b.txt"]);
597
598 let after = page.last().unwrap().to_string_lossy().into_owned();
600 let (page2, total2, start2) = idx.find("conf", Some(&after), 2);
601 assert_eq!(total2, 3);
602 assert_eq!(start2, 2);
603 assert_eq!(names(page2), vec!["conf_c.txt"]);
604 let _ = std::fs::remove_dir_all(&tmp);
605 }
606
607 #[test]
608 fn snapshot_roundtrip() {
609 let tmp = std::env::temp_dir().join(format!("rgx_snap_{}", std::process::id()));
610 let _ = std::fs::remove_dir_all(&tmp);
611 std::fs::create_dir_all(&tmp).unwrap();
612 write(&tmp, "a.txt", b"SNAPSHOT token here\n");
613 write(&tmp, "b.txt", b"other content\n");
614 let idx = Index::build(&tmp);
615 let snap = tmp.join("index.bin");
616 idx.save(&snap).unwrap();
617 let loaded = Index::load(&snap).unwrap();
618 assert_eq!(loaded.file_count(), idx.file_count());
619 let q = Query::for_pattern("SNAPSHOT", Options::default());
620 assert_eq!(names(loaded.candidates(&q)), vec!["a.txt"]);
621 let _ = std::fs::remove_dir_all(&tmp);
622 }
623}