ripgrepx 0.4.0

Instant ripgrep via a persistent candidate index, for the terminal and AI agents over MCP
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
//! The in-memory candidate index: trigram -> set of file IDs, plus a file table.
//!
//! This is the "narrow the candidate set" half of rgx. Building, querying, incremental update, and
//! persistence are kept separable from the walk and from ripgrep so each can be tested in isolation.
//!
//! Incremental updates are deliberately simple and lean on the confirm step for correctness: when a
//! file changes we add its *current* trigrams (so no new-trigram query can miss it); trigrams that
//! the file no longer contains linger in their posting lists, which only ever produces an extra
//! candidate that ripgrep then filters out. Deleted files are tombstoned (`live = false`) so they
//! stop being candidates. See `docs/index-and-storage.md` sections 3.1 and 5.

use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::UNIX_EPOCH;

use anyhow::{Context, Result, bail};
use ignore::{WalkBuilder, WalkState};
use rayon::prelude::*;
use roaring::RoaringBitmap;
use rustc_hash::{FxHashMap, FxHashSet};

use crate::query::Query;
use crate::trigram::{self, Trigram};

/// Snapshot magic + format version. Bump to invalidate (load returns an error -> caller rebuilds).
const SNAPSHOT_MAGIC: &[u8; 8] = b"RGXIDX01";

/// Bytes from the start of a file scanned for a NUL to decide "binary from the start". Conservative:
/// if a NUL is in here, ripgrep also treats the file as binary and prints nothing for it in a
/// recursive search, so giving it no postings is sound. Deeper NULs are handled by the confirm
/// step. See `docs/index-and-storage.md` section 3.1.
const BINARY_SNIFF_BYTES: usize = 1024;

/// A file known to the index. `live` is false once the file is deleted (tombstoned).
struct FileEntry {
    path: PathBuf,
    size: u64,
    mtime_ns: u64,
    live: bool,
}

#[derive(Default)]
pub struct Index {
    entries: Vec<FileEntry>,
    path_to_id: FxHashMap<PathBuf, u32>,
    postings: FxHashMap<u32, RoaringBitmap>,
}

impl Index {
    /// Number of live (non-deleted) files known to the index.
    pub fn file_count(&self) -> usize {
        self.entries.iter().filter(|e| e.live).count()
    }

    pub fn trigram_count(&self) -> usize {
        self.postings.len()
    }

    /// Build an index over `root`, honoring the same ignore rules ripgrep uses by default.
    pub fn build(root: impl AsRef<Path>) -> Index {
        let paths = walk_files(root.as_ref());
        Self::from_paths(&paths, &AtomicUsize::new(0))
    }

    /// Build an index from an already-walked path set, bumping `progress` once per file processed so
    /// a watcher can show a climbing count during a cold build.
    pub fn from_paths(paths: &[PathBuf], progress: &AtomicUsize) -> Index {
        let (metas, postings) = index_files(paths, progress);
        let entries: Vec<FileEntry> = paths
            .iter()
            .cloned()
            .zip(metas)
            .map(|(path, (size, mtime_ns))| FileEntry {
                path,
                size,
                mtime_ns,
                live: true,
            })
            .collect();
        let path_to_id = entries
            .iter()
            .enumerate()
            .map(|(id, e)| (e.path.clone(), id as u32))
            .collect();
        Index {
            entries,
            path_to_id,
            postings,
        }
    }

    /// Apply a set of changed and removed paths to the resident index (incremental update).
    pub fn apply_changes(&mut self, changed: &[PathBuf], removed: &[PathBuf]) {
        for path in removed {
            if let Some(&id) = self.path_to_id.get(path) {
                self.entries[id as usize].live = false;
            }
        }
        let mut seen = FxHashSet::default();
        for path in changed {
            // Stat BEFORE read: if the file is rewritten between our read and stat, we must record an
            // mtime no newer than the bytes we indexed, so the next reconcile still sees a change and
            // re-indexes. Stat-after-read could store the new mtime over old trigrams and miss it.
            let (size, mtime_ns) = stat(path);
            let Ok(bytes) = std::fs::read(path) else {
                // Unreadable now (perhaps deleted between event and handling): tombstone if known.
                if let Some(&id) = self.path_to_id.get(path) {
                    self.entries[id as usize].live = false;
                }
                continue;
            };
            let id = self.intern(path, size, mtime_ns);
            if collect_trigrams(&bytes, &mut seen) {
                for &t in &seen {
                    self.postings
                        .entry(trigram::pack(t))
                        .or_default()
                        .insert(id);
                }
            }
        }
    }

    /// Reconcile the index against the current tree by `(size, mtime)`: index new/changed files and
    /// tombstone vanished ones. Used at daemon startup to catch changes made while it was down.
    /// Returns the number of files re-indexed plus removed.
    pub fn reconcile(&mut self, root: impl AsRef<Path>) -> usize {
        let walked = walk_files(root.as_ref());
        let walked_set: rustc_hash::FxHashSet<&Path> = walked.iter().map(|p| p.as_path()).collect();
        let mut changed = Vec::new();
        for p in &walked {
            match self.path_to_id.get(p) {
                None => changed.push(p.clone()),
                Some(&id) => {
                    let e = &self.entries[id as usize];
                    let (size, mtime_ns) = stat(p);
                    if !e.live || e.size != size || e.mtime_ns != mtime_ns {
                        changed.push(p.clone());
                    }
                }
            }
        }
        let removed: Vec<PathBuf> = self
            .entries
            .iter()
            .filter(|e| e.live && !walked_set.contains(e.path.as_path()))
            .map(|e| e.path.clone())
            .collect();
        let n = changed.len() + removed.len();
        self.apply_changes(&changed, &removed);
        n
    }

    /// Get the id for `path`, creating a live entry if new; refresh metadata and revive if known.
    fn intern(&mut self, path: &Path, size: u64, mtime_ns: u64) -> u32 {
        if let Some(&id) = self.path_to_id.get(path) {
            let e = &mut self.entries[id as usize];
            e.size = size;
            e.mtime_ns = mtime_ns;
            e.live = true;
            return id;
        }
        let id = self.entries.len() as u32;
        self.entries.push(FileEntry {
            path: path.to_path_buf(),
            size,
            mtime_ns,
            live: true,
        });
        self.path_to_id.insert(path.to_path_buf(), id);
        id
    }

    /// Resolve a trigram query to candidate files (a sound superset of the real matches). A fallback
    /// query (no usable trigram) simply makes *every* live file a candidate — there is no separate
    /// "scan the tree" path; ripgrep then confirms over whatever this returns.
    pub fn candidates(&self, query: &Query) -> Vec<&Path> {
        let bitmap = self.eval(query);
        bitmap
            .iter()
            .filter(|&id| self.entries[id as usize].live)
            .map(|id| self.entries[id as usize].path.as_path())
            .collect()
    }

    /// File/dir name lookup (fd/find-style): live paths whose string contains `needle`. Returns the
    /// keyset page (paths whose string is strictly greater than `after`, capped at `limit`), the total
    /// number of matches, and `start` (matches skipped before this page) so callers can report an
    /// honest "X-Y of total". Sorted by path string — the same order callers keyset on — so paging is
    /// deterministic and never skips or repeats.
    pub fn find(
        &self,
        needle: &str,
        after: Option<&str>,
        limit: usize,
    ) -> (Vec<&Path>, usize, usize) {
        let mut hits: Vec<&Path> = self
            .entries
            .iter()
            .filter(|e| e.live && e.path.to_string_lossy().contains(needle))
            .map(|e| e.path.as_path())
            .collect();
        // Sort by the path string (the order callers keyset on). `sort_by_cached_key` decodes each
        // path to a String once (O(n)), versus a comparator that would re-decode both sides on every
        // one of the O(n log n) comparisons.
        hits.sort_by_cached_key(|p| p.to_string_lossy().into_owned());
        let total = hits.len();
        let start = after.map_or(0, |a| {
            hits.partition_point(|p| p.to_string_lossy().as_ref() <= a)
        });
        hits.drain(..start);
        hits.truncate(limit);
        (hits, total, start)
    }

    /// Approximate resident size of the posting lists (serialized roaring bytes), for status.
    pub fn memory_bytes(&self) -> u64 {
        self.postings
            .values()
            .map(|b| b.serialized_size() as u64)
            .sum()
    }

    /// Evaluate a query to the set of matching file IDs via roaring set algebra. A fallback
    /// (`Query::All`) yields every file id.
    fn eval(&self, query: &Query) -> RoaringBitmap {
        match query {
            Query::All => self.all_ids(),
            Query::Tri(t) => self
                .postings
                .get(&trigram::pack(*t))
                .cloned()
                .unwrap_or_default(),
            // An empty AND is the identity "match all", not "match nothing" — guard the soundness
            // invariant even though `Query::for_pattern` never builds an empty And/Or today.
            Query::And(qs) => qs
                .iter()
                .map(|q| self.eval(q))
                .reduce(|a, b| a & b)
                .unwrap_or_else(|| self.all_ids()),
            Query::Or(qs) => qs
                .iter()
                .map(|q| self.eval(q))
                .reduce(|a, b| a | b)
                .unwrap_or_default(),
        }
    }

    fn all_ids(&self) -> RoaringBitmap {
        (0..self.entries.len() as u32).collect()
    }

    /// Serialize the index to `path` (atomic via temp file + rename).
    pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
        let path = path.as_ref();
        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir).ok();
        }
        let tmp = path.with_extension("tmp");
        let mut w = std::io::BufWriter::new(std::fs::File::create(&tmp)?);
        w.write_all(SNAPSHOT_MAGIC)?;
        write_u64(&mut w, self.entries.len() as u64)?;
        for e in &self.entries {
            w.write_all(&[e.live as u8])?;
            write_u64(&mut w, e.size)?;
            write_u64(&mut w, e.mtime_ns)?;
            let pb = path_to_bytes(&e.path);
            write_u32(&mut w, pb.len() as u32)?;
            w.write_all(&pb)?;
        }
        write_u64(&mut w, self.postings.len() as u64)?;
        let mut buf = Vec::new();
        for (&key, bm) in &self.postings {
            write_u32(&mut w, key)?;
            buf.clear();
            bm.serialize_into(&mut buf)?;
            write_u32(&mut w, buf.len() as u32)?;
            w.write_all(&buf)?;
        }
        w.flush()?;
        drop(w);
        std::fs::rename(&tmp, path).context("rename snapshot into place")?;
        Ok(())
    }

    /// Load an index snapshot from `path`. Errors (incl. version mismatch) signal "rebuild".
    pub fn load(path: impl AsRef<Path>) -> Result<Index> {
        let mut r = std::io::BufReader::new(std::fs::File::open(path)?);
        let mut magic = [0u8; 8];
        r.read_exact(&mut magic)?;
        if &magic != SNAPSHOT_MAGIC {
            bail!("snapshot version mismatch");
        }
        let n = read_u64(&mut r)? as usize;
        let mut entries = Vec::with_capacity(n);
        let mut path_to_id = FxHashMap::default();
        for id in 0..n {
            let mut live = [0u8; 1];
            r.read_exact(&mut live)?;
            let size = read_u64(&mut r)?;
            let mtime_ns = read_u64(&mut r)?;
            let plen = read_u32(&mut r)? as usize;
            let mut pb = vec![0u8; plen];
            r.read_exact(&mut pb)?;
            let path = path_from_bytes(&pb);
            path_to_id.insert(path.clone(), id as u32);
            entries.push(FileEntry {
                path,
                size,
                mtime_ns,
                live: live[0] != 0,
            });
        }
        let np = read_u64(&mut r)? as usize;
        let n_entries = entries.len() as u32;
        let mut postings = FxHashMap::default();
        for _ in 0..np {
            let key = read_u32(&mut r)?;
            let blen = read_u32(&mut r)? as usize;
            let mut bb = vec![0u8; blen];
            r.read_exact(&mut bb)?;
            let bm = RoaringBitmap::deserialize_from(&bb[..])?;
            // A posting referencing a file id past the entries table means a corrupt or foreign
            // snapshot; reject it so a query can't panic indexing `entries[id]` (caller rebuilds).
            if bm.max().is_some_and(|m| m >= n_entries) {
                bail!("snapshot posting references out-of-range file id");
            }
            postings.insert(key, bm);
        }
        Ok(Index {
            entries,
            path_to_id,
            postings,
        })
    }
}

/// Encode a path for the snapshot. The snapshot is a per-machine rebuildable cache, so each platform
/// only needs to round-trip its own paths: Unix uses the raw OS bytes (zero-copy, unchanged); other
/// platforms use UTF-8 (lossy for the rare non-Unicode path, which the next rebuild repairs).
#[cfg(unix)]
fn path_to_bytes(p: &Path) -> std::borrow::Cow<'_, [u8]> {
    use std::os::unix::ffi::OsStrExt;
    std::borrow::Cow::Borrowed(p.as_os_str().as_bytes())
}
#[cfg(not(unix))]
fn path_to_bytes(p: &Path) -> std::borrow::Cow<'_, [u8]> {
    std::borrow::Cow::Owned(p.to_string_lossy().into_owned().into_bytes())
}

#[cfg(unix)]
fn path_from_bytes(b: &[u8]) -> PathBuf {
    use std::os::unix::ffi::OsStrExt;
    PathBuf::from(std::ffi::OsStr::from_bytes(b))
}
#[cfg(not(unix))]
fn path_from_bytes(b: &[u8]) -> PathBuf {
    PathBuf::from(String::from_utf8_lossy(b).into_owned())
}

fn write_u32(w: &mut impl Write, v: u32) -> std::io::Result<()> {
    w.write_all(&v.to_le_bytes())
}
fn write_u64(w: &mut impl Write, v: u64) -> std::io::Result<()> {
    w.write_all(&v.to_le_bytes())
}
fn read_u32(r: &mut impl Read) -> std::io::Result<u32> {
    let mut b = [0u8; 4];
    r.read_exact(&mut b)?;
    Ok(u32::from_le_bytes(b))
}
fn read_u64(r: &mut impl Read) -> std::io::Result<u64> {
    let mut b = [0u8; 8];
    r.read_exact(&mut b)?;
    Ok(u64::from_le_bytes(b))
}

/// A `WalkBuilder` that enumerates exactly the files `rg` would under `root`. The crate defaults
/// already match ripgrep (skip hidden; honor `.gitignore`/`.ignore`/`.git/info/exclude` and the
/// global gitignore; read parent ignores; don't follow symlinks); the one thing the `rg` binary adds
/// on top of the crate is the `.rgignore` custom ignore name. Both the index walk and the `full_scan`
/// fallback go through here so they can't drift from `rg` — or from each other. See
/// `docs/indexing.md` (What the walk includes).
pub fn walk_builder(root: &Path) -> WalkBuilder {
    let mut b = WalkBuilder::new(root);
    b.add_custom_ignore_filename(".rgignore");
    b
}

/// Collect the files ripgrep would search under `root`, sorted so file IDs are deterministic.
pub fn walk_files(root: &Path) -> Vec<PathBuf> {
    let found = Mutex::new(Vec::<PathBuf>::new());
    walk_builder(root).build_parallel().run(|| {
        let found = &found;
        Box::new(move |res| {
            if let Ok(entry) = res
                && entry.file_type().is_some_and(|t| t.is_file())
            {
                found.lock().unwrap().push(entry.into_path());
            }
            WalkState::Continue
        })
    });
    let mut paths = found.into_inner().unwrap();
    paths.sort();
    paths
}

/// Read each file once: collect (size, mtime) per file (ordered) and accumulate distinct trigrams
/// into sharded posting lists in parallel. `progress` is bumped once per file (for watchers).
fn index_files(
    paths: &[PathBuf],
    progress: &AtomicUsize,
) -> (Vec<(u64, u64)>, FxHashMap<u32, RoaringBitmap>) {
    const SHARDS: usize = 256;
    const TRIGRAM_WORDS: usize = (1 << 24) / 64; // one bit per possible 24-bit trigram
    // Shared posting maps, sharded so workers rarely contend a lock. This keeps peak memory near the
    // final index size (~one compressed copy), unlike per-worker maps which would multiply it by the
    // worker count.
    let shards: Vec<Mutex<FxHashMap<u32, RoaringBitmap>>> = (0..SHARDS)
        .map(|_| Mutex::new(FxHashMap::default()))
        .collect();

    // Per-worker scratch reused across files: a sparse bitset that dedups a file's trigrams by
    // bit-test (24-bit keys are dense, so this beats hashing), the distinct keys, and the per-shard
    // grouping buffers. The bitset is cleared via the distinct list, so clearing is O(set bits).
    let init = || {
        (
            vec![0u64; TRIGRAM_WORDS],
            Vec::<u32>::new(),
            vec![Vec::<u32>::new(); SHARDS],
        )
    };
    let metas: Vec<(u64, u64)> = paths
        .par_iter()
        .enumerate()
        .map_init(init, |(bits, distinct, by_shard), (id, path)| {
            progress.fetch_add(1, Ordering::Relaxed);
            let id = id as u32;
            let (size, mtime_ns) = stat(path);
            if let Ok(bytes) = std::fs::read(path)
                && !is_binary_from_start(&bytes)
            {
                distinct.clear();
                trigram::for_each(&bytes, |t| {
                    let key = trigram::pack(t);
                    let (w, b) = ((key >> 6) as usize, key & 63);
                    if bits[w] & (1u64 << b) == 0 {
                        bits[w] |= 1u64 << b;
                        distinct.push(key);
                    }
                });
                by_shard.iter_mut().for_each(Vec::clear);
                for &key in distinct.iter() {
                    by_shard[(key as usize) & (SHARDS - 1)].push(key);
                }
                for (s, keys) in by_shard.iter().enumerate() {
                    if keys.is_empty() {
                        continue;
                    }
                    let mut g = shards[s].lock().unwrap();
                    for &key in keys {
                        g.entry(key).or_default().insert(id);
                    }
                }
                for &key in distinct.iter() {
                    bits[(key >> 6) as usize] &= !(1u64 << (key & 63));
                }
            }
            (size, mtime_ns)
        })
        .collect();

    let mut merged = FxHashMap::default();
    for shard in shards {
        merged.extend(shard.into_inner().unwrap());
    }
    (metas, merged)
}

fn stat(path: &Path) -> (u64, u64) {
    match std::fs::metadata(path) {
        Ok(m) => {
            let mtime = m
                .modified()
                .ok()
                .and_then(|t| t.duration_since(UNIX_EPOCH).ok())
                .map(|d| d.as_nanos() as u64)
                .unwrap_or(0);
            (m.len(), mtime)
        }
        Err(_) => (0, 0),
    }
}

/// True if a NUL appears within the conservative sniff window — ripgrep suppresses such files
/// entirely in a recursive search, so giving them no postings is sound (see [`BINARY_SNIFF_BYTES`]).
fn is_binary_from_start(bytes: &[u8]) -> bool {
    memchr::memchr(0, &bytes[..bytes.len().min(BINARY_SNIFF_BYTES)]).is_some()
}

/// Fill `seen` with the distinct trigrams of `bytes`, returning whether the file should be indexed.
/// Binary-from-start files get no postings (the one place this decision is made, shared by the cold
/// build and incremental updates so they can't drift).
fn collect_trigrams(bytes: &[u8], seen: &mut FxHashSet<Trigram>) -> bool {
    if is_binary_from_start(bytes) {
        return false;
    }
    seen.clear();
    trigram::for_each(bytes, |t| {
        seen.insert(t);
    });
    true
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::query::Options;

    fn write(dir: &Path, name: &str, content: &[u8]) {
        std::fs::write(dir.join(name), content).unwrap();
    }

    fn names(c: Vec<&Path>) -> Vec<String> {
        let mut v: Vec<String> = c
            .iter()
            .map(|p| p.file_name().unwrap().to_string_lossy().into_owned())
            .collect();
        v.sort();
        v
    }

    #[test]
    fn build_candidates_and_binary_skip() {
        let tmp = std::env::temp_dir().join(format!("rgx_idx_{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&tmp);
        std::fs::create_dir_all(&tmp).unwrap();
        write(&tmp, "a.txt", b"the quick brown fox\nNEEDLE here\n");
        write(&tmp, "b.txt", b"no match in this file at all\n");
        write(&tmp, "c.txt", b"another NEEDLE appears\n");
        write(&tmp, "bin.dat", b"\x00\x00NEEDLE inside binary\x00");

        let idx = Index::build(&tmp);
        assert_eq!(idx.file_count(), 4);
        let q = Query::for_pattern("NEEDLE", Options::default());
        let n = names(idx.candidates(&q));
        assert!(n.contains(&"a.txt".to_string()) && n.contains(&"c.txt".to_string()));
        assert!(!n.contains(&"b.txt".to_string()) && !n.contains(&"bin.dat".to_string()));

        // A fallback query (no usable trigram) makes every live file a candidate, incl. bin.dat.
        let fb = names(idx.candidates(&Query::for_pattern("a.", Options::default())));
        assert_eq!(fb, vec!["a.txt", "b.txt", "bin.dat", "c.txt"]);
        let _ = std::fs::remove_dir_all(&tmp);
    }

    #[test]
    fn incremental_add_change_remove() {
        let tmp = std::env::temp_dir().join(format!("rgx_inc_{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&tmp);
        std::fs::create_dir_all(&tmp).unwrap();
        write(&tmp, "a.txt", b"WIDGET alpha\n");
        let mut idx = Index::build(&tmp);
        let q = Query::for_pattern("WIDGET", Options::default());
        assert_eq!(names(idx.candidates(&q)), vec!["a.txt"]);

        // Add a new file containing WIDGET.
        write(&tmp, "b.txt", b"WIDGET beta\n");
        idx.apply_changes(&[tmp.join("b.txt")], &[]);
        let mut got = names(idx.candidates(&q));
        got.sort();
        assert_eq!(got, vec!["a.txt", "b.txt"]);

        // Remove a.txt -> tombstoned, no longer a candidate.
        std::fs::remove_file(tmp.join("a.txt")).unwrap();
        idx.apply_changes(&[], &[tmp.join("a.txt")]);
        assert_eq!(names(idx.candidates(&q)), vec!["b.txt"]);
        let _ = std::fs::remove_dir_all(&tmp);
    }

    #[test]
    fn find_returns_total_and_keyset_page() {
        let tmp = std::env::temp_dir().join(format!("rgx_find_{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&tmp);
        std::fs::create_dir_all(&tmp).unwrap();
        for n in ["conf_a.txt", "conf_b.txt", "conf_c.txt", "other.txt"] {
            write(&tmp, n, b"x\n");
        }
        let idx = Index::build(&tmp);

        let (page, total, start) = idx.find("conf", None, 2);
        assert_eq!(total, 3);
        assert_eq!(start, 0);
        assert_eq!(names(page.clone()), vec!["conf_a.txt", "conf_b.txt"]);

        // Resume after the last path of the first page; total stays the full match count.
        let after = page.last().unwrap().to_string_lossy().into_owned();
        let (page2, total2, start2) = idx.find("conf", Some(&after), 2);
        assert_eq!(total2, 3);
        assert_eq!(start2, 2);
        assert_eq!(names(page2), vec!["conf_c.txt"]);
        let _ = std::fs::remove_dir_all(&tmp);
    }

    #[test]
    fn snapshot_roundtrip() {
        let tmp = std::env::temp_dir().join(format!("rgx_snap_{}", std::process::id()));
        let _ = std::fs::remove_dir_all(&tmp);
        std::fs::create_dir_all(&tmp).unwrap();
        write(&tmp, "a.txt", b"SNAPSHOT token here\n");
        write(&tmp, "b.txt", b"other content\n");
        let idx = Index::build(&tmp);
        let snap = tmp.join("index.bin");
        idx.save(&snap).unwrap();
        let loaded = Index::load(&snap).unwrap();
        assert_eq!(loaded.file_count(), idx.file_count());
        let q = Query::for_pattern("SNAPSHOT", Options::default());
        assert_eq!(names(loaded.candidates(&q)), vec!["a.txt"]);
        let _ = std::fs::remove_dir_all(&tmp);
    }
}