keyhog-core 0.5.40

keyhog-core: shared data model and detector specifications for the KeyHog secret scanner
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
//! Incremental scan support via a persisted file-content index.
//!
//! ## What it does
//!
//! On a fresh scan we compute, for every input chunk, a metadata tuple
//! `(mtime_ns, size, BLAKE3(content))` and store it under the file's
//! canonical path. On the next run, files whose `(mtime, size)` match
//! the stored values can be skipped *without re-reading the bytes* -
//! they almost certainly haven't changed (rsync-style trust). When
//! `(mtime, size)` differ but BLAKE3 matches we record the new mtime
//! and still skip - same content, different stat (touched, copied).
//!
//! Tier-B moat innovation #3 from audits/legendary-2026-04-26: "10–100×
//! speedup on CI re-runs" by skipping the 99% of files that didn't change.
//!
//! ## Schema versions
//!
//! - **v1 (legacy)** - `path → BLAKE3 hex` only. Never loaded: it lacks the
//!   metadata short-circuit, so it is treated as cold-start to avoid mixing schemas.
//! - **v2 (current)** - `path → (mtime_ns, size, BLAKE3 hex)` plus a
//!   top-level `spec_hash` derived from the loaded detector set. A
//!   spec-hash mismatch invalidates the entire cache; this is the
//!   correctness fix for "added a detector but unchanged files were
//!   silently skipped, missing the new detection forever."
//!
//! ## Serialization
//!
//! JSON, on purpose. The dataset is one row per scanned file (≤ ~1M for
//! any sane repo) and JSON keeps the on-disk format trivial to debug,
//! diff, and version-control if a team wants to.
//!
//! ## Threat model
//!
//! Cached entries do NOT contain credentials. Storing a `(mtime, size,
//! content_hash)` tuple per scanned path leaks that the path *exists*
//! and what its content fingerprint is, which is why `--lockdown`
//! refuses to load or write the cache at all.

use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};

use parking_lot::RwLock;
use serde::{Deserialize, Serialize};

use crate::merkle_spec_hash::{hex_encode, hex_to_array};

pub use crate::merkle_spec_hash::compute_spec_hash;

/// On-disk per-entry record (v2). The `mtime_ns` + `size` pair is the
/// fast-path key: a successful match short-circuits the BLAKE3 read
/// entirely. `hash` remains as a paranoid-mode verifier and as the
/// authoritative content fingerprint when mtime alone changed.
///
/// This is the serialized twin of [`CacheEntry`]: the same three fields,
/// except that `hash` is a hex string here and a `[u8; 32]` in memory.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryV2 {
    /// `mtime` in nanoseconds since UNIX epoch. Stored as `u64` so we
    /// don't lose ext4/NTFS sub-second precision; older filesystems
    /// (FAT32 with 2-second resolution) just round-trip the rounded value.
    mtime_ns: u64,
    /// File size in bytes from `fs::metadata`.
    size: u64,
    /// BLAKE3 hex digest of the chunk content. An entry whose digest is
    /// rejected by `hex_to_array` is skipped when the index is loaded.
    hash: String,
}

/// Top-level on-disk schema: the JSON document written by
/// `MerkleIndex::save*` and parsed by `MerkleIndex::load*`.
#[derive(Debug, Serialize, Deserialize)]
struct OnDisk {
    /// Schema version. Bumped on incompatible changes. A file whose
    /// version differs from [`SCHEMA_VERSION`] is treated as a cold start.
    version: u32,
    /// Hex BLAKE3 of the canonical detector-spec digest. Optional: it is
    /// omitted from the JSON when the index is saved without a spec hash.
    /// The ungated `load` path ignores this field entirely; the gated
    /// `load_with_spec` path treats a missing or undecodable value as a
    /// mismatch and cold-starts.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    spec_hash: Option<String>,
    /// `path → entry`. Stored as hex strings (not raw bytes) so a human
    /// can `git diff` the file and see which entries changed.
    entries: HashMap<String, EntryV2>,
}

/// Version number written into every saved index and required on load;
/// any other value on disk makes the loader fall back to an empty index.
const SCHEMA_VERSION: u32 = 2;

/// Shard count: spreads concurrent `record` / `unchanged` calls across
/// independent locks so tiny-file storms don't serialize all rayon workers.
/// A path's shard is its hash modulo this value (see `shard_index`).
const MERKLE_SHARDS: usize = 64;

/// Default upper bound on the number of in-memory cache entries.
///
/// Resident cost per entry is `48 bytes` for the [`CacheEntry`]
/// (`mtime_ns: u64` + `size: u64` + `hash: [u8; 32]`, i.e. 8 + 8 + 32) plus
/// the heap-allocated [`PathBuf`] key (one allocation, length of the
/// canonical path). On a typical repo a path averages ~80-120 bytes, so
/// budget ~150 bytes/entry end-to-end. At the default cap of 8M entries
/// that bounds the index at roughly 1.2 GB resident - large, but bounded,
/// and survivable on the fleet's 32-128 GB boxes. A giant monorepo can
/// raise or lower this via [`MerkleIndex::with_max_entries`] (Tier-A
/// configurability: compiled default, overridable by the caller).
///
/// When the cap is hit we WARN and stop *adding new paths*; updates to
/// paths already in the index are always allowed so an over-cap scan
/// never corrupts an existing entry. An uncached file is simply re-read
/// and re-scanned next run - slower, never unsound. This preserves the
/// module's core guarantee (a file that ever produced a finding is
/// removed via [`MerkleIndex::forget`], never cached) regardless of the cap.
const MERKLE_DEFAULT_MAX_ENTRIES: usize = 8_000_000;

/// Map `path` to the shard that owns it: hash the path with the standard
/// library's default hasher and reduce the digest modulo [`MERKLE_SHARDS`].
/// The result is only used to pick an in-memory lock; it is never persisted.
fn shard_index(path: &Path) -> usize {
    let mut hasher = DefaultHasher::new();
    Hash::hash(path, &mut hasher);
    let digest = hasher.finish() as usize;
    digest % MERKLE_SHARDS
}

/// In-memory per-entry record. Mirrors [`EntryV2`] but holds the hash as
/// a fixed-size array - saves the per-lookup hex-decode cost on the
/// `unchanged` hot path. `Copy`, so lookups hand out values rather than
/// references tied to a shard lock.
#[derive(Debug, Clone, Copy)]
struct CacheEntry {
    /// Modification time in nanoseconds since the UNIX epoch (`0` when the
    /// entry was recorded through the metadata-less `record` shim).
    mtime_ns: u64,
    /// File size in bytes (`0` when recorded through `record`).
    size: u64,
    /// Raw 32-byte BLAKE3 digest of the content.
    hash: [u8; 32],
}

/// In-memory file-hash index loaded from / saved to a JSON cache file.
///
/// Concurrency model: the orchestrator holds an `Arc<MerkleIndex>` and
/// records new entries as chunks arrive from rayon-parallel sources.
/// Paths are sharded across [`MERKLE_SHARDS`] `RwLock`-protected maps so
/// concurrent updates rarely contend and lookups on the same shard can
/// proceed in parallel.
#[derive(Debug)]
pub struct MerkleIndex {
    /// One map per shard; a path always lives in the shard chosen by
    /// `shard_index`, so a single lock covers each lookup or update.
    shards: Vec<RwLock<HashMap<PathBuf, CacheEntry>>>,
    /// Upper bound on the number of retained entries across all shards.
    /// Defaults to [`MERKLE_DEFAULT_MAX_ENTRIES`]. Once reached, only
    /// updates to existing paths are accepted; new paths are dropped
    /// (with a one-shot WARN) so a giant monorepo can't silently grow
    /// the index without bound.
    max_entries: usize,
    /// Set once the cap is first hit so we WARN at most once per index
    /// rather than once per dropped entry (which would be a log storm
    /// on a multi-million-file overflow).
    cap_warned: std::sync::atomic::AtomicBool,
    /// Approximate live entry count, maintained on the insert hot path so
    /// the cap check is O(1) instead of summing all 64 shard lengths per
    /// insert (that scan would dominate a multi-million-file scan). It is
    /// incremented only on a NEW-path insert and never decremented (the
    /// `forget` path is for found-secret invalidation, not bulk eviction),
    /// so it is a monotonic upper bound on live entries - exactly the
    /// conservative side for a "stop growing" budget. Exact counts use
    /// [`Self::len`].
    approx_count: std::sync::atomic::AtomicUsize,
}

impl MerkleIndex {
    /// Build an index that holds no entries and uses the compiled-in entry
    /// cap, [`MERKLE_DEFAULT_MAX_ENTRIES`].
    pub fn empty() -> Self {
        let default_cap = MERKLE_DEFAULT_MAX_ENTRIES;
        Self::with_max_entries(default_cap)
    }

    /// Construct a fresh, empty [`MerkleIndex`] with an explicit entry cap.
    /// A cap of `0` is treated as "unbounded" for callers that genuinely
    /// want the old behavior, but the documented resident cost still
    /// applies (~150 bytes/entry).
    pub fn with_max_entries(max_entries: usize) -> Self {
        Self {
            shards: (0..MERKLE_SHARDS)
                .map(|_| RwLock::new(HashMap::new()))
                .collect(),
            max_entries,
            cap_warned: std::sync::atomic::AtomicBool::new(false),
            approx_count: std::sync::atomic::AtomicUsize::new(0),
        }
    }

    /// The configured maximum number of retained entries (`0` = unbounded).
    pub fn max_entries(&self) -> usize {
        self.max_entries
    }

    /// Read the index stored at `path`, ignoring any detector-spec hash it
    /// carries.
    ///
    /// Never fails: a missing file (first run), an unreadable file, a file
    /// that does not parse, or a file with a different schema version
    /// (including legacy v1 caches) all yield an empty index, i.e. a cold
    /// start. Stale temporary files next to `path` are swept first.
    pub fn load(path: &Path) -> Self {
        sweep_stale_tmp_files(path);
        let no_spec_gate: Option<&[u8; 32]> = None;
        Self::load_with_spec_inner(path, no_spec_gate)
    }

    /// Read the index stored at `path`, but only trust it when it was
    /// saved under the same detector spec.
    ///
    /// If the stored `spec_hash` is absent or differs from
    /// `expected_spec_hash` the result is an empty index. That gate is what
    /// stops "a detector was added, the file did not change, so the new
    /// detector never sees it". Stale temporary files next to `path` are
    /// swept first.
    pub fn load_with_spec(path: &Path, expected_spec_hash: &[u8; 32]) -> Self {
        sweep_stale_tmp_files(path);
        let gate = Some(expected_spec_hash);
        Self::load_with_spec_inner(path, gate)
    }

    fn load_with_spec_inner(path: &Path, expected_spec_hash: Option<&[u8; 32]>) -> Self {
        let bytes = match std::fs::read(path) {
            Ok(b) => b,
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Self::empty(),
            Err(e) => {
                tracing::warn!(
                    cache = %path.display(),
                    error = %e,
                    "merkle index file read failed; treating as cold start"
                );
                return Self::empty();
            }
        };
        let on_disk: OnDisk = match serde_json::from_slice(&bytes) {
            Ok(d) => d,
            Err(error) => {
                tracing::warn!(
                    cache = %path.display(),
                    %error,
                    "merkle index parse failed; treating as cold start"
                );
                return Self::empty();
            }
        };
        if on_disk.version != SCHEMA_VERSION {
            tracing::warn!(
                cache = %path.display(),
                version = on_disk.version,
                expected = SCHEMA_VERSION,
                "merkle index schema mismatch; treating as cold start"
            );
            return Self::empty();
        }
        if let Some(expected) = expected_spec_hash {
            let stored_match = on_disk
                .spec_hash
                .as_deref()
                .and_then(hex_to_array)
                .is_some_and(|stored| &stored == expected);
            if !stored_match {
                tracing::info!(
                    cache = %path.display(),
                    "detector spec changed since last scan; cache invalidated"
                );
                return Self::empty();
            }
        }
        let entries: HashMap<PathBuf, CacheEntry> = on_disk
            .entries
            .into_iter()
            .filter_map(|(p, e)| {
                hex_to_array(&e.hash).map(|hash| {
                    (
                        PathBuf::from(p),
                        CacheEntry {
                            mtime_ns: e.mtime_ns,
                            size: e.size,
                            hash,
                        },
                    )
                })
            })
            .collect();
        tracing::info!(
            cache = %path.display(),
            count = entries.len(),
            "merkle index loaded"
        );
        let idx = Self::empty();
        // Respect the entry cap even on load: a previously-saved index
        // larger than the current cap (e.g. cap was lowered, or the file
        // was produced by an unbounded build) must not blow the working
        // set on read. Insert up to the cap, then warn-and-stop. The
        // dropped tail of paths is simply re-scanned next run.
        for (p, e) in entries {
            if !idx.try_insert(p, e) {
                break;
            }
        }
        idx
    }

    /// Write the index to `path` with no detector-spec hash attached.
    ///
    /// A file written this way carries no `spec_hash`, so only the ungated
    /// [`Self::load`] will accept it. Prefer [`Self::save_with_spec`],
    /// which lets a later load detect detector drift.
    ///
    /// # Errors
    ///
    /// Returns any I/O or encoding error raised while writing the file.
    pub fn save(&self, path: &Path) -> std::io::Result<()> {
        let unbound: Option<&[u8; 32]> = None;
        self.save_inner(path, unbound)
    }

    /// Write the index to `path`, stamping it with `spec_hash` so that a
    /// later [`Self::load_with_spec`] can tell whether the detector set has
    /// changed and discard the cache if so.
    ///
    /// # Errors
    ///
    /// Returns any I/O or encoding error raised while writing the file.
    pub fn save_with_spec(&self, path: &Path, spec_hash: &[u8; 32]) -> std::io::Result<()> {
        let bound = Some(spec_hash);
        self.save_inner(path, bound)
    }

    /// Shared implementation behind [`Self::save`] and
    /// [`Self::save_with_spec`]: merge the in-memory entries with whatever
    /// is currently on disk, enforce the entry cap, and atomically replace
    /// the file at `path`.
    ///
    /// Merge rules:
    /// - an in-memory entry always wins over an on-disk entry for the same
    ///   path (it was observed during this scan);
    /// - on-disk entries for paths this process did not touch are carried
    ///   over, so a concurrent scan's results are not wiped;
    /// - when the union exceeds [`Self::max_entries`], in-memory entries
    ///   are kept first and on-disk entries only fill the remaining budget.
    ///
    /// # Errors
    ///
    /// Returns an error if the JSON encoding fails, the parent directory
    /// cannot be created, or the temporary file cannot be created, written,
    /// synced, or renamed into place.
    fn save_inner(&self, path: &Path, spec_hash: Option<&[u8; 32]>) -> std::io::Result<()> {
        // Concurrency note: two `keyhog scan --incremental` processes
        // running against overlapping paths will both want to write the
        // same index file. Each writes its own uniquely-named temp file, so
        // there is no tmp-name collision, but the final rename is
        // last-writer-wins.
        //
        // To minimise data loss on concurrent saves, READ the current
        // on-disk entries and merge them underneath our in-memory state.
        // This narrows the data-loss window from "entire scan" to "between
        // read-and-rename".
        //
        // A truly race-free solution needs an OS-level file lock
        // (`fcntl(F_SETLK)` / `LockFileEx`); that would block the second
        // writer entirely. We accept the small remaining race as a
        // correctness/perf trade - losing a few entries means an extra
        // rescan, not a missed leak.
        //
        // The on-disk set is read through the SAME spec gate we are about
        // to write: if disk was written under a different spec those
        // entries are stale (a future load would invalidate them) and the
        // gated loader hands back an empty index. Format-mismatch and
        // corrupted-file cases are logged inside the loader and also yield
        // an empty index, so a bad on-disk state never blocks a fresh save.
        let on_disk_now = match spec_hash {
            Some(hash) => Self::load_with_spec(path, hash),
            None => Self::load(path),
        };

        // `0` means unbounded, so the budget is never "full" in that case.
        let cap = self.max_entries;
        let budget_full = |len: usize| cap != 0 && len >= cap;

        // Fill in priority order instead of building the union and then
        // trimming it: the result is the same (freshest entries first,
        // carried-over entries in the remaining budget) but each path is
        // stored once rather than in three temporary collections.
        let mut merged = HashMap::<PathBuf, CacheEntry>::new();

        // 1. Paths observed THIS scan. A path lives in exactly one shard,
        //    so every insert here is a new key.
        'memory: for shard in &self.shards {
            for (p, e) in shard.read().iter() {
                if budget_full(merged.len()) {
                    break 'memory;
                }
                merged.insert(p.clone(), *e);
            }
        }

        // 2. Carried-over on-disk paths. `or_insert` keeps the in-memory
        //    entry when both sides know the path. The freshly loaded index
        //    is consumed, so its paths are moved rather than cloned.
        'disk: for shard in on_disk_now.shards {
            for (p, e) in shard.into_inner() {
                if budget_full(merged.len()) {
                    break 'disk;
                }
                merged.entry(p).or_insert(e);
            }
        }

        // NOTE: paths are written via `Display`, which is lossy for
        // non-UTF-8 paths; such an entry will not match its path again
        // after a reload and the file is simply re-scanned.
        let entries: HashMap<String, EntryV2> = merged
            .iter()
            .map(|(p, e)| {
                (
                    p.display().to_string(),
                    EntryV2 {
                        mtime_ns: e.mtime_ns,
                        size: e.size,
                        hash: hex_encode(&e.hash),
                    },
                )
            })
            .collect();
        let on_disk = OnDisk {
            version: SCHEMA_VERSION,
            spec_hash: spec_hash.map(hex_encode),
            entries,
        };
        let serialized = serde_json::to_vec_pretty(&on_disk)
            .map_err(|e| std::io::Error::other(format!("merkle index encode: {e}")))?;
        let parent = path.parent().unwrap_or_else(|| std::path::Path::new("."));
        std::fs::create_dir_all(parent)?;
        // `NamedTempFile::new_in` creates a randomly-named file in the same
        // directory as the final target, then `persist` renames it over the
        // target. If we panic between create and persist, NamedTempFile's
        // Drop deletes the tmp file. A SIGTERM/SIGKILL still leaks it (Drop
        // doesn't run); those leftovers are what the stale-tmp sweep run by
        // `load` / `load_with_spec` cleans up.
        let mut tmp = tempfile::NamedTempFile::new_in(parent)?;
        std::io::Write::write_all(&mut tmp, &serialized)?;
        // Flush file contents to disk before the rename makes them visible.
        tmp.as_file().sync_all()?;
        tmp.persist(path).map_err(|e| e.error)?;
        Ok(())
    }

    /// Compute the BLAKE3 digest of `content` and return it as a raw
    /// 32-byte array.
    pub fn hash_content(content: &[u8]) -> [u8; 32] {
        let digest = blake3::hash(content);
        *digest.as_bytes()
    }

    /// Reports whether `path` is in the index with exactly this
    /// `content_hash`. Unknown paths yield `false`.
    ///
    /// Intended for callers that already hold the hash (for example the
    /// orchestrator's chunk-level skip path).
    pub fn unchanged(&self, path: &Path, content_hash: &[u8; 32]) -> bool {
        let shard = self.shards[shard_index(path)].read();
        let same = match shard.get(path) {
            Some(prev) => prev.hash == *content_hash,
            None => false,
        };
        same
    }

    /// Reports whether the stored entry for `path` has exactly this
    /// `mtime_ns` and `size`.
    ///
    /// This is the **fast-path skip**: a `true` result lets the caller
    /// avoid reading the file at all. `false` means either the path is
    /// unknown or its metadata differs, and the caller must read and hash
    /// the content to decide.
    pub fn metadata_unchanged(&self, path: &Path, mtime_ns: u64, size: u64) -> bool {
        let shard = self.shards[shard_index(path)].read();
        let same = match shard.get(path) {
            Some(prev) => (prev.mtime_ns, prev.size) == (mtime_ns, size),
            None => false,
        };
        same
    }

    /// Returns the stored `(mtime_ns, size, content_hash)` for `path`,
    /// or `None` if the index hasn't seen it. Used by paranoid-mode
    /// verifiers that want to confirm content didn't change even when
    /// metadata happens to match.
    pub fn lookup(&self, path: &Path) -> Option<(u64, u64, [u8; 32])> {
        let i = shard_index(path);
        self.shards[i]
            .read()
            .get(path)
            .map(|e| (e.mtime_ns, e.size, e.hash))
    }

    /// Record only the content hash for `path`.
    ///
    /// Back-compat shim over [`Self::record_with_metadata`]: the entry is
    /// stored with `mtime_ns = 0` and `size = 0`, so existing callers keep
    /// working but do not benefit from the metadata fast-path.
    pub fn record(&self, path: PathBuf, content_hash: [u8; 32]) {
        let (no_mtime, no_size) = (0u64, 0u64);
        self.record_with_metadata(path, no_mtime, no_size, content_hash);
    }

    /// Store `(mtime_ns, size, content_hash)` for `path`, replacing any
    /// entry already recorded for that path.
    ///
    /// Only the shard that owns `path` is locked, and only for the insert
    /// itself, so recordings that land on different shards never contend.
    /// A path that is new to the index is silently not stored once the
    /// entry cap has been reached (see [`Self::max_entries`]).
    pub fn record_with_metadata(
        &self,
        path: PathBuf,
        mtime_ns: u64,
        size: u64,
        content_hash: [u8; 32],
    ) {
        let entry = CacheEntry {
            mtime_ns,
            size,
            hash: content_hash,
        };
        // Whether the entry was accepted is deliberately not surfaced.
        let _ = self.try_insert(path, entry);
    }

    /// Insert or update one entry, honoring [`Self::max_entries`].
    ///
    /// Returns `true` if the entry is now present (inserted or updated),
    /// `false` if it was a NEW path dropped because the cap is reached.
    /// Updates to an already-present path always succeed (they don't grow
    /// the working set) so an over-cap scan never corrupts existing state.
    /// The first drop emits a single WARN; subsequent drops are silent to
    /// avoid a log storm on a multi-million-file overflow.
    ///
    /// The owning shard's write lock is taken exactly once and the key is
    /// hashed once: the cap check is a single atomic load, so there is no
    /// need to release and re-acquire the lock around it.
    fn try_insert(&self, path: PathBuf, entry: CacheEntry) -> bool {
        use std::collections::hash_map::Entry;
        use std::sync::atomic::Ordering;

        let mut shard = self.shards[shard_index(&path)].write();
        let stored = match shard.entry(path) {
            // Replacement, not growth - always allowed, no cap check.
            Entry::Occupied(mut slot) => {
                slot.insert(entry);
                true
            }
            Entry::Vacant(slot) => {
                // `max_entries == 0` means unbounded (opt-in legacy
                // behavior). The cap is a soft budget checked against
                // `approx_count` (O(1), no shard scan). New-path inserts
                // racing on OTHER shards can overshoot by at most the
                // number of in-flight calls - bounded and harmless.
                let over_cap = self.max_entries != 0
                    && self.approx_count.load(Ordering::Relaxed) >= self.max_entries;
                if over_cap {
                    false
                } else {
                    slot.insert(entry);
                    // Counted only when a key was actually created, so the
                    // counter never drifts above true growth.
                    self.approx_count.fetch_add(1, Ordering::Relaxed);
                    true
                }
            }
        };
        // Release the shard before logging so a slow subscriber cannot
        // stall other writers on this shard.
        drop(shard);

        if !stored && !self.cap_warned.swap(true, Ordering::Relaxed) {
            tracing::warn!(
                cap = self.max_entries,
                "merkle index entry cap reached; new paths will not be \
                 cached this run (they are re-scanned next run). Raise \
                 the cap for very large trees if the rescan cost matters."
            );
        }
        stored
    }

    /// Drop `path` from the index, so the next scan sees it as a new file
    /// and re-reads + re-scans it. Forgetting an unknown path is a no-op.
    ///
    /// This is what upholds incremental mode's core safety guarantee: a
    /// file that produced ANY finding is never cached, so a secret in an
    /// otherwise unchanged file keeps surfacing on every later run instead
    /// of being silently skipped. Clean files - the vast majority - stay
    /// cached, so the speedup is unaffected; and because what is stored is
    /// the ABSENCE of an entry rather than the finding, no secret value
    /// ever reaches the on-disk index.
    pub fn forget(&self, path: &Path) {
        let owner = &self.shards[shard_index(path)];
        let _ = owner.write().remove(path);
    }

    /// Exact number of entries currently held, summed over every shard
    /// (each shard is read-locked in turn).
    pub fn len(&self) -> usize {
        let mut total = 0usize;
        for shard in &self.shards {
            total += shard.read().len();
        }
        total
    }

    /// `true` when every shard is empty, i.e. the index holds no entries.
    /// Stops at the first non-empty shard.
    pub fn is_empty(&self) -> bool {
        let has_any = self.shards.iter().any(|shard| !shard.read().is_empty());
        !has_any
    }
}

impl Default for MerkleIndex {
    fn default() -> Self {
        Self::empty()
    }
}

/// Default index location: `keyhog/merkle.idx` under the platform cache
/// directory reported by `dirs::cache_dir()` - e.g.
/// `$XDG_CACHE_HOME/keyhog/merkle.idx` or `~/.cache/keyhog/merkle.idx` on
/// Linux, `~/Library/Caches/keyhog/...` on macOS. Returns `None` when no
/// cache directory can be determined.
pub fn default_cache_path() -> Option<PathBuf> {
    let mut location = dirs::cache_dir()?;
    location.push("keyhog");
    location.push("merkle.idx");
    Some(location)
}

/// Stale-tmp-file age cutoff, in seconds, compared against the age of a
/// temp file's modification time. `tempfile::NamedTempFile`'s Drop impl
/// cleans up on panic but NOT when the process is killed by a signal
/// (SIGKILL, or SIGTERM without a handler) - those leak a random-named tmp
/// file in the cache directory. Older than this cutoff means "no chance an
/// in-flight save by another keyhog process is still using it." 1 hour is
/// generous; the longest merkle save in observed runs is < 1 second on a
/// fully-loaded 100k-file scan.
const STALE_TMP_CUTOFF_SECS: u64 = 60 * 60;

/// Best-effort sweep of stale tmp files left behind by killed keyhog
/// processes. Called from `load`/`load_with_spec` before reading the
/// cache so stale tmps don't accumulate forever next to the real index
/// file.
///
/// A directory entry next to `cache_path` is removed only when ALL of the
/// following hold:
/// - its name looks like one of our temp files: the legacy `<stem>.tmp*`
///   form, or `.tmp` followed by one or more ASCII alphanumerics (the
///   default `tempfile::NamedTempFile` naming);
/// - it is not the cache file itself;
/// - it is a regular file (symlinks and directories are left alone);
/// - its modification time is at least [`STALE_TMP_CUTOFF_SECS`] old.
///
/// Every failure (unreadable directory, unreadable metadata, failed
/// delete) is ignored: the sweep is hygiene, not correctness. A summary is
/// logged at debug level when anything was removed.
fn sweep_stale_tmp_files(cache_path: &Path) {
    let Some(parent) = cache_path.parent() else {
        return;
    };
    let Ok(entries) = std::fs::read_dir(parent) else {
        return;
    };
    let stem = cache_path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("merkle");
    // Built once; the loop below may visit many directory entries.
    let legacy_prefix = format!("{stem}.tmp");
    let cache_file_name = cache_path.file_name();
    let now = std::time::SystemTime::now();
    let mut swept = 0usize;
    for entry in entries.flatten() {
        let name = entry.file_name();
        // Never delete the live cache, even if its own name happens to
        // match the legacy pattern (e.g. a cache file called `foo.tmp`).
        if cache_file_name == Some(name.as_os_str()) {
            continue;
        }
        let Some(name_str) = name.to_str() else {
            continue;
        };
        // tempfile::NamedTempFile names are `.tmp` plus a short random
        // alphanumeric suffix. Requiring that shape keeps the sweep from
        // eating unrelated dotfiles such as `.tmp_notes` or `.tmp.bak`.
        let is_tempfile_name = name_str.strip_prefix(".tmp").is_some_and(|suffix| {
            !suffix.is_empty() && suffix.bytes().all(|b| b.is_ascii_alphanumeric())
        });
        let is_tmp_sibling = name_str.starts_with(&legacy_prefix) || is_tempfile_name;
        if !is_tmp_sibling {
            continue;
        }
        // `DirEntry::metadata` does not follow symlinks, so the age we
        // test belongs to the entry that would actually be removed.
        let Ok(meta) = entry.metadata() else { continue };
        if !meta.is_file() {
            continue;
        }
        let Ok(modified) = meta.modified() else {
            continue;
        };
        let age = match now.duration_since(modified) {
            Ok(d) => d,
            Err(_) => continue, // mtime in the future - skip rather than guess
        };
        if age.as_secs() < STALE_TMP_CUTOFF_SECS {
            continue;
        }
        if std::fs::remove_file(entry.path()).is_ok() {
            swept += 1;
        }
    }
    if swept > 0 {
        tracing::debug!(
            count = swept,
            dir = %parent.display(),
            "swept stale cache tmp files left by an interrupted save"
        );
    }
}