keyhog-core 0.5.37

keyhog-core: shared data model and detector specifications for the KeyHog secret scanner
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
//! Incremental scan support via a persisted file-content index.
//!
//! ## What it does
//!
//! On a fresh scan we compute, for every input chunk, a metadata tuple
//! `(mtime_ns, size, BLAKE3(content))` and store it under the file's
//! canonical path. On the next run, files whose `(mtime, size)` match
//! the stored values can be skipped *without re-reading the bytes* -
//! they almost certainly haven't changed (rsync-style trust). When
//! `(mtime, size)` differ but BLAKE3 matches we record the new mtime
//! and still skip - same content, different stat (touched, copied).
//!
//! Tier-B moat innovation #3 from audits/legendary-2026-04-26: "10–100×
//! speedup on CI re-runs" by skipping the 99% of files that didn't change.
//!
//! ## Schema versions
//!
//! - **v1 (legacy)** - `path → BLAKE3 hex` only. Lacks the metadata
//!   short-circuit, so it is never loaded: any file that is not v2 is
//!   treated as a cold start to avoid mixing schemas.
//! - **v2 (current)** - `path → (mtime_ns, size, BLAKE3 hex)` plus a
//!   top-level `spec_hash` derived from the loaded detector set. A
//!   spec-hash mismatch invalidates the entire cache; this is the
//!   correctness fix for "added a detector but unchanged files were
//!   silently skipped, missing the new detection forever."
//!
//! ## Serialization
//!
//! JSON, on purpose. The dataset is one row per scanned file (≤ ~1M for
//! any sane repo) and JSON keeps the on-disk format trivial to debug,
//! diff, and version-control if a team wants to.
//!
//! ## Threat model
//!
//! Cached entries do NOT contain credentials. Storing a `(mtime, size,
//! content_hash)` tuple per scanned path leaks that the path *exists*
//! and what its content fingerprint is, which is why `--lockdown`
//! refuses to load or write the cache at all.

use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};

use parking_lot::RwLock;
use serde::{Deserialize, Serialize};

use crate::merkle_spec_hash::{hex_encode, hex_to_array};

pub use crate::merkle_spec_hash::compute_spec_hash;

/// On-disk per-entry record (v2). The `mtime_ns` + `size` pair is the
/// fast-path key: a successful match short-circuits the BLAKE3 read
/// entirely. `hash` remains as a paranoid-mode verifier and as the
/// authoritative content fingerprint when mtime alone changed.
///
/// Field names are part of the JSON cache format (serde derives the
/// keys from them), so renaming a field is a schema change.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct EntryV2 {
    /// `mtime` in nanoseconds since UNIX epoch. Stored as `u64` so we
    /// don't lose ext4/NTFS sub-second precision; older filesystems
    /// (FAT32 with 2-second resolution) just round-trip the rounded value.
    mtime_ns: u64,
    /// File size in bytes from `fs::metadata`.
    size: u64,
    /// BLAKE3 hex digest of the chunk content. Decoded back into a
    /// 32-byte array on load; an entry whose digest does not decode is
    /// dropped rather than failing the whole load.
    hash: String,
}

/// Top-level on-disk schema: the whole cache file is one JSON object of
/// this shape.
#[derive(Debug, Serialize, Deserialize)]
struct OnDisk {
    /// Schema version. Bumped on incompatible changes.
    version: u32,
    /// Hex BLAKE3 of the canonical detector-spec digest. Optional for
    /// schemas written before spec hashing was added; loaders treating
    /// `None` as "trust the cache" stay back-compatible.
    ///
    /// Omitted from the JSON entirely when `None`. The spec-gated loader
    /// treats a missing or undecodable value as a mismatch (cold start);
    /// only the ungated loader ignores it.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    spec_hash: Option<String>,
    /// `path → entry`. Stored as hex strings (not raw bytes) so a human
    /// can `git diff` the file and see which entries changed.
    entries: HashMap<String, EntryV2>,
}

/// Schema version stamped into every saved cache file. A file carrying
/// any other version is discarded on load and treated as a cold start.
const SCHEMA_VERSION: u32 = 2;

/// Number of independently locked shards. Spreading `record` /
/// `unchanged` calls over this many locks keeps a storm of tiny files
/// from serializing every rayon worker behind a single lock.
const MERKLE_SHARDS: usize = 64;

/// Map `path` to the shard that owns it.
///
/// The path is fed through a freshly constructed [`DefaultHasher`] and
/// the digest is reduced modulo [`MERKLE_SHARDS`], so the result is
/// always in `0..MERKLE_SHARDS` and the same path always lands in the
/// same shard within a process.
fn shard_index(path: &Path) -> usize {
    let mut hasher = DefaultHasher::new();
    path.hash(&mut hasher);
    let digest = hasher.finish() as usize;
    digest % MERKLE_SHARDS
}

/// In-memory per-entry record. Mirrors [`EntryV2`] but holds the hash as
/// a fixed-size array - saves the per-lookup hex-decode cost on the
/// `unchanged` hot path.
///
/// `Copy`, so lookups and merges can hand entries around by value.
#[derive(Debug, Clone, Copy)]
struct CacheEntry {
    /// Modification time in nanoseconds since the UNIX epoch; `0` for
    /// entries recorded without metadata.
    mtime_ns: u64,
    /// File size in bytes; `0` for entries recorded without metadata.
    size: u64,
    /// Raw 32-byte content digest (the decoded form of `EntryV2::hash`).
    hash: [u8; 32],
}

/// In-memory file-hash index loaded from / saved to a JSON cache file.
///
/// Concurrency model: the orchestrator holds an `Arc<MerkleIndex>` and
/// records new entries as chunks arrive from rayon-parallel sources.
/// Paths are sharded across [`MERKLE_SHARDS`] `RwLock`-protected maps so
/// concurrent updates rarely contend, and lookups on the same shard can
/// share the read lock.
#[derive(Debug)]
pub struct MerkleIndex {
    /// One map per shard; a path always lives in the shard chosen by
    /// `shard_index`.
    shards: Vec<RwLock<HashMap<PathBuf, CacheEntry>>>,
}

impl MerkleIndex {
    /// Construct a fresh, empty [`MerkleIndex`]: [`MERKLE_SHARDS`] empty
    /// maps, each behind its own lock.
    pub fn empty() -> Self {
        let shards = std::iter::repeat_with(|| RwLock::new(HashMap::new()))
            .take(MERKLE_SHARDS)
            .collect();
        Self { shards }
    }

    /// Load the index from `path` without spec-hash gating. Returns an
    /// empty index when the file doesn't exist (first run) or fails to
    /// parse (treat as cold start - safer than poisoning the cache from
    /// a corrupted artifact). Caches written with any other schema
    /// version are rejected as cold-start as well.
    ///
    /// Side effect: stale tmp files next to `path` are swept first.
    pub fn load(path: &Path) -> Self {
        sweep_stale_tmp_files(path);
        Self::load_with_spec_inner(path, None)
    }

    /// Load the index, gated on a matching detector-spec hash. When the
    /// stored `spec_hash` is missing, undecodable, or differs from
    /// `expected_spec_hash`, the cache is treated as cold-start. This is
    /// the correctness gate that prevents "added a detector → unchanged
    /// file silently skipped → new detector never runs against it" from
    /// ever happening.
    ///
    /// Side effect: stale tmp files next to `path` are swept first.
    pub fn load_with_spec(path: &Path, expected_spec_hash: &[u8; 32]) -> Self {
        sweep_stale_tmp_files(path);
        Self::load_with_spec_inner(path, Some(expected_spec_hash))
    }

    /// Shared loader behind [`Self::load`] and [`Self::load_with_spec`].
    ///
    /// Never fails: every problem (unreadable file, bad JSON, wrong
    /// schema version, spec mismatch) degrades to an empty index. A
    /// missing file is silent; the other cases are logged.
    fn load_with_spec_inner(path: &Path, expected_spec_hash: Option<&[u8; 32]>) -> Self {
        let raw = match std::fs::read(path) {
            Ok(raw) => raw,
            Err(error) => {
                // A missing file is the ordinary first-run case; only
                // other I/O failures are worth a warning.
                if error.kind() != std::io::ErrorKind::NotFound {
                    tracing::warn!(
                        cache = %path.display(),
                        error = %error,
                        "merkle index file read failed; treating as cold start"
                    );
                }
                return Self::empty();
            }
        };

        let on_disk = match serde_json::from_slice::<OnDisk>(&raw) {
            Ok(parsed) => parsed,
            Err(error) => {
                tracing::warn!(
                    cache = %path.display(),
                    %error,
                    "merkle index parse failed; treating as cold start"
                );
                return Self::empty();
            }
        };

        if on_disk.version != SCHEMA_VERSION {
            tracing::warn!(
                cache = %path.display(),
                version = on_disk.version,
                expected = SCHEMA_VERSION,
                "merkle index schema mismatch; treating as cold start"
            );
            return Self::empty();
        }

        if let Some(expected) = expected_spec_hash {
            // A missing or undecodable stored hash counts as a mismatch.
            let stored: Option<[u8; 32]> = on_disk.spec_hash.as_deref().and_then(hex_to_array);
            if stored.as_ref() != Some(expected) {
                tracing::info!(
                    cache = %path.display(),
                    "detector spec changed since last scan; cache invalidated"
                );
                return Self::empty();
            }
        }

        // Entries whose digest does not decode are dropped individually
        // instead of failing the whole load.
        let index = Self::empty();
        let mut loaded = 0usize;
        for (key, entry) in on_disk.entries {
            let Some(hash) = hex_to_array(&entry.hash) else {
                continue;
            };
            let file = PathBuf::from(key);
            let slot = shard_index(&file);
            index.shards[slot].write().insert(
                file,
                CacheEntry {
                    mtime_ns: entry.mtime_ns,
                    size: entry.size,
                    hash,
                },
            );
            loaded += 1;
        }
        tracing::info!(
            cache = %path.display(),
            count = loaded,
            "merkle index loaded"
        );
        index
    }

    /// Persist the index without binding it to a detector-spec hash. Old
    /// callers stay on this path; the next-cycle load won't enforce a
    /// spec match. Use [`Self::save_with_spec`] for the safe modern path.
    ///
    /// # Errors
    ///
    /// Returns any I/O error from creating the parent directory, writing
    /// or syncing the tmp file, or renaming it into place.
    pub fn save(&self, path: &Path) -> std::io::Result<()> {
        self.save_inner(path, None)
    }

    /// Persist the index *with* the given detector-spec hash so a future
    /// load can detect detector drift and invalidate cleanly.
    ///
    /// # Errors
    ///
    /// Same as [`Self::save`].
    pub fn save_with_spec(&self, path: &Path, spec_hash: &[u8; 32]) -> std::io::Result<()> {
        self.save_inner(path, Some(spec_hash))
    }

    /// Merge the in-memory index with whatever is currently on disk and
    /// atomically replace the cache file at `path`.
    ///
    /// Paths that are not valid UTF-8 are not persisted: they cannot
    /// round-trip through a JSON key, and a lossy rendering could alias
    /// a different real path. Such files are simply rescanned next run.
    fn save_inner(&self, path: &Path, spec_hash: Option<&[u8; 32]>) -> std::io::Result<()> {
        // Concurrency note: two `keyhog scan --incremental` processes
        // running against overlapping paths will both want to write
        // the cache. Each writes its own randomly named tmp file, so
        // there is no tmp-name collision, but the final rename is
        // last-writer-wins.
        //
        // To minimise data loss on concurrent saves, READ the current
        // on-disk entries first and layer our in-memory state on top:
        // in-memory entries win (we just observed those files), while
        // disk entries we did not touch are preserved. This narrows the
        // data-loss window from "entire scan" to "between read and
        // rename". A truly race-free solution needs an OS-level file
        // lock; we accept the small remaining race because losing a
        // few entries means an extra rescan, not a missed leak.
        //
        // The disk state is loaded under the SAME spec hash we are
        // about to write: entries written under a different spec are
        // stale and get dropped here. Unreadable or corrupt disk state
        // loads as empty (and is logged by the loader), so it never
        // blocks writing a fresh file.
        let on_disk_now = match spec_hash {
            Some(hash) => Self::load_with_spec(path, hash),
            None => Self::load(path),
        };

        let mut merged: HashMap<PathBuf, CacheEntry> = HashMap::new();
        // The freshly loaded index is owned by us, so its shards can be
        // consumed without cloning any path.
        for shard in on_disk_now.shards {
            merged.extend(shard.into_inner());
        }
        // In-memory entries layer on top - last-write-wins by path.
        for shard in &self.shards {
            let guard = shard.read();
            merged.extend(guard.iter().map(|(file, entry)| (file.clone(), *entry)));
        }

        let mut entries: HashMap<String, EntryV2> = HashMap::with_capacity(merged.len());
        let mut skipped = 0usize;
        for (file, entry) in &merged {
            // `to_str` is exact for valid UTF-8 and `None` otherwise; a
            // lossy conversion would write a key that names some other
            // path (or none at all) when read back.
            let Some(key) = file.to_str() else {
                skipped += 1;
                continue;
            };
            entries.insert(
                key.to_owned(),
                EntryV2 {
                    mtime_ns: entry.mtime_ns,
                    size: entry.size,
                    hash: hex_encode(&entry.hash),
                },
            );
        }
        if skipped > 0 {
            tracing::debug!(
                count = skipped,
                "skipped non-UTF-8 paths while saving merkle index; they will be rescanned"
            );
        }

        let on_disk = OnDisk {
            version: SCHEMA_VERSION,
            spec_hash: spec_hash.map(hex_encode),
            entries,
        };
        let serialized = serde_json::to_vec_pretty(&on_disk)
            .map_err(|e| std::io::Error::other(format!("merkle index encode: {e}")))?;

        // `Path::parent` yields `Some("")` (not `None`) for a bare file
        // name, so the empty case has to be mapped to "." explicitly.
        let parent = match path.parent() {
            Some(dir) if !dir.as_os_str().is_empty() => dir,
            _ => Path::new("."),
        };
        std::fs::create_dir_all(parent)?;

        // `NamedTempFile::new_in` creates a randomly-named file in the
        // same directory as the final target, then `persist` renames it
        // over the target. If we panic between create and persist,
        // NamedTempFile's Drop deletes the tmp file. A SIGTERM/SIGKILL
        // still leaks it (Drop doesn't run); those leftovers are
        // cleaned up by the stale-tmp sweep that runs on load.
        let mut tmp = tempfile::NamedTempFile::new_in(parent)?;
        std::io::Write::write_all(&mut tmp, &serialized)?;
        // Flush to stable storage before the rename makes it visible.
        tmp.as_file().sync_all()?;
        tmp.persist(path).map_err(|e| e.error)?;
        Ok(())
    }

    /// Hash the given content with BLAKE3 (32-byte output).
    pub fn hash_content(content: &[u8]) -> [u8; 32] {
        let digest = blake3::hash(content);
        *digest.as_bytes()
    }

    /// Returns `true` when `path` was previously indexed with the SAME
    /// content hash. Kept for callers that already have the hash in hand
    /// (e.g. the orchestrator's chunk-level skip path). Unknown paths
    /// return `false`.
    pub fn unchanged(&self, path: &Path, content_hash: &[u8; 32]) -> bool {
        let shard = self.shards[shard_index(path)].read();
        matches!(shard.get(path), Some(prev) if prev.hash == *content_hash)
    }

    /// Returns `true` when `(path, mtime_ns, size)` exactly matches a
    /// stored entry. This is the **fast-path skip** - it avoids reading
    /// the file at all, which is the dominant cost on cold-cache disk.
    /// A `false` return means "either we've never seen this path, or
    /// metadata differs - caller must read + hash to decide."
    pub fn metadata_unchanged(&self, path: &Path, mtime_ns: u64, size: u64) -> bool {
        let shard = self.shards[shard_index(path)].read();
        matches!(
            shard.get(path),
            Some(prev) if prev.mtime_ns == mtime_ns && prev.size == size
        )
    }

    /// Returns the stored `(mtime_ns, size, content_hash)` for `path`,
    /// or `None` if the index hasn't seen it. Used by paranoid-mode
    /// verifiers that want to confirm content didn't change even when
    /// metadata happens to match.
    pub fn lookup(&self, path: &Path) -> Option<(u64, u64, [u8; 32])> {
        let shard = self.shards[shard_index(path)].read();
        let entry = shard.get(path)?;
        Some((entry.mtime_ns, entry.size, entry.hash))
    }

    /// Record a file's content hash. Back-compat shim that drops to a
    /// zero-metadata entry - calls into [`Self::record_with_metadata`]
    /// with `mtime_ns = 0` and `size = 0` so existing callers keep
    /// working but won't benefit from the metadata fast-path.
    pub fn record(&self, path: PathBuf, content_hash: [u8; 32]) {
        self.record_with_metadata(path, 0, 0, content_hash);
    }

    /// Record a file's metadata + content hash. Overwrites any prior
    /// entry at the same path. The shard's write lock is held for the
    /// duration of the insert only; concurrent recordings against
    /// different shards never contend.
    pub fn record_with_metadata(
        &self,
        path: PathBuf,
        mtime_ns: u64,
        size: u64,
        content_hash: [u8; 32],
    ) {
        let entry = CacheEntry {
            mtime_ns,
            size,
            hash: content_hash,
        };
        let slot = shard_index(&path);
        self.shards[slot].write().insert(path, entry);
    }

    /// Remove `path` from the index so the next scan treats it as new and
    /// re-reads + re-scans it.
    ///
    /// This is how incremental mode keeps its core safety guarantee: a file
    /// that produced ANY finding is never cached, so a secret in an otherwise
    /// unchanged file still surfaces on every later run instead of being
    /// silently skipped. Clean files - the 99% - stay cached, so the speedup
    /// is unaffected, and because we store the ABSENCE of an entry rather
    /// than the finding, no secret value ever touches the on-disk index.
    ///
    /// NOTE(review): saving merges entries that are currently on disk back
    /// in, so a forgotten path that still has an older on-disk entry is
    /// re-persisted with that older fingerprint. That only causes a skip if
    /// the file later matches the older fingerprint again - confirm this is
    /// acceptable for callers.
    pub fn forget(&self, path: &Path) {
        let slot = shard_index(path);
        self.shards[slot].write().remove(path);
    }

    /// Number of indexed entries, summed over all shards. Shards are read
    /// one at a time, so the total is not a single atomic snapshot under
    /// concurrent writes.
    pub fn len(&self) -> usize {
        self.shards
            .iter()
            .fold(0, |total, shard| total + shard.read().len())
    }

    /// Returns true if no cached entries are present across any shard.
    pub fn is_empty(&self) -> bool {
        self.shards.iter().all(|shard| shard.read().is_empty())
    }
}

impl Default for MerkleIndex {
    fn default() -> Self {
        Self::empty()
    }
}

/// Default index location: `keyhog/merkle.idx` under the per-user cache
/// directory reported by `dirs::cache_dir` - e.g.
/// `$XDG_CACHE_HOME/keyhog/merkle.idx` or `~/.cache/keyhog/merkle.idx`
/// on Linux, `~/Library/Caches/keyhog/...` on macOS.
///
/// Returns `None` when no cache directory can be determined.
pub fn default_cache_path() -> Option<PathBuf> {
    let cache_root = dirs::cache_dir()?;
    Some(cache_root.join("keyhog").join("merkle.idx"))
}

/// Stale-tmp-file age cutoff. `tempfile::NamedTempFile`'s Drop impl
/// cleans up on panic but NOT on SIGKILL/SIGTERM - those leak a
/// random-named tmp file in the cache directory. Older than this
/// cutoff means "no chance an in-flight save by another keyhog
/// process is still using it." 1 hour is generous; the longest
/// merkle save in observed runs is < 1 second on a fully-loaded
/// 100k-file scan.
const STALE_TMP_CUTOFF_SECS: u64 = 60 * 60;

/// Returns `true` when `name` has the shape of a tmp file left by a
/// cache save: either the legacy `<stem>.tmp*` sibling, or `tempfile`'s
/// default name (`.tmp` followed by exactly six ASCII alphanumerics).
///
/// The second form is matched exactly rather than by prefix so that an
/// unrelated dotfile such as `.tmpfiles` or `.tmp-notes` sharing the
/// directory is never deleted.
fn is_cache_tmp_name(name: &str, legacy_prefix: &str) -> bool {
    // Random-suffix length `tempfile::Builder` uses by default.
    const TEMPFILE_RAND_LEN: usize = 6;

    if name.starts_with(legacy_prefix) {
        return true;
    }
    name.strip_prefix(".tmp").is_some_and(|rand| {
        rand.len() == TEMPFILE_RAND_LEN && rand.bytes().all(|b| b.is_ascii_alphanumeric())
    })
}

/// Best-effort sweep of stale tmp files left behind by SIGKILL'd
/// keyhog processes. Called from `load`/`load_with_spec` before
/// reading the cache so stale tmps don't accumulate forever next
/// to the real `merkle.idx`. Logged at debug level only since
/// failure is non-fatal.
///
/// Only files in the cache file's own directory are considered, and
/// only those whose name matches [`is_cache_tmp_name`] and whose mtime
/// is at least [`STALE_TMP_CUTOFF_SECS`] in the past. Every error
/// (unreadable directory, missing metadata, failed delete) is skipped
/// silently.
fn sweep_stale_tmp_files(cache_path: &Path) {
    // `Path::parent` returns `Some("")` for a bare file name such as
    // `merkle.idx`; `read_dir("")` would fail, so resolve that case to
    // the current directory instead of silently never sweeping.
    let parent = match cache_path.parent() {
        Some(dir) if !dir.as_os_str().is_empty() => dir,
        Some(_) => Path::new("."),
        None => return,
    };
    let Ok(entries) = std::fs::read_dir(parent) else {
        return;
    };
    let stem = cache_path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("merkle");
    // Built once, not per directory entry.
    let legacy_prefix = format!("{stem}.tmp");
    let now = std::time::SystemTime::now();
    let mut swept = 0usize;
    for entry in entries.flatten() {
        let name = entry.file_name();
        let Some(name_str) = name.to_str() else {
            continue;
        };
        if !is_cache_tmp_name(name_str, &legacy_prefix) {
            continue;
        }
        let path = entry.path();
        let Ok(meta) = path.metadata() else { continue };
        let Ok(modified) = meta.modified() else {
            continue;
        };
        let age = match now.duration_since(modified) {
            Ok(d) => d,
            Err(_) => continue, // mtime in the future - skip rather than guess
        };
        // Young tmp files may belong to a save that is still in flight.
        if age.as_secs() < STALE_TMP_CUTOFF_SECS {
            continue;
        }
        if std::fs::remove_file(&path).is_ok() {
            swept += 1;
        }
    }
    if swept > 0 {
        tracing::debug!(
            count = swept,
            dir = %parent.display(),
            "swept stale cache tmp files left by an interrupted save"
        );
    }
}