keyhog_scanner/static_intern.rs
1//! Static-string interner for the frozen detector-metadata universe.
2//!
3//! Built once at scanner construction from the universe of metadata
4//! strings that are stable across a scan run - every detector's
5//! `id`, `name`, `service`, plus the small set of `source_type`
6//! literals every source backend uses (`filesystem`, `git`,
7//! `git/history`, `stdin`, `s3`, `docker`, `web`, `github`, `slack`).
8//!
9//! At scan time, `lookup(s)` returns a pre-allocated `Arc<str>` for
10//! known strings without touching the global allocator. Unknown
11//! strings (file paths, commit SHAs, author names, dates) fall
12//! through to the per-scan `HashSet` interner in `ScanState`.
13//!
14//! ## Lookup backing: single-hash `ahash` map (PERF-locality_intern-1)
15//!
16//! The interner previously used vyre's CHD perfect hash. CHD is O(1) in the
17//! big-O sense, but its constant factor is FOUR full-key traversals per lookup:
18//! two seeded FNV-1a passes (bucket + slot), one xxHash-style verify pass, and a
19//! final byte-for-byte `arc == s` compare. FNV-1a folds one byte per loop
20//! iteration, so on the per-match hot path (three metadata fields per emitted
21//! finding) that is twelve whole-key traversals per match - the dominant cost
22//! the locality tripwire pins.
23//!
24//! `lookup` now resolves through a single `ahash` map keyed by the interned
25//! string. `ahash` mixes the key in 8-byte words with hardware multiply/rotate
26//! ops rather than one function call per byte, so a lookup is ONE fast hash +
27//! one bucket compare instead of three byte-loops + a compare. The map stores
28//! the arena index, and we return `arena[idx].clone()` - byte-identical to the
29//! string the CHD path returned. The map is built once and read-only at scan
30//! time, so every rayon worker shares it lock-free (an `&HashMap` read needs no
31//! synchronization). For callers that already hold the detector index, the
32//! scanner caches the resolved triple by index and skips `lookup` entirely
33//! (`CompiledScanner::interned_detector_metadata`).
34
35use std::sync::Arc;
36
37/// Stable source-type identifiers every keyhog source backend
38/// emits. Pre-interned because every match lands a copy of one of
39/// these in `MatchLocation::source`. Keep this list in sync with
40/// `keyhog_sources::Source::name()` implementations.
41pub(crate) const SEED_SOURCE_TYPES: &[&str] = &[
42 "filesystem",
43 "git",
44 "git/history",
45 "git/diff",
46 "git/staged",
47 "git-diff",
48 "stdin",
49 "s3",
50 "docker",
51 "web",
52 "github",
53 "slack",
54 "binary",
55];
56
57/// Frozen static-string interner. Built once at scanner
58/// construction; cloneable via `Arc` so every rayon worker shares
59/// one read-only instance.
60///
61/// `index` maps each interned string to its slot in `arena`; it is read-only
62/// after construction, so concurrent `lookup`s need no synchronization. The
63/// `ahash` hasher gives a single fast (8-byte-word, hardware-mixed) hash per
64/// lookup instead of the CHD perfect hash's three per-byte hash passes.
65#[derive(Default)]
66pub struct StaticInterner {
67 arena: Vec<Arc<str>>,
68 index: std::collections::HashMap<Arc<str>, u32, ahash::RandomState>,
69}
70
71impl StaticInterner {
72 /// Build an interner from the universe of stable strings: detector
73 /// metadata fields + the seed source-type list. Duplicates are
74 /// collapsed automatically (the map keeps one entry per distinct key).
75 pub fn from_detector_strings<I, S>(detector_strings: I) -> Self
76 where
77 I: IntoIterator<Item = S>,
78 S: AsRef<str>,
79 {
80 // Dedupe + freeze the input set. BTreeSet keeps the arena order stable
81 // and deterministic across runs (matters for any index-keyed cache the
82 // scanner derives from this arena, e.g. metadata_by_index).
83 let mut all: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
84 for s in detector_strings {
85 all.insert(s.as_ref().to_owned());
86 }
87 for s in SEED_SOURCE_TYPES {
88 all.insert((*s).to_owned());
89 }
90
91 if all.is_empty() {
92 return Self::default();
93 }
94
95 let arena: Vec<Arc<str>> = all.iter().map(|s| Arc::from(s.as_str())).collect();
96 let mut index: std::collections::HashMap<Arc<str>, u32, ahash::RandomState> =
97 std::collections::HashMap::with_capacity_and_hasher(
98 arena.len(),
99 ahash::RandomState::new(),
100 );
101 for (i, arc) in arena.iter().enumerate() {
102 index.insert(Arc::clone(arc), i as u32);
103 }
104 Self { arena, index }
105 }
106
107 /// Single-hash lookup. Returns a clone of the pre-allocated `Arc<str>`
108 /// when `s` is in the interner; `None` otherwise. One `ahash` pass over the
109 /// key plus a bucket compare - no second hash, no separate verify pass.
110 /// `Arc<str>: Borrow<str>` makes `get(s)` allocation-free on hits.
111 #[inline]
112 pub fn lookup(&self, s: &str) -> Option<Arc<str>> {
113 let &idx = self.index.get(s)?;
114 // The index can only hold valid arena slots (populated from arena
115 // above), but keep the bounds-checked `get` for a total function.
116 self.arena.get(idx as usize).cloned()
117 }
118
119 /// Number of pre-interned strings.
120 pub fn len(&self) -> usize {
121 self.arena.len()
122 }
123
124 pub fn is_empty(&self) -> bool {
125 self.arena.is_empty()
126 }
127}
128
129pub fn seed_source_type_count() -> usize {
130 SEED_SOURCE_TYPES.len()
131}