Skip to main content

keyhog_scanner/
static_intern.rs

1//! Static-string interner for the frozen detector-metadata universe.
2//!
3//! Built once at scanner construction from the universe of metadata
4//! strings that are stable across a scan run - every detector's
5//! `id`, `name`, `service`, plus the small set of `source_type`
6//! literals every source backend uses (`filesystem`, `git`,
7//! `git/history`, `stdin`, `s3`, `docker`, `web`, `github`, `slack`).
8//!
9//! At scan time, `lookup(s)` returns a pre-allocated `Arc<str>` for
10//! known strings without touching the global allocator. Unknown
11//! strings (file paths, commit SHAs, author names, dates) fall
12//! through to the per-scan `HashSet` interner in `ScanState`.
13//!
14//! ## Lookup backing: single-hash `ahash` map (PERF-locality_intern-1)
15//!
16//! The interner previously used vyre's CHD perfect hash. CHD is O(1) in the
17//! big-O sense, but its constant factor is FOUR full-key traversals per lookup:
18//! two seeded FNV-1a passes (bucket + slot), one xxHash-style verify pass, and a
19//! final byte-for-byte `arc == s` compare. FNV-1a folds one byte per loop
20//! iteration, so on the per-match hot path (three metadata fields per emitted
21//! finding) that is twelve whole-key traversals per match - the dominant cost
22//! the locality tripwire pins.
23//!
24//! `lookup` now resolves through a single `ahash` map keyed by the interned
25//! string. `ahash` mixes the key in 8-byte words with hardware multiply/rotate
26//! ops rather than one function call per byte, so a lookup is ONE fast hash +
27//! one bucket compare instead of three byte-loops + a compare. The map stores
28//! the arena index, and we return `arena[idx].clone()` - byte-identical to the
29//! string the CHD path returned. The map is built once and read-only at scan
30//! time, so every rayon worker shares it lock-free (an `&HashMap` read needs no
31//! synchronization). For callers that already hold the detector index, the
32//! scanner caches the resolved triple by index and skips `lookup` entirely
33//! (`CompiledScanner::interned_detector_metadata`).
34
35use std::sync::Arc;
36
37/// Stable source-type identifiers every keyhog source backend
38/// emits. Pre-interned because every match lands a copy of one of
39/// these in `MatchLocation::source`. Keep this list in sync with
40/// `keyhog_sources::Source::name()` implementations.
41pub(crate) const SEED_SOURCE_TYPES: &[&str] = &[
42    "filesystem",
43    "git",
44    "git/history",
45    "git/diff",
46    "git/staged",
47    "git-diff",
48    "stdin",
49    "s3",
50    "docker",
51    "web",
52    "github",
53    "slack",
54    "binary",
55];
56
57/// Frozen static-string interner. Built once at scanner
58/// construction; cloneable via `Arc` so every rayon worker shares
59/// one read-only instance.
60///
61/// `index` maps each interned string to its slot in `arena`; it is read-only
62/// after construction, so concurrent `lookup`s need no synchronization. The
63/// `ahash` hasher gives a single fast (8-byte-word, hardware-mixed) hash per
64/// lookup instead of the CHD perfect hash's three per-byte hash passes.
65#[derive(Default)]
66pub struct StaticInterner {
67    arena: Vec<Arc<str>>,
68    index: std::collections::HashMap<Arc<str>, u32, ahash::RandomState>,
69}
70
71impl StaticInterner {
72    /// Build an interner from the universe of stable strings: detector
73    /// metadata fields + the seed source-type list. Duplicates are
74    /// collapsed automatically (the map keeps one entry per distinct key).
75    pub fn from_detector_strings<I, S>(detector_strings: I) -> Self
76    where
77        I: IntoIterator<Item = S>,
78        S: AsRef<str>,
79    {
80        // Dedupe + freeze the input set. BTreeSet keeps the arena order stable
81        // and deterministic across runs (matters for any index-keyed cache the
82        // scanner derives from this arena, e.g. metadata_by_index).
83        let mut all: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
84        for s in detector_strings {
85            all.insert(s.as_ref().to_owned());
86        }
87        for s in SEED_SOURCE_TYPES {
88            all.insert((*s).to_owned());
89        }
90
91        if all.is_empty() {
92            return Self::default();
93        }
94
95        let arena: Vec<Arc<str>> = all.iter().map(|s| Arc::from(s.as_str())).collect();
96        let mut index: std::collections::HashMap<Arc<str>, u32, ahash::RandomState> =
97            std::collections::HashMap::with_capacity_and_hasher(
98                arena.len(),
99                ahash::RandomState::new(),
100            );
101        for (i, arc) in arena.iter().enumerate() {
102            index.insert(Arc::clone(arc), i as u32);
103        }
104        Self { arena, index }
105    }
106
107    /// Single-hash lookup. Returns a clone of the pre-allocated `Arc<str>`
108    /// when `s` is in the interner; `None` otherwise. One `ahash` pass over the
109    /// key plus a bucket compare - no second hash, no separate verify pass.
110    /// `Arc<str>: Borrow<str>` makes `get(s)` allocation-free on hits.
111    #[inline]
112    pub fn lookup(&self, s: &str) -> Option<Arc<str>> {
113        let &idx = self.index.get(s)?;
114        // The index can only hold valid arena slots (populated from arena
115        // above), but keep the bounds-checked `get` for a total function.
116        self.arena.get(idx as usize).cloned()
117    }
118
119    /// Number of pre-interned strings.
120    pub fn len(&self) -> usize {
121        self.arena.len()
122    }
123
124    pub fn is_empty(&self) -> bool {
125        self.arena.is_empty()
126    }
127}
128
129pub fn seed_source_type_count() -> usize {
130    SEED_SOURCE_TYPES.len()
131}