1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
//! Logic for compiling detector specifications into an efficient scanning engine.
use crate::error::{Result, ScanError};
use crate::types::*;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use keyhog_core::{CompanionSpec, DetectorSpec, PatternSpec};
use regex::Regex;
use super::compiler_prefix::extract_literal_prefixes;
/// Builds the Aho-Corasick prefilter automaton over the extracted literal prefixes.
///
/// Returns `Ok(None)` when `literals` is empty, so callers can skip the
/// prefilter entirely instead of holding an automaton with no patterns.
///
/// # Errors
/// Propagates the automaton build error (converted through `?`).
pub fn build_ac_pattern_set(literals: &[String]) -> Result<Option<AhoCorasick>> {
    if literals.is_empty() {
        return Ok(None);
    }

    // Matching is ASCII case-insensitive on purpose: it mirrors Hyperscan's
    // PatternFlags::CASELESS (see simd.rs). A case-sensitive automaton makes
    // the CpuFallback backend miss literal hits on case-varied text (e.g.
    // `akia` / `AKia`) that the SimdCpu backend finds, which shows up as
    // per-backend finding divergence in the gpu_proptest_invariants P1b
    // proptest. Detector keywords depend on the same caseless behaviour for
    // env-var shapes such as `AWS_KEY_ID` vs `aws_key_id`, which is why
    // `build_fallback_keyword_ac` in this file sets the same flag.
    let automaton = AhoCorasickBuilder::new()
        .ascii_case_insensitive(true)
        .build(literals)?;

    Ok(Some(automaton))
}
/// Copies the literal set into byte vectors for the GPU (Vyre) literal scan.
///
/// The output keeps the exact order of `ac_literals`, so Vyre match pattern
/// IDs map back to `ac_map` without an adapter table.
///
/// Returns `None` when there are no literals, or when any literal is empty
/// (in which case a warning is logged and the GPU literal scan is disabled).
pub fn build_gpu_literals(ac_literals: &[String]) -> Option<std::sync::Arc<Vec<Vec<u8>>>> {
    // Nothing to prepare; this path stays silent.
    if ac_literals.is_empty() {
        return None;
    }

    let has_empty_literal = ac_literals.iter().any(|literal| literal.is_empty());
    if has_empty_literal {
        tracing::warn!("GPU literal set contains an empty literal; disabling GPU literal scan");
        return None;
    }

    let literals: Vec<Vec<u8>> = ac_literals
        .iter()
        .map(|literal| Vec::from(literal.as_bytes()))
        .collect();

    tracing::info!(
        patterns = literals.len(),
        "GPU literal set prepared for Vyre"
    );
    Some(std::sync::Arc::new(literals))
}
/// For every literal, lists the indices of the *other* entries in `literals`
/// that hold the exact same string.
///
/// The result has one entry per input literal. A literal that occurs only
/// once gets an empty list; duplicates get their sibling indices in
/// ascending order, never including their own index.
pub fn build_same_prefix_patterns(literals: &[String]) -> Vec<Vec<usize>> {
    use std::collections::HashMap;

    // Bucket indices by literal text. Indices are appended in input order,
    // so every bucket is already sorted ascending.
    let mut by_literal: HashMap<&str, Vec<usize>> = HashMap::new();
    for (position, literal) in literals.iter().enumerate() {
        by_literal
            .entry(literal.as_str())
            .or_default()
            .push(position);
    }

    // Each literal's siblings are its bucket minus itself; a singleton
    // bucket therefore yields an empty list.
    literals
        .iter()
        .enumerate()
        .map(|(own, literal)| {
            by_literal[literal.as_str()]
                .iter()
                .copied()
                .filter(|&other| other != own)
                .collect()
        })
        .collect()
}
/// Builds the prefix-propagation table for `literals`.
///
/// Thin wrapper that forwards to `crate::prefix_trie::build_propagation_table`
/// and returns its result unchanged.
// NOTE(review): presumably entry `i` lists the indices of literals related to
// literal `i` by a prefix relationship - confirm against `prefix_trie`.
pub fn build_prefix_propagation(literals: &[String]) -> Vec<Vec<usize>> {
crate::prefix_trie::build_propagation_table(literals)
}
pub fn build_fallback_keyword_ac(
fallback: &[(CompiledPattern, Vec<String>)],
) -> (Option<AhoCorasick>, Vec<Vec<usize>>) {
let mut all_keywords = Vec::new();
let mut keyword_to_patterns = Vec::new();
let mut keyword_map: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
for (pattern_idx, (_, keywords)) in fallback.iter().enumerate() {
for kw in keywords {
// Floor stays at 4: lowering it to 3 to admit
// mailchimp's `-us`/`-eu`/`-uk` and openai/anthropic's
// `sk-`/`sk-ant-`/`pk-` measured a NET F1 regression
// (-67 TP, +28 FP) on SecretBench-medium 15k seed-0
// because (a) too-broad fallback detectors like
// helicone-api-key `sk-[a-zA-Z0-9]{20,}` fired
// wrongly on neighboring lines and (b) the recall
// gain on mailchimp was small. The right fix for
// those detectors is per-detector keyword tightening,
// not a global threshold change.
if kw.len() < 4 {
continue;
}
let idx = *keyword_map.entry(kw.clone()).or_insert_with(|| {
all_keywords.push(kw.clone());
keyword_to_patterns.push(Vec::new());
all_keywords.len() - 1
});
keyword_to_patterns[idx].push(pattern_idx);
}
}
if all_keywords.is_empty() {
return (None, Vec::new());
}
let ac = AhoCorasickBuilder::new()
.ascii_case_insensitive(true)
.build(all_keywords)
.ok();
(ac, keyword_to_patterns)
}
/// Emits every collected detector-quality warning as a `WARN`-level tracing
/// event under the `keyhog::scanner::quality` target, in input order.
pub fn log_quality_warnings(warnings: &[String]) {
    warnings.iter().for_each(|warning| {
        tracing::warn!(target: "keyhog::scanner::quality", "{}", warning);
    });
}
/// Compiles all companion specs of `detector`, preserving their order.
///
/// # Errors
/// Stops at the first companion that fails to compile and returns the error
/// produced by `compile_companion`.
pub fn compile_detector_companions(detector: &DetectorSpec) -> Result<Vec<CompiledCompanion>> {
    let mut compiled = Vec::with_capacity(detector.companions.len());
    for companion in detector.companions.iter() {
        compiled.push(compile_companion(companion, &detector.id)?);
    }
    Ok(compiled)
}
/// Compiles one detector pattern and routes it to the matching prefilter set.
///
/// * If the regex yields literal prefixes, each prefix is appended to
///   `ac_literals` and a copy of the compiled pattern is appended to `ac_map`,
///   keeping the two vectors index-aligned.
/// * Otherwise the pattern is appended to `fallback` together with the
///   detector's keywords; if the detector has no keywords either, a message
///   is added to `quality_warnings`.
///
/// # Errors
/// Returns the error from `compile_pattern` when the regex is rejected;
/// nothing is pushed to any output vector in that case.
#[allow(clippy::too_many_arguments)]
pub fn compile_detector_pattern(
    detector_index: usize,
    detector: &DetectorSpec,
    pattern_index: usize,
    pattern: &PatternSpec,
    ac_literals: &mut Vec<String>,
    ac_map: &mut Vec<CompiledPattern>,
    fallback: &mut Vec<(CompiledPattern, Vec<String>)>,
    quality_warnings: &mut Vec<String>,
) -> Result<()> {
    let detector_id = &detector.id;
    let compiled = compile_pattern(detector_index, pattern_index, pattern, detector_id)?;

    // Literal prefixes feed the Aho-Corasick prefilter.
    let prefixes = extract_literal_prefixes(&pattern.regex);

    // Homoglyph expansion is deliberately NOT done here. An earlier version
    // built a fallback regex shaped `^<expanded_prefix>` with no body
    // constraint, which matches any string that merely starts with a
    // homoglyph variant of the prefix (the flutterwave false-positive bug).
    // Per the audit note this entry point has no internal call sites and is
    // kept only as a `pub` surface, so the prefix-only fallback is skipped;
    // callers needing homoglyph defense should go through the
    // CompiledScanner::compile pipeline, which applies the validated rewrite
    // plus full-body anchoring.

    if prefixes.is_empty() {
        // No literal prefix. With Hyperscan these patterns are compiled
        // directly into the HS database next to the AC-prefix patterns;
        // without it they go to the keyword-gated regex fallback.
        if detector.keywords.is_empty() {
            quality_warnings.push(format!(
                "Detector {detector_id} pattern {pattern_index} has no literal prefix and no keywords."
            ));
        }
        fallback.push((compiled, detector.keywords.clone()));
        return Ok(());
    }

    tracing::debug!(
        detector_id,
        ?prefixes,
        mode = "AC",
        "compiled detector pattern"
    );
    // One `ac_map` entry per prefix keeps `ac_map[i]` paired with
    // `ac_literals[i]`.
    ac_map.extend(prefixes.iter().map(|_| compiled.clone()));
    ac_literals.extend(prefixes);
    Ok(())
}
pub fn compile_pattern(
detector_index: usize,
pattern_index: usize,
spec: &PatternSpec,
detector_id: &str,
) -> Result<CompiledPattern> {
// Eagerly validate regex SYNTAX so a malformed detector pattern (e.g.
// `(unclosed`) is rejected at compile time rather than silently degrading
// to a never-matching rule on first use - a silently-accepted bad regex
// is a dead detector. The cheap `regex_syntax` parse runs for every
// pattern but builds NO NFA/DFA, so the lazy-compile win below is
// preserved; `regex_syntax` is the same front end the `regex` crate uses,
// so a parse error here is a build error there.
if regex_syntax::Parser::new().parse(&spec.regex).is_err() {
// Re-run through the `regex` crate only on the rare invalid pattern to
// obtain the canonical `regex::Error` for the structured error; the
// matcher-build cost here is irrelevant since we're erroring out.
let source = regex::Regex::new(&spec.regex)
.err()
.unwrap_or_else(|| regex::Error::Syntax(spec.regex.clone()));
return Err(ScanError::RegexCompile {
detector_id: detector_id.to_string(),
index: pattern_index,
source,
});
}
// The matcher is NOT built here - it is deferred to first use via
// `LazyRegex` (see types.rs). Building the whole corpus up front cost
// ~450ms-2.3s per invocation; deferral lets a scan compile only the
// patterns the Aho-Corasick prefilter actually selects.
Ok(CompiledPattern {
detector_index,
regex: LazyRegex::detector(spec.regex.as_str()),
group: spec.group,
client_safe: spec.client_safe,
})
}
/// Process-wide cache of compiled regexes, keyed by the regex source string.
///
/// Created on first access (`get_or_init`) by `warm_shared_regex_cache` or
/// `shared_regex`; values are `Arc`s so one compiled regex can be handed to
/// many users without recompiling.
static REGEX_CACHE: std::sync::OnceLock<dashmap::DashMap<String, std::sync::Arc<Regex>>> =
std::sync::OnceLock::new();
/// Compiles `pattern` with the scanner's standard regex settings and wraps
/// the result in an `Arc`. This function does not touch the shared cache.
///
/// Settings applied: case-insensitive matching, CRLF-aware line handling,
/// a compiled-size cap of `REGEX_SIZE_LIMIT_BYTES`, and a DFA cache cap of
/// `regex_dfa_limit()`.
///
/// # Errors
/// Returns the `regex::Error` from the builder when compilation fails.
pub fn shared_regex_compile(
    pattern: &str,
) -> std::result::Result<std::sync::Arc<Regex>, regex::Error> {
    let mut builder = regex::RegexBuilder::new(pattern);
    builder
        .case_insensitive(true)
        .size_limit(REGEX_SIZE_LIMIT_BYTES)
        .dfa_size_limit(regex_dfa_limit())
        .crlf(true);
    builder.build().map(std::sync::Arc::new)
}
/// Seeds the shared regex cache with regexes that were compiled elsewhere.
///
/// Each `(pattern, result)` pair whose result is `Ok` is inserted under its
/// pattern string, replacing any entry already stored for that key. Pairs
/// whose result is `Err` are dropped without being reported.
pub fn warm_shared_regex_cache(
    compiled: Vec<(
        String,
        std::result::Result<std::sync::Arc<Regex>, regex::Error>,
    )>,
) {
    let cache = REGEX_CACHE.get_or_init(dashmap::DashMap::new);
    compiled
        .into_iter()
        .filter_map(|(pattern, outcome)| outcome.ok().map(|regex| (pattern, regex)))
        .for_each(|(pattern, regex)| {
            cache.insert(pattern, regex);
        });
}
/// Returns the shared compiled regex for `pattern`, compiling it at most once
/// per cached entry.
///
/// Every detector that uses the same source string gets a clone of the same
/// `Arc<Regex>`. The 889-detector corpus has ~6-15% duplicate regexes
/// (Google, JWT, Slack shapes); collapsing each duplicate set into a single
/// compiled instance cuts startup compile time and resident memory
/// proportionally - see audits/legendary-2026-04-26
/// sources_verifier_detectors_legendary.md.
///
/// The cache is a process-wide `dashmap::DashMap`, so it can be used from the
/// parallel compile without an external lock. Two threads that miss at the
/// same time may both compile the pattern; the first stored value wins and
/// both callers receive that one.
///
/// # Errors
/// Returns the `regex::Error` from `shared_regex_compile` on a cache miss
/// whose pattern fails to compile; nothing is cached in that case.
pub(crate) fn shared_regex(
    pattern: &str,
) -> std::result::Result<std::sync::Arc<Regex>, regex::Error> {
    let cache = REGEX_CACHE.get_or_init(dashmap::DashMap::new);

    // Fast path: clone the Arc out of the map so the read guard is released
    // before anything else happens.
    let cached = cache
        .get(pattern)
        .map(|hit| std::sync::Arc::clone(hit.value()));
    if let Some(regex) = cached {
        return Ok(regex);
    }

    // Miss: compile outside the map, then keep whichever value is stored
    // first for this key.
    let compiled = shared_regex_compile(pattern)?;
    let slot = cache.entry(pattern.to_string()).or_insert(compiled);
    Ok(std::sync::Arc::clone(slot.value()))
}
/// Compiles a companion spec into a `CompiledCompanion`.
///
/// The regex is built eagerly with a compiled-size cap of
/// `REGEX_SIZE_LIMIT_BYTES`, a DFA cache cap of `regex_dfa_limit()`, and CRLF
/// mode enabled. Unlike `shared_regex_compile`, case-insensitivity is not
/// switched on here. When the regex declares at least one capture group,
/// `capture_group` is set to `FIRST_CAPTURE_GROUP_INDEX`; otherwise it is
/// `None`.
///
/// # Errors
/// Returns `ScanError::RegexCompile` with `detector_id` when the regex fails
/// to build. The error's `index` field is always `FIRST_CAPTURE_GROUP_INDEX`,
/// not the position of the companion within the detector.
pub fn compile_companion(spec: &CompanionSpec, detector_id: &str) -> Result<CompiledCompanion> {
    let mut builder = regex::RegexBuilder::new(&spec.regex);
    builder
        .size_limit(REGEX_SIZE_LIMIT_BYTES)
        .dfa_size_limit(regex_dfa_limit())
        .crlf(true);

    let regex = match builder.build() {
        Ok(regex) => regex,
        Err(source) => {
            return Err(ScanError::RegexCompile {
                detector_id: detector_id.to_string(),
                index: FIRST_CAPTURE_GROUP_INDEX,
                source,
            });
        }
    };

    // `captures_len()` counts the implicit whole-match group, so a value
    // above 1 means the pattern has at least one explicit group.
    let capture_group = if regex.captures_len() > 1 {
        Some(FIRST_CAPTURE_GROUP_INDEX)
    } else {
        None
    };

    Ok(CompiledCompanion {
        name: spec.name.clone(),
        regex,
        capture_group,
        within_lines: spec.within_lines,
        required: spec.required,
    })
}