keyhog_core/allowlist.rs
1//! Allowlist support: `.keyhogignore` file parsing for suppressing known false
2//! positives by path glob, detector ID, or credential hash.
3
//! Allowlist: known false positives and ignored patterns.
//!
//! Users can create a `.keyhogignore` file to suppress known false positives.
//! Format (one entry per line):
//! - `hash:<sha256>` - ignore a specific credential by its 64-character hex SHA-256 hash
//! - `detector:<id>` - ignore all findings from a detector
//! - `path:<glob>` - ignore files matching a glob pattern
//! - a bare 64-character hex string is treated like `hash:<sha256>`
//! - any other bare entry is treated like `path:<glob>`
//! - `# comment` - lines starting with `#` are comments
//! - blank lines are skipped
13use std::collections::HashMap;
14use std::collections::HashSet;
15use std::path::Component;
16use std::path::Path;
17
18use crate::merkle_spec_hash::hex_nibble;
19use crate::VerifiedFinding;
20
21#[path = "allowlist_metadata.rs"]
22mod allowlist_metadata;
23use allowlist_metadata::*;
24
/// User-defined suppressions loaded from `.keyhogignore`: credential hashes, detector IDs, and path globs.
///
/// The three public collections are plain data; the private `path_index` is a
/// precompiled view of `ignored_paths` used by the path matcher.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::allowlist::Allowlist;
///
/// let allowlist = Allowlist::parse("detector:demo-token\npath:**/*.md\n");
/// assert!(allowlist.ignored_detectors.contains("demo-token"));
/// ```
#[derive(Debug, Clone, serde::Serialize)]
pub struct Allowlist {
    /// SHA-256 digests (raw 32-byte form) of credentials to ignore.
    pub credential_hashes: HashSet<[u8; 32]>,
    /// Detector IDs to ignore entirely.
    pub ignored_detectors: HashSet<String>,
    /// Glob patterns for paths to ignore (raw, as authored). Kept as the public
    /// contract + serialized form; the matcher consumes the precompiled
    /// [`PathGlobIndex`] built from these in [`Allowlist::parse`].
    pub ignored_paths: Vec<String>,
    /// Precompiled, first-segment-bucketed form of `ignored_paths`. Built by
    /// `parse`; `empty` starts from the default (empty) index. Skipped by
    /// `serde` because it is derived entirely from `ignored_paths`, so the
    /// serialized shape only contains the three public fields. When
    /// `ignored_paths.len()` no longer equals the length the index was built
    /// from, `path_matches` builds a temporary index instead of using this one.
    #[serde(skip)]
    path_index: PathGlobIndex,
}
53
/// Upper bound on the number of segments in a pattern or a path. Anything with
/// more segments is treated as oversize and never matches.
const MAX_GLOB_SEGMENTS: usize = 256;
/// Upper bound on the byte length of a single pattern or path segment. A longer
/// segment makes the whole pattern/path oversize, so it never matches.
const MAX_GLOB_SEGMENT_LEN: usize = 1024;
56
/// One precompiled ignored-path glob: its normalized segments computed ONCE at
/// build time, plus the pattern-side oversize verdict, so neither has to be
/// recomputed per finding. Which bucket a glob lands in is decided by
/// `PathGlobIndex::build` from the first entry of `segments`; the glob itself
/// stores no anchoring information.
#[derive(Debug, Clone)]
struct CompiledGlob {
    /// Normalized pattern segments (the `normalize_path` + `split_segments`
    /// result, owned). Empty when the pattern normalized to nothing.
    segments: Vec<String>,
    /// True when the PATTERN exceeds `MAX_GLOB_SEGMENTS` or has a segment longer
    /// than `MAX_GLOB_SEGMENT_LEN`. An oversize pattern is pre-marked here so it
    /// never matches anything. The path-side size check is separate and is
    /// performed at match time in `PathGlobIndex::matches`.
    oversize: bool,
}
73
/// First-segment bucketed index over the compiled globs. A path can match a
/// glob only if the glob's first segment is `**` (matches any prefix), or it
/// matches the path's FIRST segment. Literal first segments key a hash bucket;
/// first segments containing `*` (including `**`), which can match many first
/// segments, fall into `wild_first` and are always tested. This turns the
/// per-finding sweep over every rule into a test of `wild_first` plus the one
/// literal bucket keyed by the path's first segment, while delegating the
/// actual decision to `glob_match_segments`.
#[derive(Debug, Clone, Default)]
struct PathGlobIndex {
    /// Globs whose first segment is a pure literal (contains no `*`), keyed by
    /// that literal. A path is tested only against the bucket for its own
    /// first segment.
    literal_first: HashMap<String, Vec<CompiledGlob>>,
    /// Globs whose first segment is `**` or contains a `*` wildcard (it can
    /// match more than one distinct first segment, so it cannot be bucketed by
    /// a literal). Always tested.
    wild_first: Vec<CompiledGlob>,
    /// Globs that normalized to ZERO segments (e.g. a pattern that was only
    /// `.` noise). `glob_match_segments(&[], path)` is true only for the empty
    /// path, so these are kept apart and only consulted for that case.
    empty_pattern: Vec<CompiledGlob>,
    /// Number of source patterns this index was compiled from. `ignored_paths`
    /// is a PUBLIC, mutable field, so callers may push/extend/clear it directly
    /// after construction. `Allowlist::path_matches` compares this against the
    /// live `ignored_paths.len()` and builds a fresh index on mismatch.
    /// `Allowlist::parse` and `Allowlist::empty` keep it in sync.
    /// NOTE(review): the check is length-only, so an in-place edit that keeps
    /// the same number of patterns is not detected.
    source_len: usize,
}
104
105impl PathGlobIndex {
106 /// Build the index from raw ignored-path patterns. Runs `normalize_path` +
107 /// `split_segments` + the oversize scan ONCE per pattern (the work
108 /// `glob_match_normalized` previously repeated on every finding).
109 fn build(patterns: &[String]) -> Self {
110 let mut index = PathGlobIndex::default();
111 index.source_len = patterns.len();
112 for pattern in patterns {
113 let normalized_pattern = normalize_path(pattern);
114 let segments: Vec<String> = split_segments(&normalized_pattern)
115 .into_iter()
116 .map(str::to_string)
117 .collect();
118 // Mirror the pattern half of the original oversize fail-safe: an
119 // oversize pattern can never match (it returned false before).
120 let oversize = segments.len() > MAX_GLOB_SEGMENTS
121 || segments.iter().any(|s| s.len() > MAX_GLOB_SEGMENT_LEN);
122 let glob = CompiledGlob { segments, oversize };
123
124 match glob.segments.first() {
125 None => index.empty_pattern.push(glob),
126 Some(first) if first == "**" || first.contains('*') => {
127 index.wild_first.push(glob);
128 }
129 Some(first) => {
130 index
131 .literal_first
132 .entry(first.clone())
133 .or_default()
134 .push(glob);
135 }
136 }
137 }
138 index
139 }
140
141 /// True when any compiled glob matches `normalized_path`. Tests only the
142 /// candidate set for `normalized_path`'s first segment plus the always-on
143 /// wildcard-anchored globs - never the full rule list.
144 fn matches(&self, normalized_path: &str) -> bool {
145 let path_segments = split_segments(normalized_path);
146
147 // Path-side oversize fail-safe (was recomputed per pattern before).
148 let path_oversize = path_segments.len() > MAX_GLOB_SEGMENTS
149 || path_segments.iter().any(|s| s.len() > MAX_GLOB_SEGMENT_LEN);
150 if path_oversize {
151 tracing::warn!(
152 "skipping oversized allowlist path match ({} segments). Fix: shorten the path",
153 path_segments.len()
154 );
155 return false;
156 }
157
158 let test = |glob: &CompiledGlob| -> bool {
159 !glob.oversize && glob_match_segments(&glob.segments, &path_segments)
160 };
161
162 // Empty path: only a zero-segment pattern (or a `**`-led one, which is
163 // in wild_first) can match. Mirror `glob_match_segments(&[], &[])`.
164 if path_segments.is_empty() {
165 return self.empty_pattern.iter().any(test) || self.wild_first.iter().any(test);
166 }
167
168 let first = path_segments[0];
169 if let Some(bucket) = self.literal_first.get(first) {
170 if bucket.iter().any(test) {
171 return true;
172 }
173 }
174 self.wild_first.iter().any(test)
175 }
176}
177
178impl Allowlist {
179 /// Create an empty allowlist with no suppressed hashes, detectors, or paths.
180 ///
181 /// # Examples
182 ///
183 /// ```rust
184 /// use keyhog_core::allowlist::Allowlist;
185 ///
186 /// let allowlist = Allowlist::empty();
187 /// assert!(allowlist.ignored_paths.is_empty());
188 /// ```
189 pub fn empty() -> Self {
190 Self {
191 credential_hashes: HashSet::new(),
192 ignored_detectors: HashSet::new(),
193 ignored_paths: Vec::new(),
194 path_index: PathGlobIndex::default(),
195 }
196 }
197
198 /// Load from a .keyhogignore file.
199 ///
200 /// # Examples
201 ///
202 /// ```rust,no_run
203 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
204 /// use keyhog_core::allowlist::Allowlist;
205 /// use std::path::Path;
206 ///
207 /// let _allowlist = Allowlist::load(Path::new(".keyhogignore"))?;
208 /// # Ok(()) }
209 /// ```
210 pub fn load(path: &Path) -> Result<Self, std::io::Error> {
211 let contents = std::fs::read_to_string(path)?;
212 Ok(Self::parse(&contents))
213 }
214
215 /// Parse allowlist from string content.
216 ///
217 /// # Examples
218 ///
219 /// ```rust
220 /// use keyhog_core::allowlist::Allowlist;
221 ///
222 /// let allowlist = Allowlist::parse("path:**/.env\ndetector:demo-token\n");
223 /// assert!(allowlist.is_path_ignored("app/.env"));
224 /// ```
225 pub fn parse(content: &str) -> Self {
226 let mut al = Self::empty();
227 let today = today_yyyy_mm_dd();
228 for (line_number, raw_line) in content.lines().enumerate() {
229 let raw_line = raw_line.trim();
230 if raw_line.is_empty() || raw_line.starts_with('#') {
231 continue;
232 }
233 // Optional inline metadata: `entry; reason="..."; expires=YYYY-MM-DD; approved_by="..."`
234 // Each `;`-separated token after the first is a key=value pair.
235 let mut parts = raw_line.splitn(2, ';');
236 let entry = parts.next().unwrap_or("").trim();
237 let metadata = parts.next().unwrap_or("");
238 let parsed_meta = parse_inline_metadata(metadata);
239
240 // Drop entries whose `expires` is past - keeps `.keyhogignore`
241 // self-cleaning for short-lived approvals (Tier-B #18 governance).
242 if let Some(exp) = parsed_meta.expires.as_deref() {
243 if exp < today.as_str() {
244 tracing::warn!(
245 "allowlist entry expired on {} (today is {}): '{}'",
246 exp,
247 today,
248 entry
249 );
250 continue;
251 }
252 }
253
254 if let Some(hash) = entry.strip_prefix("hash:") {
255 let trimmed = hash.trim();
256 if let Some(valid_hash) = parse_sha256_hex(trimmed) {
257 al.credential_hashes.insert(valid_hash);
258 log_metadata_audit("hash", trimmed, &parsed_meta);
259 } else {
260 tracing::warn!(
261 "invalid hash allowlist entry at line {}: '{}'",
262 line_number + 1,
263 trimmed
264 );
265 }
266 } else if let Some(detector) = entry.strip_prefix("detector:") {
267 let detector = detector.trim();
268 if detector.is_empty() {
269 tracing::warn!(
270 "invalid detector allowlist entry at line {}: detector id is empty",
271 line_number + 1
272 );
273 } else {
274 al.ignored_detectors.insert(detector.to_string());
275 log_metadata_audit("detector", detector, &parsed_meta);
276 }
277 } else if let Some(path) = entry.strip_prefix("path:") {
278 let path = path.trim();
279 if path.is_empty() {
280 tracing::warn!(
281 "invalid path allowlist entry at line {}: glob is empty",
282 line_number + 1
283 );
284 } else {
285 al.ignored_paths.push(path.to_string());
286 log_metadata_audit("path", path, &parsed_meta);
287 }
288 } else if let Some(bytes) = parse_sha256_hex(entry) {
289 // Bare 64-char hex hash. Lets the obvious
290 // `keyhog scan ... --format jsonl | jq -r '.credential_hash'
291 // >> .keyhogignore` workflow Just Work without users
292 // learning the `hash:` prefix.
293 al.credential_hashes.insert(bytes);
294 log_metadata_audit("hash", entry, &parsed_meta);
295 } else {
296 // Bare path glob (gitignore-style). Anything that didn't
297 // match an explicit `hash:` / `detector:` / `path:` prefix
298 // and isn't a bare hash is interpreted as a path glob,
299 // matching `.gitignore` UX (`*.log`, `node_modules/`,
300 // `vendor/**/*.json`). kimi-1 dogfood #129 - the prior
301 // behavior emitted a warning and silently dropped the
302 // line, which is the worst of both worlds: every
303 // `.gitignore` users copied over was dead.
304 al.ignored_paths.push(entry.to_string());
305 log_metadata_audit("path", entry, &parsed_meta);
306 }
307 }
308 // Precompile the path globs ONCE: segments + oversize verdict + the
309 // first-segment bucket index, so per-finding suppression neither
310 // re-normalizes each pattern nor sweeps every rule.
311 al.path_index = PathGlobIndex::build(&al.ignored_paths);
312 al
313 }
314
315 /// Check whether detector or path rules suppress a verified finding.
316 ///
317 /// Hash-based suppression is evaluated earlier on [`crate::RawMatch`] values
318 /// because [`VerifiedFinding`] stores only redacted credentials.
319 ///
320 /// # Examples
321 ///
322 /// ```rust
323 /// use keyhog_core::allowlist::Allowlist;
324 /// use keyhog_core::{MatchLocation, Severity, VerificationResult, VerifiedFinding};
325 /// use std::collections::HashMap;
326 ///
327 /// let allowlist = Allowlist::parse("detector:demo-token\n");
328 /// let finding = VerifiedFinding {
329 /// detector_id: "demo-token".into(),
330 /// detector_name: "Demo Token".into(),
331 /// service: "demo".into(),
332 /// severity: Severity::High,
333 /// credential_redacted: "demo_...1234".into(),
334 /// location: MatchLocation {
335 /// source: "fs".into(),
336 /// file_path: Some("src/main.rs".into()),
337 /// line: Some(1),
338 /// offset: 0,
339 /// commit: None,
340 /// author: None,
341 /// date: None,
342 /// },
343 /// verification: VerificationResult::Unverifiable,
344 /// metadata: std::collections::HashMap::new(),
345 /// additional_locations: Vec::new(),
346 /// confidence: None,
347 /// credential_hash: [0u8; 32],
348 /// };
349 /// assert!(allowlist.is_allowed(&finding));
350 /// ```
351 pub fn is_allowed(&self, finding: &VerifiedFinding) -> bool {
352 let detector_ignored = self.ignored_detectors.contains(&*finding.detector_id);
353
354 let path_ignored = finding.location.file_path.as_ref().is_some_and(|path| {
355 let normalized_path = normalize_path(path);
356 self.path_matches(&normalized_path)
357 });
358
359 let hash_ignored = self.matches_ignored_hash(&finding.credential_hash);
360
361 detector_ignored || path_ignored || hash_ignored
362 }
363
364 /// Check if a raw credential hash is allowlisted.
365 ///
366 /// # Examples
367 ///
368 /// ```rust
369 /// use keyhog_core::allowlist::Allowlist;
370 ///
371 /// let allowlist = Allowlist::parse("");
372 /// assert!(!allowlist.is_hash_allowed("demo_ABC12345"));
373 /// ```
374 pub fn is_hash_allowed(&self, credential: &str) -> bool {
375 parse_sha256_hex(credential).is_some_and(|bytes| self.matches_ignored_hash(&bytes))
376 }
377
378 /// Check if a hex-encoded SHA-256 hash is allowlisted.
379 pub fn is_raw_hash_ignored(&self, hash_hex: &str) -> bool {
380 parse_sha256_hex(hash_hex).is_some_and(|bytes| self.matches_ignored_hash(&bytes))
381 }
382
383 /// Check if a finding's raw 32-byte SHA-256 hash is allowlisted - the
384 /// scan-path entry that takes the `[u8; 32]` form directly (no hex
385 /// round-trip). Siblings `is_hash_allowed` / `is_raw_hash_ignored` accept
386 /// the hex-string form for `.keyhogignore` self-checks and CLI input.
387 pub fn is_hash_ignored(&self, hash: &[u8; 32]) -> bool {
388 self.matches_ignored_hash(hash)
389 }
390
391 /// Check whether a raw path matches an ignored-path glob.
392 ///
393 /// # Examples
394 ///
395 /// ```rust
396 /// use keyhog_core::allowlist::Allowlist;
397 ///
398 /// let allowlist = Allowlist::parse("path:**/*.md\n");
399 /// assert!(allowlist.is_path_ignored("docs/README.md"));
400 /// ```
401 pub fn is_path_ignored(&self, path: &str) -> bool {
402 let normalized = normalize_path(path);
403 self.path_matches(&normalized)
404 }
405
406 /// Run the precompiled path-glob index against an already-normalized path,
407 /// rebuilding the index first iff the public `ignored_paths` field was
408 /// mutated directly since construction (detected by a length mismatch).
409 /// The construction paths keep the index in sync, so the scanner hot path
410 /// always takes the fast branch; only a hand-mutated allowlist pays the
411 /// one-off rebuild, and it pays it for correctness, not silently skips it.
412 fn path_matches(&self, normalized_path: &str) -> bool {
413 if self.path_index.source_len == self.ignored_paths.len() {
414 self.path_index.matches(normalized_path)
415 } else {
416 PathGlobIndex::build(&self.ignored_paths).matches(normalized_path)
417 }
418 }
419
    /// Return whether `hash` is a member of `credential_hashes`.
    fn matches_ignored_hash(&self, hash: &[u8; 32]) -> bool {
        // Direct byte-set membership. `hash:` entries are parsed from 64-hex
        // into this same `[u8; 32]` form at load time (`parse_sha256_hex`), so
        // no hex round-trip happens here. (Earlier versions also hashed raw
        // input as a fallback, which silently encouraged plaintext in
        // `.keyhogignore` - the file is often committed by accident; that path
        // is intentionally gone, see audit release-2026-04-26.)
        self.credential_hashes.contains(hash)
    }
430}
431
/// Split a path into segments on `/` and `\`.
///
/// The empty string yields no segments. Otherwise every separator produces a
/// boundary, so adjacent or leading/trailing separators yield empty segments.
fn split_segments(path: &str) -> Vec<&str> {
    match path {
        "" => Vec::new(),
        non_empty => non_empty.split(['/', '\\']).collect(),
    }
}
439
440/// Segment-automaton glob match. Pattern segments are accepted by reference
441/// (`AsRef<str>`) so the precompiled `Vec<String>` index entries match WITHOUT
442/// re-borrowing into a `Vec<&str>` per finding; the path segments stay
443/// `&[&str]` (borrowed from the normalized path string). The matching logic is
444/// byte-for-byte the original automaton - only the pattern element type was
445/// generalized, so suppression decisions are identical.
446fn glob_match_segments<S: AsRef<str>>(pattern: &[S], path: &[&str]) -> bool {
447 let mut states = vec![false; path.len() + 1];
448 states[0] = true;
449
450 for segment in pattern {
451 let segment = segment.as_ref();
452 let mut next = vec![false; path.len() + 1];
453 if segment == "**" {
454 let mut reachable = false;
455 for idx in 0..=path.len() {
456 reachable |= states[idx];
457 next[idx] = reachable;
458 }
459 } else {
460 for idx in 0..path.len() {
461 if states[idx] && segment_match(segment, path[idx]) {
462 next[idx + 1] = true;
463 }
464 }
465 }
466 states = next;
467 }
468
469 states[path.len()]
470}
471
472fn segment_match(pattern: &str, text: &str) -> bool {
473 if pattern.is_ascii() && text.is_ascii() {
474 return segment_match_ascii(pattern.as_bytes(), text.as_bytes());
475 }
476
477 segment_match_chars(pattern, text)
478}
479
/// Byte-wise wildcard match: `*` in `pattern` matches any run of bytes
/// (including none); every other byte must match literally.
///
/// Single-pass matcher with backtracking to the most recent `*`: on a
/// mismatch the last star absorbs one more byte of `text` and matching resumes
/// right after that star.
fn segment_match_ascii(pattern: &[u8], text: &[u8]) -> bool {
    let mut pat_pos = 0usize;
    let mut txt_pos = 0usize;
    // (pattern index just past the latest `*`, text index that `*` started at /
    // has absorbed up to).
    let mut backtrack: Option<(usize, usize)> = None;

    while txt_pos < text.len() {
        match pattern.get(pat_pos) {
            Some(b'*') => {
                backtrack = Some((pat_pos + 1, txt_pos));
                pat_pos += 1;
            }
            Some(&byte) if byte == text[txt_pos] => {
                pat_pos += 1;
                txt_pos += 1;
            }
            _ => match backtrack.as_mut() {
                Some((after_star, absorbed)) => {
                    *absorbed += 1;
                    txt_pos = *absorbed;
                    pat_pos = *after_star;
                }
                None => return false,
            },
        }
    }

    // Text is exhausted: only trailing stars may remain in the pattern.
    pattern[pat_pos..].iter().all(|&byte| byte == b'*')
}
517
/// `char`-wise wildcard match: `*` in `pattern` matches any run of characters
/// (including none); every other character must match literally.
///
/// Both inputs are decoded into `char` vectors first so that positions refer
/// to whole characters rather than UTF-8 bytes. On a mismatch the most recent
/// `*` absorbs one more character and matching resumes right after it.
fn segment_match_chars(pattern: &str, text: &str) -> bool {
    let glob: Vec<char> = pattern.chars().collect();
    let subject: Vec<char> = text.chars().collect();

    let mut glob_pos = 0usize;
    let mut subject_pos = 0usize;
    // (glob index just past the latest `*`, subject index that `*` has
    // absorbed up to).
    let mut backtrack: Option<(usize, usize)> = None;

    while subject_pos < subject.len() {
        match glob.get(glob_pos) {
            Some('*') => {
                backtrack = Some((glob_pos + 1, subject_pos));
                glob_pos += 1;
            }
            Some(&ch) if ch == subject[subject_pos] => {
                glob_pos += 1;
                subject_pos += 1;
            }
            _ => match backtrack.as_mut() {
                Some((after_star, absorbed)) => {
                    *absorbed += 1;
                    subject_pos = *absorbed;
                    glob_pos = *after_star;
                }
                None => return false,
            },
        }
    }

    // Subject is exhausted: only trailing stars may remain in the glob.
    glob[glob_pos..].iter().all(|&ch| ch == '*')
}
558
/// Normalize a path (or glob pattern) into a `/`-joined, relative form.
///
/// Backslashes are treated as separators. `.` components are dropped, a `..`
/// cancels the preceding normal component (or is kept when there is nothing
/// to cancel, or the preceding component is itself `..`), a root component
/// discards everything collected so far, and a prefix component is kept as
/// its lossy string form.
fn normalize_path(path: &str) -> String {
    let unified = path.replace('\\', "/");
    let mut stack: Vec<String> = Vec::new();

    for component in Path::new(&unified).components() {
        match component {
            Component::CurDir => {}
            Component::RootDir => stack.clear(),
            Component::Prefix(prefix) => {
                stack.push(prefix.as_os_str().to_string_lossy().into_owned());
            }
            Component::Normal(name) => stack.push(name.to_string_lossy().into_owned()),
            Component::ParentDir => {
                // Only a real directory name can be cancelled by `..`.
                let can_pop = stack
                    .last()
                    .is_some_and(|previous| previous.as_str() != "..");
                if can_pop {
                    stack.pop();
                } else {
                    stack.push("..".to_owned());
                }
            }
        }
    }

    stack.join("/")
}
579
580fn parse_sha256_hex(input: &str) -> Option<[u8; 32]> {
581 let input = input.trim();
582 // A SHA-256 hex digest is 64 ASCII bytes. Operate on the byte slice, not
583 // `&str[..]` slicing: a 64-*byte* input containing a multibyte UTF-8 char
584 // at an odd offset (e.g. a stray `é` pasted into `.keyhogignore`) would
585 // make `&input[idx*2..idx*2+2]` panic on a non-char boundary. Decode each
586 // nibble directly so any non-hex byte just fails the parse.
587 let bytes = input.as_bytes();
588 if bytes.len() != 64 {
589 return None;
590 }
591 let mut digest = [0u8; 32];
592 for idx in 0..32 {
593 let hi = hex_nibble(bytes[idx * 2])?;
594 let lo = hex_nibble(bytes[idx * 2 + 1])?;
595 digest[idx] = (hi << 4) | lo;
596 }
597 Some(digest)
598}
599
/// Inline metadata parsed from a `.keyhogignore` line trailer. Used to
/// implement enterprise governance fields (`reason`, `expires`,
/// `approved_by`) per audits/legendary-2026-04-26 Tier-B #18.
///
/// Every field is optional; `Default` yields a value with all of them unset.
#[derive(Default, Debug)]
struct InlineMetadata {
    /// Value of the `reason` key, if present.
    /// NOTE(review): presumably a free-text justification - confirm against the parser.
    reason: Option<String>,
    /// Value of the `expires` key, if present. `Allowlist::parse` compares it
    /// as a string against today's date and skips the entry when it sorts
    /// earlier.
    expires: Option<String>,
    /// Value of the `approved_by` key, if present.
    /// NOTE(review): presumably identifies the approver - confirm against the parser.
    approved_by: Option<String>,
}