Skip to main content

keyhog_core/
allowlist.rs

1//! Allowlist support: `.keyhogignore` file parsing for suppressing known false
2//! positives by path glob, detector ID, or credential hash.
3
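//! Allowlist: known false positives and ignored patterns.
//!
//! Users can create a `.keyhogignore` file to suppress known FPs.
//! Format (one entry per line):
//!   - `hash:<sha256>` - ignore a specific credential by hash
//!   - `detector:<id>` - ignore all findings from a detector
//!   - `path:<glob>` - ignore files matching a glob pattern
//!   - a bare 64-character hex string - treated like `hash:<sha256>`
//!   - any other bare text - treated like `path:<glob>`
//!   - `# comment` - comments
//!   - blank lines are skipped
//!
//! An entry may be followed by `;`-separated metadata, e.g.
//! `path:docs/**; reason="..."; expires=YYYY-MM-DD; approved_by="..."`.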
13use std::collections::HashMap;
14use std::collections::HashSet;
15use std::path::Component;
16use std::path::Path;
17
18use crate::merkle_spec_hash::hex_nibble;
19use crate::VerifiedFinding;
20
21#[path = "allowlist_metadata.rs"]
22mod allowlist_metadata;
23use allowlist_metadata::*;
24
/// User-defined suppressions loaded from `.keyhogignore`: credential hashes, detector IDs, and path globs.
///
/// # Examples
///
/// ```rust
/// use keyhog_core::allowlist::Allowlist;
///
/// let allowlist = Allowlist::parse("detector:demo-token\npath:**/*.md\n");
/// assert!(allowlist.ignored_detectors.contains("demo-token"));
/// ```
#[derive(Debug, Clone, serde::Serialize)]
pub struct Allowlist {
    /// SHA-256 digests (raw 32-byte form) of credentials to ignore.
    pub credential_hashes: HashSet<[u8; 32]>,
    /// Detector IDs whose findings are ignored entirely.
    pub ignored_detectors: HashSet<String>,
    /// Glob patterns for paths to ignore, stored raw (as authored). This is
    /// the public contract and the serialized form; matching goes through the
    /// private precompiled `path_index` built from these patterns.
    pub ignored_paths: Vec<String>,
    /// Precompiled, first-segment-bucketed form of `ignored_paths`, so
    /// per-finding path checks neither re-normalize and re-split each pattern
    /// nor sweep every rule. `Allowlist::parse` builds it from the collected
    /// patterns; `Allowlist::empty` starts from the default (empty) index.
    /// Skipped by `serde` because it is derived entirely from
    /// `ignored_paths`, so only the public fields are serialized.
    #[serde(skip)]
    path_index: PathGlobIndex,
}
53
/// Upper bound on the number of segments in a glob pattern or a path. A
/// pattern or path with more segments is treated as oversize and never matches.
const MAX_GLOB_SEGMENTS: usize = 256;
/// Upper bound on the byte length of a single pattern or path segment. A
/// pattern or path with a longer segment is treated as oversize and never
/// matches.
const MAX_GLOB_SEGMENT_LEN: usize = 1024;
56
/// One precompiled ignored-path glob: its normalized segments, computed once
/// when the index is built, plus the oversize verdict for the pattern. Which
/// bucket a glob lands in is decided by `PathGlobIndex::build` from the first
/// segment; that decision is not stored on the glob itself.
#[derive(Debug, Clone)]
struct CompiledGlob {
    /// Normalized pattern segments (the `normalize_path` + `split_segments`
    /// result, owned). Empty when the pattern normalized to nothing.
    segments: Vec<String>,
    /// True when the pattern has more than `MAX_GLOB_SEGMENTS` segments or a
    /// segment longer than `MAX_GLOB_SEGMENT_LEN` bytes. An oversize pattern
    /// is pre-marked here so it never matches anything; oversize *paths* are
    /// rejected separately at match time.
    oversize: bool,
}
73
/// First-segment bucketed index over the compiled globs. A path can match a
/// glob only if the glob's first segment is `**` (matches any prefix) or
/// matches the path's first segment. Literal first segments key a hash bucket;
/// first segments containing `*` (including `**`), which can match many
/// different first segments, go into `wild_first` and are always tested. A
/// lookup therefore tests only `wild_first` plus the one matching literal
/// bucket instead of every rule, while each candidate is still decided by
/// `glob_match_segments`.
#[derive(Debug, Clone, Default)]
struct PathGlobIndex {
    /// Globs whose first segment is a pure literal, keyed by that literal. A
    /// path is tested only against the bucket for its own first segment.
    literal_first: HashMap<String, Vec<CompiledGlob>>,
    /// Globs whose first segment is `**` or contains a `*` wildcard (it can
    /// match more than one distinct first segment, so it cannot be bucketed by
    /// a literal). Always tested.
    wild_first: Vec<CompiledGlob>,
    /// Globs that normalized to zero segments (e.g. a pattern that was only
    /// `.` / `..` noise). `glob_match_segments(&[], path)` is true only for the
    /// empty path, so these are kept apart and only consulted for that case.
    empty_pattern: Vec<CompiledGlob>,
    /// Number of source patterns this index was compiled from. `ignored_paths`
    /// is a public, mutable field, so callers may push/extend/clear it
    /// directly after construction. `Allowlist::path_matches` compares this
    /// against the live `ignored_paths.len()` and rebuilds the index on a
    /// mismatch. NOTE(review): only the length is compared, so an in-place
    /// edit that keeps the number of patterns unchanged is not detected and
    /// leaves the index stale.
    source_len: usize,
}
104
105impl PathGlobIndex {
106    /// Build the index from raw ignored-path patterns. Runs `normalize_path` +
107    /// `split_segments` + the oversize scan ONCE per pattern (the work
108    /// `glob_match_normalized` previously repeated on every finding).
109    fn build(patterns: &[String]) -> Self {
110        let mut index = PathGlobIndex::default();
111        index.source_len = patterns.len();
112        for pattern in patterns {
113            let normalized_pattern = normalize_path(pattern);
114            let segments: Vec<String> = split_segments(&normalized_pattern)
115                .into_iter()
116                .map(str::to_string)
117                .collect();
118            // Mirror the pattern half of the original oversize fail-safe: an
119            // oversize pattern can never match (it returned false before).
120            let oversize = segments.len() > MAX_GLOB_SEGMENTS
121                || segments.iter().any(|s| s.len() > MAX_GLOB_SEGMENT_LEN);
122            let glob = CompiledGlob { segments, oversize };
123
124            match glob.segments.first() {
125                None => index.empty_pattern.push(glob),
126                Some(first) if first == "**" || first.contains('*') => {
127                    index.wild_first.push(glob);
128                }
129                Some(first) => {
130                    index
131                        .literal_first
132                        .entry(first.clone())
133                        .or_default()
134                        .push(glob);
135                }
136            }
137        }
138        index
139    }
140
141    /// True when any compiled glob matches `normalized_path`. Tests only the
142    /// candidate set for `normalized_path`'s first segment plus the always-on
143    /// wildcard-anchored globs - never the full rule list.
144    fn matches(&self, normalized_path: &str) -> bool {
145        let path_segments = split_segments(normalized_path);
146
147        // Path-side oversize fail-safe (was recomputed per pattern before).
148        let path_oversize = path_segments.len() > MAX_GLOB_SEGMENTS
149            || path_segments.iter().any(|s| s.len() > MAX_GLOB_SEGMENT_LEN);
150        if path_oversize {
151            tracing::warn!(
152                "skipping oversized allowlist path match ({} segments). Fix: shorten the path",
153                path_segments.len()
154            );
155            return false;
156        }
157
158        let test = |glob: &CompiledGlob| -> bool {
159            !glob.oversize && glob_match_segments(&glob.segments, &path_segments)
160        };
161
162        // Empty path: only a zero-segment pattern (or a `**`-led one, which is
163        // in wild_first) can match. Mirror `glob_match_segments(&[], &[])`.
164        if path_segments.is_empty() {
165            return self.empty_pattern.iter().any(test) || self.wild_first.iter().any(test);
166        }
167
168        let first = path_segments[0];
169        if let Some(bucket) = self.literal_first.get(first) {
170            if bucket.iter().any(test) {
171                return true;
172            }
173        }
174        self.wild_first.iter().any(test)
175    }
176}
177
178impl Allowlist {
179    /// Create an empty allowlist with no suppressed hashes, detectors, or paths.
180    ///
181    /// # Examples
182    ///
183    /// ```rust
184    /// use keyhog_core::allowlist::Allowlist;
185    ///
186    /// let allowlist = Allowlist::empty();
187    /// assert!(allowlist.ignored_paths.is_empty());
188    /// ```
189    pub fn empty() -> Self {
190        Self {
191            credential_hashes: HashSet::new(),
192            ignored_detectors: HashSet::new(),
193            ignored_paths: Vec::new(),
194            path_index: PathGlobIndex::default(),
195        }
196    }
197
198    /// Load from a .keyhogignore file.
199    ///
200    /// # Examples
201    ///
202    /// ```rust,no_run
203    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
204    /// use keyhog_core::allowlist::Allowlist;
205    /// use std::path::Path;
206    ///
207    /// let _allowlist = Allowlist::load(Path::new(".keyhogignore"))?;
208    /// # Ok(()) }
209    /// ```
210    pub fn load(path: &Path) -> Result<Self, std::io::Error> {
211        let contents = std::fs::read_to_string(path)?;
212        Ok(Self::parse(&contents))
213    }
214
215    /// Parse allowlist from string content.
216    ///
217    /// # Examples
218    ///
219    /// ```rust
220    /// use keyhog_core::allowlist::Allowlist;
221    ///
222    /// let allowlist = Allowlist::parse("path:**/.env\ndetector:demo-token\n");
223    /// assert!(allowlist.is_path_ignored("app/.env"));
224    /// ```
225    pub fn parse(content: &str) -> Self {
226        let mut al = Self::empty();
227        let today = today_yyyy_mm_dd();
228        for (line_number, raw_line) in content.lines().enumerate() {
229            let raw_line = raw_line.trim();
230            if raw_line.is_empty() || raw_line.starts_with('#') {
231                continue;
232            }
233            // Optional inline metadata: `entry; reason="..."; expires=YYYY-MM-DD; approved_by="..."`
234            // Each `;`-separated token after the first is a key=value pair.
235            let mut parts = raw_line.splitn(2, ';');
236            let entry = parts.next().unwrap_or("").trim();
237            let metadata = parts.next().unwrap_or("");
238            let parsed_meta = parse_inline_metadata(metadata);
239
240            // Drop entries whose `expires` is past - keeps `.keyhogignore`
241            // self-cleaning for short-lived approvals (Tier-B #18 governance).
242            if let Some(exp) = parsed_meta.expires.as_deref() {
243                if exp < today.as_str() {
244                    tracing::warn!(
245                        "allowlist entry expired on {} (today is {}): '{}'",
246                        exp,
247                        today,
248                        entry
249                    );
250                    continue;
251                }
252            }
253
254            if let Some(hash) = entry.strip_prefix("hash:") {
255                let trimmed = hash.trim();
256                if let Some(valid_hash) = parse_sha256_hex(trimmed) {
257                    al.credential_hashes.insert(valid_hash);
258                    log_metadata_audit("hash", trimmed, &parsed_meta);
259                } else {
260                    tracing::warn!(
261                        "invalid hash allowlist entry at line {}: '{}'",
262                        line_number + 1,
263                        trimmed
264                    );
265                }
266            } else if let Some(detector) = entry.strip_prefix("detector:") {
267                let detector = detector.trim();
268                if detector.is_empty() {
269                    tracing::warn!(
270                        "invalid detector allowlist entry at line {}: detector id is empty",
271                        line_number + 1
272                    );
273                } else {
274                    al.ignored_detectors.insert(detector.to_string());
275                    log_metadata_audit("detector", detector, &parsed_meta);
276                }
277            } else if let Some(path) = entry.strip_prefix("path:") {
278                let path = path.trim();
279                if path.is_empty() {
280                    tracing::warn!(
281                        "invalid path allowlist entry at line {}: glob is empty",
282                        line_number + 1
283                    );
284                } else {
285                    al.ignored_paths.push(path.to_string());
286                    log_metadata_audit("path", path, &parsed_meta);
287                }
288            } else if let Some(bytes) = parse_sha256_hex(entry) {
289                // Bare 64-char hex hash. Lets the obvious
290                // `keyhog scan ... --format jsonl | jq -r '.credential_hash'
291                // >> .keyhogignore` workflow Just Work without users
292                // learning the `hash:` prefix.
293                al.credential_hashes.insert(bytes);
294                log_metadata_audit("hash", entry, &parsed_meta);
295            } else {
296                // Bare path glob (gitignore-style). Anything that didn't
297                // match an explicit `hash:` / `detector:` / `path:` prefix
298                // and isn't a bare hash is interpreted as a path glob,
299                // matching `.gitignore` UX (`*.log`, `node_modules/`,
300                // `vendor/**/*.json`). kimi-1 dogfood #129 - the prior
301                // behavior emitted a warning and silently dropped the
302                // line, which is the worst of both worlds: every
303                // `.gitignore` users copied over was dead.
304                al.ignored_paths.push(entry.to_string());
305                log_metadata_audit("path", entry, &parsed_meta);
306            }
307        }
308        // Precompile the path globs ONCE: segments + oversize verdict + the
309        // first-segment bucket index, so per-finding suppression neither
310        // re-normalizes each pattern nor sweeps every rule.
311        al.path_index = PathGlobIndex::build(&al.ignored_paths);
312        al
313    }
314
315    /// Check whether detector or path rules suppress a verified finding.
316    ///
317    /// Hash-based suppression is evaluated earlier on [`crate::RawMatch`] values
318    /// because [`VerifiedFinding`] stores only redacted credentials.
319    ///
320    /// # Examples
321    ///
322    /// ```rust
323    /// use keyhog_core::allowlist::Allowlist;
324    /// use keyhog_core::{MatchLocation, Severity, VerificationResult, VerifiedFinding};
325    /// use std::collections::HashMap;
326    ///
327    /// let allowlist = Allowlist::parse("detector:demo-token\n");
328    /// let finding = VerifiedFinding {
329    ///     detector_id: "demo-token".into(),
330    ///     detector_name: "Demo Token".into(),
331    ///     service: "demo".into(),
332    ///     severity: Severity::High,
333    ///     credential_redacted: "demo_...1234".into(),
334    ///     location: MatchLocation {
335    ///         source: "fs".into(),
336    ///         file_path: Some("src/main.rs".into()),
337    ///         line: Some(1),
338    ///         offset: 0,
339    ///         commit: None,
340    ///         author: None,
341    ///         date: None,
342    ///     },
343    ///     verification: VerificationResult::Unverifiable,
344    ///     metadata: std::collections::HashMap::new(),
345    ///     additional_locations: Vec::new(),
346    ///     confidence: None,
347    ///     credential_hash: [0u8; 32],
348    /// };
349    /// assert!(allowlist.is_allowed(&finding));
350    /// ```
351    pub fn is_allowed(&self, finding: &VerifiedFinding) -> bool {
352        let detector_ignored = self.ignored_detectors.contains(&*finding.detector_id);
353
354        let path_ignored = finding.location.file_path.as_ref().is_some_and(|path| {
355            let normalized_path = normalize_path(path);
356            self.path_matches(&normalized_path)
357        });
358
359        let hash_ignored = self.matches_ignored_hash(&finding.credential_hash);
360
361        detector_ignored || path_ignored || hash_ignored
362    }
363
364    /// Check if a raw credential hash is allowlisted.
365    ///
366    /// # Examples
367    ///
368    /// ```rust
369    /// use keyhog_core::allowlist::Allowlist;
370    ///
371    /// let allowlist = Allowlist::parse("");
372    /// assert!(!allowlist.is_hash_allowed("demo_ABC12345"));
373    /// ```
374    pub fn is_hash_allowed(&self, credential: &str) -> bool {
375        parse_sha256_hex(credential).is_some_and(|bytes| self.matches_ignored_hash(&bytes))
376    }
377
378    /// Check if a hex-encoded SHA-256 hash is allowlisted.
379    pub fn is_raw_hash_ignored(&self, hash_hex: &str) -> bool {
380        parse_sha256_hex(hash_hex).is_some_and(|bytes| self.matches_ignored_hash(&bytes))
381    }
382
383    /// Check if a finding's raw 32-byte SHA-256 hash is allowlisted - the
384    /// scan-path entry that takes the `[u8; 32]` form directly (no hex
385    /// round-trip). Siblings `is_hash_allowed` / `is_raw_hash_ignored` accept
386    /// the hex-string form for `.keyhogignore` self-checks and CLI input.
387    pub fn is_hash_ignored(&self, hash: &[u8; 32]) -> bool {
388        self.matches_ignored_hash(hash)
389    }
390
391    /// Check whether a raw path matches an ignored-path glob.
392    ///
393    /// # Examples
394    ///
395    /// ```rust
396    /// use keyhog_core::allowlist::Allowlist;
397    ///
398    /// let allowlist = Allowlist::parse("path:**/*.md\n");
399    /// assert!(allowlist.is_path_ignored("docs/README.md"));
400    /// ```
401    pub fn is_path_ignored(&self, path: &str) -> bool {
402        let normalized = normalize_path(path);
403        self.path_matches(&normalized)
404    }
405
406    /// Run the precompiled path-glob index against an already-normalized path,
407    /// rebuilding the index first iff the public `ignored_paths` field was
408    /// mutated directly since construction (detected by a length mismatch).
409    /// The construction paths keep the index in sync, so the scanner hot path
410    /// always takes the fast branch; only a hand-mutated allowlist pays the
411    /// one-off rebuild, and it pays it for correctness, not silently skips it.
412    fn path_matches(&self, normalized_path: &str) -> bool {
413        if self.path_index.source_len == self.ignored_paths.len() {
414            self.path_index.matches(normalized_path)
415        } else {
416            PathGlobIndex::build(&self.ignored_paths).matches(normalized_path)
417        }
418    }
419
    /// Return true when `hash` is one of the allowlisted credential digests.
    /// Shared by every public hash check on this type.
    fn matches_ignored_hash(&self, hash: &[u8; 32]) -> bool {
        // Direct byte-set membership. Suppressing `hash:` entries are parsed
        // from 64-hex into this same `[u8; 32]` form at load time
        // (`parse_sha256_hex`), and findings carry the raw bytes, so no hex
        // round-trip happens here. (Earlier versions also hashed raw input as a
        // fallback, which silently encouraged plaintext in `.keyhogignore` - the
        // file is often committed by accident; that path is intentionally gone,
        // see audit release-2026-04-26.)
        self.credential_hashes.contains(hash)
    }
430}
431
/// Split a path into its segments on `/` and `\`.
///
/// The empty string yields no segments at all (rather than one empty
/// segment). Consecutive or leading/trailing separators produce empty
/// segments, exactly as `str::split` does.
fn split_segments(path: &str) -> Vec<&str> {
    if path.is_empty() {
        return Vec::new();
    }
    path.split(|c: char| matches!(c, '/' | '\\')).collect()
}
439
440/// Segment-automaton glob match. Pattern segments are accepted by reference
441/// (`AsRef<str>`) so the precompiled `Vec<String>` index entries match WITHOUT
442/// re-borrowing into a `Vec<&str>` per finding; the path segments stay
443/// `&[&str]` (borrowed from the normalized path string). The matching logic is
444/// byte-for-byte the original automaton - only the pattern element type was
445/// generalized, so suppression decisions are identical.
446fn glob_match_segments<S: AsRef<str>>(pattern: &[S], path: &[&str]) -> bool {
447    let mut states = vec![false; path.len() + 1];
448    states[0] = true;
449
450    for segment in pattern {
451        let segment = segment.as_ref();
452        let mut next = vec![false; path.len() + 1];
453        if segment == "**" {
454            let mut reachable = false;
455            for idx in 0..=path.len() {
456                reachable |= states[idx];
457                next[idx] = reachable;
458            }
459        } else {
460            for idx in 0..path.len() {
461                if states[idx] && segment_match(segment, path[idx]) {
462                    next[idx + 1] = true;
463                }
464            }
465        }
466        states = next;
467    }
468
469    states[path.len()]
470}
471
472fn segment_match(pattern: &str, text: &str) -> bool {
473    if pattern.is_ascii() && text.is_ascii() {
474        return segment_match_ascii(pattern.as_bytes(), text.as_bytes());
475    }
476
477    segment_match_chars(pattern, text)
478}
479
/// Byte-wise wildcard match: `*` in `pattern` matches any run of bytes
/// (possibly empty); every other byte must match literally.
///
/// Greedy scan with single-star backtracking: the most recent `*` is
/// remembered together with the text position it is anchored at, and on a
/// mismatch that star is made to absorb one more byte before matching resumes
/// just past it.
fn segment_match_ascii(pattern: &[u8], text: &[u8]) -> bool {
    let mut pat_pos = 0usize;
    let mut text_pos = 0usize;
    // (index of the last `*` seen in the pattern, text index it resumes from)
    let mut backtrack: Option<(usize, usize)> = None;

    while text_pos < text.len() {
        match pattern.get(pat_pos) {
            Some(b'*') => {
                backtrack = Some((pat_pos, text_pos));
                pat_pos += 1;
            }
            Some(&expected) if expected == text[text_pos] => {
                pat_pos += 1;
                text_pos += 1;
            }
            _ => match backtrack {
                Some((star_at, resume_from)) => {
                    // Let the star swallow one more byte and retry.
                    let resume_from = resume_from + 1;
                    backtrack = Some((star_at, resume_from));
                    text_pos = resume_from;
                    pat_pos = star_at + 1;
                }
                None => return false,
            },
        }
    }

    // Text exhausted: only trailing stars may remain in the pattern.
    pattern[pat_pos..].iter().all(|&byte| byte == b'*')
}
517
/// `char`-wise wildcard match for non-ASCII input: `*` in `pattern` matches
/// any run of characters (possibly empty); every other character must match
/// literally.
///
/// Same greedy scan with single-star backtracking as the byte-wise matcher,
/// but over decoded `char`s so a multi-byte character is compared as a unit.
fn segment_match_chars(pattern: &str, text: &str) -> bool {
    let glob: Vec<char> = pattern.chars().collect();
    let subject: Vec<char> = text.chars().collect();

    let mut glob_pos = 0usize;
    let mut subject_pos = 0usize;
    // (index of the last `*` seen in the pattern, text index it resumes from)
    let mut backtrack: Option<(usize, usize)> = None;

    while subject_pos < subject.len() {
        match glob.get(glob_pos) {
            Some('*') => {
                backtrack = Some((glob_pos, subject_pos));
                glob_pos += 1;
            }
            Some(&expected) if expected == subject[subject_pos] => {
                glob_pos += 1;
                subject_pos += 1;
            }
            _ => match backtrack {
                Some((star_at, resume_from)) => {
                    // Let the star swallow one more character and retry.
                    let resume_from = resume_from + 1;
                    backtrack = Some((star_at, resume_from));
                    subject_pos = resume_from;
                    glob_pos = star_at + 1;
                }
                None => return false,
            },
        }
    }

    // Text exhausted: only trailing stars may remain in the pattern.
    glob[glob_pos..].iter().all(|&ch| ch == '*')
}
558
/// Normalize a path or glob into a `/`-joined, relative, lexically cleaned
/// string.
///
/// Backslashes are treated as separators, `.` components are dropped, and a
/// `..` cancels the preceding component unless there is none or it is itself
/// `..`, in which case the `..` is kept. A root component discards everything
/// collected so far, so absolute paths come out relative. Nothing is resolved
/// against the filesystem.
fn normalize_path(path: &str) -> String {
    let unified = path.replace('\\', "/");
    let mut stack: Vec<String> = Vec::new();

    for component in Path::new(&unified).components() {
        match component {
            Component::CurDir => {}
            Component::RootDir => stack.clear(),
            Component::Normal(name) => stack.push(name.to_string_lossy().into_owned()),
            Component::Prefix(prefix) => {
                stack.push(prefix.as_os_str().to_string_lossy().into_owned());
            }
            Component::ParentDir => {
                let can_ascend = stack.last().is_some_and(|previous| previous != "..");
                if can_ascend {
                    stack.pop();
                } else {
                    stack.push("..".to_owned());
                }
            }
        }
    }

    stack.join("/")
}
579
580fn parse_sha256_hex(input: &str) -> Option<[u8; 32]> {
581    let input = input.trim();
582    // A SHA-256 hex digest is 64 ASCII bytes. Operate on the byte slice, not
583    // `&str[..]` slicing: a 64-*byte* input containing a multibyte UTF-8 char
584    // at an odd offset (e.g. a stray `é` pasted into `.keyhogignore`) would
585    // make `&input[idx*2..idx*2+2]` panic on a non-char boundary. Decode each
586    // nibble directly so any non-hex byte just fails the parse.
587    let bytes = input.as_bytes();
588    if bytes.len() != 64 {
589        return None;
590    }
591    let mut digest = [0u8; 32];
592    for idx in 0..32 {
593        let hi = hex_nibble(bytes[idx * 2])?;
594        let lo = hex_nibble(bytes[idx * 2 + 1])?;
595        digest[idx] = (hi << 4) | lo;
596    }
597    Some(digest)
598}
599
/// Inline metadata parsed from a `.keyhogignore` line trailer. Used to
/// implement enterprise governance fields (`reason`, `expires`,
/// `approved_by`) per audits/legendary-2026-04-26 Tier-B #18.
#[derive(Default, Debug)]
struct InlineMetadata {
    /// Free-text justification; presumably taken from the `reason="..."`
    /// token of the trailer - confirm against `parse_inline_metadata`.
    reason: Option<String>,
    /// Expiry date string, expected as `YYYY-MM-DD`. `Allowlist::parse` drops
    /// the entry when this string sorts before today's date string.
    expires: Option<String>,
    /// Approver identity; presumably taken from the `approved_by="..."`
    /// token of the trailer - confirm against `parse_inline_metadata`.
    approved_by: Option<String>,
}