Skip to main content

keyhog_scanner/
lib.rs

1//! KeyHog Scanner: A high-performance, multi-layered secret detection engine.
2//!
3//! This crate implements the core scanning logic, combining SIMD pre-filtering,
4//! Aho-Corasick literal matching, regex fallback, and ML-based confidence scoring.
5
6#![deny(unsafe_op_in_unsafe_fn)]
7#![allow(clippy::too_many_arguments)]
8
9// ── Public API ──────────────────────────────────────────────────────
10/// Offline AWS account-ID recovery from an access-key ID (no network/verify).
11pub mod aws;
12/// Service-specific credential checksum validation (GitHub, npm, Slack, etc.).
13pub mod checksum;
14/// Detector compilation into high-performance matching structures.
15pub mod compiler;
16/// Heuristic and ML-based confidence scoring for candidate matches.
17pub mod confidence;
18/// Code context analysis (comments, assignments, test files).
19pub mod context;
20/// Decode-through pipeline for nested encodings (base64, hex, URL, etc.).
21pub mod decode;
22/// Decode-structure analysis: classify what a candidate base64/hex-decodes to
23/// (binary asset magic bytes, protobuf wire) so decode-through feeds scoring.
24pub mod decode_structure;
25/// Core scan execution engine.
26pub mod engine;
27/// Shannon entropy analysis for secret detection.
28pub mod entropy;
29/// Specialized error types for the scanner.
30pub mod error;
31/// GPU-accelerated matching via wgpu.
32pub mod gpu;
33/// Hardware capability detection and backend selection.
34pub mod hw_probe;
35/// Machine learning inference for secret scoring.
36pub mod ml_scorer;
37/// Multiline secret reassembly logic.
38pub mod multiline;
39/// Match resolution and deduplication.
40pub mod resolution;
41/// Scanner configuration and state.
42pub mod scanner_config;
43/// Static-string interner backed by vyre's CHD perfect hash.
44/// Used by `CompiledScanner` to pre-intern detector metadata strings
45/// so the per-scan `ScanState` interner is hit only by dynamic
46/// strings (file paths, commit SHAs).
47pub mod static_intern;
48/// Shared types for the scanner engine.
49pub mod types;
50
51// Internal modules.
52/// SIMD-accelerated alphabet pre-filtering.
53pub mod alphabet_filter;
54/// ASCII case-insensitive byte-search primitives shared by every hot path
55/// that needs to skim text without lowering the haystack first.
56pub(crate) mod ascii_ci;
57/// Bigram bloom filter for fast chunk gating.
58pub mod bigram_bloom;
59/// AVX-512 optimized entropy calculation.
60pub(crate) mod entropy_avx512;
61/// Fast scalar entropy calculation.
62pub mod entropy_fast;
63#[cfg(target_arch = "aarch64")]
64pub(crate) mod entropy_fast_neon;
65#[cfg(target_arch = "x86_64")]
66pub(crate) mod entropy_fast_x86;
67/// JWT structural validation and anomaly detection.
68pub mod jwt;
69// `fragment_cache` lives under `multiline/` (its only call sites are there);
70// re-exported at the crate root so existing `keyhog_scanner::fragment_cache`
71// paths and the Tier-C audit cleanup don't churn the public API.
72pub use multiline::fragment_cache;
73pub(crate) mod homoglyph;
74/// Internal scan pipeline orchestration.
75pub mod pipeline;
76/// Prefix trie for efficient keyword propagation.
77pub mod prefix_trie;
78pub(crate) mod probabilistic_gate;
79pub(crate) mod structured;
80pub(crate) mod suppression;
81/// Per-scan telemetry: always-on counters + opt-in `--dogfood` events.
82pub mod telemetry;
83/// Unicode normalization and homoglyph defense.
84pub mod unicode_hardening;
85/// Shared FNV-1a hash + content-keyed memoization primitives. Single home for
86/// the seed every per-scan cache keys on, plus the bounded thread-local cache
87/// helper they all share, so a hash change can never re-key only some caches.
88pub(crate) mod util_hash;
89
90/// SHA-256 of a credential as the raw 32 inline bytes - matching
91/// `Finding::credential_hash: [u8; 32]`. Hex encoding happens at the
92/// serde/reporter boundary (`keyhog_core::hex_encode`), keeping the pre-dedup
93/// hot path zero-heap. All `credential_hash:` assignment sites forward this
94/// value straight into the `[u8; 32]` field, so the byte form is required.
95pub(crate) fn sha256_hash(s: &str) -> [u8; 32] {
96    use sha2::{Digest, Sha256};
97    let mut hasher = Sha256::new();
98    hasher.update(s.as_bytes());
99    hasher.finalize().into()
100}
101
102#[cfg(feature = "simd")]
103pub(crate) mod simd;
104#[cfg(feature = "simdsieve")]
105mod simdsieve_prefilter;
106
107pub(crate) mod shared_regexes;
108
109pub use engine::GpuPhase1Output;
110pub use engine::{CompiledScanner, GpuInitPolicy};
111pub use error::{Result, ScanError};
112pub use hw_probe::{probe_hardware, select_backend, HardwareCaps, ScanBackend};
113pub use types::ScannerConfig;
114
115use std::borrow::Cow;
116
117/// Normalize scannable text by removing evasion characters and handling homoglyphs.
118pub fn normalize_chunk_data(data: &str) -> Cow<'_, str> {
119    if data.is_ascii() {
120        return Cow::Borrowed(data);
121    }
122    let mut normalized = String::with_capacity(data.len());
123    let mut changed = false;
124    for ch in data.chars() {
125        if !unicode_hardening::is_evasion_char(ch) {
126            normalized.push(ch);
127        } else {
128            changed = true;
129        }
130    }
131    if changed {
132        Cow::Owned(normalized)
133    } else {
134        Cow::Borrowed(data)
135    }
136}
137
138/// Pre-process a chunk of text for scanning.
139pub fn normalize_scannable_chunk<'a>(
140    chunk: &'a keyhog_core::Chunk,
141    owned: &'a mut Option<keyhog_core::Chunk>,
142) -> &'a keyhog_core::Chunk {
143    pipeline::normalize_scannable_chunk(chunk, owned)
144}
145
146/// Compute line offsets for a block of text.
147pub fn compute_line_offsets(text: &str) -> Vec<usize> {
148    pipeline::compute_line_offsets(text)
149}
150
151/// Map a byte offset to a line number using pre-computed offsets.
152pub fn match_line_number(
153    preprocessed: &types::ScannerPreprocessedText<'_>,
154    line_offsets: &[usize],
155    offset: usize,
156) -> usize {
157    pipeline::match_line_number(preprocessed, line_offsets, offset)
158}
159
160/// measure shannon entropy of a byte slice.
161pub fn match_entropy(data: &[u8]) -> f64 {
162    pipeline::match_entropy(data)
163}
164
165/// Find the largest char boundary <= index.
166pub fn floor_char_boundary(text: &str, index: usize) -> usize {
167    engine::floor_char_boundary(text, index)
168}
169
170/// Check if a match is within a hex-encoded context.
171pub fn is_within_hex_context(data: &str, match_start: usize, match_end: usize) -> bool {
172    pipeline::is_within_hex_context(data, match_start, match_end)
173}
174
175/// Check if a credential should be suppressed because it is a known example.
176pub fn should_suppress_known_example_credential(
177    credential: &str,
178    path: Option<&str>,
179    context: context::CodeContext,
180) -> bool {
181    pipeline::should_suppress_known_example_credential(credential, path, context)
182}
183
184/// Search for a companion pattern near a primary match.
185pub fn find_companion(
186    preprocessed: &types::ScannerPreprocessedText<'_>,
187    primary_line: usize,
188    companion: &types::CompiledCompanion,
189) -> Option<String> {
190    pipeline::find_companion(preprocessed, primary_line, companion)
191}
192
193pub mod testing {
194    pub use crate::compiler::{rewrite_alternation_prefix, split_leading_inline_flag};
195    pub use crate::confidence::penalties::finalize_confidence;
196    pub use crate::engine::boundary::scan_chunk_boundaries;
197    pub use crate::engine::gpu_postprocess::{
198        attribute_matches_to_chunks, fold_overlapping_same_pid_inplace, gpu_phase2_hits_are_dense,
199    };
200    pub use crate::engine::gpu_regex_dfa::extract_literal_core;
201    pub use crate::entropy::keywords::looks_like_program_identifier;
202
203    /// Internal entropy shape-classification predicates, exposed for the
204    /// canonical-shape unit tests migrated out of `src/entropy/scanner.rs`
205    /// (KH-GAP-004). `credential_keyword_context` builds the production
206    /// credential anchor so tests need not know the private tuning constants.
207    pub mod entropy_scanner {
208        pub use crate::entropy::keywords::KeywordContext;
209        pub use crate::entropy::scanner::{
210            candidate_is_plausible, credential_keyword_context, is_canonical_non_secret_shape,
211        };
212    }
213
214    /// Internal prose/decoy/strict-secret predicates, exposed for the unit
215    /// tests migrated out of `src/entropy/keywords.rs` (KH-GAP-004).
216    pub mod entropy_keywords {
217        pub use crate::entropy::keywords::{
218            entropy_value_looks_like_prose, is_dash_segmented_alnum_decoy,
219            looks_like_english_prose, passes_strict_secret_checks,
220        };
221    }
222    /// Full feature extractor (with detector-config keyword lists) exposed for
223    /// the ML training-pipeline parity harness (`ml/parity_check.py`), which
224    /// must compute byte-identical features to the serve path.
225    pub use crate::ml_scorer::compute_features_with_config;
226    pub use crate::probabilistic_gate::ProbabilisticGate;
227    pub use crate::static_intern::seed_source_type_count;
228    pub use crate::suppression::shape_gates::looks_like_standard_base64_blob;
229    pub use crate::util_hash::{hash_fast, memoize_by_hash};
230
231    pub mod ascii_ci {
232        pub use crate::ascii_ci::{ci_find, contains_path_segment, contains_path_segment_two};
233    }
234
235    pub mod shape {
236        pub use crate::suppression::shape::{
237            looks_like_credential_colliding_punctuation,
238            looks_like_punctuation_decorated_identifier, looks_like_syntactic_punctuation_marker,
239        };
240    }
241
242    pub mod compiler_prefix {
243        pub use crate::compiler::compiler_prefix::{
244            extract_literal_prefixes, strip_leading_boundary_guard, strip_leading_inline_flags,
245        };
246    }
247
248    pub use crate::decode::caesar::{
249        caesar_shift, is_source_code_path, looks_credential_shaped, CaesarDecoder,
250    };
251    pub use crate::decode::hex::find_hex_strings;
252    pub use crate::decode::reverse::{looks_reversible, reverse_str, ReverseDecoder};
253    pub use crate::decode::util::take_hex_digits;
254    pub use crate::gpu::{env_no_gpu, is_ci_environment};
255
256    /// Shannon entropy of `chunk` in bits/byte.
257    ///
258    /// # Safety
259    ///
260    /// On `x86_64` this dispatches straight to the AVX-512 kernel, which
261    /// requires the running CPU to support `avx512f`/`avx512bw`. The caller
262    /// must confirm those features first (e.g. via `is_x86_feature_detected!`);
263    /// calling it on a CPU without them is undefined behavior.
264    ///
265    /// On every other target (aarch64/macOS, wasm, …) the AVX-512 kernel does
266    /// not exist, so this routes to the portable feature-detecting dispatcher
267    /// (`entropy_fast::shannon_entropy_simd`), which is itself safe and always
268    /// correct. The `unsafe` marker is kept for one cross-platform signature.
269    /// Without this arch split the non-x86 build failed to compile
270    /// (`E0425: cannot find calculate_shannon_entropy`), breaking the portable
271    /// / macOS-arm64 build.
272    pub unsafe fn calculate_shannon_entropy(chunk: &[u8]) -> f64 {
273        #[cfg(target_arch = "x86_64")]
274        {
275            unsafe { crate::entropy_avx512::calculate_shannon_entropy(chunk) }
276        }
277        #[cfg(not(target_arch = "x86_64"))]
278        {
279            crate::entropy_fast::shannon_entropy_simd(chunk)
280        }
281    }
282
283    #[cfg(feature = "simd")]
284    pub use crate::simd::backend::HsScanner;
285
286    #[cfg(feature = "simdsieve")]
287    pub use crate::simdsieve_prefilter::{
288        HOT_PATTERNS, HOT_PATTERN_DETECTOR_IDS, HOT_PATTERN_DISPLAY_NAMES, HOT_PATTERN_NAMES,
289    };
290
291    pub use crate::structured::parsers::{
292        parse_docker_compose, parse_env, parse_hcl, parse_jupyter, parse_k8s_secret, parse_tfstate,
293    };
294}