Skip to main content

keyhog_scanner/
lib.rs

1//! KeyHog Scanner: A high-performance, multi-layered secret detection engine.
2//!
3//! This crate implements the core scanning logic, combining SIMD pre-filtering,
4//! Aho-Corasick literal matching, regex fallback, and ML-based confidence scoring.
5
6#![deny(unsafe_op_in_unsafe_fn)]
7#![allow(clippy::too_many_arguments)]
8
9// ── Public API ──────────────────────────────────────────────────────
10/// Service-specific credential checksum validation (GitHub, npm, Slack, etc.).
11pub mod checksum;
12/// Detector compilation into high-performance matching structures.
13pub mod compiler;
14/// Heuristic and ML-based confidence scoring for candidate matches.
15pub mod confidence;
16/// Code context analysis (comments, assignments, test files).
17pub mod context;
18/// Decode-through pipeline for nested encodings (base64, hex, URL, etc.).
19pub mod decode;
20/// Decode-structure analysis: classify what a candidate base64/hex-decodes to
21/// (binary asset magic bytes, protobuf wire) so decode-through feeds scoring.
22pub mod decode_structure;
23/// Core scan execution engine.
24pub mod engine;
25/// Shannon entropy analysis for secret detection.
26pub mod entropy;
27/// Specialized error types for the scanner.
28pub mod error;
29/// GPU-accelerated matching via wgpu.
30pub mod gpu;
31/// Hardware capability detection and backend selection.
32pub mod hw_probe;
33/// Machine learning inference for secret scoring.
34pub mod ml_scorer;
35/// Multiline secret reassembly logic.
36pub mod multiline;
37/// Match resolution and deduplication.
38pub mod resolution;
39/// Scanner configuration and state.
40pub mod scanner_config;
41/// Static-string interner backed by vyre's CHD perfect hash.
42/// Used by `CompiledScanner` to pre-intern detector metadata strings
43/// so the per-scan `ScanState` interner is hit only by dynamic
44/// strings (file paths, commit SHAs).
45pub mod static_intern;
46/// Shared types for the scanner engine.
47pub mod types;
48
49// Internal modules.
50/// SIMD-accelerated alphabet pre-filtering.
51pub mod alphabet_filter;
52/// ASCII case-insensitive byte-search primitives shared by every hot path
53/// that needs to skim text without lowering the haystack first.
54pub(crate) mod ascii_ci;
55/// Bigram bloom filter for fast chunk gating.
56pub mod bigram_bloom;
57/// AVX-512 optimized entropy calculation.
58pub(crate) mod entropy_avx512;
59/// Fast scalar entropy calculation.
60pub mod entropy_fast;
61#[cfg(target_arch = "aarch64")]
62pub(crate) mod entropy_fast_neon;
63#[cfg(target_arch = "x86_64")]
64pub(crate) mod entropy_fast_x86;
65/// JWT structural validation and anomaly detection.
66pub mod jwt;
67// `fragment_cache` lives under `multiline/` (its only call sites are there);
68// re-exported at the crate root so existing `keyhog_scanner::fragment_cache`
69// paths and the Tier-C audit cleanup don't churn the public API.
70pub use multiline::fragment_cache;
71pub(crate) mod homoglyph;
72/// Internal scan pipeline orchestration.
73pub mod pipeline;
74/// Prefix trie for efficient keyword propagation.
75pub mod prefix_trie;
76pub(crate) mod probabilistic_gate;
77pub(crate) mod structured;
78pub(crate) mod suppression;
79/// Per-scan telemetry: always-on counters + opt-in `--dogfood` events.
80pub mod telemetry;
81/// Unicode normalization and homoglyph defense.
82pub mod unicode_hardening;
83
84pub(crate) fn sha256_hash(s: &str) -> String {
85    use sha2::{Digest, Sha256};
86    let mut hasher = Sha256::new();
87    hasher.update(s.as_bytes());
88    hex::encode(hasher.finalize())
89}
90
91#[cfg(feature = "simd")]
92pub(crate) mod simd;
93#[cfg(feature = "simdsieve")]
94mod simdsieve_prefilter;
95
96pub(crate) mod shared_regexes;
97
98pub use engine::CompiledScanner;
99pub use engine::GpuPhase1Output;
100pub use error::{Result, ScanError};
101pub use hw_probe::{probe_hardware, select_backend, HardwareCaps, ScanBackend};
102pub use types::ScannerConfig;
103
104use std::borrow::Cow;
105
106/// Normalize scannable text by removing evasion characters and handling homoglyphs.
107pub fn normalize_chunk_data(data: &str) -> Cow<'_, str> {
108    if data.is_ascii() {
109        return Cow::Borrowed(data);
110    }
111    let mut normalized = String::with_capacity(data.len());
112    let mut changed = false;
113    for ch in data.chars() {
114        if !unicode_hardening::is_evasion_char(ch) {
115            normalized.push(ch);
116        } else {
117            changed = true;
118        }
119    }
120    if changed {
121        Cow::Owned(normalized)
122    } else {
123        Cow::Borrowed(data)
124    }
125}
126
127/// Pre-process a chunk of text for scanning.
128pub fn normalize_scannable_chunk<'a>(
129    chunk: &'a keyhog_core::Chunk,
130    owned: &'a mut Option<keyhog_core::Chunk>,
131) -> &'a keyhog_core::Chunk {
132    pipeline::normalize_scannable_chunk(chunk, owned)
133}
134
135/// Compute line offsets for a block of text.
136pub fn compute_line_offsets(text: &str) -> Vec<usize> {
137    pipeline::compute_line_offsets(text)
138}
139
140/// Map a byte offset to a line number using pre-computed offsets.
141pub fn match_line_number(
142    preprocessed: &types::ScannerPreprocessedText,
143    line_offsets: &[usize],
144    offset: usize,
145) -> usize {
146    pipeline::match_line_number(preprocessed, line_offsets, offset)
147}
148
149/// measure shannon entropy of a byte slice.
150pub fn match_entropy(data: &[u8]) -> f64 {
151    pipeline::match_entropy(data)
152}
153
154/// Find the largest char boundary <= index.
155pub fn floor_char_boundary(text: &str, index: usize) -> usize {
156    engine::floor_char_boundary(text, index)
157}
158
159/// Check if a match is within a hex-encoded context.
160pub fn is_within_hex_context(data: &str, match_start: usize, match_end: usize) -> bool {
161    pipeline::is_within_hex_context(data, match_start, match_end)
162}
163
164/// Check if a credential should be suppressed because it is a known example.
165pub fn should_suppress_known_example_credential(
166    credential: &str,
167    path: Option<&str>,
168    context: context::CodeContext,
169) -> bool {
170    pipeline::should_suppress_known_example_credential(credential, path, context)
171}
172
173/// Search for a companion pattern near a primary match.
174pub fn find_companion(
175    preprocessed: &types::ScannerPreprocessedText,
176    primary_line: usize,
177    companion: &types::CompiledCompanion,
178) -> Option<String> {
179    pipeline::find_companion(preprocessed, primary_line, companion)
180}
181
182pub mod testing {
183    pub use crate::compiler::{rewrite_alternation_prefix, split_leading_inline_flag};
184    pub use crate::confidence::penalties::finalize_confidence;
185    /// Full feature extractor (with detector-config keyword lists) exposed for
186    /// the ML training-pipeline parity harness (`ml/parity_check.py`), which
187    /// must compute byte-identical features to the serve path.
188    pub use crate::ml_scorer::compute_features_with_config;
189    pub use crate::engine::boundary::scan_chunk_boundaries;
190    pub use crate::engine::gpu_postprocess::{
191        attribute_matches_to_chunks, fold_overlapping_same_pid_inplace,
192    };
193    pub use crate::engine::gpu_regex_dfa::extract_literal_core;
194    pub use crate::entropy::keywords::looks_like_program_identifier;
195    pub use crate::probabilistic_gate::ProbabilisticGate;
196    pub use crate::static_intern::seed_source_type_count;
197
198    pub mod ascii_ci {
199        pub use crate::ascii_ci::{ci_find, contains_path_segment, contains_path_segment_two};
200    }
201
202    pub mod shape {
203        pub use crate::suppression::shape::{
204            looks_like_credential_colliding_punctuation,
205            looks_like_punctuation_decorated_identifier, looks_like_syntactic_punctuation_marker,
206        };
207    }
208
209    pub mod compiler_prefix {
210        pub use crate::compiler::compiler_prefix::{
211            extract_literal_prefixes, strip_leading_boundary_guard, strip_leading_inline_flags,
212        };
213    }
214
215    pub use crate::decode::caesar::{
216        caesar_shift, is_source_code_path, looks_credential_shaped, CaesarDecoder,
217    };
218    pub use crate::decode::hex::find_hex_strings;
219    pub use crate::decode::reverse::{looks_reversible, reverse_str, ReverseDecoder};
220    pub use crate::decode::util::take_hex_digits;
221    pub use crate::gpu::{env_no_gpu, is_ci_environment};
222
223    /// Shannon entropy of `chunk` in bits/byte.
224    ///
225    /// # Safety
226    ///
227    /// On `x86_64` this dispatches straight to the AVX-512 kernel, which
228    /// requires the running CPU to support `avx512f`/`avx512bw`. The caller
229    /// must confirm those features first (e.g. via `is_x86_feature_detected!`);
230    /// calling it on a CPU without them is undefined behavior.
231    ///
232    /// On every other target (aarch64/macOS, wasm, …) the AVX-512 kernel does
233    /// not exist, so this routes to the portable feature-detecting dispatcher
234    /// (`entropy_fast::shannon_entropy_simd`), which is itself safe and always
235    /// correct. The `unsafe` marker is kept for one cross-platform signature.
236    /// Without this arch split the non-x86 build failed to compile
237    /// (`E0425: cannot find calculate_shannon_entropy`), breaking the portable
238    /// / macOS-arm64 build.
239    pub unsafe fn calculate_shannon_entropy(chunk: &[u8]) -> f64 {
240        #[cfg(target_arch = "x86_64")]
241        {
242            unsafe { crate::entropy_avx512::calculate_shannon_entropy(chunk) }
243        }
244        #[cfg(not(target_arch = "x86_64"))]
245        {
246            crate::entropy_fast::shannon_entropy_simd(chunk)
247        }
248    }
249
250    #[cfg(feature = "simd")]
251    pub use crate::simd::backend::HsScanner;
252
253    #[cfg(feature = "simdsieve")]
254    pub use crate::simdsieve_prefilter::{
255        HOT_PATTERNS, HOT_PATTERN_DETECTOR_IDS, HOT_PATTERN_DISPLAY_NAMES, HOT_PATTERN_NAMES,
256    };
257
258    pub use crate::structured::parsers::{
259        parse_docker_compose, parse_env, parse_jupyter, parse_k8s_secret, parse_tfstate,
260    };
261}