use regex::Regex;
use std::cmp::Reverse;
#[cfg(feature = "ml")]
use std::collections::VecDeque;
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::sync::Arc;
// Scanner tuning constants.
// NOTE(review): the code that consumes these values is outside this section, so
// every statement of intent below is inferred from the name and should be
// confirmed against the use site.

/// Threshold of 10,000 for the large fallback scan path (counted unit not visible here — TODO confirm).
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;
/// Cap of 100,000 entries; presumably bounds the window de-duplication set — confirm.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;
/// 1 MiB (1024 * 1024 bytes); presumably the largest chunk scanned at once — confirm.
pub const MAX_SCAN_CHUNK_BYTES: usize = 1024 * 1024;
/// 128 KiB (128 * 1024 bytes); presumably the overlap between adjacent scan windows — confirm.
pub const WINDOW_OVERLAP_BYTES: usize = 128 * 1024;
/// Minimum length of 8; presumably lines shorter than this are skipped by the fallback scan — confirm.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;
/// Index 0; presumably the capture index of the whole regex match — confirm.
pub const FULL_MATCH_INDEX: usize = 0;
/// Index 1; presumably the first explicit regex capture group — confirm.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;
/// Value 1; presumably the number given to the first line (line numbers are 1-based) — confirm.
pub const FIRST_LINE_NUMBER: usize = 1;
/// Value 1; presumably the line-number distance to the immediately preceding line — confirm.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;
/// Minimum of 3 characters; presumably for a literal prefix extracted from a pattern — confirm.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;
/// 1 MiB (`1 << 20` bytes); presumably the compiled-size limit given to the regex builder — confirm.
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1 << 20;
/// Radius of 20 characters; presumably the context inspected around a hex-like match — confirm.
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;
/// Minimum length of 16; presumably for a match to be treated as hex — confirm.
pub const MIN_HEX_MATCH_LEN: usize = 16;
/// Minimum of 16 hex digits; presumably required inside such a match — confirm.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;
/// Minimum of 8 hex digits; presumably required in the surrounding context — confirm.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;
/// Maximum of 4 separators; presumably tolerated in the surrounding hex context — confirm.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
/// Cap of 1024 entries; presumably for the ML score cache (only with the `ml` feature) — confirm.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1024;
/// 256 KiB (256 * 1024 bytes); presumably the byte budget of the ML score cache (only with the `ml` feature) — confirm.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 * 1024;
/// Radius of 5 lines; presumably the context handed to the ML model (only with the `ml` feature) — confirm.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;
/// Weight 0.6 (only with the `ml` feature). Together with `HEURISTIC_WEIGHT` it sums to 1.0;
/// presumably the two blend ML and heuristic confidence — confirm.
#[cfg(feature = "ml")]
pub const ML_WEIGHT: f64 = 0.6;
/// Weight 0.4 (only with the `ml` feature); the complement of `ML_WEIGHT`.
#[cfg(feature = "ml")]
pub const HEURISTIC_WEIGHT: f64 = 0.4;
/// Associates a half-open byte range `[start_offset, end_offset)` with a line number.
///
/// Only compiled when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First offset covered by this mapping (inclusive).
    pub start_offset: usize,
    /// Offset one past the last covered position (exclusive).
    pub end_offset: usize,
    /// Line number reported for offsets inside the range.
    pub line_number: usize,
}

/// Text plus the offset-to-line table describing it.
///
/// Only compiled when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct PreprocessedText {
    /// The text itself.
    pub text: String,
    /// Offset ranges and the line number each one resolves to.
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Returns the line number of the first mapping (in `mappings` order) whose
    /// half-open range contains `offset`, or `None` when no mapping covers it.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        for candidate in &self.mappings {
            let covered = candidate.start_offset..candidate.end_offset;
            if covered.contains(&offset) {
                return Some(candidate.line_number);
            }
        }
        None
    }

    /// Wraps a single line unchanged: the text is a copy of `line` and the only
    /// mapping assigns line number 1 to the byte range `0..line.len()`.
    ///
    /// For an empty `line` that range is empty, so no offset resolves to a line.
    pub fn passthrough(line: &str) -> Self {
        let whole_line = LineMapping {
            start_offset: 0,
            end_offset: line.len(),
            line_number: 1,
        };
        Self {
            text: line.to_owned(),
            mappings: vec![whole_line],
        }
    }
}
/// Preprocessed-text type used by the scanner when the `multiline` feature is
/// enabled: an alias for `crate::multiline::PreprocessedText`.
#[cfg(feature = "multiline")]
pub type ScannerPreprocessedText = crate::multiline::PreprocessedText;
/// Preprocessed-text type used by the scanner when the `multiline` feature is
/// disabled: an alias for the `PreprocessedText` defined in this file.
#[cfg(not(feature = "multiline"))]
pub type ScannerPreprocessedText = PreprocessedText;
/// A compiled regular expression together with the detector it belongs to.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
/// Index identifying the owning detector (presumably into the scanner's detector list — TODO confirm).
pub detector_index: usize,
/// The compiled regular expression.
pub regex: Regex,
/// Optional group index; presumably the capture group holding the credential,
/// with `None` meaning the whole match — confirm against the caller.
pub group: Option<usize>,
}
/// A compiled companion pattern.
///
/// Derives `Debug` and `Clone` so it matches `CompiledPattern` and can be
/// logged or duplicated like the other compiled types; every field already
/// implements both traits.
///
/// NOTE(review): the field semantics below are inferred from names; the code
/// that evaluates companions is outside this section.
#[derive(Debug, Clone)]
pub struct CompiledCompanion {
    /// Name identifying this companion.
    pub name: String,
    /// The compiled regular expression.
    pub regex: Regex,
    /// Optional capture-group index (presumably selects the value to extract — confirm).
    pub capture_group: Option<usize>,
    /// Line distance (presumably how far from the primary match the companion may appear — confirm).
    pub within_lines: usize,
    /// Whether the companion is required (presumably for the primary match to be reported — confirm).
    pub required: bool,
}
/// Runtime configuration for the scanner.
///
/// NOTE(review): the code that reads these fields is outside this section, so
/// the per-field notes marked "presumably" are inferred from names and should
/// be confirmed against the consumers.
#[derive(Debug, Clone)]
pub struct ScannerConfig {
/// Decode depth limit (presumably how many nested encoding layers are unwrapped; 0 presumably disables decoding — confirm).
pub max_decode_depth: usize,
/// Presumably whether decoded content is validated before being scanned — confirm.
pub validate_decode: bool,
/// Presumably toggles entropy-based detection — confirm.
pub entropy_enabled: bool,
/// Entropy threshold (presumably bits per character — confirm).
pub entropy_threshold: f64,
/// Presumably whether entropy detection also runs on source-code files — confirm.
pub entropy_in_source_files: bool,
/// Presumably toggles ML scoring — confirm.
pub ml_enabled: bool,
/// Weight given to the ML score (presumably when blending with the heuristic score — confirm).
pub ml_weight: f64,
/// Minimum confidence (presumably matches scoring below it are dropped — confirm).
pub min_confidence: f64,
/// Presumably whether input is Unicode-normalized before matching — confirm.
pub unicode_normalization: bool,
/// Byte limit for decoding (presumably the largest payload that will be decoded — confirm).
pub max_decode_bytes: usize,
/// Upper bound on matches kept per chunk (presumably the `limit` passed to `ScanState::push_match` — confirm).
pub max_matches_per_chunk: usize,
/// Multiline preprocessing configuration.
pub multiline: crate::multiline::MultilineConfig,
/// Known credential prefixes (usage not visible here).
pub known_prefixes: Vec<String>,
/// Keywords presumably indicating that a nearby value is a secret — confirm.
pub secret_keywords: Vec<String>,
/// Keywords presumably indicating test or fixture data — confirm.
pub test_keywords: Vec<String>,
/// Keywords presumably indicating placeholder values — confirm.
pub placeholder_keywords: Vec<String>,
}
impl Default for ScannerConfig {
    /// Builds the baseline configuration: decoding up to depth 10 with a
    /// 512 KiB decode limit, entropy and ML enabled, a minimum confidence of
    /// 0.5, at most 1000 matches per chunk, no known prefixes, and the built-in
    /// secret/test/placeholder keyword lists.
    fn default() -> Self {
        // Values that come from helpers are computed first so the literal
        // below reads as a flat table of settings.
        let multiline = crate::multiline::MultilineConfig::default();
        let secret_keywords = default_secret_keywords();
        let test_keywords = default_test_keywords();
        let placeholder_keywords = default_placeholder_keywords();

        Self {
            max_decode_depth: 10,
            validate_decode: true,
            entropy_enabled: true,
            entropy_threshold: 4.0,
            entropy_in_source_files: false,
            ml_enabled: true,
            ml_weight: 0.6,
            min_confidence: 0.5,
            unicode_normalization: true,
            max_decode_bytes: 512 * 1024,
            max_matches_per_chunk: 1000,
            multiline,
            known_prefixes: Vec::new(),
            secret_keywords,
            test_keywords,
            placeholder_keywords,
        }
    }
}
/// Returns the built-in secret keyword list as owned strings, in declaration order.
fn default_secret_keywords() -> Vec<String> {
    const SECRET_KEYWORDS: &[&str] = &[
        "password",
        "passwd",
        "pwd",
        "secret",
        "token",
        "api_key",
        "apikey",
        "api-key",
        "access_key",
        "auth_token",
        "auth_key",
        "private_key",
        "client_secret",
        "encryption_key",
        "signing_key",
        "bearer",
        "credential",
        "license_key",
    ];

    let mut keywords = Vec::with_capacity(SECRET_KEYWORDS.len());
    for &keyword in SECRET_KEYWORDS {
        keywords.push(String::from(keyword));
    }
    keywords
}
/// Returns the built-in test keyword list as owned strings, in declaration order.
fn default_test_keywords() -> Vec<String> {
    const TEST_KEYWORDS: &[&str] = &[
        "test",
        "mock",
        "fake",
        "dummy",
        "stub",
        "fixture",
        "example",
        "sample",
        "sandbox",
        "staging",
    ];

    let mut keywords = Vec::with_capacity(TEST_KEYWORDS.len());
    for &keyword in TEST_KEYWORDS {
        keywords.push(String::from(keyword));
    }
    keywords
}
/// Returns the built-in placeholder keyword list as owned strings, in declaration order.
fn default_placeholder_keywords() -> Vec<String> {
    const PLACEHOLDER_KEYWORDS: &[&str] = &[
        "change_me",
        "changeme",
        "replace_me",
        "todo",
        "fixme",
        "your_",
        "insert_",
        "put_your",
        "fill_in",
        "<your",
    ];

    let mut keywords = Vec::with_capacity(PLACEHOLDER_KEYWORDS.len());
    for &keyword in PLACEHOLDER_KEYWORDS {
        keywords.push(String::from(keyword));
    }
    keywords
}
impl ScannerConfig {
    /// Preset that starts from the defaults and then sets `max_decode_depth`
    /// to 0 and switches off both ML and entropy.
    pub fn fast() -> Self {
        Self {
            entropy_enabled: false,
            ml_enabled: false,
            max_decode_depth: 0,
            ..Self::default()
        }
    }

    /// Preset that starts from the defaults and explicitly pins decode depth 10,
    /// ML on, entropy on and a minimum confidence of 0.5.
    pub fn thorough() -> Self {
        Self {
            min_confidence: 0.5,
            entropy_enabled: true,
            ml_enabled: true,
            max_decode_depth: 10,
            ..Self::default()
        }
    }

    /// Builder-style setter: consumes the configuration and returns it with
    /// `min_confidence` replaced by the given value (stored as-is, no clamping).
    pub fn min_confidence(self, min_confidence: f64) -> Self {
        Self {
            min_confidence,
            ..self
        }
    }
}
impl From<keyhog_core::config::ScanConfig> for ScannerConfig {
    /// Converts the core scan configuration into a scanner configuration.
    ///
    /// Every field is carried over unchanged (`decode_size_limit` becomes
    /// `max_decode_bytes`), except for two that are not taken from the input:
    /// `validate_decode` is always `true` and `multiline` always uses its
    /// default configuration.
    fn from(config: keyhog_core::config::ScanConfig) -> Self {
        // Pull out the fields that are carried over; anything else on the
        // core config is ignored.
        let keyhog_core::config::ScanConfig {
            max_decode_depth,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            decode_size_limit,
            max_matches_per_chunk,
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
            ..
        } = config;

        Self {
            max_decode_depth,
            validate_decode: true,
            entropy_enabled,
            entropy_threshold,
            entropy_in_source_files,
            ml_enabled,
            ml_weight,
            min_confidence,
            unicode_normalization,
            max_decode_bytes: decode_size_limit,
            max_matches_per_chunk,
            multiline: crate::multiline::MultilineConfig::default(),
            known_prefixes,
            secret_keywords,
            test_keywords,
            placeholder_keywords,
        }
    }
}
/// A match held in `ScanState::ml_pending` (only compiled with the `ml` feature).
///
/// NOTE(review): the code that fills and drains these entries is outside this
/// section; field notes marked "presumably" are inferred from names.
#[cfg(feature = "ml")]
#[derive(Debug, Clone)]
pub struct MlPendingMatch {
/// The raw match this entry refers to.
pub raw_match: keyhog_core::RawMatch,
/// Heuristic confidence (presumably computed before ML scoring — confirm).
pub heuristic_conf: f64,
/// Code context associated with the match.
pub code_context: crate::context::CodeContext,
/// Credential text (presumably the string given to the ML model — confirm).
pub credential: String,
/// Context text (presumably the surrounding lines given to the ML model — confirm).
pub ml_context: String,
}
/// Mutable state accumulated while scanning: collected matches, string
/// interners and, with the `ml` feature, ML bookkeeping.
#[derive(Default)]
pub struct ScanState {
/// Collected matches, wrapped in `Reverse` so the heap's top is the smallest
/// match by `RawMatch`'s ordering; `push_match` relies on this to replace it.
pub matches: BinaryHeap<Reverse<keyhog_core::RawMatch>>,
/// Shared strings handed out by `intern_credential`.
pub credential_interner: HashSet<Arc<str>>,
/// Shared strings handed out by `intern_metadata`, keyed by their text.
pub metadata_interner: HashMap<String, Arc<str>>,
/// ML scores keyed by a pair of strings (presumably credential and context — confirm).
#[cfg(feature = "ml")]
pub ml_score_cache: HashMap<(String, String), f64>,
/// Cache keys in a queue (presumably insertion order, used for eviction — confirm).
#[cfg(feature = "ml")]
pub ml_cache_order: VecDeque<(String, String)>,
/// Byte counter for the ML cache (presumably the total size of cached keys — confirm).
#[cfg(feature = "ml")]
pub ml_cache_bytes: usize,
/// Matches held back as `MlPendingMatch` entries (presumably awaiting ML scoring — confirm).
#[cfg(feature = "ml")]
pub ml_pending: Vec<MlPendingMatch>,
}
impl ScanState {
    /// Returns a shared `Arc<str>` equal to `s`, reusing the copy already in
    /// `credential_interner` when there is one and storing a new one otherwise.
    pub fn intern_credential(&mut self, s: &str) -> Arc<str> {
        if let Some(interned) = self.credential_interner.get(s) {
            return Arc::clone(interned);
        }
        let fresh: Arc<str> = Arc::from(s);
        self.credential_interner.insert(Arc::clone(&fresh));
        fresh
    }

    /// Returns a shared `Arc<str>` equal to `s`, reusing the copy already in
    /// `metadata_interner` when there is one and storing a new one otherwise.
    pub fn intern_metadata(&mut self, s: &str) -> Arc<str> {
        if let Some(interned) = self.metadata_interner.get(s) {
            return Arc::clone(interned);
        }
        let fresh: Arc<str> = Arc::from(s);
        self.metadata_interner
            .insert(s.to_owned(), Arc::clone(&fresh));
        fresh
    }

    /// Records `m` while keeping the heap bounded by `limit`.
    ///
    /// While fewer than `limit` matches are stored, `m` is simply added. Once
    /// the bound is reached, `m` replaces the smallest stored match only if it
    /// compares strictly greater; otherwise it is discarded. With a `limit` of
    /// 0 and an empty heap nothing is ever stored.
    pub fn push_match(&mut self, m: keyhog_core::RawMatch, limit: usize) {
        if self.matches.len() < limit {
            self.matches.push(Reverse(m));
            return;
        }
        // The heap holds `Reverse` values, so its top is the smallest match.
        match self.matches.peek_mut() {
            Some(mut smallest) if m > smallest.0 => *smallest = Reverse(m),
            _ => {}
        }
    }

    /// Consumes the state and returns the stored matches sorted from greatest
    /// to smallest according to `RawMatch`'s ordering.
    pub fn into_matches(self) -> Vec<keyhog_core::RawMatch> {
        let mut ordered: Vec<_> = self
            .matches
            .into_iter()
            .map(|Reverse(raw)| raw)
            .collect();
        ordered.sort_by(|left, right| right.cmp(left));
        ordered
    }
}