use crate::context;
use crate::types::*;
use keyhog_core::{Chunk, MatchLocation, RawMatch};
use std::borrow::Cow;
use std::collections::HashMap;
/// Assembles the [`RawMatch`] record for a single detected credential.
///
/// Detector and chunk-metadata strings are passed through `scan_state.intern_metadata`,
/// the credential text through `scan_state.intern_credential`, and the credential digest
/// is produced by `crate::sha256_hash`. `line`, `ent` and `confidence` are always stored
/// as `Some(..)`; optional chunk metadata (path, commit, author, date) stays `None` when
/// the chunk does not carry it.
pub fn build_raw_match(
    detector: &keyhog_core::DetectorSpec,
    chunk: &Chunk,
    credential: &str,
    companions: HashMap<String, String>,
    offset: usize,
    line: usize,
    ent: f64,
    confidence: f64,
    scan_state: &mut ScanState,
) -> RawMatch {
    let metadata = &chunk.metadata;

    // The interning calls below are issued in the same order as the fields of the
    // resulting record, so the sequence of `scan_state` mutations is unchanged.
    let detector_id = scan_state.intern_metadata(&detector.id);
    let detector_name = scan_state.intern_metadata(&detector.name);
    let service = scan_state.intern_metadata(&detector.service);
    let credential_hash = crate::sha256_hash(credential);
    let credential = scan_state.intern_credential(credential);

    let source = scan_state.intern_metadata(&metadata.source_type);
    let file_path = metadata
        .path
        .as_ref()
        .map(|path| scan_state.intern_metadata(path));
    let commit = metadata
        .commit
        .as_ref()
        .map(|commit| scan_state.intern_metadata(commit));
    let author = metadata
        .author
        .as_ref()
        .map(|author| scan_state.intern_metadata(author));
    let date = metadata
        .date
        .as_ref()
        .map(|date| scan_state.intern_metadata(date));

    RawMatch {
        detector_id,
        detector_name,
        service,
        severity: detector.severity,
        credential_hash,
        credential,
        companions,
        location: MatchLocation {
            source,
            file_path,
            line: Some(line),
            offset,
            commit,
            author,
            date,
        },
        entropy: Some(ent),
        confidence: Some(confidence),
    }
}
/// Returns the lines of `text` around `line`, joined with `\n`.
///
/// `line` is interpreted as a 1-based line number: the window covers lines
/// `line - radius ..= line + radius`, clamped to the lines that actually exist.
/// Line terminators are those recognised by [`str::lines`].
///
/// An empty `text`, or a `line` so far past the end that the window contains no
/// existing line, yields an empty string instead of panicking.
pub fn local_context_window(text: &str, line: usize, radius: usize) -> String {
    // Zero-based index of the first line in the window.
    let first = line.saturating_sub(radius).saturating_sub(1);
    // Zero-based exclusive end of the window; saturate so huge inputs cannot overflow.
    let end = line.saturating_add(radius);

    // `skip`/`take` clamp to the available lines on their own, so a window that
    // starts beyond the last line simply produces nothing.
    text.lines()
        .skip(first)
        .take(end.saturating_sub(first))
        .collect::<Vec<_>>()
        .join("\n")
}
/// Returns the byte offset at which each line of `text` begins.
///
/// The first entry is always `0`; every `\n` contributes one further entry pointing at
/// the byte right after it (so a trailing newline produces an offset equal to
/// `text.len()`).
pub fn compute_line_offsets(text: &str) -> Vec<usize> {
    let after_each_newline = text
        .match_indices('\n')
        .map(|(newline_at, _)| newline_at + 1);

    std::iter::once(0).chain(after_each_newline).collect()
}
/// Resolves the line number that `offset` belongs to.
///
/// The answer from `preprocessed.line_for_offset` is used when it is available.
/// Otherwise the result is the index of the first entry in `line_offsets` that is
/// strictly greater than `offset`, or `line_offsets.len()` when no entry is greater.
/// NOTE(review): with ascending line-start offsets this fallback is a 1-based line
/// number — confirm callers always pass offsets in that form.
pub fn match_line_number(
    preprocessed: &ScannerPreprocessedText,
    line_offsets: &[usize],
    offset: usize,
) -> usize {
    if let Some(mapped_line) = preprocessed.line_for_offset(offset) {
        return mapped_line;
    }

    // Fallback: count how many line starts are at or before `offset`.
    let first_later_line = line_offsets
        .iter()
        .position(|&line_start| line_start > offset);
    match first_later_line {
        Some(index) => index,
        None => line_offsets.len(),
    }
}
/// Returns the chunk that should be scanned: `chunk` itself when
/// `crate::normalize_chunk_data` leaves its data borrowed, or a normalized copy otherwise.
///
/// When normalization produces owned data, a new [`Chunk`] holding that data and a clone
/// of the original metadata is stored in `owned` (replacing whatever was there), and the
/// returned reference points into `owned`.
pub fn normalize_scannable_chunk<'a>(chunk: &'a Chunk, owned: &'a mut Option<Chunk>) -> &'a Chunk {
    match crate::normalize_chunk_data(&chunk.data) {
        // Nothing changed: scan the caller's chunk directly.
        Cow::Borrowed(_) => chunk,
        // Park the rewritten chunk in the caller-provided slot so it outlives this call.
        Cow::Owned(data) => &*owned.insert(Chunk {
            data,
            metadata: chunk.metadata.clone(),
        }),
    }
}
/// Reports whether `token` occurs in `upper` as a standalone token.
///
/// An occurrence counts when the character immediately before it and the character
/// immediately after it are each either absent (string edge) or non-alphanumeric.
/// The comparison is case-sensitive; callers are expected to pass already
/// upper-cased text when they want case-insensitive matching.
fn upper_contains_token(upper: &str, token: &str) -> bool {
    let is_boundary = |neighbor: Option<char>| !neighbor.is_some_and(char::is_alphanumeric);

    upper.match_indices(token).any(|(idx, matched)| {
        // `idx` is a byte offset, so look at the neighbours by slicing at that byte
        // position rather than by counting characters; the two differ as soon as a
        // multi-byte character precedes the match.
        let before = upper[..idx].chars().next_back();
        let after = upper[idx + matched.len()..].chars().next();
        is_boundary(before) && is_boundary(after)
    })
}
/// Heuristically decides whether `credential` is a placeholder or sample value rather
/// than a real secret, in which case the match should be suppressed.
///
/// The checks run in order and the first one that fires returns `true`:
///
/// 1. A placeholder word (`DUMMY`, `PLACEHOLDER`, `FAKE`, `MOCK`, `SAMPLE`) appears as a
///    standalone token in the upper-cased credential.
/// 2. `EXAMPLE` appears as a standalone token and the credential contains neither
///    `example.com` nor `example.org`.
/// 3. An instructional fragment (`YOUR_`, `YOUR-`, `INSERT`, `CHANGE`, `REPLACE`) starts
///    at the beginning of the credential or right after a non-alphanumeric character.
/// 4. Filler shapes: `XXXXX` anywhere; a run of three identical characters in a
///    credential shorter than 20 bytes; a run of five identical non-`-` characters; a
///    credential made up only of `x`, `X`, `*`, `-`, `.`; or at least 8 bytes of purely
///    non-alphanumeric characters drawn from at most two distinct characters.
/// 5. A well-known sequence (`1234567890`, `0123456789`, `ABCDEFGH`, `ABCDEFGHIJ`) is
///    present and its length exceeds 40% of the credential's byte length.
/// 6. `TODO` or `FIXME` appears as a standalone token.
/// 7. `crate::context::is_known_example_credential` recognises the credential.
/// 8. In documentation or comment context, the credential (with non-alphanumeric edges
///    trimmed) is just a generic word such as `TOKEN` or `API_KEY`.
/// 9. `path` has an `example(s)`, `test(s)` or `fixture(s)` component and the credential
///    contains the `EXAMPLE` token.
///
/// Returns `false` when none of the heuristics match.
pub fn should_suppress_known_example_credential(
    credential: &str,
    path: Option<&str>,
    context: context::CodeContext,
) -> bool {
    let upper = credential.to_uppercase();

    const PLACEHOLDER_WORDS: &[&str] = &["DUMMY", "PLACEHOLDER", "FAKE", "MOCK", "SAMPLE"];
    if PLACEHOLDER_WORDS
        .iter()
        .any(|word| upper_contains_token(&upper, word))
    {
        return true;
    }

    // `example.com` / `example.org` are legitimate hosts inside URLs and connection
    // strings, so the bare EXAMPLE token is not enough on its own for those.
    if upper_contains_token(&upper, "EXAMPLE")
        && !credential.contains("example.com")
        && !credential.contains("example.org")
    {
        return true;
    }

    const INSTRUCTIONAL_FRAGMENTS: &[&str] = &["YOUR_", "YOUR-", "INSERT", "CHANGE", "REPLACE"];
    // `match_indices` yields byte offsets, so the preceding character is found by
    // slicing at that offset; indexing `chars()` with a byte offset would inspect the
    // wrong character whenever multi-byte characters come first.
    let follows_boundary = |idx: usize| {
        upper[..idx]
            .chars()
            .next_back()
            .is_none_or(|c| !c.is_alphanumeric())
    };
    if INSTRUCTIONAL_FRAGMENTS.iter().any(|frag| {
        upper
            .match_indices(*frag)
            .any(|(idx, _)| follows_boundary(idx))
    }) {
        return true;
    }

    if upper.contains("XXXXX") {
        return true;
    }
    if credential.len() < 20 && has_three_or_more_consecutive_identical(credential) {
        return true;
    }
    if has_n_or_more_consecutive_identical(credential, 5) {
        return true;
    }
    // Pure mask characters, e.g. `****` or `xxxx-xxxx`.
    if credential
        .chars()
        .all(|c| c == 'x' || c == 'X' || c == '*' || c == '-' || c == '.')
    {
        return true;
    }
    // Long runs of punctuation built from one or two symbols.
    if credential.len() >= 8
        && credential.chars().all(|c| !c.is_alphanumeric())
        && credential
            .chars()
            .collect::<std::collections::HashSet<_>>()
            .len()
            <= 2
    {
        return true;
    }

    const FAKE_SEQUENCES: &[&str] = &["1234567890", "0123456789", "ABCDEFGH", "ABCDEFGHIJ"];
    for seq in FAKE_SEQUENCES {
        if upper.contains(seq) {
            // Only suppress when the sequence dominates the credential; a short
            // sequential stretch inside a long value is not treated as a placeholder.
            let seq_ratio = seq.len() as f64 / credential.len().max(1) as f64;
            if seq_ratio > 0.4 {
                return true;
            }
        }
    }

    if upper_contains_token(&upper, "TODO") || upper_contains_token(&upper, "FIXME") {
        return true;
    }
    if crate::context::is_known_example_credential(credential) {
        return true;
    }

    if matches!(
        context,
        context::CodeContext::Documentation | context::CodeContext::Comment
    ) {
        let trimmed = credential.trim_matches(|c: char| !c.is_alphanumeric());
        let trimmed_upper = trimmed.to_uppercase();
        if matches!(
            trimmed_upper.as_str(),
            "TOKEN"
                | "KEY"
                | "SECRET"
                | "PASSWORD"
                | "API_KEY"
                | "API_TOKEN"
                | "YOUR_TOKEN"
                | "YOUR_API_KEY"
        ) {
            return true;
        }
    }

    if let Some(path) = path {
        let lower_path = path.to_lowercase();
        let is_example_path = lower_path.split(['/', '\\']).any(|component| {
            matches!(
                component,
                "example" | "examples" | "test" | "tests" | "fixture" | "fixtures"
            )
        });
        // An EXAMPLE token can only still be present here if the credential also
        // contains example.com / example.org (otherwise the earlier check returned);
        // inside example/test/fixture directories that is suppressed as well.
        if is_example_path && upper_contains_token(&upper, "EXAMPLE") {
            return true;
        }
    }

    false
}
/// Returns `true` when `s` contains the same character at least three times in a row.
fn has_three_or_more_consecutive_identical(s: &str) -> bool {
    let mut previous: Option<char> = None;
    let mut run_len = 0usize;

    for current in s.chars() {
        // Extend the current run or start a new one at length 1.
        run_len = if previous == Some(current) {
            run_len + 1
        } else {
            1
        };
        if run_len >= 3 {
            return true;
        }
        previous = Some(current);
    }

    false
}
/// Returns `true` when `s` contains a run of at least `n` identical characters, ignoring
/// runs made of `-`.
fn has_n_or_more_consecutive_identical(s: &str, n: usize) -> bool {
    let mut previous: Option<char> = None;
    let mut run_len = 0usize;

    for current in s.chars() {
        // Extend the current run or start a new one at length 1.
        run_len = if previous == Some(current) {
            run_len + 1
        } else {
            1
        };
        // Dash runs never count, however long they are.
        if current != '-' && run_len >= n {
            return true;
        }
        previous = Some(current);
    }

    false
}
/// Looks for a companion value near the primary match.
///
/// The search window runs from line
/// `primary_line.saturating_sub(within_lines) + FIRST_LINE_NUMBER` through line
/// `primary_line.saturating_add(within_lines)`; `line_window_offsets` turns that into a
/// slice of `preprocessed.text`. The companion regex is run over that slice and the
/// first acceptable capture is returned as an owned string.
///
/// A capture is acceptable when the configured capture group (or
/// `FIRST_CAPTURE_GROUP_INDEX` when none is configured) participated in the match, the
/// captured text is at most 4096 long as measured by the match's `len()`, and
/// `preprocessed.line_for_offset` places the capture's start inside the window.
///
/// Returns `None` when the window cannot be resolved or no capture qualifies.
pub fn find_companion(
    preprocessed: &ScannerPreprocessedText,
    primary_line: usize,
    companion: &CompiledCompanion,
) -> Option<String> {
    let lowest = primary_line.saturating_sub(companion.within_lines);
    let last_line = primary_line.saturating_add(companion.within_lines);
    let first_line = lowest + FIRST_LINE_NUMBER;

    let (window_start, window_end) = line_window_offsets(preprocessed, first_line, last_line)?;
    let haystack = &preprocessed.text[window_start..window_end];
    let group = companion
        .capture_group
        .unwrap_or(FIRST_CAPTURE_GROUP_INDEX);

    companion
        .regex
        .captures_iter(haystack)
        .filter_map(|captures| captures.get(group))
        // Oversized captures are skipped rather than returned.
        .filter(|m| m.len() <= 4096)
        .find(|m| {
            // Match offsets are relative to the window; shift back into the full text.
            preprocessed
                .line_for_offset(window_start + m.start())
                .is_some_and(|line| (first_line..=last_line).contains(&line))
        })
        .map(|m| m.as_str().to_string())
}
/// Converts an inclusive line range into a `(start, end)` offset pair using
/// `preprocessed.mappings`.
///
/// The start is the `start_offset` of the first mapping whose `line_number` is at least
/// `start_line`; the end is the `end_offset` of the last mapping whose `line_number` is
/// at most `end_line`.
///
/// Returns `None` when either bound has no mapping, or when the two bounds come out
/// inverted (start greater than end). The latter happens when no mapping actually lies
/// inside the requested range, so the bounds were taken from unrelated lines on either
/// side of it; handing such a pair to a caller that slices with it would panic.
pub fn line_window_offsets(
    preprocessed: &ScannerPreprocessedText,
    start_line: usize,
    end_line: usize,
) -> Option<(usize, usize)> {
    let mut start_offset = None;
    let mut end_offset = None;
    for mapping in &preprocessed.mappings {
        // Keep only the first mapping at or after `start_line`.
        if start_offset.is_none() && mapping.line_number >= start_line {
            start_offset = Some(mapping.start_offset);
        }
        // Keep overwriting so the last mapping at or before `end_line` wins.
        if mapping.line_number <= end_line {
            end_offset = Some(mapping.end_offset);
        }
    }

    let (window_start, window_end) = (start_offset?, end_offset?);
    // An empty window (start == end) is still a valid slice; an inverted one is not.
    (window_start <= window_end).then_some((window_start, window_end))
}
/// Reports whether the match at `data[match_start..match_end]` sits inside a longer run
/// of formatted hex digits.
///
/// Returns `false` when the bounds are invalid (see `valid_match_bounds`), when the match
/// is shorter than `MIN_HEX_MATCH_LEN` bytes, or when it contains fewer than
/// `MIN_HEX_DIGITS_IN_MATCH` ASCII hex digits. Otherwise the text on both sides of the
/// match is inspected with `formatted_hex_run`, and the result is `true` only when each
/// side contributes at least `MIN_HEX_CONTEXT_DIGITS` hex digits.
pub fn is_within_hex_context(data: &str, match_start: usize, match_end: usize) -> bool {
    if !valid_match_bounds(data, match_start, match_end) {
        return false;
    }

    let candidate = &data[match_start..match_end];
    if candidate.len() < MIN_HEX_MATCH_LEN {
        return false;
    }
    let hex_in_match = candidate.chars().filter(char::is_ascii_hexdigit).count();
    if hex_in_match < MIN_HEX_DIGITS_IN_MATCH {
        return false;
    }

    let (prefix, suffix) = surrounding_hex_context(data, match_start, match_end);
    // Walk outwards from the match on both sides: backwards through the prefix,
    // forwards through the suffix.
    let leading_digits = formatted_hex_run(prefix.chars().rev());
    let trailing_digits = formatted_hex_run(suffix.chars());

    leading_digits >= MIN_HEX_CONTEXT_DIGITS && trailing_digits >= MIN_HEX_CONTEXT_DIGITS
}
/// Checks that `match_start..match_end` is a non-empty range whose two ends both fall on
/// UTF-8 character boundaries of `data` (which also rules out ends beyond `data.len()`).
fn valid_match_bounds(data: &str, match_start: usize, match_end: usize) -> bool {
    if match_start >= match_end {
        return false;
    }
    [match_start, match_end]
        .into_iter()
        .all(|position| data.is_char_boundary(position))
}
/// Returns the text immediately before and immediately after the match as
/// `(before, after)`.
///
/// Each side reaches up to `HEX_CONTEXT_RADIUS_CHARS` away from the match, with the
/// radius applied to byte offsets. The lower edge is passed through
/// `crate::engine::floor_char_boundary`; the upper edge is moved forward to the next
/// character boundary (or the end of `data`) so the slices are valid.
///
/// `match_start` and `match_end` are used directly as slice bounds, so they must already
/// be character boundaries of `data`.
fn surrounding_hex_context(data: &str, match_start: usize, match_end: usize) -> (&str, &str) {
    let data_len = data.len();

    let lower = crate::engine::floor_char_boundary(
        data,
        match_start.saturating_sub(HEX_CONTEXT_RADIUS_CHARS),
    );

    // Start at the radius (clamped to the text) and advance to the first position that
    // is a character boundary; the end of the text always qualifies.
    let upper_guess = (match_end + HEX_CONTEXT_RADIUS_CHARS).min(data_len);
    let upper = (upper_guess..data_len)
        .find(|&position| data.is_char_boundary(position))
        .unwrap_or(data_len);

    let before = &data[lower..match_start];
    let after = &data[match_end..upper];
    (before, after)
}
/// Counts the ASCII hex digits at the front of `iter`, tolerating a limited number of
/// formatting separators (space, tab, `:`, `-`).
///
/// Separators seen before the first hex digit are always accepted; once a hex digit has
/// been seen, a separator is accepted only while the total number of separators consumed
/// so far (leading ones included) is below `MAX_HEX_CONTEXT_SEPARATORS`. Any other
/// character, or a separator over that budget, ends the run.
fn formatted_hex_run(iter: impl Iterator<Item = char>) -> usize {
    let mut digit_count = 0usize;
    let mut separator_count = 0usize;
    let mut digit_seen = false;

    for ch in iter {
        match ch {
            c if c.is_ascii_hexdigit() => {
                digit_count += 1;
                digit_seen = true;
            }
            ' ' | '\t' | ':' | '-'
                if !digit_seen || separator_count < MAX_HEX_CONTEXT_SEPARATORS =>
            {
                separator_count += 1;
            }
            _ => break,
        }
    }

    digit_count
}
/// Computes the entropy score for `data`.
///
/// With the `entropy` feature enabled this delegates to
/// `crate::entropy::shannon_entropy`; without it, the local `fallback_entropy`
/// implementation is used instead.
pub fn match_entropy(data: &[u8]) -> f64 {
    // Exactly one of the two bindings below is compiled in, depending on the feature.
    #[cfg(feature = "entropy")]
    let bits = crate::entropy::shannon_entropy(data);
    #[cfg(not(feature = "entropy"))]
    let bits = fallback_entropy(data);

    bits
}
/// Shannon entropy of `data` in bits per byte, computed from a 256-bucket byte histogram.
///
/// Only compiled when the `entropy` feature is disabled. Returns `0.0` for empty input.
#[cfg(not(feature = "entropy"))]
fn fallback_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    let mut histogram = [0u64; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let total = data.len() as f64;
    // Accumulate in bucket order, subtracting each term, so the floating-point result
    // is reproducible for a given input.
    histogram
        .iter()
        .filter(|&&occurrences| occurrences > 0)
        .fold(0.0, |bits, &occurrences| {
            let probability = occurrences as f64 / total;
            bits - probability * probability.log2()
        })
}