use super::Decoder;
use super::base64::{Base64Decoder, Z85Decoder};
use super::hex::HexDecoder;
use super::url::{
HexEscapeDecoder, HtmlNamedEntityDecoder, HtmlNumericEntityDecoder, MimeEncodedWordDecoder,
OctalEscapeDecoder, QuotedPrintableDecoder, UnicodeEscapeDecoder, UrlDecoder,
};
use keyhog_core::{Chunk, ChunkMetadata};
use std::collections::{HashSet, VecDeque};
/// Process-wide decoder registry.
///
/// Written at most once: either lazily by `get_decoders` (built-in decoders
/// only) or by `register_decoder` (built-ins plus one custom decoder),
/// whichever runs first.
static DECODERS: std::sync::OnceLock<Vec<Box<dyn Decoder>>> =
std::sync::OnceLock::new();
/// Upper bound on the number of decoded chunks `decode_chunk` returns for a
/// single root chunk.
const MAX_DECODED_CHUNKS_PER_ROOT: usize = 1000;
/// Upper bound (64 MiB) on the cumulative `data` length of the chunks
/// `decode_chunk` accepts for a single root chunk.
const MAX_DECODED_TOTAL_BYTES: usize = 64 * 1024 * 1024;
/// Builds the built-in decoder set, in the order the decoders are applied.
///
/// Single source of truth for both `get_decoders` and `register_decoder`, so
/// the two initialization paths cannot drift apart.
fn default_decoder_set() -> Vec<Box<dyn Decoder>> {
    vec![
        Box::new(Base64Decoder),
        Box::new(HexDecoder),
        Box::new(UrlDecoder),
        Box::new(QuotedPrintableDecoder),
        Box::new(HtmlNamedEntityDecoder),
        Box::new(HtmlNumericEntityDecoder),
        Box::new(HexEscapeDecoder),
        Box::new(OctalEscapeDecoder),
        Box::new(MimeEncodedWordDecoder),
        Box::new(UnicodeEscapeDecoder),
        Box::new(Z85Decoder),
    ]
}

/// Returns the decoder registry, initializing it with the built-in decoders
/// on first use.
fn get_decoders() -> &'static [Box<dyn Decoder>] {
    DECODERS.get_or_init(default_decoder_set)
}

/// Registers a custom decoder, appended after the built-in decoders.
///
/// The registry can only be written once, so this must be called before the
/// first scan initializes it. Because a successful call itself initializes
/// the registry, only one custom decoder can be registered per process.
/// If the registry is already initialized — including when another thread
/// wins the initialization race while this call is in progress — the decoder
/// is dropped and a warning is logged.
pub fn register_decoder(decoder: Box<dyn Decoder>) {
    if DECODERS.get().is_none() {
        let mut decoders = default_decoder_set();
        decoders.push(decoder);
        // `set` fails when a concurrent caller initialized the registry after
        // the check above; fall through to the warning instead of silently
        // discarding the decoder.
        if DECODERS.set(decoders).is_ok() {
            return;
        }
    }
    tracing::warn!("register_decoder called after initialization — decoder ignored. Fix: register custom decoders before scanning.");
}
/// Recursively decodes `chunk` with every registered decoder, breadth-first.
///
/// Each decoder is run on the root chunk and then on every newly produced
/// chunk, up to `max_depth` decoding layers. Results are de-duplicated by a
/// 64-bit hash of their text (the root's text counts as already seen).
///
/// # Parameters
/// - `chunk`: root chunk to decode; it is not included in the result.
/// - `max_depth`: maximum number of nested decoding layers; `0` yields no
///   output.
/// - `_validate`: currently unused.
/// - `deadline`: when set, no further queued chunk is expanded once the
///   deadline has passed; chunks found so far are returned.
/// - `screen`: when set, decoded chunks whose bytes fail the screen are
///   dropped (and not decoded further).
///
/// # Returns
/// The decoded chunks in discovery order. Decoding stops early, with a
/// warning, once `MAX_DECODED_CHUNKS_PER_ROOT` chunks were collected or the
/// accepted data exceeds `MAX_DECODED_TOTAL_BYTES`.
pub fn decode_chunk(
    chunk: &Chunk,
    max_depth: usize,
    _validate: bool,
    deadline: Option<std::time::Instant>,
    screen: Option<&crate::alphabet_filter::AlphabetScreen>,
) -> Vec<Chunk> {
    let mut decoded_chunks = Vec::new();
    let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
    let mut seen = HashSet::from([hash_fast(chunk.data.as_bytes())]);
    let mut total_bytes = 0usize;
    let registry = get_decoders();

    while let Some((current, depth)) = queue.pop_front() {
        if deadline.is_some_and(|limit| std::time::Instant::now() > limit) {
            break;
        }
        if depth >= max_depth {
            continue;
        }
        let child_depth = depth + 1;
        // Children sitting at the depth limit would be popped and skipped
        // without being decoded, so they are never enqueued (saves a clone).
        let expand_children = child_depth < max_depth;

        for decoder in registry.iter() {
            for decoded in decoder.decode_chunk(&current) {
                // Skip text that was already produced along another path.
                if !seen.insert(hash_fast(decoded.data.as_bytes())) {
                    continue;
                }
                if screen.is_some_and(|filter| !filter.screen(decoded.data.as_bytes())) {
                    continue;
                }
                total_bytes += decoded.data.len();
                if decoded_chunks.len() >= MAX_DECODED_CHUNKS_PER_ROOT
                    || total_bytes > MAX_DECODED_TOTAL_BYTES
                {
                    tracing::warn!(
                        path = ?chunk.metadata.path,
                        "Recursive decoding limit reached. Fix: reduce decode depth or decode size limits"
                    );
                    return decoded_chunks;
                }
                if expand_children {
                    queue.push_back((decoded.clone(), child_depth));
                }
                decoded_chunks.push(decoded);
            }
        }
    }
    decoded_chunks
}
/// Appends `text` to `decoded_chunks` as a new chunk derived from `chunk`.
///
/// Nothing is appended when `text` is empty or contains a control character
/// other than `\n`, `\r` or `\t`. The new chunk copies `path`, `commit`,
/// `author` and `date` from `chunk`, and its `source_type` is the parent's
/// `source_type` followed by `/` and `decoder_name`.
pub(super) fn push_decoded_text_chunk(
    decoded_chunks: &mut Vec<Chunk>,
    chunk: &Chunk,
    text: String,
    decoder_name: &str,
) {
    // Control characters other than the common line/tab whitespace.
    let is_disallowed = |c: char| c.is_control() && !matches!(c, '\n' | '\r' | '\t');
    if text.is_empty() || text.chars().any(is_disallowed) {
        return;
    }

    let parent = &chunk.metadata;
    let metadata = ChunkMetadata {
        source_type: format!("{}/{}", parent.source_type, decoder_name),
        path: parent.path.clone(),
        commit: parent.commit.clone(),
        author: parent.author.clone(),
        date: parent.date.clone(),
    };
    decoded_chunks.push(Chunk {
        data: text,
        metadata,
    });
}
/// Runs `decode` over every candidate and collects the successful results as
/// chunks derived from `chunk`.
///
/// Candidates for which `decode` returns `Err` are skipped. Successful
/// outputs are handed to `push_decoded_text_chunk` in candidate order, so its
/// filtering applies and `decoder_name` ends up in each chunk's
/// `source_type`.
pub(super) fn decode_candidates<F>(
    chunk: &Chunk,
    candidates: Vec<String>,
    mut decode: F,
    decoder_name: &str,
) -> Vec<Chunk>
where
    F: FnMut(&str) -> Result<String, ()>,
{
    let mut produced = Vec::new();
    candidates
        .into_iter()
        .filter_map(|candidate| decode(&candidate).ok())
        .for_each(|text| push_decoded_text_chunk(&mut produced, chunk, text, decoder_name));
    produced
}
/// Scans `text` for substrings that may hold encoded data.
///
/// Three kinds of candidates are returned, in the order they are found:
///
/// - **Quoted values**: the content between matching `"`, `'` or `` ` ``
///   quotes, with unescaped ASCII whitespace removed and each backslash
///   escape replaced by the escaped character itself. Kept when at least
///   4 bytes long; an unterminated quote yields nothing.
/// - **Assigned values**: the token following `:` or `=` (after optional
///   ASCII whitespace), ending at ASCII whitespace, `;`, `,` or a quote
///   character. Kept when at least 4 bytes long.
/// - **Bare runs**: consecutive ASCII alphanumerics and `+`, `/`, `-`, `_`.
///   Whitespace inside a run is skipped without ending it; any other
///   character ends it. Kept when at least 16 bytes long.
pub(super) fn extract_encoded_values(text: &str) -> Vec<String> {
    // Minimum length for a bare run to be reported.
    const MIN_RUN_LEN: usize = 16;
    // Minimum length for a quoted or assigned value to be reported.
    const MIN_VALUE_LEN: usize = 4;

    // Emits the pending bare run if it is long enough; leaves it empty either way.
    fn flush_run(run: &mut String, out: &mut Vec<String>) {
        if run.len() >= MIN_RUN_LEN {
            out.push(std::mem::take(run));
        } else {
            run.clear();
        }
    }

    let mut found = Vec::new();
    let mut run = String::new();
    let mut stream = text.chars().peekable();

    while let Some(ch) = stream.next() {
        match ch {
            // Quoted value: consume through the matching closing quote.
            '"' | '\'' | '`' => {
                flush_run(&mut run, &mut found);
                let mut body = String::with_capacity(32);
                let mut after_backslash = false;
                for inner in stream.by_ref() {
                    if after_backslash {
                        body.push(inner);
                        after_backslash = false;
                    } else if inner == '\\' {
                        after_backslash = true;
                    } else if inner == ch {
                        if body.len() >= MIN_VALUE_LEN {
                            found.push(body);
                        }
                        break;
                    } else if !inner.is_ascii_whitespace() {
                        body.push(inner);
                    }
                }
            }
            // Assigned value: `=` is handled here, so it never joins a bare run.
            ':' | '=' => {
                flush_run(&mut run, &mut found);
                while stream.next_if(|c| c.is_ascii_whitespace()).is_some() {}
                let mut value = String::with_capacity(32);
                // The terminator is left in the stream so a following quote
                // is still processed as a quoted value.
                while let Some(c) = stream.next_if(|&c| {
                    !(c.is_ascii_whitespace() || matches!(c, ';' | ',' | '"' | '\'' | '`'))
                }) {
                    value.push(c);
                }
                if value.len() >= MIN_VALUE_LEN {
                    found.push(value);
                }
            }
            c if c.is_ascii_alphanumeric() || matches!(c, '+' | '/' | '-' | '_') => run.push(c),
            // Whitespace neither extends nor terminates a bare run.
            c if c.is_whitespace() => {}
            _ => flush_run(&mut run, &mut found),
        }
    }

    if run.len() >= MIN_RUN_LEN {
        found.push(run);
    }
    found
}
/// Computes the 64-bit FNV-1a hash of `data`.
///
/// Non-cryptographic; used to de-duplicate decoded text cheaply.
fn hash_fast(data: &[u8]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    data.iter().fold(FNV_OFFSET_BASIS, |acc, &byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    })
}