use super::base64::{Base64Decoder, Z85Decoder};
use super::caesar::CaesarDecoder;
use super::hex::HexDecoder;
use super::json::JsonDecoder;
use super::reverse::ReverseDecoder;
use super::url::{
HexEscapeDecoder, HtmlNamedEntityDecoder, HtmlNumericEntityDecoder, MimeEncodedWordDecoder,
OctalEscapeDecoder, QuotedPrintableDecoder, UnicodeEscapeDecoder, UrlDecoder,
};
use super::Decoder;
use keyhog_core::{Chunk, ChunkMetadata};
use std::collections::{HashSet, VecDeque};
/// Process-wide decoder registry. It is written at most once: either lazily by
/// `get_decoders` (built-ins only) or eagerly by `register_decoder` (built-ins
/// plus one custom decoder).
static DECODERS: std::sync::OnceLock<Vec<Box<dyn Decoder>>> = std::sync::OnceLock::new();
/// `decode_chunk` stops and returns once this many decoded chunks have been
/// collected for a single root chunk.
const MAX_DECODED_CHUNKS_PER_ROOT: usize = 1000;
/// Cap on the cumulative size (in bytes) of decoded data accounted for by a
/// single `decode_chunk` call: 64 MiB.
const MAX_DECODED_TOTAL_BYTES: usize = 64 * 1024 * 1024;
/// Wall-clock ceiling, in milliseconds, applied to every `decode_chunk` call.
/// A caller-supplied deadline can only make the effective deadline earlier.
const DEFAULT_DECODE_WALL_BUDGET_MS: u64 = 50;
/// Builds a fresh list of the built-in decoders, in the order they are tried.
///
/// This is the single source of truth for the built-in set; both
/// [`get_decoders`] and [`register_decoder`] start from it so the two
/// initialization paths cannot drift apart.
fn builtin_decoders() -> Vec<Box<dyn Decoder>> {
    vec![
        Box::new(Base64Decoder),
        Box::new(HexDecoder),
        Box::new(UrlDecoder),
        Box::new(QuotedPrintableDecoder),
        Box::new(HtmlNamedEntityDecoder),
        Box::new(HtmlNumericEntityDecoder),
        Box::new(HexEscapeDecoder),
        Box::new(OctalEscapeDecoder),
        Box::new(MimeEncodedWordDecoder),
        Box::new(UnicodeEscapeDecoder),
        Box::new(JsonDecoder),
        Box::new(Z85Decoder),
        Box::new(ReverseDecoder),
        Box::new(CaesarDecoder),
    ]
}

/// Returns the process-wide decoder registry.
///
/// On first use the registry is initialized with the built-in decoders only;
/// after that the same slice is returned for the lifetime of the process.
fn get_decoders() -> &'static [Box<dyn Decoder>] {
    DECODERS.get_or_init(builtin_decoders)
}

/// Registers a custom decoder, appended after the built-in ones.
///
/// The registry can be written only once, so this must be called before the
/// first scan (i.e. before [`get_decoders`] runs). As a consequence only the
/// first successful call takes effect: any later call finds the registry
/// already initialized, logs a warning and drops `decoder`.
pub fn register_decoder(decoder: Box<dyn Decoder>) {
    if DECODERS.get().is_some() {
        tracing::warn!("register_decoder called after initialization: decoder ignored. Fix: register custom decoders before scanning.");
        return;
    }

    let mut decoders = builtin_decoders();
    decoders.push(decoder);

    // Another thread may have initialized the registry between the check above
    // and this `set`. Surface that instead of silently discarding the decoder.
    if DECODERS.set(decoders).is_err() {
        tracing::warn!("register_decoder lost an initialization race: decoder ignored. Fix: register custom decoders before scanning.");
    }
}
/// Runs every registered decoder over `chunk` and, breadth-first, over the
/// decoders' own outputs, returning the newly decoded chunks.
///
/// * `chunk` – the root chunk to decode; it is never included in the result.
/// * `max_depth` – maximum number of nested decode layers. `0` yields nothing.
/// * `validate` – when `true`, decoded outputs containing a NUL byte are dropped.
/// * `deadline` – optional caller deadline. The effective deadline is the
///   earlier of this value and a local ceiling of
///   `DEFAULT_DECODE_WALL_BUDGET_MS` milliseconds from the start of the call.
/// * `screen` – optional filter. Outputs that fail it are not returned, but
///   they are still fed to further decode layers.
///
/// Outputs are de-duplicated by `hash_fast` of their bytes. If either
/// `MAX_DECODED_CHUNKS_PER_ROOT` or `MAX_DECODED_TOTAL_BYTES` is reached, the
/// chunks collected so far are returned immediately.
pub fn decode_chunk(
    chunk: &Chunk,
    max_depth: usize,
    validate: bool,
    deadline: Option<std::time::Instant>,
    screen: Option<&crate::alphabet_filter::AlphabetScreen>,
) -> Vec<Chunk> {
    let mut decoded_chunks = Vec::new();
    let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
    let mut seen = HashSet::from([hash_fast(chunk.data.as_bytes())]);
    let mut total_bytes = 0usize;
    let registry = get_decoders();

    let local_ceiling =
        std::time::Instant::now() + std::time::Duration::from_millis(DEFAULT_DECODE_WALL_BUDGET_MS);
    let effective_deadline = deadline.map_or(local_ceiling, |d| d.min(local_ceiling));

    while let Some((current, depth)) = queue.pop_front() {
        if std::time::Instant::now() > effective_deadline {
            tracing::debug!(
                path = ?chunk.metadata.path,
                budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
                "decode budget exhausted; stopping decode-through"
            );
            break;
        }
        if depth >= max_depth {
            continue;
        }

        // Children that would sit at `max_depth` can never be expanded, so
        // they are not enqueued (and not cloned) at all.
        let child_depth = depth + 1;
        let expand_children = child_depth < max_depth;

        for decoder in registry.iter() {
            for decoded in decoder.decode_chunk(&current) {
                if !seen.insert(hash_fast(decoded.data.as_bytes())) {
                    continue;
                }
                if validate && decoded.data.as_bytes().contains(&0u8) {
                    continue;
                }
                let passes_screen = screen.map_or(true, |s| s.screen(decoded.data.as_bytes()));

                total_bytes += decoded.data.len();
                if decoded_chunks.len() >= MAX_DECODED_CHUNKS_PER_ROOT
                    || total_bytes > MAX_DECODED_TOTAL_BYTES
                {
                    tracing::debug!(
                        path = ?chunk.metadata.path,
                        "decode depth/size cap reached: chunk truncated to limit"
                    );
                    return decoded_chunks;
                }

                // Clone only when the chunk is needed in both places.
                match (expand_children, passes_screen) {
                    (true, true) => {
                        queue.push_back((decoded.clone(), child_depth));
                        decoded_chunks.push(decoded);
                    }
                    (true, false) => queue.push_back((decoded, child_depth)),
                    (false, true) => decoded_chunks.push(decoded),
                    (false, false) => {}
                }
            }
        }
    }
    decoded_chunks
}
/// Appends `text` to `decoded_chunks` as a chunk derived from `chunk`, using
/// the decoded text alone as the payload.
///
/// Delegates to [`push_decoded_text_chunk_spliced`] with an empty encoded
/// span, which makes that function skip its splice step.
pub(super) fn push_decoded_text_chunk(
    decoded_chunks: &mut Vec<Chunk>,
    chunk: &Chunk,
    text: String,
    decoder_name: &str,
) {
    const NO_ENCODED_SPAN: &str = "";
    push_decoded_text_chunk_spliced(decoded_chunks, chunk, NO_ENCODED_SPAN, text, decoder_name);
}
/// Appends a decoded chunk derived from `chunk` to `decoded_chunks`.
///
/// Nothing is pushed when `text` is empty or contains a byte below `0x20`
/// other than `\n`, `\r` or `\t`.
///
/// Payload selection:
/// * If `original_encoded` is non-empty, the parent data is at most 256 KiB
///   and the parent contains `original_encoded`, the payload is the parent
///   data with the first occurrence of `original_encoded` replaced by `text`.
/// * Otherwise the payload is `text` on its own.
///
/// The new chunk copies the parent's metadata, except that `source_type`
/// becomes `"<parent source_type>/<decoder_name>"`.
pub(super) fn push_decoded_text_chunk_spliced(
    decoded_chunks: &mut Vec<Chunk>,
    chunk: &Chunk,
    original_encoded: &str,
    text: String,
    decoder_name: &str,
) {
    // Parents larger than this are not spliced; the decoded text is used alone.
    const MAX_SPLICE_PARENT_BYTES: usize = 256 * 1024;

    let has_disallowed_control = text
        .bytes()
        .any(|b| b < 0x20 && !matches!(b, b'\n' | b'\r' | b'\t'));
    if text.is_empty() || has_disallowed_control {
        return;
    }

    // One pass over the parent: `split_once` locates the first occurrence and
    // hands back both sides, so there is no separate `contains` scan.
    let spliced = if !original_encoded.is_empty() && chunk.data.len() <= MAX_SPLICE_PARENT_BYTES {
        chunk
            .data
            .as_str()
            .split_once(original_encoded)
            .map(|(before, after)| {
                let mut joined = String::with_capacity(before.len() + text.len() + after.len());
                joined.push_str(before);
                joined.push_str(&text);
                joined.push_str(after);
                joined
            })
    } else {
        None
    };
    let payload = spliced.unwrap_or(text);

    decoded_chunks.push(Chunk {
        data: payload.into(),
        metadata: ChunkMetadata {
            base_offset: chunk.metadata.base_offset,
            source_type: format!("{}/{}", chunk.metadata.source_type, decoder_name),
            path: chunk.metadata.path.clone(),
            commit: chunk.metadata.commit.clone(),
            author: chunk.metadata.author.clone(),
            date: chunk.metadata.date.clone(),
            mtime_ns: chunk.metadata.mtime_ns,
            size_bytes: chunk.metadata.size_bytes,
        },
    });
}
/// Applies `decode` to each candidate in order and collects a decoded chunk
/// for every candidate that decodes successfully.
///
/// Candidates for which `decode` returns `Err` are skipped. Each successful
/// result is handed to [`push_decoded_text_chunk_spliced`] together with the
/// candidate it came from, so that function decides whether a chunk is
/// actually produced.
pub(super) fn decode_candidates<F>(
    chunk: &Chunk,
    candidates: Vec<String>,
    mut decode: F,
    decoder_name: &str,
) -> Vec<Chunk>
where
    F: FnMut(&str) -> Result<String, ()>,
{
    let mut out = Vec::new();
    // The chain is lazy, so each candidate is decoded and pushed before the
    // next one is decoded.
    candidates
        .into_iter()
        .filter_map(|encoded| decode(&encoded).ok().map(|plain| (encoded, plain)))
        .for_each(|(encoded, plain)| {
            push_decoded_text_chunk_spliced(&mut out, chunk, &encoded, plain, decoder_name);
        });
    out
}
mod extractor;
pub(super) use extractor::{extract_encoded_values, hash_fast};