use super::base64::{Base64Decoder, Z85Decoder};
use super::caesar::CaesarDecoder;
use super::hex::HexDecoder;
use super::json::JsonDecoder;
use super::reverse::ReverseDecoder;
use super::url::{
HexEscapeDecoder, HtmlNamedEntityDecoder, HtmlNumericEntityDecoder, MimeEncodedWordDecoder,
OctalEscapeDecoder, QuotedPrintableDecoder, UnicodeEscapeDecoder, UrlDecoder,
};
use super::Decoder;
use keyhog_core::{Chunk, ChunkMetadata};
use std::collections::{HashSet, VecDeque};
/// Process-wide decoder registry. It is populated at most once: either lazily with the
/// built-in decoders by `get_decoders`, or explicitly by `register_decoder`.
static DECODERS: std::sync::OnceLock<Vec<Box<dyn Decoder>>> = std::sync::OnceLock::new();
/// Upper bound on the number of newly seen decoded chunks `decode_chunk` accepts for one
/// root chunk before it stops and returns what it has.
const MAX_DECODED_CHUNKS_PER_ROOT: usize = 1000;
/// Upper bound (64 MiB) on the cumulative byte length of decoded chunks accepted by
/// `decode_chunk` for one root chunk.
const MAX_DECODED_TOTAL_BYTES: usize = 64 * 1024 * 1024;
/// Wall-clock ceiling, in milliseconds, that `decode_chunk` applies to every call; a
/// caller-supplied deadline can only shorten it.
const DEFAULT_DECODE_WALL_BUDGET_MS: u64 = 50;
/// Builds the built-in decoder set, in the order the decoders are applied.
///
/// This is the single source of truth for the defaults, so lazy initialization and
/// custom registration cannot drift apart.
fn default_decoders() -> Vec<Box<dyn Decoder>> {
    vec![
        Box::new(Base64Decoder),
        Box::new(HexDecoder),
        Box::new(UrlDecoder),
        Box::new(QuotedPrintableDecoder),
        Box::new(HtmlNamedEntityDecoder),
        Box::new(HtmlNumericEntityDecoder),
        Box::new(HexEscapeDecoder),
        Box::new(OctalEscapeDecoder),
        Box::new(MimeEncodedWordDecoder),
        Box::new(UnicodeEscapeDecoder),
        Box::new(JsonDecoder),
        Box::new(Z85Decoder),
        Box::new(ReverseDecoder),
        Box::new(CaesarDecoder),
    ]
}

/// Returns the process-wide decoder registry, initializing it with the built-in
/// decoders on first use.
fn get_decoders() -> &'static [Box<dyn Decoder>] {
    DECODERS.get_or_init(default_decoders)
}

/// Registers a custom decoder, appended after the built-in decoders.
///
/// The registry can be set only once. If it has already been initialized (by a previous
/// `register_decoder` call or by the first `get_decoders` call), the decoder is ignored
/// and a warning is logged. Consequently only the first registration made before any
/// scanning takes effect.
pub fn register_decoder(decoder: Box<dyn Decoder>) {
    let mut decoders = default_decoders();
    decoders.push(decoder);
    // `set` is the only authoritative "was it already initialized?" check: a separate
    // `get()` probe beforehand could race with lazy initialization on another thread and
    // the decoder would then be dropped without any diagnostic.
    if DECODERS.set(decoders).is_err() {
        tracing::warn!("register_decoder called after initialization: decoder ignored. Fix: register custom decoders before scanning.");
    }
}
/// Breadth-first decode-through of `chunk` using every registered decoder.
///
/// Starting from `chunk` at depth 0, each decoder is run over the current chunk and every
/// previously unseen output (deduplicated by `hash_fast` of its bytes) is collected and,
/// when depth allows, decoded again.
///
/// # Parameters
/// - `chunk`: root chunk to decode; it is never included in the result itself.
/// - `max_depth`: maximum number of nested decode layers; `0` yields no output.
/// - `validate`: when `true`, decoded chunks containing a NUL byte are discarded.
/// - `deadline`: optional caller deadline; the effective deadline is the earlier of this
///   and `now + DEFAULT_DECODE_WALL_BUDGET_MS`.
/// - `screen`: optional alphabet screen. Chunks that fail it are left out of the result
///   but are still decoded further.
///
/// # Returns
/// The decoded chunks accepted so far. Work stops early (returning the partial result)
/// when the deadline passes or when `MAX_DECODED_CHUNKS_PER_ROOT` /
/// `MAX_DECODED_TOTAL_BYTES` is exceeded.
pub fn decode_chunk(
    chunk: &Chunk,
    max_depth: usize,
    validate: bool,
    deadline: Option<std::time::Instant>,
    screen: Option<&crate::alphabet_filter::AlphabetScreen>,
) -> Vec<Chunk> {
    let mut decoded_chunks = Vec::new();
    let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
    let mut seen = HashSet::from([hash_fast(chunk.data.as_bytes())]);
    let mut total_bytes = 0usize;
    let mut produced = 0usize;
    let registry = get_decoders();

    // The local ceiling always applies; a caller deadline can only make it stricter.
    let local_ceiling =
        std::time::Instant::now() + std::time::Duration::from_millis(DEFAULT_DECODE_WALL_BUDGET_MS);
    let effective_deadline = match deadline {
        Some(d) => d.min(local_ceiling),
        None => local_ceiling,
    };

    while let Some((current, depth)) = queue.pop_front() {
        if std::time::Instant::now() > effective_deadline {
            tracing::debug!(
                path = ?chunk.metadata.path,
                budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
                "decode budget exhausted; stopping decode-through"
            );
            break;
        }
        if depth >= max_depth {
            continue;
        }

        // Children at `max_depth` would only be popped and skipped, so they are never
        // queued; this avoids cloning chunks that cannot be expanded.
        let child_depth = depth + 1;
        let expand_children = child_depth < max_depth;

        for decoder in registry.iter() {
            if std::time::Instant::now() > effective_deadline {
                tracing::debug!(
                    path = ?chunk.metadata.path,
                    budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
                    "decode budget exhausted mid-fan-out; stopping decode-through"
                );
                return decoded_chunks;
            }
            for decoded in decoder.decode_chunk(&current) {
                if std::time::Instant::now() > effective_deadline {
                    tracing::debug!(
                        path = ?chunk.metadata.path,
                        budget_ms = DEFAULT_DECODE_WALL_BUDGET_MS,
                        "decode budget exhausted while consuming decoder output; \
                         stopping decode-through"
                    );
                    return decoded_chunks;
                }
                // Skip payloads already produced by another decoder or layer.
                if !seen.insert(hash_fast(decoded.data.as_bytes())) {
                    continue;
                }
                if validate && decoded.data.as_bytes().contains(&0u8) {
                    continue;
                }
                let passes_screen = match screen {
                    Some(screen) => screen.screen(decoded.data.as_bytes()),
                    None => true,
                };

                produced += 1;
                total_bytes += decoded.data.len();
                if produced > MAX_DECODED_CHUNKS_PER_ROOT || total_bytes > MAX_DECODED_TOTAL_BYTES {
                    tracing::debug!(
                        path = ?chunk.metadata.path,
                        "decode depth/size cap reached: chunk truncated to limit"
                    );
                    return decoded_chunks;
                }

                // Clone only when the chunk is needed in both the queue and the result.
                match (expand_children, passes_screen) {
                    (true, true) => {
                        queue.push_back((decoded.clone(), child_depth));
                        decoded_chunks.push(decoded);
                    }
                    (true, false) => queue.push_back((decoded, child_depth)),
                    (false, true) => decoded_chunks.push(decoded),
                    (false, false) => {}
                }
            }
        }
    }
    decoded_chunks
}
/// Appends a decoded chunk built from `text` to `decoded_chunks`, without splicing the
/// decoded text back into the parent chunk's surrounding context.
///
/// Metadata is derived from `chunk`; `decoder_name` is appended to the source type.
pub(super) fn push_decoded_text_chunk(
    decoded_chunks: &mut Vec<Chunk>,
    chunk: &Chunk,
    text: String,
    decoder_name: &str,
) {
    // An empty encoded span makes the spliced variant skip context splicing entirely.
    let no_encoded_span = "";
    push_decoded_text_chunk_spliced(decoded_chunks, chunk, no_encoded_span, text, decoder_name);
}
/// Appends a decoded chunk to `decoded_chunks`, optionally splicing `text` in place of
/// `original_encoded` inside a window of the parent chunk's text.
///
/// Nothing is pushed when `text` is empty or contains a control byte below `0x20` other
/// than `\n`, `\r` or `\t`.
///
/// Splicing is attempted only when `original_encoded` is non-empty and the parent is at
/// most 256 KiB; if it is skipped or `original_encoded` is not found, the chunk carries
/// `text` alone at the parent's base offset. On a successful splice the base offset is
/// advanced by the window start.
pub(super) fn push_decoded_text_chunk_spliced(
    decoded_chunks: &mut Vec<Chunk>,
    chunk: &Chunk,
    original_encoded: &str,
    text: String,
    decoder_name: &str,
) {
    const MAX_SPLICE_PARENT_BYTES: usize = 256 * 1024;

    let has_forbidden_control = text
        .bytes()
        .any(|b| b < 0x20 && !matches!(b, b'\n' | b'\r' | b'\t'));
    if text.is_empty() || has_forbidden_control {
        return;
    }

    let parent_meta = &chunk.metadata;
    let can_splice =
        !original_encoded.is_empty() && chunk.data.len() <= MAX_SPLICE_PARENT_BYTES;
    let spliced = if can_splice {
        splice_decoded_payload(chunk.data.as_str(), original_encoded, &text, decoder_name)
    } else {
        None
    };
    let (base_offset, payload) = match spliced {
        Some((win_start, windowed)) => (
            parent_meta.base_offset.saturating_add(win_start),
            windowed,
        ),
        None => (parent_meta.base_offset, text),
    };

    // NOTE(review): `base_line` is copied unchanged even when `base_offset` is shifted to
    // the window start — confirm downstream line reporting accounts for this.
    let metadata = ChunkMetadata {
        base_offset,
        base_line: parent_meta.base_line,
        source_type: format!("{}/{}", parent_meta.source_type, decoder_name),
        path: parent_meta.path.clone(),
        commit: parent_meta.commit.clone(),
        author: parent_meta.author.clone(),
        date: parent_meta.date.clone(),
        mtime_ns: parent_meta.mtime_ns,
        size_bytes: parent_meta.size_bytes,
    };
    decoded_chunks.push(Chunk {
        data: payload.into(),
        metadata,
    });
}
/// Number of bytes of parent text kept on each side of the encoded span when
/// `splice_decoded_payload` builds its window (before snapping to char boundaries).
const SPLICE_CONTEXT_WINDOW: usize = 512;
/// Returns the largest char boundary of `s` that is `<= idx`.
///
/// Indices at or past the end of `s` clamp to `s.len()`.
fn floor_char_boundary(s: &str, idx: usize) -> usize {
    if idx >= s.len() {
        return s.len();
    }
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=idx)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}
/// Returns the smallest char boundary of `s` that is `>= idx`.
///
/// Indices at or past the end of `s` clamp to `s.len()`.
fn ceil_char_boundary(s: &str, idx: usize) -> usize {
    let len = s.len();
    if idx >= len {
        return len;
    }
    // If no boundary is found before the end, the end itself is the answer.
    (idx..len)
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(len)
}
/// Replaces the first occurrence of `original_encoded` in `parent` with `decoded_text`
/// and returns a window of surrounding context around the replacement.
///
/// For the `"base64"` decoder, trailing `=` padding adjacent to the encoded span is also
/// replaced (subject to `consume_adjacent_base64_padding`). The window extends
/// `SPLICE_CONTEXT_WINDOW` bytes to each side, snapped outward/inward to char boundaries.
///
/// Returns `None` when `original_encoded` does not occur in `parent`; otherwise the byte
/// offset of the window start within `parent` and the spliced text.
fn splice_decoded_payload(
    parent: &str,
    original_encoded: &str,
    decoded_text: &str,
    decoder_name: &str,
) -> Option<(usize, String)> {
    let enc_start = parent.find(original_encoded)?;
    let raw_end = enc_start + original_encoded.len();
    let enc_end = match decoder_name {
        "base64" => consume_adjacent_base64_padding(parent.as_bytes(), raw_end),
        _ => raw_end,
    };

    let win_start = floor_char_boundary(parent, enc_start.saturating_sub(SPLICE_CONTEXT_WINDOW));
    let win_end = ceil_char_boundary(parent, enc_end.saturating_add(SPLICE_CONTEXT_WINDOW));

    let leading = &parent[win_start..enc_start];
    let trailing = &parent[enc_end..win_end];
    let spliced = [leading, decoded_text, trailing].concat();
    Some((win_start, spliced))
}
/// Extends `start` past up to two `=` bytes that directly follow it in `parent`.
///
/// The extension is accepted only when the padding is followed by end of input or by one
/// of the delimiter bytes `\n`, `\r`, `\t`, space, `;`, `,`, `"`, `'`, or a backtick.
/// Otherwise, or when there is no padding at all, `start` is returned unchanged.
fn consume_adjacent_base64_padding(parent: &[u8], start: usize) -> usize {
    let padding_len = parent
        .get(start..)
        .unwrap_or(&[])
        .iter()
        .take(2)
        .take_while(|&&byte| byte == b'=')
        .count();
    if padding_len == 0 {
        return start;
    }

    let end = start + padding_len;
    let at_delimiter = match parent.get(end) {
        None => true,
        Some(&byte) => matches!(
            byte,
            b'\n' | b'\r' | b'\t' | b' ' | b';' | b',' | b'"' | b'\'' | b'`'
        ),
    };
    if at_delimiter {
        end
    } else {
        start
    }
}
/// Runs `decode` over each candidate, in order, and collects a spliced decoded chunk for
/// every candidate that decodes successfully.
///
/// Candidates for which `decode` returns `Err` are skipped. Chunk construction, including
/// its own rejection rules, is delegated to `push_decoded_text_chunk_spliced`, so the
/// result may hold fewer chunks than there were successful decodes.
pub(super) fn decode_candidates<F>(
    chunk: &Chunk,
    candidates: Vec<String>,
    mut decode: F,
    decoder_name: &str,
) -> Vec<Chunk>
where
    F: FnMut(&str) -> Result<String, ()>,
{
    let mut collected = Vec::new();
    for encoded in &candidates {
        let Ok(text) = decode(encoded.as_str()) else {
            continue;
        };
        push_decoded_text_chunk_spliced(
            &mut collected,
            chunk,
            encoded.as_str(),
            text,
            decoder_name,
        );
    }
    collected
}
mod extractor;
pub(super) use extractor::{extract_encoded_values, hash_fast};