Skip to main content

keyhog_scanner/decode/
pipeline.rs

1use super::Decoder;
2use super::base64::{Base64Decoder, Z85Decoder};
3use super::hex::HexDecoder;
4use super::url::{
5    HexEscapeDecoder, HtmlNamedEntityDecoder, HtmlNumericEntityDecoder, MimeEncodedWordDecoder,
6    OctalEscapeDecoder, QuotedPrintableDecoder, UnicodeEscapeDecoder, UrlDecoder,
7};
8use keyhog_core::{Chunk, ChunkMetadata};
9use std::collections::{HashSet, VecDeque};
10
11static DECODERS: std::sync::OnceLock<Vec<Box<dyn Decoder>>> =
12    std::sync::OnceLock::new();
13
/// Upper bound on the number of decoded chunks produced from one root chunk.
const MAX_DECODED_CHUNKS_PER_ROOT: usize = 1_000;
/// Upper bound on the cumulative size of decoded data per root chunk (64 MiB).
const MAX_DECODED_TOTAL_BYTES: usize = 64 << 20;
16
17fn get_decoders() -> &'static [Box<dyn Decoder>] {
18    DECODERS.get_or_init(|| {
19        vec![
20            Box::new(Base64Decoder),
21            Box::new(HexDecoder),
22            Box::new(UrlDecoder),
23            Box::new(QuotedPrintableDecoder),
24            Box::new(HtmlNamedEntityDecoder),
25            Box::new(HtmlNumericEntityDecoder),
26            Box::new(HexEscapeDecoder),
27            Box::new(OctalEscapeDecoder),
28            Box::new(MimeEncodedWordDecoder),
29            Box::new(UnicodeEscapeDecoder),
30            Box::new(Z85Decoder),
31        ]
32    })
33}
34
35/// Register a custom decoder. Must be called BEFORE any scan runs.
36/// Panics if the decoder list has already been initialized.
37pub fn register_decoder(decoder: Box<dyn Decoder>) {
38    // After initialization, the decoder list is immutable for lock-free reads.
39    // Custom decoders must be registered before the first scan.
40    if DECODERS.get().is_some() {
41        tracing::warn!("register_decoder called after initialization — decoder ignored. Fix: register custom decoders before scanning.");
42        return;
43    }
44    // Force initialization with the custom decoder appended.
45    let mut decoders: Vec<Box<dyn Decoder>> = vec![
46        Box::new(Base64Decoder),
47        Box::new(HexDecoder),
48        Box::new(UrlDecoder),
49        Box::new(QuotedPrintableDecoder),
50        Box::new(HtmlNamedEntityDecoder),
51        Box::new(HtmlNumericEntityDecoder),
52        Box::new(HexEscapeDecoder),
53        Box::new(OctalEscapeDecoder),
54        Box::new(MimeEncodedWordDecoder),
55        Box::new(UnicodeEscapeDecoder),
56        Box::new(Z85Decoder),
57    ];
58    decoders.push(decoder);
59    let _ = DECODERS.set(decoders);
60}
61
62pub fn decode_chunk(
63    chunk: &Chunk,
64    max_depth: usize,
65    _validate: bool,
66    deadline: Option<std::time::Instant>,
67    screen: Option<&crate::alphabet_filter::AlphabetScreen>,
68) -> Vec<Chunk> {
69    let mut decoded_chunks = Vec::new();
70    let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
71    // Use hash of data instead of full string to save memory on large files.
72    let mut seen = HashSet::from([hash_fast(chunk.data.as_bytes())]);
73    let mut total_bytes = 0usize;
74
75    let registry = get_decoders();
76
77    while let Some((current, depth)) = queue.pop_front() {
78        if let Some(deadline) = deadline
79            && std::time::Instant::now() > deadline
80        {
81            break;
82        }
83        if depth >= max_depth {
84            continue;
85        }
86
87        for decoder in registry.iter() {
88            for decoded in decoder.decode_chunk(&current) {
89                if seen.insert(hash_fast(decoded.data.as_bytes())) {
90                    if let Some(screen) = screen
91                        && !screen.screen(decoded.data.as_bytes())
92                    {
93                        continue;
94                    }
95
96                    total_bytes += decoded.data.len();
97                    if decoded_chunks.len() >= MAX_DECODED_CHUNKS_PER_ROOT
98                        || total_bytes > MAX_DECODED_TOTAL_BYTES
99                    {
100                        tracing::warn!(
101                            path = ?chunk.metadata.path,
102                            "Recursive decoding limit reached. Fix: reduce decode depth or decode size limits"
103                        );
104                        return decoded_chunks;
105                    }
106
107                    queue.push_back((decoded.clone(), depth + 1));
108                    decoded_chunks.push(decoded);
109                }
110            }
111        }
112    }
113    decoded_chunks
114}
115
116pub(super) fn push_decoded_text_chunk(
117    decoded_chunks: &mut Vec<Chunk>,
118    chunk: &Chunk,
119    text: String,
120    decoder_name: &str,
121) {
122    if text.is_empty()
123        || !text
124            .chars()
125            .all(|ch| !ch.is_control() || ch == '\n' || ch == '\r' || ch == '\t')
126    {
127        return;
128    }
129
130    decoded_chunks.push(Chunk {
131        data: text,
132        metadata: ChunkMetadata {
133            source_type: format!("{}/{}", chunk.metadata.source_type, decoder_name),
134            path: chunk.metadata.path.clone(),
135            commit: chunk.metadata.commit.clone(),
136            author: chunk.metadata.author.clone(),
137            date: chunk.metadata.date.clone(),
138        },
139    });
140}
141
142pub(super) fn decode_candidates<F>(
143    chunk: &Chunk,
144    candidates: Vec<String>,
145    mut decode: F,
146    decoder_name: &str,
147) -> Vec<Chunk>
148where
149    F: FnMut(&str) -> Result<String, ()>,
150{
151    let mut decoded_chunks = Vec::new();
152    for candidate in candidates {
153        if let Ok(text) = decode(&candidate) {
154            push_decoded_text_chunk(&mut decoded_chunks, chunk, text, decoder_name);
155        }
156    }
157    decoded_chunks
158}
159
160pub(super) fn extract_encoded_values(text: &str) -> Vec<String> {
161    let mut values = Vec::new();
162    // Base64 block accumulator — collected in the SAME pass as quoted/assigned values.
163    let mut b64_block = String::new();
164
165    let is_b64_char = |ch: char| -> bool {
166        ch.is_ascii_alphanumeric() || ch == '+' || ch == '/' || ch == '=' || ch == '-' || ch == '_'
167    };
168
169    // Single-pass char-level iteration. Safe for UTF-8 (no mid-codepoint splits).
170    let mut chars = text.char_indices().peekable();
171    while let Some(&(_, ch)) = chars.peek() {
172        // ── Quoted strings ──────────────────────────────────────────
173        if ch == '"' || ch == '\'' || ch == '`' {
174            // Flush any pending b64 block
175            if b64_block.len() >= 16 {
176                values.push(std::mem::take(&mut b64_block));
177            }
178            b64_block.clear();
179
180            let quote = ch;
181            chars.next();
182            let mut escaping = false;
183            let mut cleaned = String::with_capacity(32);
184
185            while let Some(&(_, current)) = chars.peek() {
186                chars.next();
187                if escaping {
188                    cleaned.push(current);
189                    escaping = false;
190                } else if current == '\\' {
191                    escaping = true;
192                } else if current == quote {
193                    if cleaned.len() >= 4 {
194                        values.push(cleaned);
195                    }
196                    break;
197                } else if !current.is_ascii_whitespace() {
198                    cleaned.push(current);
199                }
200            }
201            continue;
202        }
203
204        // ── Assignment values (key=value / key: value) ──────────────
205        if ch == ':' || ch == '=' {
206            if b64_block.len() >= 16 {
207                values.push(std::mem::take(&mut b64_block));
208            }
209            b64_block.clear();
210
211            chars.next();
212            // Skip whitespace after delimiter
213            while chars.peek().is_some_and(|&(_, c)| c.is_ascii_whitespace()) {
214                chars.next();
215            }
216            let mut cleaned = String::with_capacity(32);
217            while let Some(&(_, c)) = chars.peek() {
218                if c.is_ascii_whitespace()
219                    || c == ';' || c == ',' || c == '"' || c == '\'' || c == '`'
220                {
221                    break;
222                }
223                cleaned.push(c);
224                chars.next();
225            }
226            if cleaned.len() >= 4 {
227                values.push(cleaned);
228            }
229            continue;
230        }
231
232        // ── Base64 block accumulation (merged from old second pass) ─
233        if is_b64_char(ch) {
234            b64_block.push(ch);
235        } else if !ch.is_whitespace() {
236            if b64_block.len() >= 16 {
237                values.push(std::mem::take(&mut b64_block));
238            }
239            b64_block.clear();
240        }
241        // else: whitespace inside b64 blocks is allowed (line continuations)
242
243        chars.next();
244    }
245
246    // Flush trailing b64 block
247    if b64_block.len() >= 16 {
248        values.push(b64_block);
249    }
250
251    values
252}
253
/// 64-bit FNV-1a hash of `data`, used to de-duplicate decoded chunks.
///
/// Not cryptographic: it is cheap to compute and good enough to tell apart
/// the small number of chunks decoded from one root.
fn hash_fast(data: &[u8]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a: XOR the byte in first, then multiply by the prime.
    data.iter().fold(FNV_OFFSET_BASIS, |acc, &byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    })
}