keyhog_scanner/decode/
pipeline.rs1use super::Decoder;
2use super::base64::{Base64Decoder, Z85Decoder};
3use super::hex::HexDecoder;
4use super::url::{
5 HexEscapeDecoder, HtmlNamedEntityDecoder, HtmlNumericEntityDecoder, MimeEncodedWordDecoder,
6 OctalEscapeDecoder, QuotedPrintableDecoder, UnicodeEscapeDecoder, UrlDecoder,
7};
8use keyhog_core::{Chunk, ChunkMetadata};
9use std::collections::{HashSet, VecDeque};
10
11static DECODERS: std::sync::OnceLock<Vec<Box<dyn Decoder>>> =
12 std::sync::OnceLock::new();
13
14const MAX_DECODED_CHUNKS_PER_ROOT: usize = 1000;
15const MAX_DECODED_TOTAL_BYTES: usize = 64 * 1024 * 1024;
16
17fn get_decoders() -> &'static [Box<dyn Decoder>] {
18 DECODERS.get_or_init(|| {
19 vec![
20 Box::new(Base64Decoder),
21 Box::new(HexDecoder),
22 Box::new(UrlDecoder),
23 Box::new(QuotedPrintableDecoder),
24 Box::new(HtmlNamedEntityDecoder),
25 Box::new(HtmlNumericEntityDecoder),
26 Box::new(HexEscapeDecoder),
27 Box::new(OctalEscapeDecoder),
28 Box::new(MimeEncodedWordDecoder),
29 Box::new(UnicodeEscapeDecoder),
30 Box::new(Z85Decoder),
31 ]
32 })
33}
34
35pub fn register_decoder(decoder: Box<dyn Decoder>) {
38 if DECODERS.get().is_some() {
41 tracing::warn!("register_decoder called after initialization — decoder ignored. Fix: register custom decoders before scanning.");
42 return;
43 }
44 let mut decoders: Vec<Box<dyn Decoder>> = vec![
46 Box::new(Base64Decoder),
47 Box::new(HexDecoder),
48 Box::new(UrlDecoder),
49 Box::new(QuotedPrintableDecoder),
50 Box::new(HtmlNamedEntityDecoder),
51 Box::new(HtmlNumericEntityDecoder),
52 Box::new(HexEscapeDecoder),
53 Box::new(OctalEscapeDecoder),
54 Box::new(MimeEncodedWordDecoder),
55 Box::new(UnicodeEscapeDecoder),
56 Box::new(Z85Decoder),
57 ];
58 decoders.push(decoder);
59 let _ = DECODERS.set(decoders);
60}
61
62pub fn decode_chunk(
63 chunk: &Chunk,
64 max_depth: usize,
65 _validate: bool,
66 deadline: Option<std::time::Instant>,
67 screen: Option<&crate::alphabet_filter::AlphabetScreen>,
68) -> Vec<Chunk> {
69 let mut decoded_chunks = Vec::new();
70 let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
71 let mut seen = HashSet::from([hash_fast(chunk.data.as_bytes())]);
73 let mut total_bytes = 0usize;
74
75 let registry = get_decoders();
76
77 while let Some((current, depth)) = queue.pop_front() {
78 if let Some(deadline) = deadline
79 && std::time::Instant::now() > deadline
80 {
81 break;
82 }
83 if depth >= max_depth {
84 continue;
85 }
86
87 for decoder in registry.iter() {
88 for decoded in decoder.decode_chunk(¤t) {
89 if seen.insert(hash_fast(decoded.data.as_bytes())) {
90 if let Some(screen) = screen
91 && !screen.screen(decoded.data.as_bytes())
92 {
93 continue;
94 }
95
96 total_bytes += decoded.data.len();
97 if decoded_chunks.len() >= MAX_DECODED_CHUNKS_PER_ROOT
98 || total_bytes > MAX_DECODED_TOTAL_BYTES
99 {
100 tracing::warn!(
101 path = ?chunk.metadata.path,
102 "Recursive decoding limit reached. Fix: reduce decode depth or decode size limits"
103 );
104 return decoded_chunks;
105 }
106
107 queue.push_back((decoded.clone(), depth + 1));
108 decoded_chunks.push(decoded);
109 }
110 }
111 }
112 }
113 decoded_chunks
114}
115
116pub(super) fn push_decoded_text_chunk(
117 decoded_chunks: &mut Vec<Chunk>,
118 chunk: &Chunk,
119 text: String,
120 decoder_name: &str,
121) {
122 if text.is_empty()
123 || !text
124 .chars()
125 .all(|ch| !ch.is_control() || ch == '\n' || ch == '\r' || ch == '\t')
126 {
127 return;
128 }
129
130 decoded_chunks.push(Chunk {
131 data: text,
132 metadata: ChunkMetadata {
133 source_type: format!("{}/{}", chunk.metadata.source_type, decoder_name),
134 path: chunk.metadata.path.clone(),
135 commit: chunk.metadata.commit.clone(),
136 author: chunk.metadata.author.clone(),
137 date: chunk.metadata.date.clone(),
138 },
139 });
140}
141
142pub(super) fn decode_candidates<F>(
143 chunk: &Chunk,
144 candidates: Vec<String>,
145 mut decode: F,
146 decoder_name: &str,
147) -> Vec<Chunk>
148where
149 F: FnMut(&str) -> Result<String, ()>,
150{
151 let mut decoded_chunks = Vec::new();
152 for candidate in candidates {
153 if let Ok(text) = decode(&candidate) {
154 push_decoded_text_chunk(&mut decoded_chunks, chunk, text, decoder_name);
155 }
156 }
157 decoded_chunks
158}
159
160pub(super) fn extract_encoded_values(text: &str) -> Vec<String> {
161 let mut values = Vec::new();
162 let mut b64_block = String::new();
164
165 let is_b64_char = |ch: char| -> bool {
166 ch.is_ascii_alphanumeric() || ch == '+' || ch == '/' || ch == '=' || ch == '-' || ch == '_'
167 };
168
169 let mut chars = text.char_indices().peekable();
171 while let Some(&(_, ch)) = chars.peek() {
172 if ch == '"' || ch == '\'' || ch == '`' {
174 if b64_block.len() >= 16 {
176 values.push(std::mem::take(&mut b64_block));
177 }
178 b64_block.clear();
179
180 let quote = ch;
181 chars.next();
182 let mut escaping = false;
183 let mut cleaned = String::with_capacity(32);
184
185 while let Some(&(_, current)) = chars.peek() {
186 chars.next();
187 if escaping {
188 cleaned.push(current);
189 escaping = false;
190 } else if current == '\\' {
191 escaping = true;
192 } else if current == quote {
193 if cleaned.len() >= 4 {
194 values.push(cleaned);
195 }
196 break;
197 } else if !current.is_ascii_whitespace() {
198 cleaned.push(current);
199 }
200 }
201 continue;
202 }
203
204 if ch == ':' || ch == '=' {
206 if b64_block.len() >= 16 {
207 values.push(std::mem::take(&mut b64_block));
208 }
209 b64_block.clear();
210
211 chars.next();
212 while chars.peek().is_some_and(|&(_, c)| c.is_ascii_whitespace()) {
214 chars.next();
215 }
216 let mut cleaned = String::with_capacity(32);
217 while let Some(&(_, c)) = chars.peek() {
218 if c.is_ascii_whitespace()
219 || c == ';' || c == ',' || c == '"' || c == '\'' || c == '`'
220 {
221 break;
222 }
223 cleaned.push(c);
224 chars.next();
225 }
226 if cleaned.len() >= 4 {
227 values.push(cleaned);
228 }
229 continue;
230 }
231
232 if is_b64_char(ch) {
234 b64_block.push(ch);
235 } else if !ch.is_whitespace() {
236 if b64_block.len() >= 16 {
237 values.push(std::mem::take(&mut b64_block));
238 }
239 b64_block.clear();
240 }
241 chars.next();
244 }
245
246 if b64_block.len() >= 16 {
248 values.push(b64_block);
249 }
250
251 values
252}
253
/// Computes the 64-bit FNV-1a hash of `data`.
///
/// Each byte is XORed into the state, which is then multiplied by the FNV
/// prime with wrapping arithmetic. Empty input yields the offset basis.
fn hash_fast(data: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    data.iter().fold(OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}