Skip to main content

keyhog_scanner/
decode.rs

1//! Decode-through scanning: decode base64 and hex strings before pattern matching.
2//!
3//! Catches secrets hidden behind encoding layers — Kubernetes manifests,
4//! CI/CD configs, hex-encoded credentials.
5
6/// Decoding layer: decode base64 and hex strings before pattern matching.
7/// Catches secrets hidden behind encoding (evasion technique).
8use crate::context;
9use keyhog_core::{Chunk, ChunkMetadata};
10use std::collections::{HashSet, VecDeque};
11
/// A trait for decoding chunks to find hidden secrets.
///
/// Implementors must be `Send + Sync`: registered decoders are stored in a
/// process-wide registry behind a lock and may be used from any thread.
///
/// # Examples
///
/// ```rust,ignore
/// use keyhog_core::Chunk;
/// use keyhog_scanner::decode::Decoder;
///
/// struct Passthrough;
///
/// impl Decoder for Passthrough {
///     fn name(&self) -> &'static str { "passthrough" }
///     fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> { vec![chunk.clone()] }
/// }
/// ```
pub trait Decoder: Send + Sync {
    /// Short static identifier of the decoder (for example `"base64"`).
    fn name(&self) -> &'static str;
    /// Returns the chunks derived from decoding encoded values found in
    /// `chunk`. An empty vector means nothing could be decoded.
    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk>;
}
31
32struct Base64Decoder;
33impl Decoder for Base64Decoder {
34    fn name(&self) -> &'static str {
35        "base64"
36    }
37    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
38        let mut decoded_chunks = Vec::new();
39        let lines: Vec<&str> = chunk.data.lines().collect();
40        for (line_idx, line) in lines.iter().enumerate() {
41            if context::is_false_positive_context(&lines, line_idx, chunk.metadata.path.as_deref())
42            {
43                continue;
44            }
45            for b64_match in find_base64_strings(line, 20) {
46                match base64_decode(&b64_match.value) {
47                    Ok(decoded) => match String::from_utf8(decoded) {
48                        Ok(text)
49                            if text.chars().all(|c| {
50                                !c.is_control() || c == '\n' || c == '\r' || c == '\t'
51                            }) =>
52                        {
53                            decoded_chunks.push(Chunk {
54                                data: text,
55                                metadata: ChunkMetadata {
56                                    source_type: format!("{}/base64", chunk.metadata.source_type),
57                                    path: chunk.metadata.path.clone(),
58                                    commit: chunk.metadata.commit.clone(),
59                                    author: chunk.metadata.author.clone(),
60                                    date: chunk.metadata.date.clone(),
61                                },
62                            });
63                        }
64                        Ok(_) => {
65                            tracing::trace!(
66                                path = ?chunk.metadata.path,
67                                "base64 decoded to text with control characters, skipping"
68                            );
69                        }
70                        Err(_) => {
71                            tracing::trace!(
72                                path = ?chunk.metadata.path,
73                                "base64 decoded to non-UTF-8 bytes, skipping"
74                            );
75                        }
76                    },
77                    Err(()) => {
78                        tracing::trace!(
79                            path = ?chunk.metadata.path,
80                            candidate_len = b64_match.value.len(),
81                            "base64 decode failed for candidate"
82                        );
83                    }
84                }
85            }
86        }
87        decoded_chunks
88    }
89}
90
91struct HexDecoder;
92impl Decoder for HexDecoder {
93    fn name(&self) -> &'static str {
94        "hex"
95    }
96    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
97        let mut decoded_chunks = Vec::new();
98        for hex_match in find_hex_strings(&chunk.data, 40) {
99            if let Ok(decoded) = hex_decode(&hex_match.value)
100                && let Ok(text) = String::from_utf8(decoded)
101                && text
102                    .chars()
103                    .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
104            {
105                decoded_chunks.push(Chunk {
106                    data: text,
107                    metadata: ChunkMetadata {
108                        source_type: format!("{}/hex", chunk.metadata.source_type),
109                        path: chunk.metadata.path.clone(),
110                        commit: chunk.metadata.commit.clone(),
111                        author: chunk.metadata.author.clone(),
112                        date: chunk.metadata.date.clone(),
113                    },
114                });
115            }
116        }
117        decoded_chunks
118    }
119}
120
121struct UrlDecoder;
122impl Decoder for UrlDecoder {
123    fn name(&self) -> &'static str {
124        "url"
125    }
126    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
127        decode_candidates(
128            chunk,
129            extract_encoded_values(&chunk.data)
130                .into_iter()
131                .filter(|candidate| candidate.contains('%'))
132                .collect(),
133            url_decode,
134            self.name(),
135        )
136    }
137}
138
139struct QuotedPrintableDecoder;
140impl Decoder for QuotedPrintableDecoder {
141    fn name(&self) -> &'static str {
142        "quoted-printable"
143    }
144    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
145        let mut decoded_chunks = Vec::new();
146        let lines: Vec<&str> = chunk.data.lines().collect();
147        for (line_idx, line) in lines.iter().enumerate() {
148            if context::is_false_positive_context(&lines, line_idx, chunk.metadata.path.as_deref())
149            {
150                continue;
151            }
152            let mut candidates = extract_encoded_values(line);
153            let trimmed = line.trim();
154            if trimmed.contains('=') && !trimmed.is_empty() {
155                candidates.push(trimmed.to_string());
156            }
157            decoded_chunks.extend(decode_candidates(
158                chunk,
159                candidates
160                    .into_iter()
161                    .filter(|candidate| candidate.contains('='))
162                    .collect(),
163                quoted_printable_decode,
164                self.name(),
165            ));
166        }
167        decoded_chunks
168    }
169}
170
171struct HtmlNamedEntityDecoder;
172impl Decoder for HtmlNamedEntityDecoder {
173    fn name(&self) -> &'static str {
174        "html-named-entity"
175    }
176    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
177        let mut candidates = extract_encoded_values(&chunk.data);
178        let trimmed = chunk.data.trim();
179        if trimmed.contains('&') && !trimmed.is_empty() {
180            candidates.push(trimmed.to_string());
181        }
182        decode_candidates(
183            chunk,
184            candidates
185                .into_iter()
186                .filter(|candidate| candidate.contains('&'))
187                .collect(),
188            html_named_entity_decode,
189            self.name(),
190        )
191    }
192}
193
194struct HtmlNumericEntityDecoder;
195impl Decoder for HtmlNumericEntityDecoder {
196    fn name(&self) -> &'static str {
197        "html-numeric-entity"
198    }
199    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
200        let mut candidates = extract_encoded_values(&chunk.data);
201        let trimmed = chunk.data.trim();
202        if trimmed.contains("&#") && !trimmed.is_empty() {
203            candidates.push(trimmed.to_string());
204        }
205        decode_candidates(
206            chunk,
207            candidates
208                .into_iter()
209                .filter(|candidate| candidate.contains("&#"))
210                .collect(),
211            html_numeric_entity_decode,
212            self.name(),
213        )
214    }
215}
216
217struct HexEscapeDecoder;
218impl Decoder for HexEscapeDecoder {
219    fn name(&self) -> &'static str {
220        "hex-escape"
221    }
222    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
223        let mut candidates = extract_encoded_values(&chunk.data);
224        let trimmed = chunk.data.trim();
225        if trimmed.contains("\\x") && !trimmed.is_empty() {
226            candidates.push(trimmed.to_string());
227        }
228        decode_candidates(
229            chunk,
230            candidates
231                .into_iter()
232                .filter(|candidate| candidate.contains("\\x"))
233                .collect(),
234            hex_escape_decode,
235            self.name(),
236        )
237    }
238}
239
240struct OctalEscapeDecoder;
241impl Decoder for OctalEscapeDecoder {
242    fn name(&self) -> &'static str {
243        "octal-escape"
244    }
245    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
246        let mut candidates = extract_encoded_values(&chunk.data);
247        let trimmed = chunk.data.trim();
248        if trimmed.contains('\\') && !trimmed.is_empty() {
249            candidates.push(trimmed.to_string());
250        }
251        decode_candidates(
252            chunk,
253            candidates
254                .into_iter()
255                .filter(|candidate| contains_octal_escape(candidate))
256                .collect(),
257            octal_escape_decode,
258            self.name(),
259        )
260    }
261}
262
263struct MimeEncodedWordDecoder;
264impl Decoder for MimeEncodedWordDecoder {
265    fn name(&self) -> &'static str {
266        "mime-encoded-word"
267    }
268    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
269        let mut candidates = Vec::new();
270        for line in chunk.data.lines() {
271            candidates.extend(find_mime_encoded_words(line));
272        }
273        decode_candidates(chunk, candidates, mime_encoded_word_decode, self.name())
274    }
275}
276
277struct UnicodeEscapeDecoder;
278impl Decoder for UnicodeEscapeDecoder {
279    fn name(&self) -> &'static str {
280        "unicode-escape"
281    }
282    fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
283        decode_candidates(
284            chunk,
285            extract_encoded_values(&chunk.data)
286                .into_iter()
287                .filter(|candidate| candidate.contains("\\u") || candidate.contains("\\x"))
288                .collect(),
289            unicode_escape_decode,
290            self.name(),
291        )
292    }
293}
294
295static DECODERS: std::sync::OnceLock<std::sync::RwLock<Vec<Box<dyn Decoder>>>> =
296    std::sync::OnceLock::new();
297
298fn get_decoders() -> &'static std::sync::RwLock<Vec<Box<dyn Decoder>>> {
299    DECODERS.get_or_init(|| {
300        std::sync::RwLock::new(vec![
301            Box::new(Base64Decoder),
302            Box::new(HexDecoder),
303            Box::new(UrlDecoder),
304            Box::new(QuotedPrintableDecoder),
305            Box::new(HtmlNamedEntityDecoder),
306            Box::new(HtmlNumericEntityDecoder),
307            Box::new(HexEscapeDecoder),
308            Box::new(OctalEscapeDecoder),
309            Box::new(MimeEncodedWordDecoder),
310            Box::new(UnicodeEscapeDecoder),
311        ])
312    })
313}
314
315/// Register a custom decoder that participates in decode-through scanning.
316/// Register a custom decode stage used by [`decode_chunk`].
317///
318/// # Examples
319///
320/// ```rust,ignore
321/// use keyhog_core::Chunk;
322/// use keyhog_scanner::decode::{Decoder, register_decoder};
323///
324/// struct Passthrough;
325/// impl Decoder for Passthrough {
326///     fn name(&self) -> &'static str { "passthrough" }
327///     fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> { vec![chunk.clone()] }
328/// }
329///
330/// register_decoder(Box::new(Passthrough));
331/// ```
332pub fn register_decoder(decoder: Box<dyn Decoder>) {
333    let mut registry = get_decoders()
334        .write()
335        .unwrap_or_else(|poisoned| poisoned.into_inner());
336    registry.push(decoder);
337}
338
/// Maximum decode recursion depth. Two levels handle the common case of
/// `base64(hex(secret))` or `hex(base64(secret))`. Higher depths are
/// theoretically possible but:
///   - Real-world triple-encoding is vanishingly rare in codebases.
///   - Each level multiplies the candidate set combinatorially.
///   - The `seen` dedup set prevents repeat work, but O(candidates²) growth
///     still makes depth > 2 impractical for large chunks.
///
/// Attackers who triple-encode to evade scanners will also evade TruffleHog,
/// Semgrep, and every other current-generation scanner.
///
/// The original chunk sits at depth 0; a chunk whose depth has reached this
/// value is not passed to the decoders again.
const MAX_DECODE_DEPTH: usize = 2;
350
351/// Decode base64, hex, URL, and other encoded strings in a chunk, producing
352/// additional chunks with decoded content for scanning.
353///
354/// Uses BFS with deduplication to avoid redundant decode–re-decode cycles.
355/// The search is bounded by [`MAX_DECODE_DEPTH`] to prevent combinatorial
356/// explosion on pathological inputs.
357/// Decode a chunk through all registered decoders and return derived chunks.
358///
359/// # Examples
360///
361/// ```rust
362/// use keyhog_core::{Chunk, ChunkMetadata};
363/// use keyhog_scanner::decode::decode_chunk;
364///
365/// let chunk = Chunk {
366///     data: "ZGVtb19BQkMxMjM0NQ==".into(),
367///     metadata: ChunkMetadata {
368///         source_type: "example".into(),
369///         path: None,
370///         commit: None,
371///         author: None,
372///         date: None,
373///     },
374/// };
375/// assert!(!decode_chunk(&chunk).is_empty());
376/// ```
377pub fn decode_chunk(chunk: &Chunk) -> Vec<Chunk> {
378    let mut decoded_chunks = Vec::new();
379    let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
380    let mut seen = HashSet::from([chunk.data.clone()]);
381    let registry = get_decoders()
382        .read()
383        .unwrap_or_else(|poisoned| poisoned.into_inner());
384
385    while let Some((current, depth)) = queue.pop_front() {
386        if depth >= MAX_DECODE_DEPTH {
387            continue;
388        }
389        for decoder in registry.iter() {
390            for decoded in decoder.decode_chunk(&current) {
391                if seen.insert(decoded.data.clone()) {
392                    queue.push_back((decoded.clone(), depth + 1));
393                    decoded_chunks.push(decoded);
394                }
395            }
396        }
397    }
398    decoded_chunks
399}
400
/// A candidate encoded value found in scanned text.
struct EncodedString {
    /// The encoded text exactly as extracted (not yet decoded).
    value: String,
}
404
405/// Find base64-encoded strings in text (minimum length, valid base64 charset).
406fn find_base64_strings(text: &str, min_length: usize) -> Vec<EncodedString> {
407    let mut results = Vec::new();
408    let b64_chars = |c: char| {
409        c.is_ascii_alphanumeric() || c == '+' || c == '/' || c == '=' || c == '-' || c == '_'
410    };
411
412    for line in text.lines() {
413        // Look for base64 after = or : or in quotes
414        let candidates = extract_encoded_values(line);
415        for candidate in candidates {
416            if candidate.len() >= min_length
417                && candidate.chars().all(b64_chars)
418                && classify_base64(candidate.as_str()).is_some()
419            {
420                results.push(EncodedString { value: candidate });
421            }
422        }
423    }
424
425    results
426}
427
428/// Find hex-encoded strings (even length, all hex chars, minimum length).
429fn find_hex_strings(text: &str, min_length: usize) -> Vec<EncodedString> {
430    let mut results = Vec::new();
431
432    for line in text.lines() {
433        let candidates = extract_encoded_values(line);
434        for candidate in candidates {
435            if candidate.len() >= min_length
436                && candidate.len() % 2 == 0
437                && candidate.chars().all(|c| c.is_ascii_hexdigit())
438            {
439                results.push(EncodedString { value: candidate });
440            }
441        }
442    }
443
444    results
445}
446
/// Extract potential encoded values from a line.
///
/// Two sources are combined, in this order:
/// 1. everything after the first `:` — or, if the line has no `:`, after the
///    first `=` — trimmed of whitespace and surrounding quote characters;
/// 2. the non-empty contents of every closed `"…"` pair, then of every closed
///    `'…'` pair.
///
/// The same text can therefore appear more than once in the result.
fn extract_encoded_values(line: &str) -> Vec<String> {
    let mut found = Vec::new();

    // `:` is tried first: it is the YAML/JSON key-value separator and, unlike
    // `=`, cannot be confused with base64 padding.
    let after_separator = line
        .split_once(':')
        .or_else(|| line.split_once('='))
        .map(|(_, rest)| {
            rest.trim()
                .trim_matches(|c: char| matches!(c, '"' | '\'' | '`'))
        });
    if let Some(value) = after_separator.filter(|value| !value.is_empty()) {
        found.push(value.to_string());
    }

    for quote in ['"', '\''] {
        // Splitting on the quote puts quoted text at the odd positions; the
        // final segment is dropped because no closing quote follows it.
        let segments: Vec<&str> = line.split(quote).collect();
        let closed = segments.len() - 1;
        found.extend(
            segments[..closed]
                .iter()
                .copied()
                .skip(1)
                .step_by(2)
                .filter(|content| !content.is_empty())
                .map(str::to_string),
        );
    }

    found
}
483
/// The base64 flavour a candidate string was classified as; selects the
/// engine used to decode it.
#[derive(Clone, Copy)]
enum Base64Variant {
    /// Standard alphabet (`+` and `/`), decoded with the padded engine.
    Standard,
    /// Standard alphabet without `=` padding.
    StandardNoPad,
    /// URL-safe alphabet (`-` and `_`) with `=` padding.
    UrlSafe,
    /// URL-safe alphabet without `=` padding.
    UrlSafeNoPad,
}
491
492fn classify_base64(candidate: &str) -> Option<Base64Variant> {
493    if !has_valid_base64_padding(candidate) {
494        return None;
495    }
496
497    let has_standard = candidate.contains('+') || candidate.contains('/');
498    let has_urlsafe = candidate.contains('-') || candidate.contains('_');
499    if has_standard && has_urlsafe {
500        return None;
501    }
502
503    let padded = candidate.contains('=');
504    match (has_urlsafe, padded, candidate.len() % 4) {
505        (_, true, 0) => Some(if has_urlsafe {
506            Base64Variant::UrlSafe
507        } else {
508            Base64Variant::Standard
509        }),
510        (_, true, _) => None,
511        (_, false, 1) => None,
512        (true, false, _) => Some(Base64Variant::UrlSafeNoPad),
513        (false, false, 0) => Some(Base64Variant::Standard),
514        (false, false, _) => Some(Base64Variant::StandardNoPad),
515    }
516}
517
/// Returns `true` when `=` appears only as one or two trailing padding
/// characters after at least one other character, or not at all.
fn has_valid_base64_padding(candidate: &str) -> bool {
    let body = candidate.trim_end_matches('=');
    let padding_len = candidate.len() - body.len();

    // Any `=` left in the body is followed by a non-padding character.
    if body.contains('=') {
        return false;
    }

    padding_len == 0 || (padding_len <= 2 && !body.is_empty())
}
530
531fn base64_decode(input: &str) -> Result<Vec<u8>, ()> {
532    use base64::{Engine, engine::general_purpose};
533
534    let variant = classify_base64(input).ok_or(())?;
535    match variant {
536        Base64Variant::Standard => general_purpose::STANDARD.decode(input),
537        Base64Variant::StandardNoPad => general_purpose::STANDARD_NO_PAD.decode(input),
538        Base64Variant::UrlSafe => general_purpose::URL_SAFE.decode(input),
539        Base64Variant::UrlSafeNoPad => general_purpose::URL_SAFE_NO_PAD.decode(input),
540    }
541    .map_err(|_| ())
542}
543
544fn hex_decode(input: &str) -> Result<Vec<u8>, ()> {
545    if !input.len().is_multiple_of(2) {
546        return Err(());
547    }
548    let mut decoded_bytes = Vec::with_capacity(input.len() / 2);
549    for offset in (0..input.len()).step_by(2) {
550        let high = hex_val(input.as_bytes()[offset])?;
551        let low = hex_val(input.as_bytes()[offset + 1])?;
552        decoded_bytes.push((high << 4) | low);
553    }
554    Ok(decoded_bytes)
555}
556
/// Converts one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its numeric value;
/// any other byte yields `Err(())`.
fn hex_val(b: u8) -> Result<u8, ()> {
    char::from(b)
        .to_digit(16)
        // A base-16 digit is always below 16, so the narrowing cast is lossless.
        .map(|digit| digit as u8)
        .ok_or(())
}
565
566fn decode_candidates<F>(
567    chunk: &Chunk,
568    candidates: Vec<String>,
569    mut decode: F,
570    decoder_name: &str,
571) -> Vec<Chunk>
572where
573    F: FnMut(&str) -> Result<String, ()>,
574{
575    let mut decoded_chunks = Vec::new();
576    for candidate in candidates {
577        if let Ok(text) = decode(&candidate)
578            && !text.is_empty()
579            && text
580                .chars()
581                .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
582        {
583            decoded_chunks.push(Chunk {
584                data: text,
585                metadata: ChunkMetadata {
586                    source_type: format!("{}/{}", chunk.metadata.source_type, decoder_name),
587                    path: chunk.metadata.path.clone(),
588                    commit: chunk.metadata.commit.clone(),
589                    author: chunk.metadata.author.clone(),
590                    date: chunk.metadata.date.clone(),
591                },
592            });
593        }
594    }
595    decoded_chunks
596}
597
598fn percent_decode(input: &str) -> Result<String, ()> {
599    let mut bytes = Vec::with_capacity(input.len());
600    let mut i = 0;
601    let input_bytes = input.as_bytes();
602    while i < input_bytes.len() {
603        match input_bytes[i] {
604            b'%' if i + 2 < input_bytes.len() => {
605                let high = hex_val(input_bytes[i + 1])?;
606                let low = hex_val(input_bytes[i + 2])?;
607                bytes.push((high << 4) | low);
608                i += 3;
609            }
610            byte => {
611                bytes.push(byte);
612                i += 1;
613            }
614        }
615    }
616    String::from_utf8(bytes).map_err(|_| ())
617}
618
619fn url_decode(input: &str) -> Result<String, ()> {
620    let decoded = percent_decode(input)?;
621    if contains_percent_escape(&decoded) {
622        percent_decode(&decoded)
623    } else {
624        Ok(decoded)
625    }
626}
627
/// Returns `true` when `input` contains a `%` followed by two hex digits.
fn contains_percent_escape(input: &str) -> bool {
    input.as_bytes().windows(3).any(|window| {
        matches!(window, [b'%', high, low] if high.is_ascii_hexdigit() && low.is_ascii_hexdigit())
    })
}
634
635fn quoted_printable_decode(input: &str) -> Result<String, ()> {
636    let mut bytes = Vec::with_capacity(input.len());
637    let mut i = 0;
638    let input_bytes = input.as_bytes();
639    while i < input_bytes.len() {
640        match input_bytes[i] {
641            b'=' if i + 2 < input_bytes.len() => {
642                if input_bytes[i + 1] == b'\r' && input_bytes[i + 2] == b'\n' {
643                    i += 3;
644                    continue;
645                }
646                let high = hex_val(input_bytes[i + 1])?;
647                let low = hex_val(input_bytes[i + 2])?;
648                bytes.push((high << 4) | low);
649                i += 3;
650            }
651            byte => {
652                bytes.push(byte);
653                i += 1;
654            }
655        }
656    }
657    String::from_utf8(bytes).map_err(|_| ())
658}
659
/// Replaces the named HTML entities `&amp;`, `&lt;`, `&gt;`, `&quot;`,
/// `&apos;` and `&nbsp;` with the characters they stand for.
///
/// Anything else that starts with `&` is copied through unchanged. Returns
/// `Err(())` when no entity was replaced, so callers can tell "nothing to
/// decode" apart from a successful decode.
fn html_named_entity_decode(input: &str) -> Result<String, ()> {
    let mut decoded = String::with_capacity(input.len());
    let mut changed = false;
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '&' {
            decoded.push(ch);
            continue;
        }

        let mut entity = String::new();
        while let Some(&next) = chars.peek() {
            // A new `&` starts another potential entity: leave it for the
            // outer loop instead of swallowing it into this one. Otherwise a
            // stray ampersand (`a&b&amp;c`) would hide the entity after it.
            if next == '&' {
                break;
            }
            entity.push(next);
            chars.next();
            // Known entities are short; stop scanning once the name is too
            // long to match any of them.
            if next == ';' || entity.len() > 10 {
                break;
            }
        }

        let replacement = match entity.as_str() {
            "amp;" => Some('&'),
            "lt;" => Some('<'),
            "gt;" => Some('>'),
            "quot;" => Some('"'),
            "apos;" => Some('\''),
            "nbsp;" => Some('\u{00A0}'),
            _ => None,
        };

        if let Some(replacement) = replacement {
            decoded.push(replacement);
            changed = true;
        } else {
            // Not an entity we know: emit the consumed text verbatim.
            decoded.push('&');
            decoded.push_str(&entity);
        }
    }

    changed.then_some(decoded).ok_or(())
}
701
/// Replaces numeric HTML character references (`&#65;`, `&#x41;`) with the
/// characters they denote.
///
/// A reference is decoded when its digits are followed by `;` or by the end
/// of the input. Malformed references (no digits, or digits followed by any
/// other character) are copied through exactly as written and scanning
/// resumes right after them, so a following reference is still decoded.
///
/// Returns `Err(())` when no reference was replaced, when the number does not
/// fit in a `u32`, or when it is not a valid Unicode scalar value.
fn html_numeric_entity_decode(input: &str) -> Result<String, ()> {
    let mut decoded = String::with_capacity(input.len());
    let mut changed = false;
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '&' || chars.peek() != Some(&'#') {
            decoded.push(ch);
            continue;
        }
        chars.next();

        // Text consumed so far, kept verbatim in case the reference turns out
        // to be malformed and has to be emitted unchanged.
        let mut literal = String::from("&#");
        let hex_marker = chars.peek().copied().filter(|c| matches!(c, 'x' | 'X'));
        if let Some(marker) = hex_marker {
            literal.push(marker);
            chars.next();
        }
        let radix = if hex_marker.is_some() { 16 } else { 10 };

        let mut digits = String::new();
        while let Some(&next) = chars.peek() {
            if !next.is_digit(radix) {
                break;
            }
            digits.push(next);
            chars.next();
        }

        let well_formed = !digits.is_empty() && matches!(chars.peek(), None | Some(';'));
        if !well_formed {
            // The offending character is NOT consumed here: the outer loop
            // handles it, so it is emitted once and may start a new reference.
            decoded.push_str(&literal);
            decoded.push_str(&digits);
            continue;
        }
        if chars.peek() == Some(&';') {
            chars.next();
        }

        let code = u32::from_str_radix(&digits, radix).map_err(|_| ())?;
        let replacement = char::from_u32(code).ok_or(())?;
        decoded.push(replacement);
        changed = true;
    }

    changed.then_some(decoded).ok_or(())
}
760
/// Replaces every `\xNN` escape with the character whose code point is `NN`
/// (so values of 0x80 and above map to U+0080–U+00FF, not to UTF-8 bytes).
///
/// Returns `Err(())` when an escape is not followed by two hex digits, or
/// when the input contains no `\x` escape at all.
fn hex_escape_decode(input: &str) -> Result<String, ()> {
    const MARKER: &str = "\\x";

    let mut decoded = String::with_capacity(input.len());
    let mut rest = input;
    let mut changed = false;

    while let Some(pos) = rest.find(MARKER) {
        decoded.push_str(&rest[..pos]);

        let mut digits = rest[pos + MARKER.len()..].chars();
        let high = digits.next().and_then(|c| c.to_digit(16)).ok_or(())?;
        let low = digits.next().and_then(|c| c.to_digit(16)).ok_or(())?;
        // Two hex digits always fit in one byte.
        decoded.push(char::from(((high << 4) | low) as u8));

        rest = digits.as_str();
        changed = true;
    }
    decoded.push_str(rest);

    changed.then_some(decoded).ok_or(())
}
782
/// Replaces every three-digit octal escape (`\141`) with the character whose
/// code point is that value.
///
/// A backslash followed by a non-octal character is copied through. Returns
/// `Err(())` when:
/// - the input ends with a backslash,
/// - an escape has fewer than three octal digits,
/// - an escape exceeds `\377` (it does not fit in one byte), or
/// - no escape was decoded at all.
fn octal_escape_decode(input: &str) -> Result<String, ()> {
    let mut decoded = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    let mut changed = false;

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            decoded.push(ch);
            continue;
        }

        let Some(&next) = chars.peek() else {
            return Err(());
        };
        if !('0'..='7').contains(&next) {
            decoded.push(ch);
            continue;
        }

        // Accumulate in a wider integer: three octal digits reach 0o777 (511),
        // which would silently wrap in a `u8`.
        let mut value = 0u32;
        for _ in 0..3 {
            let digit = chars.next().ok_or(())?;
            let digit = digit.to_digit(8).ok_or(())?;
            value = (value << 3) | digit;
        }
        if value > u32::from(u8::MAX) {
            return Err(());
        }
        // Range-checked above, so the narrowing cast is lossless.
        decoded.push(char::from(value as u8));
        changed = true;
    }

    changed.then_some(decoded).ok_or(())
}
814
/// Returns `true` when `input` contains a backslash followed by three octal
/// digits.
fn contains_octal_escape(input: &str) -> bool {
    input.as_bytes().windows(4).any(|window| {
        window[0] == b'\\'
            && window[1..]
                .iter()
                .all(|digit| matches!(digit, b'0'..=b'7'))
    })
}
824
825fn mime_encoded_word_decode(input: &str) -> Result<String, ()> {
826    if !input.starts_with("=?") || !input.ends_with("?=") {
827        return Err(());
828    }
829
830    let inner = &input[2..input.len() - 2];
831    let mut parts = inner.splitn(3, '?');
832    let _charset = parts.next().ok_or(())?;
833    let encoding = parts.next().ok_or(())?;
834    let encoded = parts.next().ok_or(())?;
835
836    let bytes = match encoding {
837        "B" | "b" => base64_decode(encoded)?,
838        "Q" | "q" => mime_q_decode(encoded)?,
839        _ => return Err(()),
840    };
841
842    String::from_utf8(bytes).map_err(|_| ())
843}
844
845fn mime_q_decode(input: &str) -> Result<Vec<u8>, ()> {
846    let normalized = input.replace('_', " ");
847    let mut bytes = Vec::with_capacity(normalized.len());
848    let mut i = 0;
849    let input_bytes = normalized.as_bytes();
850
851    while i < input_bytes.len() {
852        match input_bytes[i] {
853            b'=' if i + 2 < input_bytes.len() => {
854                let high = hex_val(input_bytes[i + 1])?;
855                let low = hex_val(input_bytes[i + 2])?;
856                bytes.push((high << 4) | low);
857                i += 3;
858            }
859            byte => {
860                bytes.push(byte);
861                i += 1;
862            }
863        }
864    }
865
866    Ok(bytes)
867}
868
/// Finds MIME encoded-words (`=?charset?X?text?=`, `X` a single-character
/// encoding tag) in a line and returns them in order of appearance.
///
/// The closing `?=` is searched for only after the charset and encoding
/// fields. Searching from just behind the opening `=?` would stop at the
/// `?=` formed by the encoding separator and a payload starting with `=`,
/// which is what every Q-encoded word beginning with an escaped byte looks
/// like (`=?UTF-8?Q?=41…?=`).
fn find_mime_encoded_words(line: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut offset = 0;

    while let Some(start) = line[offset..].find("=?") {
        let word_start = offset + start;
        let fields_start = word_start + 2;

        // If the remaining text lacks the two field separators, no later
        // encoded-word can be complete either.
        let Some((charset, after_charset)) = line[fields_start..].split_once('?') else {
            break;
        };
        let Some((encoding, text_and_tail)) = after_charset.split_once('?') else {
            break;
        };

        if encoding.len() != 1 {
            // Not an encoded-word header; resume right behind this `=?` so an
            // encoded-word starting inside the skipped text is still found.
            offset = fields_start;
            continue;
        }

        // Without a terminator here there is none for any later word.
        let Some(text_len) = text_and_tail.find("?=") else {
            break;
        };

        let word_end = fields_start + charset.len() + 1 + encoding.len() + 1 + text_len + 2;
        words.push(line[word_start..word_end].to_string());
        offset = word_end;
    }

    words
}
886
887fn unicode_escape_decode(input: &str) -> Result<String, ()> {
888    let mut decoded_text = String::with_capacity(input.len());
889    let mut chars = input.chars().peekable();
890
891    while let Some(ch) = chars.next() {
892        if ch != '\\' {
893            decoded_text.push(ch);
894            continue;
895        }
896
897        match chars.next() {
898            Some('u') => {
899                let code = take_hex_digits(&mut chars, 4)?;
900                let ch = char::from_u32(code).ok_or(())?;
901                decoded_text.push(ch);
902            }
903            Some('x') => {
904                let code = take_hex_digits(&mut chars, 2)?;
905                decoded_text.push(char::from_u32(code).ok_or(())?);
906            }
907            Some(escaped) => decoded_text.push(escaped),
908            None => return Err(()),
909        }
910    }
911
912    Ok(decoded_text)
913}
914
/// Consumes exactly `count` characters from `chars` and interprets them as a
/// big-endian hexadecimal number.
///
/// Returns `Err(())` when the iterator runs out or a character is not a hex
/// digit; the offending character has already been consumed in that case.
fn take_hex_digits<I>(chars: &mut std::iter::Peekable<I>, count: usize) -> Result<u32, ()>
where
    I: Iterator<Item = char>,
{
    (0..count).try_fold(0u32, |acc, _| -> Result<u32, ()> {
        let digit = chars.next().ok_or(())?.to_digit(16).ok_or(())?;
        Ok((acc << 4) | digit)
    })
}
927
#[cfg(test)]
mod tests {
    use super::*;

    /// Plaintext that most of the text-escape decoders are expected to recover.
    const GITHUB_TOKEN: &str = "ghp_abcdefghijklmnopqrstuvwxyz1234567890";

    /// Padded base64 of "sk-proj-abc123" decodes back to that text.
    #[test]
    fn decode_base64_secret() {
        let bytes = base64_decode("c2stcHJvai1hYmMxMjM=").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        assert_eq!(text, "sk-proj-abc123");
    }

    /// Hex of "sk-proj-abc" decodes back to that text.
    #[test]
    fn decode_hex_secret() {
        let bytes = hex_decode("736b2d70726f6a2d616263").unwrap();
        let text = String::from_utf8(bytes).unwrap();
        assert_eq!(text, "sk-proj-abc");
    }

    /// The same base64 payload without its trailing padding still decodes.
    #[test]
    fn decode_url_safe_base64() {
        let unpadded = "c2stcHJvai1hYmMxMjM";
        let bytes = base64_decode(unpadded).unwrap();
        let text = String::from_utf8(bytes).unwrap();
        assert_eq!(text, "sk-proj-abc123");
    }

    /// A quoted base64 value inside an assignment is reported twice, both
    /// times with the same value.
    #[test]
    fn find_base64_in_text() {
        let found = find_base64_strings(r#"TOKEN = "c2stcHJvai1hYmMxMjM=""#, 10);
        assert_eq!(found.len(), 2);
        for hit in &found {
            assert_eq!(hit.value, "c2stcHJvai1hYmMxMjM=");
        }
    }

    /// `decode_chunk` yields a chunk with the decoded secret and tags its
    /// source type with the decoder that produced it.
    #[test]
    fn decode_chunk_finds_encoded_secret() {
        let metadata = ChunkMetadata {
            source_type: "test".into(),
            path: Some("test.env".into()),
            commit: None,
            author: None,
            date: None,
        };
        let chunk = Chunk {
            data: String::from("SECRET=c2stcHJvai1hYmMxMjM=\n"),
            metadata,
        };

        let results = decode_chunk(&chunk);
        assert!(!results.is_empty());

        let first = &results[0];
        assert!(first.data.contains("sk-proj-abc123"));
        assert!(first.metadata.source_type.contains("base64"));
    }

    /// Percent-encoded bytes are replaced by the characters they stand for.
    #[test]
    fn decode_url_encoded_secret() {
        let plain = percent_decode("ghp_%61%62%63defghijklmnopqrstuvwxyz1234567890").unwrap();
        assert_eq!(plain, GITHUB_TOKEN);
    }

    /// `\uXXXX` escapes are replaced by the characters they stand for.
    #[test]
    fn decode_unicode_escaped_secret() {
        let escaped = "\\u0067\\u0068\\u0070\\u005Fabcdefghijklmnopqrstuvwxyz1234567890";
        let plain = unicode_escape_decode(escaped).unwrap();
        assert_eq!(plain, GITHUB_TOKEN);
    }

    /// A quoted-printable `=5F` is decoded to an underscore.
    #[test]
    fn decode_quoted_printable_secret() {
        let plain = quoted_printable_decode("ghp=5Fabcdefghijklmnopqrstuvwxyz1234567890").unwrap();
        assert_eq!(plain, GITHUB_TOKEN);
    }

    /// Doubly percent-encoded input (`%25` hiding a `%`) is fully decoded.
    #[test]
    fn decode_double_url_encoded_secret() {
        let twice_encoded = "%2567%2568%2570%255Fabcdefghijklmnopqrstuvwxyz1234567890";
        let plain = url_decode(twice_encoded).unwrap();
        assert_eq!(plain, GITHUB_TOKEN);
    }

    /// The named HTML entities used here map to their literal characters.
    #[test]
    fn decode_html_named_entities() {
        let markup = "&lt;tag&gt;&amp;&quot;&apos;&nbsp;";
        let plain = html_named_entity_decode(markup).unwrap();
        assert_eq!(plain, "<tag>&\"'\u{00A0}");
    }

    /// Decimal (`&#NNN;`) and hexadecimal (`&#xHH;`) entities are both decoded.
    #[test]
    fn decode_html_numeric_entities() {
        let markup = "&#103;&#104;&#112;&#95;&#x61;&#x62;&#x63;defghijklmnopqrstuvwxyz1234567890";
        let plain = html_numeric_entity_decode(markup).unwrap();
        assert_eq!(plain, GITHUB_TOKEN);
    }

    /// `\xHH` escapes are replaced by the characters they stand for.
    #[test]
    fn decode_hex_escape_secret() {
        let escaped = "\\x67\\x68\\x70\\x5Fabcdefghijklmnopqrstuvwxyz1234567890";
        let plain = hex_escape_decode(escaped).unwrap();
        assert_eq!(plain, GITHUB_TOKEN);
    }

    /// Three-digit octal escapes are replaced by the characters they stand for.
    #[test]
    fn decode_octal_escape_secret() {
        let escaped = "\\147\\150\\160\\137abcdefghijklmnopqrstuvwxyz1234567890";
        let plain = octal_escape_decode(escaped).unwrap();
        assert_eq!(plain, GITHUB_TOKEN);
    }

    /// A `B`-encoded MIME word yields its base64 payload as text.
    #[test]
    fn decode_mime_encoded_word_base64_secret() {
        let plain = mime_encoded_word_decode("=?utf-8?B?c2stcHJvai1hYmMxMjM=?=").unwrap();
        assert_eq!(plain, "sk-proj-abc123");
    }

    /// A `Q`-encoded MIME word has its `=2D` escapes turned into hyphens.
    #[test]
    fn decode_mime_encoded_word_q_secret() {
        let word = "=?utf-8?Q?xoxb=2DEXAMPLE1234=2DEXAMPLE5678=2DExAmPlEtOkEnVaLuEhErE?=";
        let plain = mime_encoded_word_decode(word).unwrap();
        assert_eq!(plain, "xoxb-EXAMPLE1234-EXAMPLE5678-ExAmPlEtOkEnVaLuEhErE");
    }

    /// Padding at the start, in the middle, or three characters long is not
    /// accepted as base64.
    #[test]
    fn rejects_base64_with_non_terminal_padding() {
        for candidate in ["=abc", "ab=c", "abc==="] {
            assert!(
                classify_base64(candidate).is_none(),
                "{:?} should not classify as base64",
                candidate
            );
        }
    }
}