1use crate::context;
9use keyhog_core::{Chunk, ChunkMetadata};
10use std::collections::{HashSet, VecDeque};
11
12pub trait Decoder: Send + Sync {
28 fn name(&self) -> &'static str;
29 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk>;
30}
31
32struct Base64Decoder;
33impl Decoder for Base64Decoder {
34 fn name(&self) -> &'static str {
35 "base64"
36 }
37 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
38 let mut decoded_chunks = Vec::new();
39 let lines: Vec<&str> = chunk.data.lines().collect();
40 for (line_idx, line) in lines.iter().enumerate() {
41 if context::is_false_positive_context(&lines, line_idx, chunk.metadata.path.as_deref())
42 {
43 continue;
44 }
45 for b64_match in find_base64_strings(line, 20) {
46 match base64_decode(&b64_match.value) {
47 Ok(decoded) => match String::from_utf8(decoded) {
48 Ok(text)
49 if text.chars().all(|c| {
50 !c.is_control() || c == '\n' || c == '\r' || c == '\t'
51 }) =>
52 {
53 decoded_chunks.push(Chunk {
54 data: text,
55 metadata: ChunkMetadata {
56 source_type: format!("{}/base64", chunk.metadata.source_type),
57 path: chunk.metadata.path.clone(),
58 commit: chunk.metadata.commit.clone(),
59 author: chunk.metadata.author.clone(),
60 date: chunk.metadata.date.clone(),
61 },
62 });
63 }
64 Ok(_) => {
65 tracing::trace!(
66 path = ?chunk.metadata.path,
67 "base64 decoded to text with control characters, skipping"
68 );
69 }
70 Err(_) => {
71 tracing::trace!(
72 path = ?chunk.metadata.path,
73 "base64 decoded to non-UTF-8 bytes, skipping"
74 );
75 }
76 },
77 Err(()) => {
78 tracing::trace!(
79 path = ?chunk.metadata.path,
80 candidate_len = b64_match.value.len(),
81 "base64 decode failed for candidate"
82 );
83 }
84 }
85 }
86 }
87 decoded_chunks
88 }
89}
90
91struct HexDecoder;
92impl Decoder for HexDecoder {
93 fn name(&self) -> &'static str {
94 "hex"
95 }
96 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
97 let mut decoded_chunks = Vec::new();
98 for hex_match in find_hex_strings(&chunk.data, 40) {
99 if let Ok(decoded) = hex_decode(&hex_match.value)
100 && let Ok(text) = String::from_utf8(decoded)
101 && text
102 .chars()
103 .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
104 {
105 decoded_chunks.push(Chunk {
106 data: text,
107 metadata: ChunkMetadata {
108 source_type: format!("{}/hex", chunk.metadata.source_type),
109 path: chunk.metadata.path.clone(),
110 commit: chunk.metadata.commit.clone(),
111 author: chunk.metadata.author.clone(),
112 date: chunk.metadata.date.clone(),
113 },
114 });
115 }
116 }
117 decoded_chunks
118 }
119}
120
121struct UrlDecoder;
122impl Decoder for UrlDecoder {
123 fn name(&self) -> &'static str {
124 "url"
125 }
126 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
127 decode_candidates(
128 chunk,
129 extract_encoded_values(&chunk.data)
130 .into_iter()
131 .filter(|candidate| candidate.contains('%'))
132 .collect(),
133 url_decode,
134 self.name(),
135 )
136 }
137}
138
139struct QuotedPrintableDecoder;
140impl Decoder for QuotedPrintableDecoder {
141 fn name(&self) -> &'static str {
142 "quoted-printable"
143 }
144 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
145 let mut decoded_chunks = Vec::new();
146 let lines: Vec<&str> = chunk.data.lines().collect();
147 for (line_idx, line) in lines.iter().enumerate() {
148 if context::is_false_positive_context(&lines, line_idx, chunk.metadata.path.as_deref())
149 {
150 continue;
151 }
152 let mut candidates = extract_encoded_values(line);
153 let trimmed = line.trim();
154 if trimmed.contains('=') && !trimmed.is_empty() {
155 candidates.push(trimmed.to_string());
156 }
157 decoded_chunks.extend(decode_candidates(
158 chunk,
159 candidates
160 .into_iter()
161 .filter(|candidate| candidate.contains('='))
162 .collect(),
163 quoted_printable_decode,
164 self.name(),
165 ));
166 }
167 decoded_chunks
168 }
169}
170
171struct HtmlNamedEntityDecoder;
172impl Decoder for HtmlNamedEntityDecoder {
173 fn name(&self) -> &'static str {
174 "html-named-entity"
175 }
176 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
177 let mut candidates = extract_encoded_values(&chunk.data);
178 let trimmed = chunk.data.trim();
179 if trimmed.contains('&') && !trimmed.is_empty() {
180 candidates.push(trimmed.to_string());
181 }
182 decode_candidates(
183 chunk,
184 candidates
185 .into_iter()
186 .filter(|candidate| candidate.contains('&'))
187 .collect(),
188 html_named_entity_decode,
189 self.name(),
190 )
191 }
192}
193
194struct HtmlNumericEntityDecoder;
195impl Decoder for HtmlNumericEntityDecoder {
196 fn name(&self) -> &'static str {
197 "html-numeric-entity"
198 }
199 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
200 let mut candidates = extract_encoded_values(&chunk.data);
201 let trimmed = chunk.data.trim();
202 if trimmed.contains("&#") && !trimmed.is_empty() {
203 candidates.push(trimmed.to_string());
204 }
205 decode_candidates(
206 chunk,
207 candidates
208 .into_iter()
209 .filter(|candidate| candidate.contains("&#"))
210 .collect(),
211 html_numeric_entity_decode,
212 self.name(),
213 )
214 }
215}
216
217struct HexEscapeDecoder;
218impl Decoder for HexEscapeDecoder {
219 fn name(&self) -> &'static str {
220 "hex-escape"
221 }
222 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
223 let mut candidates = extract_encoded_values(&chunk.data);
224 let trimmed = chunk.data.trim();
225 if trimmed.contains("\\x") && !trimmed.is_empty() {
226 candidates.push(trimmed.to_string());
227 }
228 decode_candidates(
229 chunk,
230 candidates
231 .into_iter()
232 .filter(|candidate| candidate.contains("\\x"))
233 .collect(),
234 hex_escape_decode,
235 self.name(),
236 )
237 }
238}
239
240struct OctalEscapeDecoder;
241impl Decoder for OctalEscapeDecoder {
242 fn name(&self) -> &'static str {
243 "octal-escape"
244 }
245 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
246 let mut candidates = extract_encoded_values(&chunk.data);
247 let trimmed = chunk.data.trim();
248 if trimmed.contains('\\') && !trimmed.is_empty() {
249 candidates.push(trimmed.to_string());
250 }
251 decode_candidates(
252 chunk,
253 candidates
254 .into_iter()
255 .filter(|candidate| contains_octal_escape(candidate))
256 .collect(),
257 octal_escape_decode,
258 self.name(),
259 )
260 }
261}
262
263struct MimeEncodedWordDecoder;
264impl Decoder for MimeEncodedWordDecoder {
265 fn name(&self) -> &'static str {
266 "mime-encoded-word"
267 }
268 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
269 let mut candidates = Vec::new();
270 for line in chunk.data.lines() {
271 candidates.extend(find_mime_encoded_words(line));
272 }
273 decode_candidates(chunk, candidates, mime_encoded_word_decode, self.name())
274 }
275}
276
277struct UnicodeEscapeDecoder;
278impl Decoder for UnicodeEscapeDecoder {
279 fn name(&self) -> &'static str {
280 "unicode-escape"
281 }
282 fn decode_chunk(&self, chunk: &Chunk) -> Vec<Chunk> {
283 decode_candidates(
284 chunk,
285 extract_encoded_values(&chunk.data)
286 .into_iter()
287 .filter(|candidate| candidate.contains("\\u") || candidate.contains("\\x"))
288 .collect(),
289 unicode_escape_decode,
290 self.name(),
291 )
292 }
293}
294
295static DECODERS: std::sync::OnceLock<std::sync::RwLock<Vec<Box<dyn Decoder>>>> =
296 std::sync::OnceLock::new();
297
298fn get_decoders() -> &'static std::sync::RwLock<Vec<Box<dyn Decoder>>> {
299 DECODERS.get_or_init(|| {
300 std::sync::RwLock::new(vec![
301 Box::new(Base64Decoder),
302 Box::new(HexDecoder),
303 Box::new(UrlDecoder),
304 Box::new(QuotedPrintableDecoder),
305 Box::new(HtmlNamedEntityDecoder),
306 Box::new(HtmlNumericEntityDecoder),
307 Box::new(HexEscapeDecoder),
308 Box::new(OctalEscapeDecoder),
309 Box::new(MimeEncodedWordDecoder),
310 Box::new(UnicodeEscapeDecoder),
311 ])
312 })
313}
314
315pub fn register_decoder(decoder: Box<dyn Decoder>) {
333 let mut registry = get_decoders()
334 .write()
335 .unwrap_or_else(|poisoned| poisoned.into_inner());
336 registry.push(decoder);
337}
338
339const MAX_DECODE_DEPTH: usize = 2;
350
351pub fn decode_chunk(chunk: &Chunk) -> Vec<Chunk> {
378 let mut decoded_chunks = Vec::new();
379 let mut queue = VecDeque::from([(chunk.clone(), 0usize)]);
380 let mut seen = HashSet::from([chunk.data.clone()]);
381 let registry = get_decoders()
382 .read()
383 .unwrap_or_else(|poisoned| poisoned.into_inner());
384
385 while let Some((current, depth)) = queue.pop_front() {
386 if depth >= MAX_DECODE_DEPTH {
387 continue;
388 }
389 for decoder in registry.iter() {
390 for decoded in decoder.decode_chunk(¤t) {
391 if seen.insert(decoded.data.clone()) {
392 queue.push_back((decoded.clone(), depth + 1));
393 decoded_chunks.push(decoded);
394 }
395 }
396 }
397 }
398 decoded_chunks
399}
400
/// A candidate encoded value located in scanned text.
struct EncodedString {
    /// The candidate text exactly as extracted, still encoded.
    value: String,
}
404
405fn find_base64_strings(text: &str, min_length: usize) -> Vec<EncodedString> {
407 let mut results = Vec::new();
408 let b64_chars = |c: char| {
409 c.is_ascii_alphanumeric() || c == '+' || c == '/' || c == '=' || c == '-' || c == '_'
410 };
411
412 for line in text.lines() {
413 let candidates = extract_encoded_values(line);
415 for candidate in candidates {
416 if candidate.len() >= min_length
417 && candidate.chars().all(b64_chars)
418 && classify_base64(candidate.as_str()).is_some()
419 {
420 results.push(EncodedString { value: candidate });
421 }
422 }
423 }
424
425 results
426}
427
428fn find_hex_strings(text: &str, min_length: usize) -> Vec<EncodedString> {
430 let mut results = Vec::new();
431
432 for line in text.lines() {
433 let candidates = extract_encoded_values(line);
434 for candidate in candidates {
435 if candidate.len() >= min_length
436 && candidate.len() % 2 == 0
437 && candidate.chars().all(|c| c.is_ascii_hexdigit())
438 {
439 results.push(EncodedString { value: candidate });
440 }
441 }
442 }
443
444 results
445}
446
/// Pulls the substrings out of `line` that are likely to hold a value.
///
/// Two sources are used, in this order:
/// 1. everything after the first `:` (or, when there is no `:`, the first
///    `=`), trimmed of whitespace and of surrounding `"`, `'` and backtick
///    characters;
/// 2. the contents of each `"…"` pair, then of each `'…'` pair, where quotes
///    are paired up strictly in order of appearance.
///
/// Empty values are dropped. The same text may be returned more than once.
fn extract_encoded_values(line: &str) -> Vec<String> {
    let mut values = Vec::new();

    let separator = line.find(':').or_else(|| line.find('='));
    if let Some(at) = separator {
        let rhs = line[at + 1..]
            .trim()
            .trim_matches(&['"', '\'', '`'][..]);
        if !rhs.is_empty() {
            values.push(rhs.to_string());
        }
    }

    for quote in ['"', '\''] {
        // Quotes are ASCII, so `open + 1` is always a valid slice boundary.
        let marks: Vec<usize> = line.match_indices(quote).map(|(at, _)| at).collect();
        values.extend(
            marks
                .chunks_exact(2)
                .map(|pair| &line[pair[0] + 1..pair[1]])
                .filter(|inner| !inner.is_empty())
                .map(str::to_string),
        );
    }

    values
}
483
/// The base64 flavour a candidate should be decoded with.
#[derive(Copy, Clone)]
enum Base64Variant {
    /// Standard alphabet (`+`, `/`) with `=` padding.
    Standard,
    /// Standard alphabet without padding.
    StandardNoPad,
    /// URL-safe alphabet (`-`, `_`) with `=` padding.
    UrlSafe,
    /// URL-safe alphabet without padding.
    UrlSafeNoPad,
}
491
492fn classify_base64(candidate: &str) -> Option<Base64Variant> {
493 if !has_valid_base64_padding(candidate) {
494 return None;
495 }
496
497 let has_standard = candidate.contains('+') || candidate.contains('/');
498 let has_urlsafe = candidate.contains('-') || candidate.contains('_');
499 if has_standard && has_urlsafe {
500 return None;
501 }
502
503 let padded = candidate.contains('=');
504 match (has_urlsafe, padded, candidate.len() % 4) {
505 (_, true, 0) => Some(if has_urlsafe {
506 Base64Variant::UrlSafe
507 } else {
508 Base64Variant::Standard
509 }),
510 (_, true, _) => None,
511 (_, false, 1) => None,
512 (true, false, _) => Some(Base64Variant::UrlSafeNoPad),
513 (false, false, 0) => Some(Base64Variant::Standard),
514 (false, false, _) => Some(Base64Variant::StandardNoPad),
515 }
516}
517
/// Checks that any `=` padding in `candidate` is well-formed.
///
/// Returns `true` when there is no `=` at all, or when the `=` characters
/// form a suffix of one or two characters preceded by at least one other
/// character.
fn has_valid_base64_padding(candidate: &str) -> bool {
    match candidate.split_once('=') {
        None => true,
        // `body` is everything before the first `=`; `rest` is everything
        // after it, which may hold at most one further `=`.
        Some((body, rest)) => {
            !body.is_empty() && rest.len() <= 1 && rest.bytes().all(|b| b == b'=')
        }
    }
}
530
531fn base64_decode(input: &str) -> Result<Vec<u8>, ()> {
532 use base64::{Engine, engine::general_purpose};
533
534 let variant = classify_base64(input).ok_or(())?;
535 match variant {
536 Base64Variant::Standard => general_purpose::STANDARD.decode(input),
537 Base64Variant::StandardNoPad => general_purpose::STANDARD_NO_PAD.decode(input),
538 Base64Variant::UrlSafe => general_purpose::URL_SAFE.decode(input),
539 Base64Variant::UrlSafeNoPad => general_purpose::URL_SAFE_NO_PAD.decode(input),
540 }
541 .map_err(|_| ())
542}
543
544fn hex_decode(input: &str) -> Result<Vec<u8>, ()> {
545 if !input.len().is_multiple_of(2) {
546 return Err(());
547 }
548 let mut decoded_bytes = Vec::with_capacity(input.len() / 2);
549 for offset in (0..input.len()).step_by(2) {
550 let high = hex_val(input.as_bytes()[offset])?;
551 let low = hex_val(input.as_bytes()[offset + 1])?;
552 decoded_bytes.push((high << 4) | low);
553 }
554 Ok(decoded_bytes)
555}
556
/// Converts one ASCII hex digit (`0-9`, `a-f`, `A-F`) to its value `0..=15`.
///
/// Any other byte yields `Err(())`.
fn hex_val(b: u8) -> Result<u8, ()> {
    // `to_digit(16)` accepts exactly the ASCII hex digits; bytes >= 0x80 map
    // to non-ASCII chars and are rejected.
    char::from(b)
        .to_digit(16)
        .map(|digit| digit as u8)
        .ok_or(())
}
565
566fn decode_candidates<F>(
567 chunk: &Chunk,
568 candidates: Vec<String>,
569 mut decode: F,
570 decoder_name: &str,
571) -> Vec<Chunk>
572where
573 F: FnMut(&str) -> Result<String, ()>,
574{
575 let mut decoded_chunks = Vec::new();
576 for candidate in candidates {
577 if let Ok(text) = decode(&candidate)
578 && !text.is_empty()
579 && text
580 .chars()
581 .all(|c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
582 {
583 decoded_chunks.push(Chunk {
584 data: text,
585 metadata: ChunkMetadata {
586 source_type: format!("{}/{}", chunk.metadata.source_type, decoder_name),
587 path: chunk.metadata.path.clone(),
588 commit: chunk.metadata.commit.clone(),
589 author: chunk.metadata.author.clone(),
590 date: chunk.metadata.date.clone(),
591 },
592 });
593 }
594 }
595 decoded_chunks
596}
597
598fn percent_decode(input: &str) -> Result<String, ()> {
599 let mut bytes = Vec::with_capacity(input.len());
600 let mut i = 0;
601 let input_bytes = input.as_bytes();
602 while i < input_bytes.len() {
603 match input_bytes[i] {
604 b'%' if i + 2 < input_bytes.len() => {
605 let high = hex_val(input_bytes[i + 1])?;
606 let low = hex_val(input_bytes[i + 2])?;
607 bytes.push((high << 4) | low);
608 i += 3;
609 }
610 byte => {
611 bytes.push(byte);
612 i += 1;
613 }
614 }
615 }
616 String::from_utf8(bytes).map_err(|_| ())
617}
618
619fn url_decode(input: &str) -> Result<String, ()> {
620 let decoded = percent_decode(input)?;
621 if contains_percent_escape(&decoded) {
622 percent_decode(&decoded)
623 } else {
624 Ok(decoded)
625 }
626}
627
/// Reports whether `input` contains a `%` immediately followed by two hex
/// digits.
fn contains_percent_escape(input: &str) -> bool {
    input.as_bytes().windows(3).any(|window| {
        matches!(
            window,
            [b'%', high, low] if high.is_ascii_hexdigit() && low.is_ascii_hexdigit()
        )
    })
}
634
635fn quoted_printable_decode(input: &str) -> Result<String, ()> {
636 let mut bytes = Vec::with_capacity(input.len());
637 let mut i = 0;
638 let input_bytes = input.as_bytes();
639 while i < input_bytes.len() {
640 match input_bytes[i] {
641 b'=' if i + 2 < input_bytes.len() => {
642 if input_bytes[i + 1] == b'\r' && input_bytes[i + 2] == b'\n' {
643 i += 3;
644 continue;
645 }
646 let high = hex_val(input_bytes[i + 1])?;
647 let low = hex_val(input_bytes[i + 2])?;
648 bytes.push((high << 4) | low);
649 i += 3;
650 }
651 byte => {
652 bytes.push(byte);
653 i += 1;
654 }
655 }
656 }
657 String::from_utf8(bytes).map_err(|_| ())
658}
659
/// Replaces the HTML named character references `&amp;`, `&lt;`, `&gt;`,
/// `&quot;`, `&apos;` and `&nbsp;` in `input`.
///
/// Everything else, including unknown or unterminated references and bare
/// `&` characters, is copied through unchanged.
///
/// Returns `Err(())` when no reference was replaced, so callers can tell
/// "nothing to decode" apart from a real result.
fn html_named_entity_decode(input: &str) -> Result<String, ()> {
    // Longest name that is read before a reference is given up on.
    const MAX_ENTITY_NAME_LEN: usize = 10;

    let mut decoded = String::with_capacity(input.len());
    let mut changed = false;
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '&' {
            decoded.push(ch);
            continue;
        }

        // Only consume characters that can belong to an entity name. In
        // particular a following `&` is left in the stream, so a bare
        // ampersand can never swallow a real entity that comes after it.
        let mut name = String::new();
        while name.len() < MAX_ENTITY_NAME_LEN {
            match chars.next_if(|c| c.is_ascii_alphanumeric()) {
                Some(c) => name.push(c),
                None => break,
            }
        }
        let terminated = chars.next_if_eq(&';').is_some();

        let replacement = if terminated {
            match name.as_str() {
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "quot" => Some('"'),
                "apos" => Some('\''),
                "nbsp" => Some('\u{00A0}'),
                _ => None,
            }
        } else {
            None
        };

        match replacement {
            Some(replacement) => {
                decoded.push(replacement);
                changed = true;
            }
            None => {
                // Not a known reference: re-emit exactly what was consumed.
                decoded.push('&');
                decoded.push_str(&name);
                if terminated {
                    decoded.push(';');
                }
            }
        }
    }

    changed.then_some(decoded).ok_or(())
}
701
/// Replaces HTML numeric character references (`&#65;`, `&#x41;`) in
/// `input` with the characters they denote.
///
/// A reference is decoded when it has at least one digit and is closed by
/// `;` or by the end of the input. Malformed references (no digits, or
/// digits followed by some other character) are copied through exactly as
/// written, and the character that ended them is processed normally, so it
/// can itself start a new reference.
///
/// Returns `Err(())` when nothing was replaced, when a number does not fit
/// in `u32`, or when it is not a valid Unicode scalar value.
fn html_numeric_entity_decode(input: &str) -> Result<String, ()> {
    let mut decoded = String::with_capacity(input.len());
    let mut changed = false;
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '&' || chars.peek() != Some(&'#') {
            decoded.push(ch);
            continue;
        }
        chars.next(); // consume '#'

        let hex_marker = chars.next_if(|c| *c == 'x' || *c == 'X');
        let radix = if hex_marker.is_some() { 16 } else { 10 };

        let mut digits = String::new();
        while let Some(digit) = chars.next_if(|c| c.is_digit(radix)) {
            digits.push(digit);
        }
        let terminated = chars.next_if_eq(&';').is_some();

        let well_formed = !digits.is_empty() && (terminated || chars.peek().is_none());
        if !well_formed {
            // Re-emit what was consumed, once, byte for byte. The character
            // that ended the digits is still in the stream.
            decoded.push_str("&#");
            if let Some(marker) = hex_marker {
                decoded.push(marker);
            }
            decoded.push_str(&digits);
            if terminated {
                decoded.push(';');
            }
            continue;
        }

        let code = u32::from_str_radix(&digits, radix).map_err(|_| ())?;
        decoded.push(char::from_u32(code).ok_or(())?);
        changed = true;
    }

    changed.then_some(decoded).ok_or(())
}
760
/// Decodes `\xNN` escapes in `input`; each escape becomes the character
/// whose code point equals the byte value.
///
/// All other text, including backslashes that do not start `\x`, is copied
/// unchanged. Returns `Err(())` when a `\x` is not followed by two hex
/// digits, or when the input contains no escape at all.
fn hex_escape_decode(input: &str) -> Result<String, ()> {
    fn nibble(next: Option<char>) -> Result<u32, ()> {
        next.and_then(|c| c.to_digit(16)).ok_or(())
    }

    let mut out = String::with_capacity(input.len());
    let mut escapes_seen = 0usize;
    let mut stream = input.chars().peekable();

    while let Some(current) = stream.next() {
        let is_escape = current == '\\' && stream.next_if_eq(&'x').is_some();
        if !is_escape {
            out.push(current);
            continue;
        }

        let high = nibble(stream.next())?;
        let low = nibble(stream.next())?;
        // Two nibbles never exceed 0xFF, so the cast is lossless.
        out.push(char::from(((high << 4) | low) as u8));
        escapes_seen += 1;
    }

    if escapes_seen == 0 { Err(()) } else { Ok(out) }
}
782
/// Decodes three-digit octal escapes (`\NNN`) in `input`; each escape
/// becomes the character whose code point equals the byte value.
///
/// A backslash followed by anything other than an octal digit is copied
/// through unchanged.
///
/// Returns `Err(())` when:
/// - the input ends with a backslash,
/// - an escape has fewer than three octal digits,
/// - an escape exceeds `\377` (it does not fit in one byte), or
/// - the input contains no octal escape at all.
fn octal_escape_decode(input: &str) -> Result<String, ()> {
    let mut decoded = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    let mut changed = false;

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            decoded.push(ch);
            continue;
        }

        let Some(&next) = chars.peek() else {
            return Err(());
        };
        if !('0'..='7').contains(&next) {
            decoded.push(ch);
            continue;
        }

        // Accumulate in a wider type: three octal digits reach 0o777, and a
        // `u8` accumulator would silently drop the high bits (`\501` would
        // turn into `A`).
        let mut value = 0u32;
        for _ in 0..3 {
            let digit = chars.next().and_then(|c| c.to_digit(8)).ok_or(())?;
            value = (value << 3) | digit;
        }
        if value > u32::from(u8::MAX) {
            return Err(());
        }
        decoded.push(char::from(value as u8));
        changed = true;
    }

    changed.then_some(decoded).ok_or(())
}
814
/// Reports whether `input` contains a backslash immediately followed by
/// three octal digits.
fn contains_octal_escape(input: &str) -> bool {
    input.as_bytes().windows(4).any(|window| {
        window[0] == b'\\' && window[1..].iter().all(|b| matches!(*b, b'0'..=b'7'))
    })
}
824
825fn mime_encoded_word_decode(input: &str) -> Result<String, ()> {
826 if !input.starts_with("=?") || !input.ends_with("?=") {
827 return Err(());
828 }
829
830 let inner = &input[2..input.len() - 2];
831 let mut parts = inner.splitn(3, '?');
832 let _charset = parts.next().ok_or(())?;
833 let encoding = parts.next().ok_or(())?;
834 let encoded = parts.next().ok_or(())?;
835
836 let bytes = match encoding {
837 "B" | "b" => base64_decode(encoded)?,
838 "Q" | "q" => mime_q_decode(encoded)?,
839 _ => return Err(()),
840 };
841
842 String::from_utf8(bytes).map_err(|_| ())
843}
844
845fn mime_q_decode(input: &str) -> Result<Vec<u8>, ()> {
846 let normalized = input.replace('_', " ");
847 let mut bytes = Vec::with_capacity(normalized.len());
848 let mut i = 0;
849 let input_bytes = normalized.as_bytes();
850
851 while i < input_bytes.len() {
852 match input_bytes[i] {
853 b'=' if i + 2 < input_bytes.len() => {
854 let high = hex_val(input_bytes[i + 1])?;
855 let low = hex_val(input_bytes[i + 2])?;
856 bytes.push((high << 4) | low);
857 i += 3;
858 }
859 byte => {
860 bytes.push(byte);
861 i += 1;
862 }
863 }
864 }
865
866 Ok(bytes)
867}
868
/// Finds MIME encoded-words (`=?charset?encoding?text?=`) in `line` and
/// returns them, delimiters included, in order of appearance.
///
/// The three fields are located structurally: the closing `?=` is searched
/// for only after the second field separator. A Q-encoded text that starts
/// with an escape (`=?utf-8?Q?=78…?=`) is therefore returned whole instead
/// of being cut off at `?Q?=`.
///
/// A `=?` whose charset or encoding field is empty, or contains `=`,
/// whitespace or non-ASCII characters, is not treated as the start of a
/// word; scanning resumes right after it.
fn find_mime_encoded_words(line: &str) -> Vec<String> {
    let is_token =
        |field: &str| !field.is_empty() && field.bytes().all(|b| b.is_ascii_graphic() && b != b'=');

    let mut words = Vec::new();
    let mut offset = 0;

    // Every index used below sits next to an ASCII `?` or `=`, so all slices
    // fall on character boundaries.
    while let Some(found) = line[offset..].find("=?") {
        let word_start = offset + found;
        let charset_start = word_start + 2;

        // If a separator is missing here, no later `=?` can be followed by
        // the two separators it needs either, so the scan can stop.
        let Some(charset_len) = line[charset_start..].find('?') else {
            break;
        };
        let encoding_start = charset_start + charset_len + 1;
        let Some(encoding_len) = line[encoding_start..].find('?') else {
            break;
        };
        let text_start = encoding_start + encoding_len + 1;

        let charset = &line[charset_start..charset_start + charset_len];
        let encoding = &line[encoding_start..encoding_start + encoding_len];
        if !is_token(charset) || !is_token(encoding) {
            offset = charset_start;
            continue;
        }

        // No terminator after this point means none for later words either.
        let Some(text_len) = line[text_start..].find("?=") else {
            break;
        };
        let word_end = text_start + text_len + 2;
        words.push(line[word_start..word_end].to_string());
        offset = word_end;
    }

    words
}
886
887fn unicode_escape_decode(input: &str) -> Result<String, ()> {
888 let mut decoded_text = String::with_capacity(input.len());
889 let mut chars = input.chars().peekable();
890
891 while let Some(ch) = chars.next() {
892 if ch != '\\' {
893 decoded_text.push(ch);
894 continue;
895 }
896
897 match chars.next() {
898 Some('u') => {
899 let code = take_hex_digits(&mut chars, 4)?;
900 let ch = char::from_u32(code).ok_or(())?;
901 decoded_text.push(ch);
902 }
903 Some('x') => {
904 let code = take_hex_digits(&mut chars, 2)?;
905 decoded_text.push(char::from_u32(code).ok_or(())?);
906 }
907 Some(escaped) => decoded_text.push(escaped),
908 None => return Err(()),
909 }
910 }
911
912 Ok(decoded_text)
913}
914
/// Consumes exactly `count` hex digits from `chars` and returns their value,
/// most significant digit first.
///
/// Returns `Err(())` when the iterator runs out or yields a non-hex
/// character; that offending character has already been consumed.
fn take_hex_digits<I>(chars: &mut std::iter::Peekable<I>, count: usize) -> Result<u32, ()>
where
    I: Iterator<Item = char>,
{
    (0..count).try_fold(0u32, |accumulated, _| {
        let digit = chars.next().and_then(|c| c.to_digit(16)).ok_or(())?;
        Ok((accumulated << 4) | digit)
    })
}
927
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn decode_base64_secret() {
        let encoded = "c2stcHJvai1hYmMxMjM=";
        let decoded = base64_decode(encoded).unwrap();
        assert_eq!(String::from_utf8(decoded).unwrap(), "sk-proj-abc123");
    }

    #[test]
    fn decode_hex_secret() {
        let encoded = "736b2d70726f6a2d616263";
        let decoded = hex_decode(encoded).unwrap();
        assert_eq!(String::from_utf8(decoded).unwrap(), "sk-proj-abc");
    }

    #[test]
    fn decode_url_safe_base64() {
        // Same payload as `decode_base64_secret`, with the padding removed.
        let encoded = "c2stcHJvai1hYmMxMjM";
        let decoded = base64_decode(encoded).unwrap();
        assert_eq!(String::from_utf8(decoded).unwrap(), "sk-proj-abc123");
    }

    #[test]
    fn find_base64_in_text() {
        let text = r#"TOKEN = "c2stcHJvai1hYmMxMjM=""#;
        let matches = find_base64_strings(text, 10);
        // The value is found twice: once after `=`, once between the quotes.
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].value, "c2stcHJvai1hYmMxMjM=");
        assert_eq!(matches[1].value, "c2stcHJvai1hYmMxMjM=");
    }

    #[test]
    fn decode_chunk_finds_encoded_secret() {
        let chunk = Chunk {
            data: "SECRET=c2stcHJvai1hYmMxMjM=\n".to_string(),
            metadata: ChunkMetadata {
                source_type: "test".into(),
                path: Some("test.env".into()),
                commit: None,
                author: None,
                date: None,
            },
        };
        let decoded = decode_chunk(&chunk);
        assert!(!decoded.is_empty());
        assert!(decoded[0].data.contains("sk-proj-abc123"));
        assert!(decoded[0].metadata.source_type.contains("base64"));
    }

    #[test]
    fn decode_url_encoded_secret() {
        let decoded = percent_decode("ghp_%61%62%63defghijklmnopqrstuvwxyz1234567890").unwrap();
        assert_eq!(decoded, "ghp_abcdefghijklmnopqrstuvwxyz1234567890");
    }

    #[test]
    fn decode_unicode_escaped_secret() {
        let decoded = unicode_escape_decode(
            "\\u0067\\u0068\\u0070\\u005Fabcdefghijklmnopqrstuvwxyz1234567890",
        )
        .unwrap();
        assert_eq!(decoded, "ghp_abcdefghijklmnopqrstuvwxyz1234567890");
    }

    #[test]
    fn decode_quoted_printable_secret() {
        let decoded =
            quoted_printable_decode("ghp=5Fabcdefghijklmnopqrstuvwxyz1234567890").unwrap();
        assert_eq!(decoded, "ghp_abcdefghijklmnopqrstuvwxyz1234567890");
    }

    #[test]
    fn decode_double_url_encoded_secret() {
        let decoded =
            url_decode("%2567%2568%2570%255Fabcdefghijklmnopqrstuvwxyz1234567890").unwrap();
        assert_eq!(decoded, "ghp_abcdefghijklmnopqrstuvwxyz1234567890");
    }

    #[test]
    fn decode_html_named_entities() {
        // The input must be written with the entities spelled out; a literal
        // `"` here would end the string.
        let decoded =
            html_named_entity_decode("&lt;tag&gt;&amp;&quot;&apos;&nbsp;").unwrap();
        assert_eq!(decoded, "<tag>&\"'\u{00A0}");
    }

    #[test]
    fn decode_html_numeric_entities() {
        // Mixes decimal and hexadecimal references.
        let decoded = html_numeric_entity_decode(
            "&#103;&#104;&#112;&#x5F;abcdefghijklmnopqrstuvwxyz1234567890",
        )
        .unwrap();
        assert_eq!(decoded, "ghp_abcdefghijklmnopqrstuvwxyz1234567890");
    }

    #[test]
    fn decode_hex_escape_secret() {
        let decoded =
            hex_escape_decode("\\x67\\x68\\x70\\x5Fabcdefghijklmnopqrstuvwxyz1234567890").unwrap();
        assert_eq!(decoded, "ghp_abcdefghijklmnopqrstuvwxyz1234567890");
    }

    #[test]
    fn decode_octal_escape_secret() {
        let decoded =
            octal_escape_decode("\\147\\150\\160\\137abcdefghijklmnopqrstuvwxyz1234567890")
                .unwrap();
        assert_eq!(decoded, "ghp_abcdefghijklmnopqrstuvwxyz1234567890");
    }

    #[test]
    fn decode_mime_encoded_word_base64_secret() {
        let decoded = mime_encoded_word_decode("=?utf-8?B?c2stcHJvai1hYmMxMjM=?=").unwrap();
        assert_eq!(decoded, "sk-proj-abc123");
    }

    #[test]
    fn decode_mime_encoded_word_q_secret() {
        let decoded = mime_encoded_word_decode(
            "=?utf-8?Q?xoxb=2DEXAMPLE1234=2DEXAMPLE5678=2DExAmPlEtOkEnVaLuEhErE?=",
        )
        .unwrap();
        assert_eq!(
            decoded,
            "xoxb-EXAMPLE1234-EXAMPLE5678-ExAmPlEtOkEnVaLuEhErE"
        );
    }

    #[test]
    fn rejects_base64_with_non_terminal_padding() {
        assert!(classify_base64("=abc").is_none());
        assert!(classify_base64("ab=c").is_none());
        assert!(classify_base64("abc===").is_none());
    }
}