1use crate::error::{Result, SanitizeError};
54use crate::processor::profile::FileTypeProfile;
55use crate::processor::registry::ProcessorRegistry;
56use crate::scanner::{ScanStats, StreamScanner};
57use crate::store::MappingStore;
58
59use std::collections::HashMap;
60use std::io::{self, Read, Seek, Write};
61use std::sync::Arc;
62
/// Largest entry, in bytes, that this module buffers fully in memory (256 MiB).
///
/// Entries whose size hint exceeds this value skip structured processing and
/// are streamed through the scanner; nested archives above it are rejected.
const MAX_STRUCTURED_ENTRY_SIZE: u64 = 256 * 1024 * 1024;

/// Nested-archive depth limit applied by [`ArchiveProcessor::new`].
pub const DEFAULT_MAX_ARCHIVE_DEPTH: u32 = 3;

/// Upper bound that [`ArchiveProcessor::with_max_depth`] clamps its argument to.
const MAX_ALLOWED_ARCHIVE_DEPTH: u32 = 10;
84
/// Archive container formats this module can read and rewrite.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ArchiveFormat {
    /// ZIP archive (`.zip`).
    Zip,
    /// Uncompressed tarball (`.tar`).
    Tar,
    /// Gzip-compressed tarball (`.tar.gz` or `.tgz`).
    TarGz,
}

impl ArchiveFormat {
    /// Guesses the archive format from a file name or path.
    ///
    /// Matching is ASCII case-insensitive and looks only at the end of the
    /// path: a `.tar.gz` suffix or a `tgz` extension yields [`Self::TarGz`],
    /// a `tar` extension yields [`Self::Tar`] and a `zip` extension yields
    /// [`Self::Zip`]. Anything else returns `None`.
    pub fn from_path(path: &str) -> Option<Self> {
        let lowered = path.to_ascii_lowercase();

        // The double extension has to be tested on the raw string because
        // `Path::extension` only reports the final component (`gz`).
        if lowered.ends_with(".tar.gz") {
            return Some(Self::TarGz);
        }

        // `lowered` is a `String`, so the extension is always valid UTF-8.
        let extension = std::path::Path::new(&lowered).extension()?.to_str()?;
        match extension {
            "tgz" => Some(Self::TarGz),
            "tar" => Some(Self::Tar),
            "zip" => Some(Self::Zip),
            _ => None,
        }
    }
}
127
128#[derive(Debug, Clone, Default)]
134pub struct ArchiveStats {
135 pub files_processed: u64,
137 pub entries_skipped: u64,
139 pub structured_hits: u64,
141 pub scanner_fallback: u64,
143 pub nested_archives: u64,
146 pub total_input_bytes: u64,
148 pub total_output_bytes: u64,
150 pub file_methods: HashMap<String, String>,
153 pub file_scan_stats: HashMap<String, ScanStats>,
155}
156
157impl ArchiveStats {
158 fn merge(&mut self, child: &ArchiveStats) {
160 self.files_processed += child.files_processed;
161 self.entries_skipped += child.entries_skipped;
162 self.structured_hits += child.structured_hits;
163 self.scanner_fallback += child.scanner_fallback;
164 self.nested_archives += child.nested_archives;
165 self.total_input_bytes += child.total_input_bytes;
166 self.total_output_bytes += child.total_output_bytes;
167 for (k, v) in &child.file_methods {
168 self.file_methods.insert(k.clone(), v.clone());
169 }
170 for (k, v) in &child.file_scan_stats {
171 self.file_scan_stats.insert(k.clone(), v.clone());
172 }
173 }
174}
175
176pub struct ArchiveProcessor {
207 registry: Arc<ProcessorRegistry>,
209 scanner: Arc<StreamScanner>,
211 store: Arc<MappingStore>,
213 profiles: Vec<FileTypeProfile>,
215 max_depth: u32,
217}
218
219impl ArchiveProcessor {
220 pub fn new(
229 registry: Arc<ProcessorRegistry>,
230 scanner: Arc<StreamScanner>,
231 store: Arc<MappingStore>,
232 profiles: Vec<FileTypeProfile>,
233 ) -> Self {
234 Self {
235 registry,
236 scanner,
237 store,
238 profiles,
239 max_depth: DEFAULT_MAX_ARCHIVE_DEPTH,
240 }
241 }
242
243 #[must_use]
249 pub fn with_max_depth(mut self, depth: u32) -> Self {
250 self.max_depth = depth.min(MAX_ALLOWED_ARCHIVE_DEPTH);
251 self
252 }
253
254 fn find_profile(&self, filename: &str) -> Option<&FileTypeProfile> {
256 self.profiles.iter().find(|p| p.matches_filename(filename))
257 }
258
259 #[allow(clippy::missing_errors_doc)] fn sanitize_entry(
271 &self,
272 filename: &str,
273 reader: &mut dyn Read,
274 writer: &mut dyn Write,
275 stats: &mut ArchiveStats,
276 entry_size_hint: Option<u64>,
277 depth: u32,
278 ) -> Result<()> {
279 if let Some(nested_fmt) = ArchiveFormat::from_path(filename) {
281 return self.sanitize_nested_archive(
282 filename,
283 reader,
284 writer,
285 stats,
286 entry_size_hint,
287 nested_fmt,
288 depth,
289 );
290 }
291
292 let within_size_cap = entry_size_hint.map_or(true, |sz| sz <= MAX_STRUCTURED_ENTRY_SIZE); if within_size_cap {
300 if let Some(profile) = self.find_profile(filename) {
301 let mut content = Vec::new();
303 reader.read_to_end(&mut content).map_err(|e| {
304 SanitizeError::ArchiveError(format!("read entry '{filename}': {e}"))
305 })?;
306
307 stats.total_input_bytes += content.len() as u64;
308
309 if let Some(output) = self.registry.process(&content, profile, &self.store)? {
310 stats.structured_hits += 1;
311 stats.total_output_bytes += output.len() as u64;
312 stats.file_methods.insert(
313 filename.to_string(),
314 format!("structured:{}", profile.processor),
315 );
316 writer.write_all(&output).map_err(|e| {
317 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
318 })?;
319 return Ok(());
320 }
321
322 let (output, scan_stats) = self.scanner.scan_bytes(&content)?;
325 stats.scanner_fallback += 1;
326 stats.total_output_bytes += output.len() as u64;
327 stats
328 .file_methods
329 .insert(filename.to_string(), "scanner".to_string());
330 stats
331 .file_scan_stats
332 .insert(filename.to_string(), scan_stats);
333 writer.write_all(&output).map_err(|e| {
334 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
335 })?;
336 return Ok(());
337 }
338 }
339
340 let mut counting_r = CountingReader::new(reader);
345 let mut counting_w = CountingWriter::new(writer);
346 let scan_stats = self.scanner.scan_reader(&mut counting_r, &mut counting_w)?;
347
348 stats.scanner_fallback += 1;
349 stats.total_input_bytes += counting_r.bytes_read();
350 stats.total_output_bytes += counting_w.bytes_written();
351 stats
352 .file_methods
353 .insert(filename.to_string(), "scanner".to_string());
354 stats
355 .file_scan_stats
356 .insert(filename.to_string(), scan_stats);
357
358 Ok(())
359 }
360
361 #[allow(clippy::too_many_arguments)]
364 fn sanitize_nested_archive(
365 &self,
366 filename: &str,
367 reader: &mut dyn Read,
368 writer: &mut dyn Write,
369 stats: &mut ArchiveStats,
370 entry_size_hint: Option<u64>,
371 nested_fmt: ArchiveFormat,
372 depth: u32,
373 ) -> Result<()> {
374 if depth >= self.max_depth {
375 return Err(SanitizeError::RecursionDepthExceeded(format!(
376 "nested archive '{}' at depth {} exceeds maximum nesting depth of {}",
377 filename, depth, self.max_depth,
378 )));
379 }
380
381 if let Some(sz) = entry_size_hint {
383 if sz > MAX_STRUCTURED_ENTRY_SIZE {
384 return Err(SanitizeError::ArchiveError(format!(
385 "nested archive '{}' is too large ({} bytes, limit {} bytes)",
386 filename, sz, MAX_STRUCTURED_ENTRY_SIZE,
387 )));
388 }
389 }
390
391 let mut content = Vec::new();
392 reader.read_to_end(&mut content).map_err(|e| {
393 SanitizeError::ArchiveError(format!("read nested archive '{filename}': {e}"))
394 })?;
395 stats.total_input_bytes += content.len() as u64;
396
397 let mut output_buf: Vec<u8> = Vec::new();
399 let child_stats = match nested_fmt {
400 ArchiveFormat::Tar => {
401 self.process_tar_at_depth(&content[..], &mut output_buf, depth + 1)?
402 }
403 ArchiveFormat::TarGz => {
404 self.process_tar_gz_at_depth(&content[..], &mut output_buf, depth + 1)?
405 }
406 ArchiveFormat::Zip => {
407 let reader = io::Cursor::new(&content);
408 let mut writer = io::Cursor::new(Vec::new());
409 let s = self.process_zip_at_depth(reader, &mut writer, depth + 1)?;
410 output_buf = writer.into_inner();
411 s
412 }
413 };
414
415 stats.nested_archives += 1;
416 stats.merge(&child_stats);
417 stats.total_output_bytes += output_buf.len() as u64;
418 let fmt_name = match nested_fmt {
419 ArchiveFormat::Tar => "tar",
420 ArchiveFormat::TarGz => "tar.gz",
421 ArchiveFormat::Zip => "zip",
422 };
423 stats
424 .file_methods
425 .insert(filename.to_string(), format!("nested:{fmt_name}"));
426 writer.write_all(&output_buf).map_err(|e| {
427 SanitizeError::ArchiveError(format!("write nested archive '{filename}': {e}"))
428 })?;
429 Ok(())
430 }
431
432 pub fn process_tar<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
447 self.process_tar_at_depth(reader, writer, 0)
448 }
449
450 fn process_tar_at_depth<R: Read, W: Write>(
452 &self,
453 reader: R,
454 writer: W,
455 depth: u32,
456 ) -> Result<ArchiveStats> {
457 let mut stats = ArchiveStats::default();
458 let mut archive = tar::Archive::new(reader);
459 let mut builder = tar::Builder::new(writer);
460
461 let entries = archive
462 .entries()
463 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entries: {}", e)))?;
464
465 for entry_result in entries {
466 let mut entry = entry_result
467 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {}", e)))?;
468
469 let header = entry.header().clone();
470 let path = entry
471 .path()
472 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {}", e)))?
473 .to_string_lossy()
474 .to_string();
475
476 let entry_type = header.entry_type();
477
478 if !entry_type.is_file() {
480 let mut data = Vec::new();
484 entry.read_to_end(&mut data).map_err(|e| {
485 SanitizeError::ArchiveError(format!("read non-file entry '{}': {}", path, e))
486 })?;
487 builder.append(&header, &*data).map_err(|e| {
488 SanitizeError::ArchiveError(format!("append entry '{}': {}", path, e))
489 })?;
490 stats.entries_skipped += 1;
491 continue;
492 }
493
494 let mut sanitized_buf: Vec<u8> = Vec::new();
496 let entry_size = header.size().ok();
497 self.sanitize_entry(
498 &path,
499 &mut entry,
500 &mut sanitized_buf,
501 &mut stats,
502 entry_size,
503 depth,
504 )?;
505
506 let mut new_header = header.clone();
509 new_header.set_size(sanitized_buf.len() as u64);
510 new_header.set_cksum();
511
512 builder.append(&new_header, &*sanitized_buf).map_err(|e| {
513 SanitizeError::ArchiveError(format!("append entry '{}': {}", path, e))
514 })?;
515
516 stats.files_processed += 1;
517 }
518
519 builder
520 .finish()
521 .map_err(|e| SanitizeError::ArchiveError(format!("finalize tar: {}", e)))?;
522
523 Ok(stats)
524 }
525
526 pub fn process_tar_gz<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
536 self.process_tar_gz_at_depth(reader, writer, 0)
537 }
538
539 fn process_tar_gz_at_depth<R: Read, W: Write>(
541 &self,
542 reader: R,
543 writer: W,
544 depth: u32,
545 ) -> Result<ArchiveStats> {
546 let gz_reader = flate2::read::GzDecoder::new(reader);
547 let gz_writer = flate2::write::GzEncoder::new(writer, flate2::Compression::default());
548
549 let stats = self.process_tar_at_depth(gz_reader, gz_writer, depth)?;
550 Ok(stats)
556 }
557
558 pub fn process_zip<R: Read + Seek, W: Write + Seek>(
574 &self,
575 reader: R,
576 writer: W,
577 ) -> Result<ArchiveStats> {
578 self.process_zip_at_depth(reader, writer, 0)
579 }
580
581 fn process_zip_at_depth<R: Read + Seek, W: Write + Seek>(
583 &self,
584 reader: R,
585 writer: W,
586 depth: u32,
587 ) -> Result<ArchiveStats> {
588 let mut stats = ArchiveStats::default();
589 let mut zip_in = zip::ZipArchive::new(reader)
590 .map_err(|e| SanitizeError::ArchiveError(format!("open zip: {}", e)))?;
591 let mut zip_out = zip::ZipWriter::new(writer);
592
593 for i in 0..zip_in.len() {
594 let mut entry = zip_in
595 .by_index(i)
596 .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
597
598 let name = entry.name().to_string();
599
600 if entry.is_dir() {
608 let options = zip::write::FileOptions::default()
609 .last_modified_time(entry.last_modified())
610 .compression_method(entry.compression());
611
612 #[cfg(unix)]
613 let options = if let Some(mode) = entry.unix_mode() {
614 options.unix_permissions(mode)
615 } else {
616 options
617 };
618
619 zip_out.add_directory(&name, options).map_err(|e| {
620 SanitizeError::ArchiveError(format!("add dir '{}': {}", name, e))
621 })?;
622 stats.entries_skipped += 1;
623 continue;
624 }
625
626 let options = zip::write::FileOptions::default()
628 .compression_method(entry.compression())
629 .last_modified_time(entry.last_modified());
630
631 #[cfg(unix)]
632 let options = if let Some(mode) = entry.unix_mode() {
633 options.unix_permissions(mode)
634 } else {
635 options
636 };
637
638 let mut sanitized_buf: Vec<u8> = Vec::new();
640 let entry_size = Some(entry.size());
641 self.sanitize_entry(
642 &name,
643 &mut entry,
644 &mut sanitized_buf,
645 &mut stats,
646 entry_size,
647 depth,
648 )?;
649
650 zip_out.start_file(&name, options).map_err(|e| {
651 SanitizeError::ArchiveError(format!("start file '{}': {}", name, e))
652 })?;
653 zip_out.write_all(&sanitized_buf).map_err(|e| {
654 SanitizeError::ArchiveError(format!("write file '{}': {}", name, e))
655 })?;
656
657 stats.files_processed += 1;
658 }
659
660 zip_out
661 .finish()
662 .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
663
664 Ok(stats)
665 }
666
667 pub fn process<R: Read + Seek, W: Write + Seek>(
683 &self,
684 reader: R,
685 writer: W,
686 format: ArchiveFormat,
687 ) -> Result<ArchiveStats> {
688 match format {
689 ArchiveFormat::Zip => self.process_zip(reader, writer),
690 ArchiveFormat::Tar => self.process_tar(reader, writer),
691 ArchiveFormat::TarGz => self.process_tar_gz(reader, writer),
692 }
693 }
694}
695
/// `Read` adapter that tallies the bytes pulled through it.
struct CountingReader<'a> {
    /// Underlying reader that all reads are forwarded to.
    inner: &'a mut dyn Read,
    /// Running total of bytes returned by `inner`.
    count: u64,
}

impl<'a> CountingReader<'a> {
    /// Wraps `inner` with the counter at zero.
    fn new(inner: &'a mut dyn Read) -> Self {
        CountingReader { inner, count: 0 }
    }

    /// Total number of bytes read so far.
    fn bytes_read(&self) -> u64 {
        self.count
    }
}

impl Read for CountingReader<'_> {
    /// Forwards to the inner reader and adds the bytes it returned to the
    /// total. Failed reads leave the total unchanged.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|filled| {
            self.count += filled as u64;
            filled
        })
    }
}
723
/// `Write` adapter that tallies the bytes accepted by the wrapped writer.
struct CountingWriter<'a> {
    /// Underlying writer that all writes and flushes are forwarded to.
    inner: &'a mut dyn Write,
    /// Running total of bytes `inner` reported as written.
    count: u64,
}

impl<'a> CountingWriter<'a> {
    /// Wraps `inner` with the counter at zero.
    fn new(inner: &'a mut dyn Write) -> Self {
        CountingWriter { inner, count: 0 }
    }

    /// Total number of bytes written so far.
    fn bytes_written(&self) -> u64 {
        self.count
    }
}

impl Write for CountingWriter<'_> {
    /// Forwards to the inner writer and adds the bytes it accepted to the
    /// total. Failed writes leave the total unchanged.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.inner.write(buf).map(|accepted| {
            self.count += accepted as u64;
            accepted
        })
    }

    /// Flushes the inner writer.
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}
751
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;
    use crate::generator::HmacGenerator;
    use crate::processor::profile::{FieldRule, FileTypeProfile};
    use crate::processor::registry::ProcessorRegistry;
    use crate::scanner::{ScanConfig, ScanPattern};
    use std::io::Cursor;

    // ---- fixtures -------------------------------------------------------

    /// Processor with an e-mail regex pattern, a literal `SUPERSECRET`
    /// pattern, the built-in structured processors and one JSON profile
    /// bound to the `.json` extension.
    fn make_archive_processor() -> ArchiveProcessor {
        let generator = Arc::new(HmacGenerator::new([42u8; 32]));
        let store = Arc::new(MappingStore::new(generator, None));

        let email_pattern = ScanPattern::from_regex(
            r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
            Category::Email,
            "email",
        )
        .unwrap();
        let api_key_pattern =
            ScanPattern::from_literal("SUPERSECRET", Category::Custom("api_key".into()), "api_key")
                .unwrap();

        let scanner = Arc::new(
            StreamScanner::new(
                vec![email_pattern, api_key_pattern],
                Arc::clone(&store),
                ScanConfig::default(),
            )
            .unwrap(),
        );

        let registry = Arc::new(ProcessorRegistry::with_builtins());

        let json_profile = FileTypeProfile::new(
            "json",
            vec![FieldRule::new("*").with_category(Category::Custom("field".into()))],
        )
        .with_extension(".json");

        ArchiveProcessor::new(registry, scanner, store, vec![json_profile])
    }

    /// Builds an in-memory tar archive of regular files (mode 0644, fixed mtime).
    fn build_test_tar(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let mut tar_bytes = Vec::new();
        {
            let mut builder = tar::Builder::new(&mut tar_bytes);
            for &(name, data) in entries {
                let mut header = tar::Header::new_gnu();
                header.set_size(data.len() as u64);
                header.set_mode(0o644);
                header.set_mtime(1_700_000_000);
                header.set_cksum();
                builder.append_data(&mut header, name, data).unwrap();
            }
            builder.finish().unwrap();
        }
        tar_bytes
    }

    /// Builds an in-memory zip archive of deflate-compressed files.
    fn build_test_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let mut cursor = Cursor::new(Vec::new());
        {
            let mut zip = zip::ZipWriter::new(&mut cursor);
            for &(name, data) in entries {
                let options = zip::write::FileOptions::default()
                    .compression_method(zip::CompressionMethod::Deflated);
                zip.start_file(name, options).unwrap();
                zip.write_all(data).unwrap();
            }
            zip.finish().unwrap();
        }
        cursor.into_inner()
    }

    /// Reads every entry of a tar stream as UTF-8 text, in archive order.
    fn tar_entry_texts<R: Read>(source: R) -> Vec<String> {
        let mut archive = tar::Archive::new(source);
        let mut texts = Vec::new();
        for entry in archive.entries().unwrap() {
            let mut entry = entry.unwrap();
            let mut text = String::new();
            entry.read_to_string(&mut text).unwrap();
            texts.push(text);
        }
        texts
    }

    /// Reads the first entry of a zip archive as UTF-8 text.
    fn zip_first_entry_text(zip_bytes: Vec<u8>) -> String {
        let mut archive = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap();
        let mut entry = archive.by_index(0).unwrap();
        let mut text = String::new();
        entry.read_to_string(&mut text).unwrap();
        text
    }

    // ---- tar ------------------------------------------------------------

    #[test]
    fn tar_sanitizes_plaintext_with_scanner() {
        let proc = make_archive_processor();
        let input = build_test_tar(&[("readme.txt", b"Contact alice@corp.com for help.")]);

        let mut output = Vec::new();
        let stats = proc.process_tar(&input[..], &mut output).unwrap();

        assert_eq!(stats.files_processed, 1);
        assert_eq!(stats.scanner_fallback, 1);
        assert_eq!(stats.structured_hits, 0);

        for content in tar_entry_texts(&output[..]) {
            assert!(
                !content.contains("alice@corp.com"),
                "email should be sanitized: {content}"
            );
        }
    }

    #[test]
    fn tar_sanitizes_json_with_structured_processor() {
        let proc = make_archive_processor();
        let json_content = br#"{"email": "bob@example.org", "name": "Bob"}"#;
        let input = build_test_tar(&[("config.json", json_content)]);

        let mut output = Vec::new();
        let stats = proc.process_tar(&input[..], &mut output).unwrap();

        assert_eq!(stats.files_processed, 1);
        assert_eq!(stats.structured_hits, 1);
        assert_eq!(stats.scanner_fallback, 0);
        assert_eq!(
            stats.file_methods.get("config.json").unwrap(),
            "structured:json"
        );

        for content in tar_entry_texts(&output[..]) {
            assert!(
                !content.contains("bob@example.org"),
                "email should be sanitized"
            );
            assert!(!content.contains("Bob"), "name should be sanitized");
        }
    }

    #[test]
    fn tar_preserves_metadata() {
        let proc = make_archive_processor();
        let input = build_test_tar(&[("data.txt", b"SUPERSECRET token here")]);

        let mut output = Vec::new();
        proc.process_tar(&input[..], &mut output).unwrap();

        let mut archive = tar::Archive::new(&output[..]);
        for entry in archive.entries().unwrap() {
            let entry = entry.unwrap();
            let header = entry.header();
            assert_eq!(header.mode().unwrap(), 0o644);
            assert_eq!(header.mtime().unwrap(), 1_700_000_000);
        }
    }

    #[test]
    fn tar_handles_multiple_files() {
        let proc = make_archive_processor();
        let input = build_test_tar(&[
            ("a.txt", b"alice@corp.com"),
            ("b.json", br#"{"key":"value"}"#),
            ("c.log", b"no secrets here"),
        ]);

        let mut output = Vec::new();
        let stats = proc.process_tar(&input[..], &mut output).unwrap();

        assert_eq!(stats.files_processed, 3);
        // Only b.json matches the JSON profile; the other two are scanned.
        assert_eq!(stats.structured_hits, 1);
        assert_eq!(stats.scanner_fallback, 2);
    }

    #[test]
    fn tar_passes_through_directories() {
        let mut input = Vec::new();
        {
            let mut builder = tar::Builder::new(&mut input);

            let mut dir_header = tar::Header::new_gnu();
            dir_header.set_entry_type(tar::EntryType::Directory);
            dir_header.set_size(0);
            dir_header.set_mode(0o755);
            dir_header.set_cksum();
            builder
                .append_data(&mut dir_header, "mydir/", &b""[..])
                .unwrap();

            let mut file_header = tar::Header::new_gnu();
            file_header.set_size(5);
            file_header.set_mode(0o644);
            file_header.set_cksum();
            builder
                .append_data(&mut file_header, "mydir/hello.txt", &b"hello"[..])
                .unwrap();

            builder.finish().unwrap();
        }

        let proc = make_archive_processor();
        let mut output = Vec::new();
        let stats = proc.process_tar(&input[..], &mut output).unwrap();

        assert_eq!(stats.entries_skipped, 1);
        assert_eq!(stats.files_processed, 1);
    }

    // ---- tar.gz ---------------------------------------------------------

    #[test]
    fn tar_gz_round_trip() {
        let proc = make_archive_processor();

        let tar_data = build_test_tar(&[("secret.txt", b"Key is SUPERSECRET okay")]);
        let mut gz_input = Vec::new();
        {
            let mut encoder =
                flate2::write::GzEncoder::new(&mut gz_input, flate2::Compression::fast());
            encoder.write_all(&tar_data).unwrap();
            encoder.finish().unwrap();
        }

        let mut gz_output = Vec::new();
        let stats = proc.process_tar_gz(&gz_input[..], &mut gz_output).unwrap();

        assert_eq!(stats.files_processed, 1);
        assert_eq!(stats.scanner_fallback, 1);

        let decoder = flate2::read::GzDecoder::new(&gz_output[..]);
        for content in tar_entry_texts(decoder) {
            assert!(
                !content.contains("SUPERSECRET"),
                "secret should be sanitized: {content}"
            );
        }
    }

    // ---- zip ------------------------------------------------------------

    #[test]
    fn zip_sanitizes_plaintext_with_scanner() {
        let proc = make_archive_processor();
        let zip_data = build_test_zip(&[("notes.txt", b"Reach alice@corp.com for info.")]);

        let mut writer = Cursor::new(Vec::new());
        let stats = proc.process_zip(Cursor::new(&zip_data), &mut writer).unwrap();

        assert_eq!(stats.files_processed, 1);
        assert_eq!(stats.scanner_fallback, 1);

        let content = zip_first_entry_text(writer.into_inner());
        assert!(
            !content.contains("alice@corp.com"),
            "email should be sanitized: {content}"
        );
    }

    #[test]
    fn zip_sanitizes_json_with_structured_processor() {
        let proc = make_archive_processor();
        let json_content = br#"{"password": "hunter2", "host": "db.internal"}"#;
        let zip_data = build_test_zip(&[("settings.json", json_content)]);

        let mut writer = Cursor::new(Vec::new());
        let stats = proc.process_zip(Cursor::new(&zip_data), &mut writer).unwrap();

        assert_eq!(stats.files_processed, 1);
        assert_eq!(stats.structured_hits, 1);

        let content = zip_first_entry_text(writer.into_inner());
        assert!(!content.contains("hunter2"), "password should be sanitized");
        assert!(!content.contains("db.internal"), "host should be sanitized");
    }

    #[test]
    fn zip_preserves_directory_entries() {
        let mut cursor = Cursor::new(Vec::new());
        {
            let mut zip = zip::ZipWriter::new(&mut cursor);

            let dir_options = zip::write::FileOptions::default();
            zip.add_directory("subdir/", dir_options).unwrap();

            let file_options = zip::write::FileOptions::default()
                .compression_method(zip::CompressionMethod::Stored);
            zip.start_file("subdir/data.txt", file_options).unwrap();
            zip.write_all(b"SUPERSECRET value").unwrap();

            zip.finish().unwrap();
        }

        let zip_data = cursor.into_inner();
        let proc = make_archive_processor();
        let mut writer = Cursor::new(Vec::new());
        let stats = proc.process_zip(Cursor::new(&zip_data), &mut writer).unwrap();

        assert_eq!(stats.entries_skipped, 1);
        assert_eq!(stats.files_processed, 1);
    }

    #[test]
    fn zip_handles_multiple_files() {
        let proc = make_archive_processor();
        let zip_data = build_test_zip(&[
            ("file1.txt", b"alice@corp.com"),
            ("file2.json", br#"{"secret":"SUPERSECRET"}"#),
            ("file3.log", b"nothing to see"),
        ]);

        let mut writer = Cursor::new(Vec::new());
        let stats = proc.process_zip(Cursor::new(&zip_data), &mut writer).unwrap();

        assert_eq!(stats.files_processed, 3);
        assert_eq!(stats.structured_hits, 1);
        assert_eq!(stats.scanner_fallback, 2);
    }

    // ---- format detection -----------------------------------------------

    #[test]
    fn format_detection_from_path() {
        let cases = [
            ("data.tar", Some(ArchiveFormat::Tar)),
            ("data.tar.gz", Some(ArchiveFormat::TarGz)),
            ("data.tgz", Some(ArchiveFormat::TarGz)),
            ("data.zip", Some(ArchiveFormat::Zip)),
            ("DATA.ZIP", Some(ArchiveFormat::Zip)),
            ("photo.png", None),
        ];
        for (path, expected) in cases {
            assert_eq!(ArchiveFormat::from_path(path), expected);
        }
    }

    // ---- cross-entry consistency ----------------------------------------

    #[test]
    fn same_secret_gets_same_replacement_across_entries() {
        let proc = make_archive_processor();
        let input = build_test_tar(&[
            ("a.txt", b"contact alice@corp.com"),
            ("b.txt", b"reach alice@corp.com"),
        ]);

        let mut output = Vec::new();
        proc.process_tar(&input[..], &mut output).unwrap();

        let contents = tar_entry_texts(&output[..]);

        // Both entries share one mapping store, so the same e-mail must map
        // to the same replacement text.
        let replacement_a = contents[0].strip_prefix("contact ").unwrap();
        let replacement_b = contents[1].strip_prefix("reach ").unwrap();
        assert_eq!(
            replacement_a, replacement_b,
            "dedup should produce identical replacements"
        );
        assert!(!replacement_a.contains("alice@corp.com"));
    }

    // ---- dispatch -------------------------------------------------------

    #[test]
    fn process_auto_dispatch_tar() {
        let proc = make_archive_processor();
        let tar_data = build_test_tar(&[("f.txt", b"SUPERSECRET")]);

        let stats = proc
            .process(
                Cursor::new(tar_data),
                Cursor::new(Vec::new()),
                ArchiveFormat::Tar,
            )
            .unwrap();

        assert_eq!(stats.files_processed, 1);
    }

    #[test]
    fn process_auto_dispatch_zip() {
        let proc = make_archive_processor();
        let zip_data = build_test_zip(&[("f.txt", b"SUPERSECRET")]);

        let mut writer = Cursor::new(Vec::new());
        let stats = proc
            .process(Cursor::new(zip_data), &mut writer, ArchiveFormat::Zip)
            .unwrap();

        assert_eq!(stats.files_processed, 1);
    }

    // ---- empty archives -------------------------------------------------

    #[test]
    fn tar_empty_archive() {
        let proc = make_archive_processor();
        let tar_data = build_test_tar(&[]);

        let mut output = Vec::new();
        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();

        assert_eq!(stats.files_processed, 0);
        assert_eq!(stats.entries_skipped, 0);
    }

    #[test]
    fn zip_empty_archive() {
        let proc = make_archive_processor();
        let zip_data = build_test_zip(&[]);

        let mut writer = Cursor::new(Vec::new());
        let stats = proc.process_zip(Cursor::new(zip_data), &mut writer).unwrap();

        assert_eq!(stats.files_processed, 0);
    }
}