1use crate::error::{Result, SanitizeError};
54use crate::processor::profile::FileTypeProfile;
55use crate::processor::registry::ProcessorRegistry;
56use crate::scanner::{ScanStats, StreamScanner};
57use crate::store::MappingStore;
58
/// Normalizes an archive entry name into a safe, relative, `/`-separated path.
///
/// Both `/` and `\` are treated as separators. Empty components (which covers
/// leading, trailing and doubled separators) as well as `.` and `..` are
/// dropped, so the result can never be absolute or climb out of the
/// extraction root. If no component survives, the placeholder `_` is returned.
fn sanitize_archive_entry_name(name: &str) -> String {
    let components: Vec<&str> = name
        .split(['/', '\\'])
        .filter(|part| !matches!(*part, "" | "." | ".."))
        .collect();

    if components.is_empty() {
        return "_".to_string();
    }
    components.join("/")
}
79
/// Sanitizes a ZIP entry name.
///
/// Thin alias for [`sanitize_archive_entry_name`]; ZIP and tar entries go
/// through the same normalization.
#[inline]
fn sanitize_zip_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
84
/// Sanitizes a tar entry path.
///
/// Thin alias for [`sanitize_archive_entry_name`]; ZIP and tar entries go
/// through the same normalization.
#[inline]
fn sanitize_tar_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
89
90use glob::MatchOptions;
91use rayon::prelude::*;
92use std::collections::HashMap;
93use std::io::{self, Read, Seek, Write};
94use std::sync::Arc;
95
96use crate::processor::limits::{
97 DEFAULT_ARCHIVE_DEPTH, MAX_ARCHIVE_DEPTH, PARALLEL_ENTRY_THRESHOLD, PARALLEL_TAR_DATA_SIZE,
98 PARALLEL_ZIP_DATA_SIZE, STRUCTURED_ENTRY_SIZE,
99};
100
/// Outcome of sanitizing one buffered entry in the parallel path: the entry's
/// index in the buffered list, paired with either the sanitized bytes and the
/// per-entry statistics or the error that occurred.
type ParEntryResult = (usize, Result<(Vec<u8>, ArchiveStats)>);
107
/// Callback invoked with an entry's name and its sanitized bytes, just before
/// the entry is written to the output archive.
pub type EntryCallback = Arc<dyn Fn(&str, &[u8]) + Send + Sync>;
113
/// Include/exclude filter for archive file entries.
///
/// The default value has no patterns and lets every entry through.
#[derive(Default, Clone)]
pub struct ArchiveFilter {
    /// Allow-list: when non-empty, an entry must match at least one of these.
    only: Vec<CompiledPattern>,
    /// Deny-list: an entry matching any of these is rejected.
    exclude: Vec<CompiledPattern>,
}
145
/// A filter pattern in its ready-to-match form.
#[derive(Clone)]
enum CompiledPattern {
    /// Directory prefix (stored without trailing `/`): matches the directory
    /// path itself and anything below it.
    DirPrefix(String),
    /// Glob pattern matched against the whole entry path.
    Glob(glob::Pattern),
}
154
/// Options used for every glob match in this module.
///
/// Matching is case-sensitive. `require_literal_separator` is enabled, which
/// (per the `glob` crate documentation) means a `/` in the path must be
/// matched by a literal `/` in the pattern rather than by a wildcard. Leading
/// dots get no special treatment.
const GLOB_OPTS: MatchOptions = MatchOptions {
    case_sensitive: true,
    require_literal_separator: true,
    require_literal_leading_dot: false,
};
160
161impl CompiledPattern {
162 fn compile(raw: &str) -> std::result::Result<Self, String> {
163 if raw.ends_with('/') {
164 Ok(CompiledPattern::DirPrefix(
166 raw.trim_end_matches('/').to_string(),
167 ))
168 } else {
169 glob::Pattern::new(raw)
170 .map(CompiledPattern::Glob)
171 .map_err(|e| format!("invalid glob pattern '{raw}': {e}"))
172 }
173 }
174
175 fn matches(&self, path: &str) -> bool {
176 match self {
177 CompiledPattern::DirPrefix(prefix) => {
178 path == prefix || path.starts_with(&format!("{prefix}/"))
179 }
180 CompiledPattern::Glob(pat) => pat.matches_with(path, GLOB_OPTS),
181 }
182 }
183}
184
185impl ArchiveFilter {
186 pub fn new(only: Vec<String>, exclude: Vec<String>) -> std::result::Result<Self, String> {
192 let only = only
193 .into_iter()
194 .map(|p| CompiledPattern::compile(&p))
195 .collect::<std::result::Result<Vec<_>, _>>()?;
196 let exclude = exclude
197 .into_iter()
198 .map(|p| CompiledPattern::compile(&p))
199 .collect::<std::result::Result<Vec<_>, _>>()?;
200 Ok(Self { only, exclude })
201 }
202
203 pub fn is_empty(&self) -> bool {
205 self.only.is_empty() && self.exclude.is_empty()
206 }
207
208 pub fn passes(&self, path: &str) -> bool {
212 if !self.only.is_empty() && !self.only.iter().any(|p| p.matches(path)) {
213 return false;
214 }
215 if self.exclude.iter().any(|p| p.matches(path)) {
216 return false;
217 }
218 true
219 }
220}
221
/// Archive container formats this module can read and rewrite.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArchiveFormat {
    /// ZIP archive (`.zip`).
    Zip,
    /// Uncompressed tar archive (`.tar`).
    Tar,
    /// Gzip-compressed tar archive (`.tar.gz` / `.tgz`).
    TarGz,
}

impl ArchiveFormat {
    /// Guesses the archive format from a file name or path.
    ///
    /// Detection is purely extension-based and ASCII case-insensitive.
    /// Returns `None` when the extension is not a recognized archive type.
    pub fn from_path(path: &str) -> Option<Self> {
        let lower = path.to_ascii_lowercase();

        // `.tar.gz` is a double extension, so it is checked as a suffix.
        if lower.ends_with(".tar.gz") {
            return Some(Self::TarGz);
        }

        let extension = std::path::Path::new(&lower)
            .extension()
            .and_then(|ext| ext.to_str());
        match extension {
            Some("tgz") => Some(Self::TarGz),
            Some("tar") => Some(Self::Tar),
            Some("zip") => Some(Self::Zip),
            _ => None,
        }
    }
}
262
/// Counters and per-file details collected while sanitizing an archive.
#[derive(Debug, Clone, Default)]
pub struct ArchiveStats {
    /// File entries that were sanitized and written to the output.
    pub files_processed: u64,
    /// Non-file entries (directories and the like) passed through unchanged.
    pub entries_skipped: u64,
    /// Entries for which a structured processor produced output.
    pub structured_hits: u64,
    /// Entries handled by the scanner alone.
    pub scanner_fallback: u64,
    /// Nested archives that were opened and sanitized recursively.
    pub nested_archives: u64,
    /// Bytes read from entries.
    pub total_input_bytes: u64,
    /// Bytes produced for entries after sanitization.
    pub total_output_bytes: u64,
    /// Entry name -> label of the method used (`"scanner"`,
    /// `"structured+scan:<processor>"` or `"nested:<format>"`).
    pub file_methods: HashMap<String, String>,
    /// Entry name -> scanner statistics for that entry.
    pub file_scan_stats: HashMap<String, ScanStats>,
    /// File entries dropped because they did not pass the filter.
    pub entries_filtered: u64,
}
293
/// Snapshot handed to the progress callback after each entry is handled.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct ArchiveProgress {
    /// `files_processed + entries_skipped` at the time of the notification.
    pub entries_seen: u64,
    /// File entries sanitized so far.
    pub files_processed: u64,
    /// Non-file entries passed through so far.
    pub entries_skipped: u64,
    /// Total number of entries when known up front (ZIP); `None` for tar.
    pub total_entries: Option<u64>,
    /// Name of the entry that was just handled.
    pub current_entry: String,
}
308
/// Shared, thread-safe callback that receives [`ArchiveProgress`] snapshots.
type ArchiveProgressCallback = Arc<dyn Fn(&ArchiveProgress) + Send + Sync>;
310
311impl ArchiveStats {
312 fn merge(&mut self, child: &ArchiveStats) {
314 self.files_processed += child.files_processed;
315 self.entries_skipped += child.entries_skipped;
316 self.structured_hits += child.structured_hits;
317 self.scanner_fallback += child.scanner_fallback;
318 self.nested_archives += child.nested_archives;
319 self.total_input_bytes += child.total_input_bytes;
320 self.total_output_bytes += child.total_output_bytes;
321 self.entries_filtered += child.entries_filtered;
322 self.file_methods.extend(
323 child
324 .file_methods
325 .iter()
326 .map(|(k, v)| (k.clone(), v.clone())),
327 );
328 self.file_scan_stats.extend(
329 child
330 .file_scan_stats
331 .iter()
332 .map(|(k, v)| (k.clone(), v.clone())),
333 );
334 }
335}
336
/// Sanitizes the contents of tar, tar.gz and ZIP archives entry by entry and
/// writes a rewritten archive.
pub struct ArchiveProcessor {
    /// Structured processors, tried for entries whose name matches a profile.
    registry: Arc<ProcessorRegistry>,
    /// Scanner applied to every file entry (after structured processing, or
    /// on its own).
    scanner: Arc<StreamScanner>,
    /// Mapping store handed to the structured processors.
    store: Arc<MappingStore>,
    /// File-type profiles used to pick a structured processor by filename.
    profiles: Vec<FileTypeProfile>,
    /// Nested archives at or beyond this depth are rejected with an error.
    max_depth: u32,
    /// Optional callback notified after each entry is handled.
    progress_callback: Option<ArchiveProgressCallback>,
    /// Minimum number of file entries before parallel processing is considered.
    parallel_threshold: usize,
    /// Include/exclude filter applied to file entries.
    filter: ArchiveFilter,
    /// When `true`, structured processing is bypassed and only the scanner runs.
    force_text: bool,
    /// Optional callback that receives each entry's name and sanitized bytes.
    entry_callback: Option<EntryCallback>,
}
394
395impl ArchiveProcessor {
/// Creates a processor with default settings.
///
/// Defaults: nesting depth `DEFAULT_ARCHIVE_DEPTH`, parallel threshold
/// `PARALLEL_ENTRY_THRESHOLD`, an empty filter, structured processing
/// enabled (`force_text == false`) and no callbacks. Use the `with_*`
/// builder methods to adjust them.
pub fn new(
    registry: Arc<ProcessorRegistry>,
    scanner: Arc<StreamScanner>,
    store: Arc<MappingStore>,
    profiles: Vec<FileTypeProfile>,
) -> Self {
    Self {
        registry,
        scanner,
        store,
        profiles,
        max_depth: DEFAULT_ARCHIVE_DEPTH,
        progress_callback: None,
        parallel_threshold: PARALLEL_ENTRY_THRESHOLD,
        filter: ArchiveFilter::default(),
        force_text: false,
        entry_callback: None,
    }
}
423
424 #[must_use]
430 pub fn with_max_depth(mut self, depth: u32) -> Self {
431 self.max_depth = depth.min(MAX_ARCHIVE_DEPTH);
432 self
433 }
434
435 #[must_use]
440 pub fn with_parallel_threshold(mut self, threshold: usize) -> Self {
441 self.parallel_threshold = threshold;
442 self
443 }
444
445 #[must_use]
447 pub fn with_progress_callback(mut self, callback: ArchiveProgressCallback) -> Self {
448 self.progress_callback = Some(callback);
449 self
450 }
451
452 #[must_use]
458 pub fn with_filter(mut self, filter: ArchiveFilter) -> Self {
459 self.filter = filter;
460 self
461 }
462
463 #[must_use]
470 pub fn with_force_text(mut self, force_text: bool) -> Self {
471 self.force_text = force_text;
472 self
473 }
474
475 #[must_use]
478 pub fn with_entry_callback(mut self, callback: EntryCallback) -> Self {
479 self.entry_callback = Some(callback);
480 self
481 }
482
483 fn emit_entry_bytes(&self, name: &str, bytes: &[u8]) {
484 if let Some(cb) = &self.entry_callback {
485 cb(name, bytes);
486 }
487 }
488
489 fn find_profile(&self, filename: &str) -> Option<&FileTypeProfile> {
491 self.profiles.iter().find(|p| p.matches_filename(filename))
492 }
493
494 fn emit_progress(&self, stats: &ArchiveStats, total_entries: Option<u64>, current_entry: &str) {
495 if let Some(callback) = &self.progress_callback {
496 callback(&ArchiveProgress {
497 entries_seen: stats.files_processed + stats.entries_skipped,
498 files_processed: stats.files_processed,
499 entries_skipped: stats.entries_skipped,
500 total_entries,
501 current_entry: current_entry.to_string(),
502 });
503 }
504 }
505
506 fn sanitize_entry_bytes(
513 &self,
514 filename: &str,
515 data: &[u8],
516 entry_size_hint: Option<u64>,
517 depth: u32,
518 ) -> Result<(Vec<u8>, ArchiveStats)> {
519 let mut out: Vec<u8> = Vec::with_capacity(data.len());
520 let mut entry_stats = ArchiveStats::default();
521 let mut reader = io::Cursor::new(data);
522 self.sanitize_entry(
523 filename,
524 &mut reader,
525 &mut out,
526 &mut entry_stats,
527 entry_size_hint,
528 depth,
529 )?;
530 Ok((out, entry_stats))
531 }
532
533 #[allow(clippy::missing_errors_doc)] fn sanitize_entry(
545 &self,
546 filename: &str,
547 reader: &mut dyn Read,
548 writer: &mut dyn Write,
549 stats: &mut ArchiveStats,
550 entry_size_hint: Option<u64>,
551 depth: u32,
552 ) -> Result<()> {
553 if let Some(nested_fmt) = ArchiveFormat::from_path(filename) {
555 return self.sanitize_nested_archive(
556 filename,
557 reader,
558 writer,
559 stats,
560 entry_size_hint,
561 nested_fmt,
562 depth,
563 );
564 }
565
566 let within_size_cap = entry_size_hint.map_or(true, |sz| sz <= STRUCTURED_ENTRY_SIZE); if !self.force_text && within_size_cap {
574 if let Some(profile) = self.find_profile(filename) {
575 let mut content = Vec::new();
577 reader.read_to_end(&mut content).map_err(|e| {
578 SanitizeError::ArchiveError(format!("read entry '{filename}': {e}"))
579 })?;
580
581 stats.total_input_bytes += content.len() as u64;
582
583 if let Ok(Some(structured_out)) =
588 self.registry.process(&content, profile, &self.store)
589 {
590 let (output, scan_stats) = self.scanner.scan_bytes(&structured_out)?;
593 stats.structured_hits += 1;
594 stats.total_output_bytes += output.len() as u64;
595 stats.file_methods.insert(
596 filename.to_string(),
597 format!("structured+scan:{}", profile.processor),
598 );
599 stats
600 .file_scan_stats
601 .insert(filename.to_string(), scan_stats);
602 writer.write_all(&output).map_err(|e| {
603 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
604 })?;
605 return Ok(());
606 }
607
608 let (output, scan_stats) = self.scanner.scan_bytes(&content)?;
611 stats.scanner_fallback += 1;
612 stats.total_output_bytes += output.len() as u64;
613 stats
614 .file_methods
615 .insert(filename.to_string(), "scanner".to_string());
616 stats
617 .file_scan_stats
618 .insert(filename.to_string(), scan_stats);
619 writer.write_all(&output).map_err(|e| {
620 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
621 })?;
622 return Ok(());
623 }
624 }
625
626 let mut counting_r = CountingReader::new(reader);
631 let mut counting_w = CountingWriter::new(writer);
632 let scan_stats = self.scanner.scan_reader(&mut counting_r, &mut counting_w)?;
633
634 stats.scanner_fallback += 1;
635 stats.total_input_bytes += counting_r.bytes_read();
636 stats.total_output_bytes += counting_w.bytes_written();
637 stats
638 .file_methods
639 .insert(filename.to_string(), "scanner".to_string());
640 stats
641 .file_scan_stats
642 .insert(filename.to_string(), scan_stats);
643
644 Ok(())
645 }
646
647 #[allow(clippy::too_many_arguments)]
650 fn sanitize_nested_archive(
651 &self,
652 filename: &str,
653 reader: &mut dyn Read,
654 writer: &mut dyn Write,
655 stats: &mut ArchiveStats,
656 entry_size_hint: Option<u64>,
657 nested_fmt: ArchiveFormat,
658 depth: u32,
659 ) -> Result<()> {
660 if depth >= self.max_depth {
661 return Err(SanitizeError::RecursionDepthExceeded(format!(
662 "nested archive '{}' at depth {} exceeds maximum nesting depth of {}",
663 filename, depth, self.max_depth,
664 )));
665 }
666
667 if let Some(sz) = entry_size_hint {
669 if sz > STRUCTURED_ENTRY_SIZE {
670 return Err(SanitizeError::ArchiveError(format!(
671 "nested archive '{}' is too large ({} bytes, limit {} bytes)",
672 filename, sz, STRUCTURED_ENTRY_SIZE,
673 )));
674 }
675 }
676
677 let mut content = Vec::new();
678 reader.read_to_end(&mut content).map_err(|e| {
679 SanitizeError::ArchiveError(format!("read nested archive '{filename}': {e}"))
680 })?;
681 stats.total_input_bytes += content.len() as u64;
682
683 let mut output_buf: Vec<u8> = Vec::new();
685 let child_stats = match nested_fmt {
686 ArchiveFormat::Tar => {
687 self.process_tar_at_depth(&content[..], &mut output_buf, depth + 1)?
688 }
689 ArchiveFormat::TarGz => {
690 self.process_tar_gz_at_depth(&content[..], &mut output_buf, depth + 1)?
691 }
692 ArchiveFormat::Zip => {
693 let reader = io::Cursor::new(&content);
694 let mut writer = io::Cursor::new(Vec::new());
695 let s = self.process_zip_at_depth(reader, &mut writer, depth + 1)?;
696 output_buf = writer.into_inner();
697 s
698 }
699 };
700
701 stats.nested_archives += 1;
702 stats.merge(&child_stats);
703 stats.total_output_bytes += output_buf.len() as u64;
704 let fmt_name = match nested_fmt {
705 ArchiveFormat::Tar => "tar",
706 ArchiveFormat::TarGz => "tar.gz",
707 ArchiveFormat::Zip => "zip",
708 };
709 stats
710 .file_methods
711 .insert(filename.to_string(), format!("nested:{fmt_name}"));
712 writer.write_all(&output_buf).map_err(|e| {
713 SanitizeError::ArchiveError(format!("write nested archive '{filename}': {e}"))
714 })?;
715 Ok(())
716 }
717
718 pub fn discover_profiles_tar<R: Read>(&self, reader: R) -> Result<()> {
737 if self.profiles.is_empty() {
738 return Ok(());
739 }
740 let mut archive = tar::Archive::new(reader);
741 let entries = archive
742 .entries()
743 .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entries: {e}")))?;
744 for entry_result in entries {
745 let mut entry = entry_result
746 .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entry: {e}")))?;
747 if !entry.header().entry_type().is_file() {
748 continue;
749 }
750 let path = entry
751 .path()
752 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
753 .to_string_lossy()
754 .to_string();
755 let Some(profile) = self.find_profile(&path) else {
756 continue;
757 };
758 let mut content = Vec::new();
759 entry
760 .read_to_end(&mut content)
761 .map_err(|e| SanitizeError::ArchiveError(format!("read '{path}': {e}")))?;
762 let _ = self.registry.process(&content, profile, &self.store);
763 }
764 Ok(())
765 }
766
767 pub fn discover_profiles_tar_gz<R: Read>(&self, reader: R) -> Result<()> {
775 let gz = flate2::read::GzDecoder::new(reader);
776 self.discover_profiles_tar(gz)
777 }
778
779 pub fn discover_profiles_zip<R: Read + Seek>(&self, reader: R) -> Result<()> {
787 if self.profiles.is_empty() {
788 return Ok(());
789 }
790 let mut zip = zip::ZipArchive::new(reader)
791 .map_err(|e| SanitizeError::ArchiveError(format!("open zip for discovery: {e}")))?;
792 for i in 0..zip.len() {
793 let mut entry = zip
794 .by_index(i)
795 .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {i}: {e}")))?;
796 if entry.is_dir() {
797 continue;
798 }
799 let name = sanitize_zip_entry_name(entry.name());
800 let Some(profile) = self.find_profile(&name) else {
801 continue;
802 };
803 let mut content = Vec::new();
804 entry
805 .read_to_end(&mut content)
806 .map_err(|e| SanitizeError::ArchiveError(format!("read '{name}': {e}")))?;
807 let _ = self.registry.process(&content, profile, &self.store);
808 }
809 Ok(())
810 }
811
/// Sanitizes a tar archive read from `reader` and writes the rebuilt archive
/// to `writer`.
///
/// This is the top-level entry point; processing starts at nesting depth 0.
///
/// # Errors
///
/// Returns an error when the archive cannot be read, an entry cannot be
/// sanitized, or the output cannot be written.
pub fn process_tar<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
    self.process_tar_at_depth(reader, writer, 0)
}
828
829 #[allow(clippy::too_many_lines)]
847 fn process_tar_at_depth<R: Read, W: Write>(
848 &self,
849 reader: R,
850 writer: W,
851 depth: u32,
852 ) -> Result<ArchiveStats> {
853 struct TarEntry {
854 header: tar::Header,
855 path: String,
856 is_file: bool,
857 passes_filter: bool,
858 data: Vec<u8>,
859 }
860
861 let mut archive = tar::Archive::new(reader);
862 let mut builder = tar::Builder::new(writer);
863 let mut stats = ArchiveStats::default();
864
865 let mut entries_iter = archive
869 .entries()
870 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entries: {e}")))?;
871
872 let mut buffered: Vec<TarEntry> = Vec::new();
873 let mut file_count: usize = 0;
874 let mut total_data: u64 = 0;
875 let mut overflowed = false;
876
877 for entry_result in entries_iter.by_ref() {
878 let mut entry = entry_result
879 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
880
881 let header = entry.header().clone();
882 let path = entry
883 .path()
884 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
885 .to_string_lossy()
886 .into_owned();
887 let is_file = header.entry_type().is_file();
888 let passes_filter = !is_file || self.filter.passes(&path);
889
890 let mut data = Vec::new();
891 entry
892 .read_to_end(&mut data)
893 .map_err(|e| SanitizeError::ArchiveError(format!("read entry '{path}': {e}")))?;
894 drop(entry);
895
896 if is_file && passes_filter {
897 file_count += 1;
898 total_data = total_data.saturating_add(data.len() as u64);
899 }
900
901 buffered.push(TarEntry {
902 header,
903 path,
904 is_file,
905 passes_filter,
906 data,
907 });
908
909 if total_data > PARALLEL_TAR_DATA_SIZE {
910 overflowed = true;
911 break;
912 }
913 }
914
915 let use_parallel = !overflowed
917 && file_count >= self.parallel_threshold
918 && rayon::current_thread_index().is_none();
919
920 if use_parallel {
921 let file_indices: Vec<usize> = buffered
924 .iter()
925 .enumerate()
926 .filter(|(_, e)| e.is_file && e.passes_filter)
927 .map(|(i, _)| i)
928 .collect();
929
930 let results: Vec<ParEntryResult> = file_indices
931 .into_par_iter()
932 .map(|i| {
933 let e = &buffered[i];
934 let size_hint = e.header.size().ok();
935 (
936 i,
937 self.sanitize_entry_bytes(&e.path, &e.data, size_hint, depth),
938 )
939 })
940 .collect();
941
942 let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; buffered.len()];
943 for (i, r) in results {
944 sanitized[i] = Some(r?);
945 }
946
947 for (i, entry) in buffered.iter().enumerate() {
948 if !entry.is_file {
949 builder
950 .append(&entry.header, entry.data.as_slice())
951 .map_err(|e| {
952 SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
953 })?;
954 stats.entries_skipped += 1;
955 self.emit_progress(&stats, None, &entry.path);
956 continue;
957 }
958 if !entry.passes_filter {
959 stats.entries_filtered += 1;
960 self.emit_progress(&stats, None, &entry.path);
961 continue;
962 }
963
964 let (sanitized_buf, entry_stats) =
965 sanitized[i].take().expect("parallel result missing");
966 stats.merge(&entry_stats);
967 self.emit_entry_bytes(&entry.path, &sanitized_buf);
968
969 let mut new_header = entry.header.clone();
970 let safe_path = sanitize_tar_entry_name(&entry.path);
971 new_header.set_path(&safe_path).map_err(|e| {
972 SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
973 })?;
974 new_header.set_size(sanitized_buf.len() as u64);
975 new_header.set_cksum();
976 builder
977 .append(&new_header, sanitized_buf.as_slice())
978 .map_err(|e| {
979 SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
980 })?;
981 stats.files_processed += 1;
982 self.emit_progress(&stats, None, &entry.path);
983 }
984 } else {
985 let write_buffered = |entry: &TarEntry,
990 builder: &mut tar::Builder<W>,
991 stats: &mut ArchiveStats,
992 processor: &ArchiveProcessor|
993 -> Result<()> {
994 if !entry.is_file {
995 builder
996 .append(&entry.header, entry.data.as_slice())
997 .map_err(|e| {
998 SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
999 })?;
1000 stats.entries_skipped += 1;
1001 processor.emit_progress(stats, None, &entry.path);
1002 return Ok(());
1003 }
1004 if !entry.passes_filter {
1005 stats.entries_filtered += 1;
1006 processor.emit_progress(stats, None, &entry.path);
1007 return Ok(());
1008 }
1009 let size_hint = entry.header.size().ok();
1010 let (sanitized_buf, entry_stats) =
1011 processor.sanitize_entry_bytes(&entry.path, &entry.data, size_hint, depth)?;
1012 stats.merge(&entry_stats);
1013 processor.emit_entry_bytes(&entry.path, &sanitized_buf);
1014 let mut new_header = entry.header.clone();
1015 let safe_path = sanitize_tar_entry_name(&entry.path);
1016 new_header.set_path(&safe_path).map_err(|e| {
1017 SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
1018 })?;
1019 new_header.set_size(sanitized_buf.len() as u64);
1020 new_header.set_cksum();
1021 builder
1022 .append(&new_header, sanitized_buf.as_slice())
1023 .map_err(|e| {
1024 SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
1025 })?;
1026 stats.files_processed += 1;
1027 processor.emit_progress(stats, None, &entry.path);
1028 Ok(())
1029 };
1030
1031 for entry in &buffered {
1032 write_buffered(entry, &mut builder, &mut stats, self)?;
1033 }
1034 drop(buffered);
1035
1036 if overflowed {
1038 for entry_result in entries_iter {
1039 let mut entry = entry_result
1040 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
1041
1042 let header = entry.header().clone();
1043 let path = entry
1044 .path()
1045 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
1046 .to_string_lossy()
1047 .into_owned();
1048 let is_file = header.entry_type().is_file();
1049
1050 if !is_file {
1051 let mut data = Vec::new();
1052 entry.read_to_end(&mut data).map_err(|e| {
1053 SanitizeError::ArchiveError(format!("read '{path}': {e}"))
1054 })?;
1055 drop(entry);
1056 builder.append(&header, data.as_slice()).map_err(|e| {
1057 SanitizeError::ArchiveError(format!("append '{path}': {e}"))
1058 })?;
1059 stats.entries_skipped += 1;
1060 self.emit_progress(&stats, None, &path);
1061 continue;
1062 }
1063
1064 if !self.filter.passes(&path) {
1065 stats.entries_filtered += 1;
1066 continue;
1067 }
1068
1069 let size_hint = header.size().ok();
1070 let mut sanitized_buf = Vec::new();
1071 let mut entry_stats = ArchiveStats::default();
1072 self.sanitize_entry(
1073 &path,
1074 &mut entry,
1075 &mut sanitized_buf,
1076 &mut entry_stats,
1077 size_hint,
1078 depth,
1079 )?;
1080 drop(entry);
1081 self.emit_entry_bytes(&path, &sanitized_buf);
1082
1083 let mut new_header = header.clone();
1084 let safe_path = sanitize_tar_entry_name(&path);
1085 new_header.set_path(&safe_path).map_err(|e| {
1086 SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
1087 })?;
1088 new_header.set_size(sanitized_buf.len() as u64);
1089 new_header.set_cksum();
1090 builder
1091 .append(&new_header, sanitized_buf.as_slice())
1092 .map_err(|e| {
1093 SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
1094 })?;
1095
1096 stats.merge(&entry_stats);
1097 stats.files_processed += 1;
1098 self.emit_progress(&stats, None, &path);
1099 }
1100 }
1101 }
1102
1103 builder
1104 .finish()
1105 .map_err(|e| SanitizeError::ArchiveError(format!("finalize tar: {e}")))?;
1106
1107 Ok(stats)
1108 }
1109
/// Sanitizes a gzip-compressed tar archive read from `reader` and writes a
/// gzip-compressed, rebuilt archive to `writer`.
///
/// This is the top-level entry point; processing starts at nesting depth 0.
///
/// # Errors
///
/// Returns an error when the archive cannot be read, an entry cannot be
/// sanitized, or the output cannot be written.
pub fn process_tar_gz<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
    self.process_tar_gz_at_depth(reader, writer, 0)
}
1122
1123 fn process_tar_gz_at_depth<R: Read, W: Write>(
1125 &self,
1126 reader: R,
1127 writer: W,
1128 depth: u32,
1129 ) -> Result<ArchiveStats> {
1130 let gz_reader = flate2::read::GzDecoder::new(reader);
1131 let gz_writer = flate2::write::GzEncoder::new(writer, flate2::Compression::fast());
1132
1133 let stats = self.process_tar_at_depth(gz_reader, gz_writer, depth)?;
1134 Ok(stats)
1140 }
1141
/// Sanitizes a ZIP archive read from `reader` and writes the rebuilt archive
/// to `writer`.
///
/// This is the top-level entry point; processing starts at nesting depth 0.
/// Both sides must be seekable.
///
/// # Errors
///
/// Returns an error when the archive cannot be read, an entry cannot be
/// sanitized, or the output cannot be written.
pub fn process_zip<R: Read + Seek, W: Write + Seek>(
    &self,
    reader: R,
    writer: W,
) -> Result<ArchiveStats> {
    self.process_zip_at_depth(reader, writer, 0)
}
1164
1165 #[allow(clippy::too_many_lines)]
1176 fn process_zip_at_depth<R: Read + Seek, W: Write + Seek>(
1177 &self,
1178 reader: R,
1179 writer: W,
1180 depth: u32,
1181 ) -> Result<ArchiveStats> {
1182 struct ZipMeta {
1186 name: String,
1187 is_dir: bool,
1188 compression: zip::CompressionMethod,
1189 last_modified: Option<zip::DateTime>,
1190 unix_mode: Option<u32>,
1191 size: u64,
1192 }
1193
1194 let mut zip_in = zip::ZipArchive::new(reader)
1195 .map_err(|e| SanitizeError::ArchiveError(format!("open zip: {}", e)))?;
1196 let total_entries = zip_in.len();
1197 let total_entries_hint = Some(total_entries as u64);
1198
1199 let mut metas: Vec<ZipMeta> = Vec::with_capacity(total_entries);
1200 let mut file_count = 0usize;
1201 let mut total_uncompressed_size: u64 = 0;
1202
1203 for i in 0..total_entries {
1204 let entry = zip_in
1205 .by_index(i)
1206 .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
1207 let is_dir = entry.is_dir();
1208 let size = entry.size();
1209 if !is_dir {
1210 file_count += 1;
1211 total_uncompressed_size = total_uncompressed_size.saturating_add(size);
1212 }
1213 metas.push(ZipMeta {
1214 name: sanitize_zip_entry_name(entry.name()),
1215 is_dir,
1216 compression: entry.compression(),
1217 last_modified: entry.last_modified(),
1218 unix_mode: entry.unix_mode(),
1219 size,
1220 });
1221 }
1223
1224 let use_parallel = file_count >= self.parallel_threshold
1229 && rayon::current_thread_index().is_none()
1230 && total_uncompressed_size <= PARALLEL_ZIP_DATA_SIZE;
1231
1232 let mut stats = ArchiveStats::default();
1233
1234 let make_options = |m: &ZipMeta| {
1236 let mut opts =
1237 zip::write::SimpleFileOptions::default().compression_method(m.compression);
1238 if let Some(dt) = m.last_modified {
1239 opts = opts.last_modified_time(dt);
1240 }
1241 if let Some(mode) = m.unix_mode {
1242 opts.unix_permissions(mode)
1243 } else {
1244 opts
1245 }
1246 };
1247
1248 if use_parallel {
1249 struct ZipEntry {
1251 meta_idx: usize,
1252 data: Vec<u8>,
1253 }
1254
1255 let mut file_entries: Vec<ZipEntry> = Vec::with_capacity(file_count);
1256
1257 for (i, meta) in metas.iter().enumerate() {
1258 if meta.is_dir {
1259 continue;
1260 }
1261 if !self.filter.passes(&meta.name) {
1263 continue;
1264 }
1265 let mut entry = zip_in
1266 .by_index(i)
1267 .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
1268 let mut data = Vec::new();
1269 entry.read_to_end(&mut data).map_err(|e| {
1270 SanitizeError::ArchiveError(format!("read zip entry '{}': {}", meta.name, e))
1271 })?;
1272 file_entries.push(ZipEntry { meta_idx: i, data });
1273 }
1274
1275 let results: Vec<ParEntryResult> = file_entries
1276 .into_par_iter()
1277 .map(|e| {
1278 let meta = &metas[e.meta_idx];
1279 let result =
1280 self.sanitize_entry_bytes(&meta.name, &e.data, Some(meta.size), depth);
1281 (e.meta_idx, result)
1282 })
1283 .collect();
1284
1285 let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; metas.len()];
1288 for (meta_idx, r) in results {
1289 sanitized[meta_idx] = Some(r?);
1290 }
1291
1292 let mut zip_out = zip::ZipWriter::new(writer);
1293 for (i, meta) in metas.iter().enumerate() {
1294 let options = make_options(meta);
1295 if meta.is_dir {
1296 zip_out.add_directory(&meta.name, options).map_err(|e| {
1297 SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1298 })?;
1299 stats.entries_skipped += 1;
1300 self.emit_progress(&stats, total_entries_hint, &meta.name);
1301 continue;
1302 }
1303 if !self.filter.passes(&meta.name) {
1305 stats.entries_filtered += 1;
1306 self.emit_progress(&stats, total_entries_hint, &meta.name);
1307 continue;
1308 }
1309 let (sanitized_buf, entry_stats) = sanitized[i]
1310 .take()
1311 .expect("file entry sanitization result missing");
1312 stats.merge(&entry_stats);
1313 self.emit_entry_bytes(&meta.name, &sanitized_buf);
1314 zip_out.start_file(&meta.name, options).map_err(|e| {
1315 SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1316 })?;
1317 zip_out.write_all(&sanitized_buf).map_err(|e| {
1318 SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1319 })?;
1320 stats.files_processed += 1;
1321 self.emit_progress(&stats, total_entries_hint, &meta.name);
1322 }
1323 zip_out
1324 .finish()
1325 .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1326 } else {
1327 let mut zip_out = zip::ZipWriter::new(writer);
1330 for (i, meta) in metas.iter().enumerate() {
1331 let options = make_options(meta);
1332 if meta.is_dir {
1333 zip_out.add_directory(&meta.name, options).map_err(|e| {
1334 SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1335 })?;
1336 stats.entries_skipped += 1;
1337 self.emit_progress(&stats, total_entries_hint, &meta.name);
1338 continue;
1339 }
1340
1341 if !self.filter.passes(&meta.name) {
1343 stats.entries_filtered += 1;
1344 self.emit_progress(&stats, total_entries_hint, &meta.name);
1345 continue;
1346 }
1347
1348 let data = {
1349 let mut entry = zip_in.by_index(i).map_err(|e| {
1350 SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e))
1351 })?;
1352 let mut buf = Vec::new();
1353 entry.read_to_end(&mut buf).map_err(|e| {
1354 SanitizeError::ArchiveError(format!(
1355 "read zip entry '{}': {}",
1356 meta.name, e
1357 ))
1358 })?;
1359 buf
1360 };
1362
1363 let (sanitized_buf, entry_stats) =
1364 self.sanitize_entry_bytes(&meta.name, &data, Some(meta.size), depth)?;
1365 drop(data);
1366 self.emit_entry_bytes(&meta.name, &sanitized_buf);
1367
1368 zip_out.start_file(&meta.name, options).map_err(|e| {
1369 SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1370 })?;
1371 zip_out.write_all(&sanitized_buf).map_err(|e| {
1372 SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1373 })?;
1374 drop(sanitized_buf);
1375
1376 stats.merge(&entry_stats);
1377 stats.files_processed += 1;
1378 self.emit_progress(&stats, total_entries_hint, &meta.name);
1379 }
1380 zip_out
1381 .finish()
1382 .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1383 }
1384
1385 Ok(stats)
1386 }
1387
/// Sanitizes an archive of the given `format`, dispatching to
/// [`Self::process_zip`], [`Self::process_tar`] or [`Self::process_tar_gz`].
///
/// The `Seek` bounds apply to every format because the ZIP path needs them.
///
/// # Errors
///
/// Propagates any error from the format-specific method.
pub fn process<R: Read + Seek, W: Write + Seek>(
    &self,
    reader: R,
    writer: W,
    format: ArchiveFormat,
) -> Result<ArchiveStats> {
    match format {
        ArchiveFormat::Zip => self.process_zip(reader, writer),
        ArchiveFormat::Tar => self.process_tar(reader, writer),
        ArchiveFormat::TarGz => self.process_tar_gz(reader, writer),
    }
}
1415}
1416
/// `Read` adapter that tallies how many bytes have been read through it.
struct CountingReader<'a> {
    /// The wrapped reader.
    inner: &'a mut dyn Read,
    /// Running total of bytes returned by `read`.
    count: u64,
}

impl<'a> CountingReader<'a> {
    /// Wraps `inner` with the counter starting at zero.
    fn new(inner: &'a mut dyn Read) -> Self {
        CountingReader { count: 0, inner }
    }

    /// Total number of bytes read so far.
    fn bytes_read(&self) -> u64 {
        self.count
    }
}

impl Read for CountingReader<'_> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Only successful reads contribute to the tally.
        self.inner.read(buf).map(|n| {
            self.count += n as u64;
            n
        })
    }
}
1444
/// `Write` adapter that tallies how many bytes have been written through it.
struct CountingWriter<'a> {
    /// The wrapped writer.
    inner: &'a mut dyn Write,
    /// Running total of bytes accepted by `write`.
    count: u64,
}

impl<'a> CountingWriter<'a> {
    /// Wraps `inner` with the counter starting at zero.
    fn new(inner: &'a mut dyn Write) -> Self {
        CountingWriter { count: 0, inner }
    }

    /// Total number of bytes written so far.
    fn bytes_written(&self) -> u64 {
        self.count
    }
}

impl Write for CountingWriter<'_> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        // Count what the inner writer actually accepted, which may be less
        // than `buf.len()`.
        self.inner.write(buf).map(|n| {
            self.count += n as u64;
            n
        })
    }

    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}
1472
1473#[cfg(test)]
1478mod tests {
1479 use super::*;
1480 use crate::category::Category;
1481 use crate::generator::HmacGenerator;
1482 use crate::processor::profile::{FieldRule, FileTypeProfile};
1483 use crate::processor::registry::ProcessorRegistry;
1484 use crate::scanner::{ScanConfig, ScanPattern};
1485 use std::io::Cursor;
1486 use std::sync::Mutex;
1487
1488 fn make_archive_processor() -> ArchiveProcessor {
1490 let gen = Arc::new(HmacGenerator::new([42u8; 32]));
1491 let store = Arc::new(MappingStore::new(gen, None));
1492
1493 let patterns = vec![
1494 ScanPattern::from_regex(
1495 r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
1496 Category::Email,
1497 "email",
1498 )
1499 .unwrap(),
1500 ScanPattern::from_literal("SUPERSECRET", Category::Custom("api_key".into()), "api_key")
1501 .unwrap(),
1502 ];
1503
1504 let scanner = Arc::new(
1505 StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
1506 );
1507
1508 let registry = Arc::new(ProcessorRegistry::with_builtins());
1509
1510 let profiles = vec![FileTypeProfile::new(
1511 "json",
1512 vec![FieldRule::new("*").with_category(Category::Custom("field".into()))],
1513 )
1514 .with_extension(".json")];
1515
1516 ArchiveProcessor::new(registry, scanner, store, profiles)
1517 }
1518
1519 fn build_test_tar(entries: &[(&str, &[u8])]) -> Vec<u8> {
1522 let mut buf = Vec::new();
1523 {
1524 let mut builder = tar::Builder::new(&mut buf);
1525 for (name, data) in entries {
1526 let mut header = tar::Header::new_gnu();
1527 header.set_size(data.len() as u64);
1528 header.set_mode(0o644);
1529 header.set_mtime(1_700_000_000);
1530 header.set_cksum();
1531 builder.append_data(&mut header, *name, *data).unwrap();
1532 }
1533 builder.finish().unwrap();
1534 }
1535 buf
1536 }
1537
/// A plain-text tar entry is counted as a scanner fallback (no structured
/// hit), and the email it contained is absent from the rewritten entry.
#[test]
fn tar_sanitizes_plaintext_with_scanner() {
    let processor = make_archive_processor();
    let archive_bytes = build_test_tar(&[("readme.txt", b"Contact alice@corp.com for help.")]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.scanner_fallback, 1);
    assert_eq!(stats.structured_hits, 0);

    // Re-open the rewritten archive and inspect every entry body.
    let mut reread = tar::Archive::new(&sanitized[..]);
    for entry in reread.entries().unwrap() {
        let mut content = String::new();
        entry.unwrap().read_to_string(&mut content).unwrap();
        assert!(
            !content.contains("alice@corp.com"),
            "email should be sanitized: {content}"
        );
    }
}
1562
/// A `.json` tar entry is counted as a structured hit with method
/// `structured+scan:json`, and neither field value survives in the output.
#[test]
fn tar_sanitizes_json_with_structured_processor() {
    let processor = make_archive_processor();
    let archive_bytes = build_test_tar(&[(
        "config.json",
        br#"{"email": "bob@example.org", "name": "Bob"}"#,
    )]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.structured_hits, 1);
    assert_eq!(stats.scanner_fallback, 0);
    assert_eq!(
        stats.file_methods.get("config.json").unwrap(),
        "structured+scan:json"
    );

    let mut reread = tar::Archive::new(&sanitized[..]);
    for entry in reread.entries().unwrap() {
        let mut body = String::new();
        entry.unwrap().read_to_string(&mut body).unwrap();
        assert!(
            !body.contains("bob@example.org"),
            "email should be sanitized"
        );
        assert!(!body.contains("Bob"), "name should be sanitized");
    }
}
1593
/// The mode and mtime stamped on the input header are still present on the
/// headers of the sanitized tar.
#[test]
fn tar_preserves_metadata() {
    let processor = make_archive_processor();
    let archive_bytes = build_test_tar(&[("data.txt", b"SUPERSECRET token here")]);

    let mut sanitized = Vec::new();
    processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    let mut reread = tar::Archive::new(&sanitized[..]);
    for entry in reread.entries().unwrap() {
        let entry = entry.unwrap();
        let header = entry.header();
        // These are the values `build_test_tar` sets on every entry.
        assert_eq!(header.mode().unwrap(), 0o644);
        assert_eq!(header.mtime().unwrap(), 1_700_000_000);
    }
}
1610
/// Three entries in one tar: the `.json` one is a structured hit, the two
/// other files are scanner fallbacks.
#[test]
fn tar_handles_multiple_files() {
    let processor = make_archive_processor();
    let archive_bytes = build_test_tar(&[
        ("a.txt", b"alice@corp.com"),
        ("b.json", br#"{"key":"value"}"#),
        ("c.log", b"no secrets here"),
    ]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 3);
    assert_eq!(stats.structured_hits, 1);
    assert_eq!(stats.scanner_fallback, 2);
}
1629
/// A directory entry is counted as skipped, while the regular file stored
/// under it is still processed.
#[test]
fn tar_passes_through_directories() {
    let mut builder = tar::Builder::new(Vec::new());

    // Directory entry: zero-length, mode 0o755.
    let mut dir_header = tar::Header::new_gnu();
    dir_header.set_entry_type(tar::EntryType::Directory);
    dir_header.set_size(0);
    dir_header.set_mode(0o755);
    dir_header.set_cksum();
    builder
        .append_data(&mut dir_header, "mydir/", &b""[..])
        .unwrap();

    // Regular file inside that directory.
    let mut file_header = tar::Header::new_gnu();
    file_header.set_size(5);
    file_header.set_mode(0o644);
    file_header.set_cksum();
    builder
        .append_data(&mut file_header, "mydir/hello.txt", &b"hello"[..])
        .unwrap();

    // `into_inner` finishes the archive before returning the buffer.
    let archive_bytes = builder.into_inner().unwrap();

    let processor = make_archive_processor();
    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.entries_skipped, 1);
    assert_eq!(stats.files_processed, 1);
}
1665
/// A gzip-compressed tar goes through `process_tar_gz`; the output is still
/// gzip-wrapped tar and the literal secret is gone from its entry.
#[test]
fn tar_gz_round_trip() {
    let processor = make_archive_processor();

    let tar_bytes = build_test_tar(&[("secret.txt", b"Key is SUPERSECRET okay")]);
    // `finish` returns the underlying buffer once the gzip trailer is written.
    let mut encoder = flate2::write::GzEncoder::new(Vec::new(), flate2::Compression::fast());
    encoder.write_all(&tar_bytes).unwrap();
    let gz_input = encoder.finish().unwrap();

    let mut gz_output = Vec::new();
    let stats = processor
        .process_tar_gz(&gz_input[..], &mut gz_output)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.scanner_fallback, 1);

    // Decompress and walk the sanitized archive.
    let mut reread = tar::Archive::new(flate2::read::GzDecoder::new(&gz_output[..]));
    for entry in reread.entries().unwrap() {
        let mut content = String::new();
        entry.unwrap().read_to_string(&mut content).unwrap();
        assert!(
            !content.contains("SUPERSECRET"),
            "secret should be sanitized: {content}"
        );
    }
}
1701
1702 fn build_test_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
1705 let mut buf = Cursor::new(Vec::new());
1706 {
1707 let mut zip = zip::ZipWriter::new(&mut buf);
1708 for (name, data) in entries {
1709 let options = zip::write::SimpleFileOptions::default()
1710 .compression_method(zip::CompressionMethod::Deflated);
1711 zip.start_file(*name, options).unwrap();
1712 zip.write_all(data).unwrap();
1713 }
1714 zip.finish().unwrap();
1715 }
1716 buf.into_inner()
1717 }
1718
/// A plain-text zip entry is counted as a scanner fallback, and the email it
/// contained is absent from the first entry of the output zip.
#[test]
fn zip_sanitizes_plaintext_with_scanner() {
    let processor = make_archive_processor();
    let zip_data = build_test_zip(&[("notes.txt", b"Reach alice@corp.com for info.")]);

    let source = Cursor::new(&zip_data);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.scanner_fallback, 1);

    // Read the single entry back out of the sanitized archive.
    let mut reread = zip::ZipArchive::new(Cursor::new(sink.into_inner())).unwrap();
    let mut content = String::new();
    reread
        .by_index(0)
        .unwrap()
        .read_to_string(&mut content)
        .unwrap();
    assert!(
        !content.contains("alice@corp.com"),
        "email should be sanitized: {content}"
    );
}
1742
/// A `.json` zip entry is counted as a structured hit, and neither of its
/// field values appears in the output entry.
#[test]
fn zip_sanitizes_json_with_structured_processor() {
    let processor = make_archive_processor();
    let zip_data = build_test_zip(&[(
        "settings.json",
        br#"{"password": "hunter2", "host": "db.internal"}"#,
    )]);

    let source = Cursor::new(&zip_data);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.structured_hits, 1);

    let mut reread = zip::ZipArchive::new(Cursor::new(sink.into_inner())).unwrap();
    let mut body = String::new();
    reread
        .by_index(0)
        .unwrap()
        .read_to_string(&mut body)
        .unwrap();
    assert!(!body.contains("hunter2"), "password should be sanitized");
    assert!(!body.contains("db.internal"), "host should be sanitized");
}
1764
/// A zip directory entry is counted as skipped, while the file stored under
/// it is still processed.
#[test]
fn zip_preserves_directory_entries() {
    let mut cursor = Cursor::new(Vec::new());
    {
        let mut writer = zip::ZipWriter::new(&mut cursor);

        writer
            .add_directory("subdir/", zip::write::SimpleFileOptions::default())
            .unwrap();

        let stored = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        writer.start_file("subdir/data.txt", stored).unwrap();
        writer.write_all(b"SUPERSECRET value").unwrap();

        writer.finish().unwrap();
    }
    let zip_data = cursor.into_inner();

    let processor = make_archive_processor();
    let source = Cursor::new(&zip_data);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.entries_skipped, 1);
    assert_eq!(stats.files_processed, 1);
}
1791
/// Three entries in one zip: the `.json` one is a structured hit, the two
/// other files are scanner fallbacks.
#[test]
fn zip_handles_multiple_files() {
    let processor = make_archive_processor();
    let zip_data = build_test_zip(&[
        ("file1.txt", b"alice@corp.com"),
        ("file2.json", br#"{"secret":"SUPERSECRET"}"#),
        ("file3.log", b"nothing to see"),
    ]);

    let source = Cursor::new(&zip_data);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.files_processed, 3);
    assert_eq!(stats.structured_hits, 1);
    assert_eq!(stats.scanner_fallback, 2);
}
1809
/// Processing a two-entry tar delivers two progress updates; the last one
/// agrees with the returned stats and carries no total entry count.
#[test]
fn tar_progress_callback_receives_updates() {
    let recorded = Arc::new(Mutex::new(Vec::new()));
    let processor = make_archive_processor().with_progress_callback({
        // The callback owns its own handle to the shared log.
        let recorded = Arc::clone(&recorded);
        Arc::new(move |progress| {
            recorded
                .lock()
                .expect("archive progress lock")
                .push(progress.clone());
        })
    });
    let archive_bytes =
        build_test_tar(&[("a.txt", b"alice@corp.com"), ("b.txt", b"SUPERSECRET")]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();
    let recorded = recorded.lock().unwrap();

    assert_eq!(recorded.len(), 2);
    let last = recorded.last().unwrap();
    assert_eq!(last.entries_seen, 2);
    assert_eq!(last.files_processed, stats.files_processed);
    assert_eq!(last.total_entries, None);
}
1836
/// Processing a two-entry zip delivers two progress updates; the last one
/// reports a total of two entries and names the final entry.
#[test]
fn zip_progress_callback_reports_total_entries() {
    let recorded = Arc::new(Mutex::new(Vec::new()));
    let processor = make_archive_processor().with_progress_callback({
        // The callback owns its own handle to the shared log.
        let recorded = Arc::clone(&recorded);
        Arc::new(move |progress| {
            recorded
                .lock()
                .expect("archive progress lock")
                .push(progress.clone());
        })
    });
    let zip_data = build_test_zip(&[
        ("file1.txt", b"alice@corp.com"),
        ("file2.log", b"nothing to see"),
    ]);

    let source = Cursor::new(&zip_data);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();
    let recorded = recorded.lock().unwrap();

    assert_eq!(recorded.len(), 2);
    let last = recorded.last().unwrap();
    assert_eq!(last.files_processed, stats.files_processed);
    assert_eq!(last.total_entries, Some(2));
    assert_eq!(last.current_entry, "file2.log");
}
1867
/// `ArchiveFormat::from_path` recognises `.tar`, `.tar.gz`, `.tgz` and `.zip`
/// (including an upper-case name) and returns `None` for `photo.png`.
#[test]
fn format_detection_from_path() {
    let detect = |path: &str| ArchiveFormat::from_path(path);

    assert_eq!(detect("data.tar"), Some(ArchiveFormat::Tar));
    assert_eq!(detect("data.tar.gz"), Some(ArchiveFormat::TarGz));
    assert_eq!(detect("data.tgz"), Some(ArchiveFormat::TarGz));
    assert_eq!(detect("data.zip"), Some(ArchiveFormat::Zip));
    assert_eq!(detect("DATA.ZIP"), Some(ArchiveFormat::Zip));
    assert_eq!(detect("photo.png"), None);
}
1894
/// The same email appearing in two tar entries is replaced by the same
/// value in both, and that value does not contain the original address.
#[test]
fn same_secret_gets_same_replacement_across_entries() {
    let processor = make_archive_processor();
    let archive_bytes = build_test_tar(&[
        ("a.txt", b"contact alice@corp.com"),
        ("b.txt", b"reach alice@corp.com"),
    ]);

    let mut sanitized = Vec::new();
    processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    let mut reread = tar::Archive::new(&sanitized[..]);
    let contents: Vec<String> = reread
        .entries()
        .unwrap()
        .map(|entry| {
            let mut text = String::new();
            entry.unwrap().read_to_string(&mut text).unwrap();
            text
        })
        .collect();

    // Whatever follows the fixed lead-in word is the replacement token.
    let replacement_a = contents[0].strip_prefix("contact ").unwrap();
    let replacement_b = contents[1].strip_prefix("reach ").unwrap();
    assert_eq!(
        replacement_a, replacement_b,
        "dedup should produce identical replacements"
    );
    assert!(!replacement_a.contains("alice@corp.com"));
}
1927
/// `process` called with `ArchiveFormat::Tar` handles a one-file tar and
/// reports that file as processed.
#[test]
fn process_auto_dispatch_tar() {
    let processor = make_archive_processor();
    let source = Cursor::new(build_test_tar(&[("f.txt", b"SUPERSECRET")]));
    let sink = Cursor::new(Vec::new());

    let stats = processor
        .process(source, sink, ArchiveFormat::Tar)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
}
1941
/// `process` called with `ArchiveFormat::Zip` handles a one-file zip and
/// reports that file as processed.
#[test]
fn process_auto_dispatch_zip() {
    let processor = make_archive_processor();
    let source = Cursor::new(build_test_zip(&[("f.txt", b"SUPERSECRET")]));
    let mut sink = Cursor::new(Vec::new());

    let stats = processor
        .process(source, &mut sink, ArchiveFormat::Zip)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
}
1955
/// An entry-less tar is processed without error and reports zero processed
/// files and zero skipped entries.
#[test]
fn tar_empty_archive() {
    let processor = make_archive_processor();
    let empty_tar = build_test_tar(&[]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&empty_tar[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 0);
    assert_eq!(stats.entries_skipped, 0);
}
1969
/// An entry-less zip is processed without error and reports zero processed
/// files.
#[test]
fn zip_empty_archive() {
    let processor = make_archive_processor();
    let source = Cursor::new(build_test_zip(&[]));
    let mut sink = Cursor::new(Vec::new());

    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.files_processed, 0);
}
1981
/// Relative names with no traversal components come back unchanged.
#[test]
fn zip_entry_name_clean_passthrough() {
    for name in ["logs/app.log", "config.yaml", "a/b/c.txt"] {
        assert_eq!(sanitize_zip_entry_name(name), name);
    }
}
1990
/// One or several leading slashes are removed, making the name relative.
#[test]
fn zip_entry_name_strips_leading_slash() {
    for rooted in ["/etc/passwd", "///etc/passwd"] {
        assert_eq!(sanitize_zip_entry_name(rooted), "etc/passwd");
    }
}
1996
/// `..` components are dropped wherever they occur; the remaining
/// components are kept in order.
#[test]
fn zip_entry_name_strips_dotdot() {
    let cases = [
        ("../etc/passwd", "etc/passwd"),
        ("a/../../etc/passwd", "a/etc/passwd"),
        ("../../root/.ssh/id_rsa", "root/.ssh/id_rsa"),
    ];
    for (raw, expected) in cases {
        assert_eq!(sanitize_zip_entry_name(raw), expected);
    }
}
2009
/// Leading `./` components, single or repeated, are removed.
#[test]
fn zip_entry_name_strips_leading_dot_slash() {
    for dotted in ["./config.yaml", "././config.yaml"] {
        assert_eq!(sanitize_zip_entry_name(dotted), "config.yaml");
    }
}
2015
/// Backslashes are treated as separators: they become `/`, and a `..`
/// component written with backslashes is dropped as well.
#[test]
fn zip_entry_name_backslash_normalised() {
    let cases = [
        ("a\\b\\c.txt", "a/b/c.txt"),
        ("..\\etc\\passwd", "etc/passwd"),
    ];
    for (raw, expected) in cases {
        assert_eq!(sanitize_zip_entry_name(raw), expected);
    }
}
2021
/// Inputs that have no component left after sanitizing yield the
/// placeholder name `_`.
#[test]
fn zip_entry_name_empty_result_replaced() {
    for degenerate in ["../..", "", "/"] {
        assert_eq!(sanitize_zip_entry_name(degenerate), "_");
    }
}
2028
/// A leading slash followed directly by `..` loses both.
#[test]
fn zip_entry_name_absolute_dotdot_combo() {
    let cleaned = sanitize_zip_entry_name("/../etc/passwd");
    assert_eq!(cleaned, "etc/passwd");
}
2033
/// A filter built without any patterns reports itself empty and passes the
/// paths it is asked about.
#[test]
fn filter_empty_passes_everything() {
    let filter = ArchiveFilter::new(vec![], vec![]).unwrap();
    assert!(filter.is_empty());
    for path in ["config/app.yaml", "logs/server.log"] {
        assert!(filter.passes(path));
    }
}
2043
/// With an `only` glob of `**/*.json`, `.json` paths at any depth pass and a
/// `.yaml` path does not.
#[test]
fn filter_only_glob_includes_match() {
    let filter = ArchiveFilter::new(vec!["**/*.json".into()], vec![]).unwrap();
    assert!(!filter.is_empty());
    for json_path in ["config/settings.json", "deep/nested/file.json"] {
        assert!(filter.passes(json_path));
    }
    assert!(!filter.passes("config/settings.yaml"));
}
2052
/// An `only` pattern ending in `/` passes paths below that directory,
/// including nested ones, and rejects a path elsewhere.
#[test]
fn filter_only_dir_prefix_includes_subtree() {
    let filter = ArchiveFilter::new(vec!["config/".into()], vec![]).unwrap();
    for inside in ["config/app.yaml", "config/nested/db.yaml"] {
        assert!(filter.passes(inside));
    }
    assert!(!filter.passes("logs/server.log"));
}
2060
/// The directory pattern `config/` also passes the bare name `config`
/// (no trailing slash).
#[test]
fn filter_dir_prefix_exact_match() {
    let filter = ArchiveFilter::new(vec!["config/".into()], vec![]).unwrap();
    assert!(filter.passes("config"));
}
2067
/// With only an `exclude` glob, matching paths are rejected and a
/// non-matching path still passes.
#[test]
fn filter_exclude_removes_match() {
    let filter = ArchiveFilter::new(vec![], vec!["**/*.log".into()]).unwrap();
    let excluded = filter.passes("logs/server.log");
    assert!(!excluded);
    let kept = filter.passes("config/app.yaml");
    assert!(kept);
}
2074
/// With both lists set, a path must be under the `only` directory and must
/// not match the `exclude` pattern in order to pass.
#[test]
fn filter_only_and_exclude_combined() {
    let only = vec!["config/".into()];
    let exclude = vec!["config/secrets.yaml".into()];
    let filter = ArchiveFilter::new(only, exclude).unwrap();

    assert!(filter.passes("config/app.yaml"));
    // Inside the included directory but explicitly excluded.
    assert!(!filter.passes("config/secrets.yaml"));
    // Outside the included directory.
    assert!(!filter.passes("logs/server.log"));
}
2083
/// A malformed glob is reported as an error whether it appears in the
/// `only` list or in the `exclude` list.
#[test]
fn filter_invalid_glob_returns_error() {
    let bad_only = ArchiveFilter::new(vec!["[invalid".into()], vec![]);
    assert!(bad_only.is_err());

    let bad_exclude = ArchiveFilter::new(vec![], vec!["[bad".into()]);
    assert!(bad_exclude.is_err());
}
2089
/// Requesting a depth of 999 leaves `max_depth` at `MAX_ARCHIVE_DEPTH`.
#[test]
fn builder_with_max_depth_clamps_at_max() {
    let configured = make_archive_processor().with_max_depth(999);
    let effective = configured.max_depth;
    assert_eq!(effective, MAX_ARCHIVE_DEPTH);
}
2097
/// A requested depth of 2 is stored as given.
#[test]
fn builder_with_max_depth_sets_value() {
    let configured = make_archive_processor().with_max_depth(2);
    let effective = configured.max_depth;
    assert_eq!(effective, 2);
}
2103
/// The parallel threshold passed to the builder is stored as given, even at
/// `usize::MAX`.
#[test]
fn builder_with_parallel_threshold_sets_value() {
    let configured = make_archive_processor().with_parallel_threshold(usize::MAX);
    let stored = configured.parallel_threshold;
    assert_eq!(stored, usize::MAX);
}
2109
/// `with_force_text(true)` turns the `force_text` flag on.
#[test]
fn builder_with_force_text_enables_flag() {
    let configured = make_archive_processor().with_force_text(true);
    let flag = configured.force_text;
    assert!(flag);
}
2115
/// With an `only` filter of `**/*.json`, a zip holding one `.json` and one
/// `.txt` entry reports one processed file and one filtered entry.
#[test]
fn builder_with_filter_applied_to_zip() {
    let json_only = ArchiveFilter::new(vec!["**/*.json".into()], vec![]).unwrap();
    let processor = make_archive_processor().with_filter(json_only);

    let zip_data = build_test_zip(&[
        ("config.json", br#"{"email":"alice@corp.com"}"#),
        ("notes.txt", b"alice@corp.com"),
    ]);

    let source = Cursor::new(zip_data);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.entries_filtered, 1);
}
2134
/// With an `only` filter of `**/*.json`, a tar holding one `.json` and one
/// `.txt` entry reports one processed file and one filtered entry.
#[test]
fn builder_with_filter_applied_to_tar() {
    let json_only = ArchiveFilter::new(vec!["**/*.json".into()], vec![]).unwrap();
    let processor = make_archive_processor().with_filter(json_only);

    let archive_bytes = build_test_tar(&[
        ("config.json", br#"{"email":"alice@corp.com"}"#),
        ("notes.txt", b"alice@corp.com"),
    ]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.entries_filtered, 1);
}
2151
/// With the parallel threshold set to 0 (presumably selecting the parallel
/// path — confirm against `process_tar`), all four tar entries are processed
/// and none of the original emails appears in any output entry.
#[test]
fn parallel_tar_sanitizes_all_entries() {
    let processor = make_archive_processor().with_parallel_threshold(0);
    let archive_bytes = build_test_tar(&[
        ("a.txt", b"alice@corp.com"),
        ("b.txt", b"bob@corp.com"),
        ("c.txt", b"carol@corp.com"),
        ("d.txt", b"dave@corp.com"),
    ]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 4);

    let originals = [
        "alice@corp.com",
        "bob@corp.com",
        "carol@corp.com",
        "dave@corp.com",
    ];
    let mut reread = tar::Archive::new(&sanitized[..]);
    for entry in reread.entries().unwrap() {
        let mut entry = entry.unwrap();
        let mut body = String::new();
        entry.read_to_string(&mut body).unwrap();
        // Every entry is checked against every original, not only its own.
        for &secret in &originals {
            assert!(
                !body.contains(secret),
                "original secret leaked in {:?}",
                entry.path()
            );
        }
    }
}
2191
/// With the parallel threshold set to 0, the output tar lists its entries
/// in the same order as the input.
#[test]
fn parallel_tar_preserves_entry_order() {
    let processor = make_archive_processor().with_parallel_threshold(0);
    let archive_bytes = build_test_tar(&[
        ("first.txt", b"alice@corp.com"),
        ("second.txt", b"hello"),
        ("third.txt", b"bob@corp.com"),
    ]);

    let mut sanitized = Vec::new();
    processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    // Collect entry paths in the order the reader yields them.
    let mut reread = tar::Archive::new(&sanitized[..]);
    let mut names: Vec<String> = Vec::new();
    for entry in reread.entries().unwrap() {
        let entry = entry.unwrap();
        names.push(entry.path().unwrap().to_string_lossy().to_string());
    }

    assert_eq!(names, vec!["first.txt", "second.txt", "third.txt"]);
}
2213
/// With the parallel threshold set to 0, all four zip entries are processed
/// and none of the original emails appears in any output entry.
#[test]
fn parallel_zip_sanitizes_all_entries() {
    let processor = make_archive_processor().with_parallel_threshold(0);
    let zip_data = build_test_zip(&[
        ("a.txt", b"alice@corp.com"),
        ("b.txt", b"bob@corp.com"),
        ("c.txt", b"carol@corp.com"),
        ("d.txt", b"dave@corp.com"),
    ]);

    let source = Cursor::new(zip_data);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.files_processed, 4);

    let originals = [
        "alice@corp.com",
        "bob@corp.com",
        "carol@corp.com",
        "dave@corp.com",
    ];
    let mut reread = zip::ZipArchive::new(Cursor::new(sink.into_inner())).unwrap();
    for i in 0..reread.len() {
        let mut body = String::new();
        reread
            .by_index(i)
            .unwrap()
            .read_to_string(&mut body)
            .unwrap();
        // Every entry is checked against every original, not only its own.
        for &secret in &originals {
            assert!(
                !body.contains(secret),
                "original secret leaked in entry {i}"
            );
        }
    }
}
2250
/// With the parallel threshold set to 0 and a mix of `.json` and `.txt`
/// entries, the stats split two structured hits from two scanner fallbacks
/// and no original email is left in the output.
#[test]
fn parallel_tar_mixed_structured_and_scanner() {
    let processor = make_archive_processor().with_parallel_threshold(0);
    let archive_bytes = build_test_tar(&[
        ("config.json", br#"{"email":"alice@corp.com","port":5432}"#),
        ("notes.txt", b"contact bob@corp.com for help"),
        ("data.json", br#"{"email":"carol@corp.com"}"#),
        ("readme.txt", b"dave@corp.com is the owner"),
    ]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 4);
    assert_eq!(stats.structured_hits, 2);
    assert_eq!(stats.scanner_fallback, 2);

    let originals = [
        "alice@corp.com",
        "bob@corp.com",
        "carol@corp.com",
        "dave@corp.com",
    ];
    let mut reread = tar::Archive::new(&sanitized[..]);
    for entry in reread.entries().unwrap() {
        let mut body = String::new();
        entry.unwrap().read_to_string(&mut body).unwrap();
        for &secret in &originals {
            assert!(!body.contains(secret), "original secret leaked");
        }
    }
}
2284
/// A tar stored inside a tar is counted as one nested archive, and the
/// email inside the inner tar is absent from the rewritten inner entry.
#[test]
fn tar_in_tar_secrets_sanitized() {
    let inner_tar = build_test_tar(&[("inner.txt", b"alice@corp.com")]);
    let outer_tar = build_test_tar(&[("nested.tar", &inner_tar)]);

    let processor = make_archive_processor();
    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&outer_tar[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.nested_archives, 1);

    // Unwrap both layers: each outer entry body is itself a tar archive.
    let mut outer = tar::Archive::new(&sanitized[..]);
    for outer_entry in outer.entries().unwrap() {
        let mut nested_bytes = Vec::new();
        outer_entry
            .unwrap()
            .read_to_end(&mut nested_bytes)
            .unwrap();

        let mut nested = tar::Archive::new(&nested_bytes[..]);
        for nested_entry in nested.entries().unwrap() {
            let mut body = String::new();
            nested_entry.unwrap().read_to_string(&mut body).unwrap();
            assert!(
                !body.contains("alice@corp.com"),
                "secret survived nested tar"
            );
        }
    }
}
2319
/// A zip stored inside a tar is counted as one nested archive, and the
/// literal secret inside the zip is absent from the rewritten zip entries.
#[test]
fn zip_in_tar_secrets_sanitized() {
    let inner_zip = build_test_zip(&[("inner.txt", b"SUPERSECRET")]);
    let outer_tar = build_test_tar(&[("nested.zip", &inner_zip)]);

    let processor = make_archive_processor();
    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&outer_tar[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.nested_archives, 1);

    // Each outer tar entry body is a zip archive; open it and check inside.
    let mut outer = tar::Archive::new(&sanitized[..]);
    for outer_entry in outer.entries().unwrap() {
        let mut nested_bytes = Vec::new();
        outer_entry
            .unwrap()
            .read_to_end(&mut nested_bytes)
            .unwrap();

        let mut nested = zip::ZipArchive::new(Cursor::new(nested_bytes)).unwrap();
        for i in 0..nested.len() {
            let mut body = String::new();
            nested
                .by_index(i)
                .unwrap()
                .read_to_string(&mut body)
                .unwrap();
            assert!(
                !body.contains("SUPERSECRET"),
                "secret survived zip-in-tar"
            );
        }
    }
}
2348
/// A zip stored inside a zip is counted as one nested archive, and the
/// email inside the inner zip is absent from the rewritten inner entry.
#[test]
fn zip_in_zip_secrets_sanitized() {
    let inner_zip = build_test_zip(&[("secret.txt", b"alice@corp.com")]);
    let outer_zip = build_test_zip(&[("nested.zip", &inner_zip)]);

    let processor = make_archive_processor();
    let source = Cursor::new(outer_zip);
    let mut sink = Cursor::new(Vec::new());
    let stats = processor.process_zip(source, &mut sink).unwrap();

    assert_eq!(stats.nested_archives, 1);

    // First layer: pull the nested zip's bytes out of the outer archive.
    let mut outer = zip::ZipArchive::new(Cursor::new(sink.into_inner())).unwrap();
    let mut nested_bytes = Vec::new();
    outer
        .by_index(0)
        .unwrap()
        .read_to_end(&mut nested_bytes)
        .unwrap();

    // Second layer: read the text entry inside the nested zip.
    let mut nested = zip::ZipArchive::new(Cursor::new(nested_bytes)).unwrap();
    let mut body = String::new();
    nested
        .by_index(0)
        .unwrap()
        .read_to_string(&mut body)
        .unwrap();

    assert!(
        !body.contains("alice@corp.com"),
        "secret survived zip-in-zip"
    );
}
2381
/// With `max_depth` set to 1, a tar nested two levels deep (tar in tar in
/// tar) fails with `SanitizeError::RecursionDepthExceeded`.
#[test]
fn nested_archive_depth_limit_returns_error() {
    let processor = make_archive_processor().with_max_depth(1);

    // Build three layers from the inside out.
    let level_two = build_test_tar(&[("file.txt", b"secret")]);
    let level_one = build_test_tar(&[("inner.tar", &level_two)]);
    let top_level = build_test_tar(&[("middle.tar", &level_one)]);

    let mut sanitized = Vec::new();
    let err = processor
        .process_tar(&top_level[..], &mut sanitized)
        .unwrap_err();
    assert!(matches!(err, SanitizeError::RecursionDepthExceeded(_)));
}
2396
/// With `force_text` enabled, a `.json` entry is counted as a scanner
/// fallback and not as a structured hit.
#[test]
fn force_text_skips_structured_processor() {
    let processor = make_archive_processor().with_force_text(true);
    let archive_bytes = build_test_tar(&[("config.json", br#"{"email":"alice@corp.com"}"#)]);

    let mut sanitized = Vec::new();
    let stats = processor
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(stats.scanner_fallback, 1);
    assert_eq!(stats.structured_hits, 0);
}
2409}