1use crate::error::{Result, SanitizeError};
54use crate::processor::profile::FileTypeProfile;
55use crate::processor::registry::ProcessorRegistry;
56use crate::scanner::{ScanStats, StreamScanner};
57use crate::store::MappingStore;
58
/// Normalises an archive entry name into a relative, forward-slash separated path.
///
/// Both `/` and `\` are treated as separators. Empty components, `.` and `..` are
/// discarded, so the result never starts with a separator and never contains a
/// parent-directory component. If nothing is left, `"_"` is returned.
fn sanitize_archive_entry_name(name: &str) -> String {
    let mut cleaned = String::with_capacity(name.len());

    for component in name.split(|c: char| matches!(c, '/' | '\\')) {
        // Dropping empty components also removes leading, trailing and doubled separators.
        if matches!(component, "" | "." | "..") {
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push('/');
        }
        cleaned.push_str(component);
    }

    if cleaned.is_empty() {
        cleaned.push('_');
    }
    cleaned
}
79
/// Sanitizes a zip entry name.
///
/// Thin wrapper that forwards to `sanitize_archive_entry_name`; see that function for the
/// exact normalisation rules.
#[inline]
fn sanitize_zip_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
84
/// Sanitizes a tar entry name.
///
/// Thin wrapper that forwards to `sanitize_archive_entry_name`; see that function for the
/// exact normalisation rules.
#[inline]
fn sanitize_tar_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
89
90use glob::MatchOptions;
91use rayon::prelude::*;
92use std::collections::HashMap;
93use std::io::{self, Read, Seek, Write};
94use std::sync::Arc;
95
96use crate::processor::limits::{
97 DEFAULT_ARCHIVE_DEPTH, MAX_ARCHIVE_DEPTH, PARALLEL_ENTRY_THRESHOLD, PARALLEL_TAR_DATA_SIZE,
98 PARALLEL_ZIP_DATA_SIZE, STRUCTURED_ENTRY_SIZE,
99};
100
/// Result of sanitizing one entry on a worker thread: the entry's index in the buffered
/// entry list, paired with either the sanitized bytes and per-entry stats or the error.
type ParEntryResult = (usize, Result<(Vec<u8>, ArchiveStats)>);
107
/// Include/exclude filter applied to archive entry paths.
///
/// The default value has no patterns and lets every path through.
#[derive(Default, Clone)]
pub struct ArchiveFilter {
    /// When non-empty, a path must match at least one of these patterns to pass.
    only: Vec<CompiledPattern>,
    /// A path matching any of these patterns is rejected.
    exclude: Vec<CompiledPattern>,
}
139
/// A filter pattern in its compiled form.
#[derive(Clone)]
enum CompiledPattern {
    /// Directory prefix (stored without trailing `/`); matches the directory itself and
    /// everything beneath it.
    DirPrefix(String),
    /// Glob pattern, matched using `GLOB_OPTS`.
    Glob(glob::Pattern),
}
148
/// Matching options used for every glob pattern in an `ArchiveFilter`: case-sensitive,
/// path separators must be matched literally, and a leading dot needs no special handling.
const GLOB_OPTS: MatchOptions = MatchOptions {
    case_sensitive: true,
    require_literal_separator: true,
    require_literal_leading_dot: false,
};
154
155impl CompiledPattern {
156 fn compile(raw: &str) -> std::result::Result<Self, String> {
157 if raw.ends_with('/') {
158 Ok(CompiledPattern::DirPrefix(
160 raw.trim_end_matches('/').to_string(),
161 ))
162 } else {
163 glob::Pattern::new(raw)
164 .map(CompiledPattern::Glob)
165 .map_err(|e| format!("invalid glob pattern '{raw}': {e}"))
166 }
167 }
168
169 fn matches(&self, path: &str) -> bool {
170 match self {
171 CompiledPattern::DirPrefix(prefix) => {
172 path == prefix || path.starts_with(&format!("{prefix}/"))
173 }
174 CompiledPattern::Glob(pat) => pat.matches_with(path, GLOB_OPTS),
175 }
176 }
177}
178
179impl ArchiveFilter {
180 pub fn new(only: Vec<String>, exclude: Vec<String>) -> std::result::Result<Self, String> {
186 let only = only
187 .into_iter()
188 .map(|p| CompiledPattern::compile(&p))
189 .collect::<std::result::Result<Vec<_>, _>>()?;
190 let exclude = exclude
191 .into_iter()
192 .map(|p| CompiledPattern::compile(&p))
193 .collect::<std::result::Result<Vec<_>, _>>()?;
194 Ok(Self { only, exclude })
195 }
196
197 pub fn is_empty(&self) -> bool {
199 self.only.is_empty() && self.exclude.is_empty()
200 }
201
202 pub fn passes(&self, path: &str) -> bool {
206 if !self.only.is_empty() && !self.only.iter().any(|p| p.matches(path)) {
207 return false;
208 }
209 if self.exclude.iter().any(|p| p.matches(path)) {
210 return false;
211 }
212 true
213 }
214}
215
/// Archive container formats recognised by this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArchiveFormat {
    /// Zip archive (`.zip`).
    Zip,
    /// Uncompressed tar archive (`.tar`).
    Tar,
    /// Gzip-compressed tar archive (`.tar.gz` or `.tgz`).
    TarGz,
}

impl ArchiveFormat {
    /// Detects the archive format from a path's extension, ignoring ASCII case.
    ///
    /// Returns `None` when the path does not end in a recognised archive extension.
    pub fn from_path(path: &str) -> Option<Self> {
        let lowered = path.to_ascii_lowercase();

        // The two-part suffix is checked on the whole string because `Path::extension`
        // only reports the final component (`gz`).
        if lowered.ends_with(".tar.gz") {
            return Some(Self::TarGz);
        }

        let extension = std::path::Path::new(&lowered).extension()?.to_str()?;
        match extension {
            "tgz" => Some(Self::TarGz),
            "tar" => Some(Self::Tar),
            "zip" => Some(Self::Zip),
            _ => None,
        }
    }
}
256
/// Counters and per-file details collected while sanitizing an archive.
#[derive(Debug, Clone, Default)]
pub struct ArchiveStats {
    /// Number of file entries sanitized and written to the output archive.
    pub files_processed: u64,
    /// Number of non-file entries (e.g. directories) copied through without sanitizing.
    pub entries_skipped: u64,
    /// Number of entries for which a structured processor produced output.
    pub structured_hits: u64,
    /// Number of entries handled by the scanner alone.
    pub scanner_fallback: u64,
    /// Number of nested archives that were recursed into.
    pub nested_archives: u64,
    /// Total bytes read from entries.
    pub total_input_bytes: u64,
    /// Total bytes written for entries.
    pub total_output_bytes: u64,
    /// Entry name mapped to the method used for it (`"scanner"`,
    /// `"structured+scan:<processor>"` or `"nested:<format>"`).
    pub file_methods: HashMap<String, String>,
    /// Entry name mapped to the scan statistics reported by the scanner for that entry.
    pub file_scan_stats: HashMap<String, ScanStats>,
    /// Number of file entries dropped because the `ArchiveFilter` rejected them.
    pub entries_filtered: u64,
}
287
/// Snapshot passed to the progress callback after each archive entry is handled.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct ArchiveProgress {
    /// Sum of `files_processed` and `entries_skipped` at the time of the snapshot.
    pub entries_seen: u64,
    /// Number of file entries sanitized so far.
    pub files_processed: u64,
    /// Number of non-file entries copied through so far.
    pub entries_skipped: u64,
    /// Total number of entries when known up front (zip); `None` for tar streams.
    pub total_entries: Option<u64>,
    /// Name of the entry that was just handled.
    pub current_entry: String,
}
302
/// Shared, thread-safe callback invoked with an [`ArchiveProgress`] snapshot.
type ArchiveProgressCallback = Arc<dyn Fn(&ArchiveProgress) + Send + Sync>;
304
305impl ArchiveStats {
306 fn merge(&mut self, child: &ArchiveStats) {
308 self.files_processed += child.files_processed;
309 self.entries_skipped += child.entries_skipped;
310 self.structured_hits += child.structured_hits;
311 self.scanner_fallback += child.scanner_fallback;
312 self.nested_archives += child.nested_archives;
313 self.total_input_bytes += child.total_input_bytes;
314 self.total_output_bytes += child.total_output_bytes;
315 self.entries_filtered += child.entries_filtered;
316 self.file_methods.extend(
317 child
318 .file_methods
319 .iter()
320 .map(|(k, v)| (k.clone(), v.clone())),
321 );
322 self.file_scan_stats.extend(
323 child
324 .file_scan_stats
325 .iter()
326 .map(|(k, v)| (k.clone(), v.clone())),
327 );
328 }
329}
330
/// Sanitizes the contents of tar, tar.gz and zip archives entry by entry, writing a new
/// archive of the same format.
pub struct ArchiveProcessor {
    /// Structured processors, tried for entries whose name matches a profile.
    registry: Arc<ProcessorRegistry>,
    /// Scanner applied to every sanitized entry (alone, or after structured processing).
    scanner: Arc<StreamScanner>,
    /// Mapping store handed to the structured processors.
    store: Arc<MappingStore>,
    /// File-type profiles; the first one matching an entry name is used.
    profiles: Vec<FileTypeProfile>,
    /// Maximum depth of archives nested inside archives.
    max_depth: u32,
    /// Optional callback invoked after each entry is handled.
    progress_callback: Option<ArchiveProgressCallback>,
    /// Minimum number of file entries before entries are sanitized in parallel.
    parallel_threshold: usize,
    /// Include/exclude filter applied to file entry paths.
    filter: ArchiveFilter,
    /// When `true`, structured processing is skipped for non-archive entries.
    force_text: bool,
}
385
386impl ArchiveProcessor {
    /// Creates a processor with default settings: `DEFAULT_ARCHIVE_DEPTH` as the nesting
    /// limit, `PARALLEL_ENTRY_THRESHOLD` as the parallel threshold, no progress callback,
    /// an empty filter and structured processing enabled.
    pub fn new(
        registry: Arc<ProcessorRegistry>,
        scanner: Arc<StreamScanner>,
        store: Arc<MappingStore>,
        profiles: Vec<FileTypeProfile>,
    ) -> Self {
        Self {
            registry,
            scanner,
            store,
            profiles,
            max_depth: DEFAULT_ARCHIVE_DEPTH,
            progress_callback: None,
            parallel_threshold: PARALLEL_ENTRY_THRESHOLD,
            filter: ArchiveFilter::default(),
            force_text: false,
        }
    }
413
414 #[must_use]
420 pub fn with_max_depth(mut self, depth: u32) -> Self {
421 self.max_depth = depth.min(MAX_ARCHIVE_DEPTH);
422 self
423 }
424
425 #[must_use]
430 pub fn with_parallel_threshold(mut self, threshold: usize) -> Self {
431 self.parallel_threshold = threshold;
432 self
433 }
434
435 #[must_use]
437 pub fn with_progress_callback(mut self, callback: ArchiveProgressCallback) -> Self {
438 self.progress_callback = Some(callback);
439 self
440 }
441
442 #[must_use]
448 pub fn with_filter(mut self, filter: ArchiveFilter) -> Self {
449 self.filter = filter;
450 self
451 }
452
453 #[must_use]
460 pub fn with_force_text(mut self, force_text: bool) -> Self {
461 self.force_text = force_text;
462 self
463 }
464
465 fn find_profile(&self, filename: &str) -> Option<&FileTypeProfile> {
467 self.profiles.iter().find(|p| p.matches_filename(filename))
468 }
469
470 fn emit_progress(&self, stats: &ArchiveStats, total_entries: Option<u64>, current_entry: &str) {
471 if let Some(callback) = &self.progress_callback {
472 callback(&ArchiveProgress {
473 entries_seen: stats.files_processed + stats.entries_skipped,
474 files_processed: stats.files_processed,
475 entries_skipped: stats.entries_skipped,
476 total_entries,
477 current_entry: current_entry.to_string(),
478 });
479 }
480 }
481
482 fn sanitize_entry_bytes(
489 &self,
490 filename: &str,
491 data: &[u8],
492 entry_size_hint: Option<u64>,
493 depth: u32,
494 ) -> Result<(Vec<u8>, ArchiveStats)> {
495 let mut out: Vec<u8> = Vec::with_capacity(data.len());
496 let mut entry_stats = ArchiveStats::default();
497 let mut reader = io::Cursor::new(data);
498 self.sanitize_entry(
499 filename,
500 &mut reader,
501 &mut out,
502 &mut entry_stats,
503 entry_size_hint,
504 depth,
505 )?;
506 Ok((out, entry_stats))
507 }
508
509 #[allow(clippy::missing_errors_doc)] fn sanitize_entry(
521 &self,
522 filename: &str,
523 reader: &mut dyn Read,
524 writer: &mut dyn Write,
525 stats: &mut ArchiveStats,
526 entry_size_hint: Option<u64>,
527 depth: u32,
528 ) -> Result<()> {
529 if let Some(nested_fmt) = ArchiveFormat::from_path(filename) {
531 return self.sanitize_nested_archive(
532 filename,
533 reader,
534 writer,
535 stats,
536 entry_size_hint,
537 nested_fmt,
538 depth,
539 );
540 }
541
542 let within_size_cap = entry_size_hint.map_or(true, |sz| sz <= STRUCTURED_ENTRY_SIZE); if !self.force_text && within_size_cap {
550 if let Some(profile) = self.find_profile(filename) {
551 let mut content = Vec::new();
553 reader.read_to_end(&mut content).map_err(|e| {
554 SanitizeError::ArchiveError(format!("read entry '{filename}': {e}"))
555 })?;
556
557 stats.total_input_bytes += content.len() as u64;
558
559 if let Ok(Some(structured_out)) =
564 self.registry.process(&content, profile, &self.store)
565 {
566 let (output, scan_stats) = self.scanner.scan_bytes(&structured_out)?;
569 stats.structured_hits += 1;
570 stats.total_output_bytes += output.len() as u64;
571 stats.file_methods.insert(
572 filename.to_string(),
573 format!("structured+scan:{}", profile.processor),
574 );
575 stats
576 .file_scan_stats
577 .insert(filename.to_string(), scan_stats);
578 writer.write_all(&output).map_err(|e| {
579 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
580 })?;
581 return Ok(());
582 }
583
584 let (output, scan_stats) = self.scanner.scan_bytes(&content)?;
587 stats.scanner_fallback += 1;
588 stats.total_output_bytes += output.len() as u64;
589 stats
590 .file_methods
591 .insert(filename.to_string(), "scanner".to_string());
592 stats
593 .file_scan_stats
594 .insert(filename.to_string(), scan_stats);
595 writer.write_all(&output).map_err(|e| {
596 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
597 })?;
598 return Ok(());
599 }
600 }
601
602 let mut counting_r = CountingReader::new(reader);
607 let mut counting_w = CountingWriter::new(writer);
608 let scan_stats = self.scanner.scan_reader(&mut counting_r, &mut counting_w)?;
609
610 stats.scanner_fallback += 1;
611 stats.total_input_bytes += counting_r.bytes_read();
612 stats.total_output_bytes += counting_w.bytes_written();
613 stats
614 .file_methods
615 .insert(filename.to_string(), "scanner".to_string());
616 stats
617 .file_scan_stats
618 .insert(filename.to_string(), scan_stats);
619
620 Ok(())
621 }
622
623 #[allow(clippy::too_many_arguments)]
626 fn sanitize_nested_archive(
627 &self,
628 filename: &str,
629 reader: &mut dyn Read,
630 writer: &mut dyn Write,
631 stats: &mut ArchiveStats,
632 entry_size_hint: Option<u64>,
633 nested_fmt: ArchiveFormat,
634 depth: u32,
635 ) -> Result<()> {
636 if depth >= self.max_depth {
637 return Err(SanitizeError::RecursionDepthExceeded(format!(
638 "nested archive '{}' at depth {} exceeds maximum nesting depth of {}",
639 filename, depth, self.max_depth,
640 )));
641 }
642
643 if let Some(sz) = entry_size_hint {
645 if sz > STRUCTURED_ENTRY_SIZE {
646 return Err(SanitizeError::ArchiveError(format!(
647 "nested archive '{}' is too large ({} bytes, limit {} bytes)",
648 filename, sz, STRUCTURED_ENTRY_SIZE,
649 )));
650 }
651 }
652
653 let mut content = Vec::new();
654 reader.read_to_end(&mut content).map_err(|e| {
655 SanitizeError::ArchiveError(format!("read nested archive '{filename}': {e}"))
656 })?;
657 stats.total_input_bytes += content.len() as u64;
658
659 let mut output_buf: Vec<u8> = Vec::new();
661 let child_stats = match nested_fmt {
662 ArchiveFormat::Tar => {
663 self.process_tar_at_depth(&content[..], &mut output_buf, depth + 1)?
664 }
665 ArchiveFormat::TarGz => {
666 self.process_tar_gz_at_depth(&content[..], &mut output_buf, depth + 1)?
667 }
668 ArchiveFormat::Zip => {
669 let reader = io::Cursor::new(&content);
670 let mut writer = io::Cursor::new(Vec::new());
671 let s = self.process_zip_at_depth(reader, &mut writer, depth + 1)?;
672 output_buf = writer.into_inner();
673 s
674 }
675 };
676
677 stats.nested_archives += 1;
678 stats.merge(&child_stats);
679 stats.total_output_bytes += output_buf.len() as u64;
680 let fmt_name = match nested_fmt {
681 ArchiveFormat::Tar => "tar",
682 ArchiveFormat::TarGz => "tar.gz",
683 ArchiveFormat::Zip => "zip",
684 };
685 stats
686 .file_methods
687 .insert(filename.to_string(), format!("nested:{fmt_name}"));
688 writer.write_all(&output_buf).map_err(|e| {
689 SanitizeError::ArchiveError(format!("write nested archive '{filename}': {e}"))
690 })?;
691 Ok(())
692 }
693
694 pub fn discover_profiles_tar<R: Read>(&self, reader: R) -> Result<()> {
713 if self.profiles.is_empty() {
714 return Ok(());
715 }
716 let mut archive = tar::Archive::new(reader);
717 let entries = archive
718 .entries()
719 .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entries: {e}")))?;
720 for entry_result in entries {
721 let mut entry = entry_result
722 .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entry: {e}")))?;
723 if !entry.header().entry_type().is_file() {
724 continue;
725 }
726 let path = entry
727 .path()
728 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
729 .to_string_lossy()
730 .to_string();
731 let Some(profile) = self.find_profile(&path) else {
732 continue;
733 };
734 let mut content = Vec::new();
735 entry
736 .read_to_end(&mut content)
737 .map_err(|e| SanitizeError::ArchiveError(format!("read '{path}': {e}")))?;
738 let _ = self.registry.process(&content, profile, &self.store);
739 }
740 Ok(())
741 }
742
743 pub fn discover_profiles_tar_gz<R: Read>(&self, reader: R) -> Result<()> {
751 let gz = flate2::read::GzDecoder::new(reader);
752 self.discover_profiles_tar(gz)
753 }
754
755 pub fn discover_profiles_zip<R: Read + Seek>(&self, reader: R) -> Result<()> {
763 if self.profiles.is_empty() {
764 return Ok(());
765 }
766 let mut zip = zip::ZipArchive::new(reader)
767 .map_err(|e| SanitizeError::ArchiveError(format!("open zip for discovery: {e}")))?;
768 for i in 0..zip.len() {
769 let mut entry = zip
770 .by_index(i)
771 .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {i}: {e}")))?;
772 if entry.is_dir() {
773 continue;
774 }
775 let name = sanitize_zip_entry_name(entry.name());
776 let Some(profile) = self.find_profile(&name) else {
777 continue;
778 };
779 let mut content = Vec::new();
780 entry
781 .read_to_end(&mut content)
782 .map_err(|e| SanitizeError::ArchiveError(format!("read '{name}': {e}")))?;
783 let _ = self.registry.process(&content, profile, &self.store);
784 }
785 Ok(())
786 }
787
    /// Sanitizes a top-level (depth 0) tar archive read from `reader`, writing the
    /// sanitized tar archive to `writer`.
    ///
    /// # Errors
    ///
    /// Propagates any error from `process_tar_at_depth`.
    pub fn process_tar<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
        self.process_tar_at_depth(reader, writer, 0)
    }
804
805 #[allow(clippy::too_many_lines)]
823 fn process_tar_at_depth<R: Read, W: Write>(
824 &self,
825 reader: R,
826 writer: W,
827 depth: u32,
828 ) -> Result<ArchiveStats> {
829 struct TarEntry {
830 header: tar::Header,
831 path: String,
832 is_file: bool,
833 passes_filter: bool,
834 data: Vec<u8>,
835 }
836
837 let mut archive = tar::Archive::new(reader);
838 let mut builder = tar::Builder::new(writer);
839 let mut stats = ArchiveStats::default();
840
841 let mut entries_iter = archive
845 .entries()
846 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entries: {e}")))?;
847
848 let mut buffered: Vec<TarEntry> = Vec::new();
849 let mut file_count: usize = 0;
850 let mut total_data: u64 = 0;
851 let mut overflowed = false;
852
853 for entry_result in entries_iter.by_ref() {
854 let mut entry = entry_result
855 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
856
857 let header = entry.header().clone();
858 let path = entry
859 .path()
860 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
861 .to_string_lossy()
862 .into_owned();
863 let is_file = header.entry_type().is_file();
864 let passes_filter = !is_file || self.filter.passes(&path);
865
866 let mut data = Vec::new();
867 entry
868 .read_to_end(&mut data)
869 .map_err(|e| SanitizeError::ArchiveError(format!("read entry '{path}': {e}")))?;
870 drop(entry);
871
872 if is_file && passes_filter {
873 file_count += 1;
874 total_data = total_data.saturating_add(data.len() as u64);
875 }
876
877 buffered.push(TarEntry {
878 header,
879 path,
880 is_file,
881 passes_filter,
882 data,
883 });
884
885 if total_data > PARALLEL_TAR_DATA_SIZE {
886 overflowed = true;
887 break;
888 }
889 }
890
891 let use_parallel = !overflowed
893 && file_count >= self.parallel_threshold
894 && rayon::current_thread_index().is_none();
895
896 if use_parallel {
897 let file_indices: Vec<usize> = buffered
900 .iter()
901 .enumerate()
902 .filter(|(_, e)| e.is_file && e.passes_filter)
903 .map(|(i, _)| i)
904 .collect();
905
906 let results: Vec<ParEntryResult> = file_indices
907 .into_par_iter()
908 .map(|i| {
909 let e = &buffered[i];
910 let size_hint = e.header.size().ok();
911 (
912 i,
913 self.sanitize_entry_bytes(&e.path, &e.data, size_hint, depth),
914 )
915 })
916 .collect();
917
918 let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; buffered.len()];
919 for (i, r) in results {
920 sanitized[i] = Some(r?);
921 }
922
923 for (i, entry) in buffered.iter().enumerate() {
924 if !entry.is_file {
925 builder
926 .append(&entry.header, entry.data.as_slice())
927 .map_err(|e| {
928 SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
929 })?;
930 stats.entries_skipped += 1;
931 self.emit_progress(&stats, None, &entry.path);
932 continue;
933 }
934 if !entry.passes_filter {
935 stats.entries_filtered += 1;
936 self.emit_progress(&stats, None, &entry.path);
937 continue;
938 }
939
940 let (sanitized_buf, entry_stats) =
941 sanitized[i].take().expect("parallel result missing");
942 stats.merge(&entry_stats);
943
944 let mut new_header = entry.header.clone();
945 let safe_path = sanitize_tar_entry_name(&entry.path);
946 new_header.set_path(&safe_path).map_err(|e| {
947 SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
948 })?;
949 new_header.set_size(sanitized_buf.len() as u64);
950 new_header.set_cksum();
951 builder
952 .append(&new_header, sanitized_buf.as_slice())
953 .map_err(|e| {
954 SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
955 })?;
956 stats.files_processed += 1;
957 self.emit_progress(&stats, None, &entry.path);
958 }
959 } else {
960 let write_buffered = |entry: &TarEntry,
965 builder: &mut tar::Builder<W>,
966 stats: &mut ArchiveStats,
967 processor: &ArchiveProcessor|
968 -> Result<()> {
969 if !entry.is_file {
970 builder
971 .append(&entry.header, entry.data.as_slice())
972 .map_err(|e| {
973 SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
974 })?;
975 stats.entries_skipped += 1;
976 processor.emit_progress(stats, None, &entry.path);
977 return Ok(());
978 }
979 if !entry.passes_filter {
980 stats.entries_filtered += 1;
981 processor.emit_progress(stats, None, &entry.path);
982 return Ok(());
983 }
984 let size_hint = entry.header.size().ok();
985 let (sanitized_buf, entry_stats) =
986 processor.sanitize_entry_bytes(&entry.path, &entry.data, size_hint, depth)?;
987 stats.merge(&entry_stats);
988 let mut new_header = entry.header.clone();
989 let safe_path = sanitize_tar_entry_name(&entry.path);
990 new_header.set_path(&safe_path).map_err(|e| {
991 SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
992 })?;
993 new_header.set_size(sanitized_buf.len() as u64);
994 new_header.set_cksum();
995 builder
996 .append(&new_header, sanitized_buf.as_slice())
997 .map_err(|e| {
998 SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
999 })?;
1000 stats.files_processed += 1;
1001 processor.emit_progress(stats, None, &entry.path);
1002 Ok(())
1003 };
1004
1005 for entry in &buffered {
1006 write_buffered(entry, &mut builder, &mut stats, self)?;
1007 }
1008 drop(buffered);
1009
1010 if overflowed {
1012 for entry_result in entries_iter {
1013 let mut entry = entry_result
1014 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
1015
1016 let header = entry.header().clone();
1017 let path = entry
1018 .path()
1019 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
1020 .to_string_lossy()
1021 .into_owned();
1022 let is_file = header.entry_type().is_file();
1023
1024 if !is_file {
1025 let mut data = Vec::new();
1026 entry.read_to_end(&mut data).map_err(|e| {
1027 SanitizeError::ArchiveError(format!("read '{path}': {e}"))
1028 })?;
1029 drop(entry);
1030 builder.append(&header, data.as_slice()).map_err(|e| {
1031 SanitizeError::ArchiveError(format!("append '{path}': {e}"))
1032 })?;
1033 stats.entries_skipped += 1;
1034 self.emit_progress(&stats, None, &path);
1035 continue;
1036 }
1037
1038 if !self.filter.passes(&path) {
1039 stats.entries_filtered += 1;
1040 continue;
1041 }
1042
1043 let size_hint = header.size().ok();
1044 let mut sanitized_buf = Vec::new();
1045 let mut entry_stats = ArchiveStats::default();
1046 self.sanitize_entry(
1047 &path,
1048 &mut entry,
1049 &mut sanitized_buf,
1050 &mut entry_stats,
1051 size_hint,
1052 depth,
1053 )?;
1054 drop(entry);
1055
1056 let mut new_header = header.clone();
1057 let safe_path = sanitize_tar_entry_name(&path);
1058 new_header.set_path(&safe_path).map_err(|e| {
1059 SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
1060 })?;
1061 new_header.set_size(sanitized_buf.len() as u64);
1062 new_header.set_cksum();
1063 builder
1064 .append(&new_header, sanitized_buf.as_slice())
1065 .map_err(|e| {
1066 SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
1067 })?;
1068
1069 stats.merge(&entry_stats);
1070 stats.files_processed += 1;
1071 self.emit_progress(&stats, None, &path);
1072 }
1073 }
1074 }
1075
1076 builder
1077 .finish()
1078 .map_err(|e| SanitizeError::ArchiveError(format!("finalize tar: {e}")))?;
1079
1080 Ok(stats)
1081 }
1082
    /// Sanitizes a top-level (depth 0) gzip-compressed tar archive read from `reader`,
    /// writing a gzip-compressed tar archive to `writer`.
    ///
    /// # Errors
    ///
    /// Propagates any error from `process_tar_gz_at_depth`.
    pub fn process_tar_gz<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
        self.process_tar_gz_at_depth(reader, writer, 0)
    }
1095
1096 fn process_tar_gz_at_depth<R: Read, W: Write>(
1098 &self,
1099 reader: R,
1100 writer: W,
1101 depth: u32,
1102 ) -> Result<ArchiveStats> {
1103 let gz_reader = flate2::read::GzDecoder::new(reader);
1104 let gz_writer = flate2::write::GzEncoder::new(writer, flate2::Compression::fast());
1105
1106 let stats = self.process_tar_at_depth(gz_reader, gz_writer, depth)?;
1107 Ok(stats)
1113 }
1114
    /// Sanitizes a top-level (depth 0) zip archive read from `reader`, writing the
    /// sanitized zip archive to `writer`. Both sides must be seekable.
    ///
    /// # Errors
    ///
    /// Propagates any error from `process_zip_at_depth`.
    pub fn process_zip<R: Read + Seek, W: Write + Seek>(
        &self,
        reader: R,
        writer: W,
    ) -> Result<ArchiveStats> {
        self.process_zip_at_depth(reader, writer, 0)
    }
1137
    /// Sanitizes a zip archive at nesting level `depth`.
    ///
    /// A first pass collects metadata for every entry (with sanitized names). Then:
    ///
    /// * **Parallel** - when there are at least `parallel_threshold` non-directory entries,
    ///   the caller is not already on a rayon worker thread, and the total declared
    ///   uncompressed size is at most `PARALLEL_ZIP_DATA_SIZE`. All filter-passing files
    ///   are read into memory, sanitized in parallel, then written in original order.
    /// * **Sequential** - otherwise each entry is read, sanitized and written in turn.
    ///
    /// Directories are re-created and counted as skipped. Files rejected by the filter are
    /// dropped and counted as filtered. Compression method, modification time and Unix
    /// mode are carried over to the output entry.
    ///
    /// # Errors
    ///
    /// Returns `ArchiveError` on zip read/write failures and propagates sanitization
    /// errors, including those from nested archives.
    // Long because it hosts both the parallel and the sequential path.
    #[allow(clippy::too_many_lines)]
    fn process_zip_at_depth<R: Read + Seek, W: Write + Seek>(
        &self,
        reader: R,
        writer: W,
        depth: u32,
    ) -> Result<ArchiveStats> {
        /// Metadata captured for each entry in the first pass.
        struct ZipMeta {
            name: String,
            is_dir: bool,
            compression: zip::CompressionMethod,
            last_modified: Option<zip::DateTime>,
            unix_mode: Option<u32>,
            size: u64,
        }

        let mut zip_in = zip::ZipArchive::new(reader)
            .map_err(|e| SanitizeError::ArchiveError(format!("open zip: {}", e)))?;
        let total_entries = zip_in.len();
        let total_entries_hint = Some(total_entries as u64);

        let mut metas: Vec<ZipMeta> = Vec::with_capacity(total_entries);
        let mut file_count = 0usize;
        let mut total_uncompressed_size: u64 = 0;

        // Pass 1: metadata only; no entry data is read here.
        for i in 0..total_entries {
            let entry = zip_in
                .by_index(i)
                .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
            let is_dir = entry.is_dir();
            let size = entry.size();
            if !is_dir {
                file_count += 1;
                total_uncompressed_size = total_uncompressed_size.saturating_add(size);
            }
            metas.push(ZipMeta {
                name: sanitize_zip_entry_name(entry.name()),
                is_dir,
                compression: entry.compression(),
                last_modified: entry.last_modified(),
                unix_mode: entry.unix_mode(),
                size,
            });
        }

        // `current_thread_index()` is `Some` on a rayon worker; in that case stay
        // sequential instead of starting another parallel pass.
        let use_parallel = file_count >= self.parallel_threshold
            && rayon::current_thread_index().is_none()
            && total_uncompressed_size <= PARALLEL_ZIP_DATA_SIZE;

        let mut stats = ArchiveStats::default();

        // Builds output options that mirror the input entry's metadata.
        let make_options = |m: &ZipMeta| {
            let mut opts =
                zip::write::SimpleFileOptions::default().compression_method(m.compression);
            if let Some(dt) = m.last_modified {
                opts = opts.last_modified_time(dt);
            }
            if let Some(mode) = m.unix_mode {
                opts.unix_permissions(mode)
            } else {
                opts
            }
        };

        if use_parallel {
            /// Raw data of one file entry plus the index of its metadata.
            struct ZipEntry {
                meta_idx: usize,
                data: Vec<u8>,
            }

            let mut file_entries: Vec<ZipEntry> = Vec::with_capacity(file_count);

            // Entries are read one at a time here; only sanitization runs in parallel.
            for (i, meta) in metas.iter().enumerate() {
                if meta.is_dir {
                    continue;
                }
                if !self.filter.passes(&meta.name) {
                    continue;
                }
                let mut entry = zip_in
                    .by_index(i)
                    .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
                let mut data = Vec::new();
                entry.read_to_end(&mut data).map_err(|e| {
                    SanitizeError::ArchiveError(format!("read zip entry '{}': {}", meta.name, e))
                })?;
                file_entries.push(ZipEntry { meta_idx: i, data });
            }

            let results: Vec<ParEntryResult> = file_entries
                .into_par_iter()
                .map(|e| {
                    let meta = &metas[e.meta_idx];
                    let result =
                        self.sanitize_entry_bytes(&meta.name, &e.data, Some(meta.size), depth);
                    (e.meta_idx, result)
                })
                .collect();

            // Place each result at its metadata index so output order matches input order.
            let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; metas.len()];
            for (meta_idx, r) in results {
                sanitized[meta_idx] = Some(r?);
            }

            // The writer is created only after reading and sanitizing have succeeded.
            let mut zip_out = zip::ZipWriter::new(writer);
            for (i, meta) in metas.iter().enumerate() {
                let options = make_options(meta);
                if meta.is_dir {
                    zip_out.add_directory(&meta.name, options).map_err(|e| {
                        SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
                    })?;
                    stats.entries_skipped += 1;
                    self.emit_progress(&stats, total_entries_hint, &meta.name);
                    continue;
                }
                if !self.filter.passes(&meta.name) {
                    stats.entries_filtered += 1;
                    self.emit_progress(&stats, total_entries_hint, &meta.name);
                    continue;
                }
                // Every filter-passing file index was sanitized above.
                let (sanitized_buf, entry_stats) = sanitized[i]
                    .take()
                    .expect("file entry sanitization result missing");
                stats.merge(&entry_stats);
                zip_out.start_file(&meta.name, options).map_err(|e| {
                    SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
                })?;
                zip_out.write_all(&sanitized_buf).map_err(|e| {
                    SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
                })?;
                stats.files_processed += 1;
                self.emit_progress(&stats, total_entries_hint, &meta.name);
            }
            zip_out
                .finish()
                .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
        } else {
            let mut zip_out = zip::ZipWriter::new(writer);
            for (i, meta) in metas.iter().enumerate() {
                let options = make_options(meta);
                if meta.is_dir {
                    zip_out.add_directory(&meta.name, options).map_err(|e| {
                        SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
                    })?;
                    stats.entries_skipped += 1;
                    self.emit_progress(&stats, total_entries_hint, &meta.name);
                    continue;
                }

                if !self.filter.passes(&meta.name) {
                    stats.entries_filtered += 1;
                    self.emit_progress(&stats, total_entries_hint, &meta.name);
                    continue;
                }

                // Scoped so the borrow of `zip_in` ends before sanitizing.
                let data = {
                    let mut entry = zip_in.by_index(i).map_err(|e| {
                        SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e))
                    })?;
                    let mut buf = Vec::new();
                    entry.read_to_end(&mut buf).map_err(|e| {
                        SanitizeError::ArchiveError(format!(
                            "read zip entry '{}': {}",
                            meta.name, e
                        ))
                    })?;
                    buf
                };

                let (sanitized_buf, entry_stats) =
                    self.sanitize_entry_bytes(&meta.name, &data, Some(meta.size), depth)?;
                // Free the raw entry data as early as possible.
                drop(data);

                zip_out.start_file(&meta.name, options).map_err(|e| {
                    SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
                })?;
                zip_out.write_all(&sanitized_buf).map_err(|e| {
                    SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
                })?;
                drop(sanitized_buf);

                stats.merge(&entry_stats);
                stats.files_processed += 1;
                self.emit_progress(&stats, total_entries_hint, &meta.name);
            }
            zip_out
                .finish()
                .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
        }

        Ok(stats)
    }
1358
1359 pub fn process<R: Read + Seek, W: Write + Seek>(
1375 &self,
1376 reader: R,
1377 writer: W,
1378 format: ArchiveFormat,
1379 ) -> Result<ArchiveStats> {
1380 match format {
1381 ArchiveFormat::Zip => self.process_zip(reader, writer),
1382 ArchiveFormat::Tar => self.process_tar(reader, writer),
1383 ArchiveFormat::TarGz => self.process_tar_gz(reader, writer),
1384 }
1385 }
1386}
1387
/// `Read` adapter that forwards to a borrowed reader and tallies the bytes it returns.
struct CountingReader<'a> {
    /// The wrapped reader.
    inner: &'a mut dyn Read,
    /// Total number of bytes returned by `read` so far.
    count: u64,
}

impl<'a> CountingReader<'a> {
    /// Wraps `inner` with the byte counter starting at zero.
    fn new(inner: &'a mut dyn Read) -> Self {
        CountingReader { inner, count: 0 }
    }

    /// Total number of bytes read through this adapter.
    fn bytes_read(&self) -> u64 {
        self.count
    }
}

impl Read for CountingReader<'_> {
    /// Reads from the wrapped reader and adds the bytes obtained to the running total.
    /// Errors are passed through without changing the count.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|filled| {
            self.count += filled as u64;
            filled
        })
    }
}
1415
/// `Write` adapter that forwards to a borrowed writer and tallies the bytes it accepts.
struct CountingWriter<'a> {
    /// The wrapped writer.
    inner: &'a mut dyn Write,
    /// Total number of bytes accepted by `write` so far.
    count: u64,
}

impl<'a> CountingWriter<'a> {
    /// Wraps `inner` with the byte counter starting at zero.
    fn new(inner: &'a mut dyn Write) -> Self {
        CountingWriter { inner, count: 0 }
    }

    /// Total number of bytes written through this adapter.
    fn bytes_written(&self) -> u64 {
        self.count
    }
}

impl Write for CountingWriter<'_> {
    /// Writes to the wrapped writer and adds the bytes it accepted to the running total.
    /// Errors are passed through without changing the count.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.inner.write(buf).map(|accepted| {
            self.count += accepted as u64;
            accepted
        })
    }

    /// Flushes the wrapped writer.
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}
1443
1444#[cfg(test)]
1449mod tests {
1450 use super::*;
1451 use crate::category::Category;
1452 use crate::generator::HmacGenerator;
1453 use crate::processor::profile::{FieldRule, FileTypeProfile};
1454 use crate::processor::registry::ProcessorRegistry;
1455 use crate::scanner::{ScanConfig, ScanPattern};
1456 use std::io::Cursor;
1457 use std::sync::Mutex;
1458
1459 fn make_archive_processor() -> ArchiveProcessor {
1461 let gen = Arc::new(HmacGenerator::new([42u8; 32]));
1462 let store = Arc::new(MappingStore::new(gen, None));
1463
1464 let patterns = vec![
1465 ScanPattern::from_regex(
1466 r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
1467 Category::Email,
1468 "email",
1469 )
1470 .unwrap(),
1471 ScanPattern::from_literal("SUPERSECRET", Category::Custom("api_key".into()), "api_key")
1472 .unwrap(),
1473 ];
1474
1475 let scanner = Arc::new(
1476 StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
1477 );
1478
1479 let registry = Arc::new(ProcessorRegistry::with_builtins());
1480
1481 let profiles = vec![FileTypeProfile::new(
1482 "json",
1483 vec![FieldRule::new("*").with_category(Category::Custom("field".into()))],
1484 )
1485 .with_extension(".json")];
1486
1487 ArchiveProcessor::new(registry, scanner, store, profiles)
1488 }
1489
1490 fn build_test_tar(entries: &[(&str, &[u8])]) -> Vec<u8> {
1493 let mut buf = Vec::new();
1494 {
1495 let mut builder = tar::Builder::new(&mut buf);
1496 for (name, data) in entries {
1497 let mut header = tar::Header::new_gnu();
1498 header.set_size(data.len() as u64);
1499 header.set_mode(0o644);
1500 header.set_mtime(1_700_000_000);
1501 header.set_cksum();
1502 builder.append_data(&mut header, *name, *data).unwrap();
1503 }
1504 builder.finish().unwrap();
1505 }
1506 buf
1507 }
1508
/// A `.txt` tar entry is counted as a scanner fallback (no structured hit)
/// and the email address must not survive in the rewritten archive.
#[test]
fn tar_sanitizes_plaintext_with_scanner() {
    let proc = make_archive_processor();
    let input = build_test_tar(&[("readme.txt", b"Contact alice@corp.com for help.")]);

    let mut output = Vec::new();
    let stats = proc.process_tar(&input[..], &mut output).unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.scanner_fallback, 1);
    assert_eq!(stats.structured_hits, 0);

    let mut archive = tar::Archive::new(&output[..]);
    let mut entries_checked = 0usize;
    for entry in archive.entries().unwrap() {
        let mut e = entry.unwrap();
        let mut content = String::new();
        e.read_to_string(&mut content).unwrap();
        assert!(
            !content.contains("alice@corp.com"),
            "email should be sanitized: {content}"
        );
        entries_checked += 1;
    }
    // Without this guard an empty output archive would pass the loop vacuously.
    assert!(
        entries_checked > 0,
        "output archive should contain at least one entry"
    );
}
1533
/// A `.json` tar entry is counted as a structured hit, is recorded with the
/// `structured+scan:json` method, and neither field value may survive in
/// the rewritten archive.
#[test]
fn tar_sanitizes_json_with_structured_processor() {
    let proc = make_archive_processor();
    let json_content = br#"{"email": "bob@example.org", "name": "Bob"}"#;
    let input = build_test_tar(&[("config.json", json_content)]);

    let mut output = Vec::new();
    let stats = proc.process_tar(&input[..], &mut output).unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.structured_hits, 1);
    assert_eq!(stats.scanner_fallback, 0);
    assert_eq!(
        stats.file_methods.get("config.json").unwrap(),
        "structured+scan:json"
    );

    let mut archive = tar::Archive::new(&output[..]);
    let mut entries_checked = 0usize;
    for entry in archive.entries().unwrap() {
        let mut e = entry.unwrap();
        let mut content = String::new();
        e.read_to_string(&mut content).unwrap();
        assert!(
            !content.contains("bob@example.org"),
            "email should be sanitized"
        );
        assert!(!content.contains("Bob"), "name should be sanitized");
        entries_checked += 1;
    }
    // Without this guard an empty output archive would pass the loop vacuously.
    assert!(
        entries_checked > 0,
        "output archive should contain at least one entry"
    );
}
1564
/// The mode and mtime written by `build_test_tar` must be carried over to
/// the entries of the rewritten archive.
#[test]
fn tar_preserves_metadata() {
    let proc = make_archive_processor();
    let input = build_test_tar(&[("data.txt", b"SUPERSECRET token here")]);

    let mut output = Vec::new();
    proc.process_tar(&input[..], &mut output).unwrap();

    let mut archive = tar::Archive::new(&output[..]);
    let mut entries_checked = 0usize;
    for entry in archive.entries().unwrap() {
        let e = entry.unwrap();
        let hdr = e.header();
        assert_eq!(hdr.mode().unwrap(), 0o644);
        assert_eq!(hdr.mtime().unwrap(), 1_700_000_000);
        entries_checked += 1;
    }
    // The header checks above are the only assertions in this test; make
    // sure they actually ran instead of passing on an empty archive.
    assert!(
        entries_checked > 0,
        "output archive should contain at least one entry"
    );
}
1581
/// Three tar entries (`.txt`, `.json`, `.log`) are all processed: the JSON
/// entry as a structured hit and the other two as scanner fallbacks.
#[test]
fn tar_handles_multiple_files() {
    let archive_bytes = build_test_tar(&[
        ("a.txt", b"alice@corp.com"),
        ("b.json", br#"{"key":"value"}"#),
        ("c.log", b"no secrets here"),
    ]);
    let proc = make_archive_processor();

    let mut sanitized = Vec::new();
    let stats = proc
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!(
        (
            stats.files_processed,
            stats.structured_hits,
            stats.scanner_fallback
        ),
        (3, 1, 2)
    );
}
1600
/// A directory entry is counted in `entries_skipped`, while the regular
/// file next to it is counted in `files_processed`.
#[test]
fn tar_passes_through_directories() {
    let mut builder = tar::Builder::new(Vec::new());

    // Directory entry: zero-sized, mode 0o755.
    let mut dir_hdr = tar::Header::new_gnu();
    dir_hdr.set_entry_type(tar::EntryType::Directory);
    dir_hdr.set_size(0);
    dir_hdr.set_mode(0o755);
    dir_hdr.set_cksum();
    builder
        .append_data(&mut dir_hdr, "mydir/", &b""[..])
        .unwrap();

    // Regular file inside that directory.
    let mut file_hdr = tar::Header::new_gnu();
    file_hdr.set_size(5);
    file_hdr.set_mode(0o644);
    file_hdr.set_cksum();
    builder
        .append_data(&mut file_hdr, "mydir/hello.txt", &b"hello"[..])
        .unwrap();

    // `into_inner` finishes the archive before handing the buffer back.
    let archive_bytes = builder.into_inner().unwrap();

    let proc = make_archive_processor();
    let mut sanitized = Vec::new();
    let stats = proc
        .process_tar(&archive_bytes[..], &mut sanitized)
        .unwrap();

    assert_eq!((stats.entries_skipped, stats.files_processed), (1, 1));
}
1636
/// A gzip-compressed tar is processed via `process_tar_gz`; the output must
/// decode as gzip + tar again and must not contain the literal secret.
#[test]
fn tar_gz_round_trip() {
    let proc = make_archive_processor();

    let tar_data = build_test_tar(&[("secret.txt", b"Key is SUPERSECRET okay")]);
    let mut gz_input = Vec::new();
    {
        let mut encoder =
            flate2::write::GzEncoder::new(&mut gz_input, flate2::Compression::fast());
        encoder.write_all(&tar_data).unwrap();
        encoder.finish().unwrap();
    }

    let mut gz_output = Vec::new();
    let stats = proc.process_tar_gz(&gz_input[..], &mut gz_output).unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.scanner_fallback, 1);

    let decoder = flate2::read::GzDecoder::new(&gz_output[..]);
    let mut archive = tar::Archive::new(decoder);
    let mut entries_checked = 0usize;
    for entry in archive.entries().unwrap() {
        let mut e = entry.unwrap();
        let mut content = String::new();
        e.read_to_string(&mut content).unwrap();
        assert!(
            !content.contains("SUPERSECRET"),
            "secret should be sanitized: {content}"
        );
        entries_checked += 1;
    }
    // Without this guard an empty output archive would pass the loop vacuously.
    assert!(
        entries_checked > 0,
        "output archive should contain at least one entry"
    );
}
1672
1673 fn build_test_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
1676 let mut buf = Cursor::new(Vec::new());
1677 {
1678 let mut zip = zip::ZipWriter::new(&mut buf);
1679 for (name, data) in entries {
1680 let options = zip::write::SimpleFileOptions::default()
1681 .compression_method(zip::CompressionMethod::Deflated);
1682 zip.start_file(*name, options).unwrap();
1683 zip.write_all(data).unwrap();
1684 }
1685 zip.finish().unwrap();
1686 }
1687 buf.into_inner()
1688 }
1689
/// A `.txt` zip entry is counted as a scanner fallback and the email
/// address must not survive in the rewritten archive.
#[test]
fn zip_sanitizes_plaintext_with_scanner() {
    let zip_data = build_test_zip(&[("notes.txt", b"Reach alice@corp.com for info.")]);
    let proc = make_archive_processor();

    let mut sanitized = Cursor::new(Vec::new());
    let stats = proc
        .process_zip(Cursor::new(&zip_data), &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.scanner_fallback, 1);

    let mut zip_out = zip::ZipArchive::new(Cursor::new(sanitized.into_inner())).unwrap();
    let mut first = zip_out.by_index(0).unwrap();
    let mut content = String::new();
    first.read_to_string(&mut content).unwrap();
    assert!(
        !content.contains("alice@corp.com"),
        "email should be sanitized: {content}"
    );
}
1713
/// A `.json` zip entry is counted as a structured hit and neither field
/// value may survive in the rewritten archive.
#[test]
fn zip_sanitizes_json_with_structured_processor() {
    let json_content = br#"{"password": "hunter2", "host": "db.internal"}"#;
    let zip_data = build_test_zip(&[("settings.json", json_content)]);
    let proc = make_archive_processor();

    let mut sanitized = Cursor::new(Vec::new());
    let stats = proc
        .process_zip(Cursor::new(&zip_data), &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
    assert_eq!(stats.structured_hits, 1);

    let mut zip_out = zip::ZipArchive::new(Cursor::new(sanitized.into_inner())).unwrap();
    let mut first = zip_out.by_index(0).unwrap();
    let mut content = String::new();
    first.read_to_string(&mut content).unwrap();
    assert!(!content.contains("hunter2"), "password should be sanitized");
    assert!(!content.contains("db.internal"), "host should be sanitized");
}
1735
/// A zip directory entry is counted in `entries_skipped`, while the stored
/// file inside it is counted in `files_processed`.
#[test]
fn zip_preserves_directory_entries() {
    let mut archive = zip::ZipWriter::new(Cursor::new(Vec::new()));

    let dir_options = zip::write::SimpleFileOptions::default();
    archive.add_directory("subdir/", dir_options).unwrap();

    let file_options = zip::write::SimpleFileOptions::default()
        .compression_method(zip::CompressionMethod::Stored);
    archive.start_file("subdir/data.txt", file_options).unwrap();
    archive.write_all(b"SUPERSECRET value").unwrap();

    // `finish` hands back the underlying cursor; unwrap it to the raw bytes.
    let zip_data = archive.finish().unwrap().into_inner();

    let proc = make_archive_processor();
    let mut sanitized = Cursor::new(Vec::new());
    let stats = proc
        .process_zip(Cursor::new(&zip_data), &mut sanitized)
        .unwrap();

    assert_eq!(stats.entries_skipped, 1);
    assert_eq!(stats.files_processed, 1);
}
1762
/// Three zip entries (`.txt`, `.json`, `.log`) are all processed: the JSON
/// entry as a structured hit and the other two as scanner fallbacks.
#[test]
fn zip_handles_multiple_files() {
    let zip_data = build_test_zip(&[
        ("file1.txt", b"alice@corp.com"),
        ("file2.json", br#"{"secret":"SUPERSECRET"}"#),
        ("file3.log", b"nothing to see"),
    ]);
    let proc = make_archive_processor();

    let mut sanitized = Cursor::new(Vec::new());
    let stats = proc
        .process_zip(Cursor::new(&zip_data), &mut sanitized)
        .unwrap();

    assert_eq!(
        (
            stats.files_processed,
            stats.structured_hits,
            stats.scanner_fallback
        ),
        (3, 1, 2)
    );
}
1780
/// The progress callback fires once per tar entry; the final update agrees
/// with the returned stats and reports no `total_entries`.
#[test]
fn tar_progress_callback_receives_updates() {
    let updates = Arc::new(Mutex::new(Vec::new()));
    let sink = Arc::clone(&updates);
    let proc = make_archive_processor().with_progress_callback(Arc::new(move |progress| {
        sink.lock()
            .expect("archive progress lock")
            .push(progress.clone());
    }));
    let input = build_test_tar(&[("a.txt", b"alice@corp.com"), ("b.txt", b"SUPERSECRET")]);

    let mut sanitized = Vec::new();
    let stats = proc.process_tar(&input[..], &mut sanitized).unwrap();

    let recorded = updates.lock().unwrap();
    assert_eq!(recorded.len(), 2);

    let last = recorded.last().unwrap();
    assert_eq!(last.entries_seen, 2);
    assert_eq!(last.files_processed, stats.files_processed);
    assert_eq!(last.total_entries, None);
}
1807
/// The progress callback fires once per zip entry; the final update agrees
/// with the returned stats, reports `total_entries` as `Some(2)` and names
/// the last entry.
#[test]
fn zip_progress_callback_reports_total_entries() {
    let updates = Arc::new(Mutex::new(Vec::new()));
    let sink = Arc::clone(&updates);
    let proc = make_archive_processor().with_progress_callback(Arc::new(move |progress| {
        sink.lock()
            .expect("archive progress lock")
            .push(progress.clone());
    }));
    let zip_data = build_test_zip(&[
        ("file1.txt", b"alice@corp.com"),
        ("file2.log", b"nothing to see"),
    ]);

    let mut sanitized = Cursor::new(Vec::new());
    let stats = proc
        .process_zip(Cursor::new(&zip_data), &mut sanitized)
        .unwrap();

    let recorded = updates.lock().unwrap();
    assert_eq!(recorded.len(), 2);

    let last = recorded.last().unwrap();
    assert_eq!(last.files_processed, stats.files_processed);
    assert_eq!(last.total_entries, Some(2));
    assert_eq!(last.current_entry, "file2.log");
}
1838
/// `ArchiveFormat::from_path` maps the listed file names to the expected
/// format (including an upper-case `.ZIP`) and returns `None` for `.png`.
#[test]
fn format_detection_from_path() {
    let cases = [
        ("data.tar", Some(ArchiveFormat::Tar)),
        ("data.tar.gz", Some(ArchiveFormat::TarGz)),
        ("data.tgz", Some(ArchiveFormat::TarGz)),
        ("data.zip", Some(ArchiveFormat::Zip)),
        ("DATA.ZIP", Some(ArchiveFormat::Zip)),
        ("photo.png", None),
    ];

    for (path, expected) in cases {
        assert_eq!(ArchiveFormat::from_path(path), expected);
    }
}
1865
/// The same email appearing in two different tar entries must be replaced
/// by the same token in both.
#[test]
fn same_secret_gets_same_replacement_across_entries() {
    let proc = make_archive_processor();
    let input = build_test_tar(&[
        ("a.txt", b"contact alice@corp.com"),
        ("b.txt", b"reach alice@corp.com"),
    ]);

    let mut sanitized = Vec::new();
    proc.process_tar(&input[..], &mut sanitized).unwrap();

    // Read every output entry, in archive order, into its own string.
    let mut archive = tar::Archive::new(&sanitized[..]);
    let contents: Vec<String> = archive
        .entries()
        .unwrap()
        .map(|entry| {
            let mut e = entry.unwrap();
            let mut text = String::new();
            e.read_to_string(&mut text).unwrap();
            text
        })
        .collect();

    // What remains after the fixed prefix is the replacement token.
    let replacement_a = contents[0].strip_prefix("contact ").unwrap();
    let replacement_b = contents[1].strip_prefix("reach ").unwrap();
    assert_eq!(
        replacement_a, replacement_b,
        "dedup should produce identical replacements"
    );
    assert!(!replacement_a.contains("alice@corp.com"));
}
1898
/// `process` with `ArchiveFormat::Tar` handles a single-entry tar archive.
#[test]
fn process_auto_dispatch_tar() {
    let tar_data = build_test_tar(&[("f.txt", b"SUPERSECRET")]);
    let proc = make_archive_processor();

    let stats = proc
        .process(
            Cursor::new(tar_data),
            Cursor::new(Vec::new()),
            ArchiveFormat::Tar,
        )
        .unwrap();

    assert_eq!(stats.files_processed, 1);
}
1912
/// `process` with `ArchiveFormat::Zip` handles a single-entry zip archive.
#[test]
fn process_auto_dispatch_zip() {
    let zip_data = build_test_zip(&[("f.txt", b"SUPERSECRET")]);
    let proc = make_archive_processor();

    let mut sanitized = Cursor::new(Vec::new());
    let stats = proc
        .process(Cursor::new(zip_data), &mut sanitized, ArchiveFormat::Zip)
        .unwrap();

    assert_eq!(stats.files_processed, 1);
}
1926
/// An entry-less tar archive is processed without error and reports zero
/// processed files and zero skipped entries.
#[test]
fn tar_empty_archive() {
    let empty_tar = build_test_tar(&[]);
    let proc = make_archive_processor();

    let mut sanitized = Vec::new();
    let stats = proc.process_tar(&empty_tar[..], &mut sanitized).unwrap();

    assert_eq!((stats.files_processed, stats.entries_skipped), (0, 0));
}
1940
/// An entry-less zip archive is processed without error and reports zero
/// processed files.
#[test]
fn zip_empty_archive() {
    let empty_zip = build_test_zip(&[]);
    let proc = make_archive_processor();

    let mut sanitized = Cursor::new(Vec::new());
    let stats = proc
        .process_zip(Cursor::new(empty_zip), &mut sanitized)
        .unwrap();

    assert_eq!(stats.files_processed, 0);
}
1952
/// Already-safe relative names come back unchanged.
#[test]
fn zip_entry_name_clean_passthrough() {
    for name in ["logs/app.log", "config.yaml", "a/b/c.txt"] {
        assert_eq!(sanitize_zip_entry_name(name), name);
    }
}
1961
/// One or more leading `/` characters are removed, making the name relative.
#[test]
fn zip_entry_name_strips_leading_slash() {
    for raw in ["/etc/passwd", "///etc/passwd"] {
        assert_eq!(sanitize_zip_entry_name(raw), "etc/passwd");
    }
}
1967
/// `..` components are dropped wherever they occur; the remaining
/// components keep their order.
#[test]
fn zip_entry_name_strips_dotdot() {
    let cases = [
        ("../etc/passwd", "etc/passwd"),
        ("a/../../etc/passwd", "a/etc/passwd"),
        ("../../root/.ssh/id_rsa", "root/.ssh/id_rsa"),
    ];

    for (raw, expected) in cases {
        assert_eq!(sanitize_zip_entry_name(raw), expected);
    }
}
1980
/// `.` components are dropped, so any number of leading `./` disappears.
#[test]
fn zip_entry_name_strips_leading_dot_slash() {
    for raw in ["./config.yaml", "././config.yaml"] {
        assert_eq!(sanitize_zip_entry_name(raw), "config.yaml");
    }
}
1986
/// Backslashes are treated as path separators before the other rules
/// apply, so a backslash-separated `..` is dropped as well.
#[test]
fn zip_entry_name_backslash_normalised() {
    let cases = [
        ("a\\b\\c.txt", "a/b/c.txt"),
        ("..\\etc\\passwd", "etc/passwd"),
    ];

    for (raw, expected) in cases {
        assert_eq!(sanitize_zip_entry_name(raw), expected);
    }
}
1992
/// When no component survives sanitization the placeholder `_` is returned.
#[test]
fn zip_entry_name_empty_result_replaced() {
    for raw in ["../..", "", "/"] {
        assert_eq!(sanitize_zip_entry_name(raw), "_");
    }
}
1999
/// A leading `/` followed by `..` is fully neutralised.
#[test]
fn zip_entry_name_absolute_dotdot_combo() {
    let sanitized = sanitize_zip_entry_name("/../etc/passwd");
    assert_eq!(sanitized, "etc/passwd");
}
2004}