1use crate::error::{Result, SanitizeError};
54use crate::processor::profile::FileTypeProfile;
55use crate::processor::registry::ProcessorRegistry;
56use crate::scanner::{ScanStats, StreamScanner};
57use crate::store::MappingStore;
58
59use glob::MatchOptions;
60use rayon::prelude::*;
61use std::collections::HashMap;
62use std::io::{self, Read, Seek, Write};
63use std::sync::Arc;
64
65const MAX_STRUCTURED_ENTRY_SIZE: u64 = 256 * 1024 * 1024; const MAX_PARALLEL_ZIP_DATA_SIZE: u64 = 256 * 1024 * 1024; pub const DEFAULT_MAX_ARCHIVE_DEPTH: u32 = 3;
87
88const MAX_ALLOWED_ARCHIVE_DEPTH: u32 = 10;
92
93const PARALLEL_ENTRY_THRESHOLD: usize = 4;
97
98type ZipEntryResult = (usize, Result<(Vec<u8>, ArchiveStats)>);
104
105#[derive(Default, Clone)]
132pub struct ArchiveFilter {
133 only: Vec<CompiledPattern>,
134 exclude: Vec<CompiledPattern>,
135}
136
137#[derive(Clone)]
138enum CompiledPattern {
139 DirPrefix(String),
142 Glob(glob::Pattern),
144}
145
146const GLOB_OPTS: MatchOptions = MatchOptions {
147 case_sensitive: true,
148 require_literal_separator: true,
149 require_literal_leading_dot: false,
150};
151
152impl CompiledPattern {
153 fn compile(raw: &str) -> std::result::Result<Self, String> {
154 if raw.ends_with('/') {
155 Ok(CompiledPattern::DirPrefix(
157 raw.trim_end_matches('/').to_string(),
158 ))
159 } else {
160 glob::Pattern::new(raw)
161 .map(CompiledPattern::Glob)
162 .map_err(|e| format!("invalid glob pattern '{raw}': {e}"))
163 }
164 }
165
166 fn matches(&self, path: &str) -> bool {
167 match self {
168 CompiledPattern::DirPrefix(prefix) => {
169 path == prefix || path.starts_with(&format!("{prefix}/"))
170 }
171 CompiledPattern::Glob(pat) => pat.matches_with(path, GLOB_OPTS),
172 }
173 }
174}
175
176impl ArchiveFilter {
177 pub fn new(
181 only: Vec<String>,
182 exclude: Vec<String>,
183 ) -> std::result::Result<Self, String> {
184 let only = only
185 .iter()
186 .map(|p| CompiledPattern::compile(p))
187 .collect::<std::result::Result<Vec<_>, _>>()?;
188 let exclude = exclude
189 .iter()
190 .map(|p| CompiledPattern::compile(p))
191 .collect::<std::result::Result<Vec<_>, _>>()?;
192 Ok(Self { only, exclude })
193 }
194
195 pub fn is_empty(&self) -> bool {
197 self.only.is_empty() && self.exclude.is_empty()
198 }
199
200 pub fn passes(&self, path: &str) -> bool {
204 if !self.only.is_empty() && !self.only.iter().any(|p| p.matches(path)) {
205 return false;
206 }
207 if self.exclude.iter().any(|p| p.matches(path)) {
208 return false;
209 }
210 true
211 }
212}
213
214#[derive(Debug, Clone, Copy, PartialEq, Eq)]
218pub enum ArchiveFormat {
219 Zip,
221 Tar,
223 TarGz,
225}
226
227impl ArchiveFormat {
228 pub fn from_path(path: &str) -> Option<Self> {
232 let lower = path.to_ascii_lowercase();
233 if lower.ends_with(".tar.gz")
234 || std::path::Path::new(&lower)
235 .extension()
236 .is_some_and(|ext| ext.eq_ignore_ascii_case("tgz"))
237 {
238 Some(Self::TarGz)
239 } else if std::path::Path::new(&lower)
240 .extension()
241 .is_some_and(|ext| ext.eq_ignore_ascii_case("tar"))
242 {
243 Some(Self::Tar)
244 } else if std::path::Path::new(&lower)
245 .extension()
246 .is_some_and(|ext| ext.eq_ignore_ascii_case("zip"))
247 {
248 Some(Self::Zip)
249 } else {
250 None
251 }
252 }
253}
254
255#[derive(Debug, Clone, Default)]
261pub struct ArchiveStats {
262 pub files_processed: u64,
264 pub entries_skipped: u64,
266 pub structured_hits: u64,
268 pub scanner_fallback: u64,
270 pub nested_archives: u64,
273 pub total_input_bytes: u64,
275 pub total_output_bytes: u64,
277 pub file_methods: HashMap<String, String>,
280 pub file_scan_stats: HashMap<String, ScanStats>,
282 pub entries_filtered: u64,
284}
285
286#[derive(Debug, Clone, Eq, PartialEq)]
288pub struct ArchiveProgress {
289 pub entries_seen: u64,
291 pub files_processed: u64,
293 pub entries_skipped: u64,
295 pub total_entries: Option<u64>,
297 pub current_entry: String,
299}
300
301type ArchiveProgressCallback = Arc<dyn Fn(&ArchiveProgress) + Send + Sync>;
302
303impl ArchiveStats {
304 fn merge(&mut self, child: &ArchiveStats) {
306 self.files_processed += child.files_processed;
307 self.entries_skipped += child.entries_skipped;
308 self.structured_hits += child.structured_hits;
309 self.scanner_fallback += child.scanner_fallback;
310 self.nested_archives += child.nested_archives;
311 self.total_input_bytes += child.total_input_bytes;
312 self.total_output_bytes += child.total_output_bytes;
313 self.entries_filtered += child.entries_filtered;
314 self.file_methods
315 .extend(child.file_methods.iter().map(|(k, v)| (k.clone(), v.clone())));
316 self.file_scan_stats
317 .extend(child.file_scan_stats.iter().map(|(k, v)| (k.clone(), v.clone())));
318 }
319}
320
321pub struct ArchiveProcessor {
352 registry: Arc<ProcessorRegistry>,
354 scanner: Arc<StreamScanner>,
356 store: Arc<MappingStore>,
358 profiles: Vec<FileTypeProfile>,
360 max_depth: u32,
362 progress_callback: Option<ArchiveProgressCallback>,
364 parallel_threshold: usize,
367 filter: ArchiveFilter,
370 force_text: bool,
374}
375
376impl ArchiveProcessor {
377 pub fn new(
386 registry: Arc<ProcessorRegistry>,
387 scanner: Arc<StreamScanner>,
388 store: Arc<MappingStore>,
389 profiles: Vec<FileTypeProfile>,
390 ) -> Self {
391 Self {
392 registry,
393 scanner,
394 store,
395 profiles,
396 max_depth: DEFAULT_MAX_ARCHIVE_DEPTH,
397 progress_callback: None,
398 parallel_threshold: PARALLEL_ENTRY_THRESHOLD,
399 filter: ArchiveFilter::default(),
400 force_text: false,
401 }
402 }
403
404 #[must_use]
410 pub fn with_max_depth(mut self, depth: u32) -> Self {
411 self.max_depth = depth.min(MAX_ALLOWED_ARCHIVE_DEPTH);
412 self
413 }
414
415 #[must_use]
420 pub fn with_parallel_threshold(mut self, threshold: usize) -> Self {
421 self.parallel_threshold = threshold;
422 self
423 }
424
425 #[must_use]
427 pub fn with_progress_callback(mut self, callback: ArchiveProgressCallback) -> Self {
428 self.progress_callback = Some(callback);
429 self
430 }
431
432 #[must_use]
438 pub fn with_filter(mut self, filter: ArchiveFilter) -> Self {
439 self.filter = filter;
440 self
441 }
442
443 #[must_use]
450 pub fn with_force_text(mut self, force_text: bool) -> Self {
451 self.force_text = force_text;
452 self
453 }
454
455 fn find_profile(&self, filename: &str) -> Option<&FileTypeProfile> {
457 self.profiles.iter().find(|p| p.matches_filename(filename))
458 }
459
460 fn emit_progress(&self, stats: &ArchiveStats, total_entries: Option<u64>, current_entry: &str) {
461 if let Some(callback) = &self.progress_callback {
462 callback(&ArchiveProgress {
463 entries_seen: stats.files_processed + stats.entries_skipped,
464 files_processed: stats.files_processed,
465 entries_skipped: stats.entries_skipped,
466 total_entries,
467 current_entry: current_entry.to_string(),
468 });
469 }
470 }
471
472 fn sanitize_entry_bytes(
479 &self,
480 filename: &str,
481 data: &[u8],
482 entry_size_hint: Option<u64>,
483 depth: u32,
484 ) -> Result<(Vec<u8>, ArchiveStats)> {
485 let mut out: Vec<u8> = Vec::with_capacity(data.len());
486 let mut entry_stats = ArchiveStats::default();
487 let mut reader = io::Cursor::new(data);
488 self.sanitize_entry(
489 filename,
490 &mut reader,
491 &mut out,
492 &mut entry_stats,
493 entry_size_hint,
494 depth,
495 )?;
496 Ok((out, entry_stats))
497 }
498
499 #[allow(clippy::missing_errors_doc)] fn sanitize_entry(
511 &self,
512 filename: &str,
513 reader: &mut dyn Read,
514 writer: &mut dyn Write,
515 stats: &mut ArchiveStats,
516 entry_size_hint: Option<u64>,
517 depth: u32,
518 ) -> Result<()> {
519 if let Some(nested_fmt) = ArchiveFormat::from_path(filename) {
521 return self.sanitize_nested_archive(
522 filename,
523 reader,
524 writer,
525 stats,
526 entry_size_hint,
527 nested_fmt,
528 depth,
529 );
530 }
531
532 let within_size_cap = entry_size_hint.map_or(true, |sz| sz <= MAX_STRUCTURED_ENTRY_SIZE); if !self.force_text && within_size_cap {
540 if let Some(profile) = self.find_profile(filename) {
541 let mut content = Vec::new();
543 reader.read_to_end(&mut content).map_err(|e| {
544 SanitizeError::ArchiveError(format!("read entry '{filename}': {e}"))
545 })?;
546
547 stats.total_input_bytes += content.len() as u64;
548
549 match self.registry.process(&content, profile, &self.store) {
553 Ok(Some(structured_out)) => {
554 let (output, scan_stats) = self.scanner.scan_bytes(&structured_out)?;
557 stats.structured_hits += 1;
558 stats.total_output_bytes += output.len() as u64;
559 stats.file_methods.insert(
560 filename.to_string(),
561 format!("structured+scan:{}", profile.processor),
562 );
563 stats
564 .file_scan_stats
565 .insert(filename.to_string(), scan_stats);
566 writer.write_all(&output).map_err(|e| {
567 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
568 })?;
569 return Ok(());
570 }
571 Ok(None) => {} Err(_) => {} }
574
575 let (output, scan_stats) = self.scanner.scan_bytes(&content)?;
578 stats.scanner_fallback += 1;
579 stats.total_output_bytes += output.len() as u64;
580 stats
581 .file_methods
582 .insert(filename.to_string(), "scanner".to_string());
583 stats
584 .file_scan_stats
585 .insert(filename.to_string(), scan_stats);
586 writer.write_all(&output).map_err(|e| {
587 SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
588 })?;
589 return Ok(());
590 }
591 }
592
593 let mut counting_r = CountingReader::new(reader);
598 let mut counting_w = CountingWriter::new(writer);
599 let scan_stats = self.scanner.scan_reader(&mut counting_r, &mut counting_w)?;
600
601 stats.scanner_fallback += 1;
602 stats.total_input_bytes += counting_r.bytes_read();
603 stats.total_output_bytes += counting_w.bytes_written();
604 stats
605 .file_methods
606 .insert(filename.to_string(), "scanner".to_string());
607 stats
608 .file_scan_stats
609 .insert(filename.to_string(), scan_stats);
610
611 Ok(())
612 }
613
614 #[allow(clippy::too_many_arguments)]
617 fn sanitize_nested_archive(
618 &self,
619 filename: &str,
620 reader: &mut dyn Read,
621 writer: &mut dyn Write,
622 stats: &mut ArchiveStats,
623 entry_size_hint: Option<u64>,
624 nested_fmt: ArchiveFormat,
625 depth: u32,
626 ) -> Result<()> {
627 if depth >= self.max_depth {
628 return Err(SanitizeError::RecursionDepthExceeded(format!(
629 "nested archive '{}' at depth {} exceeds maximum nesting depth of {}",
630 filename, depth, self.max_depth,
631 )));
632 }
633
634 if let Some(sz) = entry_size_hint {
636 if sz > MAX_STRUCTURED_ENTRY_SIZE {
637 return Err(SanitizeError::ArchiveError(format!(
638 "nested archive '{}' is too large ({} bytes, limit {} bytes)",
639 filename, sz, MAX_STRUCTURED_ENTRY_SIZE,
640 )));
641 }
642 }
643
644 let mut content = Vec::new();
645 reader.read_to_end(&mut content).map_err(|e| {
646 SanitizeError::ArchiveError(format!("read nested archive '{filename}': {e}"))
647 })?;
648 stats.total_input_bytes += content.len() as u64;
649
650 let mut output_buf: Vec<u8> = Vec::new();
652 let child_stats = match nested_fmt {
653 ArchiveFormat::Tar => {
654 self.process_tar_at_depth(&content[..], &mut output_buf, depth + 1)?
655 }
656 ArchiveFormat::TarGz => {
657 self.process_tar_gz_at_depth(&content[..], &mut output_buf, depth + 1)?
658 }
659 ArchiveFormat::Zip => {
660 let reader = io::Cursor::new(&content);
661 let mut writer = io::Cursor::new(Vec::new());
662 let s = self.process_zip_at_depth(reader, &mut writer, depth + 1)?;
663 output_buf = writer.into_inner();
664 s
665 }
666 };
667
668 stats.nested_archives += 1;
669 stats.merge(&child_stats);
670 stats.total_output_bytes += output_buf.len() as u64;
671 let fmt_name = match nested_fmt {
672 ArchiveFormat::Tar => "tar",
673 ArchiveFormat::TarGz => "tar.gz",
674 ArchiveFormat::Zip => "zip",
675 };
676 stats
677 .file_methods
678 .insert(filename.to_string(), format!("nested:{fmt_name}"));
679 writer.write_all(&output_buf).map_err(|e| {
680 SanitizeError::ArchiveError(format!("write nested archive '{filename}': {e}"))
681 })?;
682 Ok(())
683 }
684
685 pub fn discover_profiles_tar<R: Read>(&self, reader: R) -> Result<()> {
700 if self.profiles.is_empty() {
701 return Ok(());
702 }
703 let mut archive = tar::Archive::new(reader);
704 let entries = archive
705 .entries()
706 .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entries: {e}")))?;
707 for entry_result in entries {
708 let mut entry = entry_result
709 .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entry: {e}")))?;
710 if !entry.header().entry_type().is_file() {
711 continue;
712 }
713 let path = entry
714 .path()
715 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
716 .to_string_lossy()
717 .to_string();
718 let Some(profile) = self.find_profile(&path) else {
719 continue;
720 };
721 let mut content = Vec::new();
722 entry
723 .read_to_end(&mut content)
724 .map_err(|e| SanitizeError::ArchiveError(format!("read '{path}': {e}")))?;
725 let _ = self.registry.process(&content, profile, &self.store);
726 }
727 Ok(())
728 }
729
730 pub fn discover_profiles_tar_gz<R: Read>(&self, reader: R) -> Result<()> {
734 let gz = flate2::read::GzDecoder::new(reader);
735 self.discover_profiles_tar(gz)
736 }
737
738 pub fn discover_profiles_zip<R: Read + Seek>(&self, reader: R) -> Result<()> {
742 if self.profiles.is_empty() {
743 return Ok(());
744 }
745 let mut zip = zip::ZipArchive::new(reader)
746 .map_err(|e| SanitizeError::ArchiveError(format!("open zip for discovery: {e}")))?;
747 for i in 0..zip.len() {
748 let mut entry = zip
749 .by_index(i)
750 .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {i}: {e}")))?;
751 if entry.is_dir() {
752 continue;
753 }
754 let name = entry.name().to_string();
755 let Some(profile) = self.find_profile(&name) else {
756 continue;
757 };
758 let mut content = Vec::new();
759 entry
760 .read_to_end(&mut content)
761 .map_err(|e| SanitizeError::ArchiveError(format!("read '{name}': {e}")))?;
762 let _ = self.registry.process(&content, profile, &self.store);
763 }
764 Ok(())
765 }
766
767 pub fn process_tar<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
781 self.process_tar_at_depth(reader, writer, 0)
782 }
783
784 fn process_tar_at_depth<R: Read, W: Write>(
792 &self,
793 reader: R,
794 writer: W,
795 depth: u32,
796 ) -> Result<ArchiveStats> {
797 let mut stats = ArchiveStats::default();
798 let mut archive = tar::Archive::new(reader);
799 let mut builder = tar::Builder::new(writer);
800
801 let entries_iter = archive
802 .entries()
803 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entries: {}", e)))?;
804
805 for entry_result in entries_iter {
806 let mut entry = entry_result
807 .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {}", e)))?;
808
809 let header = entry.header().clone();
810 let path = entry
811 .path()
812 .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {}", e)))?
813 .to_string_lossy()
814 .to_string();
815 let is_file = header.entry_type().is_file();
816
817 if !is_file {
818 let mut data = Vec::new();
820 entry.read_to_end(&mut data).map_err(|e| {
821 SanitizeError::ArchiveError(format!("read tar entry '{}': {}", path, e))
822 })?;
823 drop(entry);
824 builder.append(&header, &*data).map_err(|e| {
825 SanitizeError::ArchiveError(format!(
826 "append non-file entry '{}': {}",
827 path, e
828 ))
829 })?;
830 stats.entries_skipped += 1;
831 self.emit_progress(&stats, None, &path);
832 continue;
833 }
834
835 if !self.filter.passes(&path) {
843 stats.entries_filtered += 1;
844 continue;
845 }
846
847 let size_hint = header.size().ok();
848 let mut sanitized_buf: Vec<u8> = Vec::new();
849 let mut entry_stats = ArchiveStats::default();
850 self.sanitize_entry(
851 &path,
852 &mut entry,
853 &mut sanitized_buf,
854 &mut entry_stats,
855 size_hint,
856 depth,
857 )?;
858 drop(entry);
859
860 let mut new_header = header.clone();
861 new_header.set_size(sanitized_buf.len() as u64);
862 new_header.set_cksum();
863
864 builder.append(&new_header, &*sanitized_buf).map_err(|e| {
865 SanitizeError::ArchiveError(format!("append entry '{}': {}", path, e))
866 })?;
867 drop(sanitized_buf);
868
869 stats.merge(&entry_stats);
870 stats.files_processed += 1;
871 self.emit_progress(&stats, None, &path);
872 }
873
874 builder
875 .finish()
876 .map_err(|e| SanitizeError::ArchiveError(format!("finalize tar: {}", e)))?;
877
878 Ok(stats)
879 }
880
881 pub fn process_tar_gz<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
891 self.process_tar_gz_at_depth(reader, writer, 0)
892 }
893
894 fn process_tar_gz_at_depth<R: Read, W: Write>(
896 &self,
897 reader: R,
898 writer: W,
899 depth: u32,
900 ) -> Result<ArchiveStats> {
901 let gz_reader = flate2::read::GzDecoder::new(reader);
902 let gz_writer = flate2::write::GzEncoder::new(writer, flate2::Compression::fast());
903
904 let stats = self.process_tar_at_depth(gz_reader, gz_writer, depth)?;
905 Ok(stats)
911 }
912
913 pub fn process_zip<R: Read + Seek, W: Write + Seek>(
929 &self,
930 reader: R,
931 writer: W,
932 ) -> Result<ArchiveStats> {
933 self.process_zip_at_depth(reader, writer, 0)
934 }
935
936 #[allow(clippy::too_many_lines)]
947 fn process_zip_at_depth<R: Read + Seek, W: Write + Seek>(
948 &self,
949 reader: R,
950 writer: W,
951 depth: u32,
952 ) -> Result<ArchiveStats> {
953 struct ZipMeta {
957 name: String,
958 is_dir: bool,
959 compression: zip::CompressionMethod,
960 last_modified: zip::DateTime,
961 unix_mode: Option<u32>,
962 size: u64,
963 }
964
965 let mut zip_in = zip::ZipArchive::new(reader)
966 .map_err(|e| SanitizeError::ArchiveError(format!("open zip: {}", e)))?;
967 let total_entries = zip_in.len();
968 let total_entries_hint = Some(total_entries as u64);
969
970 let mut metas: Vec<ZipMeta> = Vec::with_capacity(total_entries);
971 let mut file_count = 0usize;
972 let mut total_uncompressed_size: u64 = 0;
973
974 for i in 0..total_entries {
975 let entry = zip_in
976 .by_index(i)
977 .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
978 let is_dir = entry.is_dir();
979 let size = entry.size();
980 if !is_dir {
981 file_count += 1;
982 total_uncompressed_size = total_uncompressed_size.saturating_add(size);
983 }
984 metas.push(ZipMeta {
985 name: entry.name().to_string(),
986 is_dir,
987 compression: entry.compression(),
988 last_modified: entry.last_modified(),
989 unix_mode: entry.unix_mode(),
990 size,
991 });
992 }
994
995 let use_parallel = file_count >= self.parallel_threshold
1000 && rayon::current_thread_index().is_none()
1001 && total_uncompressed_size <= MAX_PARALLEL_ZIP_DATA_SIZE;
1002
1003 let mut stats = ArchiveStats::default();
1004
1005 let make_options = |m: &ZipMeta| {
1007 let opts = zip::write::FileOptions::default()
1008 .compression_method(m.compression)
1009 .last_modified_time(m.last_modified);
1010 if let Some(mode) = m.unix_mode {
1011 opts.unix_permissions(mode)
1012 } else {
1013 opts
1014 }
1015 };
1016
1017 if use_parallel {
1018 struct ZipEntry {
1020 meta_idx: usize,
1021 data: Vec<u8>,
1022 }
1023
1024 let mut file_entries: Vec<ZipEntry> = Vec::with_capacity(file_count);
1025
1026 for (i, meta) in metas.iter().enumerate() {
1027 if meta.is_dir {
1028 continue;
1029 }
1030 if !self.filter.passes(&meta.name) {
1032 continue;
1033 }
1034 let mut entry = zip_in.by_index(i).map_err(|e| {
1035 SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e))
1036 })?;
1037 let mut data = Vec::new();
1038 entry.read_to_end(&mut data).map_err(|e| {
1039 SanitizeError::ArchiveError(format!("read zip entry '{}': {}", meta.name, e))
1040 })?;
1041 file_entries.push(ZipEntry { meta_idx: i, data });
1042 }
1043
1044 let results: Vec<ZipEntryResult> = file_entries
1045 .into_par_iter()
1046 .map(|e| {
1047 let meta = &metas[e.meta_idx];
1048 let result =
1049 self.sanitize_entry_bytes(&meta.name, &e.data, Some(meta.size), depth);
1050 (e.meta_idx, result)
1051 })
1052 .collect();
1053
1054 let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; metas.len()];
1057 for (meta_idx, r) in results {
1058 sanitized[meta_idx] = Some(r?);
1059 }
1060
1061 let mut zip_out = zip::ZipWriter::new(writer);
1062 for (i, meta) in metas.iter().enumerate() {
1063 let options = make_options(meta);
1064 if meta.is_dir {
1065 zip_out.add_directory(&meta.name, options).map_err(|e| {
1066 SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1067 })?;
1068 stats.entries_skipped += 1;
1069 self.emit_progress(&stats, total_entries_hint, &meta.name);
1070 continue;
1071 }
1072 if !self.filter.passes(&meta.name) {
1074 stats.entries_filtered += 1;
1075 self.emit_progress(&stats, total_entries_hint, &meta.name);
1076 continue;
1077 }
1078 let (sanitized_buf, entry_stats) = sanitized[i]
1079 .take()
1080 .expect("file entry sanitization result missing");
1081 stats.merge(&entry_stats);
1082 zip_out.start_file(&meta.name, options).map_err(|e| {
1083 SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1084 })?;
1085 zip_out.write_all(&sanitized_buf).map_err(|e| {
1086 SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1087 })?;
1088 stats.files_processed += 1;
1089 self.emit_progress(&stats, total_entries_hint, &meta.name);
1090 }
1091 zip_out
1092 .finish()
1093 .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1094 } else {
1095 let mut zip_out = zip::ZipWriter::new(writer);
1098 for (i, meta) in metas.iter().enumerate() {
1099 let options = make_options(meta);
1100 if meta.is_dir {
1101 zip_out.add_directory(&meta.name, options).map_err(|e| {
1102 SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1103 })?;
1104 stats.entries_skipped += 1;
1105 self.emit_progress(&stats, total_entries_hint, &meta.name);
1106 continue;
1107 }
1108
1109 if !self.filter.passes(&meta.name) {
1111 stats.entries_filtered += 1;
1112 self.emit_progress(&stats, total_entries_hint, &meta.name);
1113 continue;
1114 }
1115
1116 let data = {
1117 let mut entry = zip_in.by_index(i).map_err(|e| {
1118 SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e))
1119 })?;
1120 let mut buf = Vec::new();
1121 entry.read_to_end(&mut buf).map_err(|e| {
1122 SanitizeError::ArchiveError(format!(
1123 "read zip entry '{}': {}",
1124 meta.name, e
1125 ))
1126 })?;
1127 buf
1128 };
1130
1131 let (sanitized_buf, entry_stats) =
1132 self.sanitize_entry_bytes(&meta.name, &data, Some(meta.size), depth)?;
1133 drop(data);
1134
1135 zip_out.start_file(&meta.name, options).map_err(|e| {
1136 SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1137 })?;
1138 zip_out.write_all(&sanitized_buf).map_err(|e| {
1139 SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1140 })?;
1141 drop(sanitized_buf);
1142
1143 stats.merge(&entry_stats);
1144 stats.files_processed += 1;
1145 self.emit_progress(&stats, total_entries_hint, &meta.name);
1146 }
1147 zip_out
1148 .finish()
1149 .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1150 }
1151
1152 Ok(stats)
1153 }
1154
1155 pub fn process<R: Read + Seek, W: Write + Seek>(
1171 &self,
1172 reader: R,
1173 writer: W,
1174 format: ArchiveFormat,
1175 ) -> Result<ArchiveStats> {
1176 match format {
1177 ArchiveFormat::Zip => self.process_zip(reader, writer),
1178 ArchiveFormat::Tar => self.process_tar(reader, writer),
1179 ArchiveFormat::TarGz => self.process_tar_gz(reader, writer),
1180 }
1181 }
1182}
1183
1184struct CountingReader<'a> {
1190 inner: &'a mut dyn Read,
1191 count: u64,
1192}
1193
1194impl<'a> CountingReader<'a> {
1195 fn new(inner: &'a mut dyn Read) -> Self {
1196 Self { inner, count: 0 }
1197 }
1198
1199 fn bytes_read(&self) -> u64 {
1200 self.count
1201 }
1202}
1203
1204impl Read for CountingReader<'_> {
1205 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
1206 let n = self.inner.read(buf)?;
1207 self.count += n as u64;
1208 Ok(n)
1209 }
1210}
1211
1212struct CountingWriter<'a> {
1214 inner: &'a mut dyn Write,
1215 count: u64,
1216}
1217
1218impl<'a> CountingWriter<'a> {
1219 fn new(inner: &'a mut dyn Write) -> Self {
1220 Self { inner, count: 0 }
1221 }
1222
1223 fn bytes_written(&self) -> u64 {
1224 self.count
1225 }
1226}
1227
1228impl Write for CountingWriter<'_> {
1229 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
1230 let n = self.inner.write(buf)?;
1231 self.count += n as u64;
1232 Ok(n)
1233 }
1234
1235 fn flush(&mut self) -> io::Result<()> {
1236 self.inner.flush()
1237 }
1238}
1239
1240#[cfg(test)]
1245mod tests {
1246 use super::*;
1247 use crate::category::Category;
1248 use crate::generator::HmacGenerator;
1249 use crate::processor::profile::{FieldRule, FileTypeProfile};
1250 use crate::processor::registry::ProcessorRegistry;
1251 use crate::scanner::{ScanConfig, ScanPattern};
1252 use std::io::Cursor;
1253 use std::sync::Mutex;
1254
1255 fn make_archive_processor() -> ArchiveProcessor {
1257 let gen = Arc::new(HmacGenerator::new([42u8; 32]));
1258 let store = Arc::new(MappingStore::new(gen, None));
1259
1260 let patterns = vec![
1261 ScanPattern::from_regex(
1262 r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
1263 Category::Email,
1264 "email",
1265 )
1266 .unwrap(),
1267 ScanPattern::from_literal("SUPERSECRET", Category::Custom("api_key".into()), "api_key")
1268 .unwrap(),
1269 ];
1270
1271 let scanner = Arc::new(
1272 StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
1273 );
1274
1275 let registry = Arc::new(ProcessorRegistry::with_builtins());
1276
1277 let profiles = vec![FileTypeProfile::new(
1278 "json",
1279 vec![FieldRule::new("*").with_category(Category::Custom("field".into()))],
1280 )
1281 .with_extension(".json")];
1282
1283 ArchiveProcessor::new(registry, scanner, store, profiles)
1284 }
1285
1286 fn build_test_tar(entries: &[(&str, &[u8])]) -> Vec<u8> {
1289 let mut buf = Vec::new();
1290 {
1291 let mut builder = tar::Builder::new(&mut buf);
1292 for (name, data) in entries {
1293 let mut header = tar::Header::new_gnu();
1294 header.set_size(data.len() as u64);
1295 header.set_mode(0o644);
1296 header.set_mtime(1_700_000_000);
1297 header.set_cksum();
1298 builder.append_data(&mut header, *name, *data).unwrap();
1299 }
1300 builder.finish().unwrap();
1301 }
1302 buf
1303 }
1304
1305 #[test]
1306 fn tar_sanitizes_plaintext_with_scanner() {
1307 let proc = make_archive_processor();
1308 let input = build_test_tar(&[("readme.txt", b"Contact alice@corp.com for help.")]);
1309
1310 let mut output = Vec::new();
1311 let stats = proc.process_tar(&input[..], &mut output).unwrap();
1312
1313 assert_eq!(stats.files_processed, 1);
1314 assert_eq!(stats.scanner_fallback, 1);
1315 assert_eq!(stats.structured_hits, 0);
1316
1317 let mut archive = tar::Archive::new(&output[..]);
1319 for entry in archive.entries().unwrap() {
1320 let mut e = entry.unwrap();
1321 let mut content = String::new();
1322 e.read_to_string(&mut content).unwrap();
1323 assert!(
1324 !content.contains("alice@corp.com"),
1325 "email should be sanitized: {content}"
1326 );
1327 }
1328 }
1329
1330 #[test]
1331 fn tar_sanitizes_json_with_structured_processor() {
1332 let proc = make_archive_processor();
1333 let json_content = br#"{"email": "bob@example.org", "name": "Bob"}"#;
1334 let input = build_test_tar(&[("config.json", json_content)]);
1335
1336 let mut output = Vec::new();
1337 let stats = proc.process_tar(&input[..], &mut output).unwrap();
1338
1339 assert_eq!(stats.files_processed, 1);
1340 assert_eq!(stats.structured_hits, 1);
1341 assert_eq!(stats.scanner_fallback, 0);
1342 assert_eq!(
1343 stats.file_methods.get("config.json").unwrap(),
1344 "structured+scan:json"
1345 );
1346
1347 let mut archive = tar::Archive::new(&output[..]);
1349 for entry in archive.entries().unwrap() {
1350 let mut e = entry.unwrap();
1351 let mut content = String::new();
1352 e.read_to_string(&mut content).unwrap();
1353 assert!(
1354 !content.contains("bob@example.org"),
1355 "email should be sanitized"
1356 );
1357 assert!(!content.contains("Bob"), "name should be sanitized");
1358 }
1359 }
1360
1361 #[test]
1362 fn tar_preserves_metadata() {
1363 let proc = make_archive_processor();
1364 let input = build_test_tar(&[("data.txt", b"SUPERSECRET token here")]);
1365
1366 let mut output = Vec::new();
1367 proc.process_tar(&input[..], &mut output).unwrap();
1368
1369 let mut archive = tar::Archive::new(&output[..]);
1370 for entry in archive.entries().unwrap() {
1371 let e = entry.unwrap();
1372 let hdr = e.header();
1373 assert_eq!(hdr.mode().unwrap(), 0o644);
1374 assert_eq!(hdr.mtime().unwrap(), 1_700_000_000);
1375 }
1376 }
1377
1378 #[test]
1379 fn tar_handles_multiple_files() {
1380 let proc = make_archive_processor();
1381 let input = build_test_tar(&[
1382 ("a.txt", b"alice@corp.com"),
1383 ("b.json", br#"{"key":"value"}"#),
1384 ("c.log", b"no secrets here"),
1385 ]);
1386
1387 let mut output = Vec::new();
1388 let stats = proc.process_tar(&input[..], &mut output).unwrap();
1389
1390 assert_eq!(stats.files_processed, 3);
1391 assert_eq!(stats.structured_hits, 1);
1393 assert_eq!(stats.scanner_fallback, 2);
1395 }
1396
1397 #[test]
1398 fn tar_passes_through_directories() {
1399 let mut buf = Vec::new();
1400 {
1401 let mut builder = tar::Builder::new(&mut buf);
1402
1403 let mut dir_header = tar::Header::new_gnu();
1405 dir_header.set_entry_type(tar::EntryType::Directory);
1406 dir_header.set_size(0);
1407 dir_header.set_mode(0o755);
1408 dir_header.set_cksum();
1409 builder
1410 .append_data(&mut dir_header, "mydir/", &b""[..])
1411 .unwrap();
1412
1413 let mut file_header = tar::Header::new_gnu();
1415 file_header.set_size(5);
1416 file_header.set_mode(0o644);
1417 file_header.set_cksum();
1418 builder
1419 .append_data(&mut file_header, "mydir/hello.txt", &b"hello"[..])
1420 .unwrap();
1421
1422 builder.finish().unwrap();
1423 }
1424
1425 let proc = make_archive_processor();
1426 let mut output = Vec::new();
1427 let stats = proc.process_tar(&buf[..], &mut output).unwrap();
1428
1429 assert_eq!(stats.entries_skipped, 1);
1430 assert_eq!(stats.files_processed, 1);
1431 }
1432
1433 #[test]
1436 fn tar_gz_round_trip() {
1437 let proc = make_archive_processor();
1438
1439 let tar_data = build_test_tar(&[("secret.txt", b"Key is SUPERSECRET okay")]);
1441 let mut gz_input = Vec::new();
1442 {
1443 let mut encoder =
1444 flate2::write::GzEncoder::new(&mut gz_input, flate2::Compression::fast());
1445 encoder.write_all(&tar_data).unwrap();
1446 encoder.finish().unwrap();
1447 }
1448
1449 let mut gz_output = Vec::new();
1450 let stats = proc.process_tar_gz(&gz_input[..], &mut gz_output).unwrap();
1451
1452 assert_eq!(stats.files_processed, 1);
1453 assert_eq!(stats.scanner_fallback, 1);
1454
1455 let decoder = flate2::read::GzDecoder::new(&gz_output[..]);
1457 let mut archive = tar::Archive::new(decoder);
1458 for entry in archive.entries().unwrap() {
1459 let mut e = entry.unwrap();
1460 let mut content = String::new();
1461 e.read_to_string(&mut content).unwrap();
1462 assert!(
1463 !content.contains("SUPERSECRET"),
1464 "secret should be sanitized: {content}"
1465 );
1466 }
1467 }
1468
1469 fn build_test_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
1472 let mut buf = Cursor::new(Vec::new());
1473 {
1474 let mut zip = zip::ZipWriter::new(&mut buf);
1475 for (name, data) in entries {
1476 let options = zip::write::FileOptions::default()
1477 .compression_method(zip::CompressionMethod::Deflated);
1478 zip.start_file(*name, options).unwrap();
1479 zip.write_all(data).unwrap();
1480 }
1481 zip.finish().unwrap();
1482 }
1483 buf.into_inner()
1484 }
1485
1486 #[test]
1487 fn zip_sanitizes_plaintext_with_scanner() {
1488 let proc = make_archive_processor();
1489 let zip_data = build_test_zip(&[("notes.txt", b"Reach alice@corp.com for info.")]);
1490
1491 let reader = Cursor::new(&zip_data);
1492 let mut writer = Cursor::new(Vec::new());
1493 let stats = proc.process_zip(reader, &mut writer).unwrap();
1494
1495 assert_eq!(stats.files_processed, 1);
1496 assert_eq!(stats.scanner_fallback, 1);
1497
1498 let out_data = writer.into_inner();
1500 let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1501 let mut entry = zip_out.by_index(0).unwrap();
1502 let mut content = String::new();
1503 entry.read_to_string(&mut content).unwrap();
1504 assert!(
1505 !content.contains("alice@corp.com"),
1506 "email should be sanitized: {content}"
1507 );
1508 }
1509
1510 #[test]
1511 fn zip_sanitizes_json_with_structured_processor() {
1512 let proc = make_archive_processor();
1513 let json_content = br#"{"password": "hunter2", "host": "db.internal"}"#;
1514 let zip_data = build_test_zip(&[("settings.json", json_content)]);
1515
1516 let reader = Cursor::new(&zip_data);
1517 let mut writer = Cursor::new(Vec::new());
1518 let stats = proc.process_zip(reader, &mut writer).unwrap();
1519
1520 assert_eq!(stats.files_processed, 1);
1521 assert_eq!(stats.structured_hits, 1);
1522
1523 let out_data = writer.into_inner();
1524 let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1525 let mut entry = zip_out.by_index(0).unwrap();
1526 let mut content = String::new();
1527 entry.read_to_string(&mut content).unwrap();
1528 assert!(!content.contains("hunter2"), "password should be sanitized");
1529 assert!(!content.contains("db.internal"), "host should be sanitized");
1530 }
1531
1532 #[test]
1533 fn zip_preserves_directory_entries() {
1534 let mut buf = Cursor::new(Vec::new());
1535 {
1536 let mut zip = zip::ZipWriter::new(&mut buf);
1537
1538 let dir_options = zip::write::FileOptions::default();
1539 zip.add_directory("subdir/", dir_options).unwrap();
1540
1541 let file_options = zip::write::FileOptions::default()
1542 .compression_method(zip::CompressionMethod::Stored);
1543 zip.start_file("subdir/data.txt", file_options).unwrap();
1544 zip.write_all(b"SUPERSECRET value").unwrap();
1545
1546 zip.finish().unwrap();
1547 }
1548
1549 let zip_data = buf.into_inner();
1550 let proc = make_archive_processor();
1551 let reader = Cursor::new(&zip_data);
1552 let mut writer = Cursor::new(Vec::new());
1553 let stats = proc.process_zip(reader, &mut writer).unwrap();
1554
1555 assert_eq!(stats.entries_skipped, 1); assert_eq!(stats.files_processed, 1);
1557 }
1558
1559 #[test]
1560 fn zip_handles_multiple_files() {
1561 let proc = make_archive_processor();
1562 let zip_data = build_test_zip(&[
1563 ("file1.txt", b"alice@corp.com"),
1564 ("file2.json", br#"{"secret":"SUPERSECRET"}"#),
1565 ("file3.log", b"nothing to see"),
1566 ]);
1567
1568 let reader = Cursor::new(&zip_data);
1569 let mut writer = Cursor::new(Vec::new());
1570 let stats = proc.process_zip(reader, &mut writer).unwrap();
1571
1572 assert_eq!(stats.files_processed, 3);
1573 assert_eq!(stats.structured_hits, 1); assert_eq!(stats.scanner_fallback, 2); }
1576
1577 #[test]
1578 fn tar_progress_callback_receives_updates() {
1579 let updates = Arc::new(Mutex::new(Vec::new()));
1580 let proc = make_archive_processor().with_progress_callback({
1581 let updates = Arc::clone(&updates);
1582 Arc::new(move |progress| {
1583 updates.lock().expect("archive progress lock").push(progress.clone());
1584 })
1585 });
1586 let input = build_test_tar(&[("a.txt", b"alice@corp.com"), ("b.txt", b"SUPERSECRET")]);
1587
1588 let mut output = Vec::new();
1589 let stats = proc.process_tar(&input[..], &mut output).unwrap();
1590 let updates = updates.lock().unwrap();
1591
1592 assert_eq!(updates.len(), 2);
1593 assert_eq!(updates.last().unwrap().entries_seen, 2);
1594 assert_eq!(
1595 updates.last().unwrap().files_processed,
1596 stats.files_processed
1597 );
1598 assert_eq!(updates.last().unwrap().total_entries, None);
1599 }
1600
1601 #[test]
1602 fn zip_progress_callback_reports_total_entries() {
1603 let updates = Arc::new(Mutex::new(Vec::new()));
1604 let proc = make_archive_processor().with_progress_callback({
1605 let updates = Arc::clone(&updates);
1606 Arc::new(move |progress| {
1607 updates.lock().expect("archive progress lock").push(progress.clone());
1608 })
1609 });
1610 let zip_data = build_test_zip(&[
1611 ("file1.txt", b"alice@corp.com"),
1612 ("file2.log", b"nothing to see"),
1613 ]);
1614
1615 let reader = Cursor::new(&zip_data);
1616 let mut writer = Cursor::new(Vec::new());
1617 let stats = proc.process_zip(reader, &mut writer).unwrap();
1618 let updates = updates.lock().unwrap();
1619
1620 assert_eq!(updates.len(), 2);
1621 assert_eq!(
1622 updates.last().unwrap().files_processed,
1623 stats.files_processed
1624 );
1625 assert_eq!(updates.last().unwrap().total_entries, Some(2));
1626 assert_eq!(updates.last().unwrap().current_entry, "file2.log");
1627 }
1628
1629 #[test]
1632 fn format_detection_from_path() {
1633 assert_eq!(
1634 ArchiveFormat::from_path("data.tar"),
1635 Some(ArchiveFormat::Tar)
1636 );
1637 assert_eq!(
1638 ArchiveFormat::from_path("data.tar.gz"),
1639 Some(ArchiveFormat::TarGz)
1640 );
1641 assert_eq!(
1642 ArchiveFormat::from_path("data.tgz"),
1643 Some(ArchiveFormat::TarGz)
1644 );
1645 assert_eq!(
1646 ArchiveFormat::from_path("data.zip"),
1647 Some(ArchiveFormat::Zip)
1648 );
1649 assert_eq!(
1650 ArchiveFormat::from_path("DATA.ZIP"),
1651 Some(ArchiveFormat::Zip)
1652 );
1653 assert_eq!(ArchiveFormat::from_path("photo.png"), None);
1654 }
1655
1656 #[test]
1659 fn same_secret_gets_same_replacement_across_entries() {
1660 let proc = make_archive_processor();
1661 let input = build_test_tar(&[
1662 ("a.txt", b"contact alice@corp.com"),
1663 ("b.txt", b"reach alice@corp.com"),
1664 ]);
1665
1666 let mut output = Vec::new();
1667 proc.process_tar(&input[..], &mut output).unwrap();
1668
1669 let mut archive = tar::Archive::new(&output[..]);
1670 let mut contents: Vec<String> = Vec::new();
1671 for entry in archive.entries().unwrap() {
1672 let mut e = entry.unwrap();
1673 let mut s = String::new();
1674 e.read_to_string(&mut s).unwrap();
1675 contents.push(s);
1676 }
1677
1678 let replacement_a = contents[0].strip_prefix("contact ").unwrap();
1681 let replacement_b = contents[1].strip_prefix("reach ").unwrap();
1682 assert_eq!(
1683 replacement_a, replacement_b,
1684 "dedup should produce identical replacements"
1685 );
1686 assert!(!replacement_a.contains("alice@corp.com"));
1687 }
1688
1689 #[test]
1692 fn process_auto_dispatch_tar() {
1693 let proc = make_archive_processor();
1694 let tar_data = build_test_tar(&[("f.txt", b"SUPERSECRET")]);
1695
1696 let reader = Cursor::new(tar_data);
1697 let writer = Cursor::new(Vec::new());
1698 let stats = proc.process(reader, writer, ArchiveFormat::Tar).unwrap();
1699
1700 assert_eq!(stats.files_processed, 1);
1701 }
1702
1703 #[test]
1704 fn process_auto_dispatch_zip() {
1705 let proc = make_archive_processor();
1706 let zip_data = build_test_zip(&[("f.txt", b"SUPERSECRET")]);
1707
1708 let reader = Cursor::new(zip_data);
1709 let mut writer = Cursor::new(Vec::new());
1710 let stats = proc
1711 .process(reader, &mut writer, ArchiveFormat::Zip)
1712 .unwrap();
1713
1714 assert_eq!(stats.files_processed, 1);
1715 }
1716
1717 #[test]
1720 fn tar_empty_archive() {
1721 let proc = make_archive_processor();
1722 let tar_data = build_test_tar(&[]);
1723
1724 let mut output = Vec::new();
1725 let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
1726
1727 assert_eq!(stats.files_processed, 0);
1728 assert_eq!(stats.entries_skipped, 0);
1729 }
1730
1731 #[test]
1732 fn zip_empty_archive() {
1733 let proc = make_archive_processor();
1734 let zip_data = build_test_zip(&[]);
1735
1736 let reader = Cursor::new(zip_data);
1737 let mut writer = Cursor::new(Vec::new());
1738 let stats = proc.process_zip(reader, &mut writer).unwrap();
1739
1740 assert_eq!(stats.files_processed, 0);
1741 }
1742}