sanitize_engine/processor/
archive.rs

1//! Archive processor for sanitizing files inside `.zip`, `.tar`, and `.tar.gz` archives.
2//!
3//! # Architecture
4//!
5//! ```text
6//! ┌───────────────────────┐
7//! │  Archive (zip/tar/gz) │
8//! └────────┬──────────────┘
9//!          │  for each entry
10//!          ▼
11//! ┌─────────────────────────────────────────────┐
12//! │  1. Match entry filename → FileTypeProfile  │
13//! │  2. Try ProcessorRegistry (structured)      │
14//! │  3. Fallback: StreamScanner (streaming)     │
15//! └────────┬────────────────────────────────────┘
16//!          │  sanitized bytes
17//!          ▼
18//! ┌───────────────────────┐
19//! │  Rebuilt archive       │
20//! │  (same format, meta   │
21//! │   preserved)          │
22//! └───────────────────────┘
23//! ```
24//!
25//! # Memory Efficiency
26//!
27//! Archives are processed **entry-by-entry**. Each entry is piped
28//! through either a structured processor (which must buffer the full
29//! entry) or the [`StreamScanner`]
30//! (which processes in configurable chunks). This means the maximum
31//! memory footprint is proportional to the largest *single entry*
32//! that uses a structured processor. Files without a profile match
33//! are streamed through the scanner without buffering the whole entry.
34//!
35//! For very large individual files inside archives, the streaming
36//! scanner path keeps only `chunk_size + overlap_size` bytes in memory.
37//!
38//! # Thread Safety
39//!
40//! [`ArchiveProcessor`] is `Send + Sync`. The underlying
41//! [`MappingStore`] provides lock-free
42//! reads for dedup consistency.
43//!
44//! # Metadata Preservation
45//!
46//! - **Tar**: modification time, permissions (mode), uid/gid, and
47//!   username/groupname are copied from the source entry.
48//! - **Zip**: modification time, compression method, and unix
49//!   permissions are preserved.
50//! - Symlinks, directories, and other non-regular entries are passed
51//!   through unchanged.
52
53use crate::error::{Result, SanitizeError};
54use crate::processor::profile::FileTypeProfile;
55use crate::processor::registry::ProcessorRegistry;
56use crate::scanner::{ScanStats, StreamScanner};
57use crate::store::MappingStore;
58
/// Produce a safe, relative entry name from an untrusted archive path.
///
/// Backslashes are first rewritten to `/` (Windows-style zip entries). The
/// path is then split on `/` and every empty, `.`, or `..` component is
/// dropped, which removes leading slashes, doubled slashes, and any upward
/// traversal. If no component survives, `"_"` is returned so the output
/// archive never receives an entry with a blank name.
fn sanitize_archive_entry_name(name: &str) -> String {
    let normalised = name.replace('\\', "/");
    let mut cleaned = String::with_capacity(normalised.len());

    for component in normalised.split('/') {
        if matches!(component, "" | "." | "..") {
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push('/');
        }
        cleaned.push_str(component);
    }

    if cleaned.is_empty() {
        cleaned.push('_');
    }
    cleaned
}
79
/// Sanitize a zip entry name.
///
/// Thin delegate to [`sanitize_archive_entry_name`]; zip entries get exactly
/// the same traversal-stripping and backslash normalisation.
#[inline]
fn sanitize_zip_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
84
/// Sanitize a tar entry name.
///
/// Thin delegate to [`sanitize_archive_entry_name`]; tar entries get exactly
/// the same traversal-stripping and backslash normalisation.
#[inline]
fn sanitize_tar_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
89
90use glob::MatchOptions;
91use rayon::prelude::*;
92use std::collections::HashMap;
93use std::io::{self, Read, Seek, Write};
94use std::sync::Arc;
95
96use crate::processor::limits::{
97    DEFAULT_ARCHIVE_DEPTH, MAX_ARCHIVE_DEPTH, PARALLEL_ENTRY_THRESHOLD, PARALLEL_TAR_DATA_SIZE,
98    PARALLEL_ZIP_DATA_SIZE, STRUCTURED_ENTRY_SIZE,
99};
100
101// ---------------------------------------------------------------------------
// Shared type aliases
103// ---------------------------------------------------------------------------
104
105/// Per-entry result from parallel archive processing: `(source_index, sanitized_bytes_and_stats)`.
106type ParEntryResult = (usize, Result<(Vec<u8>, ArchiveStats)>);
107
108/// Callback invoked with `(entry_name, sanitized_bytes)` after each file entry
109/// inside an archive is processed. Used by callers that need to inspect the
110/// sanitized content without buffering the entire archive (e.g. log context
111/// extraction).
112pub type EntryCallback = Arc<dyn Fn(&str, &[u8]) + Send + Sync>;
113
114// ---------------------------------------------------------------------------
115// ArchiveFilter
116// ---------------------------------------------------------------------------
117
/// A compiled glob-based entry filter for archive processing.
///
/// Patterns are compiled once at construction time. At processing time
/// `passes()` is called for each file entry path inside the archive.
/// The default value holds no patterns and therefore lets every path pass.
///
/// ## Pattern semantics
///
/// - `*` matches any sequence of characters that does **not** contain `/`.
/// - `**` matches any sequence of characters including `/`.
/// - `?` matches any single character except `/`.
/// - `[abc]` matches one of the listed characters.
/// - A pattern ending with `/` is a *directory prefix* — it matches
///   the directory itself and any path underneath it. The prefix is
///   compared literally; glob metacharacters in it are not expanded.
///
/// ## Filter logic
///
/// 1. If `--only` patterns are present: the entry path must match at
///    least one pattern, otherwise it is dropped.
/// 2. If `--exclude` patterns are present: if the entry path matches
///    any pattern, it is dropped.
/// 3. Only file entries are filtered; directory / symlink entries
///    always pass through to preserve archive structure.
#[derive(Default, Clone)]
pub struct ArchiveFilter {
    /// Compiled `--only` patterns. When non-empty, a path must match at
    /// least one of them to pass.
    only: Vec<CompiledPattern>,
    /// Compiled `--exclude` patterns. A path matching any of them is dropped.
    exclude: Vec<CompiledPattern>,
}
145
146#[derive(Clone)]
147enum CompiledPattern {
148    /// Pattern that ended with `/` — matches the prefix directory and
149    /// everything inside it.
150    DirPrefix(String),
151    /// General glob pattern compiled with `require_literal_separator`.
152    Glob(glob::Pattern),
153}
154
155const GLOB_OPTS: MatchOptions = MatchOptions {
156    case_sensitive: true,
157    require_literal_separator: true,
158    require_literal_leading_dot: false,
159};
160
161impl CompiledPattern {
162    fn compile(raw: &str) -> std::result::Result<Self, String> {
163        if raw.ends_with('/') {
164            // Strip trailing slash; matching is done manually in `matches`.
165            Ok(CompiledPattern::DirPrefix(
166                raw.trim_end_matches('/').to_string(),
167            ))
168        } else {
169            glob::Pattern::new(raw)
170                .map(CompiledPattern::Glob)
171                .map_err(|e| format!("invalid glob pattern '{raw}': {e}"))
172        }
173    }
174
175    fn matches(&self, path: &str) -> bool {
176        match self {
177            CompiledPattern::DirPrefix(prefix) => {
178                path == prefix || path.starts_with(&format!("{prefix}/"))
179            }
180            CompiledPattern::Glob(pat) => pat.matches_with(path, GLOB_OPTS),
181        }
182    }
183}
184
185impl ArchiveFilter {
186    /// Compile `only` and `exclude` pattern lists into an `ArchiveFilter`.
187    ///
188    /// # Errors
189    ///
190    /// Returns an error if any pattern contains invalid glob syntax.
191    pub fn new(only: Vec<String>, exclude: Vec<String>) -> std::result::Result<Self, String> {
192        let only = only
193            .into_iter()
194            .map(|p| CompiledPattern::compile(&p))
195            .collect::<std::result::Result<Vec<_>, _>>()?;
196        let exclude = exclude
197            .into_iter()
198            .map(|p| CompiledPattern::compile(&p))
199            .collect::<std::result::Result<Vec<_>, _>>()?;
200        Ok(Self { only, exclude })
201    }
202
203    /// Returns `true` when neither `--only` nor `--exclude` patterns are set.
204    pub fn is_empty(&self) -> bool {
205        self.only.is_empty() && self.exclude.is_empty()
206    }
207
208    /// Returns `true` if `path` should be included in the output archive.
209    ///
210    /// Only applies to file entries; directory entries bypass this check.
211    pub fn passes(&self, path: &str) -> bool {
212        if !self.only.is_empty() && !self.only.iter().any(|p| p.matches(path)) {
213            return false;
214        }
215        if self.exclude.iter().any(|p| p.matches(path)) {
216            return false;
217        }
218        true
219    }
220}
221
222// ---------------------------------------------------------------------------
223// Archive format enum
224// ---------------------------------------------------------------------------
/// Archive container formats recognised by file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArchiveFormat {
    /// `.zip` archive.
    Zip,
    /// Uncompressed `.tar` archive.
    Tar,
    /// Gzip-compressed `.tar.gz` / `.tgz` archive.
    TarGz,
}

impl ArchiveFormat {
    /// Detect archive format from a file path / extension.
    ///
    /// Matching is ASCII case-insensitive. A name ending in `.tar.gz` is
    /// checked first because its final extension alone (`gz`) is not enough
    /// to identify a tarball.
    ///
    /// Returns `None` for unrecognised extensions.
    pub fn from_path(path: &str) -> Option<Self> {
        let lower = path.to_ascii_lowercase();
        if lower.ends_with(".tar.gz") {
            return Some(Self::TarGz);
        }

        // `lower` is valid UTF-8, so its extension always converts back to `&str`.
        let ext = std::path::Path::new(&lower).extension()?.to_str()?;
        match ext {
            "tgz" => Some(Self::TarGz),
            "tar" => Some(Self::Tar),
            "zip" => Some(Self::Zip),
            _ => None,
        }
    }
}
262
263// ---------------------------------------------------------------------------
264// Archive statistics
265// ---------------------------------------------------------------------------
266
/// Statistics collected while processing an archive.
///
/// Counters from nested archives are folded into the parent's statistics,
/// so the totals cover the whole archive tree.
#[derive(Debug, Clone, Default)]
pub struct ArchiveStats {
    /// Number of file entries processed (excludes dirs/symlinks).
    pub files_processed: u64,
    /// Number of entries passed through unchanged (dirs, symlinks, etc.).
    pub entries_skipped: u64,
    /// Number of files handled by a structured processor.
    pub structured_hits: u64,
    /// Number of files handled by the streaming scanner fallback.
    pub scanner_fallback: u64,
    /// Number of entries that were themselves archives and processed
    /// recursively.
    pub nested_archives: u64,
    /// Total input bytes across all file entries.
    pub total_input_bytes: u64,
    /// Total output bytes across all file entries.
    pub total_output_bytes: u64,
    /// Per-file processing method: filename → `"structured+scan:<proc>"`
    /// (structured processor followed by a scanner pass), `"scanner"`,
    /// or `"nested:<format>"`.
    pub file_methods: HashMap<String, String>,
    /// Per-file scan statistics (matches, replacements, bytes, pattern counts).
    pub file_scan_stats: HashMap<String, ScanStats>,
    /// Number of file entries removed by the [`ArchiveFilter`].
    pub entries_filtered: u64,
}
293
/// Progress snapshot emitted while processing archive entries.
///
/// A fresh snapshot is built for every callback invocation; it owns its
/// data and can be stored or sent elsewhere by the receiver.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct ArchiveProgress {
    /// Entries seen so far, including skipped entries. The processor reports
    /// this as `files_processed + entries_skipped`.
    pub entries_seen: u64,
    /// Regular file entries processed so far.
    pub files_processed: u64,
    /// Non-file entries skipped so far.
    pub entries_skipped: u64,
    /// Total entries when cheaply known.
    pub total_entries: Option<u64>,
    /// Path of the current entry.
    pub current_entry: String,
}

/// Shared, thread-safe callback that receives each [`ArchiveProgress`] snapshot.
type ArchiveProgressCallback = Arc<dyn Fn(&ArchiveProgress) + Send + Sync>;
310
311impl ArchiveStats {
312    /// Merge statistics from a nested archive into this parent.
313    fn merge(&mut self, child: &ArchiveStats) {
314        self.files_processed += child.files_processed;
315        self.entries_skipped += child.entries_skipped;
316        self.structured_hits += child.structured_hits;
317        self.scanner_fallback += child.scanner_fallback;
318        self.nested_archives += child.nested_archives;
319        self.total_input_bytes += child.total_input_bytes;
320        self.total_output_bytes += child.total_output_bytes;
321        self.entries_filtered += child.entries_filtered;
322        self.file_methods.extend(
323            child
324                .file_methods
325                .iter()
326                .map(|(k, v)| (k.clone(), v.clone())),
327        );
328        self.file_scan_stats.extend(
329            child
330                .file_scan_stats
331                .iter()
332                .map(|(k, v)| (k.clone(), v.clone())),
333        );
334    }
335}
336
337// ---------------------------------------------------------------------------
338// ArchiveProcessor
339// ---------------------------------------------------------------------------
340
/// Processes archives by sanitizing each contained file and rebuilding
/// the archive with the same format and preserved metadata.
///
/// Construct with [`ArchiveProcessor::new`] and adjust optional behaviour
/// through the `with_*` builder methods.
///
/// # Usage
///
/// ```rust,no_run
/// use sanitize_engine::processor::archive::{ArchiveProcessor, ArchiveFormat};
/// use sanitize_engine::processor::registry::ProcessorRegistry;
/// use sanitize_engine::scanner::{StreamScanner, ScanPattern, ScanConfig};
/// use sanitize_engine::generator::HmacGenerator;
/// use sanitize_engine::store::MappingStore;
/// use sanitize_engine::category::Category;
/// use std::sync::Arc;
///
/// let gen = Arc::new(HmacGenerator::new([42u8; 32]));
/// let store = Arc::new(MappingStore::new(gen, None));
/// let patterns = vec![
///     ScanPattern::from_regex(r"secret\w+", Category::Custom("secret".into()), "secrets").unwrap(),
/// ];
/// let scanner = Arc::new(
///     StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
/// );
/// let registry = Arc::new(ProcessorRegistry::with_builtins());
///
/// let archive_proc = ArchiveProcessor::new(registry, scanner, store, vec![]);
/// ```
pub struct ArchiveProcessor {
    /// Registry of structured processors.
    registry: Arc<ProcessorRegistry>,
    /// Streaming scanner for fallback processing. Also run over the output
    /// of a structured processor as a second pass.
    scanner: Arc<StreamScanner>,
    /// Shared mapping store (one-way replacements).
    store: Arc<MappingStore>,
    /// File-type profiles for structured processor matching. The first
    /// profile whose filename rule matches an entry is used.
    profiles: Vec<FileTypeProfile>,
    /// Maximum nesting depth for recursive archive processing. A nested
    /// archive found at or beyond this depth is reported as an error.
    max_depth: u32,
    /// Optional callback for per-entry progress updates.
    progress_callback: Option<ArchiveProgressCallback>,
    /// Minimum number of file entries required to enable parallel entry
    /// sanitization. Default: [`PARALLEL_ENTRY_THRESHOLD`].
    parallel_threshold: usize,
    /// Entry-level filter controlling which paths are included in the
    /// output archive. Default: empty (pass all entries).
    filter: ArchiveFilter,
    /// When true, bypass all structured processors and use only the
    /// streaming scanner for every entry. Trades format preservation
    /// for maximum sanitization coverage.
    force_text: bool,
    /// Optional callback invoked with `(entry_name, sanitized_bytes)` after
    /// each file entry is processed. Only called for regular file entries.
    entry_callback: Option<EntryCallback>,
}
394
395impl ArchiveProcessor {
396    /// Create a new archive processor.
397    ///
398    /// # Arguments
399    ///
400    /// - `registry` — structured processor registry.
401    /// - `scanner` — streaming scanner for fallback.
402    /// - `store` — shared mapping store for one-way dedup replacements.
403    /// - `profiles` — file-type profiles for structured matching.
404    pub fn new(
405        registry: Arc<ProcessorRegistry>,
406        scanner: Arc<StreamScanner>,
407        store: Arc<MappingStore>,
408        profiles: Vec<FileTypeProfile>,
409    ) -> Self {
410        Self {
411            registry,
412            scanner,
413            store,
414            profiles,
415            max_depth: DEFAULT_ARCHIVE_DEPTH,
416            progress_callback: None,
417            parallel_threshold: PARALLEL_ENTRY_THRESHOLD,
418            filter: ArchiveFilter::default(),
419            force_text: false,
420            entry_callback: None,
421        }
422    }
423
424    /// Override the maximum nesting depth for recursive archive
425    /// processing.
426    ///
427    /// The default is [`DEFAULT_ARCHIVE_DEPTH`] (5). Values above
428    /// 10 are clamped.
429    #[must_use]
430    pub fn with_max_depth(mut self, depth: u32) -> Self {
431        self.max_depth = depth.min(MAX_ARCHIVE_DEPTH);
432        self
433    }
434
    /// Override the minimum entry count required to enable parallel
    /// entry sanitization. Set to `usize::MAX` to disable parallelism
    /// entirely for this processor instance (e.g. when outer file-level
    /// parallelism is already saturating the thread budget).
    ///
    /// The value is stored as given; no clamping is applied.
    #[must_use]
    pub fn with_parallel_threshold(mut self, threshold: usize) -> Self {
        self.parallel_threshold = threshold;
        self
    }
444
    /// Register a per-entry archive progress callback.
    ///
    /// The callback receives an [`ArchiveProgress`] snapshot. Registering a
    /// new callback replaces any previously registered one.
    #[must_use]
    pub fn with_progress_callback(mut self, callback: ArchiveProgressCallback) -> Self {
        self.progress_callback = Some(callback);
        self
    }
451
    /// Apply an [`ArchiveFilter`] that controls which file entries are
    /// included in the output archive.
    ///
    /// Entries that do not pass the filter are **removed** from the
    /// output entirely. Directory / symlink entries are never filtered.
    /// The given filter replaces the current one (default: empty).
    #[must_use]
    pub fn with_filter(mut self, filter: ArchiveFilter) -> Self {
        self.filter = filter;
        self
    }
462
    /// When set, bypass all structured processors and use only the
    /// streaming scanner for every archive entry.
    ///
    /// Trades format preservation for maximum sanitization coverage.
    /// Useful when the user is uncertain about field rules or wants a
    /// belt-and-suspenders guarantee that every byte is scanned.
    ///
    /// Passing `false` restores the default behaviour (structured
    /// processors are tried first).
    #[must_use]
    pub fn with_force_text(mut self, force_text: bool) -> Self {
        self.force_text = force_text;
        self
    }
474
    /// Register a callback that is invoked with `(entry_name, sanitized_bytes)`
    /// after each regular file entry is fully processed.
    ///
    /// Registering a new callback replaces any previously registered one.
    #[must_use]
    pub fn with_entry_callback(mut self, callback: EntryCallback) -> Self {
        self.entry_callback = Some(callback);
        self
    }
482
483    fn emit_entry_bytes(&self, name: &str, bytes: &[u8]) {
484        if let Some(cb) = &self.entry_callback {
485            cb(name, bytes);
486        }
487    }
488
489    /// Find the first profile matching a filename.
490    fn find_profile(&self, filename: &str) -> Option<&FileTypeProfile> {
491        self.profiles.iter().find(|p| p.matches_filename(filename))
492    }
493
494    fn emit_progress(&self, stats: &ArchiveStats, total_entries: Option<u64>, current_entry: &str) {
495        if let Some(callback) = &self.progress_callback {
496            callback(&ArchiveProgress {
497                entries_seen: stats.files_processed + stats.entries_skipped,
498                files_processed: stats.files_processed,
499                entries_skipped: stats.entries_skipped,
500                total_entries,
501                current_entry: current_entry.to_string(),
502            });
503        }
504    }
505
506    /// Sanitize a file entry given its raw bytes.
507    ///
508    /// Returns the sanitized bytes together with a fresh [`ArchiveStats`]
509    /// covering only this entry. This is the core work unit for parallel
510    /// entry processing in [`process_tar_at_depth`] and
511    /// [`process_zip_at_depth`].
512    fn sanitize_entry_bytes(
513        &self,
514        filename: &str,
515        data: &[u8],
516        entry_size_hint: Option<u64>,
517        depth: u32,
518    ) -> Result<(Vec<u8>, ArchiveStats)> {
519        let mut out: Vec<u8> = Vec::with_capacity(data.len());
520        let mut entry_stats = ArchiveStats::default();
521        let mut reader = io::Cursor::new(data);
522        self.sanitize_entry(
523            filename,
524            &mut reader,
525            &mut out,
526            &mut entry_stats,
527            entry_size_hint,
528            depth,
529        )?;
530        Ok((out, entry_stats))
531    }
532
533    /// Sanitize the content of a single file entry.
534    ///
535    /// If the entry is itself an archive (detected via extension), it is
536    /// recursively processed up to `self.max_depth`. Otherwise, tries a
537    /// structured processor first; falls back to the streaming scanner
538    /// if no processor matches.
539    ///
540    /// For the streaming scanner path, the content is piped through
541    /// `scan_reader` directly to the writer for memory-efficient
542    /// chunk-based processing (F-02 fix: no full output buffering).
543    #[allow(clippy::missing_errors_doc)] // private method
544    fn sanitize_entry(
545        &self,
546        filename: &str,
547        reader: &mut dyn Read,
548        writer: &mut dyn Write,
549        stats: &mut ArchiveStats,
550        entry_size_hint: Option<u64>,
551        depth: u32,
552    ) -> Result<()> {
553        // --- Nested archive detection ---
554        if let Some(nested_fmt) = ArchiveFormat::from_path(filename) {
555            return self.sanitize_nested_archive(
556                filename,
557                reader,
558                writer,
559                stats,
560                entry_size_hint,
561                nested_fmt,
562                depth,
563            );
564        }
565
566        // --- Structured / scanner processing ---
567
568        // Try structured processing first, but only if the entry is
569        // within the size cap and --force-text is not set.
570        // Oversized entries fall through to the streaming scanner (M-3 fix).
571        let within_size_cap = entry_size_hint.map_or(true, |sz| sz <= STRUCTURED_ENTRY_SIZE); // unknown size → allow (conservative)
572
573        if !self.force_text && within_size_cap {
574            if let Some(profile) = self.find_profile(filename) {
575                // Structured processors need the full content in memory.
576                let mut content = Vec::new();
577                reader.read_to_end(&mut content).map_err(|e| {
578                    SanitizeError::ArchiveError(format!("read entry '{filename}': {e}"))
579                })?;
580
581                stats.total_input_bytes += content.len() as u64;
582
583                // A parse error (e.g. binary content with a .yaml extension, like
584                // macOS resource-fork ._* files) falls through to the scanner
585                // rather than failing the whole archive.
586                // A parse error or heuristic rejection falls through to the scanner below.
587                if let Ok(Some(structured_out)) =
588                    self.registry.process(&content, profile, &self.store)
589                {
590                    // Double-pass: run the streaming scanner on the structured
591                    // output to catch anything the field rules missed.
592                    let (output, scan_stats) = self.scanner.scan_bytes(&structured_out)?;
593                    stats.structured_hits += 1;
594                    stats.total_output_bytes += output.len() as u64;
595                    stats.file_methods.insert(
596                        filename.to_string(),
597                        format!("structured+scan:{}", profile.processor),
598                    );
599                    stats
600                        .file_scan_stats
601                        .insert(filename.to_string(), scan_stats);
602                    writer.write_all(&output).map_err(|e| {
603                        SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
604                    })?;
605                    return Ok(());
606                }
607
608                // Processor didn't match or failed — fall back to
609                // scanner with the already-buffered content.
610                let (output, scan_stats) = self.scanner.scan_bytes(&content)?;
611                stats.scanner_fallback += 1;
612                stats.total_output_bytes += output.len() as u64;
613                stats
614                    .file_methods
615                    .insert(filename.to_string(), "scanner".to_string());
616                stats
617                    .file_scan_stats
618                    .insert(filename.to_string(), scan_stats);
619                writer.write_all(&output).map_err(|e| {
620                    SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
621                })?;
622                return Ok(());
623            }
624        }
625
626        // No profile (or entry too large) → streaming scanner.
627        // F-02 fix: stream directly from reader → scanner → writer
628        // without buffering the full output. We use a CountingWriter
629        // to track output bytes alongside the CountingReader for input.
630        let mut counting_r = CountingReader::new(reader);
631        let mut counting_w = CountingWriter::new(writer);
632        let scan_stats = self.scanner.scan_reader(&mut counting_r, &mut counting_w)?;
633
634        stats.scanner_fallback += 1;
635        stats.total_input_bytes += counting_r.bytes_read();
636        stats.total_output_bytes += counting_w.bytes_written();
637        stats
638            .file_methods
639            .insert(filename.to_string(), "scanner".to_string());
640        stats
641            .file_scan_stats
642            .insert(filename.to_string(), scan_stats);
643
644        Ok(())
645    }
646
647    /// Handle a nested archive entry: validate depth/size, buffer, recurse,
648    /// and write the sanitized output.
649    #[allow(clippy::too_many_arguments)]
650    fn sanitize_nested_archive(
651        &self,
652        filename: &str,
653        reader: &mut dyn Read,
654        writer: &mut dyn Write,
655        stats: &mut ArchiveStats,
656        entry_size_hint: Option<u64>,
657        nested_fmt: ArchiveFormat,
658        depth: u32,
659    ) -> Result<()> {
660        if depth >= self.max_depth {
661            return Err(SanitizeError::RecursionDepthExceeded(format!(
662                "nested archive '{}' at depth {} exceeds maximum nesting depth of {}",
663                filename, depth, self.max_depth,
664            )));
665        }
666
667        // Buffer the nested archive (bounded by STRUCTURED_ENTRY_SIZE).
668        if let Some(sz) = entry_size_hint {
669            if sz > STRUCTURED_ENTRY_SIZE {
670                return Err(SanitizeError::ArchiveError(format!(
671                    "nested archive '{}' is too large ({} bytes, limit {} bytes)",
672                    filename, sz, STRUCTURED_ENTRY_SIZE,
673                )));
674            }
675        }
676
677        let mut content = Vec::new();
678        reader.read_to_end(&mut content).map_err(|e| {
679            SanitizeError::ArchiveError(format!("read nested archive '{filename}': {e}"))
680        })?;
681        stats.total_input_bytes += content.len() as u64;
682
683        // Recurse into the nested archive.
684        let mut output_buf: Vec<u8> = Vec::new();
685        let child_stats = match nested_fmt {
686            ArchiveFormat::Tar => {
687                self.process_tar_at_depth(&content[..], &mut output_buf, depth + 1)?
688            }
689            ArchiveFormat::TarGz => {
690                self.process_tar_gz_at_depth(&content[..], &mut output_buf, depth + 1)?
691            }
692            ArchiveFormat::Zip => {
693                let reader = io::Cursor::new(&content);
694                let mut writer = io::Cursor::new(Vec::new());
695                let s = self.process_zip_at_depth(reader, &mut writer, depth + 1)?;
696                output_buf = writer.into_inner();
697                s
698            }
699        };
700
701        stats.nested_archives += 1;
702        stats.merge(&child_stats);
703        stats.total_output_bytes += output_buf.len() as u64;
704        let fmt_name = match nested_fmt {
705            ArchiveFormat::Tar => "tar",
706            ArchiveFormat::TarGz => "tar.gz",
707            ArchiveFormat::Zip => "zip",
708        };
709        stats
710            .file_methods
711            .insert(filename.to_string(), format!("nested:{fmt_name}"));
712        writer.write_all(&output_buf).map_err(|e| {
713            SanitizeError::ArchiveError(format!("write nested archive '{filename}': {e}"))
714        })?;
715        Ok(())
716    }
717
718    // -----------------------------------------------------------------------
719    // Profile discovery passes (two-phase support)
720    // -----------------------------------------------------------------------
721    //
722    // These methods perform a read-only pre-pass over an archive, running the
723    // structured processor on every profile-matched entry and discarding the
724    // output.  The side-effect is that `self.store` is populated with the
725    // original→replacement mappings for those fields, so a subsequent call to
726    // `build_augmented_scanner` can inject those values as literals into the
727    // scanner used for the real processing pass.
728
729    /// Run the structured processor on every profile-matched entry in a
730    /// `.tar` archive, recording replacements into the store.  Output is
731    /// discarded; the archive is not modified.
732    ///
733    /// # Errors
734    ///
735    /// Returns an error if the archive cannot be read or an entry cannot be processed.
736    pub fn discover_profiles_tar<R: Read>(&self, reader: R) -> Result<()> {
737        if self.profiles.is_empty() {
738            return Ok(());
739        }
740        let mut archive = tar::Archive::new(reader);
741        let entries = archive
742            .entries()
743            .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entries: {e}")))?;
744        for entry_result in entries {
745            let mut entry = entry_result
746                .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entry: {e}")))?;
747            if !entry.header().entry_type().is_file() {
748                continue;
749            }
750            let path = entry
751                .path()
752                .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
753                .to_string_lossy()
754                .to_string();
755            let Some(profile) = self.find_profile(&path) else {
756                continue;
757            };
758            let mut content = Vec::new();
759            entry
760                .read_to_end(&mut content)
761                .map_err(|e| SanitizeError::ArchiveError(format!("read '{path}': {e}")))?;
762            let _ = self.registry.process(&content, profile, &self.store);
763        }
764        Ok(())
765    }
766
767    /// Run the structured processor on every profile-matched entry in a
768    /// `.tar.gz` archive, recording replacements into the store.  Output is
769    /// discarded; the archive is not modified.
770    ///
771    /// # Errors
772    ///
773    /// Returns an error if the archive cannot be read or an entry cannot be processed.
774    pub fn discover_profiles_tar_gz<R: Read>(&self, reader: R) -> Result<()> {
775        let gz = flate2::read::GzDecoder::new(reader);
776        self.discover_profiles_tar(gz)
777    }
778
779    /// Run the structured processor on every profile-matched entry in a
780    /// `.zip` archive, recording replacements into the store.  Output is
781    /// discarded; the archive is not modified.
782    ///
783    /// # Errors
784    ///
785    /// Returns an error if the archive cannot be read or an entry cannot be processed.
786    pub fn discover_profiles_zip<R: Read + Seek>(&self, reader: R) -> Result<()> {
787        if self.profiles.is_empty() {
788            return Ok(());
789        }
790        let mut zip = zip::ZipArchive::new(reader)
791            .map_err(|e| SanitizeError::ArchiveError(format!("open zip for discovery: {e}")))?;
792        for i in 0..zip.len() {
793            let mut entry = zip
794                .by_index(i)
795                .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {i}: {e}")))?;
796            if entry.is_dir() {
797                continue;
798            }
799            let name = sanitize_zip_entry_name(entry.name());
800            let Some(profile) = self.find_profile(&name) else {
801                continue;
802            };
803            let mut content = Vec::new();
804            entry
805                .read_to_end(&mut content)
806                .map_err(|e| SanitizeError::ArchiveError(format!("read '{name}': {e}")))?;
807            let _ = self.registry.process(&content, profile, &self.store);
808        }
809        Ok(())
810    }
811
812    // Tar processing
813    // -----------------------------------------------------------------------
814
815    /// Process a `.tar` archive, sanitizing each file entry and
816    /// rebuilding the archive with preserved metadata.
817    ///
818    /// Entries that are not regular files (directories, symlinks, etc.)
819    /// are copied through unchanged.
820    ///
821    /// # Errors
822    ///
823    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
824    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
825    pub fn process_tar<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
826        self.process_tar_at_depth(reader, writer, 0)
827    }
828
829    /// Internal: process a tar archive at a given nesting depth.
830    ///
831    /// Uses a speculative-buffer strategy to decide between parallel and
832    /// sequential processing:
833    ///
834    /// - **Parallel** (total buffered data ≤ `PARALLEL_TAR_DATA_SIZE` AND
835    ///   file count ≥ threshold AND not inside a rayon worker): buffer all
836    ///   entries, sanitize concurrently with rayon, write in source order.
837    /// - **Sequential — buffered** (threshold not met but data fits): process
838    ///   entries from the in-memory buffer one at a time.
839    /// - **Sequential — streaming** (data exceeds cap mid-stream): process
840    ///   already-buffered entries from memory, then continue streaming the
841    ///   remainder of the archive without additional buffering.
842    ///
843    /// Unlike zip, tar has no central directory so sizes cannot be known before
844    /// reading. The buffer cap (`PARALLEL_TAR_DATA_SIZE`) bounds peak memory to
845    /// cap + one entry overhead regardless of archive size.
846    #[allow(clippy::too_many_lines)]
847    fn process_tar_at_depth<R: Read, W: Write>(
848        &self,
849        reader: R,
850        writer: W,
851        depth: u32,
852    ) -> Result<ArchiveStats> {
853        struct TarEntry {
854            header: tar::Header,
855            path: String,
856            is_file: bool,
857            passes_filter: bool,
858            data: Vec<u8>,
859        }
860
861        let mut archive = tar::Archive::new(reader);
862        let mut builder = tar::Builder::new(writer);
863        let mut stats = ArchiveStats::default();
864
865        // --- Phase 1: speculative buffering ----------------------------------
866        // Stream entries into memory, tracking total file-data size.
867        // Stop buffering (but keep the last entry) if the cap is exceeded.
868        let mut entries_iter = archive
869            .entries()
870            .map_err(|e| SanitizeError::ArchiveError(format!("read tar entries: {e}")))?;
871
872        let mut buffered: Vec<TarEntry> = Vec::new();
873        let mut file_count: usize = 0;
874        let mut total_data: u64 = 0;
875        let mut overflowed = false;
876
877        for entry_result in entries_iter.by_ref() {
878            let mut entry = entry_result
879                .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
880
881            let header = entry.header().clone();
882            let path = entry
883                .path()
884                .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
885                .to_string_lossy()
886                .into_owned();
887            let is_file = header.entry_type().is_file();
888            let passes_filter = !is_file || self.filter.passes(&path);
889
890            let mut data = Vec::new();
891            entry
892                .read_to_end(&mut data)
893                .map_err(|e| SanitizeError::ArchiveError(format!("read entry '{path}': {e}")))?;
894            drop(entry);
895
896            if is_file && passes_filter {
897                file_count += 1;
898                total_data = total_data.saturating_add(data.len() as u64);
899            }
900
901            buffered.push(TarEntry {
902                header,
903                path,
904                is_file,
905                passes_filter,
906                data,
907            });
908
909            if total_data > PARALLEL_TAR_DATA_SIZE {
910                overflowed = true;
911                break;
912            }
913        }
914
915        // --- Phase 2: choose strategy ----------------------------------------
916        let use_parallel = !overflowed
917            && file_count >= self.parallel_threshold
918            && rayon::current_thread_index().is_none();
919
920        if use_parallel {
921            // --- Parallel path -----------------------------------------------
922            // Sanitize all file entries concurrently; write in source order.
923            let file_indices: Vec<usize> = buffered
924                .iter()
925                .enumerate()
926                .filter(|(_, e)| e.is_file && e.passes_filter)
927                .map(|(i, _)| i)
928                .collect();
929
930            let results: Vec<ParEntryResult> = file_indices
931                .into_par_iter()
932                .map(|i| {
933                    let e = &buffered[i];
934                    let size_hint = e.header.size().ok();
935                    (
936                        i,
937                        self.sanitize_entry_bytes(&e.path, &e.data, size_hint, depth),
938                    )
939                })
940                .collect();
941
942            let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; buffered.len()];
943            for (i, r) in results {
944                sanitized[i] = Some(r?);
945            }
946
947            for (i, entry) in buffered.iter().enumerate() {
948                if !entry.is_file {
949                    builder
950                        .append(&entry.header, entry.data.as_slice())
951                        .map_err(|e| {
952                            SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
953                        })?;
954                    stats.entries_skipped += 1;
955                    self.emit_progress(&stats, None, &entry.path);
956                    continue;
957                }
958                if !entry.passes_filter {
959                    stats.entries_filtered += 1;
960                    self.emit_progress(&stats, None, &entry.path);
961                    continue;
962                }
963
964                let (sanitized_buf, entry_stats) =
965                    sanitized[i].take().expect("parallel result missing");
966                stats.merge(&entry_stats);
967                self.emit_entry_bytes(&entry.path, &sanitized_buf);
968
969                let mut new_header = entry.header.clone();
970                let safe_path = sanitize_tar_entry_name(&entry.path);
971                new_header.set_path(&safe_path).map_err(|e| {
972                    SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
973                })?;
974                new_header.set_size(sanitized_buf.len() as u64);
975                new_header.set_cksum();
976                builder
977                    .append(&new_header, sanitized_buf.as_slice())
978                    .map_err(|e| {
979                        SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
980                    })?;
981                stats.files_processed += 1;
982                self.emit_progress(&stats, None, &entry.path);
983            }
984        } else {
985            // --- Sequential path ---------------------------------------------
986            // Process buffered entries first, then stream the remainder.
987
988            // Helper: write one buffered entry to the builder.
989            let write_buffered = |entry: &TarEntry,
990                                  builder: &mut tar::Builder<W>,
991                                  stats: &mut ArchiveStats,
992                                  processor: &ArchiveProcessor|
993             -> Result<()> {
994                if !entry.is_file {
995                    builder
996                        .append(&entry.header, entry.data.as_slice())
997                        .map_err(|e| {
998                            SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
999                        })?;
1000                    stats.entries_skipped += 1;
1001                    processor.emit_progress(stats, None, &entry.path);
1002                    return Ok(());
1003                }
1004                if !entry.passes_filter {
1005                    stats.entries_filtered += 1;
1006                    processor.emit_progress(stats, None, &entry.path);
1007                    return Ok(());
1008                }
1009                let size_hint = entry.header.size().ok();
1010                let (sanitized_buf, entry_stats) =
1011                    processor.sanitize_entry_bytes(&entry.path, &entry.data, size_hint, depth)?;
1012                stats.merge(&entry_stats);
1013                processor.emit_entry_bytes(&entry.path, &sanitized_buf);
1014                let mut new_header = entry.header.clone();
1015                let safe_path = sanitize_tar_entry_name(&entry.path);
1016                new_header.set_path(&safe_path).map_err(|e| {
1017                    SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
1018                })?;
1019                new_header.set_size(sanitized_buf.len() as u64);
1020                new_header.set_cksum();
1021                builder
1022                    .append(&new_header, sanitized_buf.as_slice())
1023                    .map_err(|e| {
1024                        SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
1025                    })?;
1026                stats.files_processed += 1;
1027                processor.emit_progress(stats, None, &entry.path);
1028                Ok(())
1029            };
1030
1031            for entry in &buffered {
1032                write_buffered(entry, &mut builder, &mut stats, self)?;
1033            }
1034            drop(buffered);
1035
1036            // Stream remaining entries when the buffer cap was exceeded.
1037            if overflowed {
1038                for entry_result in entries_iter {
1039                    let mut entry = entry_result
1040                        .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
1041
1042                    let header = entry.header().clone();
1043                    let path = entry
1044                        .path()
1045                        .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
1046                        .to_string_lossy()
1047                        .into_owned();
1048                    let is_file = header.entry_type().is_file();
1049
1050                    if !is_file {
1051                        let mut data = Vec::new();
1052                        entry.read_to_end(&mut data).map_err(|e| {
1053                            SanitizeError::ArchiveError(format!("read '{path}': {e}"))
1054                        })?;
1055                        drop(entry);
1056                        builder.append(&header, data.as_slice()).map_err(|e| {
1057                            SanitizeError::ArchiveError(format!("append '{path}': {e}"))
1058                        })?;
1059                        stats.entries_skipped += 1;
1060                        self.emit_progress(&stats, None, &path);
1061                        continue;
1062                    }
1063
1064                    if !self.filter.passes(&path) {
1065                        stats.entries_filtered += 1;
1066                        continue;
1067                    }
1068
1069                    let size_hint = header.size().ok();
1070                    let mut sanitized_buf = Vec::new();
1071                    let mut entry_stats = ArchiveStats::default();
1072                    self.sanitize_entry(
1073                        &path,
1074                        &mut entry,
1075                        &mut sanitized_buf,
1076                        &mut entry_stats,
1077                        size_hint,
1078                        depth,
1079                    )?;
1080                    drop(entry);
1081                    self.emit_entry_bytes(&path, &sanitized_buf);
1082
1083                    let mut new_header = header.clone();
1084                    let safe_path = sanitize_tar_entry_name(&path);
1085                    new_header.set_path(&safe_path).map_err(|e| {
1086                        SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
1087                    })?;
1088                    new_header.set_size(sanitized_buf.len() as u64);
1089                    new_header.set_cksum();
1090                    builder
1091                        .append(&new_header, sanitized_buf.as_slice())
1092                        .map_err(|e| {
1093                            SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
1094                        })?;
1095
1096                    stats.merge(&entry_stats);
1097                    stats.files_processed += 1;
1098                    self.emit_progress(&stats, None, &path);
1099                }
1100            }
1101        }
1102
1103        builder
1104            .finish()
1105            .map_err(|e| SanitizeError::ArchiveError(format!("finalize tar: {e}")))?;
1106
1107        Ok(stats)
1108    }
1109
1110    /// Process a `.tar.gz` archive (gzip-compressed tar).
1111    ///
1112    /// Decompresses on the fly, processes each entry, and recompresses
1113    /// the output.
1114    ///
1115    /// # Errors
1116    ///
1117    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
1118    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
1119    pub fn process_tar_gz<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
1120        self.process_tar_gz_at_depth(reader, writer, 0)
1121    }
1122
1123    /// Internal: process a tar.gz archive at a given nesting depth.
1124    fn process_tar_gz_at_depth<R: Read, W: Write>(
1125        &self,
1126        reader: R,
1127        writer: W,
1128        depth: u32,
1129    ) -> Result<ArchiveStats> {
1130        let gz_reader = flate2::read::GzDecoder::new(reader);
1131        let gz_writer = flate2::write::GzEncoder::new(writer, flate2::Compression::fast());
1132
1133        let stats = self.process_tar_at_depth(gz_reader, gz_writer, depth)?;
1134        // GzEncoder is flushed when the tar builder finishes and the
1135        // encoder is dropped. The `finish()` call in `process_tar`
1136        // flushes the tar builder, which flushes writes to the
1137        // GzEncoder. When the GzEncoder is dropped it finalises the
1138        // gzip stream.
1139        Ok(stats)
1140    }
1141
1142    // -----------------------------------------------------------------------
1143    // Zip processing
1144    // -----------------------------------------------------------------------
1145
1146    /// Process a `.zip` archive, sanitizing each file entry and
1147    /// rebuilding the archive with preserved metadata.
1148    ///
1149    /// # Type Bounds
1150    ///
1151    /// Zip requires seekable I/O for both reading and writing.
1152    ///
1153    /// # Errors
1154    ///
1155    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
1156    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
1157    pub fn process_zip<R: Read + Seek, W: Write + Seek>(
1158        &self,
1159        reader: R,
1160        writer: W,
1161    ) -> Result<ArchiveStats> {
1162        self.process_zip_at_depth(reader, writer, 0)
1163    }
1164
1165    /// Internal: process a zip archive at a given nesting depth.
1166    ///
1167    /// Uses a lightweight metadata pre-pass (local-header reads, no data
1168    /// decompression) to decide between parallel and sequential strategies:
1169    ///
1170    /// - **Parallel** (total uncompressed ≤ `PARALLEL_ZIP_DATA_SIZE` AND
1171    ///   file count ≥ threshold AND depth == 0): load all entry data into
1172    ///   memory, sanitize with rayon, write in order.
1173    /// - **Sequential** (everything else): read → sanitize → write one entry
1174    ///   at a time.  Peak memory is bounded to 2 × largest single entry.
1175    #[allow(clippy::too_many_lines)]
1176    fn process_zip_at_depth<R: Read + Seek, W: Write + Seek>(
1177        &self,
1178        reader: R,
1179        writer: W,
1180        depth: u32,
1181    ) -> Result<ArchiveStats> {
1182        // --- Stage 0: metadata pre-pass (no data reads) ---------------------
1183        // Read local file headers to collect names, sizes, and options.
1184        // This does N seeks but decompresses nothing, keeping memory flat.
1185        struct ZipMeta {
1186            name: String,
1187            is_dir: bool,
1188            compression: zip::CompressionMethod,
1189            last_modified: Option<zip::DateTime>,
1190            unix_mode: Option<u32>,
1191            size: u64,
1192        }
1193
1194        let mut zip_in = zip::ZipArchive::new(reader)
1195            .map_err(|e| SanitizeError::ArchiveError(format!("open zip: {}", e)))?;
1196        let total_entries = zip_in.len();
1197        let total_entries_hint = Some(total_entries as u64);
1198
1199        let mut metas: Vec<ZipMeta> = Vec::with_capacity(total_entries);
1200        let mut file_count = 0usize;
1201        let mut total_uncompressed_size: u64 = 0;
1202
1203        for i in 0..total_entries {
1204            let entry = zip_in
1205                .by_index(i)
1206                .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
1207            let is_dir = entry.is_dir();
1208            let size = entry.size();
1209            if !is_dir {
1210                file_count += 1;
1211                total_uncompressed_size = total_uncompressed_size.saturating_add(size);
1212            }
1213            metas.push(ZipMeta {
1214                name: sanitize_zip_entry_name(entry.name()),
1215                is_dir,
1216                compression: entry.compression(),
1217                last_modified: entry.last_modified(),
1218                unix_mode: entry.unix_mode(),
1219                size,
1220            });
1221            // entry dropped here — no data decompressed
1222        }
1223
1224        // Parallel only when the total data fits comfortably in memory.
1225        // Parallel when: enough entries, data fits in memory, and we are not
1226        // already running inside a rayon worker thread (nested parallelism
1227        // would over-subscribe the pool without proportional gains).
1228        let use_parallel = file_count >= self.parallel_threshold
1229            && rayon::current_thread_index().is_none()
1230            && total_uncompressed_size <= PARALLEL_ZIP_DATA_SIZE;
1231
1232        let mut stats = ArchiveStats::default();
1233
1234        // Helper: build SimpleFileOptions for a metadata entry.
1235        let make_options = |m: &ZipMeta| {
1236            let mut opts =
1237                zip::write::SimpleFileOptions::default().compression_method(m.compression);
1238            if let Some(dt) = m.last_modified {
1239                opts = opts.last_modified_time(dt);
1240            }
1241            if let Some(mode) = m.unix_mode {
1242                opts.unix_permissions(mode)
1243            } else {
1244                opts
1245            }
1246        };
1247
1248        if use_parallel {
1249            // --- Parallel path: load all data then sanitize concurrently ----
1250            struct ZipEntry {
1251                meta_idx: usize,
1252                data: Vec<u8>,
1253            }
1254
1255            let mut file_entries: Vec<ZipEntry> = Vec::with_capacity(file_count);
1256
1257            for (i, meta) in metas.iter().enumerate() {
1258                if meta.is_dir {
1259                    continue;
1260                }
1261                // Skip loading data for entries that will be filtered out.
1262                if !self.filter.passes(&meta.name) {
1263                    continue;
1264                }
1265                let mut entry = zip_in
1266                    .by_index(i)
1267                    .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
1268                let mut data = Vec::new();
1269                entry.read_to_end(&mut data).map_err(|e| {
1270                    SanitizeError::ArchiveError(format!("read zip entry '{}': {}", meta.name, e))
1271                })?;
1272                file_entries.push(ZipEntry { meta_idx: i, data });
1273            }
1274
1275            let results: Vec<ParEntryResult> = file_entries
1276                .into_par_iter()
1277                .map(|e| {
1278                    let meta = &metas[e.meta_idx];
1279                    let result =
1280                        self.sanitize_entry_bytes(&meta.name, &e.data, Some(meta.size), depth);
1281                    (e.meta_idx, result)
1282                })
1283                .collect();
1284
1285            // Collect into a positional Vec (indexed by metas position) for
1286            // O(1) ordered writes, avoiding HashMap hashing overhead.
1287            let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; metas.len()];
1288            for (meta_idx, r) in results {
1289                sanitized[meta_idx] = Some(r?);
1290            }
1291
1292            let mut zip_out = zip::ZipWriter::new(writer);
1293            for (i, meta) in metas.iter().enumerate() {
1294                let options = make_options(meta);
1295                if meta.is_dir {
1296                    zip_out.add_directory(&meta.name, options).map_err(|e| {
1297                        SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1298                    })?;
1299                    stats.entries_skipped += 1;
1300                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1301                    continue;
1302                }
1303                // Filter: drop entries not matching --only/--exclude rules.
1304                if !self.filter.passes(&meta.name) {
1305                    stats.entries_filtered += 1;
1306                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1307                    continue;
1308                }
1309                let (sanitized_buf, entry_stats) = sanitized[i]
1310                    .take()
1311                    .expect("file entry sanitization result missing");
1312                stats.merge(&entry_stats);
1313                self.emit_entry_bytes(&meta.name, &sanitized_buf);
1314                zip_out.start_file(&meta.name, options).map_err(|e| {
1315                    SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1316                })?;
1317                zip_out.write_all(&sanitized_buf).map_err(|e| {
1318                    SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1319                })?;
1320                stats.files_processed += 1;
1321                self.emit_progress(&stats, total_entries_hint, &meta.name);
1322            }
1323            zip_out
1324                .finish()
1325                .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1326        } else {
1327            // --- Sequential path: one entry at a time -----------------------
1328            // Only one entry's data (input + sanitized output) is live at once.
1329            let mut zip_out = zip::ZipWriter::new(writer);
1330            for (i, meta) in metas.iter().enumerate() {
1331                let options = make_options(meta);
1332                if meta.is_dir {
1333                    zip_out.add_directory(&meta.name, options).map_err(|e| {
1334                        SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1335                    })?;
1336                    stats.entries_skipped += 1;
1337                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1338                    continue;
1339                }
1340
1341                // Filter: drop entries not matching --only/--exclude rules.
1342                if !self.filter.passes(&meta.name) {
1343                    stats.entries_filtered += 1;
1344                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1345                    continue;
1346                }
1347
1348                let data = {
1349                    let mut entry = zip_in.by_index(i).map_err(|e| {
1350                        SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e))
1351                    })?;
1352                    let mut buf = Vec::new();
1353                    entry.read_to_end(&mut buf).map_err(|e| {
1354                        SanitizeError::ArchiveError(format!(
1355                            "read zip entry '{}': {}",
1356                            meta.name, e
1357                        ))
1358                    })?;
1359                    buf
1360                    // entry dropped here
1361                };
1362
1363                let (sanitized_buf, entry_stats) =
1364                    self.sanitize_entry_bytes(&meta.name, &data, Some(meta.size), depth)?;
1365                drop(data);
1366                self.emit_entry_bytes(&meta.name, &sanitized_buf);
1367
1368                zip_out.start_file(&meta.name, options).map_err(|e| {
1369                    SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1370                })?;
1371                zip_out.write_all(&sanitized_buf).map_err(|e| {
1372                    SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1373                })?;
1374                drop(sanitized_buf);
1375
1376                stats.merge(&entry_stats);
1377                stats.files_processed += 1;
1378                self.emit_progress(&stats, total_entries_hint, &meta.name);
1379            }
1380            zip_out
1381                .finish()
1382                .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1383        }
1384
1385        Ok(stats)
1386    }
1387
1388    // -----------------------------------------------------------------------
1389    // Format-aware dispatch
1390    // -----------------------------------------------------------------------
1391
1392    /// Auto-detect the archive format and process accordingly.
1393    ///
1394    /// For zip archives the reader must additionally implement `Seek`.
1395    /// This method accepts `Read + Seek` to cover all formats uniformly.
1396    /// Tar and tar.gz do not require seeking, but the bound is imposed
1397    /// for a single entry point.
1398    ///
1399    /// # Errors
1400    ///
1401    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
1402    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
1403    pub fn process<R: Read + Seek, W: Write + Seek>(
1404        &self,
1405        reader: R,
1406        writer: W,
1407        format: ArchiveFormat,
1408    ) -> Result<ArchiveStats> {
1409        match format {
1410            ArchiveFormat::Zip => self.process_zip(reader, writer),
1411            ArchiveFormat::Tar => self.process_tar(reader, writer),
1412            ArchiveFormat::TarGz => self.process_tar_gz(reader, writer),
1413        }
1414    }
1415}
1416
1417// ---------------------------------------------------------------------------
1418// Counting reader wrapper (for input byte tracking)
1419// ---------------------------------------------------------------------------
1420
/// Reader adapter that delegates to a borrowed [`Read`] and tallies the
/// number of bytes it has yielded.
struct CountingReader<'a> {
    /// Underlying reader every `read` call is forwarded to.
    inner: &'a mut dyn Read,
    /// Running total of bytes returned by successful reads.
    count: u64,
}

impl<'a> CountingReader<'a> {
    /// Wrap `inner`, starting the byte tally at zero.
    fn new(inner: &'a mut dyn Read) -> Self {
        CountingReader { inner, count: 0 }
    }

    /// Total number of bytes read through this wrapper so far.
    fn bytes_read(&self) -> u64 {
        self.count
    }
}

impl Read for CountingReader<'_> {
    /// Forward the read and add the number of bytes obtained to the tally.
    /// A failed read leaves the tally untouched.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.inner.read(buf) {
            Ok(obtained) => {
                self.count += obtained as u64;
                Ok(obtained)
            }
            Err(err) => Err(err),
        }
    }
}
1444
/// Writer adapter that delegates to a borrowed [`Write`] and tallies the
/// number of bytes it has accepted (F-02 fix).
struct CountingWriter<'a> {
    /// Underlying writer every `write`/`flush` call is forwarded to.
    inner: &'a mut dyn Write,
    /// Running total of bytes accepted by successful writes.
    count: u64,
}

impl<'a> CountingWriter<'a> {
    /// Wrap `inner`, starting the byte tally at zero.
    fn new(inner: &'a mut dyn Write) -> Self {
        CountingWriter { inner, count: 0 }
    }

    /// Total number of bytes written through this wrapper so far.
    fn bytes_written(&self) -> u64 {
        self.count
    }
}

impl Write for CountingWriter<'_> {
    /// Forward the write and add the number of bytes accepted to the tally.
    /// A failed write leaves the tally untouched.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        match self.inner.write(buf) {
            Ok(accepted) => {
                self.count += accepted as u64;
                Ok(accepted)
            }
            Err(err) => Err(err),
        }
    }

    /// Flushing is passed straight through; it does not affect the tally.
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}
1472
1473// ---------------------------------------------------------------------------
1474// Tests
1475// ---------------------------------------------------------------------------
1476
1477#[cfg(test)]
1478mod tests {
1479    use super::*;
1480    use crate::category::Category;
1481    use crate::generator::HmacGenerator;
1482    use crate::processor::profile::{FieldRule, FileTypeProfile};
1483    use crate::processor::registry::ProcessorRegistry;
1484    use crate::scanner::{ScanConfig, ScanPattern};
1485    use std::io::Cursor;
1486    use std::sync::Mutex;
1487
1488    /// Build a test archive processor with an email pattern and a JSON profile.
1489    fn make_archive_processor() -> ArchiveProcessor {
1490        let gen = Arc::new(HmacGenerator::new([42u8; 32]));
1491        let store = Arc::new(MappingStore::new(gen, None));
1492
1493        let patterns = vec![
1494            ScanPattern::from_regex(
1495                r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
1496                Category::Email,
1497                "email",
1498            )
1499            .unwrap(),
1500            ScanPattern::from_literal("SUPERSECRET", Category::Custom("api_key".into()), "api_key")
1501                .unwrap(),
1502        ];
1503
1504        let scanner = Arc::new(
1505            StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
1506        );
1507
1508        let registry = Arc::new(ProcessorRegistry::with_builtins());
1509
1510        let profiles = vec![FileTypeProfile::new(
1511            "json",
1512            vec![FieldRule::new("*").with_category(Category::Custom("field".into()))],
1513        )
1514        .with_extension(".json")];
1515
1516        ArchiveProcessor::new(registry, scanner, store, profiles)
1517    }
1518
1519    // -- Tar tests ----------------------------------------------------------
1520
1521    fn build_test_tar(entries: &[(&str, &[u8])]) -> Vec<u8> {
1522        let mut buf = Vec::new();
1523        {
1524            let mut builder = tar::Builder::new(&mut buf);
1525            for (name, data) in entries {
1526                let mut header = tar::Header::new_gnu();
1527                header.set_size(data.len() as u64);
1528                header.set_mode(0o644);
1529                header.set_mtime(1_700_000_000);
1530                header.set_cksum();
1531                builder.append_data(&mut header, *name, *data).unwrap();
1532            }
1533            builder.finish().unwrap();
1534        }
1535        buf
1536    }
1537
1538    #[test]
1539    fn tar_sanitizes_plaintext_with_scanner() {
1540        let proc = make_archive_processor();
1541        let input = build_test_tar(&[("readme.txt", b"Contact alice@corp.com for help.")]);
1542
1543        let mut output = Vec::new();
1544        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1545
1546        assert_eq!(stats.files_processed, 1);
1547        assert_eq!(stats.scanner_fallback, 1);
1548        assert_eq!(stats.structured_hits, 0);
1549
1550        // Verify the output is a valid tar and the secret is gone.
1551        let mut archive = tar::Archive::new(&output[..]);
1552        for entry in archive.entries().unwrap() {
1553            let mut e = entry.unwrap();
1554            let mut content = String::new();
1555            e.read_to_string(&mut content).unwrap();
1556            assert!(
1557                !content.contains("alice@corp.com"),
1558                "email should be sanitized: {content}"
1559            );
1560        }
1561    }
1562
1563    #[test]
1564    fn tar_sanitizes_json_with_structured_processor() {
1565        let proc = make_archive_processor();
1566        let json_content = br#"{"email": "bob@example.org", "name": "Bob"}"#;
1567        let input = build_test_tar(&[("config.json", json_content)]);
1568
1569        let mut output = Vec::new();
1570        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1571
1572        assert_eq!(stats.files_processed, 1);
1573        assert_eq!(stats.structured_hits, 1);
1574        assert_eq!(stats.scanner_fallback, 0);
1575        assert_eq!(
1576            stats.file_methods.get("config.json").unwrap(),
1577            "structured+scan:json"
1578        );
1579
1580        // Verify sanitized output.
1581        let mut archive = tar::Archive::new(&output[..]);
1582        for entry in archive.entries().unwrap() {
1583            let mut e = entry.unwrap();
1584            let mut content = String::new();
1585            e.read_to_string(&mut content).unwrap();
1586            assert!(
1587                !content.contains("bob@example.org"),
1588                "email should be sanitized"
1589            );
1590            assert!(!content.contains("Bob"), "name should be sanitized");
1591        }
1592    }
1593
1594    #[test]
1595    fn tar_preserves_metadata() {
1596        let proc = make_archive_processor();
1597        let input = build_test_tar(&[("data.txt", b"SUPERSECRET token here")]);
1598
1599        let mut output = Vec::new();
1600        proc.process_tar(&input[..], &mut output).unwrap();
1601
1602        let mut archive = tar::Archive::new(&output[..]);
1603        for entry in archive.entries().unwrap() {
1604            let e = entry.unwrap();
1605            let hdr = e.header();
1606            assert_eq!(hdr.mode().unwrap(), 0o644);
1607            assert_eq!(hdr.mtime().unwrap(), 1_700_000_000);
1608        }
1609    }
1610
1611    #[test]
1612    fn tar_handles_multiple_files() {
1613        let proc = make_archive_processor();
1614        let input = build_test_tar(&[
1615            ("a.txt", b"alice@corp.com"),
1616            ("b.json", br#"{"key":"value"}"#),
1617            ("c.log", b"no secrets here"),
1618        ]);
1619
1620        let mut output = Vec::new();
1621        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1622
1623        assert_eq!(stats.files_processed, 3);
1624        // b.json matched the JSON profile
1625        assert_eq!(stats.structured_hits, 1);
1626        // a.txt and c.log fall back to scanner
1627        assert_eq!(stats.scanner_fallback, 2);
1628    }
1629
1630    #[test]
1631    fn tar_passes_through_directories() {
1632        let mut buf = Vec::new();
1633        {
1634            let mut builder = tar::Builder::new(&mut buf);
1635
1636            // Add a directory entry.
1637            let mut dir_header = tar::Header::new_gnu();
1638            dir_header.set_entry_type(tar::EntryType::Directory);
1639            dir_header.set_size(0);
1640            dir_header.set_mode(0o755);
1641            dir_header.set_cksum();
1642            builder
1643                .append_data(&mut dir_header, "mydir/", &b""[..])
1644                .unwrap();
1645
1646            // Add a file.
1647            let mut file_header = tar::Header::new_gnu();
1648            file_header.set_size(5);
1649            file_header.set_mode(0o644);
1650            file_header.set_cksum();
1651            builder
1652                .append_data(&mut file_header, "mydir/hello.txt", &b"hello"[..])
1653                .unwrap();
1654
1655            builder.finish().unwrap();
1656        }
1657
1658        let proc = make_archive_processor();
1659        let mut output = Vec::new();
1660        let stats = proc.process_tar(&buf[..], &mut output).unwrap();
1661
1662        assert_eq!(stats.entries_skipped, 1);
1663        assert_eq!(stats.files_processed, 1);
1664    }
1665
1666    // -- Tar.gz tests -------------------------------------------------------
1667
1668    #[test]
1669    fn tar_gz_round_trip() {
1670        let proc = make_archive_processor();
1671
1672        // Build a tar and gzip it.
1673        let tar_data = build_test_tar(&[("secret.txt", b"Key is SUPERSECRET okay")]);
1674        let mut gz_input = Vec::new();
1675        {
1676            let mut encoder =
1677                flate2::write::GzEncoder::new(&mut gz_input, flate2::Compression::fast());
1678            encoder.write_all(&tar_data).unwrap();
1679            encoder.finish().unwrap();
1680        }
1681
1682        let mut gz_output = Vec::new();
1683        let stats = proc.process_tar_gz(&gz_input[..], &mut gz_output).unwrap();
1684
1685        assert_eq!(stats.files_processed, 1);
1686        assert_eq!(stats.scanner_fallback, 1);
1687
1688        // Decompress and verify.
1689        let decoder = flate2::read::GzDecoder::new(&gz_output[..]);
1690        let mut archive = tar::Archive::new(decoder);
1691        for entry in archive.entries().unwrap() {
1692            let mut e = entry.unwrap();
1693            let mut content = String::new();
1694            e.read_to_string(&mut content).unwrap();
1695            assert!(
1696                !content.contains("SUPERSECRET"),
1697                "secret should be sanitized: {content}"
1698            );
1699        }
1700    }
1701
1702    // -- Zip tests ----------------------------------------------------------
1703
1704    fn build_test_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
1705        let mut buf = Cursor::new(Vec::new());
1706        {
1707            let mut zip = zip::ZipWriter::new(&mut buf);
1708            for (name, data) in entries {
1709                let options = zip::write::SimpleFileOptions::default()
1710                    .compression_method(zip::CompressionMethod::Deflated);
1711                zip.start_file(*name, options).unwrap();
1712                zip.write_all(data).unwrap();
1713            }
1714            zip.finish().unwrap();
1715        }
1716        buf.into_inner()
1717    }
1718
1719    #[test]
1720    fn zip_sanitizes_plaintext_with_scanner() {
1721        let proc = make_archive_processor();
1722        let zip_data = build_test_zip(&[("notes.txt", b"Reach alice@corp.com for info.")]);
1723
1724        let reader = Cursor::new(&zip_data);
1725        let mut writer = Cursor::new(Vec::new());
1726        let stats = proc.process_zip(reader, &mut writer).unwrap();
1727
1728        assert_eq!(stats.files_processed, 1);
1729        assert_eq!(stats.scanner_fallback, 1);
1730
1731        // Verify the output zip.
1732        let out_data = writer.into_inner();
1733        let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1734        let mut entry = zip_out.by_index(0).unwrap();
1735        let mut content = String::new();
1736        entry.read_to_string(&mut content).unwrap();
1737        assert!(
1738            !content.contains("alice@corp.com"),
1739            "email should be sanitized: {content}"
1740        );
1741    }
1742
1743    #[test]
1744    fn zip_sanitizes_json_with_structured_processor() {
1745        let proc = make_archive_processor();
1746        let json_content = br#"{"password": "hunter2", "host": "db.internal"}"#;
1747        let zip_data = build_test_zip(&[("settings.json", json_content)]);
1748
1749        let reader = Cursor::new(&zip_data);
1750        let mut writer = Cursor::new(Vec::new());
1751        let stats = proc.process_zip(reader, &mut writer).unwrap();
1752
1753        assert_eq!(stats.files_processed, 1);
1754        assert_eq!(stats.structured_hits, 1);
1755
1756        let out_data = writer.into_inner();
1757        let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1758        let mut entry = zip_out.by_index(0).unwrap();
1759        let mut content = String::new();
1760        entry.read_to_string(&mut content).unwrap();
1761        assert!(!content.contains("hunter2"), "password should be sanitized");
1762        assert!(!content.contains("db.internal"), "host should be sanitized");
1763    }
1764
1765    #[test]
1766    fn zip_preserves_directory_entries() {
1767        let mut buf = Cursor::new(Vec::new());
1768        {
1769            let mut zip = zip::ZipWriter::new(&mut buf);
1770
1771            let dir_options = zip::write::SimpleFileOptions::default();
1772            zip.add_directory("subdir/", dir_options).unwrap();
1773
1774            let file_options = zip::write::SimpleFileOptions::default()
1775                .compression_method(zip::CompressionMethod::Stored);
1776            zip.start_file("subdir/data.txt", file_options).unwrap();
1777            zip.write_all(b"SUPERSECRET value").unwrap();
1778
1779            zip.finish().unwrap();
1780        }
1781
1782        let zip_data = buf.into_inner();
1783        let proc = make_archive_processor();
1784        let reader = Cursor::new(&zip_data);
1785        let mut writer = Cursor::new(Vec::new());
1786        let stats = proc.process_zip(reader, &mut writer).unwrap();
1787
1788        assert_eq!(stats.entries_skipped, 1); // directory
1789        assert_eq!(stats.files_processed, 1);
1790    }
1791
1792    #[test]
1793    fn zip_handles_multiple_files() {
1794        let proc = make_archive_processor();
1795        let zip_data = build_test_zip(&[
1796            ("file1.txt", b"alice@corp.com"),
1797            ("file2.json", br#"{"secret":"SUPERSECRET"}"#),
1798            ("file3.log", b"nothing to see"),
1799        ]);
1800
1801        let reader = Cursor::new(&zip_data);
1802        let mut writer = Cursor::new(Vec::new());
1803        let stats = proc.process_zip(reader, &mut writer).unwrap();
1804
1805        assert_eq!(stats.files_processed, 3);
1806        assert_eq!(stats.structured_hits, 1); // JSON
1807        assert_eq!(stats.scanner_fallback, 2); // .txt + .log
1808    }
1809
1810    #[test]
1811    fn tar_progress_callback_receives_updates() {
1812        let updates = Arc::new(Mutex::new(Vec::new()));
1813        let proc = make_archive_processor().with_progress_callback({
1814            let updates = Arc::clone(&updates);
1815            Arc::new(move |progress| {
1816                updates
1817                    .lock()
1818                    .expect("archive progress lock")
1819                    .push(progress.clone());
1820            })
1821        });
1822        let input = build_test_tar(&[("a.txt", b"alice@corp.com"), ("b.txt", b"SUPERSECRET")]);
1823
1824        let mut output = Vec::new();
1825        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1826        let updates = updates.lock().unwrap();
1827
1828        assert_eq!(updates.len(), 2);
1829        assert_eq!(updates.last().unwrap().entries_seen, 2);
1830        assert_eq!(
1831            updates.last().unwrap().files_processed,
1832            stats.files_processed
1833        );
1834        assert_eq!(updates.last().unwrap().total_entries, None);
1835    }
1836
1837    #[test]
1838    fn zip_progress_callback_reports_total_entries() {
1839        let updates = Arc::new(Mutex::new(Vec::new()));
1840        let proc = make_archive_processor().with_progress_callback({
1841            let updates = Arc::clone(&updates);
1842            Arc::new(move |progress| {
1843                updates
1844                    .lock()
1845                    .expect("archive progress lock")
1846                    .push(progress.clone());
1847            })
1848        });
1849        let zip_data = build_test_zip(&[
1850            ("file1.txt", b"alice@corp.com"),
1851            ("file2.log", b"nothing to see"),
1852        ]);
1853
1854        let reader = Cursor::new(&zip_data);
1855        let mut writer = Cursor::new(Vec::new());
1856        let stats = proc.process_zip(reader, &mut writer).unwrap();
1857        let updates = updates.lock().unwrap();
1858
1859        assert_eq!(updates.len(), 2);
1860        assert_eq!(
1861            updates.last().unwrap().files_processed,
1862            stats.files_processed
1863        );
1864        assert_eq!(updates.last().unwrap().total_entries, Some(2));
1865        assert_eq!(updates.last().unwrap().current_entry, "file2.log");
1866    }
1867
1868    // -- Format detection tests ---------------------------------------------
1869
1870    #[test]
1871    fn format_detection_from_path() {
1872        assert_eq!(
1873            ArchiveFormat::from_path("data.tar"),
1874            Some(ArchiveFormat::Tar)
1875        );
1876        assert_eq!(
1877            ArchiveFormat::from_path("data.tar.gz"),
1878            Some(ArchiveFormat::TarGz)
1879        );
1880        assert_eq!(
1881            ArchiveFormat::from_path("data.tgz"),
1882            Some(ArchiveFormat::TarGz)
1883        );
1884        assert_eq!(
1885            ArchiveFormat::from_path("data.zip"),
1886            Some(ArchiveFormat::Zip)
1887        );
1888        assert_eq!(
1889            ArchiveFormat::from_path("DATA.ZIP"),
1890            Some(ArchiveFormat::Zip)
1891        );
1892        assert_eq!(ArchiveFormat::from_path("photo.png"), None);
1893    }
1894
1895    // -- Determinism / dedup tests ------------------------------------------
1896
1897    #[test]
1898    fn same_secret_gets_same_replacement_across_entries() {
1899        let proc = make_archive_processor();
1900        let input = build_test_tar(&[
1901            ("a.txt", b"contact alice@corp.com"),
1902            ("b.txt", b"reach alice@corp.com"),
1903        ]);
1904
1905        let mut output = Vec::new();
1906        proc.process_tar(&input[..], &mut output).unwrap();
1907
1908        let mut archive = tar::Archive::new(&output[..]);
1909        let mut contents: Vec<String> = Vec::new();
1910        for entry in archive.entries().unwrap() {
1911            let mut e = entry.unwrap();
1912            let mut s = String::new();
1913            e.read_to_string(&mut s).unwrap();
1914            contents.push(s);
1915        }
1916
1917        // Both files should have the *same* replacement for alice@corp.com.
1918        // Extract the replacement by removing the prefix.
1919        let replacement_a = contents[0].strip_prefix("contact ").unwrap();
1920        let replacement_b = contents[1].strip_prefix("reach ").unwrap();
1921        assert_eq!(
1922            replacement_a, replacement_b,
1923            "dedup should produce identical replacements"
1924        );
1925        assert!(!replacement_a.contains("alice@corp.com"));
1926    }
1927
1928    // -- Auto-dispatch test -------------------------------------------------
1929
1930    #[test]
1931    fn process_auto_dispatch_tar() {
1932        let proc = make_archive_processor();
1933        let tar_data = build_test_tar(&[("f.txt", b"SUPERSECRET")]);
1934
1935        let reader = Cursor::new(tar_data);
1936        let writer = Cursor::new(Vec::new());
1937        let stats = proc.process(reader, writer, ArchiveFormat::Tar).unwrap();
1938
1939        assert_eq!(stats.files_processed, 1);
1940    }
1941
1942    #[test]
1943    fn process_auto_dispatch_zip() {
1944        let proc = make_archive_processor();
1945        let zip_data = build_test_zip(&[("f.txt", b"SUPERSECRET")]);
1946
1947        let reader = Cursor::new(zip_data);
1948        let mut writer = Cursor::new(Vec::new());
1949        let stats = proc
1950            .process(reader, &mut writer, ArchiveFormat::Zip)
1951            .unwrap();
1952
1953        assert_eq!(stats.files_processed, 1);
1954    }
1955
1956    // -- Empty archive tests ------------------------------------------------
1957
1958    #[test]
1959    fn tar_empty_archive() {
1960        let proc = make_archive_processor();
1961        let tar_data = build_test_tar(&[]);
1962
1963        let mut output = Vec::new();
1964        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
1965
1966        assert_eq!(stats.files_processed, 0);
1967        assert_eq!(stats.entries_skipped, 0);
1968    }
1969
1970    #[test]
1971    fn zip_empty_archive() {
1972        let proc = make_archive_processor();
1973        let zip_data = build_test_zip(&[]);
1974
1975        let reader = Cursor::new(zip_data);
1976        let mut writer = Cursor::new(Vec::new());
1977        let stats = proc.process_zip(reader, &mut writer).unwrap();
1978
1979        assert_eq!(stats.files_processed, 0);
1980    }
1981
    // -- sanitize_zip_entry_name tests --------------------------------------
1983
1984    #[test]
1985    fn zip_entry_name_clean_passthrough() {
1986        assert_eq!(sanitize_zip_entry_name("logs/app.log"), "logs/app.log");
1987        assert_eq!(sanitize_zip_entry_name("config.yaml"), "config.yaml");
1988        assert_eq!(sanitize_zip_entry_name("a/b/c.txt"), "a/b/c.txt");
1989    }
1990
1991    #[test]
1992    fn zip_entry_name_strips_leading_slash() {
1993        assert_eq!(sanitize_zip_entry_name("/etc/passwd"), "etc/passwd");
1994        assert_eq!(sanitize_zip_entry_name("///etc/passwd"), "etc/passwd");
1995    }
1996
1997    #[test]
1998    fn zip_entry_name_strips_dotdot() {
1999        assert_eq!(sanitize_zip_entry_name("../etc/passwd"), "etc/passwd");
2000        assert_eq!(
2001            sanitize_zip_entry_name("a/../../etc/passwd"),
2002            "a/etc/passwd"
2003        );
2004        assert_eq!(
2005            sanitize_zip_entry_name("../../root/.ssh/id_rsa"),
2006            "root/.ssh/id_rsa"
2007        );
2008    }
2009
2010    #[test]
2011    fn zip_entry_name_strips_leading_dot_slash() {
2012        assert_eq!(sanitize_zip_entry_name("./config.yaml"), "config.yaml");
2013        assert_eq!(sanitize_zip_entry_name("././config.yaml"), "config.yaml");
2014    }
2015
2016    #[test]
2017    fn zip_entry_name_backslash_normalised() {
2018        assert_eq!(sanitize_zip_entry_name("a\\b\\c.txt"), "a/b/c.txt");
2019        assert_eq!(sanitize_zip_entry_name("..\\etc\\passwd"), "etc/passwd");
2020    }
2021
2022    #[test]
2023    fn zip_entry_name_empty_result_replaced() {
2024        assert_eq!(sanitize_zip_entry_name("../.."), "_");
2025        assert_eq!(sanitize_zip_entry_name(""), "_");
2026        assert_eq!(sanitize_zip_entry_name("/"), "_");
2027    }
2028
2029    #[test]
2030    fn zip_entry_name_absolute_dotdot_combo() {
2031        assert_eq!(sanitize_zip_entry_name("/../etc/passwd"), "etc/passwd");
2032    }
2033
2034    // -- ArchiveFilter tests ------------------------------------------------
2035
2036    #[test]
2037    fn filter_empty_passes_everything() {
2038        let f = ArchiveFilter::new(vec![], vec![]).unwrap();
2039        assert!(f.is_empty());
2040        assert!(f.passes("config/app.yaml"));
2041        assert!(f.passes("logs/server.log"));
2042    }
2043
2044    #[test]
2045    fn filter_only_glob_includes_match() {
2046        let f = ArchiveFilter::new(vec!["**/*.json".into()], vec![]).unwrap();
2047        assert!(!f.is_empty());
2048        assert!(f.passes("config/settings.json"));
2049        assert!(f.passes("deep/nested/file.json"));
2050        assert!(!f.passes("config/settings.yaml"));
2051    }
2052
2053    #[test]
2054    fn filter_only_dir_prefix_includes_subtree() {
2055        let f = ArchiveFilter::new(vec!["config/".into()], vec![]).unwrap();
2056        assert!(f.passes("config/app.yaml"));
2057        assert!(f.passes("config/nested/db.yaml"));
2058        assert!(!f.passes("logs/server.log"));
2059    }
2060
2061    #[test]
2062    fn filter_dir_prefix_exact_match() {
2063        let f = ArchiveFilter::new(vec!["config/".into()], vec![]).unwrap();
2064        // Exact prefix without trailing separator should also match.
2065        assert!(f.passes("config"));
2066    }
2067
2068    #[test]
2069    fn filter_exclude_removes_match() {
2070        let f = ArchiveFilter::new(vec![], vec!["**/*.log".into()]).unwrap();
2071        assert!(!f.passes("logs/server.log"));
2072        assert!(f.passes("config/app.yaml"));
2073    }
2074
2075    #[test]
2076    fn filter_only_and_exclude_combined() {
2077        let f =
2078            ArchiveFilter::new(vec!["config/".into()], vec!["config/secrets.yaml".into()]).unwrap();
2079        assert!(f.passes("config/app.yaml"));
2080        assert!(!f.passes("config/secrets.yaml"));
2081        assert!(!f.passes("logs/server.log"));
2082    }
2083
2084    #[test]
2085    fn filter_invalid_glob_returns_error() {
2086        assert!(ArchiveFilter::new(vec!["[invalid".into()], vec![]).is_err());
2087        assert!(ArchiveFilter::new(vec![], vec!["[bad".into()]).is_err());
2088    }
2089
2090    // -- ArchiveProcessor builder methods -----------------------------------
2091
2092    #[test]
2093    fn builder_with_max_depth_clamps_at_max() {
2094        let proc = make_archive_processor().with_max_depth(999);
2095        assert_eq!(proc.max_depth, MAX_ARCHIVE_DEPTH);
2096    }
2097
2098    #[test]
2099    fn builder_with_max_depth_sets_value() {
2100        let proc = make_archive_processor().with_max_depth(2);
2101        assert_eq!(proc.max_depth, 2);
2102    }
2103
2104    #[test]
2105    fn builder_with_parallel_threshold_sets_value() {
2106        let proc = make_archive_processor().with_parallel_threshold(usize::MAX);
2107        assert_eq!(proc.parallel_threshold, usize::MAX);
2108    }
2109
2110    #[test]
2111    fn builder_with_force_text_enables_flag() {
2112        let proc = make_archive_processor().with_force_text(true);
2113        assert!(proc.force_text);
2114    }
2115
2116    #[test]
2117    fn builder_with_filter_applied_to_zip() {
2118        let proc = make_archive_processor()
2119            .with_filter(ArchiveFilter::new(vec!["**/*.json".into()], vec![]).unwrap());
2120
2121        let zip_data = build_test_zip(&[
2122            ("config.json", br#"{"email":"alice@corp.com"}"#),
2123            ("notes.txt", b"alice@corp.com"),
2124        ]);
2125
2126        let reader = Cursor::new(zip_data);
2127        let mut writer = Cursor::new(Vec::new());
2128        let stats = proc.process_zip(reader, &mut writer).unwrap();
2129
2130        // notes.txt is excluded by the filter — only config.json processed.
2131        assert_eq!(stats.files_processed, 1);
2132        assert_eq!(stats.entries_filtered, 1);
2133    }
2134
2135    #[test]
2136    fn builder_with_filter_applied_to_tar() {
2137        let proc = make_archive_processor()
2138            .with_filter(ArchiveFilter::new(vec!["**/*.json".into()], vec![]).unwrap());
2139
2140        let tar_data = build_test_tar(&[
2141            ("config.json", br#"{"email":"alice@corp.com"}"#),
2142            ("notes.txt", b"alice@corp.com"),
2143        ]);
2144
2145        let mut output = Vec::new();
2146        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
2147
2148        assert_eq!(stats.files_processed, 1);
2149        assert_eq!(stats.entries_filtered, 1);
2150    }
2151
2152    // -- Parallel path tests ------------------------------------------------
2153
2154    #[test]
2155    fn parallel_tar_sanitizes_all_entries() {
2156        // parallel_threshold(0) forces parallel execution regardless of entry count.
2157        let proc = make_archive_processor().with_parallel_threshold(0);
2158        let tar_data = build_test_tar(&[
2159            ("a.txt", b"alice@corp.com"),
2160            ("b.txt", b"bob@corp.com"),
2161            ("c.txt", b"carol@corp.com"),
2162            ("d.txt", b"dave@corp.com"),
2163        ]);
2164
2165        let mut output = Vec::new();
2166        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
2167
2168        assert_eq!(stats.files_processed, 4);
2169
2170        // Verify originals are gone (domain is preserved by email strategy, full addresses must not appear).
2171        let originals = [
2172            "alice@corp.com",
2173            "bob@corp.com",
2174            "carol@corp.com",
2175            "dave@corp.com",
2176        ];
2177        let mut archive = tar::Archive::new(&output[..]);
2178        for entry in archive.entries().unwrap() {
2179            let mut e = entry.unwrap();
2180            let mut content = String::new();
2181            e.read_to_string(&mut content).unwrap();
2182            for orig in &originals {
2183                assert!(
2184                    !content.contains(orig),
2185                    "original secret leaked in {:?}",
2186                    e.path()
2187                );
2188            }
2189        }
2190    }
2191
2192    #[test]
2193    fn parallel_tar_preserves_entry_order() {
2194        let proc = make_archive_processor().with_parallel_threshold(0);
2195        let tar_data = build_test_tar(&[
2196            ("first.txt", b"alice@corp.com"),
2197            ("second.txt", b"hello"),
2198            ("third.txt", b"bob@corp.com"),
2199        ]);
2200
2201        let mut output = Vec::new();
2202        proc.process_tar(&tar_data[..], &mut output).unwrap();
2203
2204        let mut archive = tar::Archive::new(&output[..]);
2205        let names: Vec<String> = archive
2206            .entries()
2207            .unwrap()
2208            .map(|e| e.unwrap().path().unwrap().to_string_lossy().to_string())
2209            .collect();
2210
2211        assert_eq!(names, vec!["first.txt", "second.txt", "third.txt"]);
2212    }
2213
2214    #[test]
2215    fn parallel_zip_sanitizes_all_entries() {
2216        let proc = make_archive_processor().with_parallel_threshold(0);
2217        let zip_data = build_test_zip(&[
2218            ("a.txt", b"alice@corp.com"),
2219            ("b.txt", b"bob@corp.com"),
2220            ("c.txt", b"carol@corp.com"),
2221            ("d.txt", b"dave@corp.com"),
2222        ]);
2223
2224        let reader = Cursor::new(zip_data);
2225        let mut writer = Cursor::new(Vec::new());
2226        let stats = proc.process_zip(reader, &mut writer).unwrap();
2227
2228        assert_eq!(stats.files_processed, 4);
2229
2230        let originals = [
2231            "alice@corp.com",
2232            "bob@corp.com",
2233            "carol@corp.com",
2234            "dave@corp.com",
2235        ];
2236        let out_data = writer.into_inner();
2237        let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
2238        for i in 0..zip_out.len() {
2239            let mut entry = zip_out.by_index(i).unwrap();
2240            let mut content = String::new();
2241            entry.read_to_string(&mut content).unwrap();
2242            for orig in &originals {
2243                assert!(
2244                    !content.contains(orig),
2245                    "original secret leaked in entry {i}"
2246                );
2247            }
2248        }
2249    }
2250
2251    #[test]
2252    fn parallel_tar_mixed_structured_and_scanner() {
2253        let proc = make_archive_processor().with_parallel_threshold(0);
2254        let tar_data = build_test_tar(&[
2255            ("config.json", br#"{"email":"alice@corp.com","port":5432}"#),
2256            ("notes.txt", b"contact bob@corp.com for help"),
2257            ("data.json", br#"{"email":"carol@corp.com"}"#),
2258            ("readme.txt", b"dave@corp.com is the owner"),
2259        ]);
2260
2261        let mut output = Vec::new();
2262        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
2263
2264        assert_eq!(stats.files_processed, 4);
2265        assert_eq!(stats.structured_hits, 2); // two JSON files
2266        assert_eq!(stats.scanner_fallback, 2); // two plain text files
2267
2268        let originals = [
2269            "alice@corp.com",
2270            "bob@corp.com",
2271            "carol@corp.com",
2272            "dave@corp.com",
2273        ];
2274        let mut archive = tar::Archive::new(&output[..]);
2275        for entry in archive.entries().unwrap() {
2276            let mut e = entry.unwrap();
2277            let mut content = String::new();
2278            e.read_to_string(&mut content).unwrap();
2279            for orig in &originals {
2280                assert!(!content.contains(orig), "original secret leaked");
2281            }
2282        }
2283    }
2284
2285    // -- Nested archive tests -----------------------------------------------
2286
2287    #[test]
2288    fn tar_in_tar_secrets_sanitized() {
2289        // Build inner tar with a secret.
2290        let inner_tar = build_test_tar(&[("inner.txt", b"alice@corp.com")]);
2291
2292        // Embed the inner tar as an entry in the outer tar.
2293        let outer_tar = build_test_tar(&[("nested.tar", &inner_tar)]);
2294
2295        let proc = make_archive_processor();
2296        let mut output = Vec::new();
2297        let stats = proc.process_tar(&outer_tar[..], &mut output).unwrap();
2298
2299        assert_eq!(stats.nested_archives, 1);
2300
2301        // Unpack the outer tar and read the inner tar's content.
2302        let mut outer = tar::Archive::new(&output[..]);
2303        for entry in outer.entries().unwrap() {
2304            let mut e = entry.unwrap();
2305            let mut inner_bytes = Vec::new();
2306            e.read_to_end(&mut inner_bytes).unwrap();
2307            let mut inner = tar::Archive::new(&inner_bytes[..]);
2308            for inner_entry in inner.entries().unwrap() {
2309                let mut ie = inner_entry.unwrap();
2310                let mut content = String::new();
2311                ie.read_to_string(&mut content).unwrap();
2312                assert!(
2313                    !content.contains("alice@corp.com"),
2314                    "secret survived nested tar"
2315                );
2316            }
2317        }
2318    }
2319
2320    #[test]
2321    fn zip_in_tar_secrets_sanitized() {
2322        let inner_zip = build_test_zip(&[("inner.txt", b"SUPERSECRET")]);
2323        let outer_tar = build_test_tar(&[("nested.zip", &inner_zip)]);
2324
2325        let proc = make_archive_processor();
2326        let mut output = Vec::new();
2327        let stats = proc.process_tar(&outer_tar[..], &mut output).unwrap();
2328
2329        assert_eq!(stats.nested_archives, 1);
2330
2331        let mut outer = tar::Archive::new(&output[..]);
2332        for entry in outer.entries().unwrap() {
2333            let mut e = entry.unwrap();
2334            let mut zip_bytes = Vec::new();
2335            e.read_to_end(&mut zip_bytes).unwrap();
2336            let mut zip_out = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap();
2337            for i in 0..zip_out.len() {
2338                let mut ze = zip_out.by_index(i).unwrap();
2339                let mut content = String::new();
2340                ze.read_to_string(&mut content).unwrap();
2341                assert!(
2342                    !content.contains("SUPERSECRET"),
2343                    "secret survived zip-in-tar"
2344                );
2345            }
2346        }
2347    }
2348
2349    #[test]
2350    fn zip_in_zip_secrets_sanitized() {
2351        let inner_zip = build_test_zip(&[("secret.txt", b"alice@corp.com")]);
2352        let outer_zip = build_test_zip(&[("nested.zip", &inner_zip)]);
2353
2354        let proc = make_archive_processor();
2355        let reader = Cursor::new(outer_zip);
2356        let mut writer = Cursor::new(Vec::new());
2357        let stats = proc.process_zip(reader, &mut writer).unwrap();
2358
2359        assert_eq!(stats.nested_archives, 1);
2360
2361        let out_bytes = writer.into_inner();
2362        let mut outer = zip::ZipArchive::new(Cursor::new(out_bytes)).unwrap();
2363        let mut inner_bytes = Vec::new();
2364        outer
2365            .by_index(0)
2366            .unwrap()
2367            .read_to_end(&mut inner_bytes)
2368            .unwrap();
2369        let mut inner = zip::ZipArchive::new(Cursor::new(inner_bytes)).unwrap();
2370        let mut content = String::new();
2371        inner
2372            .by_index(0)
2373            .unwrap()
2374            .read_to_string(&mut content)
2375            .unwrap();
2376        assert!(
2377            !content.contains("alice@corp.com"),
2378            "secret survived zip-in-zip"
2379        );
2380    }
2381
2382    #[test]
2383    fn nested_archive_depth_limit_returns_error() {
2384        // Build an archive nested max_depth + 1 levels deep.
2385        // Default max_depth is DEFAULT_ARCHIVE_DEPTH (5); use a proc with depth=1.
2386        let proc = make_archive_processor().with_max_depth(1);
2387
2388        let innermost = build_test_tar(&[("file.txt", b"secret")]);
2389        let middle = build_test_tar(&[("inner.tar", &innermost)]);
2390        let outer = build_test_tar(&[("middle.tar", &middle)]);
2391
2392        let mut output = Vec::new();
2393        let err = proc.process_tar(&outer[..], &mut output).unwrap_err();
2394        assert!(matches!(err, SanitizeError::RecursionDepthExceeded(_)));
2395    }
2396
2397    #[test]
2398    fn force_text_skips_structured_processor() {
2399        let proc = make_archive_processor().with_force_text(true);
2400        let tar_data = build_test_tar(&[("config.json", br#"{"email":"alice@corp.com"}"#)]);
2401
2402        let mut output = Vec::new();
2403        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
2404
2405        // With force_text, JSON is scanned as plain text — no structured hit.
2406        assert_eq!(stats.scanner_fallback, 1);
2407        assert_eq!(stats.structured_hits, 0);
2408    }
2409}
sanitize_engine/processor/archive.rs

sanitize_engine/processor/
archive.rs