sanitize_engine/processor/
archive.rs

1//! Archive processor for sanitizing files inside `.zip`, `.tar`, and `.tar.gz` archives.
2//!
3//! # Architecture
4//!
5//! ```text
6//! ┌───────────────────────┐
7//! │  Archive (zip/tar/gz) │
8//! └────────┬──────────────┘
9//!          │  for each entry
10//!          ▼
11//! ┌─────────────────────────────────────────────┐
12//! │  1. Match entry filename → FileTypeProfile  │
13//! │  2. Try ProcessorRegistry (structured)      │
14//! │  3. Fallback: StreamScanner (streaming)     │
15//! └────────┬────────────────────────────────────┘
16//!          │  sanitized bytes
17//!          ▼
18//! ┌───────────────────────┐
//! │  Rebuilt archive      │
20//! │  (same format, meta   │
21//! │   preserved)          │
22//! └───────────────────────┘
23//! ```
24//!
25//! # Memory Efficiency
26//!
27//! Archives are processed **entry-by-entry**. Each entry is piped
28//! through either a structured processor (which must buffer the full
29//! entry) or the [`StreamScanner`]
30//! (which processes in configurable chunks). This means the maximum
31//! memory footprint is proportional to the largest *single entry*
32//! that uses a structured processor. Files without a profile match
33//! are streamed through the scanner without buffering the whole entry.
34//!
35//! For very large individual files inside archives, the streaming
36//! scanner path keeps only `chunk_size + overlap_size` bytes in memory.
37//!
38//! # Thread Safety
39//!
40//! [`ArchiveProcessor`] is `Send + Sync`. The underlying
41//! [`MappingStore`] provides lock-free
42//! reads for dedup consistency.
43//!
44//! # Metadata Preservation
45//!
46//! - **Tar**: modification time, permissions (mode), uid/gid, and
47//!   username/groupname are copied from the source entry.
48//! - **Zip**: modification time, compression method, and unix
49//!   permissions are preserved.
50//! - Symlinks, directories, and other non-regular entries are passed
51//!   through unchanged.
52
53use crate::error::{Result, SanitizeError};
54use crate::processor::profile::FileTypeProfile;
55use crate::processor::registry::ProcessorRegistry;
56use crate::scanner::{ScanStats, StreamScanner};
57use crate::store::MappingStore;
58
/// Normalise an archive entry path so it cannot point outside the output root.
///
/// Both `/` and `\` are treated as separators (Windows-style zip entries use
/// the latter). Empty components, `.` and `..` are discarded — which also
/// removes any leading `/` — and the surviving components are re-joined with
/// `/`. When nothing survives, `"_"` is returned so the entry never ends up
/// with a blank name.
fn sanitize_archive_entry_name(name: &str) -> String {
    let mut cleaned = String::with_capacity(name.len());

    for component in name.split(|c: char| matches!(c, '/' | '\\')) {
        // Drop separators-only runs, current-dir markers and upward traversal.
        if matches!(component, "" | "." | "..") {
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push('/');
        }
        cleaned.push_str(component);
    }

    if cleaned.is_empty() {
        cleaned.push('_');
    }
    cleaned
}
79
/// Sanitize the name of a zip entry.
///
/// Thin alias for [`sanitize_archive_entry_name`]; it applies exactly the
/// same rules and exists so zip call sites read explicitly.
#[inline]
fn sanitize_zip_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
84
/// Sanitize the name of a tar entry.
///
/// Thin alias for [`sanitize_archive_entry_name`]; it applies exactly the
/// same rules and exists so tar call sites read explicitly.
#[inline]
fn sanitize_tar_entry_name(name: &str) -> String {
    sanitize_archive_entry_name(name)
}
89
90use glob::MatchOptions;
91use rayon::prelude::*;
92use std::collections::HashMap;
93use std::io::{self, Read, Seek, Write};
94use std::sync::Arc;
95
96use crate::processor::limits::{
97    DEFAULT_ARCHIVE_DEPTH, MAX_ARCHIVE_DEPTH, PARALLEL_ENTRY_THRESHOLD, PARALLEL_TAR_DATA_SIZE,
98    PARALLEL_ZIP_DATA_SIZE, STRUCTURED_ENTRY_SIZE,
99};
100
// ---------------------------------------------------------------------------
// Parallel processing helper types
// ---------------------------------------------------------------------------
104
/// Per-entry result from parallel archive processing.
///
/// The tuple is `(source_index, outcome)`, where `outcome` is either the
/// sanitized bytes together with that entry's [`ArchiveStats`], or the error
/// raised while sanitizing the entry.
type ParEntryResult = (usize, Result<(Vec<u8>, ArchiveStats)>);
107
108// ---------------------------------------------------------------------------
109// ArchiveFilter
110// ---------------------------------------------------------------------------
111
/// A compiled glob-based entry filter for archive processing.
///
/// Patterns are compiled once at construction time. At processing time
/// `passes()` is called for each file entry path inside the archive.
///
/// ## Pattern semantics
///
/// - `*` matches any sequence of characters that does **not** contain `/`.
/// - `**` matches any sequence of characters including `/`.
/// - `?` matches any single character except `/`.
/// - `[abc]` matches one of the listed characters.
/// - A pattern ending with `/` is a *directory prefix* — it matches
///   the directory itself and any path underneath it. The prefix is
///   compared literally: glob metacharacters inside a directory-prefix
///   pattern are not expanded.
///
/// ## Filter logic
///
/// 1. If `--only` patterns are present: the entry path must match at
///    least one pattern, otherwise it is dropped.
/// 2. If `--exclude` patterns are present: if the entry path matches
///    any pattern, it is dropped.
/// 3. Only file entries are filtered; directory / symlink entries
///    always pass through to preserve archive structure.
#[derive(Default, Clone)]
pub struct ArchiveFilter {
    /// Compiled `--only` patterns; when non-empty, a path must match at
    /// least one of them to pass.
    only: Vec<CompiledPattern>,
    /// Compiled `--exclude` patterns; a path matching any of them is dropped.
    exclude: Vec<CompiledPattern>,
}
139
/// One compiled filter pattern held by an [`ArchiveFilter`].
#[derive(Clone)]
enum CompiledPattern {
    /// Pattern that ended with `/` — matches the prefix directory and
    /// everything inside it. Stores the pattern with its trailing slashes
    /// removed; the comparison is a literal string comparison.
    DirPrefix(String),
    /// General glob pattern compiled with `require_literal_separator`.
    Glob(glob::Pattern),
}
148
/// Match options applied to every [`CompiledPattern::Glob`].
///
/// Matching is case-sensitive, path separators must be matched by a literal
/// `/` (so `*` and `?` never cross a directory boundary), and wildcards are
/// allowed to match a leading `.` in a path component.
const GLOB_OPTS: MatchOptions = MatchOptions {
    case_sensitive: true,
    require_literal_separator: true,
    require_literal_leading_dot: false,
};
154
155impl CompiledPattern {
156    fn compile(raw: &str) -> std::result::Result<Self, String> {
157        if raw.ends_with('/') {
158            // Strip trailing slash; matching is done manually in `matches`.
159            Ok(CompiledPattern::DirPrefix(
160                raw.trim_end_matches('/').to_string(),
161            ))
162        } else {
163            glob::Pattern::new(raw)
164                .map(CompiledPattern::Glob)
165                .map_err(|e| format!("invalid glob pattern '{raw}': {e}"))
166        }
167    }
168
169    fn matches(&self, path: &str) -> bool {
170        match self {
171            CompiledPattern::DirPrefix(prefix) => {
172                path == prefix || path.starts_with(&format!("{prefix}/"))
173            }
174            CompiledPattern::Glob(pat) => pat.matches_with(path, GLOB_OPTS),
175        }
176    }
177}
178
179impl ArchiveFilter {
180    /// Compile `only` and `exclude` pattern lists into an `ArchiveFilter`.
181    ///
182    /// # Errors
183    ///
184    /// Returns an error if any pattern contains invalid glob syntax.
185    pub fn new(only: Vec<String>, exclude: Vec<String>) -> std::result::Result<Self, String> {
186        let only = only
187            .into_iter()
188            .map(|p| CompiledPattern::compile(&p))
189            .collect::<std::result::Result<Vec<_>, _>>()?;
190        let exclude = exclude
191            .into_iter()
192            .map(|p| CompiledPattern::compile(&p))
193            .collect::<std::result::Result<Vec<_>, _>>()?;
194        Ok(Self { only, exclude })
195    }
196
197    /// Returns `true` when neither `--only` nor `--exclude` patterns are set.
198    pub fn is_empty(&self) -> bool {
199        self.only.is_empty() && self.exclude.is_empty()
200    }
201
202    /// Returns `true` if `path` should be included in the output archive.
203    ///
204    /// Only applies to file entries; directory entries bypass this check.
205    pub fn passes(&self, path: &str) -> bool {
206        if !self.only.is_empty() && !self.only.iter().any(|p| p.matches(path)) {
207            return false;
208        }
209        if self.exclude.iter().any(|p| p.matches(path)) {
210            return false;
211        }
212        true
213    }
214}
215
216// ---------------------------------------------------------------------------
217// Archive format enum
218// ---------------------------------------------------------------------------
/// Archive container formats this module can read and rebuild.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArchiveFormat {
    /// `.zip` archive.
    Zip,
    /// Uncompressed `.tar` archive.
    Tar,
    /// Gzip-compressed `.tar.gz` / `.tgz` archive.
    TarGz,
}

impl ArchiveFormat {
    /// Guess the archive format from the extension of `path`.
    ///
    /// The comparison is ASCII case-insensitive. A path ending in `.tar.gz`
    /// is always [`ArchiveFormat::TarGz`]; otherwise the final extension
    /// decides (`tgz`, `tar`, `zip`).
    ///
    /// Returns `None` for unrecognised extensions.
    pub fn from_path(path: &str) -> Option<Self> {
        let lower = path.to_ascii_lowercase();

        // The double extension has to be checked on the whole string because
        // `Path::extension` only ever reports the last component ("gz").
        if lower.ends_with(".tar.gz") {
            return Some(Self::TarGz);
        }

        match std::path::Path::new(&lower).extension()?.to_str()? {
            "tgz" => Some(Self::TarGz),
            "tar" => Some(Self::Tar),
            "zip" => Some(Self::Zip),
            _ => None,
        }
    }
}
256
257// ---------------------------------------------------------------------------
258// Archive statistics
259// ---------------------------------------------------------------------------
260
/// Statistics collected while processing an archive.
#[derive(Debug, Clone, Default)]
pub struct ArchiveStats {
    /// Number of file entries processed (excludes dirs/symlinks).
    pub files_processed: u64,
    /// Number of entries passed through unchanged (dirs, symlinks, etc.).
    pub entries_skipped: u64,
    /// Number of files handled by a structured processor.
    pub structured_hits: u64,
    /// Number of files handled by the streaming scanner fallback.
    pub scanner_fallback: u64,
    /// Number of entries that were themselves archives and processed
    /// recursively.
    pub nested_archives: u64,
    /// Total input bytes across all file entries.
    pub total_input_bytes: u64,
    /// Total output bytes across all file entries.
    pub total_output_bytes: u64,
    /// Per-file processing method: filename → `"structured+scan:<proc>"`,
    /// `"scanner"`, or `"nested:<format>"`.
    pub file_methods: HashMap<String, String>,
    /// Per-file scan statistics (matches, replacements, bytes, pattern counts).
    pub file_scan_stats: HashMap<String, ScanStats>,
    /// Number of file entries removed by the [`ArchiveFilter`].
    pub entries_filtered: u64,
}
287
/// Progress snapshot emitted while processing archive entries.
///
/// A fresh snapshot is built for every callback invocation from the running
/// [`ArchiveStats`].
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct ArchiveProgress {
    /// Entries seen so far, including skipped entries
    /// (`files_processed + entries_skipped`).
    pub entries_seen: u64,
    /// Regular file entries processed so far.
    pub files_processed: u64,
    /// Non-file entries skipped so far.
    pub entries_skipped: u64,
    /// Total entries when cheaply known.
    pub total_entries: Option<u64>,
    /// Path of the current entry.
    pub current_entry: String,
}
302
/// Shared, thread-safe callback invoked with an [`ArchiveProgress`] snapshot.
type ArchiveProgressCallback = Arc<dyn Fn(&ArchiveProgress) + Send + Sync>;
304
305impl ArchiveStats {
306    /// Merge statistics from a nested archive into this parent.
307    fn merge(&mut self, child: &ArchiveStats) {
308        self.files_processed += child.files_processed;
309        self.entries_skipped += child.entries_skipped;
310        self.structured_hits += child.structured_hits;
311        self.scanner_fallback += child.scanner_fallback;
312        self.nested_archives += child.nested_archives;
313        self.total_input_bytes += child.total_input_bytes;
314        self.total_output_bytes += child.total_output_bytes;
315        self.entries_filtered += child.entries_filtered;
316        self.file_methods.extend(
317            child
318                .file_methods
319                .iter()
320                .map(|(k, v)| (k.clone(), v.clone())),
321        );
322        self.file_scan_stats.extend(
323            child
324                .file_scan_stats
325                .iter()
326                .map(|(k, v)| (k.clone(), v.clone())),
327        );
328    }
329}
330
331// ---------------------------------------------------------------------------
332// ArchiveProcessor
333// ---------------------------------------------------------------------------
334
/// Processes archives by sanitizing each contained file and rebuilding
/// the archive with the same format and preserved metadata.
///
/// Optional behaviour (nesting depth, parallelism threshold, progress
/// reporting, entry filtering, forced text mode) is configured through the
/// `with_*` builder methods after construction.
///
/// # Usage
///
/// ```rust,no_run
/// use sanitize_engine::processor::archive::{ArchiveProcessor, ArchiveFormat};
/// use sanitize_engine::processor::registry::ProcessorRegistry;
/// use sanitize_engine::scanner::{StreamScanner, ScanPattern, ScanConfig};
/// use sanitize_engine::generator::HmacGenerator;
/// use sanitize_engine::store::MappingStore;
/// use sanitize_engine::category::Category;
/// use std::sync::Arc;
///
/// let gen = Arc::new(HmacGenerator::new([42u8; 32]));
/// let store = Arc::new(MappingStore::new(gen, None));
/// let patterns = vec![
///     ScanPattern::from_regex(r"secret\w+", Category::Custom("secret".into()), "secrets").unwrap(),
/// ];
/// let scanner = Arc::new(
///     StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
/// );
/// let registry = Arc::new(ProcessorRegistry::with_builtins());
///
/// let archive_proc = ArchiveProcessor::new(registry, scanner, store, vec![]);
/// ```
pub struct ArchiveProcessor {
    /// Registry of structured processors.
    registry: Arc<ProcessorRegistry>,
    /// Streaming scanner for fallback processing.
    scanner: Arc<StreamScanner>,
    /// Shared mapping store (one-way replacements).
    store: Arc<MappingStore>,
    /// File-type profiles for structured processor matching.
    /// Profiles are tried in order; the first one matching a filename wins.
    profiles: Vec<FileTypeProfile>,
    /// Maximum nesting depth for recursive archive processing.
    /// Default: [`DEFAULT_ARCHIVE_DEPTH`]; never above [`MAX_ARCHIVE_DEPTH`]
    /// when set through `with_max_depth`.
    max_depth: u32,
    /// Optional callback for per-entry progress updates.
    progress_callback: Option<ArchiveProgressCallback>,
    /// Minimum number of file entries required to enable parallel entry
    /// sanitization. Default: [`PARALLEL_ENTRY_THRESHOLD`].
    parallel_threshold: usize,
    /// Entry-level filter controlling which paths are included in the
    /// output archive. Default: empty (pass all entries).
    filter: ArchiveFilter,
    /// When true, bypass all structured processors and use only the
    /// streaming scanner for every entry. Trades format preservation
    /// for maximum sanitization coverage.
    force_text: bool,
}
385
386impl ArchiveProcessor {
387    /// Create a new archive processor.
388    ///
389    /// # Arguments
390    ///
391    /// - `registry` — structured processor registry.
392    /// - `scanner` — streaming scanner for fallback.
393    /// - `store` — shared mapping store for one-way dedup replacements.
394    /// - `profiles` — file-type profiles for structured matching.
395    pub fn new(
396        registry: Arc<ProcessorRegistry>,
397        scanner: Arc<StreamScanner>,
398        store: Arc<MappingStore>,
399        profiles: Vec<FileTypeProfile>,
400    ) -> Self {
401        Self {
402            registry,
403            scanner,
404            store,
405            profiles,
406            max_depth: DEFAULT_ARCHIVE_DEPTH,
407            progress_callback: None,
408            parallel_threshold: PARALLEL_ENTRY_THRESHOLD,
409            filter: ArchiveFilter::default(),
410            force_text: false,
411        }
412    }
413
414    /// Override the maximum nesting depth for recursive archive
415    /// processing.
416    ///
417    /// The default is [`DEFAULT_ARCHIVE_DEPTH`] (3). Values above
418    /// 10 are clamped.
419    #[must_use]
420    pub fn with_max_depth(mut self, depth: u32) -> Self {
421        self.max_depth = depth.min(MAX_ARCHIVE_DEPTH);
422        self
423    }
424
425    /// Override the minimum entry count required to enable parallel
426    /// entry sanitization. Set to `usize::MAX` to disable parallelism
427    /// entirely for this processor instance (e.g. when outer file-level
428    /// parallelism is already saturating the thread budget).
429    #[must_use]
430    pub fn with_parallel_threshold(mut self, threshold: usize) -> Self {
431        self.parallel_threshold = threshold;
432        self
433    }
434
435    /// Register a per-entry archive progress callback.
436    #[must_use]
437    pub fn with_progress_callback(mut self, callback: ArchiveProgressCallback) -> Self {
438        self.progress_callback = Some(callback);
439        self
440    }
441
442    /// Apply an [`ArchiveFilter`] that controls which file entries are
443    /// included in the output archive.
444    ///
445    /// Entries that do not pass the filter are **removed** from the
446    /// output entirely. Directory / symlink entries are never filtered.
447    #[must_use]
448    pub fn with_filter(mut self, filter: ArchiveFilter) -> Self {
449        self.filter = filter;
450        self
451    }
452
453    /// When set, bypass all structured processors and use only the
454    /// streaming scanner for every archive entry.
455    ///
456    /// Trades format preservation for maximum sanitization coverage.
457    /// Useful when the user is uncertain about field rules or wants a
458    /// belt-and-suspenders guarantee that every byte is scanned.
459    #[must_use]
460    pub fn with_force_text(mut self, force_text: bool) -> Self {
461        self.force_text = force_text;
462        self
463    }
464
    /// Find the first profile matching a filename.
    ///
    /// Profiles are tested in the order they were supplied to `new`; the
    /// first one whose `matches_filename` accepts `filename` is returned, or
    /// `None` when no profile matches.
    fn find_profile(&self, filename: &str) -> Option<&FileTypeProfile> {
        self.profiles.iter().find(|p| p.matches_filename(filename))
    }
469
470    fn emit_progress(&self, stats: &ArchiveStats, total_entries: Option<u64>, current_entry: &str) {
471        if let Some(callback) = &self.progress_callback {
472            callback(&ArchiveProgress {
473                entries_seen: stats.files_processed + stats.entries_skipped,
474                files_processed: stats.files_processed,
475                entries_skipped: stats.entries_skipped,
476                total_entries,
477                current_entry: current_entry.to_string(),
478            });
479        }
480    }
481
482    /// Sanitize a file entry given its raw bytes.
483    ///
484    /// Returns the sanitized bytes together with a fresh [`ArchiveStats`]
485    /// covering only this entry. This is the core work unit for parallel
486    /// entry processing in [`process_tar_at_depth`] and
487    /// [`process_zip_at_depth`].
488    fn sanitize_entry_bytes(
489        &self,
490        filename: &str,
491        data: &[u8],
492        entry_size_hint: Option<u64>,
493        depth: u32,
494    ) -> Result<(Vec<u8>, ArchiveStats)> {
495        let mut out: Vec<u8> = Vec::with_capacity(data.len());
496        let mut entry_stats = ArchiveStats::default();
497        let mut reader = io::Cursor::new(data);
498        self.sanitize_entry(
499            filename,
500            &mut reader,
501            &mut out,
502            &mut entry_stats,
503            entry_size_hint,
504            depth,
505        )?;
506        Ok((out, entry_stats))
507    }
508
509    /// Sanitize the content of a single file entry.
510    ///
511    /// If the entry is itself an archive (detected via extension), it is
512    /// recursively processed up to `self.max_depth`. Otherwise, tries a
513    /// structured processor first; falls back to the streaming scanner
514    /// if no processor matches.
515    ///
516    /// For the streaming scanner path, the content is piped through
517    /// `scan_reader` directly to the writer for memory-efficient
518    /// chunk-based processing (F-02 fix: no full output buffering).
519    #[allow(clippy::missing_errors_doc)] // private method
520    fn sanitize_entry(
521        &self,
522        filename: &str,
523        reader: &mut dyn Read,
524        writer: &mut dyn Write,
525        stats: &mut ArchiveStats,
526        entry_size_hint: Option<u64>,
527        depth: u32,
528    ) -> Result<()> {
529        // --- Nested archive detection ---
530        if let Some(nested_fmt) = ArchiveFormat::from_path(filename) {
531            return self.sanitize_nested_archive(
532                filename,
533                reader,
534                writer,
535                stats,
536                entry_size_hint,
537                nested_fmt,
538                depth,
539            );
540        }
541
542        // --- Structured / scanner processing ---
543
544        // Try structured processing first, but only if the entry is
545        // within the size cap and --force-text is not set.
546        // Oversized entries fall through to the streaming scanner (M-3 fix).
547        let within_size_cap = entry_size_hint.map_or(true, |sz| sz <= STRUCTURED_ENTRY_SIZE); // unknown size → allow (conservative)
548
549        if !self.force_text && within_size_cap {
550            if let Some(profile) = self.find_profile(filename) {
551                // Structured processors need the full content in memory.
552                let mut content = Vec::new();
553                reader.read_to_end(&mut content).map_err(|e| {
554                    SanitizeError::ArchiveError(format!("read entry '{filename}': {e}"))
555                })?;
556
557                stats.total_input_bytes += content.len() as u64;
558
559                // A parse error (e.g. binary content with a .yaml extension, like
560                // macOS resource-fork ._* files) falls through to the scanner
561                // rather than failing the whole archive.
562                // A parse error or heuristic rejection falls through to the scanner below.
563                if let Ok(Some(structured_out)) =
564                    self.registry.process(&content, profile, &self.store)
565                {
566                    // Double-pass: run the streaming scanner on the structured
567                    // output to catch anything the field rules missed.
568                    let (output, scan_stats) = self.scanner.scan_bytes(&structured_out)?;
569                    stats.structured_hits += 1;
570                    stats.total_output_bytes += output.len() as u64;
571                    stats.file_methods.insert(
572                        filename.to_string(),
573                        format!("structured+scan:{}", profile.processor),
574                    );
575                    stats
576                        .file_scan_stats
577                        .insert(filename.to_string(), scan_stats);
578                    writer.write_all(&output).map_err(|e| {
579                        SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
580                    })?;
581                    return Ok(());
582                }
583
584                // Processor didn't match or failed — fall back to
585                // scanner with the already-buffered content.
586                let (output, scan_stats) = self.scanner.scan_bytes(&content)?;
587                stats.scanner_fallback += 1;
588                stats.total_output_bytes += output.len() as u64;
589                stats
590                    .file_methods
591                    .insert(filename.to_string(), "scanner".to_string());
592                stats
593                    .file_scan_stats
594                    .insert(filename.to_string(), scan_stats);
595                writer.write_all(&output).map_err(|e| {
596                    SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
597                })?;
598                return Ok(());
599            }
600        }
601
602        // No profile (or entry too large) → streaming scanner.
603        // F-02 fix: stream directly from reader → scanner → writer
604        // without buffering the full output. We use a CountingWriter
605        // to track output bytes alongside the CountingReader for input.
606        let mut counting_r = CountingReader::new(reader);
607        let mut counting_w = CountingWriter::new(writer);
608        let scan_stats = self.scanner.scan_reader(&mut counting_r, &mut counting_w)?;
609
610        stats.scanner_fallback += 1;
611        stats.total_input_bytes += counting_r.bytes_read();
612        stats.total_output_bytes += counting_w.bytes_written();
613        stats
614            .file_methods
615            .insert(filename.to_string(), "scanner".to_string());
616        stats
617            .file_scan_stats
618            .insert(filename.to_string(), scan_stats);
619
620        Ok(())
621    }
622
623    /// Handle a nested archive entry: validate depth/size, buffer, recurse,
624    /// and write the sanitized output.
625    #[allow(clippy::too_many_arguments)]
626    fn sanitize_nested_archive(
627        &self,
628        filename: &str,
629        reader: &mut dyn Read,
630        writer: &mut dyn Write,
631        stats: &mut ArchiveStats,
632        entry_size_hint: Option<u64>,
633        nested_fmt: ArchiveFormat,
634        depth: u32,
635    ) -> Result<()> {
636        if depth >= self.max_depth {
637            return Err(SanitizeError::RecursionDepthExceeded(format!(
638                "nested archive '{}' at depth {} exceeds maximum nesting depth of {}",
639                filename, depth, self.max_depth,
640            )));
641        }
642
643        // Buffer the nested archive (bounded by STRUCTURED_ENTRY_SIZE).
644        if let Some(sz) = entry_size_hint {
645            if sz > STRUCTURED_ENTRY_SIZE {
646                return Err(SanitizeError::ArchiveError(format!(
647                    "nested archive '{}' is too large ({} bytes, limit {} bytes)",
648                    filename, sz, STRUCTURED_ENTRY_SIZE,
649                )));
650            }
651        }
652
653        let mut content = Vec::new();
654        reader.read_to_end(&mut content).map_err(|e| {
655            SanitizeError::ArchiveError(format!("read nested archive '{filename}': {e}"))
656        })?;
657        stats.total_input_bytes += content.len() as u64;
658
659        // Recurse into the nested archive.
660        let mut output_buf: Vec<u8> = Vec::new();
661        let child_stats = match nested_fmt {
662            ArchiveFormat::Tar => {
663                self.process_tar_at_depth(&content[..], &mut output_buf, depth + 1)?
664            }
665            ArchiveFormat::TarGz => {
666                self.process_tar_gz_at_depth(&content[..], &mut output_buf, depth + 1)?
667            }
668            ArchiveFormat::Zip => {
669                let reader = io::Cursor::new(&content);
670                let mut writer = io::Cursor::new(Vec::new());
671                let s = self.process_zip_at_depth(reader, &mut writer, depth + 1)?;
672                output_buf = writer.into_inner();
673                s
674            }
675        };
676
677        stats.nested_archives += 1;
678        stats.merge(&child_stats);
679        stats.total_output_bytes += output_buf.len() as u64;
680        let fmt_name = match nested_fmt {
681            ArchiveFormat::Tar => "tar",
682            ArchiveFormat::TarGz => "tar.gz",
683            ArchiveFormat::Zip => "zip",
684        };
685        stats
686            .file_methods
687            .insert(filename.to_string(), format!("nested:{fmt_name}"));
688        writer.write_all(&output_buf).map_err(|e| {
689            SanitizeError::ArchiveError(format!("write nested archive '{filename}': {e}"))
690        })?;
691        Ok(())
692    }
693
694    // -----------------------------------------------------------------------
695    // Profile discovery passes (two-phase support)
696    // -----------------------------------------------------------------------
697    //
698    // These methods perform a read-only pre-pass over an archive, running the
699    // structured processor on every profile-matched entry and discarding the
700    // output.  The side-effect is that `self.store` is populated with the
701    // original→replacement mappings for those fields, so a subsequent call to
702    // `build_augmented_scanner` can inject those values as literals into the
703    // scanner used for the real processing pass.
704
705    /// Run the structured processor on every profile-matched entry in a
706    /// `.tar` archive, recording replacements into the store.  Output is
707    /// discarded; the archive is not modified.
708    ///
709    /// # Errors
710    ///
711    /// Returns an error if the archive cannot be read or an entry cannot be processed.
712    pub fn discover_profiles_tar<R: Read>(&self, reader: R) -> Result<()> {
713        if self.profiles.is_empty() {
714            return Ok(());
715        }
716        let mut archive = tar::Archive::new(reader);
717        let entries = archive
718            .entries()
719            .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entries: {e}")))?;
720        for entry_result in entries {
721            let mut entry = entry_result
722                .map_err(|e| SanitizeError::ArchiveError(format!("discover tar entry: {e}")))?;
723            if !entry.header().entry_type().is_file() {
724                continue;
725            }
726            let path = entry
727                .path()
728                .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
729                .to_string_lossy()
730                .to_string();
731            let Some(profile) = self.find_profile(&path) else {
732                continue;
733            };
734            let mut content = Vec::new();
735            entry
736                .read_to_end(&mut content)
737                .map_err(|e| SanitizeError::ArchiveError(format!("read '{path}': {e}")))?;
738            let _ = self.registry.process(&content, profile, &self.store);
739        }
740        Ok(())
741    }
742
743    /// Run the structured processor on every profile-matched entry in a
744    /// `.tar.gz` archive, recording replacements into the store.  Output is
745    /// discarded; the archive is not modified.
746    ///
747    /// # Errors
748    ///
749    /// Returns an error if the archive cannot be read or an entry cannot be processed.
750    pub fn discover_profiles_tar_gz<R: Read>(&self, reader: R) -> Result<()> {
751        let gz = flate2::read::GzDecoder::new(reader);
752        self.discover_profiles_tar(gz)
753    }
754
755    /// Run the structured processor on every profile-matched entry in a
756    /// `.zip` archive, recording replacements into the store.  Output is
757    /// discarded; the archive is not modified.
758    ///
759    /// # Errors
760    ///
761    /// Returns an error if the archive cannot be read or an entry cannot be processed.
762    pub fn discover_profiles_zip<R: Read + Seek>(&self, reader: R) -> Result<()> {
763        if self.profiles.is_empty() {
764            return Ok(());
765        }
766        let mut zip = zip::ZipArchive::new(reader)
767            .map_err(|e| SanitizeError::ArchiveError(format!("open zip for discovery: {e}")))?;
768        for i in 0..zip.len() {
769            let mut entry = zip
770                .by_index(i)
771                .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {i}: {e}")))?;
772            if entry.is_dir() {
773                continue;
774            }
775            let name = sanitize_zip_entry_name(entry.name());
776            let Some(profile) = self.find_profile(&name) else {
777                continue;
778            };
779            let mut content = Vec::new();
780            entry
781                .read_to_end(&mut content)
782                .map_err(|e| SanitizeError::ArchiveError(format!("read '{name}': {e}")))?;
783            let _ = self.registry.process(&content, profile, &self.store);
784        }
785        Ok(())
786    }
787
788    // Tar processing
789    // -----------------------------------------------------------------------
790
791    /// Process a `.tar` archive, sanitizing each file entry and
792    /// rebuilding the archive with preserved metadata.
793    ///
794    /// Entries that are not regular files (directories, symlinks, etc.)
795    /// are copied through unchanged.
796    ///
797    /// # Errors
798    ///
799    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
800    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
801    pub fn process_tar<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
802        self.process_tar_at_depth(reader, writer, 0)
803    }
804
805    /// Internal: process a tar archive at a given nesting depth.
806    ///
807    /// Uses a speculative-buffer strategy to decide between parallel and
808    /// sequential processing:
809    ///
810    /// - **Parallel** (total buffered data ≤ `PARALLEL_TAR_DATA_SIZE` AND
811    ///   file count ≥ threshold AND not inside a rayon worker): buffer all
812    ///   entries, sanitize concurrently with rayon, write in source order.
813    /// - **Sequential — buffered** (threshold not met but data fits): process
814    ///   entries from the in-memory buffer one at a time.
815    /// - **Sequential — streaming** (data exceeds cap mid-stream): process
816    ///   already-buffered entries from memory, then continue streaming the
817    ///   remainder of the archive without additional buffering.
818    ///
819    /// Unlike zip, tar has no central directory so sizes cannot be known before
820    /// reading. The buffer cap (`PARALLEL_TAR_DATA_SIZE`) bounds peak memory to
821    /// cap + one entry overhead regardless of archive size.
822    #[allow(clippy::too_many_lines)]
823    fn process_tar_at_depth<R: Read, W: Write>(
824        &self,
825        reader: R,
826        writer: W,
827        depth: u32,
828    ) -> Result<ArchiveStats> {
829        struct TarEntry {
830            header: tar::Header,
831            path: String,
832            is_file: bool,
833            passes_filter: bool,
834            data: Vec<u8>,
835        }
836
837        let mut archive = tar::Archive::new(reader);
838        let mut builder = tar::Builder::new(writer);
839        let mut stats = ArchiveStats::default();
840
841        // --- Phase 1: speculative buffering ----------------------------------
842        // Stream entries into memory, tracking total file-data size.
843        // Stop buffering (but keep the last entry) if the cap is exceeded.
844        let mut entries_iter = archive
845            .entries()
846            .map_err(|e| SanitizeError::ArchiveError(format!("read tar entries: {e}")))?;
847
848        let mut buffered: Vec<TarEntry> = Vec::new();
849        let mut file_count: usize = 0;
850        let mut total_data: u64 = 0;
851        let mut overflowed = false;
852
853        for entry_result in entries_iter.by_ref() {
854            let mut entry = entry_result
855                .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
856
857            let header = entry.header().clone();
858            let path = entry
859                .path()
860                .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
861                .to_string_lossy()
862                .into_owned();
863            let is_file = header.entry_type().is_file();
864            let passes_filter = !is_file || self.filter.passes(&path);
865
866            let mut data = Vec::new();
867            entry
868                .read_to_end(&mut data)
869                .map_err(|e| SanitizeError::ArchiveError(format!("read entry '{path}': {e}")))?;
870            drop(entry);
871
872            if is_file && passes_filter {
873                file_count += 1;
874                total_data = total_data.saturating_add(data.len() as u64);
875            }
876
877            buffered.push(TarEntry {
878                header,
879                path,
880                is_file,
881                passes_filter,
882                data,
883            });
884
885            if total_data > PARALLEL_TAR_DATA_SIZE {
886                overflowed = true;
887                break;
888            }
889        }
890
891        // --- Phase 2: choose strategy ----------------------------------------
892        let use_parallel = !overflowed
893            && file_count >= self.parallel_threshold
894            && rayon::current_thread_index().is_none();
895
896        if use_parallel {
897            // --- Parallel path -----------------------------------------------
898            // Sanitize all file entries concurrently; write in source order.
899            let file_indices: Vec<usize> = buffered
900                .iter()
901                .enumerate()
902                .filter(|(_, e)| e.is_file && e.passes_filter)
903                .map(|(i, _)| i)
904                .collect();
905
906            let results: Vec<ParEntryResult> = file_indices
907                .into_par_iter()
908                .map(|i| {
909                    let e = &buffered[i];
910                    let size_hint = e.header.size().ok();
911                    (
912                        i,
913                        self.sanitize_entry_bytes(&e.path, &e.data, size_hint, depth),
914                    )
915                })
916                .collect();
917
918            let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; buffered.len()];
919            for (i, r) in results {
920                sanitized[i] = Some(r?);
921            }
922
923            for (i, entry) in buffered.iter().enumerate() {
924                if !entry.is_file {
925                    builder
926                        .append(&entry.header, entry.data.as_slice())
927                        .map_err(|e| {
928                            SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
929                        })?;
930                    stats.entries_skipped += 1;
931                    self.emit_progress(&stats, None, &entry.path);
932                    continue;
933                }
934                if !entry.passes_filter {
935                    stats.entries_filtered += 1;
936                    self.emit_progress(&stats, None, &entry.path);
937                    continue;
938                }
939
940                let (sanitized_buf, entry_stats) =
941                    sanitized[i].take().expect("parallel result missing");
942                stats.merge(&entry_stats);
943
944                let mut new_header = entry.header.clone();
945                let safe_path = sanitize_tar_entry_name(&entry.path);
946                new_header.set_path(&safe_path).map_err(|e| {
947                    SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
948                })?;
949                new_header.set_size(sanitized_buf.len() as u64);
950                new_header.set_cksum();
951                builder
952                    .append(&new_header, sanitized_buf.as_slice())
953                    .map_err(|e| {
954                        SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
955                    })?;
956                stats.files_processed += 1;
957                self.emit_progress(&stats, None, &entry.path);
958            }
959        } else {
960            // --- Sequential path ---------------------------------------------
961            // Process buffered entries first, then stream the remainder.
962
963            // Helper: write one buffered entry to the builder.
964            let write_buffered = |entry: &TarEntry,
965                                  builder: &mut tar::Builder<W>,
966                                  stats: &mut ArchiveStats,
967                                  processor: &ArchiveProcessor|
968             -> Result<()> {
969                if !entry.is_file {
970                    builder
971                        .append(&entry.header, entry.data.as_slice())
972                        .map_err(|e| {
973                            SanitizeError::ArchiveError(format!("append '{}': {e}", entry.path))
974                        })?;
975                    stats.entries_skipped += 1;
976                    processor.emit_progress(stats, None, &entry.path);
977                    return Ok(());
978                }
979                if !entry.passes_filter {
980                    stats.entries_filtered += 1;
981                    processor.emit_progress(stats, None, &entry.path);
982                    return Ok(());
983                }
984                let size_hint = entry.header.size().ok();
985                let (sanitized_buf, entry_stats) =
986                    processor.sanitize_entry_bytes(&entry.path, &entry.data, size_hint, depth)?;
987                stats.merge(&entry_stats);
988                let mut new_header = entry.header.clone();
989                let safe_path = sanitize_tar_entry_name(&entry.path);
990                new_header.set_path(&safe_path).map_err(|e| {
991                    SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
992                })?;
993                new_header.set_size(sanitized_buf.len() as u64);
994                new_header.set_cksum();
995                builder
996                    .append(&new_header, sanitized_buf.as_slice())
997                    .map_err(|e| {
998                        SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
999                    })?;
1000                stats.files_processed += 1;
1001                processor.emit_progress(stats, None, &entry.path);
1002                Ok(())
1003            };
1004
1005            for entry in &buffered {
1006                write_buffered(entry, &mut builder, &mut stats, self)?;
1007            }
1008            drop(buffered);
1009
1010            // Stream remaining entries when the buffer cap was exceeded.
1011            if overflowed {
1012                for entry_result in entries_iter {
1013                    let mut entry = entry_result
1014                        .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {e}")))?;
1015
1016                    let header = entry.header().clone();
1017                    let path = entry
1018                        .path()
1019                        .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {e}")))?
1020                        .to_string_lossy()
1021                        .into_owned();
1022                    let is_file = header.entry_type().is_file();
1023
1024                    if !is_file {
1025                        let mut data = Vec::new();
1026                        entry.read_to_end(&mut data).map_err(|e| {
1027                            SanitizeError::ArchiveError(format!("read '{path}': {e}"))
1028                        })?;
1029                        drop(entry);
1030                        builder.append(&header, data.as_slice()).map_err(|e| {
1031                            SanitizeError::ArchiveError(format!("append '{path}': {e}"))
1032                        })?;
1033                        stats.entries_skipped += 1;
1034                        self.emit_progress(&stats, None, &path);
1035                        continue;
1036                    }
1037
1038                    if !self.filter.passes(&path) {
1039                        stats.entries_filtered += 1;
1040                        continue;
1041                    }
1042
1043                    let size_hint = header.size().ok();
1044                    let mut sanitized_buf = Vec::new();
1045                    let mut entry_stats = ArchiveStats::default();
1046                    self.sanitize_entry(
1047                        &path,
1048                        &mut entry,
1049                        &mut sanitized_buf,
1050                        &mut entry_stats,
1051                        size_hint,
1052                        depth,
1053                    )?;
1054                    drop(entry);
1055
1056                    let mut new_header = header.clone();
1057                    let safe_path = sanitize_tar_entry_name(&path);
1058                    new_header.set_path(&safe_path).map_err(|e| {
1059                        SanitizeError::ArchiveError(format!("set path '{safe_path}': {e}"))
1060                    })?;
1061                    new_header.set_size(sanitized_buf.len() as u64);
1062                    new_header.set_cksum();
1063                    builder
1064                        .append(&new_header, sanitized_buf.as_slice())
1065                        .map_err(|e| {
1066                            SanitizeError::ArchiveError(format!("append '{safe_path}': {e}"))
1067                        })?;
1068
1069                    stats.merge(&entry_stats);
1070                    stats.files_processed += 1;
1071                    self.emit_progress(&stats, None, &path);
1072                }
1073            }
1074        }
1075
1076        builder
1077            .finish()
1078            .map_err(|e| SanitizeError::ArchiveError(format!("finalize tar: {e}")))?;
1079
1080        Ok(stats)
1081    }
1082
1083    /// Process a `.tar.gz` archive (gzip-compressed tar).
1084    ///
1085    /// Decompresses on the fly, processes each entry, and recompresses
1086    /// the output.
1087    ///
1088    /// # Errors
1089    ///
1090    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
1091    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
1092    pub fn process_tar_gz<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
1093        self.process_tar_gz_at_depth(reader, writer, 0)
1094    }
1095
1096    /// Internal: process a tar.gz archive at a given nesting depth.
1097    fn process_tar_gz_at_depth<R: Read, W: Write>(
1098        &self,
1099        reader: R,
1100        writer: W,
1101        depth: u32,
1102    ) -> Result<ArchiveStats> {
1103        let gz_reader = flate2::read::GzDecoder::new(reader);
1104        let gz_writer = flate2::write::GzEncoder::new(writer, flate2::Compression::fast());
1105
1106        let stats = self.process_tar_at_depth(gz_reader, gz_writer, depth)?;
1107        // GzEncoder is flushed when the tar builder finishes and the
1108        // encoder is dropped. The `finish()` call in `process_tar`
1109        // flushes the tar builder, which flushes writes to the
1110        // GzEncoder. When the GzEncoder is dropped it finalises the
1111        // gzip stream.
1112        Ok(stats)
1113    }
1114
1115    // -----------------------------------------------------------------------
1116    // Zip processing
1117    // -----------------------------------------------------------------------
1118
1119    /// Process a `.zip` archive, sanitizing each file entry and
1120    /// rebuilding the archive with preserved metadata.
1121    ///
1122    /// # Type Bounds
1123    ///
1124    /// Zip requires seekable I/O for both reading and writing.
1125    ///
1126    /// # Errors
1127    ///
1128    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
1129    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
1130    pub fn process_zip<R: Read + Seek, W: Write + Seek>(
1131        &self,
1132        reader: R,
1133        writer: W,
1134    ) -> Result<ArchiveStats> {
1135        self.process_zip_at_depth(reader, writer, 0)
1136    }
1137
1138    /// Internal: process a zip archive at a given nesting depth.
1139    ///
1140    /// Uses a lightweight metadata pre-pass (local-header reads, no data
1141    /// decompression) to decide between parallel and sequential strategies:
1142    ///
1143    /// - **Parallel** (total uncompressed ≤ `PARALLEL_ZIP_DATA_SIZE` AND
1144    ///   file count ≥ threshold AND depth == 0): load all entry data into
1145    ///   memory, sanitize with rayon, write in order.
1146    /// - **Sequential** (everything else): read → sanitize → write one entry
1147    ///   at a time.  Peak memory is bounded to 2 × largest single entry.
1148    #[allow(clippy::too_many_lines)]
1149    fn process_zip_at_depth<R: Read + Seek, W: Write + Seek>(
1150        &self,
1151        reader: R,
1152        writer: W,
1153        depth: u32,
1154    ) -> Result<ArchiveStats> {
1155        // --- Stage 0: metadata pre-pass (no data reads) ---------------------
1156        // Read local file headers to collect names, sizes, and options.
1157        // This does N seeks but decompresses nothing, keeping memory flat.
1158        struct ZipMeta {
1159            name: String,
1160            is_dir: bool,
1161            compression: zip::CompressionMethod,
1162            last_modified: Option<zip::DateTime>,
1163            unix_mode: Option<u32>,
1164            size: u64,
1165        }
1166
1167        let mut zip_in = zip::ZipArchive::new(reader)
1168            .map_err(|e| SanitizeError::ArchiveError(format!("open zip: {}", e)))?;
1169        let total_entries = zip_in.len();
1170        let total_entries_hint = Some(total_entries as u64);
1171
1172        let mut metas: Vec<ZipMeta> = Vec::with_capacity(total_entries);
1173        let mut file_count = 0usize;
1174        let mut total_uncompressed_size: u64 = 0;
1175
1176        for i in 0..total_entries {
1177            let entry = zip_in
1178                .by_index(i)
1179                .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
1180            let is_dir = entry.is_dir();
1181            let size = entry.size();
1182            if !is_dir {
1183                file_count += 1;
1184                total_uncompressed_size = total_uncompressed_size.saturating_add(size);
1185            }
1186            metas.push(ZipMeta {
1187                name: sanitize_zip_entry_name(entry.name()),
1188                is_dir,
1189                compression: entry.compression(),
1190                last_modified: entry.last_modified(),
1191                unix_mode: entry.unix_mode(),
1192                size,
1193            });
1194            // entry dropped here — no data decompressed
1195        }
1196
1197        // Parallel only when the total data fits comfortably in memory.
1198        // Parallel when: enough entries, data fits in memory, and we are not
1199        // already running inside a rayon worker thread (nested parallelism
1200        // would over-subscribe the pool without proportional gains).
1201        let use_parallel = file_count >= self.parallel_threshold
1202            && rayon::current_thread_index().is_none()
1203            && total_uncompressed_size <= PARALLEL_ZIP_DATA_SIZE;
1204
1205        let mut stats = ArchiveStats::default();
1206
1207        // Helper: build SimpleFileOptions for a metadata entry.
1208        let make_options = |m: &ZipMeta| {
1209            let mut opts =
1210                zip::write::SimpleFileOptions::default().compression_method(m.compression);
1211            if let Some(dt) = m.last_modified {
1212                opts = opts.last_modified_time(dt);
1213            }
1214            if let Some(mode) = m.unix_mode {
1215                opts.unix_permissions(mode)
1216            } else {
1217                opts
1218            }
1219        };
1220
1221        if use_parallel {
1222            // --- Parallel path: load all data then sanitize concurrently ----
1223            struct ZipEntry {
1224                meta_idx: usize,
1225                data: Vec<u8>,
1226            }
1227
1228            let mut file_entries: Vec<ZipEntry> = Vec::with_capacity(file_count);
1229
1230            for (i, meta) in metas.iter().enumerate() {
1231                if meta.is_dir {
1232                    continue;
1233                }
1234                // Skip loading data for entries that will be filtered out.
1235                if !self.filter.passes(&meta.name) {
1236                    continue;
1237                }
1238                let mut entry = zip_in
1239                    .by_index(i)
1240                    .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
1241                let mut data = Vec::new();
1242                entry.read_to_end(&mut data).map_err(|e| {
1243                    SanitizeError::ArchiveError(format!("read zip entry '{}': {}", meta.name, e))
1244                })?;
1245                file_entries.push(ZipEntry { meta_idx: i, data });
1246            }
1247
1248            let results: Vec<ParEntryResult> = file_entries
1249                .into_par_iter()
1250                .map(|e| {
1251                    let meta = &metas[e.meta_idx];
1252                    let result =
1253                        self.sanitize_entry_bytes(&meta.name, &e.data, Some(meta.size), depth);
1254                    (e.meta_idx, result)
1255                })
1256                .collect();
1257
1258            // Collect into a positional Vec (indexed by metas position) for
1259            // O(1) ordered writes, avoiding HashMap hashing overhead.
1260            let mut sanitized: Vec<Option<(Vec<u8>, ArchiveStats)>> = vec![None; metas.len()];
1261            for (meta_idx, r) in results {
1262                sanitized[meta_idx] = Some(r?);
1263            }
1264
1265            let mut zip_out = zip::ZipWriter::new(writer);
1266            for (i, meta) in metas.iter().enumerate() {
1267                let options = make_options(meta);
1268                if meta.is_dir {
1269                    zip_out.add_directory(&meta.name, options).map_err(|e| {
1270                        SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1271                    })?;
1272                    stats.entries_skipped += 1;
1273                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1274                    continue;
1275                }
1276                // Filter: drop entries not matching --only/--exclude rules.
1277                if !self.filter.passes(&meta.name) {
1278                    stats.entries_filtered += 1;
1279                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1280                    continue;
1281                }
1282                let (sanitized_buf, entry_stats) = sanitized[i]
1283                    .take()
1284                    .expect("file entry sanitization result missing");
1285                stats.merge(&entry_stats);
1286                zip_out.start_file(&meta.name, options).map_err(|e| {
1287                    SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1288                })?;
1289                zip_out.write_all(&sanitized_buf).map_err(|e| {
1290                    SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1291                })?;
1292                stats.files_processed += 1;
1293                self.emit_progress(&stats, total_entries_hint, &meta.name);
1294            }
1295            zip_out
1296                .finish()
1297                .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1298        } else {
1299            // --- Sequential path: one entry at a time -----------------------
1300            // Only one entry's data (input + sanitized output) is live at once.
1301            let mut zip_out = zip::ZipWriter::new(writer);
1302            for (i, meta) in metas.iter().enumerate() {
1303                let options = make_options(meta);
1304                if meta.is_dir {
1305                    zip_out.add_directory(&meta.name, options).map_err(|e| {
1306                        SanitizeError::ArchiveError(format!("add dir '{}': {}", meta.name, e))
1307                    })?;
1308                    stats.entries_skipped += 1;
1309                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1310                    continue;
1311                }
1312
1313                // Filter: drop entries not matching --only/--exclude rules.
1314                if !self.filter.passes(&meta.name) {
1315                    stats.entries_filtered += 1;
1316                    self.emit_progress(&stats, total_entries_hint, &meta.name);
1317                    continue;
1318                }
1319
1320                let data = {
1321                    let mut entry = zip_in.by_index(i).map_err(|e| {
1322                        SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e))
1323                    })?;
1324                    let mut buf = Vec::new();
1325                    entry.read_to_end(&mut buf).map_err(|e| {
1326                        SanitizeError::ArchiveError(format!(
1327                            "read zip entry '{}': {}",
1328                            meta.name, e
1329                        ))
1330                    })?;
1331                    buf
1332                    // entry dropped here
1333                };
1334
1335                let (sanitized_buf, entry_stats) =
1336                    self.sanitize_entry_bytes(&meta.name, &data, Some(meta.size), depth)?;
1337                drop(data);
1338
1339                zip_out.start_file(&meta.name, options).map_err(|e| {
1340                    SanitizeError::ArchiveError(format!("start file '{}': {}", meta.name, e))
1341                })?;
1342                zip_out.write_all(&sanitized_buf).map_err(|e| {
1343                    SanitizeError::ArchiveError(format!("write file '{}': {}", meta.name, e))
1344                })?;
1345                drop(sanitized_buf);
1346
1347                stats.merge(&entry_stats);
1348                stats.files_processed += 1;
1349                self.emit_progress(&stats, total_entries_hint, &meta.name);
1350            }
1351            zip_out
1352                .finish()
1353                .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
1354        }
1355
1356        Ok(stats)
1357    }
1358
1359    // -----------------------------------------------------------------------
1360    // Format-aware dispatch
1361    // -----------------------------------------------------------------------
1362
1363    /// Auto-detect the archive format and process accordingly.
1364    ///
1365    /// For zip archives the reader must additionally implement `Seek`.
1366    /// This method accepts `Read + Seek` to cover all formats uniformly.
1367    /// Tar and tar.gz do not require seeking, but the bound is imposed
1368    /// for a single entry point.
1369    ///
1370    /// # Errors
1371    ///
1372    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
1373    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
1374    pub fn process<R: Read + Seek, W: Write + Seek>(
1375        &self,
1376        reader: R,
1377        writer: W,
1378        format: ArchiveFormat,
1379    ) -> Result<ArchiveStats> {
1380        match format {
1381            ArchiveFormat::Zip => self.process_zip(reader, writer),
1382            ArchiveFormat::Tar => self.process_tar(reader, writer),
1383            ArchiveFormat::TarGz => self.process_tar_gz(reader, writer),
1384        }
1385    }
1386}
1387
1388// ---------------------------------------------------------------------------
1389// Counting reader wrapper (for input byte tracking)
1390// ---------------------------------------------------------------------------
1391
/// Reader adapter that forwards every read to a borrowed inner reader and
/// keeps a running total of the bytes it has returned.
struct CountingReader<'a> {
    /// The reader all calls are delegated to.
    inner: &'a mut dyn Read,
    /// Bytes successfully read through this wrapper so far.
    count: u64,
}

impl<'a> CountingReader<'a> {
    /// Wrap `inner`, starting the byte counter at zero.
    fn new(inner: &'a mut dyn Read) -> Self {
        CountingReader { inner, count: 0 }
    }

    /// Total number of bytes read through this wrapper.
    fn bytes_read(&self) -> u64 {
        self.count
    }
}

impl Read for CountingReader<'_> {
    /// Delegate to the inner reader; the counter only advances on success.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|n| {
            self.count += n as u64;
            n
        })
    }
}
1415
/// Writer adapter that forwards every write to a borrowed inner writer and
/// keeps a running total of the bytes it accepted (F-02 fix).
struct CountingWriter<'a> {
    /// The writer all calls are delegated to.
    inner: &'a mut dyn Write,
    /// Bytes successfully written through this wrapper so far.
    count: u64,
}

impl<'a> CountingWriter<'a> {
    /// Wrap `inner`, starting the byte counter at zero.
    fn new(inner: &'a mut dyn Write) -> Self {
        CountingWriter { inner, count: 0 }
    }

    /// Total number of bytes written through this wrapper.
    fn bytes_written(&self) -> u64 {
        self.count
    }
}

impl Write for CountingWriter<'_> {
    /// Delegate to the inner writer; the counter only advances on success
    /// and by the number of bytes the inner writer reports as accepted.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.inner.write(buf).map(|n| {
            self.count += n as u64;
            n
        })
    }

    /// Flush the inner writer; the counter is not affected.
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}
1443
1444// ---------------------------------------------------------------------------
1445// Tests
1446// ---------------------------------------------------------------------------
1447
1448#[cfg(test)]
1449mod tests {
1450    use super::*;
1451    use crate::category::Category;
1452    use crate::generator::HmacGenerator;
1453    use crate::processor::profile::{FieldRule, FileTypeProfile};
1454    use crate::processor::registry::ProcessorRegistry;
1455    use crate::scanner::{ScanConfig, ScanPattern};
1456    use std::io::Cursor;
1457    use std::sync::Mutex;
1458
1459    /// Build a test archive processor with an email pattern and a JSON profile.
1460    fn make_archive_processor() -> ArchiveProcessor {
1461        let gen = Arc::new(HmacGenerator::new([42u8; 32]));
1462        let store = Arc::new(MappingStore::new(gen, None));
1463
1464        let patterns = vec![
1465            ScanPattern::from_regex(
1466                r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
1467                Category::Email,
1468                "email",
1469            )
1470            .unwrap(),
1471            ScanPattern::from_literal("SUPERSECRET", Category::Custom("api_key".into()), "api_key")
1472                .unwrap(),
1473        ];
1474
1475        let scanner = Arc::new(
1476            StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
1477        );
1478
1479        let registry = Arc::new(ProcessorRegistry::with_builtins());
1480
1481        let profiles = vec![FileTypeProfile::new(
1482            "json",
1483            vec![FieldRule::new("*").with_category(Category::Custom("field".into()))],
1484        )
1485        .with_extension(".json")];
1486
1487        ArchiveProcessor::new(registry, scanner, store, profiles)
1488    }
1489
1490    // -- Tar tests ----------------------------------------------------------
1491
1492    fn build_test_tar(entries: &[(&str, &[u8])]) -> Vec<u8> {
1493        let mut buf = Vec::new();
1494        {
1495            let mut builder = tar::Builder::new(&mut buf);
1496            for (name, data) in entries {
1497                let mut header = tar::Header::new_gnu();
1498                header.set_size(data.len() as u64);
1499                header.set_mode(0o644);
1500                header.set_mtime(1_700_000_000);
1501                header.set_cksum();
1502                builder.append_data(&mut header, *name, *data).unwrap();
1503            }
1504            builder.finish().unwrap();
1505        }
1506        buf
1507    }
1508
1509    #[test]
1510    fn tar_sanitizes_plaintext_with_scanner() {
1511        let proc = make_archive_processor();
1512        let input = build_test_tar(&[("readme.txt", b"Contact alice@corp.com for help.")]);
1513
1514        let mut output = Vec::new();
1515        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1516
1517        assert_eq!(stats.files_processed, 1);
1518        assert_eq!(stats.scanner_fallback, 1);
1519        assert_eq!(stats.structured_hits, 0);
1520
1521        // Verify the output is a valid tar and the secret is gone.
1522        let mut archive = tar::Archive::new(&output[..]);
1523        for entry in archive.entries().unwrap() {
1524            let mut e = entry.unwrap();
1525            let mut content = String::new();
1526            e.read_to_string(&mut content).unwrap();
1527            assert!(
1528                !content.contains("alice@corp.com"),
1529                "email should be sanitized: {content}"
1530            );
1531        }
1532    }
1533
1534    #[test]
1535    fn tar_sanitizes_json_with_structured_processor() {
1536        let proc = make_archive_processor();
1537        let json_content = br#"{"email": "bob@example.org", "name": "Bob"}"#;
1538        let input = build_test_tar(&[("config.json", json_content)]);
1539
1540        let mut output = Vec::new();
1541        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1542
1543        assert_eq!(stats.files_processed, 1);
1544        assert_eq!(stats.structured_hits, 1);
1545        assert_eq!(stats.scanner_fallback, 0);
1546        assert_eq!(
1547            stats.file_methods.get("config.json").unwrap(),
1548            "structured+scan:json"
1549        );
1550
1551        // Verify sanitized output.
1552        let mut archive = tar::Archive::new(&output[..]);
1553        for entry in archive.entries().unwrap() {
1554            let mut e = entry.unwrap();
1555            let mut content = String::new();
1556            e.read_to_string(&mut content).unwrap();
1557            assert!(
1558                !content.contains("bob@example.org"),
1559                "email should be sanitized"
1560            );
1561            assert!(!content.contains("Bob"), "name should be sanitized");
1562        }
1563    }
1564
1565    #[test]
1566    fn tar_preserves_metadata() {
1567        let proc = make_archive_processor();
1568        let input = build_test_tar(&[("data.txt", b"SUPERSECRET token here")]);
1569
1570        let mut output = Vec::new();
1571        proc.process_tar(&input[..], &mut output).unwrap();
1572
1573        let mut archive = tar::Archive::new(&output[..]);
1574        for entry in archive.entries().unwrap() {
1575            let e = entry.unwrap();
1576            let hdr = e.header();
1577            assert_eq!(hdr.mode().unwrap(), 0o644);
1578            assert_eq!(hdr.mtime().unwrap(), 1_700_000_000);
1579        }
1580    }
1581
1582    #[test]
1583    fn tar_handles_multiple_files() {
1584        let proc = make_archive_processor();
1585        let input = build_test_tar(&[
1586            ("a.txt", b"alice@corp.com"),
1587            ("b.json", br#"{"key":"value"}"#),
1588            ("c.log", b"no secrets here"),
1589        ]);
1590
1591        let mut output = Vec::new();
1592        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1593
1594        assert_eq!(stats.files_processed, 3);
1595        // b.json matched the JSON profile
1596        assert_eq!(stats.structured_hits, 1);
1597        // a.txt and c.log fall back to scanner
1598        assert_eq!(stats.scanner_fallback, 2);
1599    }
1600
1601    #[test]
1602    fn tar_passes_through_directories() {
1603        let mut buf = Vec::new();
1604        {
1605            let mut builder = tar::Builder::new(&mut buf);
1606
1607            // Add a directory entry.
1608            let mut dir_header = tar::Header::new_gnu();
1609            dir_header.set_entry_type(tar::EntryType::Directory);
1610            dir_header.set_size(0);
1611            dir_header.set_mode(0o755);
1612            dir_header.set_cksum();
1613            builder
1614                .append_data(&mut dir_header, "mydir/", &b""[..])
1615                .unwrap();
1616
1617            // Add a file.
1618            let mut file_header = tar::Header::new_gnu();
1619            file_header.set_size(5);
1620            file_header.set_mode(0o644);
1621            file_header.set_cksum();
1622            builder
1623                .append_data(&mut file_header, "mydir/hello.txt", &b"hello"[..])
1624                .unwrap();
1625
1626            builder.finish().unwrap();
1627        }
1628
1629        let proc = make_archive_processor();
1630        let mut output = Vec::new();
1631        let stats = proc.process_tar(&buf[..], &mut output).unwrap();
1632
1633        assert_eq!(stats.entries_skipped, 1);
1634        assert_eq!(stats.files_processed, 1);
1635    }
1636
1637    // -- Tar.gz tests -------------------------------------------------------
1638
1639    #[test]
1640    fn tar_gz_round_trip() {
1641        let proc = make_archive_processor();
1642
1643        // Build a tar and gzip it.
1644        let tar_data = build_test_tar(&[("secret.txt", b"Key is SUPERSECRET okay")]);
1645        let mut gz_input = Vec::new();
1646        {
1647            let mut encoder =
1648                flate2::write::GzEncoder::new(&mut gz_input, flate2::Compression::fast());
1649            encoder.write_all(&tar_data).unwrap();
1650            encoder.finish().unwrap();
1651        }
1652
1653        let mut gz_output = Vec::new();
1654        let stats = proc.process_tar_gz(&gz_input[..], &mut gz_output).unwrap();
1655
1656        assert_eq!(stats.files_processed, 1);
1657        assert_eq!(stats.scanner_fallback, 1);
1658
1659        // Decompress and verify.
1660        let decoder = flate2::read::GzDecoder::new(&gz_output[..]);
1661        let mut archive = tar::Archive::new(decoder);
1662        for entry in archive.entries().unwrap() {
1663            let mut e = entry.unwrap();
1664            let mut content = String::new();
1665            e.read_to_string(&mut content).unwrap();
1666            assert!(
1667                !content.contains("SUPERSECRET"),
1668                "secret should be sanitized: {content}"
1669            );
1670        }
1671    }
1672
1673    // -- Zip tests ----------------------------------------------------------
1674
1675    fn build_test_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
1676        let mut buf = Cursor::new(Vec::new());
1677        {
1678            let mut zip = zip::ZipWriter::new(&mut buf);
1679            for (name, data) in entries {
1680                let options = zip::write::SimpleFileOptions::default()
1681                    .compression_method(zip::CompressionMethod::Deflated);
1682                zip.start_file(*name, options).unwrap();
1683                zip.write_all(data).unwrap();
1684            }
1685            zip.finish().unwrap();
1686        }
1687        buf.into_inner()
1688    }
1689
1690    #[test]
1691    fn zip_sanitizes_plaintext_with_scanner() {
1692        let proc = make_archive_processor();
1693        let zip_data = build_test_zip(&[("notes.txt", b"Reach alice@corp.com for info.")]);
1694
1695        let reader = Cursor::new(&zip_data);
1696        let mut writer = Cursor::new(Vec::new());
1697        let stats = proc.process_zip(reader, &mut writer).unwrap();
1698
1699        assert_eq!(stats.files_processed, 1);
1700        assert_eq!(stats.scanner_fallback, 1);
1701
1702        // Verify the output zip.
1703        let out_data = writer.into_inner();
1704        let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1705        let mut entry = zip_out.by_index(0).unwrap();
1706        let mut content = String::new();
1707        entry.read_to_string(&mut content).unwrap();
1708        assert!(
1709            !content.contains("alice@corp.com"),
1710            "email should be sanitized: {content}"
1711        );
1712    }
1713
1714    #[test]
1715    fn zip_sanitizes_json_with_structured_processor() {
1716        let proc = make_archive_processor();
1717        let json_content = br#"{"password": "hunter2", "host": "db.internal"}"#;
1718        let zip_data = build_test_zip(&[("settings.json", json_content)]);
1719
1720        let reader = Cursor::new(&zip_data);
1721        let mut writer = Cursor::new(Vec::new());
1722        let stats = proc.process_zip(reader, &mut writer).unwrap();
1723
1724        assert_eq!(stats.files_processed, 1);
1725        assert_eq!(stats.structured_hits, 1);
1726
1727        let out_data = writer.into_inner();
1728        let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1729        let mut entry = zip_out.by_index(0).unwrap();
1730        let mut content = String::new();
1731        entry.read_to_string(&mut content).unwrap();
1732        assert!(!content.contains("hunter2"), "password should be sanitized");
1733        assert!(!content.contains("db.internal"), "host should be sanitized");
1734    }
1735
1736    #[test]
1737    fn zip_preserves_directory_entries() {
1738        let mut buf = Cursor::new(Vec::new());
1739        {
1740            let mut zip = zip::ZipWriter::new(&mut buf);
1741
1742            let dir_options = zip::write::SimpleFileOptions::default();
1743            zip.add_directory("subdir/", dir_options).unwrap();
1744
1745            let file_options = zip::write::SimpleFileOptions::default()
1746                .compression_method(zip::CompressionMethod::Stored);
1747            zip.start_file("subdir/data.txt", file_options).unwrap();
1748            zip.write_all(b"SUPERSECRET value").unwrap();
1749
1750            zip.finish().unwrap();
1751        }
1752
1753        let zip_data = buf.into_inner();
1754        let proc = make_archive_processor();
1755        let reader = Cursor::new(&zip_data);
1756        let mut writer = Cursor::new(Vec::new());
1757        let stats = proc.process_zip(reader, &mut writer).unwrap();
1758
1759        assert_eq!(stats.entries_skipped, 1); // directory
1760        assert_eq!(stats.files_processed, 1);
1761    }
1762
1763    #[test]
1764    fn zip_handles_multiple_files() {
1765        let proc = make_archive_processor();
1766        let zip_data = build_test_zip(&[
1767            ("file1.txt", b"alice@corp.com"),
1768            ("file2.json", br#"{"secret":"SUPERSECRET"}"#),
1769            ("file3.log", b"nothing to see"),
1770        ]);
1771
1772        let reader = Cursor::new(&zip_data);
1773        let mut writer = Cursor::new(Vec::new());
1774        let stats = proc.process_zip(reader, &mut writer).unwrap();
1775
1776        assert_eq!(stats.files_processed, 3);
1777        assert_eq!(stats.structured_hits, 1); // JSON
1778        assert_eq!(stats.scanner_fallback, 2); // .txt + .log
1779    }
1780
1781    #[test]
1782    fn tar_progress_callback_receives_updates() {
1783        let updates = Arc::new(Mutex::new(Vec::new()));
1784        let proc = make_archive_processor().with_progress_callback({
1785            let updates = Arc::clone(&updates);
1786            Arc::new(move |progress| {
1787                updates
1788                    .lock()
1789                    .expect("archive progress lock")
1790                    .push(progress.clone());
1791            })
1792        });
1793        let input = build_test_tar(&[("a.txt", b"alice@corp.com"), ("b.txt", b"SUPERSECRET")]);
1794
1795        let mut output = Vec::new();
1796        let stats = proc.process_tar(&input[..], &mut output).unwrap();
1797        let updates = updates.lock().unwrap();
1798
1799        assert_eq!(updates.len(), 2);
1800        assert_eq!(updates.last().unwrap().entries_seen, 2);
1801        assert_eq!(
1802            updates.last().unwrap().files_processed,
1803            stats.files_processed
1804        );
1805        assert_eq!(updates.last().unwrap().total_entries, None);
1806    }
1807
1808    #[test]
1809    fn zip_progress_callback_reports_total_entries() {
1810        let updates = Arc::new(Mutex::new(Vec::new()));
1811        let proc = make_archive_processor().with_progress_callback({
1812            let updates = Arc::clone(&updates);
1813            Arc::new(move |progress| {
1814                updates
1815                    .lock()
1816                    .expect("archive progress lock")
1817                    .push(progress.clone());
1818            })
1819        });
1820        let zip_data = build_test_zip(&[
1821            ("file1.txt", b"alice@corp.com"),
1822            ("file2.log", b"nothing to see"),
1823        ]);
1824
1825        let reader = Cursor::new(&zip_data);
1826        let mut writer = Cursor::new(Vec::new());
1827        let stats = proc.process_zip(reader, &mut writer).unwrap();
1828        let updates = updates.lock().unwrap();
1829
1830        assert_eq!(updates.len(), 2);
1831        assert_eq!(
1832            updates.last().unwrap().files_processed,
1833            stats.files_processed
1834        );
1835        assert_eq!(updates.last().unwrap().total_entries, Some(2));
1836        assert_eq!(updates.last().unwrap().current_entry, "file2.log");
1837    }
1838
1839    // -- Format detection tests ---------------------------------------------
1840
1841    #[test]
1842    fn format_detection_from_path() {
1843        assert_eq!(
1844            ArchiveFormat::from_path("data.tar"),
1845            Some(ArchiveFormat::Tar)
1846        );
1847        assert_eq!(
1848            ArchiveFormat::from_path("data.tar.gz"),
1849            Some(ArchiveFormat::TarGz)
1850        );
1851        assert_eq!(
1852            ArchiveFormat::from_path("data.tgz"),
1853            Some(ArchiveFormat::TarGz)
1854        );
1855        assert_eq!(
1856            ArchiveFormat::from_path("data.zip"),
1857            Some(ArchiveFormat::Zip)
1858        );
1859        assert_eq!(
1860            ArchiveFormat::from_path("DATA.ZIP"),
1861            Some(ArchiveFormat::Zip)
1862        );
1863        assert_eq!(ArchiveFormat::from_path("photo.png"), None);
1864    }
1865
1866    // -- Determinism / dedup tests ------------------------------------------
1867
1868    #[test]
1869    fn same_secret_gets_same_replacement_across_entries() {
1870        let proc = make_archive_processor();
1871        let input = build_test_tar(&[
1872            ("a.txt", b"contact alice@corp.com"),
1873            ("b.txt", b"reach alice@corp.com"),
1874        ]);
1875
1876        let mut output = Vec::new();
1877        proc.process_tar(&input[..], &mut output).unwrap();
1878
1879        let mut archive = tar::Archive::new(&output[..]);
1880        let mut contents: Vec<String> = Vec::new();
1881        for entry in archive.entries().unwrap() {
1882            let mut e = entry.unwrap();
1883            let mut s = String::new();
1884            e.read_to_string(&mut s).unwrap();
1885            contents.push(s);
1886        }
1887
1888        // Both files should have the *same* replacement for alice@corp.com.
1889        // Extract the replacement by removing the prefix.
1890        let replacement_a = contents[0].strip_prefix("contact ").unwrap();
1891        let replacement_b = contents[1].strip_prefix("reach ").unwrap();
1892        assert_eq!(
1893            replacement_a, replacement_b,
1894            "dedup should produce identical replacements"
1895        );
1896        assert!(!replacement_a.contains("alice@corp.com"));
1897    }
1898
1899    // -- Auto-dispatch test -------------------------------------------------
1900
1901    #[test]
1902    fn process_auto_dispatch_tar() {
1903        let proc = make_archive_processor();
1904        let tar_data = build_test_tar(&[("f.txt", b"SUPERSECRET")]);
1905
1906        let reader = Cursor::new(tar_data);
1907        let writer = Cursor::new(Vec::new());
1908        let stats = proc.process(reader, writer, ArchiveFormat::Tar).unwrap();
1909
1910        assert_eq!(stats.files_processed, 1);
1911    }
1912
1913    #[test]
1914    fn process_auto_dispatch_zip() {
1915        let proc = make_archive_processor();
1916        let zip_data = build_test_zip(&[("f.txt", b"SUPERSECRET")]);
1917
1918        let reader = Cursor::new(zip_data);
1919        let mut writer = Cursor::new(Vec::new());
1920        let stats = proc
1921            .process(reader, &mut writer, ArchiveFormat::Zip)
1922            .unwrap();
1923
1924        assert_eq!(stats.files_processed, 1);
1925    }
1926
1927    // -- Empty archive tests ------------------------------------------------
1928
1929    #[test]
1930    fn tar_empty_archive() {
1931        let proc = make_archive_processor();
1932        let tar_data = build_test_tar(&[]);
1933
1934        let mut output = Vec::new();
1935        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
1936
1937        assert_eq!(stats.files_processed, 0);
1938        assert_eq!(stats.entries_skipped, 0);
1939    }
1940
1941    #[test]
1942    fn zip_empty_archive() {
1943        let proc = make_archive_processor();
1944        let zip_data = build_test_zip(&[]);
1945
1946        let reader = Cursor::new(zip_data);
1947        let mut writer = Cursor::new(Vec::new());
1948        let stats = proc.process_zip(reader, &mut writer).unwrap();
1949
1950        assert_eq!(stats.files_processed, 0);
1951    }
1952
    // -- sanitize_zip_entry_name tests --------------------------------------
1954
1955    #[test]
1956    fn zip_entry_name_clean_passthrough() {
1957        assert_eq!(sanitize_zip_entry_name("logs/app.log"), "logs/app.log");
1958        assert_eq!(sanitize_zip_entry_name("config.yaml"), "config.yaml");
1959        assert_eq!(sanitize_zip_entry_name("a/b/c.txt"), "a/b/c.txt");
1960    }
1961
1962    #[test]
1963    fn zip_entry_name_strips_leading_slash() {
1964        assert_eq!(sanitize_zip_entry_name("/etc/passwd"), "etc/passwd");
1965        assert_eq!(sanitize_zip_entry_name("///etc/passwd"), "etc/passwd");
1966    }
1967
1968    #[test]
1969    fn zip_entry_name_strips_dotdot() {
1970        assert_eq!(sanitize_zip_entry_name("../etc/passwd"), "etc/passwd");
1971        assert_eq!(
1972            sanitize_zip_entry_name("a/../../etc/passwd"),
1973            "a/etc/passwd"
1974        );
1975        assert_eq!(
1976            sanitize_zip_entry_name("../../root/.ssh/id_rsa"),
1977            "root/.ssh/id_rsa"
1978        );
1979    }
1980
1981    #[test]
1982    fn zip_entry_name_strips_leading_dot_slash() {
1983        assert_eq!(sanitize_zip_entry_name("./config.yaml"), "config.yaml");
1984        assert_eq!(sanitize_zip_entry_name("././config.yaml"), "config.yaml");
1985    }
1986
1987    #[test]
1988    fn zip_entry_name_backslash_normalised() {
1989        assert_eq!(sanitize_zip_entry_name("a\\b\\c.txt"), "a/b/c.txt");
1990        assert_eq!(sanitize_zip_entry_name("..\\etc\\passwd"), "etc/passwd");
1991    }
1992
1993    #[test]
1994    fn zip_entry_name_empty_result_replaced() {
1995        assert_eq!(sanitize_zip_entry_name("../.."), "_");
1996        assert_eq!(sanitize_zip_entry_name(""), "_");
1997        assert_eq!(sanitize_zip_entry_name("/"), "_");
1998    }
1999
2000    #[test]
2001    fn zip_entry_name_absolute_dotdot_combo() {
2002        assert_eq!(sanitize_zip_entry_name("/../etc/passwd"), "etc/passwd");
2003    }
2004}
sanitize_engine/processor/archive.rs

sanitize_engine/processor/
archive.rs