sanitize_engine/processor/
archive.rs

1//! Archive processor for sanitizing files inside `.zip`, `.tar`, and `.tar.gz` archives.
2//!
3//! # Architecture
4//!
5//! ```text
6//! ┌───────────────────────┐
7//! │  Archive (zip/tar/gz) │
8//! └────────┬──────────────┘
9//!          │  for each entry
10//!          ▼
11//! ┌─────────────────────────────────────────────┐
12//! │  1. Match entry filename → FileTypeProfile  │
13//! │  2. Try ProcessorRegistry (structured)      │
14//! │  3. Fallback: StreamScanner (streaming)     │
15//! └────────┬────────────────────────────────────┘
16//!          │  sanitized bytes
17//!          ▼
18//! ┌───────────────────────┐
//! │  Rebuilt archive      │
20//! │  (same format, meta   │
21//! │   preserved)          │
22//! └───────────────────────┘
23//! ```
24//!
25//! # Memory Efficiency
26//!
27//! Archives are processed **entry-by-entry**. Each entry is piped
28//! through either a structured processor (which must buffer the full
29//! entry) or the [`StreamScanner`]
30//! (which processes in configurable chunks). This means the maximum
31//! memory footprint is proportional to the largest *single entry*
32//! that uses a structured processor. Files without a profile match
33//! are streamed through the scanner without buffering the whole entry.
34//!
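//! For very large individual files inside archives, the streaming
//! scanner path keeps only `chunk_size + overlap_size` bytes of scanner
//! state in memory. Note, however, that the tar and zip rebuild loops
//! collect each entry's sanitized output in a buffer before appending it
//! to the new archive, so peak memory still grows with the size of the
//! largest sanitized entry.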
37//!
38//! # Thread Safety
39//!
40//! [`ArchiveProcessor`] is `Send + Sync`. The underlying
41//! [`MappingStore`] provides lock-free
42//! reads for dedup consistency.
43//!
44//! # Metadata Preservation
45//!
46//! - **Tar**: modification time, permissions (mode), uid/gid, and
47//!   username/groupname are copied from the source entry.
48//! - **Zip**: modification time, compression method, and unix
49//!   permissions are preserved.
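//! - **Tar**: entries that are not regular files (directories,
//!   symlinks, etc.) are passed through unchanged.
//! - **Zip**: directory entries are recreated with their metadata;
//!   every other entry is processed as a file.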
52
53use crate::error::{Result, SanitizeError};
54use crate::processor::profile::FileTypeProfile;
55use crate::processor::registry::ProcessorRegistry;
56use crate::scanner::{ScanStats, StreamScanner};
57use crate::store::MappingStore;
58
59use std::collections::HashMap;
60use std::io::{self, Read, Seek, Write};
61use std::sync::Arc;
62
/// Maximum size (in bytes) for a single archive entry to be loaded into
/// memory for structured processing. Regular entries whose declared
/// size is larger than this are streamed through the scanner instead
/// (M-3 fix).
///
/// The same limit is applied to nested archives, which must be buffered
/// before they can be recursed into: a nested archive whose declared
/// size exceeds this value is rejected with an error rather than
/// streamed.
const MAX_STRUCTURED_ENTRY_SIZE: u64 = 256 * 1024 * 1024; // 256 MiB

/// Default maximum nesting depth for recursive archive processing.
///
/// Depth 0 is the top-level archive. Nested archives at depths 1
/// through `DEFAULT_MAX_ARCHIVE_DEPTH` are recursively extracted and
/// sanitized. Exceeding this limit returns
/// [`SanitizeError::RecursionDepthExceeded`].
///
/// Each nesting level buffers the inner archive in memory (up to
/// `MAX_STRUCTURED_ENTRY_SIZE` per level), so the hard maximum is
/// capped at 10 to bound peak memory.
pub const DEFAULT_MAX_ARCHIVE_DEPTH: u32 = 3;

/// Absolute maximum allowed value for archive nesting depth.
/// Guards against excessive memory usage (each level can buffer up to
/// 256 MiB). Depths requested through `ArchiveProcessor::with_max_depth`
/// are clamped to this value.
const MAX_ALLOWED_ARCHIVE_DEPTH: u32 = 10;
84
85// ---------------------------------------------------------------------------
86// Archive format enum
87// ---------------------------------------------------------------------------
88
/// Supported archive formats.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArchiveFormat {
    /// `.zip` archive.
    Zip,
    /// Uncompressed `.tar` archive.
    Tar,
    /// Gzip-compressed `.tar.gz` / `.tgz` archive.
    TarGz,
}

impl ArchiveFormat {
    /// Detect archive format from a file path / extension.
    ///
    /// Matching is ASCII case-insensitive:
    ///
    /// - a path ending in `.tar.gz`, or whose extension is `tgz`, is
    ///   [`ArchiveFormat::TarGz`];
    /// - an extension of `tar` is [`ArchiveFormat::Tar`];
    /// - an extension of `zip` is [`ArchiveFormat::Zip`].
    ///
    /// Returns `None` for unrecognised extensions (including a bare
    /// `.gz`, which is not treated as an archive).
    pub fn from_path(path: &str) -> Option<Self> {
        const TAR_GZ_SUFFIX: &[u8] = b".tar.gz";

        // Compare the trailing bytes directly instead of allocating a
        // lowercased copy of the whole path. Working on bytes keeps the
        // slice valid even when the path contains multi-byte characters.
        let raw = path.as_bytes();
        let has_tar_gz_suffix = raw
            .len()
            .checked_sub(TAR_GZ_SUFFIX.len())
            .is_some_and(|start| raw[start..].eq_ignore_ascii_case(TAR_GZ_SUFFIX));

        // Parse the extension once and reuse it for every candidate.
        let extension = std::path::Path::new(path).extension();
        let has_extension =
            |wanted: &str| extension.is_some_and(|ext| ext.eq_ignore_ascii_case(wanted));

        if has_tar_gz_suffix || has_extension("tgz") {
            Some(Self::TarGz)
        } else if has_extension("tar") {
            Some(Self::Tar)
        } else if has_extension("zip") {
            Some(Self::Zip)
        } else {
            None
        }
    }
}
127
128// ---------------------------------------------------------------------------
129// Archive statistics
130// ---------------------------------------------------------------------------
131
132/// Statistics collected while processing an archive.
133#[derive(Debug, Clone, Default)]
134pub struct ArchiveStats {
135    /// Number of file entries processed (excludes dirs/symlinks).
136    pub files_processed: u64,
137    /// Number of entries passed through unchanged (dirs, symlinks, etc.).
138    pub entries_skipped: u64,
139    /// Number of files handled by a structured processor.
140    pub structured_hits: u64,
141    /// Number of files handled by the streaming scanner fallback.
142    pub scanner_fallback: u64,
143    /// Number of entries that were themselves archives and processed
144    /// recursively.
145    pub nested_archives: u64,
146    /// Total input bytes across all file entries.
147    pub total_input_bytes: u64,
148    /// Total output bytes across all file entries.
149    pub total_output_bytes: u64,
150    /// Per-file processing method: filename → `"structured:<proc>"`, `"scanner"`,
151    /// or `"nested:<format>"`.
152    pub file_methods: HashMap<String, String>,
153    /// Per-file scan statistics (matches, replacements, bytes, pattern counts).
154    pub file_scan_stats: HashMap<String, ScanStats>,
155}
156
157impl ArchiveStats {
158    /// Merge statistics from a nested archive into this parent.
159    fn merge(&mut self, child: &ArchiveStats) {
160        self.files_processed += child.files_processed;
161        self.entries_skipped += child.entries_skipped;
162        self.structured_hits += child.structured_hits;
163        self.scanner_fallback += child.scanner_fallback;
164        self.nested_archives += child.nested_archives;
165        self.total_input_bytes += child.total_input_bytes;
166        self.total_output_bytes += child.total_output_bytes;
167        for (k, v) in &child.file_methods {
168            self.file_methods.insert(k.clone(), v.clone());
169        }
170        for (k, v) in &child.file_scan_stats {
171            self.file_scan_stats.insert(k.clone(), v.clone());
172        }
173    }
174}
175
176// ---------------------------------------------------------------------------
177// ArchiveProcessor
178// ---------------------------------------------------------------------------
179
/// Processes archives by sanitizing each contained file and rebuilding
/// the archive with the same format and preserved metadata.
///
/// Construct it with [`ArchiveProcessor::new`], optionally adjust the
/// nesting limit with [`ArchiveProcessor::with_max_depth`], then call
/// one of the `process*` methods.
///
/// # Usage
///
/// ```rust,no_run
/// use sanitize_engine::processor::archive::{ArchiveProcessor, ArchiveFormat};
/// use sanitize_engine::processor::registry::ProcessorRegistry;
/// use sanitize_engine::scanner::{StreamScanner, ScanPattern, ScanConfig};
/// use sanitize_engine::generator::HmacGenerator;
/// use sanitize_engine::store::MappingStore;
/// use sanitize_engine::category::Category;
/// use std::sync::Arc;
///
/// let gen = Arc::new(HmacGenerator::new([42u8; 32]));
/// let store = Arc::new(MappingStore::new(gen, None));
/// let patterns = vec![
///     ScanPattern::from_regex(r"secret\w+", Category::Custom("secret".into()), "secrets").unwrap(),
/// ];
/// let scanner = Arc::new(
///     StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
/// );
/// let registry = Arc::new(ProcessorRegistry::with_builtins());
///
/// let archive_proc = ArchiveProcessor::new(registry, scanner, store, vec![]);
/// ```
pub struct ArchiveProcessor {
    /// Registry of structured processors, tried first for entries whose
    /// filename matches one of `profiles`.
    registry: Arc<ProcessorRegistry>,
    /// Streaming scanner for fallback processing.
    scanner: Arc<StreamScanner>,
    /// Shared mapping store (one-way replacements), handed to the
    /// structured processors.
    store: Arc<MappingStore>,
    /// File-type profiles for structured processor matching; the first
    /// profile that matches a filename wins.
    profiles: Vec<FileTypeProfile>,
    /// Maximum nesting depth for recursive archive processing. Defaults
    /// to `DEFAULT_MAX_ARCHIVE_DEPTH`.
    max_depth: u32,
}
218
219impl ArchiveProcessor {
    /// Create a new archive processor.
    ///
    /// The nesting limit is initialised to
    /// [`DEFAULT_MAX_ARCHIVE_DEPTH`]; use `with_max_depth` to change it.
    ///
    /// # Arguments
    ///
    /// - `registry` — structured processor registry.
    /// - `scanner` — streaming scanner for fallback.
    /// - `store` — shared mapping store for one-way dedup replacements.
    /// - `profiles` — file-type profiles for structured matching. An
    ///   empty list means every entry goes through the scanner.
    pub fn new(
        registry: Arc<ProcessorRegistry>,
        scanner: Arc<StreamScanner>,
        store: Arc<MappingStore>,
        profiles: Vec<FileTypeProfile>,
    ) -> Self {
        Self {
            registry,
            scanner,
            store,
            profiles,
            max_depth: DEFAULT_MAX_ARCHIVE_DEPTH,
        }
    }
242
243    /// Override the maximum nesting depth for recursive archive
244    /// processing.
245    ///
246    /// The default is [`DEFAULT_MAX_ARCHIVE_DEPTH`] (3). Values above
247    /// 10 are clamped.
248    #[must_use]
249    pub fn with_max_depth(mut self, depth: u32) -> Self {
250        self.max_depth = depth.min(MAX_ALLOWED_ARCHIVE_DEPTH);
251        self
252    }
253
254    /// Find the first profile matching a filename.
255    fn find_profile(&self, filename: &str) -> Option<&FileTypeProfile> {
256        self.profiles.iter().find(|p| p.matches_filename(filename))
257    }
258
259    /// Sanitize the content of a single file entry.
260    ///
261    /// If the entry is itself an archive (detected via extension), it is
262    /// recursively processed up to `self.max_depth`. Otherwise, tries a
263    /// structured processor first; falls back to the streaming scanner
264    /// if no processor matches.
265    ///
266    /// For the streaming scanner path, the content is piped through
267    /// `scan_reader` directly to the writer for memory-efficient
268    /// chunk-based processing (F-02 fix: no full output buffering).
269    #[allow(clippy::missing_errors_doc)] // private method
270    fn sanitize_entry(
271        &self,
272        filename: &str,
273        reader: &mut dyn Read,
274        writer: &mut dyn Write,
275        stats: &mut ArchiveStats,
276        entry_size_hint: Option<u64>,
277        depth: u32,
278    ) -> Result<()> {
279        // --- Nested archive detection ---
280        if let Some(nested_fmt) = ArchiveFormat::from_path(filename) {
281            return self.sanitize_nested_archive(
282                filename,
283                reader,
284                writer,
285                stats,
286                entry_size_hint,
287                nested_fmt,
288                depth,
289            );
290        }
291
292        // --- Structured / scanner processing (unchanged) ---
293
294        // Try structured processing first, but only if the entry is
295        // within the size cap.  Oversized entries fall through to the
296        // streaming scanner (M-3 fix).
297        let within_size_cap = entry_size_hint.map_or(true, |sz| sz <= MAX_STRUCTURED_ENTRY_SIZE); // unknown size → allow (conservative)
298
299        if within_size_cap {
300            if let Some(profile) = self.find_profile(filename) {
301                // Structured processors need the full content in memory.
302                let mut content = Vec::new();
303                reader.read_to_end(&mut content).map_err(|e| {
304                    SanitizeError::ArchiveError(format!("read entry '{filename}': {e}"))
305                })?;
306
307                stats.total_input_bytes += content.len() as u64;
308
309                if let Some(output) = self.registry.process(&content, profile, &self.store)? {
310                    stats.structured_hits += 1;
311                    stats.total_output_bytes += output.len() as u64;
312                    stats.file_methods.insert(
313                        filename.to_string(),
314                        format!("structured:{}", profile.processor),
315                    );
316                    writer.write_all(&output).map_err(|e| {
317                        SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
318                    })?;
319                    return Ok(());
320                }
321
322                // Processor didn't match content heuristic — fall back to
323                // scanner with the already-buffered content.
324                let (output, scan_stats) = self.scanner.scan_bytes(&content)?;
325                stats.scanner_fallback += 1;
326                stats.total_output_bytes += output.len() as u64;
327                stats
328                    .file_methods
329                    .insert(filename.to_string(), "scanner".to_string());
330                stats
331                    .file_scan_stats
332                    .insert(filename.to_string(), scan_stats);
333                writer.write_all(&output).map_err(|e| {
334                    SanitizeError::ArchiveError(format!("write entry '{filename}': {e}"))
335                })?;
336                return Ok(());
337            }
338        }
339
340        // No profile (or entry too large) → streaming scanner.
341        // F-02 fix: stream directly from reader → scanner → writer
342        // without buffering the full output. We use a CountingWriter
343        // to track output bytes alongside the CountingReader for input.
344        let mut counting_r = CountingReader::new(reader);
345        let mut counting_w = CountingWriter::new(writer);
346        let scan_stats = self.scanner.scan_reader(&mut counting_r, &mut counting_w)?;
347
348        stats.scanner_fallback += 1;
349        stats.total_input_bytes += counting_r.bytes_read();
350        stats.total_output_bytes += counting_w.bytes_written();
351        stats
352            .file_methods
353            .insert(filename.to_string(), "scanner".to_string());
354        stats
355            .file_scan_stats
356            .insert(filename.to_string(), scan_stats);
357
358        Ok(())
359    }
360
361    /// Handle a nested archive entry: validate depth/size, buffer, recurse,
362    /// and write the sanitized output.
363    #[allow(clippy::too_many_arguments)]
364    fn sanitize_nested_archive(
365        &self,
366        filename: &str,
367        reader: &mut dyn Read,
368        writer: &mut dyn Write,
369        stats: &mut ArchiveStats,
370        entry_size_hint: Option<u64>,
371        nested_fmt: ArchiveFormat,
372        depth: u32,
373    ) -> Result<()> {
374        if depth >= self.max_depth {
375            return Err(SanitizeError::RecursionDepthExceeded(format!(
376                "nested archive '{}' at depth {} exceeds maximum nesting depth of {}",
377                filename, depth, self.max_depth,
378            )));
379        }
380
381        // Buffer the nested archive (bounded by MAX_STRUCTURED_ENTRY_SIZE).
382        if let Some(sz) = entry_size_hint {
383            if sz > MAX_STRUCTURED_ENTRY_SIZE {
384                return Err(SanitizeError::ArchiveError(format!(
385                    "nested archive '{}' is too large ({} bytes, limit {} bytes)",
386                    filename, sz, MAX_STRUCTURED_ENTRY_SIZE,
387                )));
388            }
389        }
390
391        let mut content = Vec::new();
392        reader.read_to_end(&mut content).map_err(|e| {
393            SanitizeError::ArchiveError(format!("read nested archive '{filename}': {e}"))
394        })?;
395        stats.total_input_bytes += content.len() as u64;
396
397        // Recurse into the nested archive.
398        let mut output_buf: Vec<u8> = Vec::new();
399        let child_stats = match nested_fmt {
400            ArchiveFormat::Tar => {
401                self.process_tar_at_depth(&content[..], &mut output_buf, depth + 1)?
402            }
403            ArchiveFormat::TarGz => {
404                self.process_tar_gz_at_depth(&content[..], &mut output_buf, depth + 1)?
405            }
406            ArchiveFormat::Zip => {
407                let reader = io::Cursor::new(&content);
408                let mut writer = io::Cursor::new(Vec::new());
409                let s = self.process_zip_at_depth(reader, &mut writer, depth + 1)?;
410                output_buf = writer.into_inner();
411                s
412            }
413        };
414
415        stats.nested_archives += 1;
416        stats.merge(&child_stats);
417        stats.total_output_bytes += output_buf.len() as u64;
418        let fmt_name = match nested_fmt {
419            ArchiveFormat::Tar => "tar",
420            ArchiveFormat::TarGz => "tar.gz",
421            ArchiveFormat::Zip => "zip",
422        };
423        stats
424            .file_methods
425            .insert(filename.to_string(), format!("nested:{fmt_name}"));
426        writer.write_all(&output_buf).map_err(|e| {
427            SanitizeError::ArchiveError(format!("write nested archive '{filename}': {e}"))
428        })?;
429        Ok(())
430    }
431
432    // -----------------------------------------------------------------------
433    // Tar processing
434    // -----------------------------------------------------------------------
435
    /// Process a `.tar` archive, sanitizing each file entry and
    /// rebuilding the archive with preserved metadata.
    ///
    /// Entries that are not regular files (directories, symlinks, etc.)
    /// are copied through unchanged. This is the public entry point: the
    /// archive is treated as the top level (nesting depth 0).
    ///
    /// Returns the statistics collected for the archive, including any
    /// nested archives that were processed recursively.
    ///
    /// # Errors
    ///
    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
    pub fn process_tar<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
        self.process_tar_at_depth(reader, writer, 0)
    }
449
450    /// Internal: process a tar archive at a given nesting depth.
451    fn process_tar_at_depth<R: Read, W: Write>(
452        &self,
453        reader: R,
454        writer: W,
455        depth: u32,
456    ) -> Result<ArchiveStats> {
457        let mut stats = ArchiveStats::default();
458        let mut archive = tar::Archive::new(reader);
459        let mut builder = tar::Builder::new(writer);
460
461        let entries = archive
462            .entries()
463            .map_err(|e| SanitizeError::ArchiveError(format!("read tar entries: {}", e)))?;
464
465        for entry_result in entries {
466            let mut entry = entry_result
467                .map_err(|e| SanitizeError::ArchiveError(format!("read tar entry: {}", e)))?;
468
469            let header = entry.header().clone();
470            let path = entry
471                .path()
472                .map_err(|e| SanitizeError::ArchiveError(format!("entry path: {}", e)))?
473                .to_string_lossy()
474                .to_string();
475
476            let entry_type = header.entry_type();
477
478            // Only process regular files.
479            if !entry_type.is_file() {
480                // Pass through directories, symlinks, etc. unchanged.
481                // We need to read the entry data (even if empty) to
482                // advance the archive cursor.
483                let mut data = Vec::new();
484                entry.read_to_end(&mut data).map_err(|e| {
485                    SanitizeError::ArchiveError(format!("read non-file entry '{}': {}", path, e))
486                })?;
487                builder.append(&header, &*data).map_err(|e| {
488                    SanitizeError::ArchiveError(format!("append entry '{}': {}", path, e))
489                })?;
490                stats.entries_skipped += 1;
491                continue;
492            }
493
494            // Sanitize the file content.
495            let mut sanitized_buf: Vec<u8> = Vec::new();
496            let entry_size = header.size().ok();
497            self.sanitize_entry(
498                &path,
499                &mut entry,
500                &mut sanitized_buf,
501                &mut stats,
502                entry_size,
503                depth,
504            )?;
505
506            // Build a new header with the sanitized content length but
507            // preserved metadata.
508            let mut new_header = header.clone();
509            new_header.set_size(sanitized_buf.len() as u64);
510            new_header.set_cksum();
511
512            builder.append(&new_header, &*sanitized_buf).map_err(|e| {
513                SanitizeError::ArchiveError(format!("append entry '{}': {}", path, e))
514            })?;
515
516            stats.files_processed += 1;
517        }
518
519        builder
520            .finish()
521            .map_err(|e| SanitizeError::ArchiveError(format!("finalize tar: {}", e)))?;
522
523        Ok(stats)
524    }
525
    /// Process a `.tar.gz` archive (gzip-compressed tar).
    ///
    /// Decompresses on the fly, processes each entry, and recompresses
    /// the output. This is the public entry point: the archive is
    /// treated as the top level (nesting depth 0).
    ///
    /// # Errors
    ///
    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
    pub fn process_tar_gz<R: Read, W: Write>(&self, reader: R, writer: W) -> Result<ArchiveStats> {
        self.process_tar_gz_at_depth(reader, writer, 0)
    }
538
539    /// Internal: process a tar.gz archive at a given nesting depth.
540    fn process_tar_gz_at_depth<R: Read, W: Write>(
541        &self,
542        reader: R,
543        writer: W,
544        depth: u32,
545    ) -> Result<ArchiveStats> {
546        let gz_reader = flate2::read::GzDecoder::new(reader);
547        let gz_writer = flate2::write::GzEncoder::new(writer, flate2::Compression::default());
548
549        let stats = self.process_tar_at_depth(gz_reader, gz_writer, depth)?;
550        // GzEncoder is flushed when the tar builder finishes and the
551        // encoder is dropped. The `finish()` call in `process_tar`
552        // flushes the tar builder, which flushes writes to the
553        // GzEncoder. When the GzEncoder is dropped it finalises the
554        // gzip stream.
555        Ok(stats)
556    }
557
558    // -----------------------------------------------------------------------
559    // Zip processing
560    // -----------------------------------------------------------------------
561
    /// Process a `.zip` archive, sanitizing each file entry and
    /// rebuilding the archive with preserved metadata.
    ///
    /// This is the public entry point: the archive is treated as the top
    /// level (nesting depth 0).
    ///
    /// # Type Bounds
    ///
    /// Zip requires seekable I/O for both reading and writing.
    ///
    /// # Errors
    ///
    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
    pub fn process_zip<R: Read + Seek, W: Write + Seek>(
        &self,
        reader: R,
        writer: W,
    ) -> Result<ArchiveStats> {
        self.process_zip_at_depth(reader, writer, 0)
    }
580
581    /// Internal: process a zip archive at a given nesting depth.
582    fn process_zip_at_depth<R: Read + Seek, W: Write + Seek>(
583        &self,
584        reader: R,
585        writer: W,
586        depth: u32,
587    ) -> Result<ArchiveStats> {
588        let mut stats = ArchiveStats::default();
589        let mut zip_in = zip::ZipArchive::new(reader)
590            .map_err(|e| SanitizeError::ArchiveError(format!("open zip: {}", e)))?;
591        let mut zip_out = zip::ZipWriter::new(writer);
592
593        for i in 0..zip_in.len() {
594            let mut entry = zip_in
595                .by_index(i)
596                .map_err(|e| SanitizeError::ArchiveError(format!("zip entry {}: {}", i, e)))?;
597
598            let name = entry.name().to_string();
599
600            // Security note: entry names are preserved verbatim (including any
601            // "../" or absolute-path components) because this tool writes a
602            // sanitised *archive*, not a filesystem tree.  Path traversal is
603            // therefore not exploitable here.  Consumers that later *extract*
604            // the output archive must apply their own path validation.
605
606            // Directories and non-files: pass through.
607            if entry.is_dir() {
608                let options = zip::write::FileOptions::default()
609                    .last_modified_time(entry.last_modified())
610                    .compression_method(entry.compression());
611
612                #[cfg(unix)]
613                let options = if let Some(mode) = entry.unix_mode() {
614                    options.unix_permissions(mode)
615                } else {
616                    options
617                };
618
619                zip_out.add_directory(&name, options).map_err(|e| {
620                    SanitizeError::ArchiveError(format!("add dir '{}': {}", name, e))
621                })?;
622                stats.entries_skipped += 1;
623                continue;
624            }
625
626            // Build write options preserving metadata.
627            let options = zip::write::FileOptions::default()
628                .compression_method(entry.compression())
629                .last_modified_time(entry.last_modified());
630
631            #[cfg(unix)]
632            let options = if let Some(mode) = entry.unix_mode() {
633                options.unix_permissions(mode)
634            } else {
635                options
636            };
637
638            // Sanitize the file.
639            let mut sanitized_buf: Vec<u8> = Vec::new();
640            let entry_size = Some(entry.size());
641            self.sanitize_entry(
642                &name,
643                &mut entry,
644                &mut sanitized_buf,
645                &mut stats,
646                entry_size,
647                depth,
648            )?;
649
650            zip_out.start_file(&name, options).map_err(|e| {
651                SanitizeError::ArchiveError(format!("start file '{}': {}", name, e))
652            })?;
653            zip_out.write_all(&sanitized_buf).map_err(|e| {
654                SanitizeError::ArchiveError(format!("write file '{}': {}", name, e))
655            })?;
656
657            stats.files_processed += 1;
658        }
659
660        zip_out
661            .finish()
662            .map_err(|e| SanitizeError::ArchiveError(format!("finalize zip: {}", e)))?;
663
664        Ok(stats)
665    }
666
667    // -----------------------------------------------------------------------
668    // Format-aware dispatch
669    // -----------------------------------------------------------------------
670
671    /// Auto-detect the archive format and process accordingly.
672    ///
673    /// For zip archives the reader must additionally implement `Seek`.
674    /// This method accepts `Read + Seek` to cover all formats uniformly.
675    /// Tar and tar.gz do not require seeking, but the bound is imposed
676    /// for a single entry point.
677    ///
678    /// # Errors
679    ///
680    /// Returns [`SanitizeError::ArchiveError`] on I/O failures or
681    /// [`SanitizeError::RecursionDepthExceeded`] for nested archives.
682    pub fn process<R: Read + Seek, W: Write + Seek>(
683        &self,
684        reader: R,
685        writer: W,
686        format: ArchiveFormat,
687    ) -> Result<ArchiveStats> {
688        match format {
689            ArchiveFormat::Zip => self.process_zip(reader, writer),
690            ArchiveFormat::Tar => self.process_tar(reader, writer),
691            ArchiveFormat::TarGz => self.process_tar_gz(reader, writer),
692        }
693    }
694}
695
696// ---------------------------------------------------------------------------
697// Counting reader wrapper (for input byte tracking)
698// ---------------------------------------------------------------------------
699
/// Reader adapter that forwards every read to the wrapped reader and
/// keeps a running total of the bytes it returned.
struct CountingReader<'a> {
    inner: &'a mut dyn Read,
    count: u64,
}

impl<'a> CountingReader<'a> {
    /// Wrap `inner`, starting the byte count at zero.
    fn new(inner: &'a mut dyn Read) -> Self {
        CountingReader { count: 0, inner }
    }

    /// Total number of bytes read through this wrapper so far.
    fn bytes_read(&self) -> u64 {
        self.count
    }
}

impl Read for CountingReader<'_> {
    /// Delegate to the wrapped reader; only successful reads are counted.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|filled| {
            self.count += filled as u64;
            filled
        })
    }
}
723
/// Writer adapter that forwards every write to the wrapped writer and
/// keeps a running total of the bytes it accepted (F-02 fix).
struct CountingWriter<'a> {
    inner: &'a mut dyn Write,
    count: u64,
}

impl<'a> CountingWriter<'a> {
    /// Wrap `inner`, starting the byte count at zero.
    fn new(inner: &'a mut dyn Write) -> Self {
        CountingWriter { count: 0, inner }
    }

    /// Total number of bytes written through this wrapper so far.
    fn bytes_written(&self) -> u64 {
        self.count
    }
}

impl Write for CountingWriter<'_> {
    /// Delegate to the wrapped writer; the count grows by the number of
    /// bytes the wrapped writer reports as accepted, which may be fewer
    /// than `buf.len()`.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.inner.write(buf).map(|accepted| {
            self.count += accepted as u64;
            accepted
        })
    }

    /// Flush the wrapped writer.
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}
751
752// ---------------------------------------------------------------------------
753// Tests
754// ---------------------------------------------------------------------------
755
756#[cfg(test)]
757mod tests {
758    use super::*;
759    use crate::category::Category;
760    use crate::generator::HmacGenerator;
761    use crate::processor::profile::{FieldRule, FileTypeProfile};
762    use crate::processor::registry::ProcessorRegistry;
763    use crate::scanner::{ScanConfig, ScanPattern};
764    use std::io::Cursor;
765
    /// Build a test archive processor with an email pattern and a JSON profile.
    ///
    /// The scanner knows two patterns (an email regex and the literal
    /// `SUPERSECRET`); the single profile routes `.json` files to the
    /// `"json"` structured processor with a wildcard field rule. The
    /// generator key is fixed so runs are repeatable.
    fn make_archive_processor() -> ArchiveProcessor {
        let gen = Arc::new(HmacGenerator::new([42u8; 32]));
        let store = Arc::new(MappingStore::new(gen, None));

        let patterns = vec![
            ScanPattern::from_regex(
                r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
                Category::Email,
                "email",
            )
            .unwrap(),
            ScanPattern::from_literal("SUPERSECRET", Category::Custom("api_key".into()), "api_key")
                .unwrap(),
        ];

        // The scanner and the processor share the same mapping store.
        let scanner = Arc::new(
            StreamScanner::new(patterns, Arc::clone(&store), ScanConfig::default()).unwrap(),
        );

        let registry = Arc::new(ProcessorRegistry::with_builtins());

        let profiles = vec![FileTypeProfile::new(
            "json",
            vec![FieldRule::new("*").with_category(Category::Custom("field".into()))],
        )
        .with_extension(".json")];

        ArchiveProcessor::new(registry, scanner, store, profiles)
    }
796
797    // -- Tar tests ----------------------------------------------------------
798
    /// Build an in-memory tar archive from `(name, content)` pairs.
    ///
    /// Every entry gets a GNU header with mode `0o644` and a fixed
    /// modification time, so the resulting bytes do not depend on when
    /// the test runs.
    fn build_test_tar(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let mut buf = Vec::new();
        {
            // Scope the builder so its borrow of `buf` ends before return.
            let mut builder = tar::Builder::new(&mut buf);
            for (name, data) in entries {
                let mut header = tar::Header::new_gnu();
                header.set_size(data.len() as u64);
                header.set_mode(0o644);
                header.set_mtime(1_700_000_000);
                header.set_cksum();
                builder.append_data(&mut header, *name, *data).unwrap();
            }
            builder.finish().unwrap();
        }
        buf
    }
815
816    #[test]
817    fn tar_sanitizes_plaintext_with_scanner() {
818        let proc = make_archive_processor();
819        let input = build_test_tar(&[("readme.txt", b"Contact alice@corp.com for help.")]);
820
821        let mut output = Vec::new();
822        let stats = proc.process_tar(&input[..], &mut output).unwrap();
823
824        assert_eq!(stats.files_processed, 1);
825        assert_eq!(stats.scanner_fallback, 1);
826        assert_eq!(stats.structured_hits, 0);
827
828        // Verify the output is a valid tar and the secret is gone.
829        let mut archive = tar::Archive::new(&output[..]);
830        for entry in archive.entries().unwrap() {
831            let mut e = entry.unwrap();
832            let mut content = String::new();
833            e.read_to_string(&mut content).unwrap();
834            assert!(
835                !content.contains("alice@corp.com"),
836                "email should be sanitized: {content}"
837            );
838        }
839    }
840
841    #[test]
842    fn tar_sanitizes_json_with_structured_processor() {
843        let proc = make_archive_processor();
844        let json_content = br#"{"email": "bob@example.org", "name": "Bob"}"#;
845        let input = build_test_tar(&[("config.json", json_content)]);
846
847        let mut output = Vec::new();
848        let stats = proc.process_tar(&input[..], &mut output).unwrap();
849
850        assert_eq!(stats.files_processed, 1);
851        assert_eq!(stats.structured_hits, 1);
852        assert_eq!(stats.scanner_fallback, 0);
853        assert_eq!(
854            stats.file_methods.get("config.json").unwrap(),
855            "structured:json"
856        );
857
858        // Verify sanitized output.
859        let mut archive = tar::Archive::new(&output[..]);
860        for entry in archive.entries().unwrap() {
861            let mut e = entry.unwrap();
862            let mut content = String::new();
863            e.read_to_string(&mut content).unwrap();
864            assert!(
865                !content.contains("bob@example.org"),
866                "email should be sanitized"
867            );
868            assert!(!content.contains("Bob"), "name should be sanitized");
869        }
870    }
871
872    #[test]
873    fn tar_preserves_metadata() {
874        let proc = make_archive_processor();
875        let input = build_test_tar(&[("data.txt", b"SUPERSECRET token here")]);
876
877        let mut output = Vec::new();
878        proc.process_tar(&input[..], &mut output).unwrap();
879
880        let mut archive = tar::Archive::new(&output[..]);
881        for entry in archive.entries().unwrap() {
882            let e = entry.unwrap();
883            let hdr = e.header();
884            assert_eq!(hdr.mode().unwrap(), 0o644);
885            assert_eq!(hdr.mtime().unwrap(), 1_700_000_000);
886        }
887    }
888
889    #[test]
890    fn tar_handles_multiple_files() {
891        let proc = make_archive_processor();
892        let input = build_test_tar(&[
893            ("a.txt", b"alice@corp.com"),
894            ("b.json", br#"{"key":"value"}"#),
895            ("c.log", b"no secrets here"),
896        ]);
897
898        let mut output = Vec::new();
899        let stats = proc.process_tar(&input[..], &mut output).unwrap();
900
901        assert_eq!(stats.files_processed, 3);
902        // b.json matched the JSON profile
903        assert_eq!(stats.structured_hits, 1);
904        // a.txt and c.log fall back to scanner
905        assert_eq!(stats.scanner_fallback, 2);
906    }
907
908    #[test]
909    fn tar_passes_through_directories() {
910        let mut buf = Vec::new();
911        {
912            let mut builder = tar::Builder::new(&mut buf);
913
914            // Add a directory entry.
915            let mut dir_header = tar::Header::new_gnu();
916            dir_header.set_entry_type(tar::EntryType::Directory);
917            dir_header.set_size(0);
918            dir_header.set_mode(0o755);
919            dir_header.set_cksum();
920            builder
921                .append_data(&mut dir_header, "mydir/", &b""[..])
922                .unwrap();
923
924            // Add a file.
925            let mut file_header = tar::Header::new_gnu();
926            file_header.set_size(5);
927            file_header.set_mode(0o644);
928            file_header.set_cksum();
929            builder
930                .append_data(&mut file_header, "mydir/hello.txt", &b"hello"[..])
931                .unwrap();
932
933            builder.finish().unwrap();
934        }
935
936        let proc = make_archive_processor();
937        let mut output = Vec::new();
938        let stats = proc.process_tar(&buf[..], &mut output).unwrap();
939
940        assert_eq!(stats.entries_skipped, 1);
941        assert_eq!(stats.files_processed, 1);
942    }
943
944    // -- Tar.gz tests -------------------------------------------------------
945
946    #[test]
947    fn tar_gz_round_trip() {
948        let proc = make_archive_processor();
949
950        // Build a tar and gzip it.
951        let tar_data = build_test_tar(&[("secret.txt", b"Key is SUPERSECRET okay")]);
952        let mut gz_input = Vec::new();
953        {
954            let mut encoder =
955                flate2::write::GzEncoder::new(&mut gz_input, flate2::Compression::fast());
956            encoder.write_all(&tar_data).unwrap();
957            encoder.finish().unwrap();
958        }
959
960        let mut gz_output = Vec::new();
961        let stats = proc.process_tar_gz(&gz_input[..], &mut gz_output).unwrap();
962
963        assert_eq!(stats.files_processed, 1);
964        assert_eq!(stats.scanner_fallback, 1);
965
966        // Decompress and verify.
967        let decoder = flate2::read::GzDecoder::new(&gz_output[..]);
968        let mut archive = tar::Archive::new(decoder);
969        for entry in archive.entries().unwrap() {
970            let mut e = entry.unwrap();
971            let mut content = String::new();
972            e.read_to_string(&mut content).unwrap();
973            assert!(
974                !content.contains("SUPERSECRET"),
975                "secret should be sanitized: {content}"
976            );
977        }
978    }
979
980    // -- Zip tests ----------------------------------------------------------
981
982    fn build_test_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
983        let mut buf = Cursor::new(Vec::new());
984        {
985            let mut zip = zip::ZipWriter::new(&mut buf);
986            for (name, data) in entries {
987                let options = zip::write::FileOptions::default()
988                    .compression_method(zip::CompressionMethod::Deflated);
989                zip.start_file(*name, options).unwrap();
990                zip.write_all(data).unwrap();
991            }
992            zip.finish().unwrap();
993        }
994        buf.into_inner()
995    }
996
997    #[test]
998    fn zip_sanitizes_plaintext_with_scanner() {
999        let proc = make_archive_processor();
1000        let zip_data = build_test_zip(&[("notes.txt", b"Reach alice@corp.com for info.")]);
1001
1002        let reader = Cursor::new(&zip_data);
1003        let mut writer = Cursor::new(Vec::new());
1004        let stats = proc.process_zip(reader, &mut writer).unwrap();
1005
1006        assert_eq!(stats.files_processed, 1);
1007        assert_eq!(stats.scanner_fallback, 1);
1008
1009        // Verify the output zip.
1010        let out_data = writer.into_inner();
1011        let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1012        let mut entry = zip_out.by_index(0).unwrap();
1013        let mut content = String::new();
1014        entry.read_to_string(&mut content).unwrap();
1015        assert!(
1016            !content.contains("alice@corp.com"),
1017            "email should be sanitized: {content}"
1018        );
1019    }
1020
1021    #[test]
1022    fn zip_sanitizes_json_with_structured_processor() {
1023        let proc = make_archive_processor();
1024        let json_content = br#"{"password": "hunter2", "host": "db.internal"}"#;
1025        let zip_data = build_test_zip(&[("settings.json", json_content)]);
1026
1027        let reader = Cursor::new(&zip_data);
1028        let mut writer = Cursor::new(Vec::new());
1029        let stats = proc.process_zip(reader, &mut writer).unwrap();
1030
1031        assert_eq!(stats.files_processed, 1);
1032        assert_eq!(stats.structured_hits, 1);
1033
1034        let out_data = writer.into_inner();
1035        let mut zip_out = zip::ZipArchive::new(Cursor::new(out_data)).unwrap();
1036        let mut entry = zip_out.by_index(0).unwrap();
1037        let mut content = String::new();
1038        entry.read_to_string(&mut content).unwrap();
1039        assert!(!content.contains("hunter2"), "password should be sanitized");
1040        assert!(!content.contains("db.internal"), "host should be sanitized");
1041    }
1042
1043    #[test]
1044    fn zip_preserves_directory_entries() {
1045        let mut buf = Cursor::new(Vec::new());
1046        {
1047            let mut zip = zip::ZipWriter::new(&mut buf);
1048
1049            let dir_options = zip::write::FileOptions::default();
1050            zip.add_directory("subdir/", dir_options).unwrap();
1051
1052            let file_options = zip::write::FileOptions::default()
1053                .compression_method(zip::CompressionMethod::Stored);
1054            zip.start_file("subdir/data.txt", file_options).unwrap();
1055            zip.write_all(b"SUPERSECRET value").unwrap();
1056
1057            zip.finish().unwrap();
1058        }
1059
1060        let zip_data = buf.into_inner();
1061        let proc = make_archive_processor();
1062        let reader = Cursor::new(&zip_data);
1063        let mut writer = Cursor::new(Vec::new());
1064        let stats = proc.process_zip(reader, &mut writer).unwrap();
1065
1066        assert_eq!(stats.entries_skipped, 1); // directory
1067        assert_eq!(stats.files_processed, 1);
1068    }
1069
1070    #[test]
1071    fn zip_handles_multiple_files() {
1072        let proc = make_archive_processor();
1073        let zip_data = build_test_zip(&[
1074            ("file1.txt", b"alice@corp.com"),
1075            ("file2.json", br#"{"secret":"SUPERSECRET"}"#),
1076            ("file3.log", b"nothing to see"),
1077        ]);
1078
1079        let reader = Cursor::new(&zip_data);
1080        let mut writer = Cursor::new(Vec::new());
1081        let stats = proc.process_zip(reader, &mut writer).unwrap();
1082
1083        assert_eq!(stats.files_processed, 3);
1084        assert_eq!(stats.structured_hits, 1); // JSON
1085        assert_eq!(stats.scanner_fallback, 2); // .txt + .log
1086    }
1087
1088    // -- Format detection tests ---------------------------------------------
1089
1090    #[test]
1091    fn format_detection_from_path() {
1092        assert_eq!(
1093            ArchiveFormat::from_path("data.tar"),
1094            Some(ArchiveFormat::Tar)
1095        );
1096        assert_eq!(
1097            ArchiveFormat::from_path("data.tar.gz"),
1098            Some(ArchiveFormat::TarGz)
1099        );
1100        assert_eq!(
1101            ArchiveFormat::from_path("data.tgz"),
1102            Some(ArchiveFormat::TarGz)
1103        );
1104        assert_eq!(
1105            ArchiveFormat::from_path("data.zip"),
1106            Some(ArchiveFormat::Zip)
1107        );
1108        assert_eq!(
1109            ArchiveFormat::from_path("DATA.ZIP"),
1110            Some(ArchiveFormat::Zip)
1111        );
1112        assert_eq!(ArchiveFormat::from_path("photo.png"), None);
1113    }
1114
1115    // -- Determinism / dedup tests ------------------------------------------
1116
1117    #[test]
1118    fn same_secret_gets_same_replacement_across_entries() {
1119        let proc = make_archive_processor();
1120        let input = build_test_tar(&[
1121            ("a.txt", b"contact alice@corp.com"),
1122            ("b.txt", b"reach alice@corp.com"),
1123        ]);
1124
1125        let mut output = Vec::new();
1126        proc.process_tar(&input[..], &mut output).unwrap();
1127
1128        let mut archive = tar::Archive::new(&output[..]);
1129        let mut contents: Vec<String> = Vec::new();
1130        for entry in archive.entries().unwrap() {
1131            let mut e = entry.unwrap();
1132            let mut s = String::new();
1133            e.read_to_string(&mut s).unwrap();
1134            contents.push(s);
1135        }
1136
1137        // Both files should have the *same* replacement for alice@corp.com.
1138        // Extract the replacement by removing the prefix.
1139        let replacement_a = contents[0].strip_prefix("contact ").unwrap();
1140        let replacement_b = contents[1].strip_prefix("reach ").unwrap();
1141        assert_eq!(
1142            replacement_a, replacement_b,
1143            "dedup should produce identical replacements"
1144        );
1145        assert!(!replacement_a.contains("alice@corp.com"));
1146    }
1147
1148    // -- Auto-dispatch test -------------------------------------------------
1149
1150    #[test]
1151    fn process_auto_dispatch_tar() {
1152        let proc = make_archive_processor();
1153        let tar_data = build_test_tar(&[("f.txt", b"SUPERSECRET")]);
1154
1155        let reader = Cursor::new(tar_data);
1156        let writer = Cursor::new(Vec::new());
1157        let stats = proc.process(reader, writer, ArchiveFormat::Tar).unwrap();
1158
1159        assert_eq!(stats.files_processed, 1);
1160    }
1161
1162    #[test]
1163    fn process_auto_dispatch_zip() {
1164        let proc = make_archive_processor();
1165        let zip_data = build_test_zip(&[("f.txt", b"SUPERSECRET")]);
1166
1167        let reader = Cursor::new(zip_data);
1168        let mut writer = Cursor::new(Vec::new());
1169        let stats = proc
1170            .process(reader, &mut writer, ArchiveFormat::Zip)
1171            .unwrap();
1172
1173        assert_eq!(stats.files_processed, 1);
1174    }
1175
1176    // -- Empty archive tests ------------------------------------------------
1177
1178    #[test]
1179    fn tar_empty_archive() {
1180        let proc = make_archive_processor();
1181        let tar_data = build_test_tar(&[]);
1182
1183        let mut output = Vec::new();
1184        let stats = proc.process_tar(&tar_data[..], &mut output).unwrap();
1185
1186        assert_eq!(stats.files_processed, 0);
1187        assert_eq!(stats.entries_skipped, 0);
1188    }
1189
1190    #[test]
1191    fn zip_empty_archive() {
1192        let proc = make_archive_processor();
1193        let zip_data = build_test_zip(&[]);
1194
1195        let reader = Cursor::new(zip_data);
1196        let mut writer = Cursor::new(Vec::new());
1197        let stats = proc.process_zip(reader, &mut writer).unwrap();
1198
1199        assert_eq!(stats.files_processed, 0);
1200    }
1201}
sanitize_engine/processor/archive.rs

sanitize_engine/processor/
archive.rs