// sanitize_engine/report.rs
//! Structured reporting for sanitization runs.
//!
//! Generates a JSON report summarising what the sanitization tool did
//! without ever including original secret values. The report captures:
//!
//! - **Metadata**: tool version, CLI flags, timestamp.
//! - **Per-file details**: matches found, replacements applied, bytes
//!   processed, and per-pattern match counts.
//! - **Aggregated summary**: totals across all files plus wall-clock
//!   duration.
//!
//! # Thread Safety
//!
//! [`ReportBuilder`] is `Send + Sync`. Multiple threads can record file
//! results concurrently via [`ReportBuilder::record_file`], which takes
//! an internal `Mutex` only long enough to push a single entry.
//!
//! # Example
//!
//! ```rust
//! use sanitize_engine::report::{ReportBuilder, ReportMetadata, FileReport};
//! use std::collections::HashMap;
//!
//! let meta = ReportMetadata {
//!     version: "0.2.0".into(),
//!     timestamp: "2026-03-01T00:00:00Z".into(),
//!     deterministic: true,
//!     dry_run: false,
//!     strict: false,
//!     chunk_size: 1_048_576,
//!     threads: Some(4),
//!     secrets_file: Some("secrets.enc".into()),
//! };
//!
//! let builder = ReportBuilder::new(meta);
//!
//! builder.record_file(FileReport {
//!     path: "data.log".into(),
//!     matches: 42,
//!     replacements: 42,
//!     bytes_processed: 10_000,
//!     bytes_output: 10_200,
//!     pattern_counts: HashMap::from([("email".into(), 30), ("ipv4".into(), 12)]),
//!     method: "scanner".into(),
//! });
//!
//! let report = builder.finish();
//! let json = report.to_json_pretty().unwrap();
//! assert!(json.contains("\"total_matches\": 42"));
//! ```
51
use std::collections::HashMap;
use std::sync::Mutex;
use std::time::Instant;

use serde::Serialize;

use crate::scanner::ScanStats;
58
// ---------------------------------------------------------------------------
// Report structures
// ---------------------------------------------------------------------------
62
63/// Top-level sanitization report.
64///
65/// Serialized to JSON via [`Self::to_json`] / [`Self::to_json_pretty`].
66/// Never contains original secret values.
67#[derive(Debug, Clone, Serialize)]
68pub struct SanitizeReport {
69    /// Tool metadata and flags.
70    pub metadata: ReportMetadata,
71    /// Aggregated summary across all files.
72    pub summary: ReportSummary,
73    /// Per-file details.
74    pub files: Vec<FileReport>,
75}
76
77impl SanitizeReport {
78    /// Serialize the report as compact JSON.
79    ///
80    /// # Errors
81    ///
82    /// Returns [`serde_json::Error`] if serialization fails.
83    pub fn to_json(&self) -> serde_json::Result<String> {
84        serde_json::to_string(self)
85    }
86
87    /// Serialize the report as pretty-printed JSON.
88    ///
89    /// # Errors
90    ///
91    /// Returns [`serde_json::Error`] if serialization fails.
92    pub fn to_json_pretty(&self) -> serde_json::Result<String> {
93        serde_json::to_string_pretty(self)
94    }
95}
96
97/// Tool metadata embedded in every report.
98#[derive(Debug, Clone, Serialize)]
99pub struct ReportMetadata {
100    /// Crate / binary version (from `Cargo.toml`).
101    pub version: String,
102    /// ISO-8601 timestamp when the run started.
103    pub timestamp: String,
104    /// Whether `--deterministic` was used.
105    pub deterministic: bool,
106    /// Whether `--dry-run` was used.
107    pub dry_run: bool,
108    /// Whether `--strict` was used.
109    pub strict: bool,
110    /// Chunk size in bytes (`--chunk-size`).
111    pub chunk_size: usize,
112    /// Thread count (`--threads`), if specified.
113    pub threads: Option<usize>,
114    /// Path to the secrets file, if provided.
115    pub secrets_file: Option<String>,
116}
117
118/// Aggregated summary across all processed files.
119#[derive(Debug, Clone, Serialize)]
120pub struct ReportSummary {
121    /// Number of files processed.
122    pub total_files: u64,
123    /// Total pattern matches found.
124    pub total_matches: u64,
125    /// Total replacements applied.
126    pub total_replacements: u64,
127    /// Total bytes read from input(s).
128    pub total_bytes_processed: u64,
129    /// Total bytes written to output(s).
130    pub total_bytes_output: u64,
131    /// Wall-clock duration of processing in milliseconds.
132    pub duration_ms: u64,
133    /// Aggregate per-pattern match counts.
134    pub pattern_counts: HashMap<String, u64>,
135}
136
137/// Per-file result details.
138///
139/// Does **not** contain any original secret values — only counts,
140/// byte sizes, pattern labels, and the processing method used.
141#[derive(Debug, Clone, Serialize)]
142pub struct FileReport {
143    /// File path (relative or archive entry name).
144    pub path: String,
145    /// Number of matches found in this file.
146    pub matches: u64,
147    /// Number of replacements applied.
148    pub replacements: u64,
149    /// Bytes read from this file.
150    pub bytes_processed: u64,
151    /// Bytes written for this file.
152    pub bytes_output: u64,
153    /// Per-pattern match counts for this file.
154    pub pattern_counts: HashMap<String, u64>,
155    /// Processing method: `"scanner"`, `"structured:json"`, etc.
156    pub method: String,
157}
158
159impl FileReport {
160    /// Build a `FileReport` from scanner [`ScanStats`].
161    #[must_use]
162    pub fn from_scan_stats(
163        path: impl Into<String>,
164        stats: &ScanStats,
165        method: impl Into<String>,
166    ) -> Self {
167        Self {
168            path: path.into(),
169            matches: stats.matches_found,
170            replacements: stats.replacements_applied,
171            bytes_processed: stats.bytes_processed,
172            bytes_output: stats.bytes_output,
173            pattern_counts: stats.pattern_counts.clone(),
174            method: method.into(),
175        }
176    }
177}
178
// ---------------------------------------------------------------------------
// Thread-safe report builder
// ---------------------------------------------------------------------------
182
183/// Thread-safe builder that accumulates per-file results and produces
184/// a final [`SanitizeReport`].
185///
186/// Designed for concurrent use: wrap in `Arc` and share across threads.
187/// The internal `Mutex` is held only for the duration of a single
188/// `Vec::push`, so contention is negligible even at high thread counts.
189#[derive(Debug)]
190pub struct ReportBuilder {
191    metadata: ReportMetadata,
192    files: Mutex<Vec<FileReport>>,
193    start: Instant,
194}
195
196// All fields are Send + Sync natively (Mutex<Vec<_>>, Instant, owned structs),
197// so ReportBuilder auto-derives Send + Sync without unsafe.
198const _: fn() = || {
199    fn assert_send<T: Send>() {}
200    fn assert_sync<T: Sync>() {}
201    assert_send::<ReportBuilder>();
202    assert_sync::<ReportBuilder>();
203};
204
205impl ReportBuilder {
206    /// Create a new builder with the given metadata.
207    ///
208    /// The wall-clock timer starts now.
209    #[must_use]
210    pub fn new(metadata: ReportMetadata) -> Self {
211        Self {
212            metadata,
213            files: Mutex::new(Vec::new()),
214            start: Instant::now(),
215        }
216    }
217
218    /// Record the result for a single file. Thread-safe.
219    pub fn record_file(&self, file_report: FileReport) {
220        let mut files = self.files.lock().expect("report mutex poisoned");
221        files.push(file_report);
222    }
223
224    /// Record multiple file results at once (e.g., from archive processing).
225    pub fn record_files(&self, reports: impl IntoIterator<Item = FileReport>) {
226        let mut files = self.files.lock().expect("report mutex poisoned");
227        files.extend(reports);
228    }
229
230    /// Consume the builder and produce the final report.
231    ///
232    /// The duration is measured from builder creation to this call.
233    pub fn finish(self) -> SanitizeReport {
234        #[allow(clippy::cast_possible_truncation)] // duration in ms won't exceed u64
235        let duration_ms = self.start.elapsed().as_millis() as u64;
236        let files = self.files.into_inner().expect("report mutex poisoned");
237
238        // Aggregate summary.
239        let mut total_matches: u64 = 0;
240        let mut total_replacements: u64 = 0;
241        let mut total_bytes_processed: u64 = 0;
242        let mut total_bytes_output: u64 = 0;
243        let mut pattern_counts: HashMap<String, u64> = HashMap::new();
244
245        for f in &files {
246            total_matches += f.matches;
247            total_replacements += f.replacements;
248            total_bytes_processed += f.bytes_processed;
249            total_bytes_output += f.bytes_output;
250            for (pat, count) in &f.pattern_counts {
251                *pattern_counts.entry(pat.clone()).or_insert(0) += count;
252            }
253        }
254
255        let summary = ReportSummary {
256            total_files: files.len() as u64,
257            total_matches,
258            total_replacements,
259            total_bytes_processed,
260            total_bytes_output,
261            duration_ms,
262            pattern_counts,
263        };
264
265        SanitizeReport {
266            metadata: self.metadata,
267            summary,
268            files,
269        }
270    }
271}
272
// ---------------------------------------------------------------------------
// Unit tests
// ---------------------------------------------------------------------------
276
#[cfg(test)]
mod tests {
    use super::*;

    /// Common metadata fixture with all flags off.
    fn sample_metadata() -> ReportMetadata {
        ReportMetadata {
            version: "0.2.0".into(),
            timestamp: "2026-03-01T00:00:00Z".into(),
            deterministic: false,
            dry_run: false,
            strict: false,
            chunk_size: 1_048_576,
            threads: None,
            secrets_file: None,
        }
    }

    /// File fixture where byte counts are derived from the match count.
    fn sample_file_report(path: &str, matches: u64, pattern: &str) -> FileReport {
        FileReport {
            path: path.into(),
            matches,
            replacements: matches,
            bytes_processed: matches * 100,
            bytes_output: matches * 110,
            pattern_counts: HashMap::from([(pattern.into(), matches)]),
            method: "scanner".into(),
        }
    }

    // ---- Basic construction ----

    #[test]
    fn empty_report() {
        let builder = ReportBuilder::new(sample_metadata());
        let report = builder.finish();
        assert_eq!(report.summary.total_files, 0);
        assert_eq!(report.summary.total_matches, 0);
        assert!(report.files.is_empty());
    }

    #[test]
    fn single_file_report() {
        let builder = ReportBuilder::new(sample_metadata());
        builder.record_file(sample_file_report("data.log", 10, "email"));
        let report = builder.finish();

        assert_eq!(report.summary.total_files, 1);
        assert_eq!(report.summary.total_matches, 10);
        assert_eq!(report.summary.total_replacements, 10);
        assert_eq!(report.summary.total_bytes_processed, 1000);
        assert_eq!(report.summary.total_bytes_output, 1100);
        assert_eq!(*report.summary.pattern_counts.get("email").unwrap(), 10);
        assert_eq!(report.files[0].path, "data.log");
    }

    #[test]
    fn multiple_files_aggregated() {
        let builder = ReportBuilder::new(sample_metadata());
        builder.record_file(sample_file_report("a.log", 5, "email"));
        builder.record_file(sample_file_report("b.log", 3, "ipv4"));
        builder.record_file(sample_file_report("c.log", 7, "email"));
        let report = builder.finish();

        assert_eq!(report.summary.total_files, 3);
        assert_eq!(report.summary.total_matches, 15);
        assert_eq!(*report.summary.pattern_counts.get("email").unwrap(), 12);
        assert_eq!(*report.summary.pattern_counts.get("ipv4").unwrap(), 3);
    }

    // ---- JSON serialization ----

    #[test]
    fn json_serialization_no_secrets() {
        let builder = ReportBuilder::new(sample_metadata());
        builder.record_file(FileReport {
            path: "config.yaml".into(),
            matches: 2,
            replacements: 2,
            bytes_processed: 500,
            bytes_output: 520,
            pattern_counts: HashMap::from([("hostname".into(), 2)]),
            method: "structured:yaml".into(),
        });
        let report = builder.finish();
        let json = report.to_json_pretty().unwrap();

        // Must contain expected fields.
        assert!(json.contains("\"total_matches\": 2"));
        assert!(json.contains("\"version\": \"0.2.0\""));
        assert!(json.contains("\"hostname\": 2"));
        assert!(json.contains("\"method\": \"structured:yaml\""));
        assert!(json.contains("\"duration_ms\""));

        // Must NOT contain any original secret values — we only ever
        // store counts and labels, never pattern text or matched text.
        // This is a structural guarantee; verify that deserializing
        // back produces the same data without secret leakage.
        let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
        assert!(parsed["files"][0]["path"].as_str() == Some("config.yaml"));
        // No field named "secret", "original", or "value" at any level.
        let flat = json.to_lowercase();
        assert!(!flat.contains("\"original\""));
        assert!(!flat.contains("\"secret_value\""));
    }

    #[test]
    fn compact_json() {
        let builder = ReportBuilder::new(sample_metadata());
        let report = builder.finish();
        let json = report.to_json().unwrap();
        // Compact JSON has no pretty indentation.
        assert!(!json.contains("  "));
    }

    // ---- Metadata flags ----

    #[test]
    fn metadata_flags_preserved() {
        let meta = ReportMetadata {
            version: "1.0.0".into(),
            timestamp: "2026-06-15T12:00:00Z".into(),
            deterministic: true,
            dry_run: true,
            strict: true,
            chunk_size: 262_144,
            threads: Some(8),
            secrets_file: Some("secrets.enc".into()),
        };
        let builder = ReportBuilder::new(meta);
        let report = builder.finish();
        assert!(report.metadata.deterministic);
        assert!(report.metadata.dry_run);
        assert!(report.metadata.strict);
        assert_eq!(report.metadata.chunk_size, 262_144);
        assert_eq!(report.metadata.threads, Some(8));
        assert_eq!(report.metadata.secrets_file.as_deref(), Some("secrets.enc"));
    }

    // ---- Duration tracking ----

    #[test]
    fn duration_is_positive() {
        let builder = ReportBuilder::new(sample_metadata());
        // Do a tiny amount of work.
        builder.record_file(sample_file_report("x.txt", 1, "email"));
        let report = builder.finish();
        // duration_ms is unsigned, so it is ≥ 0 by construction; just
        // sanity-check it stays within a generous ceiling.
        assert!(report.summary.duration_ms < 5_000); // sanity ceiling
    }

    // ---- Thread-safe concurrent recording ----

    #[test]
    fn concurrent_recording() {
        use std::sync::Arc;
        use std::thread;

        let builder = Arc::new(ReportBuilder::new(sample_metadata()));
        let mut handles = Vec::new();

        for i in 0_u64..16 {
            let b = Arc::clone(&builder);
            handles.push(thread::spawn(move || {
                b.record_file(sample_file_report(&format!("file_{i}.log"), i + 1, "email"));
            }));
        }

        for h in handles {
            h.join().unwrap();
        }

        // We need to unwrap the Arc to call finish().
        let builder = Arc::try_unwrap(builder).expect("other refs still held");
        let report = builder.finish();

        assert_eq!(report.summary.total_files, 16);
        // Sum of 1..=16 = 136.
        assert_eq!(report.summary.total_matches, 136);
    }

    // ---- FileReport::from_scan_stats ----

    #[test]
    fn file_report_from_scan_stats() {
        let stats = ScanStats {
            bytes_processed: 2048,
            bytes_output: 2100,
            matches_found: 5,
            replacements_applied: 5,
            pattern_counts: HashMap::from([("email".into(), 3), ("ipv4".into(), 2)]),
        };
        let fr = FileReport::from_scan_stats("test.log", &stats, "scanner");
        assert_eq!(fr.path, "test.log");
        assert_eq!(fr.matches, 5);
        assert_eq!(fr.bytes_processed, 2048);
        assert_eq!(*fr.pattern_counts.get("email").unwrap(), 3);
        assert_eq!(fr.method, "scanner");
    }

    // ---- Large-file simulation ----

    #[test]
    fn large_file_report() {
        let builder = ReportBuilder::new(sample_metadata());
        // Simulate a 10 GB file processed in chunks.
        builder.record_file(FileReport {
            path: "huge.log".into(),
            matches: 1_000_000,
            replacements: 1_000_000,
            bytes_processed: 10_737_418_240, // 10 GiB
            bytes_output: 10_900_000_000,
            pattern_counts: HashMap::from([("email".into(), 600_000), ("ipv4".into(), 400_000)]),
            method: "scanner".into(),
        });
        let report = builder.finish();
        assert_eq!(report.summary.total_matches, 1_000_000);
        assert_eq!(report.summary.total_bytes_processed, 10_737_418_240);

        // JSON serialization still works for large numbers.
        let json = report.to_json().unwrap();
        assert!(json.contains("10737418240"));
    }

    // ---- record_files bulk insert ----

    #[test]
    fn record_files_bulk() {
        let builder = ReportBuilder::new(sample_metadata());
        let files: Vec<FileReport> = (0..5)
            .map(|i| sample_file_report(&format!("entry_{i}.txt"), 2, "ssn"))
            .collect();
        builder.record_files(files);
        let report = builder.finish();
        assert_eq!(report.summary.total_files, 5);
        assert_eq!(report.summary.total_matches, 10);
    }
}
513}