sanitize_engine/
report.rs

1//! Structured reporting for sanitization runs.
2//!
3//! Generates a JSON report summarising what the sanitization tool did
4//! without ever including original secret values. The report captures:
5//!
6//! - **Metadata**: tool version, CLI flags, timestamp.
7//! - **Per-file details**: matches found, replacements applied, bytes
8//!   processed, and per-pattern match counts.
9//! - **Aggregated summary**: totals across all files plus wall-clock
10//!   duration.
11//! - **Log context** (optional): keyword-matched lines with surrounding
12//!   context windows, populated when `--extract-context` is used.
13//!
14//! # Thread Safety
15//!
16//! [`ReportBuilder`] is `Send + Sync`. Multiple threads can record file
17//! results concurrently via [`ReportBuilder::record_file`], which takes
18//! an internal `Mutex` only long enough to push a single entry.
19//!
20//! # Example
21//!
22//! ```rust
23//! use sanitize_engine::log_context::{extract_context, LogContextConfig};
24//! use sanitize_engine::report::{FileReport, ReportBuilder, ReportMetadata};
25//! use std::collections::HashMap;
26//!
27//! let meta = ReportMetadata {
28//!     version: "0.4.0".into(),
29//!     timestamp: "2026-03-01T00:00:00Z".into(),
30//!     deterministic: true,
31//!     dry_run: false,
32//!     strict: false,
33//!     chunk_size: 1_048_576,
34//!     threads: Some(4),
35//!     secrets_file: Some("secrets.enc".into()),
36//! };
37//!
38//! let builder = ReportBuilder::new(meta);
39//!
40//! builder.record_file(FileReport {
41//!     path: "data.log".into(),
42//!     matches: 42,
43//!     replacements: 42,
44//!     bytes_processed: 10_000,
45//!     bytes_output: 10_200,
46//!     pattern_counts: HashMap::from([("email".into(), 30), ("ipv4".into(), 12)]),
47//!     method: "scanner".into(),
48//!     log_context: None,
49//!     match_locations: None,
50//! });
51//!
52//! // Optionally attach per-file log context (populated by --extract-context).
53//! let sanitized_output = "INFO ok\nERROR disk full\nINFO retrying";
54//! let ctx = extract_context(sanitized_output, &LogContextConfig::new().with_context_lines(1));
55//! builder.set_file_log_context("data.log", ctx);
56//!
57//! let report = builder.finish();
58//! let json = report.to_json_pretty().unwrap();
59//! assert!(json.contains("\"total_matches\": 42"));
60//! assert!(json.contains("\"log_context\""));
61//! assert!(json.contains("\"keyword\": \"error\""));
62//! ```
63
64use serde::Serialize;
65use std::collections::HashMap;
66use std::sync::Mutex;
67use std::time::Instant;
68
69use crate::log_context::LogContextResult;
70use crate::scanner::{MatchLocation, ScanStats};
71
72// ---------------------------------------------------------------------------
73// Report structures
74// ---------------------------------------------------------------------------
75
76/// Top-level sanitization report.
77///
78/// Serialized to JSON via [`Self::to_json`] / [`Self::to_json_pretty`].
79/// Never contains original secret values.
80#[derive(Debug, Clone, Serialize)]
81pub struct SanitizeReport {
82    /// Tool metadata and flags.
83    pub metadata: ReportMetadata,
84    /// Aggregated summary across all files.
85    pub summary: ReportSummary,
86    /// Per-file details. Each entry may include `log_context` when
87    /// `--extract-context` was used.
88    pub files: Vec<FileReport>,
89}
90
91impl SanitizeReport {
92    /// Serialize the report as compact JSON.
93    ///
94    /// # Errors
95    ///
96    /// Returns [`serde_json::Error`] if serialization fails.
97    pub fn to_json(&self) -> serde_json::Result<String> {
98        serde_json::to_string(self)
99    }
100
101    /// Serialize the report as pretty-printed JSON.
102    ///
103    /// # Errors
104    ///
105    /// Returns [`serde_json::Error`] if serialization fails.
106    pub fn to_json_pretty(&self) -> serde_json::Result<String> {
107        serde_json::to_string_pretty(self)
108    }
109
110    /// Serialize the report as SARIF 2.1.0 JSON.
111    ///
112    /// SARIF (Static Analysis Results Interchange Format) is consumed natively
113    /// by GitHub Advanced Security, VS Code Problems panel, and most SIEM
114    /// tooling. Results are file-level (no line numbers — the sanitize engine
115    /// operates on byte streams and does not record source positions).
116    ///
117    /// # Errors
118    ///
119    /// Returns [`serde_json::Error`] if serialization fails.
120    #[allow(clippy::too_many_lines)]
121    pub fn to_sarif(&self) -> serde_json::Result<String> {
122        use serde_json::json;
123
124        // Collect unique named pattern IDs in sorted order → one SARIF rule each.
125        // When structured-processor-only runs produce matches without named patterns,
126        // add the synthetic "sensitive_value" rule to cover those results.
127        let needs_generic = self
128            .files
129            .iter()
130            .any(|f| f.matches > 0 && f.pattern_counts.is_empty());
131
132        let mut rule_ids: Vec<&str> = self
133            .summary
134            .pattern_counts
135            .keys()
136            .map(String::as_str)
137            .collect();
138        rule_ids.sort_unstable();
139        if needs_generic {
140            rule_ids.push("sensitive_value");
141        }
142
143        let rules: Vec<serde_json::Value> = rule_ids
144            .iter()
145            .map(|&id| {
146                let (short, full) = if id == "sensitive_value" {
147                    (
148                        "Sensitive value detected".to_owned(),
149                        "One or more sensitive values were detected during sanitization and \
150                         replaced with safe substitutes. No original values are stored. \
151                         Run with a secrets file for per-pattern breakdown."
152                            .to_owned(),
153                    )
154                } else {
155                    (
156                        format!("Sensitive value of type '{}' detected", id),
157                        format!(
158                            "A sensitive value of type '{}' was detected during sanitization \
159                             and replaced with a safe substitute. No original value is stored.",
160                            id
161                        ),
162                    )
163                };
164                json!({
165                    "id": id,
166                    "name": sarif_rule_name(id),
167                    "shortDescription": { "text": short },
168                    "fullDescription": { "text": full },
169                    "defaultConfiguration": { "level": sarif_level(id) },
170                    "properties": { "tags": ["security"] }
171                })
172            })
173            .collect();
174
175        // One SARIF result per (file, pattern) pair where count > 0.
176        // Files with matches but no named breakdown emit a single generic result.
177        let mut results: Vec<serde_json::Value> = Vec::new();
178        for f in &self.files {
179            let uri = path_to_sarif_uri(&f.path);
180            let location = json!([{
181                "physicalLocation": {
182                    "artifactLocation": { "uri": uri, "uriBaseId": "%SRCROOT%" }
183                }
184            }]);
185            if f.matches > 0 && f.pattern_counts.is_empty() {
186                results.push(json!({
187                    "ruleId": "sensitive_value",
188                    "level": "warning",
189                    "message": {
190                        "text": format!(
191                            "{} sensitive value(s) detected and sanitized.",
192                            f.matches
193                        )
194                    },
195                    "locations": location
196                }));
197            } else {
198                for (pattern, &count) in &f.pattern_counts {
199                    if count == 0 {
200                        continue;
201                    }
202                    // Emit startLine when we have location data for this pattern.
203                    let first_line = f.match_locations.as_ref().and_then(|ml| {
204                        ml.locations
205                            .iter()
206                            .find(|loc| loc.pattern == *pattern)
207                            .map(|loc| loc.line)
208                    });
209                    let loc = if let Some(line) = first_line {
210                        json!([{
211                            "physicalLocation": {
212                                "artifactLocation": { "uri": &uri, "uriBaseId": "%SRCROOT%" },
213                                "region": { "startLine": line }
214                            }
215                        }])
216                    } else {
217                        location.clone()
218                    };
219                    results.push(json!({
220                        "ruleId": pattern,
221                        "level": sarif_level(pattern),
222                        "message": {
223                            "text": format!(
224                                "{} sensitive value(s) of type '{}' detected and sanitized.",
225                                count, pattern
226                            )
227                        },
228                        "locations": loc
229                    }));
230                }
231            }
232        }
233
234        let artifacts: Vec<serde_json::Value> = self
235            .files
236            .iter()
237            .map(|f| {
238                let uri = path_to_sarif_uri(&f.path);
239                json!({ "location": { "uri": uri, "uriBaseId": "%SRCROOT%" } })
240            })
241            .collect();
242
243        let sarif = json!({
244            "$schema": "https://json.schemastore.org/sarif-2.1.0.json",
245            "version": "2.1.0",
246            "runs": [{
247                "tool": {
248                    "driver": {
249                        "name": "rust-sanitize",
250                        "version": self.metadata.version,
251                        "informationUri": "https://github.com/kayelohbyte/rust-sanitize",
252                        "rules": rules
253                    }
254                },
255                "invocations": [{
256                    "executionSuccessful": true,
257                    "endTimeUtc": self.metadata.timestamp
258                }],
259                "results": results,
260                "artifacts": artifacts
261            }]
262        });
263
264        serde_json::to_string_pretty(&sarif)
265    }
266
267    /// Render the report as a self-contained HTML document.
268    ///
269    /// The output has no external dependencies (no CDN, no external fonts).
270    /// Includes a summary dashboard, per-pattern totals, and a per-file table.
271    /// Dark mode is supported via `prefers-color-scheme`.
272    #[must_use]
273    #[allow(clippy::too_many_lines, clippy::format_collect)]
274    pub fn to_html(&self) -> String {
275        let s = &self.summary;
276        let m = &self.metadata;
277
278        // --- summary cards ---------------------------------------------------
279        let cards = format!(
280            r#"<div class="cards">
281  <div class="card"><div class="card-label">Files</div><div class="card-value">{}</div></div>
282  <div class="card"><div class="card-label">Matches</div><div class="card-value">{}</div></div>
283  <div class="card"><div class="card-label">Replacements</div><div class="card-value">{}</div></div>
284  <div class="card"><div class="card-label">Input</div><div class="card-value">{}</div></div>
285  <div class="card"><div class="card-label">Duration</div><div class="card-value">{} ms</div></div>
286</div>"#,
287            s.total_files,
288            s.total_matches,
289            s.total_replacements,
290            fmt_bytes(s.total_bytes_processed),
291            s.duration_ms,
292        );
293
294        // --- pattern breakdown table (only when there are matches) -----------
295        let patterns_section = if s.total_matches > 0 {
296            let mut sorted_patterns: Vec<(&String, &u64)> = s.pattern_counts.iter().collect();
297            sorted_patterns.sort_by(|a, b| b.1.cmp(a.1).then(a.0.cmp(b.0)));
298            let rows: String = sorted_patterns
299                .iter()
300                .map(|(pat, count)| {
301                    format!("<tr><td>{}</td><td>{}</td></tr>\n", html_escape(pat), count)
302                })
303                .collect();
304            format!(
305                r#"<div class="section">
306<h2>Patterns detected</h2>
307<div class="table-wrap"><table>
308<thead><tr><th>Pattern</th><th>Total matches</th></tr></thead>
309<tbody>{}</tbody>
310</table></div></div>"#,
311                rows
312            )
313        } else {
314            String::new()
315        };
316
317        // --- per-file table --------------------------------------------------
318        let has_locations = self.files.iter().any(|f| f.match_locations.is_some());
319        let file_rows: String = self
320            .files
321            .iter()
322            .map(|f| {
323                let badges: String = {
324                    let mut pairs: Vec<(&String, &u64)> = f.pattern_counts.iter().collect();
325                    pairs.sort_by(|a, b| b.1.cmp(a.1).then(a.0.cmp(b.0)));
326                    pairs
327                        .iter()
328                        .filter(|(_, &c)| c > 0)
329                        .map(|(pat, count)| {
330                            format!(
331                                r#"<span class="badge {}">{}: {}</span>"#,
332                                sarif_badge_class(pat),
333                                html_escape(pat),
334                                count
335                            )
336                        })
337                        .collect()
338                };
339                let match_class = if f.matches > 0 { "count-positive" } else { "count-zero" };
340                let first_line_cell = if has_locations {
341                    match f.match_locations.as_ref().and_then(|ml| ml.locations.first()) {
342                        Some(loc) => {
343                            let truncated_marker = if f
344                                .match_locations
345                                .as_ref()
346                                .is_some_and(|ml| ml.truncated)
347                            {
348                                "<span title=\"more matches not shown\">…</span>"
349                            } else {
350                                ""
351                            };
352                            format!(
353                                "<td class=\"count-positive\">L{}{}</td>",
354                                loc.line, truncated_marker
355                            )
356                        }
357                        None => "<td class=\"count-zero\">—</td>".to_owned(),
358                    }
359                } else {
360                    String::new()
361                };
362                format!(
363                    "<tr><td><code>{}</code></td><td class=\"{}\">{}</td><td>{}</td>{}<td>{}</td></tr>\n",
364                    html_escape(&f.path),
365                    match_class,
366                    f.matches,
367                    html_escape(&f.method),
368                    first_line_cell,
369                    badges,
370                )
371            })
372            .collect();
373
374        let first_line_header = if has_locations {
375            "<th>First match</th>"
376        } else {
377            ""
378        };
379
380        format!(
381            r#"<!DOCTYPE html>
382<html lang="en">
383<head>
384<meta charset="utf-8">
385<meta name="viewport" content="width=device-width,initial-scale=1">
386<title>rust-sanitize report</title>
387<style>
388:root{{--bg:#f8f9fa;--surface:#fff;--border:#dee2e6;--text:#212529;--muted:#6c757d;--accent:#0d6efd;--danger:#dc3545;--warn-col:#fd7e14;--success:#198754;--badge:#e9ecef;--code-bg:#f1f3f4}}
389@media(prefers-color-scheme:dark){{:root{{--bg:#0d1117;--surface:#161b22;--border:#30363d;--text:#e6edf3;--muted:#8b949e;--accent:#58a6ff;--danger:#f85149;--warn-col:#d29922;--success:#3fb950;--badge:#21262d;--code-bg:#1c2128}}}}
390*,*::before,*::after{{box-sizing:border-box;margin:0;padding:0}}
391body{{font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif;background:var(--bg);color:var(--text);line-height:1.5;font-size:14px}}
392.container{{max-width:1100px;margin:0 auto;padding:24px 16px}}
393header{{margin-bottom:24px;padding-bottom:16px;border-bottom:1px solid var(--border)}}
394h1{{font-size:1.4rem;font-weight:600}}
395.meta{{font-size:.8rem;color:var(--muted);margin-top:4px}}
396.section{{margin-bottom:28px}}
397h2{{font-size:.95rem;font-weight:600;margin-bottom:10px}}
398.cards{{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px}}
399.card{{background:var(--surface);border:1px solid var(--border);border-radius:6px;padding:14px}}
400.card-label{{font-size:.7rem;text-transform:uppercase;letter-spacing:.05em;color:var(--muted)}}
401.card-value{{font-size:1.4rem;font-weight:600;margin-top:2px}}
402.table-wrap{{overflow-x:auto}}
403table{{width:100%;border-collapse:collapse;background:var(--surface);border:1px solid var(--border);border-radius:6px;font-size:.85rem}}
404th{{text-align:left;padding:9px 12px;border-bottom:1px solid var(--border);font-weight:600;color:var(--muted);white-space:nowrap}}
405td{{padding:9px 12px;border-bottom:1px solid var(--border);vertical-align:top}}
406tr:last-child td{{border-bottom:none}}
407tr:hover td{{background:var(--badge)}}
408code{{background:var(--code-bg);border-radius:3px;padding:1px 4px;font-size:.8rem;word-break:break-all}}
409.badge{{display:inline-block;padding:1px 7px;border-radius:12px;font-size:.72rem;font-weight:500;background:var(--badge);margin:1px}}
410.badge-pii{{background:rgba(220,53,69,.12);color:var(--danger)}}
411.badge-warn{{background:rgba(253,126,20,.12);color:var(--warn-col)}}
412.count-zero{{color:var(--muted)}}
413.count-positive{{font-weight:600}}
414footer{{margin-top:40px;padding-top:16px;border-top:1px solid var(--border);font-size:.75rem;color:var(--muted)}}
415</style>
416</head>
417<body>
418<div class="container">
419<header>
420<h1>rust-sanitize report</h1>
421<div class="meta">version {version}&nbsp;·&nbsp;{timestamp}&nbsp;·&nbsp;{duration_ms} ms total</div>
422</header>
423{cards}
424{patterns_section}
425<div class="section">
426<h2>Files</h2>
427<div class="table-wrap"><table>
428<thead><tr><th>Path</th><th>Matches</th><th>Method</th>{first_line_header}<th>Patterns</th></tr></thead>
429<tbody>{file_rows}</tbody>
430</table></div></div>
431<footer>Generated by <strong>rust-sanitize {version}</strong> on {timestamp}</footer>
432</div>
433</body>
434</html>"#,
435            version = html_escape(&m.version),
436            timestamp = html_escape(&m.timestamp),
437            duration_ms = s.duration_ms,
438            cards = cards,
439            patterns_section = patterns_section,
440            first_line_header = first_line_header,
441            file_rows = file_rows,
442        )
443    }
444}
445
446// ---------------------------------------------------------------------------
447// Private helpers
448// ---------------------------------------------------------------------------
449
/// Pick the SARIF severity level for a pattern name.
///
/// The PII and credential categories listed below are reported as
/// `"error"`; any other pattern name is reported as `"warning"`.
fn sarif_level(pattern: &str) -> &'static str {
    const ERROR_PATTERNS: [&str; 7] = [
        "email",
        "name",
        "phone",
        "credit_card",
        "ssn",
        "auth_token",
        "jwt",
    ];

    if ERROR_PATTERNS.contains(&pattern) {
        "error"
    } else {
        "warning"
    }
}
458
/// Build a CamelCase SARIF rule name from a pattern name.
///
/// The name is split on `_`, `:` and `-`; the first character of each piece
/// is upper-cased, the rest is copied unchanged, and the pieces are joined
/// without a separator. For example `"auth_token"` becomes `"AuthToken"` and
/// `"custom:password"` becomes `"CustomPassword"`.
fn sarif_rule_name(pattern: &str) -> String {
    let mut name = String::new();
    for word in pattern.split(|c: char| matches!(c, '_' | ':' | '-')) {
        let mut rest = word.chars();
        // Empty pieces (e.g. from "a__b") contribute nothing.
        if let Some(first) = rest.next() {
            name.extend(first.to_uppercase());
            name.push_str(rest.as_str());
        }
    }
    name
}
473
/// Turn a file path into the URI string used in SARIF output.
///
/// Backslashes become forward slashes; nothing else is changed, and no
/// percent-encoding is applied.
fn path_to_sarif_uri(path: &str) -> String {
    path.chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}
478
/// Choose the CSS class of a pattern badge in the HTML report.
///
/// The PII and credential categories listed below get `"badge-pii"`; every
/// other pattern name gets `"badge-warn"`.
fn sarif_badge_class(pattern: &str) -> &'static str {
    let is_pii = matches!(
        pattern,
        "email" | "name" | "phone" | "credit_card" | "ssn" | "auth_token" | "jwt"
    );
    if is_pii {
        "badge-pii"
    } else {
        "badge-warn"
    }
}
486
/// Format a byte count as a human-readable string.
///
/// Counts below 1024 are printed exactly (`"512 B"`). Larger counts are
/// printed with one decimal place in KiB, MiB or GiB; GiB is the largest
/// unit, so very large counts stay in GiB.
///
/// The unit is chosen from the value *as it will be displayed*: a count that
/// rounds up to `1024.0` of one unit is shown as `1.0` of the next unit
/// (1 048 575 bytes is `"1.0 MiB"`, not `"1024.0 KiB"`).
// Precision loss only matters above 2^53 bytes, far beyond what one decimal
// place can show.
#[allow(clippy::cast_precision_loss)]
fn fmt_bytes(bytes: u64) -> String {
    const STEP: f64 = 1024.0;
    const UNITS: [&str; 3] = ["KiB", "MiB", "GiB"];

    if bytes < 1024 {
        return format!("{bytes} B");
    }

    let mut value = bytes as f64 / STEP;
    let mut unit = 0;
    // Promote while one-decimal rounding would display >= 1024.0 and a
    // larger unit is still available. Dividing by 1024 is exact in binary
    // floating point, so promotion itself adds no error.
    while unit + 1 < UNITS.len() && (value * 10.0).round() >= STEP * 10.0 {
        value /= STEP;
        unit += 1;
    }
    format!("{value:.1} {}", UNITS[unit])
}
503
/// Escape HTML special characters to prevent injection in the HTML report.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with their entity forms, so the result
/// is safe in element content and in attribute values quoted with either
/// quote character. All other characters are copied unchanged.
fn html_escape(s: &str) -> String {
    // Single pass with one allocation; escaped output is never shorter than
    // the input, so its length is a sound lower bound for the capacity.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
511
512/// Tool metadata embedded in every report.
513#[derive(Debug, Clone, Serialize)]
514pub struct ReportMetadata {
515    /// Crate / binary version (from `Cargo.toml`).
516    pub version: String,
517    /// ISO-8601 timestamp when the run started.
518    pub timestamp: String,
519    /// Whether `--deterministic` was used.
520    pub deterministic: bool,
521    /// Whether `--dry-run` was used.
522    pub dry_run: bool,
523    /// Whether `--strict` was used.
524    pub strict: bool,
525    /// Chunk size in bytes (`--chunk-size`).
526    pub chunk_size: usize,
527    /// Thread count (`--threads`), if specified.
528    pub threads: Option<usize>,
529    /// Path to the secrets file, if provided.
530    pub secrets_file: Option<String>,
531}
532
533/// Aggregated summary across all processed files.
534#[derive(Debug, Clone, Serialize)]
535pub struct ReportSummary {
536    /// Number of files processed.
537    pub total_files: u64,
538    /// Total pattern matches found.
539    pub total_matches: u64,
540    /// Total replacements applied.
541    pub total_replacements: u64,
542    /// Total bytes read from input(s).
543    pub total_bytes_processed: u64,
544    /// Total bytes written to output(s).
545    pub total_bytes_output: u64,
546    /// Wall-clock duration of processing in milliseconds.
547    pub duration_ms: u64,
548    /// Aggregate per-pattern match counts.
549    pub pattern_counts: HashMap<String, u64>,
550}
551
552/// Per-match line-number results for a file, populated when
553/// `--max-match-locations` is non-zero and the scanner path is used.
554#[derive(Debug, Clone, Serialize)]
555pub struct MatchLocationsResult {
556    /// Individual match locations in document order.
557    pub locations: Vec<MatchLocation>,
558    /// `true` when the cap was hit and additional matches exist beyond
559    /// what is listed in `locations`.
560    pub truncated: bool,
561}
562
563/// Per-file result details.
564///
565/// Does **not** contain any original secret values — only counts,
566/// byte sizes, pattern labels, and the processing method used.
567#[derive(Debug, Clone, Serialize)]
568pub struct FileReport {
569    /// File path (relative or archive entry name).
570    pub path: String,
571    /// Number of matches found in this file.
572    pub matches: u64,
573    /// Number of replacements applied.
574    pub replacements: u64,
575    /// Bytes read from this file.
576    pub bytes_processed: u64,
577    /// Bytes written for this file.
578    pub bytes_output: u64,
579    /// Per-pattern match counts for this file.
580    pub pattern_counts: HashMap<String, u64>,
581    /// Processing method: `"scanner"`, `"structured:json"`, etc.
582    pub method: String,
583    /// Log context extraction results for this file, present when
584    /// `--extract-context` was used.
585    #[serde(skip_serializing_if = "Option::is_none")]
586    pub log_context: Option<LogContextResult>,
587    /// Per-match line numbers and byte offsets, present when
588    /// `--max-match-locations` is non-zero and the scanner path is used.
589    /// Structured-processor paths do not populate this field.
590    #[serde(skip_serializing_if = "Option::is_none")]
591    pub match_locations: Option<MatchLocationsResult>,
592}
593
594impl FileReport {
595    /// Build a `FileReport` from scanner [`ScanStats`].
596    #[must_use]
597    pub fn from_scan_stats(
598        path: impl Into<String>,
599        stats: &ScanStats,
600        method: impl Into<String>,
601    ) -> Self {
602        Self {
603            path: path.into(),
604            matches: stats.matches_found,
605            replacements: stats.replacements_applied,
606            bytes_processed: stats.bytes_processed,
607            bytes_output: stats.bytes_output,
608            pattern_counts: stats.pattern_counts.clone(),
609            method: method.into(),
610            log_context: None,
611            match_locations: None,
612        }
613    }
614
615    /// Attach per-match location data collected via
616    /// [`crate::scanner::StreamScanner::scan_reader_with_callbacks`].
617    ///
618    /// No-ops when `locations` is empty and `truncated` is false, keeping
619    /// the JSON output clean for files with no scanner matches.
620    #[must_use]
621    pub fn with_match_locations(mut self, locations: Vec<MatchLocation>, truncated: bool) -> Self {
622        if !locations.is_empty() || truncated {
623            self.match_locations = Some(MatchLocationsResult {
624                locations,
625                truncated,
626            });
627        }
628        self
629    }
630}
631
632// ---------------------------------------------------------------------------
633// Thread-safe report builder
634// ---------------------------------------------------------------------------
635
636/// Thread-safe builder that accumulates per-file results and produces
637/// a final [`SanitizeReport`].
638///
639/// Designed for concurrent use: wrap in `Arc` and share across threads.
640/// The internal `Mutex` is held only for the duration of a single
641/// `Vec::push`, so contention is negligible even at high thread counts.
642#[derive(Debug)]
643pub struct ReportBuilder {
644    metadata: ReportMetadata,
645    files: Mutex<Vec<FileReport>>,
646    start: Instant,
647}
648
649// All fields are Send + Sync natively (Mutex<Vec<_>>, Instant, owned structs),
650// so ReportBuilder auto-derives Send + Sync without unsafe.
651const _: fn() = || {
652    fn assert_send<T: Send>() {}
653    fn assert_sync<T: Sync>() {}
654    assert_send::<ReportBuilder>();
655    assert_sync::<ReportBuilder>();
656};
657
658impl ReportBuilder {
659    /// Create a new builder with the given metadata.
660    ///
661    /// The wall-clock timer starts now.
662    #[must_use]
663    pub fn new(metadata: ReportMetadata) -> Self {
664        Self {
665            metadata,
666            files: Mutex::new(Vec::new()),
667            start: Instant::now(),
668        }
669    }
670
671    /// Attach log context extraction results to the [`FileReport`] identified
672    /// by `path`. The file must already have been recorded via
673    /// [`Self::record_file`]. Thread-safe.
674    pub fn set_file_log_context(&self, path: &str, result: LogContextResult) {
675        let mut files = self.files.lock().expect("report mutex poisoned");
676        if let Some(file) = files.iter_mut().find(|f| f.path == path) {
677            file.log_context = Some(result);
678        }
679    }
680
681    /// Record the result for a single file. Thread-safe.
682    pub fn record_file(&self, file_report: FileReport) {
683        let mut files = self.files.lock().expect("report mutex poisoned");
684        files.push(file_report);
685    }
686
687    /// Record multiple file results at once (e.g., from archive processing).
688    pub fn record_files(&self, reports: impl IntoIterator<Item = FileReport>) {
689        let mut files = self.files.lock().expect("report mutex poisoned");
690        files.extend(reports);
691    }
692
693    /// Consume the builder and produce the final report.
694    ///
695    /// The duration is measured from builder creation to this call.
696    pub fn finish(self) -> SanitizeReport {
697        #[allow(clippy::cast_possible_truncation)] // duration in ms won't exceed u64
698        let duration_ms = self.start.elapsed().as_millis() as u64;
699        let files = self.files.into_inner().expect("report mutex poisoned");
700
701        // Aggregate summary.
702        let mut total_matches: u64 = 0;
703        let mut total_replacements: u64 = 0;
704        let mut total_bytes_processed: u64 = 0;
705        let mut total_bytes_output: u64 = 0;
706        let mut pattern_counts: HashMap<String, u64> = HashMap::new();
707
708        for f in &files {
709            total_matches += f.matches;
710            total_replacements += f.replacements;
711            total_bytes_processed += f.bytes_processed;
712            total_bytes_output += f.bytes_output;
713            for (pat, count) in &f.pattern_counts {
714                *pattern_counts.entry(pat.clone()).or_insert(0) += count;
715            }
716        }
717
718        let summary = ReportSummary {
719            total_files: files.len() as u64,
720            total_matches,
721            total_replacements,
722            total_bytes_processed,
723            total_bytes_output,
724            duration_ms,
725            pattern_counts,
726        };
727
728        SanitizeReport {
729            metadata: self.metadata,
730            summary,
731            files,
732        }
733    }
734}
735
736// ---------------------------------------------------------------------------
737// Unit tests
738// ---------------------------------------------------------------------------
739
740#[cfg(test)]
741mod tests {
742    use super::*;
743
744    fn sample_metadata() -> ReportMetadata {
745        ReportMetadata {
746            version: "0.2.0".into(),
747            timestamp: "2026-03-01T00:00:00Z".into(),
748            deterministic: false,
749            dry_run: false,
750            strict: false,
751            chunk_size: 1_048_576,
752            threads: None,
753            secrets_file: None,
754        }
755    }
756
757    fn sample_file_report(path: &str, matches: u64, pattern: &str) -> FileReport {
758        FileReport {
759            path: path.into(),
760            matches,
761            replacements: matches,
762            bytes_processed: matches * 100,
763            bytes_output: matches * 110,
764            pattern_counts: HashMap::from([(pattern.into(), matches)]),
765            method: "scanner".into(),
766            log_context: None,
767            match_locations: None,
768        }
769    }
770
771    // ---- Basic construction ----
772
773    #[test]
774    fn empty_report() {
775        let builder = ReportBuilder::new(sample_metadata());
776        let report = builder.finish();
777        assert_eq!(report.summary.total_files, 0);
778        assert_eq!(report.summary.total_matches, 0);
779        assert!(report.files.is_empty());
780    }
781
782    #[test]
783    fn single_file_report() {
784        let builder = ReportBuilder::new(sample_metadata());
785        builder.record_file(sample_file_report("data.log", 10, "email"));
786        let report = builder.finish();
787
788        assert_eq!(report.summary.total_files, 1);
789        assert_eq!(report.summary.total_matches, 10);
790        assert_eq!(report.summary.total_replacements, 10);
791        assert_eq!(report.summary.total_bytes_processed, 1000);
792        assert_eq!(report.summary.total_bytes_output, 1100);
793        assert_eq!(*report.summary.pattern_counts.get("email").unwrap(), 10);
794        assert_eq!(report.files[0].path, "data.log");
795    }
796
797    #[test]
798    fn multiple_files_aggregated() {
799        let builder = ReportBuilder::new(sample_metadata());
800        builder.record_file(sample_file_report("a.log", 5, "email"));
801        builder.record_file(sample_file_report("b.log", 3, "ipv4"));
802        builder.record_file(sample_file_report("c.log", 7, "email"));
803        let report = builder.finish();
804
805        assert_eq!(report.summary.total_files, 3);
806        assert_eq!(report.summary.total_matches, 15);
807        assert_eq!(*report.summary.pattern_counts.get("email").unwrap(), 12);
808        assert_eq!(*report.summary.pattern_counts.get("ipv4").unwrap(), 3);
809    }
810
811    // ---- JSON serialization ----
812
813    #[test]
814    fn json_serialization_no_secrets() {
815        let builder = ReportBuilder::new(sample_metadata());
816        builder.record_file(FileReport {
817            path: "config.yaml".into(),
818            matches: 2,
819            replacements: 2,
820            bytes_processed: 500,
821            bytes_output: 520,
822            pattern_counts: HashMap::from([("hostname".into(), 2)]),
823            method: "structured:yaml".into(),
824            log_context: None,
825            match_locations: None,
826        });
827        let report = builder.finish();
828        let json = report.to_json_pretty().unwrap();
829
830        // Must contain expected fields.
831        assert!(json.contains("\"total_matches\": 2"));
832        assert!(json.contains("\"version\": \"0.2.0\""));
833        assert!(json.contains("\"hostname\": 2"));
834        assert!(json.contains("\"method\": \"structured:yaml\""));
835        assert!(json.contains("\"duration_ms\""));
836
837        // Must NOT contain any original secret values — we only ever
838        // store counts and labels, never pattern text or matched text.
839        // This is a structural guarantee; verify that deserializing
840        // back produces the same data without secret leakage.
841        let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
842        assert!(parsed["files"][0]["path"].as_str() == Some("config.yaml"));
843        // No field named "secret", "original", or "value" at any level.
844        let flat = json.to_lowercase();
845        assert!(!flat.contains("\"original\""));
846        assert!(!flat.contains("\"secret_value\""));
847    }
848
849    #[test]
850    fn compact_json() {
851        let builder = ReportBuilder::new(sample_metadata());
852        let report = builder.finish();
853        let json = report.to_json().unwrap();
854        // Compact JSON has no pretty indentation.
855        assert!(!json.contains("  "));
856    }
857
858    // ---- Metadata flags ----
859
860    #[test]
861    fn metadata_flags_preserved() {
862        let meta = ReportMetadata {
863            version: "0.8.0".into(),
864            timestamp: "2026-06-15T12:00:00Z".into(),
865            deterministic: true,
866            dry_run: true,
867            strict: true,
868            chunk_size: 262_144,
869            threads: Some(8),
870            secrets_file: Some("secrets.enc".into()),
871        };
872        let builder = ReportBuilder::new(meta);
873        let report = builder.finish();
874        assert!(report.metadata.deterministic);
875        assert!(report.metadata.dry_run);
876        assert!(report.metadata.strict);
877        assert_eq!(report.metadata.chunk_size, 262_144);
878        assert_eq!(report.metadata.threads, Some(8));
879        assert_eq!(report.metadata.secrets_file.as_deref(), Some("secrets.enc"));
880    }
881
882    // ---- Duration tracking ----
883
884    #[test]
885    fn duration_is_positive() {
886        let builder = ReportBuilder::new(sample_metadata());
887        // Do a tiny amount of work.
888        builder.record_file(sample_file_report("x.txt", 1, "email"));
889        let report = builder.finish();
890        // Duration should be ≥ 0 (it will be 0 or 1 on fast machines).
891        assert!(report.summary.duration_ms < 5_000); // sanity ceiling
892    }
893
894    // ---- Thread-safe concurrent recording ----
895
896    #[test]
897    fn concurrent_recording() {
898        use std::sync::Arc;
899        use std::thread;
900
901        let builder = Arc::new(ReportBuilder::new(sample_metadata()));
902        let mut handles = Vec::new();
903
904        for i in 0_u64..16 {
905            let b = Arc::clone(&builder);
906            handles.push(thread::spawn(move || {
907                b.record_file(sample_file_report(&format!("file_{i}.log"), i + 1, "email"));
908            }));
909        }
910
911        for h in handles {
912            h.join().unwrap();
913        }
914
915        // We need to unwrap the Arc to call finish().
916        let builder = Arc::try_unwrap(builder).expect("other refs still held");
917        let report = builder.finish();
918
919        assert_eq!(report.summary.total_files, 16);
920        // Sum of 1..=16 = 136.
921        assert_eq!(report.summary.total_matches, 136);
922    }
923
924    // ---- FileReport::from_scan_stats ----
925
926    #[test]
927    fn file_report_from_scan_stats() {
928        let stats = ScanStats {
929            bytes_processed: 2048,
930            bytes_output: 2100,
931            matches_found: 5,
932            replacements_applied: 5,
933            pattern_counts: HashMap::from([("email".into(), 3), ("ipv4".into(), 2)]),
934        };
935        let fr = FileReport::from_scan_stats("test.log", &stats, "scanner");
936        assert_eq!(fr.path, "test.log");
937        assert_eq!(fr.matches, 5);
938        assert_eq!(fr.bytes_processed, 2048);
939        assert_eq!(*fr.pattern_counts.get("email").unwrap(), 3);
940        assert_eq!(fr.method, "scanner");
941    }
942
943    // ---- Large-file simulation ----
944
945    #[test]
946    fn large_file_report() {
947        let builder = ReportBuilder::new(sample_metadata());
948        // Simulate a 10 GB file processed in chunks.
949        builder.record_file(FileReport {
950            path: "huge.log".into(),
951            matches: 1_000_000,
952            replacements: 1_000_000,
953            bytes_processed: 10_737_418_240, // 10 GiB
954            bytes_output: 10_900_000_000,
955            pattern_counts: HashMap::from([("email".into(), 600_000), ("ipv4".into(), 400_000)]),
956            method: "scanner".into(),
957            log_context: None,
958            match_locations: None,
959        });
960        let report = builder.finish();
961        assert_eq!(report.summary.total_matches, 1_000_000);
962        assert_eq!(report.summary.total_bytes_processed, 10_737_418_240);
963
964        // JSON serialization still works for large numbers.
965        let json = report.to_json().unwrap();
966        assert!(json.contains("10737418240"));
967    }
968
969    // ---- record_files bulk insert ----
970
971    #[test]
972    fn record_files_bulk() {
973        let builder = ReportBuilder::new(sample_metadata());
974        let files: Vec<FileReport> = (0..5)
975            .map(|i| sample_file_report(&format!("entry_{i}.txt"), 2, "ssn"))
976            .collect();
977        builder.record_files(files);
978        let report = builder.finish();
979        assert_eq!(report.summary.total_files, 5);
980        assert_eq!(report.summary.total_matches, 10);
981    }
982
983    // ---- SARIF output ----
984
985    fn rich_report() -> SanitizeReport {
986        let builder = ReportBuilder::new(sample_metadata());
987        builder.record_file(FileReport {
988            path: "config.yaml".into(),
989            matches: 3,
990            replacements: 3,
991            bytes_processed: 1024,
992            bytes_output: 1100,
993            pattern_counts: HashMap::from([("auth_token".into(), 2u64), ("email".into(), 1u64)]),
994            method: "structured:yaml".into(),
995            log_context: None,
996            match_locations: None,
997        });
998        builder.record_file(FileReport {
999            path: "logs/app.log".into(),
1000            matches: 0,
1001            replacements: 0,
1002            bytes_processed: 512,
1003            bytes_output: 512,
1004            pattern_counts: HashMap::new(),
1005            method: "scanner".into(),
1006            log_context: None,
1007            match_locations: None,
1008        });
1009        builder.finish()
1010    }
1011
1012    #[test]
1013    fn sarif_is_valid_json() {
1014        let sarif = rich_report().to_sarif().unwrap();
1015        let v: serde_json::Value = serde_json::from_str(&sarif).unwrap();
1016        assert_eq!(v["version"], "2.1.0");
1017        assert_eq!(
1018            v["$schema"],
1019            "https://json.schemastore.org/sarif-2.1.0.json"
1020        );
1021    }
1022
1023    #[test]
1024    fn sarif_contains_one_run() {
1025        let v: serde_json::Value =
1026            serde_json::from_str(&rich_report().to_sarif().unwrap()).unwrap();
1027        assert_eq!(v["runs"].as_array().unwrap().len(), 1);
1028    }
1029
1030    #[test]
1031    fn sarif_driver_name_and_version() {
1032        let v: serde_json::Value =
1033            serde_json::from_str(&rich_report().to_sarif().unwrap()).unwrap();
1034        let driver = &v["runs"][0]["tool"]["driver"];
1035        assert_eq!(driver["name"], "rust-sanitize");
1036        assert_eq!(driver["version"], "0.2.0");
1037    }
1038
1039    #[test]
1040    fn sarif_rules_one_per_pattern() {
1041        let v: serde_json::Value =
1042            serde_json::from_str(&rich_report().to_sarif().unwrap()).unwrap();
1043        let rules = v["runs"][0]["tool"]["driver"]["rules"].as_array().unwrap();
1044        // Two patterns: auth_token, email.
1045        assert_eq!(rules.len(), 2);
1046        let ids: Vec<&str> = rules.iter().map(|r| r["id"].as_str().unwrap()).collect();
1047        assert!(ids.contains(&"auth_token"));
1048        assert!(ids.contains(&"email"));
1049    }
1050
1051    #[test]
1052    fn sarif_results_only_for_nonzero_counts() {
1053        let v: serde_json::Value =
1054            serde_json::from_str(&rich_report().to_sarif().unwrap()).unwrap();
1055        let results = v["runs"][0]["results"].as_array().unwrap();
1056        // logs/app.log has 0 matches → 0 results for it; config.yaml has 2 patterns.
1057        assert_eq!(results.len(), 2);
1058    }
1059
1060    #[test]
1061    fn sarif_result_level_pii_is_error() {
1062        let v: serde_json::Value =
1063            serde_json::from_str(&rich_report().to_sarif().unwrap()).unwrap();
1064        let results = v["runs"][0]["results"].as_array().unwrap();
1065        let email_result = results
1066            .iter()
1067            .find(|r| r["ruleId"] == "email")
1068            .expect("email result missing");
1069        assert_eq!(email_result["level"], "error");
1070    }
1071
1072    #[test]
1073    fn sarif_result_has_file_uri() {
1074        let v: serde_json::Value =
1075            serde_json::from_str(&rich_report().to_sarif().unwrap()).unwrap();
1076        let results = v["runs"][0]["results"].as_array().unwrap();
1077        for result in results {
1078            let uri = result["locations"][0]["physicalLocation"]["artifactLocation"]["uri"]
1079                .as_str()
1080                .unwrap();
1081            assert_eq!(uri, "config.yaml");
1082        }
1083    }
1084
1085    #[test]
1086    fn sarif_artifacts_all_files() {
1087        let v: serde_json::Value =
1088            serde_json::from_str(&rich_report().to_sarif().unwrap()).unwrap();
1089        let artifacts = v["runs"][0]["artifacts"].as_array().unwrap();
1090        assert_eq!(artifacts.len(), 2);
1091        let uris: Vec<&str> = artifacts
1092            .iter()
1093            .map(|a| a["location"]["uri"].as_str().unwrap())
1094            .collect();
1095        assert!(uris.contains(&"config.yaml"));
1096        assert!(uris.contains(&"logs/app.log"));
1097    }
1098
1099    #[test]
1100    fn sarif_windows_paths_use_forward_slash() {
1101        let builder = ReportBuilder::new(sample_metadata());
1102        builder.record_file(FileReport {
1103            path: r"src\secrets\config.json".into(),
1104            matches: 1,
1105            replacements: 1,
1106            bytes_processed: 100,
1107            bytes_output: 110,
1108            pattern_counts: HashMap::from([("auth_token".into(), 1u64)]),
1109            method: "structured:json".into(),
1110            log_context: None,
1111            match_locations: None,
1112        });
1113        let report = builder.finish();
1114        let v: serde_json::Value = serde_json::from_str(&report.to_sarif().unwrap()).unwrap();
1115        let uri = v["runs"][0]["results"][0]["locations"][0]["physicalLocation"]
1116            ["artifactLocation"]["uri"]
1117            .as_str()
1118            .unwrap();
1119        assert_eq!(uri, "src/secrets/config.json");
1120    }
1121
1122    // ---- HTML output ----
1123
1124    #[test]
1125    fn html_is_valid_document() {
1126        let html = rich_report().to_html();
1127        assert!(html.starts_with("<!DOCTYPE html>"));
1128        assert!(html.contains("</html>"));
1129        assert!(html.contains("<title>rust-sanitize report</title>"));
1130    }
1131
1132    #[test]
1133    fn html_contains_summary_stats() {
1134        let html = rich_report().to_html();
1135        // 1 file with matches + 1 clean file = 2 files total.
1136        assert!(html.contains(">2<"), "file count missing");
1137        // 3 total matches.
1138        assert!(html.contains(">3<"), "match count missing");
1139    }
1140
1141    #[test]
1142    fn html_contains_file_paths() {
1143        let html = rich_report().to_html();
1144        assert!(html.contains("config.yaml"));
1145        assert!(html.contains("logs/app.log"));
1146    }
1147
1148    #[test]
1149    fn html_escapes_special_chars() {
1150        let builder = ReportBuilder::new(sample_metadata());
1151        builder.record_file(FileReport {
1152            path: "<script>alert(1)</script>".into(),
1153            matches: 0,
1154            replacements: 0,
1155            bytes_processed: 0,
1156            bytes_output: 0,
1157            pattern_counts: HashMap::new(),
1158            method: "scanner".into(),
1159            log_context: None,
1160            match_locations: None,
1161        });
1162        let html = builder.finish().to_html();
1163        assert!(!html.contains("<script>alert(1)</script>"));
1164        assert!(html.contains("&lt;script&gt;"));
1165    }
1166
1167    #[test]
1168    fn html_no_external_resources() {
1169        let html = rich_report().to_html();
1170        // No CDN links, no external stylesheets, no external scripts.
1171        assert!(!html.contains("http://") || !html.contains("https://json.schemastore.org"));
1172        assert!(!html.contains("cdn."));
1173        assert!(!html.contains("src=\"http"));
1174        assert!(!html.contains("href=\"http"));
1175    }
1176
1177    // ---- helpers ----
1178
1179    #[test]
1180    fn sarif_rule_name_camel_case() {
1181        assert_eq!(sarif_rule_name("auth_token"), "AuthToken");
1182        assert_eq!(sarif_rule_name("email"), "Email");
1183        assert_eq!(sarif_rule_name("custom:password"), "CustomPassword");
1184        assert_eq!(sarif_rule_name("aws_arn"), "AwsArn");
1185    }
1186
1187    #[test]
1188    fn fmt_bytes_human_readable() {
1189        assert_eq!(fmt_bytes(512), "512 B");
1190        assert_eq!(fmt_bytes(1024), "1.0 KiB");
1191        assert_eq!(fmt_bytes(1536), "1.5 KiB");
1192        assert_eq!(fmt_bytes(1024 * 1024), "1.0 MiB");
1193        assert_eq!(fmt_bytes(1024 * 1024 * 1024), "1.0 GiB");
1194    }
1195
1196    #[test]
1197    fn html_escape_special_chars() {
1198        assert_eq!(html_escape("a&b"), "a&amp;b");
1199        assert_eq!(html_escape("<tag>"), "&lt;tag&gt;");
1200        assert_eq!(html_escape("\"quote\""), "&quot;quote&quot;");
1201        assert_eq!(html_escape("normal"), "normal");
1202    }
1203}
sanitize_engine/report.rs

sanitize_engine/
report.rs