Skip to main content

sanitize_engine/processor/
mod.rs

1//! Structured processors for format-aware sanitization.
2//!
3//! # Architecture
4//!
5//! ```text
6//! ┌──────────────────┐     ┌───────────────────┐     ┌──────────────────┐
7//! │  Input bytes     │ ──▶ │ ProcessorRegistry  │ ──▶ │  Output bytes    │
8//! │  (file content)  │     │ (profile matching) │     │  (sanitized)     │
9//! └──────────────────┘     └────────┬───────────┘     └──────────────────┘
10//!                                   │
11//!                          ┌────────▼────────┐
12//!                          │ dyn Processor    │
13//!                          │                  │
14//!                          │  KeyValue        │ ← gitlab.rb-style
15//!                          │  JsonProcessor   │ ← JSON files
16//!                          │  YamlProcessor   │ ← YAML files
17//!                          │  XmlProcessor    │ ← XML files
18//!                          │  CsvProcessor    │ ← CSV/TSV files
19//!                          └────────┬────────┘
20//!                                   │
21//!                          ┌────────▼────────┐
22//!                          │  MappingStore    │
23//!                          │  (one-way dedup) │
24//!                          └─────────────────┘
25//! ```
26//!
27//! # File-Type Profiles
28//!
29//! A [`FileTypeProfile`] specifies which processor to use and what
30//! fields/keys to sanitize. Users provide profiles to control which
31//! parts of a structured file are replaced. If no profile matches,
32//! the caller falls back to the streaming scanner.
33//!
34//! # Extensibility
35//!
36//! Implement the [`Processor`] trait and register it with the
37//! [`ProcessorRegistry`]. The registry matches profiles to processors
38//! by name and dispatches processing.
39
40pub mod archive;
41pub mod csv_proc;
42pub mod env_proc;
43pub mod ini_proc;
44pub mod json_proc;
45pub mod jsonl_proc;
46pub mod key_value;
47pub(crate) mod limits;
48pub mod log_line;
49pub mod profile;
50pub mod registry;
51pub mod toml_proc;
52pub mod xml_proc;
53pub mod yaml_proc;
54
55// Re-export core types.
56pub use profile::{FieldNameSignal, FieldRule, FileTypeProfile, DEFAULT_FIELD_SIGNAL_THRESHOLD};
57pub use registry::ProcessorRegistry;
58
59use crate::category::Category;
60use crate::error::{Result, SanitizeError};
61use crate::store::MappingStore;
62use std::io;
63
64// ---------------------------------------------------------------------------
65// Processor trait
66// ---------------------------------------------------------------------------
67
68/// A structured processor that can sanitize a specific file format while
69/// preserving its structure and formatting as much as possible.
70///
71/// Processors are **stateless** — all mutable state lives in the
72/// [`MappingStore`] they receive. This makes processors `Send + Sync`
73/// and reusable across files.
74///
75/// # Contract
76///
77/// - `name()` must return a unique, lowercase identifier (e.g. `"json"`).
78/// - `can_handle()` is a fast heuristic check; it may inspect a few
79///   bytes or the file extension but should not fully parse.
80/// - `process()` performs the full structured sanitization. It should
81///   preserve formatting/whitespace where possible and only replace
82///   values in fields matched by the profile's [`FieldRule`]s.
83/// - Replacements are **one-way** via the `MappingStore` — no reverse
84///   mapping is produced.
85pub trait Processor: Send + Sync {
86    /// Unique name for this processor (e.g. `"json"`, `"yaml"`, `"key_value"`).
87    fn name(&self) -> &'static str;
88
89    /// Quick heuristic: can this processor handle the given content?
90    ///
91    /// Implementations may check magic bytes, file extension hints in
92    /// the profile, or the first few bytes of content. This is called
93    /// before `process()` and should be fast.
94    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool;
95
96    /// Process the content, replacing matched field values one-way.
97    ///
98    /// # Arguments
99    ///
100    /// - `content` — raw file bytes.
101    /// - `profile` — the user-supplied profile with field rules.
102    /// - `store` — the mapping store for dedup-consistent one-way replacements.
103    ///
104    /// # Returns
105    ///
106    /// The sanitized content as bytes, preserving structure/formatting
107    /// where possible.
108    ///
109    /// # Errors
110    ///
111    /// Returns [`SanitizeError`] if parsing or replacement generation fails.
112    fn process(
113        &self,
114        content: &[u8],
115        profile: &FileTypeProfile,
116        store: &MappingStore,
117    ) -> Result<Vec<u8>>;
118
119    /// Whether this processor supports bounded-memory streaming via
120    /// [`process_stream`](Self::process_stream).
121    ///
122    /// Processors that return `true` here are eligible for the streaming
123    /// structured path in the CLI, which opens the file as a reader instead
124    /// of reading it fully into memory. The default is `false`.
125    fn supports_streaming(&self) -> bool {
126        false
127    }
128
129    /// Process content from a reader, writing sanitized output to a writer.
130    ///
131    /// The default implementation reads the entire reader into memory and
132    /// delegates to [`process`](Self::process). Processors that return
133    /// `true` from [`supports_streaming`](Self::supports_streaming) should
134    /// override this to handle data incrementally, keeping memory usage
135    /// bounded regardless of input size.
136    ///
137    /// # Errors
138    ///
139    /// Returns [`SanitizeError`] on read, parse,
140    /// or write failure.
141    fn process_stream(
142        &self,
143        reader: &mut dyn io::Read,
144        writer: &mut dyn io::Write,
145        profile: &FileTypeProfile,
146        store: &MappingStore,
147    ) -> Result<()> {
148        let mut buf = Vec::new();
149        io::Read::read_to_end(reader, &mut buf)?;
150        let out = self.process(&buf, profile, store)?;
151        io::Write::write_all(writer, &out)?;
152        Ok(())
153    }
154}
155
156// ---------------------------------------------------------------------------
157// Helpers shared across processors
158// ---------------------------------------------------------------------------
159
160/// Replace a value through the mapping store using a field rule's category.
161///
162/// Returns the original `value` unchanged when it is shorter than
163/// `rule.min_length` (if set). This prevents broad glob patterns like
164/// `*token*` from redacting obviously non-secret values such as `"false"`,
165/// `"0"`, or `"nil"`.
166pub(crate) fn replace_value(value: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
167    if let Some(min) = rule.min_length {
168        if value.len() < min {
169            return Ok(value.to_string());
170        }
171    }
172    let category = rule
173        .category
174        .clone()
175        .unwrap_or(Category::Custom("field".into()));
176    let sanitized = store.get_or_insert(&category, value)?;
177    Ok(sanitized.to_string())
178}
179
/// Join `prefix` and `key` into a dot-separated key path.
///
/// An empty `prefix` means `key` is a top-level key, so it is returned as-is
/// without a leading dot.
#[must_use]
pub(crate) fn build_path(prefix: &str, key: &str) -> String {
    match prefix {
        "" => key.to_owned(),
        parent => format!("{parent}.{key}"),
    }
}
191
192/// Check whether a single glob `pattern` matches `key_path`.
193///
194/// `*` is the only wildcard character. It matches any sequence of characters,
195/// including empty strings and path separators (`.`, `[`, `]`).
196///
197/// | Pattern | Matches |
198/// |---------|---------|
199/// | `"*"` | anything |
200/// | `"password"` | `"password"` exactly |
201/// | `"*.password"` | `"password"`, `"db.password"`, `"a.b.password"` |
202/// | `"db.*"` | `"db.host"`, `"db.port"`, `"db.nested.key"` |
203/// | `"*password*"` | any key containing `"password"` as a substring |
204/// | `"*['smtp_password']"` | `"gitlab_rails['smtp_password']"` (bracket notation) |
205#[must_use]
206pub(crate) fn pattern_matches(pattern: &str, key_path: &str) -> bool {
207    // Fast path: `*` matches everything.
208    if pattern == "*" {
209        return true;
210    }
211    // Fast path: exact match.
212    if pattern == key_path {
213        return true;
214    }
215    // Fast path: no wildcards — only the exact match above can succeed.
216    if !pattern.contains('*') {
217        return false;
218    }
219    // Dot-path glob: `*.suffix` — requires a dot boundary before the suffix
220    // so that `*.password` matches `db.password` but not `dbpassword`.
221    if let Some(suffix) = pattern.strip_prefix("*.") {
222        if !suffix.contains('*')
223            && (key_path == suffix
224                || key_path
225                    .strip_suffix(suffix)
226                    .is_some_and(|rest| rest.ends_with('.')))
227        {
228            return true;
229        }
230    }
231    // Dot-path glob: `prefix.*` — `db.*` matches `db.host`, `db.nested.key`.
232    if let Some(prefix) = pattern.strip_suffix(".*") {
233        if !prefix.contains('*')
234            && key_path
235                .strip_prefix(prefix)
236                .is_some_and(|rest| rest.starts_with('.'))
237        {
238            return true;
239        }
240    }
241    // General multi-wildcard glob: split on `*` and verify segments appear in
242    // order. This handles patterns like `*password*`, `*['key']`, `a*b*c`.
243    glob_matches(pattern, key_path)
244}
245
246use crate::allowlist::glob_matches;
247
/// Compute the Shannon entropy of `data` in bits per byte.
///
/// Returns `0.0` for empty input. Symbols are individual bytes, so the result
/// lies between `0.0` (one repeated byte) and `8.0` (all 256 byte values
/// equally frequent). A fixed 256-element frequency table keeps the cost at
/// O(n) time and O(1) space regardless of alphabet size.
#[inline]
#[allow(clippy::cast_precision_loss)]
pub(crate) fn shannon_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    // `usize` counters cannot overflow: no bucket can ever exceed
    // `data.len()`, which itself fits in a `usize`.
    let mut counts = [0usize; 256];
    for &byte in data {
        counts[usize::from(byte)] += 1;
    }
    let total = data.len() as f64;
    counts
        .iter()
        .filter(|&&count| count > 0)
        .map(|&count| {
            let p = count as f64 / total;
            -p * p.log2()
        })
        .sum()
}
272
273/// Return the first [`FieldNameSignal`] whose key pattern matches `key`.
274///
275/// `key` is the **bare** field name (leaf key only, not the full dot-path).
276#[must_use]
277pub(crate) fn find_field_signal<'a>(
278    key: &str,
279    signals: &'a [FieldNameSignal],
280) -> Option<&'a FieldNameSignal> {
281    signals.iter().find(|sig| sig.matches_key(key))
282}
283
284/// Replace `value` via the mapping store when its entropy meets the signal's gate.
285///
286/// Returns `Some(replacement)` when the value's Shannon entropy is at or above
287/// `sig.threshold`, or `None` when the entropy is too low to be a real secret
288/// (e.g. `"Bearer"`, `"basic"`, `"true"`).
289pub(crate) fn replace_by_signal(
290    value: &str,
291    sig: &FieldNameSignal,
292    store: &MappingStore,
293) -> Result<Option<String>> {
294    if value.is_empty() {
295        return Ok(None);
296    }
297    if shannon_entropy(value.as_bytes()) < sig.threshold {
298        return Ok(None);
299    }
300    let replaced = store.get_or_insert(&sig.category, value)?;
301    Ok(Some(replaced.to_string()))
302}
303
304/// Return the first rule in `profile` whose pattern matches `key_path`.
305///
306/// Supports exact matches and glob patterns — see [`pattern_matches`] for the
307/// full pattern syntax including dot-path globs and bracket notation.
308#[must_use]
309pub(crate) fn find_matching_rule<'a>(
310    key_path: &str,
311    profile: &'a FileTypeProfile,
312) -> Option<&'a FieldRule> {
313    profile
314        .fields
315        .iter()
316        .find(|rule| pattern_matches(&rule.pattern, key_path))
317}
318
319// ---------------------------------------------------------------------------
320// Shared tree walker
321// ---------------------------------------------------------------------------
322
323/// Visitor interface over a structured value tree.
324///
325/// Implemented by [`serde_json::Value`], [`serde_yaml_ng::Value`], and
326/// [`toml::Value`] so that [`walk_tree`] can drive sanitization without
327/// knowing the format it is operating on.
328pub(crate) trait TreeNode {
329    /// Call `f(key, child)` for every entry in this map node.
330    /// Is a no-op (returns `Ok(())`) if this node is not a map.
331    fn for_each_map_entry<F>(&mut self, f: F) -> Result<()>
332    where
333        F: FnMut(&str, &mut Self) -> Result<()>;
334
335    /// Call `f(item)` for every item in this sequence node.
336    /// Is a no-op (returns `Ok(())`) if this node is not a sequence.
337    fn for_each_seq_item<F>(&mut self, f: F) -> Result<()>
338    where
339        F: FnMut(&mut Self) -> Result<()>;
340
341    /// Mutable access to the inner `String` if this is a string node.
342    fn as_str_mut(&mut self) -> Option<&mut String>;
343
344    /// `true` if this is a non-string primitive scalar (number, bool, datetime, …).
345    fn is_scalar(&self) -> bool;
346
347    /// String representation used as the replacement input for scalar values.
348    fn scalar_to_string(&self) -> String;
349
350    /// Replace this node's content with a string value in-place.
351    fn set_string(&mut self, s: String);
352}
353
354/// Recursively walk a structured value tree, replacing matched leaf values.
355///
356/// This is the shared implementation for the JSON, YAML, and TOML processors.
357/// Each processor implements [`TreeNode`] for its own value type and wraps
358/// this call in a thin format-named function.
359pub(crate) fn walk_tree<V: TreeNode>(
360    value: &mut V,
361    prefix: &str,
362    profile: &FileTypeProfile,
363    store: &MappingStore,
364    depth: usize,
365    format_name: &str,
366) -> Result<()> {
367    if depth > limits::DEFAULT_DEPTH {
368        return Err(SanitizeError::RecursionDepthExceeded(format!(
369            "{format_name} recursion depth exceeds limit of {}",
370            limits::DEFAULT_DEPTH
371        )));
372    }
373    value.for_each_map_entry(|key, v| {
374        let path = build_path(prefix, key);
375        if let Some(s) = v.as_str_mut() {
376            if let Some(rule) = find_matching_rule(&path, profile) {
377                *s = replace_value(s, rule, store)?;
378            } else if let Some(sig) = find_field_signal(key, &profile.field_name_signals) {
379                if let Some(replaced) = replace_by_signal(s, sig, store)? {
380                    *s = replaced;
381                }
382            }
383        } else if v.is_scalar() {
384            if let Some(rule) = find_matching_rule(&path, profile) {
385                let repr = v.scalar_to_string();
386                let replaced = replace_value(&repr, rule, store)?;
387                v.set_string(replaced);
388            } else if let Some(sig) = find_field_signal(key, &profile.field_name_signals) {
389                let repr = v.scalar_to_string();
390                if let Some(replaced) = replace_by_signal(&repr, sig, store)? {
391                    v.set_string(replaced);
392                }
393            }
394        } else {
395            walk_tree(v, &path, profile, store, depth + 1, format_name)?;
396        }
397        Ok(())
398    })?;
399    value.for_each_seq_item(|item| walk_tree(item, prefix, profile, store, depth + 1, format_name))
400}
401
402// ---------------------------------------------------------------------------
403// Unit tests
404// ---------------------------------------------------------------------------
405
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;

    /// Build an unlabelled auth-token signal with the threshold used
    /// throughout these tests.
    fn auth_signal(pattern: &str) -> FieldNameSignal {
        FieldNameSignal::new(pattern, Category::AuthToken, None, 3.5)
            .expect("test pattern must be a valid regex")
    }

    // ── shannon_entropy ──────────────────────────────────────────────────────

    #[test]
    #[allow(clippy::float_cmp)] // exact zero is the documented result
    fn entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(b""), 0.0);
    }

    #[test]
    #[allow(clippy::float_cmp)] // a single symbol yields exactly zero
    fn entropy_single_byte_is_zero() {
        // All characters the same → zero entropy.
        assert_eq!(shannon_entropy(b"aaaa"), 0.0);
    }

    #[test]
    fn entropy_two_equal_symbols_is_one_bit() {
        // "ab" repeated — 2 equally likely symbols → exactly 1.0 bit.
        assert!((shannon_entropy(b"abababab") - 1.0).abs() < 1e-10);
    }

    #[test]
    fn entropy_high_for_random_hex() {
        // 32-char hex string should be well above 3.5 bits/char.
        let h = shannon_entropy(b"a3f8c2d1e9b7f4a2c8d3e1b9f7a4c2d1");
        assert!(h > 3.5, "expected entropy > 3.5, got {h}");
    }

    #[test]
    fn entropy_low_for_word() {
        // "Bearer" uses only 4 distinct bytes (`B`, `e`, `a`, `r`), so its
        // entropy must stay below 3.0.
        let h = shannon_entropy(b"Bearer");
        assert!(h < 3.0, "expected entropy < 3.0, got {h}");
    }

    // ── FieldNameSignal::matches_key ─────────────────────────────────────────

    #[test]
    fn signal_matches_exact_key() {
        let sig = auth_signal("^password$");
        assert!(sig.matches_key("password"));
        assert!(!sig.matches_key("db_password"));
        assert!(!sig.matches_key("PASSWORD_HASH"));
    }

    #[test]
    fn signal_match_is_case_insensitive() {
        let sig = auth_signal("^password$");
        assert!(sig.matches_key("PASSWORD"));
        assert!(sig.matches_key("Password"));
    }

    #[test]
    fn signal_alternation_pattern() {
        let sig = auth_signal(r"^(password|secret|token)$");
        assert!(sig.matches_key("password"));
        assert!(sig.matches_key("secret"));
        assert!(sig.matches_key("token"));
        assert!(!sig.matches_key("token_type"));
    }

    #[test]
    fn signal_invalid_regex_returns_error() {
        let result = FieldNameSignal::new("[invalid(", Category::AuthToken, None, 3.5);
        assert!(result.is_err());
    }

    #[test]
    fn signal_default_label_derived_from_pattern() {
        let sig = auth_signal("^secret$");
        assert_eq!(sig.label, "field-signal:^secret$");
    }

    #[test]
    fn signal_custom_label_preserved() {
        let sig = FieldNameSignal::new(
            "^secret$",
            Category::AuthToken,
            Some("my-label".into()),
            3.5,
        )
        .unwrap();
        assert_eq!(sig.label, "my-label");
    }

    // ── find_field_signal ────────────────────────────────────────────────────

    #[test]
    fn find_returns_none_for_empty_signals() {
        assert!(find_field_signal("password", &[]).is_none());
    }

    #[test]
    fn find_returns_first_matching_signal() {
        let s1 = FieldNameSignal::new("^password$", Category::AuthToken, Some("s1".into()), 3.0)
            .unwrap();
        let s2 =
            FieldNameSignal::new("^token$", Category::AuthToken, Some("s2".into()), 3.5).unwrap();
        // Same pattern as `s2` but listed later: it also matches "token", so
        // this test fails if anything other than the first match is returned.
        let s3 =
            FieldNameSignal::new("^token$", Category::AuthToken, Some("s3".into()), 3.5).unwrap();
        let signals = vec![s1, s2, s3];

        let found = find_field_signal("token", &signals).unwrap();
        assert_eq!(found.label, "s2");
    }

    #[test]
    fn find_returns_none_when_no_match() {
        let sig = auth_signal("^password$");
        assert!(find_field_signal("hostname", &[sig]).is_none());
    }
}