sanitize_engine/processor/mod.rs
1//! Structured processors for format-aware sanitization.
2//!
3//! # Architecture
4//!
//! ```text
//! ┌──────────────────┐     ┌────────────────────┐     ┌──────────────────┐
//! │  Input bytes     │ ──▶ │ ProcessorRegistry  │ ──▶ │  Output bytes    │
//! │  (file content)  │     │ (profile matching) │     │  (sanitized)     │
//! └──────────────────┘     └────────┬───────────┘     └──────────────────┘
//!                                   │
//!                          ┌────────▼────────┐
//!                          │  dyn Processor  │
//!                          │                 │
//!                          │  KeyValue       │ ← gitlab.rb-style
//!                          │  JsonProcessor  │ ← JSON files
//!                          │  YamlProcessor  │ ← YAML files
//!                          │  XmlProcessor   │ ← XML files
//!                          │  CsvProcessor   │ ← CSV/TSV files
//!                          └────────┬────────┘
//!                                   │
//!                          ┌────────▼────────┐
//!                          │  MappingStore   │
//!                          │ (one-way dedup) │
//!                          └─────────────────┘
//! ```
26//!
27//! # File-Type Profiles
28//!
29//! A [`FileTypeProfile`] specifies which processor to use and what
30//! fields/keys to sanitize. Users provide profiles to control which
31//! parts of a structured file are replaced. If no profile matches,
32//! the caller falls back to the streaming scanner.
33//!
34//! # Extensibility
35//!
36//! Implement the [`Processor`] trait and register it with the
37//! [`ProcessorRegistry`]. The registry matches profiles to processors
38//! by name and dispatches processing.
39
40pub mod archive;
41pub mod csv_proc;
42pub mod env_proc;
43pub mod ini_proc;
44pub mod json_proc;
45pub mod jsonl_proc;
46pub mod key_value;
47pub(crate) mod limits;
48pub mod log_line;
49pub mod profile;
50pub mod registry;
51pub mod toml_proc;
52pub mod xml_proc;
53pub mod yaml_proc;
54
55// Re-export core types.
56pub use profile::{FieldNameSignal, FieldRule, FileTypeProfile, DEFAULT_FIELD_SIGNAL_THRESHOLD};
57pub use registry::ProcessorRegistry;
58
59use crate::category::Category;
60use crate::error::{Result, SanitizeError};
61use crate::store::MappingStore;
62use std::io;
63
64// ---------------------------------------------------------------------------
65// Processor trait
66// ---------------------------------------------------------------------------
67
68/// A structured processor that can sanitize a specific file format while
69/// preserving its structure and formatting as much as possible.
70///
71/// Processors are **stateless** — all mutable state lives in the
72/// [`MappingStore`] they receive. This makes processors `Send + Sync`
73/// and reusable across files.
74///
75/// # Contract
76///
77/// - `name()` must return a unique, lowercase identifier (e.g. `"json"`).
78/// - `can_handle()` is a fast heuristic check; it may inspect a few
79/// bytes or the file extension but should not fully parse.
80/// - `process()` performs the full structured sanitization. It should
81/// preserve formatting/whitespace where possible and only replace
82/// values in fields matched by the profile's [`FieldRule`]s.
83/// - Replacements are **one-way** via the `MappingStore` — no reverse
84/// mapping is produced.
85pub trait Processor: Send + Sync {
86 /// Unique name for this processor (e.g. `"json"`, `"yaml"`, `"key_value"`).
87 fn name(&self) -> &'static str;
88
89 /// Quick heuristic: can this processor handle the given content?
90 ///
91 /// Implementations may check magic bytes, file extension hints in
92 /// the profile, or the first few bytes of content. This is called
93 /// before `process()` and should be fast.
94 fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool;
95
96 /// Process the content, replacing matched field values one-way.
97 ///
98 /// # Arguments
99 ///
100 /// - `content` — raw file bytes.
101 /// - `profile` — the user-supplied profile with field rules.
102 /// - `store` — the mapping store for dedup-consistent one-way replacements.
103 ///
104 /// # Returns
105 ///
106 /// The sanitized content as bytes, preserving structure/formatting
107 /// where possible.
108 ///
109 /// # Errors
110 ///
111 /// Returns [`SanitizeError`] if parsing or replacement generation fails.
112 fn process(
113 &self,
114 content: &[u8],
115 profile: &FileTypeProfile,
116 store: &MappingStore,
117 ) -> Result<Vec<u8>>;
118
119 /// Whether this processor supports bounded-memory streaming via
120 /// [`process_stream`](Self::process_stream).
121 ///
122 /// Processors that return `true` here are eligible for the streaming
123 /// structured path in the CLI, which opens the file as a reader instead
124 /// of reading it fully into memory. The default is `false`.
125 fn supports_streaming(&self) -> bool {
126 false
127 }
128
129 /// Process content from a reader, writing sanitized output to a writer.
130 ///
131 /// The default implementation reads the entire reader into memory and
132 /// delegates to [`process`](Self::process). Processors that return
133 /// `true` from [`supports_streaming`](Self::supports_streaming) should
134 /// override this to handle data incrementally, keeping memory usage
135 /// bounded regardless of input size.
136 ///
137 /// # Errors
138 ///
139 /// Returns [`SanitizeError`] on read, parse,
140 /// or write failure.
141 fn process_stream(
142 &self,
143 reader: &mut dyn io::Read,
144 writer: &mut dyn io::Write,
145 profile: &FileTypeProfile,
146 store: &MappingStore,
147 ) -> Result<()> {
148 let mut buf = Vec::new();
149 io::Read::read_to_end(reader, &mut buf)?;
150 let out = self.process(&buf, profile, store)?;
151 io::Write::write_all(writer, &out)?;
152 Ok(())
153 }
154}
155
156// ---------------------------------------------------------------------------
157// Helpers shared across processors
158// ---------------------------------------------------------------------------
159
160/// Replace a value through the mapping store using a field rule's category.
161///
162/// Returns the original `value` unchanged when it is shorter than
163/// `rule.min_length` (if set). This prevents broad glob patterns like
164/// `*token*` from redacting obviously non-secret values such as `"false"`,
165/// `"0"`, or `"nil"`.
166pub(crate) fn replace_value(value: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
167 if let Some(min) = rule.min_length {
168 if value.len() < min {
169 return Ok(value.to_string());
170 }
171 }
172 let category = rule
173 .category
174 .clone()
175 .unwrap_or(Category::Custom("field".into()));
176 let sanitized = store.get_or_insert(&category, value)?;
177 Ok(sanitized.to_string())
178}
179
/// Extend a dot-separated key path with one more segment.
///
/// An empty `prefix` means "top level": the result is just `key`. Otherwise
/// the result is `prefix`, a `.`, and `key`, in that order.
#[must_use]
pub(crate) fn build_path(prefix: &str, key: &str) -> String {
    match prefix {
        "" => key.to_owned(),
        parent => [parent, key].join("."),
    }
}
191
192/// Check whether a single glob `pattern` matches `key_path`.
193///
194/// `*` is the only wildcard character. It matches any sequence of characters,
195/// including empty strings and path separators (`.`, `[`, `]`).
196///
197/// | Pattern | Matches |
198/// |---------|---------|
199/// | `"*"` | anything |
200/// | `"password"` | `"password"` exactly |
201/// | `"*.password"` | `"password"`, `"db.password"`, `"a.b.password"` |
202/// | `"db.*"` | `"db.host"`, `"db.port"`, `"db.nested.key"` |
203/// | `"*password*"` | any key containing `"password"` as a substring |
204/// | `"*['smtp_password']"` | `"gitlab_rails['smtp_password']"` (bracket notation) |
205#[must_use]
206pub(crate) fn pattern_matches(pattern: &str, key_path: &str) -> bool {
207 // Fast path: `*` matches everything.
208 if pattern == "*" {
209 return true;
210 }
211 // Fast path: exact match.
212 if pattern == key_path {
213 return true;
214 }
215 // Fast path: no wildcards — only the exact match above can succeed.
216 if !pattern.contains('*') {
217 return false;
218 }
219 // Dot-path glob: `*.suffix` — requires a dot boundary before the suffix
220 // so that `*.password` matches `db.password` but not `dbpassword`.
221 if let Some(suffix) = pattern.strip_prefix("*.") {
222 if !suffix.contains('*')
223 && (key_path == suffix
224 || key_path
225 .strip_suffix(suffix)
226 .is_some_and(|rest| rest.ends_with('.')))
227 {
228 return true;
229 }
230 }
231 // Dot-path glob: `prefix.*` — `db.*` matches `db.host`, `db.nested.key`.
232 if let Some(prefix) = pattern.strip_suffix(".*") {
233 if !prefix.contains('*')
234 && key_path
235 .strip_prefix(prefix)
236 .is_some_and(|rest| rest.starts_with('.'))
237 {
238 return true;
239 }
240 }
241 // General multi-wildcard glob: split on `*` and verify segments appear in
242 // order. This handles patterns like `*password*`, `*['key']`, `a*b*c`.
243 glob_matches(pattern, key_path)
244}
245
246use crate::allowlist::glob_matches;
247
/// Shannon entropy of `data`, measured in bits per byte.
///
/// Empty input yields `0.0`. Byte frequencies are tallied in a fixed
/// 256-slot table, so the work is a single pass over `data` plus a pass over
/// the table, with constant extra space.
#[inline]
#[allow(clippy::cast_precision_loss)]
pub(crate) fn shannon_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }
    let total = data.len() as f64;

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    // Sum of -p * log2(p) over the byte values that actually occur.
    histogram
        .iter()
        .filter_map(|&seen| {
            (seen > 0).then(|| {
                let p = f64::from(seen) / total;
                -p * p.log2()
            })
        })
        .sum()
}
272
273/// Return the first [`FieldNameSignal`] whose key pattern matches `key`.
274///
275/// `key` is the **bare** field name (leaf key only, not the full dot-path).
276#[must_use]
277pub(crate) fn find_field_signal<'a>(
278 key: &str,
279 signals: &'a [FieldNameSignal],
280) -> Option<&'a FieldNameSignal> {
281 signals.iter().find(|sig| sig.matches_key(key))
282}
283
284/// Replace `value` via the mapping store when its entropy meets the signal's gate.
285///
286/// Returns `Some(replacement)` when the value's Shannon entropy is at or above
287/// `sig.threshold`, or `None` when the entropy is too low to be a real secret
288/// (e.g. `"Bearer"`, `"basic"`, `"true"`).
289pub(crate) fn replace_by_signal(
290 value: &str,
291 sig: &FieldNameSignal,
292 store: &MappingStore,
293) -> Result<Option<String>> {
294 if value.is_empty() {
295 return Ok(None);
296 }
297 if shannon_entropy(value.as_bytes()) < sig.threshold {
298 return Ok(None);
299 }
300 let replaced = store.get_or_insert(&sig.category, value)?;
301 Ok(Some(replaced.to_string()))
302}
303
304/// Return the first rule in `profile` whose pattern matches `key_path`.
305///
306/// Supports exact matches and glob patterns — see [`pattern_matches`] for the
307/// full pattern syntax including dot-path globs and bracket notation.
308#[must_use]
309pub(crate) fn find_matching_rule<'a>(
310 key_path: &str,
311 profile: &'a FileTypeProfile,
312) -> Option<&'a FieldRule> {
313 profile
314 .fields
315 .iter()
316 .find(|rule| pattern_matches(&rule.pattern, key_path))
317}
318
319// ---------------------------------------------------------------------------
320// Shared tree walker
321// ---------------------------------------------------------------------------
322
323/// Visitor interface over a structured value tree.
324///
325/// Implemented by [`serde_json::Value`], [`serde_yaml_ng::Value`], and
326/// [`toml::Value`] so that [`walk_tree`] can drive sanitization without
327/// knowing the format it is operating on.
328pub(crate) trait TreeNode {
329 /// Call `f(key, child)` for every entry in this map node.
330 /// Is a no-op (returns `Ok(())`) if this node is not a map.
331 fn for_each_map_entry<F>(&mut self, f: F) -> Result<()>
332 where
333 F: FnMut(&str, &mut Self) -> Result<()>;
334
335 /// Call `f(item)` for every item in this sequence node.
336 /// Is a no-op (returns `Ok(())`) if this node is not a sequence.
337 fn for_each_seq_item<F>(&mut self, f: F) -> Result<()>
338 where
339 F: FnMut(&mut Self) -> Result<()>;
340
341 /// Mutable access to the inner `String` if this is a string node.
342 fn as_str_mut(&mut self) -> Option<&mut String>;
343
344 /// `true` if this is a non-string primitive scalar (number, bool, datetime, …).
345 fn is_scalar(&self) -> bool;
346
347 /// String representation used as the replacement input for scalar values.
348 fn scalar_to_string(&self) -> String;
349
350 /// Replace this node's content with a string value in-place.
351 fn set_string(&mut self, s: String);
352}
353
354/// Recursively walk a structured value tree, replacing matched leaf values.
355///
356/// This is the shared implementation for the JSON, YAML, and TOML processors.
357/// Each processor implements [`TreeNode`] for its own value type and wraps
358/// this call in a thin format-named function.
359pub(crate) fn walk_tree<V: TreeNode>(
360 value: &mut V,
361 prefix: &str,
362 profile: &FileTypeProfile,
363 store: &MappingStore,
364 depth: usize,
365 format_name: &str,
366) -> Result<()> {
367 if depth > limits::DEFAULT_DEPTH {
368 return Err(SanitizeError::RecursionDepthExceeded(format!(
369 "{format_name} recursion depth exceeds limit of {}",
370 limits::DEFAULT_DEPTH
371 )));
372 }
373 value.for_each_map_entry(|key, v| {
374 let path = build_path(prefix, key);
375 if let Some(s) = v.as_str_mut() {
376 if let Some(rule) = find_matching_rule(&path, profile) {
377 *s = replace_value(s, rule, store)?;
378 } else if let Some(sig) = find_field_signal(key, &profile.field_name_signals) {
379 if let Some(replaced) = replace_by_signal(s, sig, store)? {
380 *s = replaced;
381 }
382 }
383 } else if v.is_scalar() {
384 if let Some(rule) = find_matching_rule(&path, profile) {
385 let repr = v.scalar_to_string();
386 let replaced = replace_value(&repr, rule, store)?;
387 v.set_string(replaced);
388 } else if let Some(sig) = find_field_signal(key, &profile.field_name_signals) {
389 let repr = v.scalar_to_string();
390 if let Some(replaced) = replace_by_signal(&repr, sig, store)? {
391 v.set_string(replaced);
392 }
393 }
394 } else {
395 walk_tree(v, &path, profile, store, depth + 1, format_name)?;
396 }
397 Ok(())
398 })?;
399 value.for_each_seq_item(|item| walk_tree(item, prefix, profile, store, depth + 1, format_name))
400}
401
402// ---------------------------------------------------------------------------
403// Unit tests
404// ---------------------------------------------------------------------------
405
#[cfg(test)]
mod tests {
    use super::*;
    use crate::category::Category;

    // ── build_path ───────────────────────────────────────────────────────────

    #[test]
    fn build_path_returns_key_for_empty_prefix() {
        assert_eq!(build_path("", "password"), "password");
    }

    #[test]
    fn build_path_joins_prefix_and_key_with_dot() {
        assert_eq!(build_path("db", "password"), "db.password");
        assert_eq!(build_path("a.b", "c"), "a.b.c");
    }

    // ── pattern_matches (cases decided before the general glob matcher) ─────

    #[test]
    fn pattern_star_matches_anything() {
        assert!(pattern_matches("*", ""));
        assert!(pattern_matches("*", "db.password"));
    }

    #[test]
    fn pattern_without_wildcard_requires_exact_match() {
        assert!(pattern_matches("password", "password"));
        assert!(!pattern_matches("password", "db.password"));
    }

    #[test]
    fn pattern_dot_suffix_glob() {
        assert!(pattern_matches("*.password", "password"));
        assert!(pattern_matches("*.password", "db.password"));
        assert!(pattern_matches("*.password", "a.b.password"));
    }

    #[test]
    fn pattern_dot_prefix_glob() {
        assert!(pattern_matches("db.*", "db.host"));
        assert!(pattern_matches("db.*", "db.nested.key"));
    }

    // ── shannon_entropy ──────────────────────────────────────────────────────

    #[test]
    // Exact comparison is intended: the empty-input path returns the literal 0.0.
    #[allow(clippy::float_cmp)]
    fn entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(b""), 0.0);
    }

    #[test]
    // Exact comparison is intended: a single symbol has p = 1 and log2(1) = 0.
    #[allow(clippy::float_cmp)]
    fn entropy_single_byte_is_zero() {
        // All characters the same → zero entropy.
        assert_eq!(shannon_entropy(b"aaaa"), 0.0);
    }

    #[test]
    fn entropy_two_equal_symbols_is_one_bit() {
        // "ab" repeated — 2 equally likely symbols → exactly 1.0 bit.
        assert!((shannon_entropy(b"abababab") - 1.0).abs() < 1e-10);
    }

    #[test]
    fn entropy_high_for_random_hex() {
        // 32-char hex string should be well above 3.5 bits/char.
        let h = shannon_entropy(b"a3f8c2d1e9b7f4a2c8d3e1b9f7a4c2d1");
        assert!(h > 3.5, "expected entropy > 3.5, got {h}");
    }

    #[test]
    fn entropy_low_for_word() {
        // "Bearer" uses only 4 distinct chars (B, e, a, r), should be below 3.0.
        let h = shannon_entropy(b"Bearer");
        assert!(h < 3.0, "expected entropy < 3.0, got {h}");
    }

    // ── FieldNameSignal::matches_key ─────────────────────────────────────────

    #[test]
    fn signal_matches_exact_key() {
        let sig = FieldNameSignal::new("^password$", Category::AuthToken, None, 3.5).unwrap();
        assert!(sig.matches_key("password"));
        assert!(!sig.matches_key("db_password"));
        assert!(!sig.matches_key("PASSWORD_HASH"));
    }

    #[test]
    fn signal_match_is_case_insensitive() {
        let sig = FieldNameSignal::new("^password$", Category::AuthToken, None, 3.5).unwrap();
        assert!(sig.matches_key("PASSWORD"));
        assert!(sig.matches_key("Password"));
    }

    #[test]
    fn signal_alternation_pattern() {
        let sig =
            FieldNameSignal::new(r"^(password|secret|token)$", Category::AuthToken, None, 3.5)
                .unwrap();
        assert!(sig.matches_key("password"));
        assert!(sig.matches_key("secret"));
        assert!(sig.matches_key("token"));
        assert!(!sig.matches_key("token_type"));
    }

    #[test]
    fn signal_invalid_regex_returns_error() {
        let result = FieldNameSignal::new("[invalid(", Category::AuthToken, None, 3.5);
        assert!(result.is_err());
    }

    #[test]
    fn signal_default_label_derived_from_pattern() {
        let sig = FieldNameSignal::new("^secret$", Category::AuthToken, None, 3.5).unwrap();
        assert_eq!(sig.label, "field-signal:^secret$");
    }

    #[test]
    fn signal_custom_label_preserved() {
        let sig = FieldNameSignal::new(
            "^secret$",
            Category::AuthToken,
            Some("my-label".into()),
            3.5,
        )
        .unwrap();
        assert_eq!(sig.label, "my-label");
    }

    // ── find_field_signal ────────────────────────────────────────────────────

    #[test]
    fn find_returns_none_for_empty_signals() {
        assert!(find_field_signal("password", &[]).is_none());
    }

    #[test]
    fn find_returns_first_matching_signal() {
        let s1 = FieldNameSignal::new("^password$", Category::AuthToken, Some("s1".into()), 3.0)
            .unwrap();
        let s2 =
            FieldNameSignal::new("^token$", Category::AuthToken, Some("s2".into()), 3.5).unwrap();
        // A later signal that also accepts "token": the earlier one must still win.
        let s3 =
            FieldNameSignal::new("token", Category::AuthToken, Some("s3".into()), 3.5).unwrap();
        let signals = vec![s1, s2, s3];

        let found = find_field_signal("token", &signals).unwrap();
        assert_eq!(found.label, "s2");
    }

    #[test]
    fn find_returns_none_when_no_match() {
        let sig = FieldNameSignal::new("^password$", Category::AuthToken, None, 3.5).unwrap();
        assert!(find_field_signal("hostname", &[sig]).is_none());
    }
}