sanitize_engine/processor/mod.rs
1//! Structured processors for format-aware sanitization.
2//!
3//! # Architecture
4//!
5//! ```text
6//! ┌──────────────────┐ ┌───────────────────┐ ┌──────────────────┐
7//! │ Input bytes │ ──▶ │ ProcessorRegistry │ ──▶ │ Output bytes │
8//! │ (file content) │ │ (profile matching) │ │ (sanitized) │
9//! └──────────────────┘ └────────┬───────────┘ └──────────────────┘
10//! │
11//! ┌────────▼────────┐
12//! │ dyn Processor │
13//! │ │
14//! │ KeyValue │ ← gitlab.rb-style
15//! │ JsonProcessor │ ← JSON files
16//! │ YamlProcessor │ ← YAML files
17//! │ XmlProcessor │ ← XML files
18//! │ CsvProcessor │ ← CSV/TSV files
19//! └────────┬────────┘
20//! │
21//! ┌────────▼────────┐
22//! │ MappingStore │
23//! │ (one-way dedup) │
24//! └─────────────────┘
25//! ```
26//!
27//! # File-Type Profiles
28//!
29//! A [`FileTypeProfile`] specifies which processor to use and what
30//! fields/keys to sanitize. Users provide profiles to control which
31//! parts of a structured file are replaced. If no profile matches,
32//! the caller falls back to the streaming scanner.
33//!
34//! # Extensibility
35//!
36//! Implement the [`Processor`] trait and register it with the
37//! [`ProcessorRegistry`]. The registry matches profiles to processors
38//! by name and dispatches processing.
39
40pub mod archive;
41pub mod csv_proc;
42pub mod json_proc;
43pub mod key_value;
44pub mod profile;
45pub mod registry;
46pub mod xml_proc;
47pub mod yaml_proc;
48
49// Re-export core types.
50pub use profile::{FieldRule, FileTypeProfile};
51pub use registry::ProcessorRegistry;
52
53use crate::category::Category;
54use crate::error::Result;
55use crate::store::MappingStore;
56
57// ---------------------------------------------------------------------------
58// Processor trait
59// ---------------------------------------------------------------------------
60
61/// A structured processor that can sanitize a specific file format while
62/// preserving its structure and formatting as much as possible.
63///
64/// Processors are **stateless** — all mutable state lives in the
65/// [`MappingStore`] they receive. This makes processors `Send + Sync`
66/// and reusable across files.
67///
68/// # Contract
69///
70/// - `name()` must return a unique, lowercase identifier (e.g. `"json"`).
71/// - `can_handle()` is a fast heuristic check; it may inspect a few
72/// bytes or the file extension but should not fully parse.
73/// - `process()` performs the full structured sanitization. It should
74/// preserve formatting/whitespace where possible and only replace
75/// values in fields matched by the profile's [`FieldRule`]s.
76/// - Replacements are **one-way** via the `MappingStore` — no reverse
77/// mapping is produced.
78pub trait Processor: Send + Sync {
79 /// Unique name for this processor (e.g. `"json"`, `"yaml"`, `"key_value"`).
80 fn name(&self) -> &'static str;
81
82 /// Quick heuristic: can this processor handle the given content?
83 ///
84 /// Implementations may check magic bytes, file extension hints in
85 /// the profile, or the first few bytes of content. This is called
86 /// before `process()` and should be fast.
87 fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool;
88
89 /// Process the content, replacing matched field values one-way.
90 ///
91 /// # Arguments
92 ///
93 /// - `content` — raw file bytes.
94 /// - `profile` — the user-supplied profile with field rules.
95 /// - `store` — the mapping store for dedup-consistent one-way replacements.
96 ///
97 /// # Returns
98 ///
99 /// The sanitized content as bytes, preserving structure/formatting
100 /// where possible.
101 ///
102 /// # Errors
103 ///
104 /// Returns [`SanitizeError`](crate::error::SanitizeError) if parsing or replacement generation fails.
105 fn process(
106 &self,
107 content: &[u8],
108 profile: &FileTypeProfile,
109 store: &MappingStore,
110 ) -> Result<Vec<u8>>;
111}
112
113// ---------------------------------------------------------------------------
114// Helpers shared across processors
115// ---------------------------------------------------------------------------
116
117/// Replace a value through the mapping store using a field rule's category.
118pub(crate) fn replace_value(value: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
119 let category = rule
120 .category
121 .clone()
122 .unwrap_or(Category::Custom("field".into()));
123 let sanitized = store.get_or_insert(&category, value)?;
124 Ok(sanitized.to_string())
125}
126
127/// Check whether a dotted key path matches any of the rules in a profile.
128///
129/// Supports exact matches and simple glob patterns:
130/// - `"password"` matches `"password"` exactly.
131/// - `"*.password"` matches any key ending in `.password`.
132/// - `"db.*"` matches any key starting with `db.`.
133pub(crate) fn find_matching_rule<'a>(
134 key_path: &str,
135 profile: &'a FileTypeProfile,
136) -> Option<&'a FieldRule> {
137 profile.fields.iter().find(|rule| {
138 if rule.pattern == "*" {
139 return true;
140 }
141 if rule.pattern == key_path {
142 return true;
143 }
144 // Simple glob: *.suffix
145 if let Some(suffix) = rule.pattern.strip_prefix("*.") {
146 if key_path.ends_with(&format!(".{}", suffix)) || key_path == suffix {
147 return true;
148 }
149 }
150 // Simple glob: prefix.*
151 if let Some(prefix) = rule.pattern.strip_suffix(".*") {
152 if key_path.starts_with(&format!("{}.", prefix)) {
153 return true;
154 }
155 }
156 false
157 })
158}