sanitize_engine/processor/mod.rs
1//! Structured processors for format-aware sanitization.
2//!
3//! # Architecture
4//!
5//! ```text
6//! ┌──────────────────┐ ┌───────────────────┐ ┌──────────────────┐
7//! │ Input bytes │ ──▶ │ ProcessorRegistry │ ──▶ │ Output bytes │
8//! │ (file content) │ │ (profile matching) │ │ (sanitized) │
9//! └──────────────────┘ └────────┬───────────┘ └──────────────────┘
10//! │
11//! ┌────────▼────────┐
12//! │ dyn Processor │
13//! │ │
14//! │ KeyValue │ ← gitlab.rb-style
15//! │ JsonProcessor │ ← JSON files
16//! │ YamlProcessor │ ← YAML files
17//! │ XmlProcessor │ ← XML files
18//! │ CsvProcessor │ ← CSV/TSV files
19//! └────────┬────────┘
20//! │
21//! ┌────────▼────────┐
22//! │ MappingStore │
23//! │ (one-way dedup) │
24//! └─────────────────┘
25//! ```
26//!
27//! # File-Type Profiles
28//!
29//! A [`FileTypeProfile`] specifies which processor to use and what
30//! fields/keys to sanitize. Users provide profiles to control which
31//! parts of a structured file are replaced. If no profile matches,
32//! the caller falls back to the streaming scanner.
33//!
34//! # Extensibility
35//!
36//! Implement the [`Processor`] trait and register it with the
37//! [`ProcessorRegistry`]. The registry matches profiles to processors
38//! by name and dispatches processing.
39
40pub mod archive;
41pub mod csv_proc;
42pub mod env_proc;
43pub mod ini_proc;
44pub mod json_proc;
45pub mod key_value;
46pub mod log_line;
47pub mod profile;
48pub mod registry;
49pub mod toml_proc;
50pub mod xml_proc;
51pub mod yaml_proc;
52
53// Re-export core types.
54pub use profile::{FieldRule, FileTypeProfile};
55pub use registry::ProcessorRegistry;
56
57use crate::category::Category;
58use crate::error::Result;
59use crate::store::MappingStore;
60
61// ---------------------------------------------------------------------------
62// Processor trait
63// ---------------------------------------------------------------------------
64
/// A structured processor that sanitizes one specific file format while
/// preserving its structure and formatting as much as possible.
///
/// Processors are expected to be **stateless**: all mutable state lives in
/// the [`MappingStore`] passed to [`Processor::process`]. The `Send + Sync`
/// supertrait bound means every implementation must be shareable across
/// threads, so a single instance can be reused for many files.
///
/// # Contract
///
/// - `name()` must return a unique, lowercase identifier (e.g. `"json"`).
/// - `can_handle()` is a fast heuristic check; it may inspect a few
///   bytes or the file extension but should not fully parse.
/// - `process()` performs the full structured sanitization. It should
///   preserve formatting/whitespace where possible and only replace
///   values in fields matched by the profile's [`FieldRule`]s.
/// - Replacements are **one-way** via the `MappingStore` — no reverse
///   mapping is produced.
pub trait Processor: Send + Sync {
    /// Unique name for this processor (e.g. `"json"`, `"yaml"`, `"key_value"`).
    ///
    /// The returned string is `'static`, so it can be stored or compared
    /// without tying it to the processor's lifetime.
    fn name(&self) -> &'static str;

    /// Quick heuristic: can this processor handle the given content?
    ///
    /// Implementations may check magic bytes, file extension hints in
    /// the profile, or the first few bytes of content. This is called
    /// before `process()` and should be fast.
    ///
    /// # Arguments
    ///
    /// - `content` — raw file bytes to inspect.
    /// - `profile` — the user-supplied profile describing the file type.
    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool;

    /// Process the content, replacing matched field values one-way.
    ///
    /// # Arguments
    ///
    /// - `content` — raw file bytes.
    /// - `profile` — the user-supplied profile with field rules.
    /// - `store` — the mapping store for dedup-consistent one-way replacements.
    ///
    /// # Returns
    ///
    /// The sanitized content as bytes, preserving structure/formatting
    /// where possible.
    ///
    /// # Errors
    ///
    /// Returns [`SanitizeError`](crate::error::SanitizeError) if parsing or replacement generation fails.
    fn process(
        &self,
        content: &[u8],
        profile: &FileTypeProfile,
        store: &MappingStore,
    ) -> Result<Vec<u8>>;
}
116
117// ---------------------------------------------------------------------------
118// Helpers shared across processors
119// ---------------------------------------------------------------------------
120
121/// Replace a value through the mapping store using a field rule's category.
122pub(crate) fn replace_value(value: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
123 let category = rule
124 .category
125 .clone()
126 .unwrap_or(Category::Custom("field".into()));
127 let sanitized = store.get_or_insert(&category, value)?;
128 Ok(sanitized.to_string())
129}
130
/// Join `prefix` and `key` into a dot-separated key path.
///
/// An empty `prefix` denotes the root, in which case the result is just
/// `key`; otherwise the result is `prefix`, a `.`, then `key`.
#[must_use]
pub(crate) fn build_path(prefix: &str, key: &str) -> String {
    match prefix {
        // At the root there is nothing to prepend.
        "" => key.to_owned(),
        parent => [parent, key].join("."),
    }
}
142
/// Test a single glob `pattern` against a dotted `key_path`.
///
/// Supported pattern forms:
/// - `"*"` — matches every key path.
/// - `"password"` — matches only the identical key path.
/// - `"*.password"` — matches `password` itself and any key path whose
///   final segment(s) equal `password` (i.e. it ends in `.password`).
/// - `"db.*"` — matches any key path that starts with `db.`.
///
/// Any other pattern is compared literally.
#[must_use]
pub(crate) fn pattern_matches(pattern: &str, key_path: &str) -> bool {
    // Catch-all wildcard and literal equality need no glob handling.
    if pattern == "*" || pattern == key_path {
        return true;
    }

    // `*.suffix`: the path is the suffix itself, or ends with `.suffix`.
    // Checking that the remainder ends in '.' keeps the match on a segment
    // boundary (so `*.password` does not match `dbpassword`).
    let suffix_glob_hit = pattern.strip_prefix("*.").is_some_and(|tail| {
        key_path == tail
            || key_path
                .strip_suffix(tail)
                .is_some_and(|head| head.ends_with('.'))
    });

    // `prefix.*`: the path starts with `prefix` followed by a '.' separator.
    suffix_glob_hit
        || pattern.strip_suffix(".*").is_some_and(|head| {
            key_path
                .strip_prefix(head)
                .is_some_and(|tail| tail.starts_with('.'))
        })
}
179
180/// Check whether a dotted key path matches any of the rules in a profile.
181///
182/// Supports exact matches and simple glob patterns:
183/// - `"password"` matches `"password"` exactly.
184/// - `"*.password"` matches any key ending in `.password`.
185/// - `"db.*"` matches any key starting with `db.`.
186#[must_use]
187pub(crate) fn find_matching_rule<'a>(
188 key_path: &str,
189 profile: &'a FileTypeProfile,
190) -> Option<&'a FieldRule> {
191 profile
192 .fields
193 .iter()
194 .find(|rule| pattern_matches(&rule.pattern, key_path))
195}