Skip to main content

sanitize_engine/processor/
mod.rs

1//! Structured processors for format-aware sanitization.
2//!
3//! # Architecture
4//!
5//! ```text
6//! ┌──────────────────┐     ┌───────────────────┐     ┌──────────────────┐
7//! │  Input bytes     │ ──▶ │ ProcessorRegistry  │ ──▶ │  Output bytes    │
8//! │  (file content)  │     │ (profile matching) │     │  (sanitized)     │
9//! └──────────────────┘     └────────┬───────────┘     └──────────────────┘
10//!                                   │
11//!                          ┌────────▼────────┐
12//!                          │ dyn Processor    │
13//!                          │                  │
14//!                          │  KeyValue        │ ← gitlab.rb-style
15//!                          │  JsonProcessor   │ ← JSON files
16//!                          │  YamlProcessor   │ ← YAML files
17//!                          │  XmlProcessor    │ ← XML files
18//!                          │  CsvProcessor    │ ← CSV/TSV files
19//!                          └────────┬────────┘
20//!                                   │
21//!                          ┌────────▼────────┐
22//!                          │  MappingStore    │
23//!                          │  (one-way dedup) │
24//!                          └─────────────────┘
25//! ```
26//!
27//! # File-Type Profiles
28//!
29//! A [`FileTypeProfile`] specifies which processor to use and what
30//! fields/keys to sanitize. Users provide profiles to control which
31//! parts of a structured file are replaced. If no profile matches,
32//! the caller falls back to the streaming scanner.
33//!
34//! # Extensibility
35//!
36//! Implement the [`Processor`] trait and register it with the
37//! [`ProcessorRegistry`]. The registry matches profiles to processors
38//! by name and dispatches processing.
39
40pub mod archive;
41pub mod csv_proc;
42pub mod json_proc;
43pub mod key_value;
44pub mod profile;
45pub mod registry;
46pub mod xml_proc;
47pub mod yaml_proc;
48
49// Re-export core types.
50pub use profile::{FieldRule, FileTypeProfile};
51pub use registry::ProcessorRegistry;
52
53use crate::category::Category;
54use crate::error::Result;
55use crate::store::MappingStore;
56
57// ---------------------------------------------------------------------------
58// Processor trait
59// ---------------------------------------------------------------------------
60
61/// A structured processor that can sanitize a specific file format while
62/// preserving its structure and formatting as much as possible.
63///
64/// Processors are **stateless** — all mutable state lives in the
65/// [`MappingStore`] they receive. This makes processors `Send + Sync`
66/// and reusable across files.
67///
68/// # Contract
69///
70/// - `name()` must return a unique, lowercase identifier (e.g. `"json"`).
71/// - `can_handle()` is a fast heuristic check; it may inspect a few
72///   bytes or the file extension but should not fully parse.
73/// - `process()` performs the full structured sanitization. It should
74///   preserve formatting/whitespace where possible and only replace
75///   values in fields matched by the profile's [`FieldRule`]s.
76/// - Replacements are **one-way** via the `MappingStore` — no reverse
77///   mapping is produced.
78pub trait Processor: Send + Sync {
79    /// Unique name for this processor (e.g. `"json"`, `"yaml"`, `"key_value"`).
80    fn name(&self) -> &'static str;
81
82    /// Quick heuristic: can this processor handle the given content?
83    ///
84    /// Implementations may check magic bytes, file extension hints in
85    /// the profile, or the first few bytes of content. This is called
86    /// before `process()` and should be fast.
87    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool;
88
89    /// Process the content, replacing matched field values one-way.
90    ///
91    /// # Arguments
92    ///
93    /// - `content` — raw file bytes.
94    /// - `profile` — the user-supplied profile with field rules.
95    /// - `store` — the mapping store for dedup-consistent one-way replacements.
96    ///
97    /// # Returns
98    ///
99    /// The sanitized content as bytes, preserving structure/formatting
100    /// where possible.
101    ///
102    /// # Errors
103    ///
104    /// Returns [`SanitizeError`](crate::error::SanitizeError) if parsing or replacement generation fails.
105    fn process(
106        &self,
107        content: &[u8],
108        profile: &FileTypeProfile,
109        store: &MappingStore,
110    ) -> Result<Vec<u8>>;
111}
112
113// ---------------------------------------------------------------------------
114// Helpers shared across processors
115// ---------------------------------------------------------------------------
116
117/// Replace a value through the mapping store using a field rule's category.
118pub(crate) fn replace_value(value: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
119    let category = rule
120        .category
121        .clone()
122        .unwrap_or(Category::Custom("field".into()));
123    let sanitized = store.get_or_insert(&category, value)?;
124    Ok(sanitized.to_string())
125}
126
127/// Check whether a dotted key path matches any of the rules in a profile.
128///
129/// Supports exact matches and simple glob patterns:
130/// - `"password"` matches `"password"` exactly.
131/// - `"*.password"` matches any key ending in `.password`.
132/// - `"db.*"` matches any key starting with `db.`.
133pub(crate) fn find_matching_rule<'a>(
134    key_path: &str,
135    profile: &'a FileTypeProfile,
136) -> Option<&'a FieldRule> {
137    profile.fields.iter().find(|rule| {
138        if rule.pattern == "*" {
139            return true;
140        }
141        if rule.pattern == key_path {
142            return true;
143        }
144        // Simple glob: *.suffix
145        if let Some(suffix) = rule.pattern.strip_prefix("*.") {
146            if key_path.ends_with(&format!(".{}", suffix)) || key_path == suffix {
147                return true;
148            }
149        }
150        // Simple glob: prefix.*
151        if let Some(prefix) = rule.pattern.strip_suffix(".*") {
152            if key_path.starts_with(&format!("{}.", prefix)) {
153                return true;
154            }
155        }
156        false
157    })
158}