Skip to main content

sanitize_engine/processor/
mod.rs

//! Structured processors for format-aware sanitization.
//!
//! # Architecture
//!
//! ```text
//! ┌──────────────────┐     ┌───────────────────┐     ┌──────────────────┐
//! │  Input bytes     │ ──▶ │ ProcessorRegistry  │ ──▶ │  Output bytes    │
//! │  (file content)  │     │ (profile matching) │     │  (sanitized)     │
//! └──────────────────┘     └────────┬───────────┘     └──────────────────┘
//!                                   │
//!                          ┌────────▼────────┐
//!                          │ dyn Processor    │
//!                          │                  │
//!                          │  KeyValue        │ ← gitlab.rb-style
//!                          │  JsonProcessor   │ ← JSON files
//!                          │  YamlProcessor   │ ← YAML files
//!                          │  XmlProcessor    │ ← XML files
//!                          │  CsvProcessor    │ ← CSV/TSV files
//!                          └────────┬────────┘
//!                                   │
//!                          ┌────────▼────────┐
//!                          │  MappingStore    │
//!                          │  (one-way dedup) │
//!                          └─────────────────┘
//! ```
//!
//! # File-Type Profiles
//!
//! A [`FileTypeProfile`] specifies which processor to use and what
//! fields/keys to sanitize. Users provide profiles to control which
//! parts of a structured file are replaced. If no profile matches,
//! the caller falls back to the streaming scanner.
//!
//! # Extensibility
//!
//! Implement the [`Processor`] trait and register it with the
//! [`ProcessorRegistry`]. The registry matches profiles to processors
//! by name and dispatches processing.

40pub mod archive;
41pub mod csv_proc;
42pub mod env_proc;
43pub mod ini_proc;
44pub mod json_proc;
45pub mod key_value;
46pub mod log_line;
47pub mod profile;
48pub mod registry;
49pub mod toml_proc;
50pub mod xml_proc;
51pub mod yaml_proc;
52
53// Re-export core types.
54pub use profile::{FieldRule, FileTypeProfile};
55pub use registry::ProcessorRegistry;
56
57use crate::category::Category;
58use crate::error::Result;
59use crate::store::MappingStore;
60
61// ---------------------------------------------------------------------------
62// Processor trait
63// ---------------------------------------------------------------------------
64
65/// A structured processor that can sanitize a specific file format while
66/// preserving its structure and formatting as much as possible.
67///
68/// Processors are **stateless** — all mutable state lives in the
69/// [`MappingStore`] they receive. This makes processors `Send + Sync`
70/// and reusable across files.
71///
72/// # Contract
73///
74/// - `name()` must return a unique, lowercase identifier (e.g. `"json"`).
75/// - `can_handle()` is a fast heuristic check; it may inspect a few
76///   bytes or the file extension but should not fully parse.
77/// - `process()` performs the full structured sanitization. It should
78///   preserve formatting/whitespace where possible and only replace
79///   values in fields matched by the profile's [`FieldRule`]s.
80/// - Replacements are **one-way** via the `MappingStore` — no reverse
81///   mapping is produced.
82pub trait Processor: Send + Sync {
83    /// Unique name for this processor (e.g. `"json"`, `"yaml"`, `"key_value"`).
84    fn name(&self) -> &'static str;
85
86    /// Quick heuristic: can this processor handle the given content?
87    ///
88    /// Implementations may check magic bytes, file extension hints in
89    /// the profile, or the first few bytes of content. This is called
90    /// before `process()` and should be fast.
91    fn can_handle(&self, content: &[u8], profile: &FileTypeProfile) -> bool;
92
93    /// Process the content, replacing matched field values one-way.
94    ///
95    /// # Arguments
96    ///
97    /// - `content` — raw file bytes.
98    /// - `profile` — the user-supplied profile with field rules.
99    /// - `store` — the mapping store for dedup-consistent one-way replacements.
100    ///
101    /// # Returns
102    ///
103    /// The sanitized content as bytes, preserving structure/formatting
104    /// where possible.
105    ///
106    /// # Errors
107    ///
108    /// Returns [`SanitizeError`](crate::error::SanitizeError) if parsing or replacement generation fails.
109    fn process(
110        &self,
111        content: &[u8],
112        profile: &FileTypeProfile,
113        store: &MappingStore,
114    ) -> Result<Vec<u8>>;
115}
116
117// ---------------------------------------------------------------------------
118// Helpers shared across processors
119// ---------------------------------------------------------------------------
120
121/// Replace a value through the mapping store using a field rule's category.
122pub(crate) fn replace_value(value: &str, rule: &FieldRule, store: &MappingStore) -> Result<String> {
123    let category = rule
124        .category
125        .clone()
126        .unwrap_or(Category::Custom("field".into()));
127    let sanitized = store.get_or_insert(&category, value)?;
128    Ok(sanitized.to_string())
129}
130
/// Build a dot-separated key path by appending `key` to `prefix`.
///
/// An empty `prefix` means `key` is a top-level key, so it is returned
/// unchanged (no leading dot). Otherwise the result is `"{prefix}.{key}"`.
#[must_use]
pub(crate) fn build_path(prefix: &str, key: &str) -> String {
    if prefix.is_empty() {
        return key.to_owned();
    }
    format!("{prefix}.{key}")
}

/// Check whether a single glob `pattern` matches `key_path`.
///
/// Supported patterns:
/// - `"*"` — matches anything, including the empty path.
/// - `"password"` — exact match.
/// - `"*.password"` — the bare key `password`, or any key path ending in
///   `.password` (`db.mypassword` does **not** match).
/// - `"db.*"` — any key path starting with `db.`; the bare key `db` does
///   **not** match.
///
/// Any other use of `*` is not interpreted as a wildcard; such a pattern
/// only matches a key path that is literally equal to it.
#[must_use]
pub(crate) fn pattern_matches(pattern: &str, key_path: &str) -> bool {
    // Universal wildcard and literal equality need no further inspection.
    if pattern == "*" || pattern == key_path {
        return true;
    }

    // `*.suffix`: the key is the suffix itself, or whatever precedes the
    // suffix ends on a path separator.
    let suffix_glob_hit = pattern.strip_prefix("*.").is_some_and(|suffix| {
        key_path == suffix
            || key_path
                .strip_suffix(suffix)
                .is_some_and(|head| head.ends_with('.'))
    });
    if suffix_glob_hit {
        return true;
    }

    // `prefix.*`: whatever follows the prefix must begin with a separator,
    // so `dbx.host` is not captured by `db.*`.
    pattern.strip_suffix(".*").is_some_and(|prefix| {
        key_path
            .strip_prefix(prefix)
            .is_some_and(|tail| tail.starts_with('.'))
    })
}

180/// Check whether a dotted key path matches any of the rules in a profile.
181///
182/// Supports exact matches and simple glob patterns:
183/// - `"password"` matches `"password"` exactly.
184/// - `"*.password"` matches any key ending in `.password`.
185/// - `"db.*"` matches any key starting with `db.`.
186#[must_use]
187pub(crate) fn find_matching_rule<'a>(
188    key_path: &str,
189    profile: &'a FileTypeProfile,
190) -> Option<&'a FieldRule> {
191    profile
192        .fields
193        .iter()
194        .find(|rule| pattern_matches(&rule.pattern, key_path))
195}