sanitize_engine/processor/profile.rs
1//! File-type profiles for structured processors.
2//!
3//! A [`FileTypeProfile`] tells the processing pipeline which processor
4//! to use and which fields/keys within the file should be sanitized.
5
6use crate::category::Category;
7use glob::Pattern;
8use serde::{Deserialize, Serialize};
9
10// ---------------------------------------------------------------------------
11// FieldRule
12// ---------------------------------------------------------------------------
13
14/// A rule describing a single field/key to sanitize.
15///
16/// # Pattern Syntax
17///
18/// - Exact key: `"password"`, `"db_host"`.
19/// - Dotted path: `"database.password"`, `"smtp.user"`.
20/// - Glob suffix: `"*.password"` — matches any key ending in `.password`.
21/// - Glob prefix: `"db.*"` — matches any key starting with `db.`.
22/// - Wildcard: `"*"` — matches every field.
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct FieldRule {
25 /// Key pattern to match (see Pattern Syntax above).
26 pub pattern: String,
27
28 /// Category for replacement generation. Defaults to `Custom("field")`
29 /// if not specified.
30 #[serde(default, skip_serializing_if = "Option::is_none")]
31 pub category: Option<Category>,
32
33 /// Optional human-readable label for reporting.
34 #[serde(default, skip_serializing_if = "Option::is_none")]
35 pub label: Option<String>,
36}
37
38impl FieldRule {
39 /// Create a new field rule with just a pattern.
40 #[must_use]
41 pub fn new(pattern: impl Into<String>) -> Self {
42 Self {
43 pattern: pattern.into(),
44 category: None,
45 label: None,
46 }
47 }
48
49 /// Set the category for this rule.
50 #[must_use]
51 pub fn with_category(mut self, category: Category) -> Self {
52 self.category = Some(category);
53 self
54 }
55
56 /// Set the label for this rule.
57 #[must_use]
58 pub fn with_label(mut self, label: impl Into<String>) -> Self {
59 self.label = Some(label.into());
60 self
61 }
62}
63
64// ---------------------------------------------------------------------------
65// FileTypeProfile
66// ---------------------------------------------------------------------------
67
68/// Specifies which processor to use and what fields to sanitize.
69///
70/// # File matching
71///
72/// A file is processed by this profile when **all** of the following hold:
73///
74/// 1. Its name ends with one of the `extensions` (required — an empty list
75/// matches nothing).
76/// 2. If `include` is non-empty, the filename matches **at least one** of
77/// those glob patterns.
78/// 3. The filename does **not** match any `exclude` glob pattern.
79///
80/// Glob patterns use `*` (any chars within a path component) and `**`
81/// (any chars including path separators).
82///
83/// # Example (YAML)
84///
85/// ```yaml
86/// - processor: json
87/// extensions: [".json"]
88/// # Only apply to files whose names start with "config"
89/// include: ["config*.json"]
90/// # Never apply to log files
91/// exclude: ["*.log.json", "logs/**"]
92/// fields:
93/// - pattern: "*.password"
94/// category: "custom:password"
95/// ```
96#[derive(Debug, Clone, Serialize, Deserialize)]
97pub struct FileTypeProfile {
98 /// Name of the processor to use (e.g. `"key_value"`, `"json"`).
99 pub processor: String,
100
101 /// File extensions this profile applies to (e.g. `[".rb", ".conf"]`).
102 #[serde(default)]
103 pub extensions: Vec<String>,
104
105 /// If non-empty, the filename must match at least one of these glob
106 /// patterns in addition to the extension check.
107 #[serde(default)]
108 pub include: Vec<String>,
109
110 /// Filenames matching any of these glob patterns are excluded from
111 /// structured processing even if they match the extension (and include).
112 #[serde(default)]
113 pub exclude: Vec<String>,
114
115 /// Field rules: which keys/paths to sanitize.
116 pub fields: Vec<FieldRule>,
117
118 /// Free-form options passed to the processor (e.g. delimiter, comment chars).
119 #[serde(default)]
120 pub options: std::collections::HashMap<String, String>,
121}
122
123impl FileTypeProfile {
124 /// Create a minimal profile for a given processor.
125 #[must_use]
126 pub fn new(processor: impl Into<String>, fields: Vec<FieldRule>) -> Self {
127 Self {
128 processor: processor.into(),
129 extensions: Vec::new(),
130 include: Vec::new(),
131 exclude: Vec::new(),
132 fields,
133 options: std::collections::HashMap::new(),
134 }
135 }
136
137 /// Add an extension to this profile.
138 #[must_use]
139 pub fn with_extension(mut self, ext: impl Into<String>) -> Self {
140 self.extensions.push(ext.into());
141 self
142 }
143
144 /// Add a free-form option.
145 #[must_use]
146 pub fn with_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
147 self.options.insert(key.into(), value.into());
148 self
149 }
150
151 /// Check whether a filename should be processed by this profile.
152 ///
153 /// Returns `true` when all three conditions hold:
154 ///
155 /// 1. The filename ends with one of `extensions` (an empty list → `false`).
156 /// 2. If `include` is non-empty, the filename matches at least one glob.
157 /// 3. The filename does **not** match any `exclude` glob.
158 ///
159 /// Invalid glob patterns in `include`/`exclude` are silently skipped.
160 ///
161 /// # Examples
162 ///
163 /// ```
164 /// use sanitize_engine::processor::profile::FieldRule;
165 /// use sanitize_engine::processor::profile::FileTypeProfile;
166 ///
167 /// let profile = FileTypeProfile::new("json", vec![])
168 /// .with_extension(".json");
169 ///
170 /// assert!(profile.matches_filename("config.json"));
171 /// assert!(profile.matches_filename("logs/app.json"));
172 /// assert!(!profile.matches_filename("config.yml"));
173 ///
174 /// // Exclude log-formatted JSON files.
175 /// let profile = FileTypeProfile::new("json", vec![])
176 /// .with_extension(".json")
177 /// .with_exclude("*.log.json")
178 /// .with_exclude("logs/**");
179 ///
180 /// assert!(profile.matches_filename("config.json"));
181 /// assert!(!profile.matches_filename("app.log.json"));
182 /// assert!(!profile.matches_filename("logs/events.json"));
183 ///
184 /// // Include only config files.
185 /// let profile = FileTypeProfile::new("json", vec![])
186 /// .with_extension(".json")
187 /// .with_include("config*.json");
188 ///
189 /// assert!(profile.matches_filename("config.json"));
190 /// assert!(profile.matches_filename("config-prod.json"));
191 /// assert!(!profile.matches_filename("events.json"));
192 /// ```
193 pub fn matches_filename(&self, filename: &str) -> bool {
194 // 1. Extension must match.
195 if self.extensions.is_empty() {
196 return false;
197 }
198 if !self.extensions.iter().any(|ext| filename.ends_with(ext.as_str())) {
199 return false;
200 }
201
202 // Extract the basename for patterns that don't contain a path separator.
203 // This lets users write `config*.json` and have it match
204 // `/any/path/config-prod.json` without needing a `**/` prefix.
205 let basename: &str = std::path::Path::new(filename)
206 .file_name()
207 .and_then(|n| n.to_str())
208 .unwrap_or(filename);
209
210 let glob_matches = |pat: &str| {
211 Pattern::new(pat).map_or(false, |p| {
212 p.matches(filename) || p.matches(basename)
213 })
214 };
215
216 // 2. Include filter (opt-in narrowing): must match at least one pattern.
217 if !self.include.is_empty() {
218 if !self.include.iter().any(|pat| glob_matches(pat)) {
219 return false;
220 }
221 }
222
223 // 3. Exclude filter: must not match any pattern.
224 if self.exclude.iter().any(|pat| glob_matches(pat)) {
225 return false;
226 }
227
228 true
229 }
230
231 /// Add a glob pattern to the `include` list.
232 #[must_use]
233 pub fn with_include(mut self, pat: impl Into<String>) -> Self {
234 self.include.push(pat.into());
235 self
236 }
237
238 /// Add a glob pattern to the `exclude` list.
239 #[must_use]
240 pub fn with_exclude(mut self, pat: impl Into<String>) -> Self {
241 self.exclude.push(pat.into());
242 self
243 }
244}
245
246// ---------------------------------------------------------------------------
247// Serde support for Category (as string)
248// ---------------------------------------------------------------------------
249
250impl Serialize for Category {
251 fn serialize<S: serde::Serializer>(
252 &self,
253 serializer: S,
254 ) -> std::result::Result<S::Ok, S::Error> {
255 serializer.serialize_str(&self.to_string())
256 }
257}
258
259impl<'de> Deserialize<'de> for Category {
260 fn deserialize<D: serde::Deserializer<'de>>(
261 deserializer: D,
262 ) -> std::result::Result<Self, D::Error> {
263 let s = String::deserialize(deserializer)?;
264 Ok(match s.as_str() {
265 "email" => Category::Email,
266 "name" => Category::Name,
267 "phone" => Category::Phone,
268 "ipv4" => Category::IpV4,
269 "ipv6" => Category::IpV6,
270 "credit_card" => Category::CreditCard,
271 "ssn" => Category::Ssn,
272 "hostname" => Category::Hostname,
273 "mac_address" => Category::MacAddress,
274 "container_id" => Category::ContainerId,
275 "uuid" => Category::Uuid,
276 "jwt" => Category::Jwt,
277 "auth_token" => Category::AuthToken,
278 "file_path" => Category::FilePath,
279 "windows_sid" => Category::WindowsSid,
280 "url" => Category::Url,
281 "aws_arn" => Category::AwsArn,
282 "azure_resource_id" => Category::AzureResourceId,
283 other => {
284 let tag = other.strip_prefix("custom:").unwrap_or(other);
285 Category::Custom(tag.into())
286 }
287 })
288 }
289}