sanitize_engine/processor/profile.rs
1//! File-type profiles for structured processors.
2//!
3//! A [`FileTypeProfile`] tells the processing pipeline which processor
4//! to use and which fields/keys within the file should be sanitized.
5
6use crate::category::Category;
7use glob::Pattern;
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10
11// ---------------------------------------------------------------------------
12// FieldNameSignal
13// ---------------------------------------------------------------------------
14
/// Default Shannon entropy threshold (bits per character) for built-in field-name signals.
///
/// Values whose entropy is **below** this threshold are left unchanged even
/// when their key name matches a sensitive keyword, preventing false positives
/// on enum-like values such as `token_type: Bearer` or `auth: basic`. A value
/// whose entropy is exactly equal to the threshold counts as meeting it.
///
/// This is also the value a `kind: field-name` entry receives when it omits
/// `threshold`.
///
/// Override per signal in the secrets file with `threshold: <f64>`, or disable
/// the heuristic entirely with `--no-field-signal`:
///
/// ```yaml
/// # Lower threshold: catch more, including weaker secrets
/// - kind: field-name
///   pattern: "^(password|secret)$"
///   threshold: 3.0
///
/// # Higher threshold: only flag high-entropy tokens
/// - kind: field-name
///   pattern: "^(token|key)$"
///   threshold: 4.0
/// ```
pub const DEFAULT_FIELD_SIGNAL_THRESHOLD: f64 = 3.5;
36
37/// A field-name–based heuristic signal used during structured processing.
38///
39/// When no explicit [`FieldRule`] covers a key, the processor checks the bare
40/// key name against all active signals. If a signal matches **and** the
41/// value's Shannon entropy meets or exceeds `threshold`, the value is replaced
42/// using `category` — as if an explicit rule had been defined.
43///
44/// # Entropy threshold guidance
45///
46/// | Threshold | Behaviour |
47/// |-----------|-----------|
48/// | **3.0** | Catches most secrets including moderately weak ones; recommended for high-confidence keywords (`password`, `secret`) |
49/// | **3.5** | Balanced default — skips plain enum values like `Bearer`, `basic`, `true` |
50/// | **4.0** | Conservative — only high-entropy tokens; use when false-positive rate matters |
51///
52/// # Configuring via secrets file
53///
54/// Add `kind: field-name` entries to your secrets file. The `pattern` field
55/// is a case-insensitive regex matched against the **bare key name** (not the
56/// full dot-path). `threshold` defaults to [`DEFAULT_FIELD_SIGNAL_THRESHOLD`]
57/// when omitted.
58///
59/// ```yaml
60/// # Strong signal: flag any `password`/`secret`/`private_key` with entropy ≥ 3.0
61/// - kind: field-name
62/// pattern: "^(password|passwd|secret|private_key|client_secret)$"
63/// category: custom:credential
64/// label: my-strong-signals
65/// threshold: 3.0
66///
67/// # Medium signal: flag `token`/`api_key` only when value looks like a real token
68/// - kind: field-name
69/// pattern: "^(token|api_key|access_key)$"
70/// category: custom:credential
71/// threshold: 3.5
72/// ```
73///
74/// Suppress false positives on specific values with `kind: allow`:
75///
76/// ```yaml
77/// - kind: allow
78/// values: ["Bearer", "basic", "oauth2", "true", "false"]
79/// ```
80///
81/// # Built-in defaults
82///
83/// When default patterns or `--app` is active, two built-in signals are
84/// injected automatically (unless `--no-field-signal` is passed):
85///
86/// - **Strong** (`threshold: 3.0`): `password`, `passwd`, `secret`,
87/// `private_key`, `api_secret`, `client_secret`
88/// - **Medium** (`threshold: 3.5`): `api_key`, `access_key`, `auth_token`,
89/// `token`, `signing_key`, `encryption_key`, `credential`, `cert`
90#[derive(Debug, Clone)]
91pub struct FieldNameSignal {
92 /// Original pattern string — shown in error messages and log output.
93 pub key_pattern: String,
94 /// Case-insensitive regex compiled from `key_pattern`.
95 pub(crate) key_regex: Regex,
96 /// Replacement category applied to values that pass the entropy gate.
97 pub category: Category,
98 /// Label used in findings and reports.
99 /// Defaults to `"field-signal:<key_pattern>"`.
100 pub label: String,
101 /// Shannon entropy threshold in bits per character.
102 ///
103 /// Values **below** this threshold are left unchanged.
104 /// See the table above and [`DEFAULT_FIELD_SIGNAL_THRESHOLD`].
105 pub threshold: f64,
106}
107
108impl FieldNameSignal {
109 /// Construct a new signal, compiling `key_pattern` as a case-insensitive regex.
110 ///
111 /// # Errors
112 ///
113 /// Returns a human-readable error string if `key_pattern` is not a valid regex.
114 pub fn new(
115 key_pattern: impl Into<String>,
116 category: Category,
117 label: Option<String>,
118 threshold: f64,
119 ) -> Result<Self, String> {
120 let key_pattern = key_pattern.into();
121 let key_regex = regex::RegexBuilder::new(&key_pattern)
122 .case_insensitive(true)
123 .build()
124 .map_err(|e| format!("field-name signal pattern {:?}: {e}", key_pattern))?;
125 let label = label.unwrap_or_else(|| format!("field-signal:{}", key_pattern));
126 Ok(Self {
127 key_pattern,
128 key_regex,
129 category,
130 label,
131 threshold,
132 })
133 }
134
135 /// Returns `true` if `key` (bare field name, not a dot-path) matches this signal.
136 #[inline]
137 #[must_use]
138 pub fn matches_key(&self, key: &str) -> bool {
139 self.key_regex.is_match(key)
140 }
141}
142
143// ---------------------------------------------------------------------------
144// FieldRule
145// ---------------------------------------------------------------------------
146
147/// A rule describing a single field/key to sanitize.
148///
149/// # Pattern Syntax
150///
151/// - Exact key: `"password"`, `"db_host"`.
152/// - Dotted path: `"database.password"`, `"smtp.user"`.
153/// - Glob suffix: `"*.password"` — matches any key ending in `.password`.
154/// - Glob prefix: `"db.*"` — matches any key starting with `db.`.
155/// - Wildcard: `"*"` — matches every field.
156///
157/// # Sub-processor
158///
159/// When a field's value is itself a structured document (e.g. YAML embedded
160/// in a Ruby heredoc), set `sub_processor` to the processor name and provide
161/// `sub_fields` with rules for the nested content. The parent processor
162/// extracts the value and delegates it to the named sub-processor.
163///
164/// ```yaml
165/// - pattern: "*['ldap_servers']"
166/// sub_processor: yaml
167/// sub_fields:
168/// - pattern: "*.password"
169/// category: custom:password
170/// - pattern: "*.bind_dn"
171/// category: custom:dn
172/// ```
173#[derive(Debug, Clone, Serialize, Deserialize)]
174pub struct FieldRule {
175 /// Key pattern to match (see Pattern Syntax above).
176 pub pattern: String,
177
178 /// Category for replacement generation. Defaults to `Custom("field")`
179 /// if not specified. Ignored when `sub_processor` is set.
180 #[serde(default, skip_serializing_if = "Option::is_none")]
181 pub category: Option<Category>,
182
183 /// Optional human-readable label for reporting.
184 #[serde(default, skip_serializing_if = "Option::is_none")]
185 pub label: Option<String>,
186
187 /// Minimum byte length a value must reach before it is replaced.
188 ///
189 /// Values shorter than this threshold pass through unchanged. Use this
190 /// to avoid redacting obviously non-secret values matched by broad glob
191 /// patterns (e.g. `"false"`, `"0"`, `"nil"` matched by `*secret*`).
192 ///
193 /// A value of `8` is a reasonable default for token/password fields.
194 /// Omit (or set to `0`) to replace all matching values regardless of length.
195 #[serde(default, skip_serializing_if = "Option::is_none")]
196 pub min_length: Option<usize>,
197
198 /// Name of the processor to use for the field's value when it contains
199 /// an embedded structured document (e.g. `"yaml"`, `"json"`, `"toml"`).
200 #[serde(default, skip_serializing_if = "Option::is_none")]
201 pub sub_processor: Option<String>,
202
203 /// Field rules applied by `sub_processor` to the nested content.
204 /// Ignored when `sub_processor` is `None`.
205 #[serde(default, skip_serializing_if = "Vec::is_empty")]
206 pub sub_fields: Vec<FieldRule>,
207}
208
209impl FieldRule {
210 /// Create a new field rule with just a pattern.
211 #[must_use]
212 pub fn new(pattern: impl Into<String>) -> Self {
213 Self {
214 pattern: pattern.into(),
215 category: None,
216 label: None,
217 min_length: None,
218 sub_processor: None,
219 sub_fields: Vec::new(),
220 }
221 }
222
223 /// Set the minimum value length required for replacement.
224 #[must_use]
225 pub fn with_min_length(mut self, min: usize) -> Self {
226 self.min_length = Some(min);
227 self
228 }
229
230 /// Set the category for this rule.
231 #[must_use]
232 pub fn with_category(mut self, category: Category) -> Self {
233 self.category = Some(category);
234 self
235 }
236
237 /// Set the label for this rule.
238 #[must_use]
239 pub fn with_label(mut self, label: impl Into<String>) -> Self {
240 self.label = Some(label.into());
241 self
242 }
243
244 /// Set the sub-processor name for embedded structured content.
245 #[must_use]
246 pub fn with_sub_processor(mut self, name: impl Into<String>) -> Self {
247 self.sub_processor = Some(name.into());
248 self
249 }
250
251 /// Set the field rules applied by the sub-processor.
252 #[must_use]
253 pub fn with_sub_fields(mut self, fields: Vec<FieldRule>) -> Self {
254 self.sub_fields = fields;
255 self
256 }
257}
258
259// ---------------------------------------------------------------------------
260// FileTypeProfile
261// ---------------------------------------------------------------------------
262
263/// Specifies which processor to use and what fields to sanitize.
264///
265/// # File matching
266///
267/// A file is processed by this profile when **all** of the following hold:
268///
269/// 1. Its name ends with one of the `extensions` (required — an empty list
270/// matches nothing).
271/// 2. If `include` is non-empty, the filename matches **at least one** of
272/// those glob patterns.
273/// 3. The filename does **not** match any `exclude` glob pattern.
274///
275/// Glob patterns use `*` (any chars within a path component) and `**`
276/// (any chars including path separators).
277///
278/// # Example (YAML)
279///
280/// ```yaml
281/// - processor: json
282/// extensions: [".json"]
283/// # Only apply to files whose names start with "config"
284/// include: ["config*.json"]
285/// # Never apply to log files
286/// exclude: ["*.log.json", "logs/**"]
287/// fields:
288/// - pattern: "*.password"
289/// category: "custom:password"
290/// ```
291#[derive(Debug, Clone, Serialize, Deserialize)]
292pub struct FileTypeProfile {
293 /// Name of the processor to use (e.g. `"key_value"`, `"json"`).
294 pub processor: String,
295
296 /// File extensions this profile applies to (e.g. `[".rb", ".conf"]`).
297 #[serde(default)]
298 pub extensions: Vec<String>,
299
300 /// If non-empty, the filename must match at least one of these glob
301 /// patterns in addition to the extension check.
302 #[serde(default)]
303 pub include: Vec<String>,
304
305 /// Filenames matching any of these glob patterns are excluded from
306 /// structured processing even if they match the extension (and include).
307 #[serde(default)]
308 pub exclude: Vec<String>,
309
310 /// Field rules: which keys/paths to sanitize.
311 pub fields: Vec<FieldRule>,
312
313 /// Free-form options passed to the processor (e.g. delimiter, comment chars).
314 #[serde(default)]
315 pub options: std::collections::HashMap<String, String>,
316
317 /// Field-name signals injected at runtime from `kind: field-name` secrets
318 /// entries and from built-in defaults when default patterns or `--app` is
319 /// active. Never serialized to or deserialized from the profile file on
320 /// disk — configure signals in your secrets file instead.
321 #[serde(skip)]
322 pub field_name_signals: Vec<FieldNameSignal>,
323}
324
325impl FileTypeProfile {
326 /// Create a minimal profile for a given processor.
327 #[must_use]
328 pub fn new(processor: impl Into<String>, fields: Vec<FieldRule>) -> Self {
329 Self {
330 processor: processor.into(),
331 extensions: Vec::new(),
332 include: Vec::new(),
333 exclude: Vec::new(),
334 fields,
335 options: std::collections::HashMap::new(),
336 field_name_signals: Vec::new(),
337 }
338 }
339
340 /// Add an extension to this profile.
341 #[must_use]
342 pub fn with_extension(mut self, ext: impl Into<String>) -> Self {
343 self.extensions.push(ext.into());
344 self
345 }
346
347 /// Add a free-form option.
348 #[must_use]
349 pub fn with_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
350 self.options.insert(key.into(), value.into());
351 self
352 }
353
354 /// Check whether a filename should be processed by this profile.
355 ///
356 /// Returns `true` when all three conditions hold:
357 ///
358 /// 1. The filename ends with one of `extensions` (an empty list → `false`).
359 /// 2. If `include` is non-empty, the filename matches at least one glob.
360 /// 3. The filename does **not** match any `exclude` glob.
361 ///
362 /// Invalid glob patterns in `include`/`exclude` are silently skipped.
363 ///
364 /// # Examples
365 ///
366 /// ```
367 /// use sanitize_engine::processor::profile::FieldRule;
368 /// use sanitize_engine::processor::profile::FileTypeProfile;
369 ///
370 /// let profile = FileTypeProfile::new("json", vec![])
371 /// .with_extension(".json");
372 ///
373 /// assert!(profile.matches_filename("config.json"));
374 /// assert!(profile.matches_filename("logs/app.json"));
375 /// assert!(!profile.matches_filename("config.yml"));
376 ///
377 /// // Exclude log-formatted JSON files.
378 /// let profile = FileTypeProfile::new("json", vec![])
379 /// .with_extension(".json")
380 /// .with_exclude("*.log.json")
381 /// .with_exclude("logs/**");
382 ///
383 /// assert!(profile.matches_filename("config.json"));
384 /// assert!(!profile.matches_filename("app.log.json"));
385 /// assert!(!profile.matches_filename("logs/events.json"));
386 ///
387 /// // Include only config files.
388 /// let profile = FileTypeProfile::new("json", vec![])
389 /// .with_extension(".json")
390 /// .with_include("config*.json");
391 ///
392 /// assert!(profile.matches_filename("config.json"));
393 /// assert!(profile.matches_filename("config-prod.json"));
394 /// assert!(!profile.matches_filename("events.json"));
395 /// ```
396 pub fn matches_filename(&self, filename: &str) -> bool {
397 // 1. Extension must match.
398 if self.extensions.is_empty() {
399 return false;
400 }
401 if !self
402 .extensions
403 .iter()
404 .any(|ext| filename.ends_with(ext.as_str()))
405 {
406 return false;
407 }
408
409 // Extract the basename for patterns that don't contain a path separator.
410 // This lets users write `config*.json` and have it match
411 // `/any/path/config-prod.json` without needing a `**/` prefix.
412 let basename: &str = std::path::Path::new(filename)
413 .file_name()
414 .and_then(|n| n.to_str())
415 .unwrap_or(filename);
416
417 let glob_matches =
418 |pat: &str| Pattern::new(pat).is_ok_and(|p| p.matches(filename) || p.matches(basename));
419
420 // 2. Include filter (opt-in narrowing): must match at least one pattern.
421 if !self.include.is_empty() && !self.include.iter().any(|pat| glob_matches(pat)) {
422 return false;
423 }
424
425 // 3. Exclude filter: must not match any pattern.
426 if self.exclude.iter().any(|pat| glob_matches(pat)) {
427 return false;
428 }
429
430 true
431 }
432
433 /// Add a glob pattern to the `include` list.
434 #[must_use]
435 pub fn with_include(mut self, pat: impl Into<String>) -> Self {
436 self.include.push(pat.into());
437 self
438 }
439
440 /// Add a glob pattern to the `exclude` list.
441 #[must_use]
442 pub fn with_exclude(mut self, pat: impl Into<String>) -> Self {
443 self.exclude.push(pat.into());
444 self
445 }
446}
447
448// ---------------------------------------------------------------------------
449// Serde support for Category (as string)
450// ---------------------------------------------------------------------------
451
452impl Serialize for Category {
453 fn serialize<S: serde::Serializer>(
454 &self,
455 serializer: S,
456 ) -> std::result::Result<S::Ok, S::Error> {
457 serializer.serialize_str(&self.to_string())
458 }
459}
460
461impl<'de> Deserialize<'de> for Category {
462 fn deserialize<D: serde::Deserializer<'de>>(
463 deserializer: D,
464 ) -> std::result::Result<Self, D::Error> {
465 let s = String::deserialize(deserializer)?;
466 Ok(match s.as_str() {
467 "email" => Category::Email,
468 "name" => Category::Name,
469 "phone" => Category::Phone,
470 "ipv4" => Category::IpV4,
471 "ipv6" => Category::IpV6,
472 "credit_card" => Category::CreditCard,
473 "ssn" => Category::Ssn,
474 "hostname" => Category::Hostname,
475 "mac_address" => Category::MacAddress,
476 "container_id" => Category::ContainerId,
477 "uuid" => Category::Uuid,
478 "jwt" => Category::Jwt,
479 "auth_token" => Category::AuthToken,
480 "file_path" => Category::FilePath,
481 "windows_sid" => Category::WindowsSid,
482 "url" => Category::Url,
483 "aws_arn" => Category::AwsArn,
484 "azure_resource_id" => Category::AzureResourceId,
485 other => {
486 let tag = other.strip_prefix("custom:").unwrap_or(other);
487 Category::Custom(tag.into())
488 }
489 })
490 }
491}
492
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline fixture for the filename tests: `json` processor, no field
    /// rules, `.json` extension.
    fn json_profile() -> FileTypeProfile {
        FileTypeProfile::new("json", vec![]).with_extension(".json")
    }

    /// Checks `matches_filename` against each `(filename, expected)` pair.
    fn assert_matches(profile: &FileTypeProfile, cases: &[(&str, bool)]) {
        for &(filename, expected) in cases {
            assert_eq!(
                profile.matches_filename(filename),
                expected,
                "matches_filename({filename:?})"
            );
        }
    }

    // ---- FieldRule builders ----

    #[test]
    fn field_rule_with_min_length() {
        let FieldRule { min_length, .. } = FieldRule::new("*.password").with_min_length(8);
        assert_eq!(min_length, Some(8));
    }

    #[test]
    fn field_rule_with_category() {
        let FieldRule { category, .. } = FieldRule::new("*.email").with_category(Category::Email);
        assert_eq!(category, Some(Category::Email));
    }

    #[test]
    fn field_rule_with_label() {
        let FieldRule { label, .. } = FieldRule::new("*.token").with_label("my-token");
        assert_eq!(label.as_deref(), Some("my-token"));
    }

    // ---- FileTypeProfile builders ----

    #[test]
    fn profile_with_include_narrows_match() {
        let profile = json_profile().with_include("config*.json");

        assert_matches(
            &profile,
            &[
                ("config.json", true),
                ("config-prod.json", true),
                ("events.json", false),
            ],
        );
    }

    #[test]
    fn profile_with_exclude_blocks_match() {
        let profile = json_profile().with_exclude("*.log.json");

        assert_matches(
            &profile,
            &[("config.json", true), ("server.log.json", false)],
        );
    }

    #[test]
    fn profile_include_and_exclude_combined() {
        let profile = json_profile()
            .with_include("config*.json")
            .with_exclude("config-secret.json");

        assert_matches(
            &profile,
            &[
                ("config-prod.json", true),
                ("config-secret.json", false),
                ("events.json", false),
            ],
        );
    }

    #[test]
    fn profile_no_extensions_matches_nothing() {
        // No `with_extension` call: the extension list stays empty.
        let profile = FileTypeProfile::new("json", vec![]);
        assert_matches(&profile, &[("anything.json", false)]);
    }

    // ---- Category serde roundtrip ----

    #[test]
    fn category_serialize_deserialize_roundtrip() {
        let cases = [
            ("email", Category::Email),
            ("ipv4", Category::IpV4),
            ("custom:my_key", Category::Custom("my_key".into())),
        ];
        for (s, expected) in &cases {
            // Wrap the bare name in quotes to form a JSON string literal.
            let json = format!("\"{}\"", s);

            let got: Category = serde_json::from_str(&json).unwrap();
            assert_eq!(got, *expected, "deserializing {s}");

            let serialized = serde_json::to_string(&got).unwrap();
            assert_eq!(serialized, json, "serializing {s}");
        }
    }
}
575}