Skip to main content

sanitize_engine/processor/
profile.rs

//! File-type profiles for structured processors.
//!
//! A [`FileTypeProfile`] tells the processing pipeline which processor
//! to use and which fields/keys within the file should be sanitized.
5
6use crate::category::Category;
7use glob::Pattern;
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10
// ---------------------------------------------------------------------------
// FieldNameSignal
// ---------------------------------------------------------------------------
14
/// Default Shannon-entropy cutoff, in bits per character, for field-name signals.
///
/// A value whose entropy is **below** the cutoff passes through untouched even
/// if the name of its key matches a sensitive keyword. This keeps enum-like
/// values such as `token_type: Bearer` or `auth: basic` from being treated as
/// secrets.
///
/// The cutoff can be changed for an individual signal by adding
/// `threshold: <f64>` to its entry in the secrets file, and the whole heuristic
/// can be switched off with `--no-field-signal`:
///
/// ```yaml
/// # Lower threshold: catch more, including weaker secrets
/// - kind: field-name
///   pattern: "^(password|secret)$"
///   threshold: 3.0
///
/// # Higher threshold: only flag high-entropy tokens
/// - kind: field-name
///   pattern: "^(token|key)$"
///   threshold: 4.0
/// ```
pub const DEFAULT_FIELD_SIGNAL_THRESHOLD: f64 = 3.5;
36
37/// A field-name–based heuristic signal used during structured processing.
38///
39/// When no explicit [`FieldRule`] covers a key, the processor checks the bare
40/// key name against all active signals.  If a signal matches **and** the
41/// value's Shannon entropy meets or exceeds `threshold`, the value is replaced
42/// using `category` — as if an explicit rule had been defined.
43///
44/// # Entropy threshold guidance
45///
46/// | Threshold | Behaviour |
47/// |-----------|-----------|
48/// | **3.0** | Catches most secrets including moderately weak ones; recommended for high-confidence keywords (`password`, `secret`) |
49/// | **3.5** | Balanced default — skips plain enum values like `Bearer`, `basic`, `true` |
50/// | **4.0** | Conservative — only high-entropy tokens; use when false-positive rate matters |
51///
52/// # Configuring via secrets file
53///
54/// Add `kind: field-name` entries to your secrets file.  The `pattern` field
55/// is a case-insensitive regex matched against the **bare key name** (not the
56/// full dot-path).  `threshold` defaults to [`DEFAULT_FIELD_SIGNAL_THRESHOLD`]
57/// when omitted.
58///
59/// ```yaml
60/// # Strong signal: flag any `password`/`secret`/`private_key` with entropy ≥ 3.0
61/// - kind: field-name
62///   pattern: "^(password|passwd|secret|private_key|client_secret)$"
63///   category: custom:credential
64///   label: my-strong-signals
65///   threshold: 3.0
66///
67/// # Medium signal: flag `token`/`api_key` only when value looks like a real token
68/// - kind: field-name
69///   pattern: "^(token|api_key|access_key)$"
70///   category: custom:credential
71///   threshold: 3.5
72/// ```
73///
74/// Suppress false positives on specific values with `kind: allow`:
75///
76/// ```yaml
77/// - kind: allow
78///   values: ["Bearer", "basic", "oauth2", "true", "false"]
79/// ```
80///
81/// # Built-in defaults
82///
83/// When default patterns or `--app` is active, two built-in signals are
84/// injected automatically (unless `--no-field-signal` is passed):
85///
86/// - **Strong** (`threshold: 3.0`): `password`, `passwd`, `secret`,
87///   `private_key`, `api_secret`, `client_secret`
88/// - **Medium** (`threshold: 3.5`): `api_key`, `access_key`, `auth_token`,
89///   `token`, `signing_key`, `encryption_key`, `credential`, `cert`
90#[derive(Debug, Clone)]
91pub struct FieldNameSignal {
92    /// Original pattern string — shown in error messages and log output.
93    pub key_pattern: String,
94    /// Case-insensitive regex compiled from `key_pattern`.
95    pub(crate) key_regex: Regex,
96    /// Replacement category applied to values that pass the entropy gate.
97    pub category: Category,
98    /// Label used in findings and reports.
99    /// Defaults to `"field-signal:<key_pattern>"`.
100    pub label: String,
101    /// Shannon entropy threshold in bits per character.
102    ///
103    /// Values **below** this threshold are left unchanged.
104    /// See the table above and [`DEFAULT_FIELD_SIGNAL_THRESHOLD`].
105    pub threshold: f64,
106}
107
108impl FieldNameSignal {
109    /// Construct a new signal, compiling `key_pattern` as a case-insensitive regex.
110    ///
111    /// # Errors
112    ///
113    /// Returns a human-readable error string if `key_pattern` is not a valid regex.
114    pub fn new(
115        key_pattern: impl Into<String>,
116        category: Category,
117        label: Option<String>,
118        threshold: f64,
119    ) -> Result<Self, String> {
120        let key_pattern = key_pattern.into();
121        let key_regex = regex::RegexBuilder::new(&key_pattern)
122            .case_insensitive(true)
123            .build()
124            .map_err(|e| format!("field-name signal pattern {:?}: {e}", key_pattern))?;
125        let label = label.unwrap_or_else(|| format!("field-signal:{}", key_pattern));
126        Ok(Self {
127            key_pattern,
128            key_regex,
129            category,
130            label,
131            threshold,
132        })
133    }
134
135    /// Returns `true` if `key` (bare field name, not a dot-path) matches this signal.
136    #[inline]
137    #[must_use]
138    pub fn matches_key(&self, key: &str) -> bool {
139        self.key_regex.is_match(key)
140    }
141}
142
// ---------------------------------------------------------------------------
// FieldRule
// ---------------------------------------------------------------------------
146
147/// A rule describing a single field/key to sanitize.
148///
149/// # Pattern Syntax
150///
151/// - Exact key: `"password"`, `"db_host"`.
152/// - Dotted path: `"database.password"`, `"smtp.user"`.
153/// - Glob suffix: `"*.password"` — matches any key ending in `.password`.
154/// - Glob prefix: `"db.*"` — matches any key starting with `db.`.
155/// - Wildcard: `"*"` — matches every field.
156///
157/// # Sub-processor
158///
159/// When a field's value is itself a structured document (e.g. YAML embedded
160/// in a Ruby heredoc), set `sub_processor` to the processor name and provide
161/// `sub_fields` with rules for the nested content. The parent processor
162/// extracts the value and delegates it to the named sub-processor.
163///
164/// ```yaml
165/// - pattern: "*['ldap_servers']"
166///   sub_processor: yaml
167///   sub_fields:
168///     - pattern: "*.password"
169///       category: custom:password
170///     - pattern: "*.bind_dn"
171///       category: custom:dn
172/// ```
173#[derive(Debug, Clone, Serialize, Deserialize)]
174pub struct FieldRule {
175    /// Key pattern to match (see Pattern Syntax above).
176    pub pattern: String,
177
178    /// Category for replacement generation. Defaults to `Custom("field")`
179    /// if not specified. Ignored when `sub_processor` is set.
180    #[serde(default, skip_serializing_if = "Option::is_none")]
181    pub category: Option<Category>,
182
183    /// Optional human-readable label for reporting.
184    #[serde(default, skip_serializing_if = "Option::is_none")]
185    pub label: Option<String>,
186
187    /// Minimum byte length a value must reach before it is replaced.
188    ///
189    /// Values shorter than this threshold pass through unchanged. Use this
190    /// to avoid redacting obviously non-secret values matched by broad glob
191    /// patterns (e.g. `"false"`, `"0"`, `"nil"` matched by `*secret*`).
192    ///
193    /// A value of `8` is a reasonable default for token/password fields.
194    /// Omit (or set to `0`) to replace all matching values regardless of length.
195    #[serde(default, skip_serializing_if = "Option::is_none")]
196    pub min_length: Option<usize>,
197
198    /// Name of the processor to use for the field's value when it contains
199    /// an embedded structured document (e.g. `"yaml"`, `"json"`, `"toml"`).
200    #[serde(default, skip_serializing_if = "Option::is_none")]
201    pub sub_processor: Option<String>,
202
203    /// Field rules applied by `sub_processor` to the nested content.
204    /// Ignored when `sub_processor` is `None`.
205    #[serde(default, skip_serializing_if = "Vec::is_empty")]
206    pub sub_fields: Vec<FieldRule>,
207}
208
209impl FieldRule {
210    /// Create a new field rule with just a pattern.
211    #[must_use]
212    pub fn new(pattern: impl Into<String>) -> Self {
213        Self {
214            pattern: pattern.into(),
215            category: None,
216            label: None,
217            min_length: None,
218            sub_processor: None,
219            sub_fields: Vec::new(),
220        }
221    }
222
223    /// Set the minimum value length required for replacement.
224    #[must_use]
225    pub fn with_min_length(mut self, min: usize) -> Self {
226        self.min_length = Some(min);
227        self
228    }
229
230    /// Set the category for this rule.
231    #[must_use]
232    pub fn with_category(mut self, category: Category) -> Self {
233        self.category = Some(category);
234        self
235    }
236
237    /// Set the label for this rule.
238    #[must_use]
239    pub fn with_label(mut self, label: impl Into<String>) -> Self {
240        self.label = Some(label.into());
241        self
242    }
243
244    /// Set the sub-processor name for embedded structured content.
245    #[must_use]
246    pub fn with_sub_processor(mut self, name: impl Into<String>) -> Self {
247        self.sub_processor = Some(name.into());
248        self
249    }
250
251    /// Set the field rules applied by the sub-processor.
252    #[must_use]
253    pub fn with_sub_fields(mut self, fields: Vec<FieldRule>) -> Self {
254        self.sub_fields = fields;
255        self
256    }
257}
258
// ---------------------------------------------------------------------------
// FileTypeProfile
// ---------------------------------------------------------------------------
262
263/// Specifies which processor to use and what fields to sanitize.
264///
265/// # File matching
266///
267/// A file is processed by this profile when **all** of the following hold:
268///
269/// 1. Its name ends with one of the `extensions` (required — an empty list
270///    matches nothing).
271/// 2. If `include` is non-empty, the filename matches **at least one** of
272///    those glob patterns.
273/// 3. The filename does **not** match any `exclude` glob pattern.
274///
275/// Glob patterns use `*` (any chars within a path component) and `**`
276/// (any chars including path separators).
277///
278/// # Example (YAML)
279///
280/// ```yaml
281/// - processor: json
282///   extensions: [".json"]
283///   # Only apply to files whose names start with "config"
284///   include: ["config*.json"]
285///   # Never apply to log files
286///   exclude: ["*.log.json", "logs/**"]
287///   fields:
288///     - pattern: "*.password"
289///       category: "custom:password"
290/// ```
291#[derive(Debug, Clone, Serialize, Deserialize)]
292pub struct FileTypeProfile {
293    /// Name of the processor to use (e.g. `"key_value"`, `"json"`).
294    pub processor: String,
295
296    /// File extensions this profile applies to (e.g. `[".rb", ".conf"]`).
297    #[serde(default)]
298    pub extensions: Vec<String>,
299
300    /// If non-empty, the filename must match at least one of these glob
301    /// patterns in addition to the extension check.
302    #[serde(default)]
303    pub include: Vec<String>,
304
305    /// Filenames matching any of these glob patterns are excluded from
306    /// structured processing even if they match the extension (and include).
307    #[serde(default)]
308    pub exclude: Vec<String>,
309
310    /// Field rules: which keys/paths to sanitize.
311    pub fields: Vec<FieldRule>,
312
313    /// Free-form options passed to the processor (e.g. delimiter, comment chars).
314    #[serde(default)]
315    pub options: std::collections::HashMap<String, String>,
316
317    /// Field-name signals injected at runtime from `kind: field-name` secrets
318    /// entries and from built-in defaults when default patterns or `--app` is
319    /// active.  Never serialized to or deserialized from the profile file on
320    /// disk — configure signals in your secrets file instead.
321    #[serde(skip)]
322    pub field_name_signals: Vec<FieldNameSignal>,
323}
324
325impl FileTypeProfile {
326    /// Create a minimal profile for a given processor.
327    #[must_use]
328    pub fn new(processor: impl Into<String>, fields: Vec<FieldRule>) -> Self {
329        Self {
330            processor: processor.into(),
331            extensions: Vec::new(),
332            include: Vec::new(),
333            exclude: Vec::new(),
334            fields,
335            options: std::collections::HashMap::new(),
336            field_name_signals: Vec::new(),
337        }
338    }
339
340    /// Add an extension to this profile.
341    #[must_use]
342    pub fn with_extension(mut self, ext: impl Into<String>) -> Self {
343        self.extensions.push(ext.into());
344        self
345    }
346
347    /// Add a free-form option.
348    #[must_use]
349    pub fn with_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
350        self.options.insert(key.into(), value.into());
351        self
352    }
353
354    /// Check whether a filename should be processed by this profile.
355    ///
356    /// Returns `true` when all three conditions hold:
357    ///
358    /// 1. The filename ends with one of `extensions` (an empty list → `false`).
359    /// 2. If `include` is non-empty, the filename matches at least one glob.
360    /// 3. The filename does **not** match any `exclude` glob.
361    ///
362    /// Invalid glob patterns in `include`/`exclude` are silently skipped.
363    ///
364    /// # Examples
365    ///
366    /// ```
367    /// use sanitize_engine::processor::profile::FieldRule;
368    /// use sanitize_engine::processor::profile::FileTypeProfile;
369    ///
370    /// let profile = FileTypeProfile::new("json", vec![])
371    ///     .with_extension(".json");
372    ///
373    /// assert!(profile.matches_filename("config.json"));
374    /// assert!(profile.matches_filename("logs/app.json"));
375    /// assert!(!profile.matches_filename("config.yml"));
376    ///
377    /// // Exclude log-formatted JSON files.
378    /// let profile = FileTypeProfile::new("json", vec![])
379    ///     .with_extension(".json")
380    ///     .with_exclude("*.log.json")
381    ///     .with_exclude("logs/**");
382    ///
383    /// assert!(profile.matches_filename("config.json"));
384    /// assert!(!profile.matches_filename("app.log.json"));
385    /// assert!(!profile.matches_filename("logs/events.json"));
386    ///
387    /// // Include only config files.
388    /// let profile = FileTypeProfile::new("json", vec![])
389    ///     .with_extension(".json")
390    ///     .with_include("config*.json");
391    ///
392    /// assert!(profile.matches_filename("config.json"));
393    /// assert!(profile.matches_filename("config-prod.json"));
394    /// assert!(!profile.matches_filename("events.json"));
395    /// ```
396    pub fn matches_filename(&self, filename: &str) -> bool {
397        // 1. Extension must match.
398        if self.extensions.is_empty() {
399            return false;
400        }
401        if !self
402            .extensions
403            .iter()
404            .any(|ext| filename.ends_with(ext.as_str()))
405        {
406            return false;
407        }
408
409        // Extract the basename for patterns that don't contain a path separator.
410        // This lets users write `config*.json` and have it match
411        // `/any/path/config-prod.json` without needing a `**/` prefix.
412        let basename: &str = std::path::Path::new(filename)
413            .file_name()
414            .and_then(|n| n.to_str())
415            .unwrap_or(filename);
416
417        let glob_matches =
418            |pat: &str| Pattern::new(pat).is_ok_and(|p| p.matches(filename) || p.matches(basename));
419
420        // 2. Include filter (opt-in narrowing): must match at least one pattern.
421        if !self.include.is_empty() && !self.include.iter().any(|pat| glob_matches(pat)) {
422            return false;
423        }
424
425        // 3. Exclude filter: must not match any pattern.
426        if self.exclude.iter().any(|pat| glob_matches(pat)) {
427            return false;
428        }
429
430        true
431    }
432
433    /// Add a glob pattern to the `include` list.
434    #[must_use]
435    pub fn with_include(mut self, pat: impl Into<String>) -> Self {
436        self.include.push(pat.into());
437        self
438    }
439
440    /// Add a glob pattern to the `exclude` list.
441    #[must_use]
442    pub fn with_exclude(mut self, pat: impl Into<String>) -> Self {
443        self.exclude.push(pat.into());
444        self
445    }
446}
447
// ---------------------------------------------------------------------------
// Serde support for Category (as string)
// ---------------------------------------------------------------------------
451
452impl Serialize for Category {
453    fn serialize<S: serde::Serializer>(
454        &self,
455        serializer: S,
456    ) -> std::result::Result<S::Ok, S::Error> {
457        serializer.serialize_str(&self.to_string())
458    }
459}
460
461impl<'de> Deserialize<'de> for Category {
462    fn deserialize<D: serde::Deserializer<'de>>(
463        deserializer: D,
464    ) -> std::result::Result<Self, D::Error> {
465        let s = String::deserialize(deserializer)?;
466        Ok(match s.as_str() {
467            "email" => Category::Email,
468            "name" => Category::Name,
469            "phone" => Category::Phone,
470            "ipv4" => Category::IpV4,
471            "ipv6" => Category::IpV6,
472            "credit_card" => Category::CreditCard,
473            "ssn" => Category::Ssn,
474            "hostname" => Category::Hostname,
475            "mac_address" => Category::MacAddress,
476            "container_id" => Category::ContainerId,
477            "uuid" => Category::Uuid,
478            "jwt" => Category::Jwt,
479            "auth_token" => Category::AuthToken,
480            "file_path" => Category::FilePath,
481            "windows_sid" => Category::WindowsSid,
482            "url" => Category::Url,
483            "aws_arn" => Category::AwsArn,
484            "azure_resource_id" => Category::AzureResourceId,
485            other => {
486                let tag = other.strip_prefix("custom:").unwrap_or(other);
487                Category::Custom(tag.into())
488            }
489        })
490    }
491}
492
#[cfg(test)]
mod tests {
    use super::*;

    /// A JSON profile limited to the `.json` extension, shared by the
    /// include/exclude tests below.
    fn json_profile() -> FileTypeProfile {
        FileTypeProfile::new("json", vec![]).with_extension(".json")
    }

    // ---- FieldRule builders ----

    #[test]
    fn field_rule_with_min_length() {
        let built = FieldRule::new("*.password").with_min_length(8);
        assert_eq!(built.min_length, Some(8));
    }

    #[test]
    fn field_rule_with_category() {
        let built = FieldRule::new("*.email").with_category(Category::Email);
        assert_eq!(built.category, Some(Category::Email));
    }

    #[test]
    fn field_rule_with_label() {
        let built = FieldRule::new("*.token").with_label("my-token");
        assert_eq!(built.label.as_deref(), Some("my-token"));
    }

    // ---- FileTypeProfile builders ----

    #[test]
    fn profile_with_include_narrows_match() {
        let narrowed = json_profile().with_include("config*.json");

        assert!(narrowed.matches_filename("config.json"));
        assert!(narrowed.matches_filename("config-prod.json"));
        assert!(!narrowed.matches_filename("events.json"));
    }

    #[test]
    fn profile_with_exclude_blocks_match() {
        let filtered = json_profile().with_exclude("*.log.json");

        assert!(filtered.matches_filename("config.json"));
        assert!(!filtered.matches_filename("server.log.json"));
    }

    #[test]
    fn profile_include_and_exclude_combined() {
        let combined = json_profile()
            .with_include("config*.json")
            .with_exclude("config-secret.json");

        assert!(combined.matches_filename("config-prod.json"));
        assert!(!combined.matches_filename("config-secret.json"));
        assert!(!combined.matches_filename("events.json"));
    }

    #[test]
    fn profile_no_extensions_matches_nothing() {
        let bare = FileTypeProfile::new("json", vec![]);
        assert!(!bare.matches_filename("anything.json"));
    }

    // ---- Category serde roundtrip ----

    #[test]
    fn category_serialize_deserialize_roundtrip() {
        let cases = [
            ("email", Category::Email),
            ("ipv4", Category::IpV4),
            ("custom:my_key", Category::Custom("my_key".into())),
        ];
        for (s, expected) in cases {
            let json = format!("\"{s}\"");

            let got: Category = serde_json::from_str(&json).unwrap();
            assert_eq!(got, expected, "deserializing {s}");

            let serialized = serde_json::to_string(&got).unwrap();
            assert_eq!(serialized, json, "serializing {s}");
        }
    }
}