Skip to main content

sanitize_engine/processor/
profile.rs

//! File-type profiles for structured processors.
//!
//! A [`FileTypeProfile`] tells the processing pipeline which processor
//! to use and which fields/keys within the file should be sanitized.
5
6use crate::category::Category;
7use glob::Pattern;
8use serde::{Deserialize, Serialize};
9
// ---------------------------------------------------------------------------
// FieldRule
// ---------------------------------------------------------------------------
13
14/// A rule describing a single field/key to sanitize.
15///
16/// # Pattern Syntax
17///
18/// - Exact key: `"password"`, `"db_host"`.
19/// - Dotted path: `"database.password"`, `"smtp.user"`.
20/// - Glob suffix: `"*.password"` — matches any key ending in `.password`.
21/// - Glob prefix: `"db.*"` — matches any key starting with `db.`.
22/// - Wildcard: `"*"` — matches every field.
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct FieldRule {
25    /// Key pattern to match (see Pattern Syntax above).
26    pub pattern: String,
27
28    /// Category for replacement generation. Defaults to `Custom("field")`
29    /// if not specified.
30    #[serde(default, skip_serializing_if = "Option::is_none")]
31    pub category: Option<Category>,
32
33    /// Optional human-readable label for reporting.
34    #[serde(default, skip_serializing_if = "Option::is_none")]
35    pub label: Option<String>,
36}
37
38impl FieldRule {
39    /// Create a new field rule with just a pattern.
40    #[must_use]
41    pub fn new(pattern: impl Into<String>) -> Self {
42        Self {
43            pattern: pattern.into(),
44            category: None,
45            label: None,
46        }
47    }
48
49    /// Set the category for this rule.
50    #[must_use]
51    pub fn with_category(mut self, category: Category) -> Self {
52        self.category = Some(category);
53        self
54    }
55
56    /// Set the label for this rule.
57    #[must_use]
58    pub fn with_label(mut self, label: impl Into<String>) -> Self {
59        self.label = Some(label.into());
60        self
61    }
62}
63
// ---------------------------------------------------------------------------
// FileTypeProfile
// ---------------------------------------------------------------------------
67
68/// Specifies which processor to use and what fields to sanitize.
69///
70/// # File matching
71///
72/// A file is processed by this profile when **all** of the following hold:
73///
74/// 1. Its name ends with one of the `extensions` (required — an empty list
75///    matches nothing).
76/// 2. If `include` is non-empty, the filename matches **at least one** of
77///    those glob patterns.
78/// 3. The filename does **not** match any `exclude` glob pattern.
79///
80/// Glob patterns use `*` (any chars within a path component) and `**`
81/// (any chars including path separators).
82///
83/// # Example (YAML)
84///
85/// ```yaml
86/// - processor: json
87///   extensions: [".json"]
88///   # Only apply to files whose names start with "config"
89///   include: ["config*.json"]
90///   # Never apply to log files
91///   exclude: ["*.log.json", "logs/**"]
92///   fields:
93///     - pattern: "*.password"
94///       category: "custom:password"
95/// ```
96#[derive(Debug, Clone, Serialize, Deserialize)]
97pub struct FileTypeProfile {
98    /// Name of the processor to use (e.g. `"key_value"`, `"json"`).
99    pub processor: String,
100
101    /// File extensions this profile applies to (e.g. `[".rb", ".conf"]`).
102    #[serde(default)]
103    pub extensions: Vec<String>,
104
105    /// If non-empty, the filename must match at least one of these glob
106    /// patterns in addition to the extension check.
107    #[serde(default)]
108    pub include: Vec<String>,
109
110    /// Filenames matching any of these glob patterns are excluded from
111    /// structured processing even if they match the extension (and include).
112    #[serde(default)]
113    pub exclude: Vec<String>,
114
115    /// Field rules: which keys/paths to sanitize.
116    pub fields: Vec<FieldRule>,
117
118    /// Free-form options passed to the processor (e.g. delimiter, comment chars).
119    #[serde(default)]
120    pub options: std::collections::HashMap<String, String>,
121}
122
123impl FileTypeProfile {
124    /// Create a minimal profile for a given processor.
125    #[must_use]
126    pub fn new(processor: impl Into<String>, fields: Vec<FieldRule>) -> Self {
127        Self {
128            processor: processor.into(),
129            extensions: Vec::new(),
130            include: Vec::new(),
131            exclude: Vec::new(),
132            fields,
133            options: std::collections::HashMap::new(),
134        }
135    }
136
137    /// Add an extension to this profile.
138    #[must_use]
139    pub fn with_extension(mut self, ext: impl Into<String>) -> Self {
140        self.extensions.push(ext.into());
141        self
142    }
143
144    /// Add a free-form option.
145    #[must_use]
146    pub fn with_option(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
147        self.options.insert(key.into(), value.into());
148        self
149    }
150
151    /// Check whether a filename should be processed by this profile.
152    ///
153    /// Returns `true` when all three conditions hold:
154    ///
155    /// 1. The filename ends with one of `extensions` (an empty list → `false`).
156    /// 2. If `include` is non-empty, the filename matches at least one glob.
157    /// 3. The filename does **not** match any `exclude` glob.
158    ///
159    /// Invalid glob patterns in `include`/`exclude` are silently skipped.
160    ///
161    /// # Examples
162    ///
163    /// ```
164    /// use sanitize_engine::processor::profile::FieldRule;
165    /// use sanitize_engine::processor::profile::FileTypeProfile;
166    ///
167    /// let profile = FileTypeProfile::new("json", vec![])
168    ///     .with_extension(".json");
169    ///
170    /// assert!(profile.matches_filename("config.json"));
171    /// assert!(profile.matches_filename("logs/app.json"));
172    /// assert!(!profile.matches_filename("config.yml"));
173    ///
174    /// // Exclude log-formatted JSON files.
175    /// let profile = FileTypeProfile::new("json", vec![])
176    ///     .with_extension(".json")
177    ///     .with_exclude("*.log.json")
178    ///     .with_exclude("logs/**");
179    ///
180    /// assert!(profile.matches_filename("config.json"));
181    /// assert!(!profile.matches_filename("app.log.json"));
182    /// assert!(!profile.matches_filename("logs/events.json"));
183    ///
184    /// // Include only config files.
185    /// let profile = FileTypeProfile::new("json", vec![])
186    ///     .with_extension(".json")
187    ///     .with_include("config*.json");
188    ///
189    /// assert!(profile.matches_filename("config.json"));
190    /// assert!(profile.matches_filename("config-prod.json"));
191    /// assert!(!profile.matches_filename("events.json"));
192    /// ```
193    pub fn matches_filename(&self, filename: &str) -> bool {
194        // 1. Extension must match.
195        if self.extensions.is_empty() {
196            return false;
197        }
198        if !self.extensions.iter().any(|ext| filename.ends_with(ext.as_str())) {
199            return false;
200        }
201
202        // Extract the basename for patterns that don't contain a path separator.
203        // This lets users write `config*.json` and have it match
204        // `/any/path/config-prod.json` without needing a `**/` prefix.
205        let basename: &str = std::path::Path::new(filename)
206            .file_name()
207            .and_then(|n| n.to_str())
208            .unwrap_or(filename);
209
210        let glob_matches = |pat: &str| {
211            Pattern::new(pat).map_or(false, |p| {
212                p.matches(filename) || p.matches(basename)
213            })
214        };
215
216        // 2. Include filter (opt-in narrowing): must match at least one pattern.
217        if !self.include.is_empty() {
218            if !self.include.iter().any(|pat| glob_matches(pat)) {
219                return false;
220            }
221        }
222
223        // 3. Exclude filter: must not match any pattern.
224        if self.exclude.iter().any(|pat| glob_matches(pat)) {
225            return false;
226        }
227
228        true
229    }
230
231    /// Add a glob pattern to the `include` list.
232    #[must_use]
233    pub fn with_include(mut self, pat: impl Into<String>) -> Self {
234        self.include.push(pat.into());
235        self
236    }
237
238    /// Add a glob pattern to the `exclude` list.
239    #[must_use]
240    pub fn with_exclude(mut self, pat: impl Into<String>) -> Self {
241        self.exclude.push(pat.into());
242        self
243    }
244}
245
// ---------------------------------------------------------------------------
// Serde support for Category (as string)
// ---------------------------------------------------------------------------
249
250impl Serialize for Category {
251    fn serialize<S: serde::Serializer>(
252        &self,
253        serializer: S,
254    ) -> std::result::Result<S::Ok, S::Error> {
255        serializer.serialize_str(&self.to_string())
256    }
257}
258
259impl<'de> Deserialize<'de> for Category {
260    fn deserialize<D: serde::Deserializer<'de>>(
261        deserializer: D,
262    ) -> std::result::Result<Self, D::Error> {
263        let s = String::deserialize(deserializer)?;
264        Ok(match s.as_str() {
265            "email" => Category::Email,
266            "name" => Category::Name,
267            "phone" => Category::Phone,
268            "ipv4" => Category::IpV4,
269            "ipv6" => Category::IpV6,
270            "credit_card" => Category::CreditCard,
271            "ssn" => Category::Ssn,
272            "hostname" => Category::Hostname,
273            "mac_address" => Category::MacAddress,
274            "container_id" => Category::ContainerId,
275            "uuid" => Category::Uuid,
276            "jwt" => Category::Jwt,
277            "auth_token" => Category::AuthToken,
278            "file_path" => Category::FilePath,
279            "windows_sid" => Category::WindowsSid,
280            "url" => Category::Url,
281            "aws_arn" => Category::AwsArn,
282            "azure_resource_id" => Category::AzureResourceId,
283            other => {
284                let tag = other.strip_prefix("custom:").unwrap_or(other);
285                Category::Custom(tag.into())
286            }
287        })
288    }
289}