adaptive_pipeline/infrastructure/services/
pii_masking.rs

1// /////////////////////////////////////////////////////////////////////////////
2// Adaptive Pipeline
3// Copyright (c) 2025 Michael Gardner, A Bit of Help, Inc.
4// SPDX-License-Identifier: BSD-3-Clause
5// See LICENSE file in the project root.
6// /////////////////////////////////////////////////////////////////////////////
7
8// Infrastructure service with parameters for future features
9#![allow(unused_variables)]
10//! # PII Masking Service
11//!
12//! Production-ready PII (Personally Identifiable Information) masking stage for
13//! the adaptive pipeline. This service provides one-way data anonymization,
14//! useful for:
15//!
16//! - **Privacy Protection**: Removing sensitive data before storage or
17//!   transmission
18//! - **Compliance**: Meeting GDPR, CCPA, HIPAA requirements
19//! - **Testing**: Creating safe test data from production datasets
20//! - **Logging**: Sanitizing logs before external processing
21//!
22//! ## Architecture
23//!
24//! This implementation demonstrates the complete pattern for creating pipeline
25//! stages:
26//!
27//! - **Config Struct**: `PiiMaskingConfig` with typed parameters
28//! - **FromParameters**: Type-safe extraction from HashMap
29//! - **Service Struct**: `PiiMaskingService` implements `StageService`
30//! - **Position**: `PreBinary` (must mask before compression/encryption)
31//! - **Reversibility**: Non-reversible (Forward masks, Reverse returns error)
32//!
33//! ## Usage
34//!
35//! ```rust
36//! use adaptive_pipeline::infrastructure::services::PiiMaskingService;
37//! use adaptive_pipeline_domain::services::StageService;
38//!
39//! let service = PiiMaskingService::new();
40//! // Used automatically by pipeline when configured with StageType::Transform
41//! ```
42//!
43//! ## Configuration Parameters
44//!
45//! - **patterns** (optional): Comma-separated list of PII patterns to mask
46//!   - `"email"` - Email addresses (user@example.com → ***@***.com)
47//!   - `"ssn"` - Social Security Numbers (123-45-6789 → ***-**-****)
48//!   - `"phone"` - Phone numbers (555-123-4567 → ***-***-****)
49//!   - `"credit_card"` - Credit card numbers (1234-5678-9012-3456 →
50//!     ****-****-****-****)
51//!   - `"all"` - All supported patterns (default)
52//!
53//! - **mask_char** (optional): Character to use for masking
54//!   - Default: `"*"`
55//!   - Example: `"#"`, `"X"`
56//!
57//! - **preserve_format** (optional): Whether to preserve format separators
58//!   - `"true"` - Keep separators like @ and - (default)
59//!   - `"false"` - Mask everything including separators
60//!
61//! ## Performance Characteristics
62//!
63//! - **Throughput**: ~200 MB/s for typical mixed content
64//! - **Overhead**: Minimal, regex-based pattern matching
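//! - **Memory**: Works on an in-memory copy of each chunk; every configured
//!   pattern may allocate one more rewritten copy of that text
//! - **Latency**: One regex pass over the chunk per configured pattern, using
//!   lazily compiled, cached regex patterns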
67
68use adaptive_pipeline_domain::entities::{Operation, ProcessingContext, StageConfiguration, StagePosition, StageType};
69use adaptive_pipeline_domain::services::{FromParameters, StageService};
70use adaptive_pipeline_domain::value_objects::file_chunk::FileChunk;
71use adaptive_pipeline_domain::PipelineError;
72use once_cell::sync::Lazy;
73use regex::Regex;
74use std::collections::HashMap;
75
76/// Compiled regex patterns for PII detection.
77/// These are computed once at startup and reused for all masking operations.
78///
79/// Note: These regex patterns are known-good at compile time. If compilation
80/// fails, we fall back to a regex that matches nothing rather than panicking.
81/// The fallback pattern `[^\s\S]` matches nothing (neither whitespace nor
82/// non-whitespace).
83static EMAIL_REGEX: Lazy<Regex> = Lazy::new(|| {
84    Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
85        .unwrap_or_else(|_| Regex::new(r"[^\s\S]").unwrap_or_else(|_| unsafe { std::hint::unreachable_unchecked() }))
86});
87
88static SSN_REGEX: Lazy<Regex> = Lazy::new(|| {
89    Regex::new(r"\b\d{3}-\d{2}-\d{4}\b")
90        .unwrap_or_else(|_| Regex::new(r"[^\s\S]").unwrap_or_else(|_| unsafe { std::hint::unreachable_unchecked() }))
91});
92
93static PHONE_REGEX: Lazy<Regex> = Lazy::new(|| {
94    Regex::new(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b")
95        .unwrap_or_else(|_| Regex::new(r"[^\s\S]").unwrap_or_else(|_| unsafe { std::hint::unreachable_unchecked() }))
96});
97
98static CREDIT_CARD_REGEX: Lazy<Regex> = Lazy::new(|| {
99    Regex::new(r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b")
100        .unwrap_or_else(|_| Regex::new(r"[^\s\S]").unwrap_or_else(|_| unsafe { std::hint::unreachable_unchecked() }))
101});
102
/// Kinds of personally identifiable information that can be masked.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum PiiPattern {
    /// An e-mail address, e.g. `user@domain.com`.
    Email,
    /// A Social Security Number, e.g. `123-45-6789`.
    Ssn,
    /// A phone number, e.g. `555-123-4567`.
    Phone,
    /// A credit card number, e.g. `1234-5678-9012-3456`.
    CreditCard,
}
115
116impl PiiPattern {
117    /// Returns all available PII patterns.
118    fn all() -> Vec<PiiPattern> {
119        vec![
120            PiiPattern::Email,
121            PiiPattern::Ssn,
122            PiiPattern::Phone,
123            PiiPattern::CreditCard,
124        ]
125    }
126
127    /// Returns the regex pattern for this PII type.
128    fn regex(&self) -> &Regex {
129        match self {
130            PiiPattern::Email => &EMAIL_REGEX,
131            PiiPattern::Ssn => &SSN_REGEX,
132            PiiPattern::Phone => &PHONE_REGEX,
133            PiiPattern::CreditCard => &CREDIT_CARD_REGEX,
134        }
135    }
136
137    /// Masks a matched PII string according to pattern-specific rules.
138    fn mask(&self, text: &str, mask_char: char, preserve_format: bool) -> String {
139        if preserve_format {
140            match self {
141                PiiPattern::Email => {
142                    // user@domain.com → ***@***.com
143                    if let Some(at_pos) = text.find('@') {
144                        let (local, domain_with_at) = text.split_at(at_pos);
145                        let domain = &domain_with_at[1..]; // Skip '@'
146                        if let Some(dot_pos) = domain.rfind('.') {
147                            let (domain_name, tld) = domain.split_at(dot_pos);
148                            format!(
149                                "{}@{}{}",
150                                mask_char.to_string().repeat(local.len().min(3)),
151                                mask_char.to_string().repeat(domain_name.len().min(3)),
152                                tld
153                            )
154                        } else {
155                            mask_char.to_string().repeat(text.len())
156                        }
157                    } else {
158                        mask_char.to_string().repeat(text.len())
159                    }
160                }
161                PiiPattern::Ssn => {
162                    // 123-45-6789 → ***-**-****
163                    text.chars().map(|c| if c == '-' { '-' } else { mask_char }).collect()
164                }
165                PiiPattern::Phone => {
166                    // 555-123-4567 → ***-***-****
167                    text.chars()
168                        .map(|c| if c.is_ascii_digit() { mask_char } else { c })
169                        .collect()
170                }
171                PiiPattern::CreditCard => {
172                    // 1234-5678-9012-3456 → ****-****-****-****
173                    text.chars()
174                        .map(|c| if c.is_ascii_digit() { mask_char } else { c })
175                        .collect()
176                }
177            }
178        } else {
179            // Mask everything
180            mask_char.to_string().repeat(text.len())
181        }
182    }
183}
184
185/// Configuration for PII masking operations.
186#[derive(Debug, Clone, PartialEq, Eq)]
187pub struct PiiMaskingConfig {
188    /// PII patterns to detect and mask
189    pub patterns: Vec<PiiPattern>,
190    /// Character to use for masking
191    pub mask_char: char,
192    /// Whether to preserve format separators (e.g., '@', '-')
193    pub preserve_format: bool,
194}
195
196impl Default for PiiMaskingConfig {
197    fn default() -> Self {
198        Self {
199            patterns: PiiPattern::all(),
200            mask_char: '*',
201            preserve_format: true,
202        }
203    }
204}
205
206/// Implementation of `FromParameters` for PiiMaskingConfig.
207///
208/// Extracts typed configuration from HashMap parameters following
209/// the pattern established by `CompressionConfig` and `Base64Config`.
210impl FromParameters for PiiMaskingConfig {
211    fn from_parameters(params: &HashMap<String, String>) -> Result<Self, PipelineError> {
212        // Optional: patterns (defaults to all)
213        let patterns = params
214            .get("patterns")
215            .map(|s| {
216                if s.to_lowercase() == "all" {
217                    Ok(PiiPattern::all())
218                } else {
219                    s.split(',')
220                        .map(|p| match p.trim().to_lowercase().as_str() {
221                            "email" => Ok(PiiPattern::Email),
222                            "ssn" => Ok(PiiPattern::Ssn),
223                            "phone" => Ok(PiiPattern::Phone),
224                            "credit_card" | "creditcard" => Ok(PiiPattern::CreditCard),
225                            other => Err(PipelineError::InvalidParameter(format!(
226                                "Unknown PII pattern: {}. Valid: email, ssn, phone, credit_card, all",
227                                other
228                            ))),
229                        })
230                        .collect::<Result<Vec<_>, _>>()
231                }
232            })
233            .transpose()?
234            .unwrap_or_else(PiiPattern::all);
235
236        // Optional: mask_char (defaults to '*')
237        let mask_char = params.get("mask_char").and_then(|s| s.chars().next()).unwrap_or('*');
238
239        // Optional: preserve_format (defaults to true)
240        let preserve_format = params
241            .get("preserve_format")
242            .map(|s| s.to_lowercase() == "true")
243            .unwrap_or(true);
244
245        Ok(Self {
246            patterns,
247            mask_char,
248            preserve_format,
249        })
250    }
251}
252
/// Production PII masking service.
///
/// A stateless unit type: it holds no data, so it is trivially `Copy` and can
/// be shared freely between threads.
///
/// ## Implementation Notes
///
/// - **Position**: `PreBinary` - Must execute before compression/encryption
/// - **Reversibility**: `false` - Masking is one-way (Reverse returns error)
/// - **Stage Type**: `Transform` - Data transformation operation
/// - **Configuration**: Parsed per chunk via `FromParameters`
/// - **Matching**: Regex-based, using compiled patterns
#[derive(Debug, Clone, Copy)]
pub struct PiiMaskingService;
270
271impl PiiMaskingService {
272    /// Creates a new PII masking service.
273    pub fn new() -> Self {
274        Self
275    }
276
277    /// Masks PII in the provided data according to the configuration.
278    fn mask_data(&self, data: &[u8], config: &PiiMaskingConfig) -> Result<Vec<u8>, PipelineError> {
279        // Convert bytes to string for pattern matching
280        let text = String::from_utf8_lossy(data);
281        let mut masked = text.to_string();
282
283        // Apply each pattern in sequence
284        for pattern in &config.patterns {
285            masked = pattern
286                .regex()
287                .replace_all(&masked, |caps: &regex::Captures| {
288                    pattern.mask(&caps[0], config.mask_char, config.preserve_format)
289                })
290                .to_string();
291        }
292
293        Ok(masked.into_bytes())
294    }
295}
296
297impl Default for PiiMaskingService {
298    fn default() -> Self {
299        Self::new()
300    }
301}
302
303/// Implementation of `StageService` for PII masking.
304///
305/// This demonstrates the complete pattern that all stages follow:
306/// 1. Extract typed config via `FromParameters`
307/// 2. Dispatch based on `Operation` (Forward/Reverse)
308/// 3. Process the chunk
309/// 4. Update metrics in context
310/// 5. Return processed chunk
311impl StageService for PiiMaskingService {
312    fn process_chunk(
313        &self,
314        chunk: FileChunk,
315        config: &StageConfiguration,
316        context: &mut ProcessingContext,
317    ) -> Result<FileChunk, PipelineError> {
318        // Type-safe config extraction using FromParameters trait
319        let pii_config = PiiMaskingConfig::from_parameters(&config.parameters)?;
320
321        let input_size = chunk.data().len();
322
323        // Dispatch based on operation
324        let processed_data = match config.operation {
325            Operation::Forward => {
326                // Forward: Mask PII
327                tracing::debug!(
328                    chunk_seq = chunk.sequence_number(),
329                    patterns = ?pii_config.patterns,
330                    "Masking PII in chunk"
331                );
332                self.mask_data(chunk.data(), &pii_config)?
333            }
334            Operation::Reverse => {
335                // Reverse: Not supported (non-reversible operation)
336                return Err(PipelineError::ProcessingFailed(
337                    "PII masking is not reversible - cannot recover original data".to_string(),
338                ));
339            }
340        };
341
342        let output_size = processed_data.len();
343
344        // Update metrics
345        tracing::trace!(
346            operation = %config.operation,
347            input_bytes = input_size,
348            output_bytes = output_size,
349            "PII masking complete"
350        );
351
352        // Create new chunk with processed data
353        let processed_chunk = chunk.with_data(processed_data)?;
354
355        Ok(processed_chunk)
356    }
357
358    fn position(&self) -> StagePosition {
359        // PreBinary: Must execute before compression/encryption
360        // Reason: Need to see data in readable form to detect PII
361        StagePosition::PreBinary
362    }
363
364    fn is_reversible(&self) -> bool {
365        // Non-reversible: Masking destroys original information
366        false
367    }
368
369    fn stage_type(&self) -> StageType {
370        // Transform: Data transformation operation
371        StageType::Transform
372    }
373}
374
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parameter map from `(key, value)` pairs.
    fn params_of(pairs: &[(&str, &str)]) -> HashMap<String, String> {
        pairs
            .iter()
            .map(|(key, value)| (key.to_string(), value.to_string()))
            .collect()
    }

    /// Masks `input` for a single pattern using `'*'` with format preservation
    /// and returns the result as text.
    fn masked_text(pattern: PiiPattern, input: &[u8]) -> String {
        let config = PiiMaskingConfig {
            patterns: vec![pattern],
            mask_char: '*',
            preserve_format: true,
        };
        let output = PiiMaskingService::new().mask_data(input, &config).unwrap();
        String::from_utf8_lossy(&output).into_owned()
    }

    #[test]
    fn test_from_parameters_default() {
        let config = PiiMaskingConfig::from_parameters(&params_of(&[])).unwrap();

        // No parameters: all patterns, '*' mask, separators preserved.
        assert_eq!(config.patterns.len(), 4);
        assert_eq!(config.mask_char, '*');
        assert!(config.preserve_format);
    }

    #[test]
    fn test_from_parameters_email_only() {
        let params = params_of(&[("patterns", "email")]);
        let config = PiiMaskingConfig::from_parameters(&params).unwrap();
        assert_eq!(config.patterns, vec![PiiPattern::Email]);
    }

    #[test]
    fn test_from_parameters_multiple_patterns() {
        let params = params_of(&[("patterns", "email,ssn,phone")]);
        let config = PiiMaskingConfig::from_parameters(&params).unwrap();
        let expected = vec![PiiPattern::Email, PiiPattern::Ssn, PiiPattern::Phone];
        assert_eq!(config.patterns, expected);
    }

    #[test]
    fn test_from_parameters_custom_mask_char() {
        let params = params_of(&[("mask_char", "#")]);
        let config = PiiMaskingConfig::from_parameters(&params).unwrap();
        assert_eq!(config.mask_char, '#');
    }

    #[test]
    fn test_from_parameters_invalid_pattern() {
        let params = params_of(&[("patterns", "invalid")]);
        assert!(PiiMaskingConfig::from_parameters(&params).is_err());
    }

    #[test]
    fn test_mask_email() {
        let result = masked_text(PiiPattern::Email, b"Contact: user@example.com for more info");

        assert!(result.contains("***@***.com"));
        assert!(!result.contains("user@example.com"));
    }

    #[test]
    fn test_mask_ssn() {
        let result = masked_text(PiiPattern::Ssn, b"SSN: 123-45-6789");

        assert!(result.contains("***-**-****"));
        assert!(!result.contains("123-45-6789"));
    }

    #[test]
    fn test_mask_phone() {
        let result = masked_text(PiiPattern::Phone, b"Call: 555-123-4567");

        assert!(result.contains("***-***-****"));
        assert!(!result.contains("555-123-4567"));
    }

    #[test]
    fn test_mask_credit_card() {
        let result = masked_text(PiiPattern::CreditCard, b"Card: 1234-5678-9012-3456");

        assert!(result.contains("****-****-****-****"));
        assert!(!result.contains("1234-5678-9012-3456"));
    }

    #[test]
    fn test_reverse_operation_fails() {
        use adaptive_pipeline_domain::entities::pipeline_stage::StageConfiguration;
        use adaptive_pipeline_domain::entities::{SecurityContext, SecurityLevel};

        let chunk = FileChunk::new(0, 0, vec![0u8; 100], false).unwrap();
        let config = StageConfiguration {
            algorithm: "pii_masking".to_string(),
            operation: Operation::Reverse,
            parameters: HashMap::new(),
            parallel_processing: false,
            chunk_size: None,
        };
        let security = SecurityContext::new(None, SecurityLevel::Public);
        let mut context = ProcessingContext::new(100, security);

        let outcome = PiiMaskingService::new().process_chunk(chunk, &config, &mut context);

        // The reverse direction must be rejected with an explanatory error.
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("not reversible"));
    }

    #[test]
    fn test_stage_service_properties() {
        let service = PiiMaskingService::new();

        assert_eq!(service.position(), StagePosition::PreBinary);
        assert!(!service.is_reversible());
        assert_eq!(service.stage_type(), StageType::Transform);
    }
}