Skip to main content

synapse_pingora/dlp/
scanner.rs

1//! DLP Scanner Implementation
2//!
3//! Thread-safe scanner for detecting sensitive data in response bodies.
4//!
5//! Performance optimizations:
6//! - Aho-Corasick automaton for single-pass multi-pattern matching
7//! - Configurable inspection depth cap to bound scan time
8//! - Content-type filtering to skip binary payloads
9
10use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
11use lazy_static::lazy_static;
12use regex::{Regex, RegexSet};
13use serde::{Deserialize, Serialize};
14use sha2::{Digest, Sha256};
15use std::collections::{HashMap, VecDeque};
16use std::sync::atomic::{AtomicU64, Ordering};
17use std::sync::Arc;
18use std::time::Instant;
19use tokio::sync::RwLock;
20
21/// Sensitive data type categories
22#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
23#[serde(rename_all = "snake_case")]
24#[non_exhaustive]
25pub enum SensitiveDataType {
26    CreditCard,
27    Ssn,
28    Email,
29    Phone,
30    ApiKey,
31    Password,
32    Iban,
33    IpAddress,
34    AwsKey,
35    PrivateKey,
36    Jwt,
37    MedicalRecord,
38    Custom,
39}
40
41impl SensitiveDataType {
42    pub fn as_str(&self) -> &'static str {
43        match self {
44            Self::CreditCard => "credit_card",
45            Self::Ssn => "ssn",
46            Self::Email => "email",
47            Self::Phone => "phone",
48            Self::ApiKey => "api_key",
49            Self::Password => "password",
50            Self::Iban => "iban",
51            Self::IpAddress => "ip_address",
52            Self::AwsKey => "aws_key",
53            Self::PrivateKey => "private_key",
54            Self::Jwt => "jwt",
55            Self::MedicalRecord => "medical_record",
56            Self::Custom => "custom",
57        }
58    }
59}
60
61/// Pattern severity levels
62#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
63#[non_exhaustive]
64pub enum PatternSeverity {
65    Low,
66    Medium,
67    High,
68    Critical,
69}
70
71impl PatternSeverity {
72    pub fn as_str(&self) -> &'static str {
73        match self {
74            Self::Low => "low",
75            Self::Medium => "medium",
76            Self::High => "high",
77            Self::Critical => "critical",
78        }
79    }
80}
81
82/// A matched sensitive data pattern
83#[derive(Debug, Clone)]
84pub struct DlpMatch {
85    pub pattern_name: &'static str,
86    pub data_type: SensitiveDataType,
87    pub severity: PatternSeverity,
88    pub masked_value: String,
89    /// Start position in the scanned content (local to scan call)
90    pub start: usize,
91    /// End position in the scanned content (local to scan call)
92    pub end: usize,
93    /// Absolute offset in the original stream (set by StreamingScanner)
94    /// None for non-streaming scans
95    pub stream_offset: Option<usize>,
96}
97
98/// Result of a DLP scan
99#[derive(Debug, Clone, Default)]
100pub struct ScanResult {
101    pub scanned: bool,
102    pub has_matches: bool,
103    pub matches: Vec<DlpMatch>,
104    pub match_count: usize,
105    pub scan_time_us: u64,
106    pub content_length: usize,
107    /// True if content was truncated to max_body_inspection_bytes
108    pub truncated: bool,
109    /// Original content length before truncation (0 if not truncated)
110    pub original_length: usize,
111}
112
113/// DLP scanner statistics
114#[derive(Debug, Clone)]
115pub struct DlpStats {
116    pub total_scans: u64,
117    pub total_matches: u64,
118    pub matches_by_type: HashMap<SensitiveDataType, u64>,
119    pub matches_by_severity: HashMap<PatternSeverity, u64>,
120}
121
122/// A recorded DLP violation for audit/monitoring
123#[derive(Debug, Clone, Serialize, Deserialize)]
124pub struct DlpViolation {
125    pub timestamp: u64,
126    pub pattern_name: String,
127    pub data_type: String,
128    pub severity: String,
129    pub masked_value: String,
130    pub client_ip: Option<String>,
131    pub path: String,
132}
133
134/// Redaction mode for sensitive data
135#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
136#[serde(rename_all = "snake_case")]
137#[non_exhaustive]
138#[derive(Default)]
139pub enum RedactionMode {
140    /// Mask all characters (e.g., "**********")
141    Full,
142    /// Show partial characters (e.g., "****-1234") - Default
143    #[default]
144    Partial,
145    /// Replace with hash (e.g., "sha256:...")
146    Hash,
147    /// No redaction (dangerous, debug only)
148    None,
149}
150
151/// Error type for DLP configuration validation
152#[derive(Debug, Clone, PartialEq, Eq)]
153pub enum DlpConfigError {
154    /// Hash mode requires a salt to be configured
155    HashModeRequiresSalt,
156    /// Custom keyword is empty
157    EmptyCustomKeyword,
158    /// Custom keyword exceeds maximum length (1024 chars)
159    CustomKeywordTooLong(usize),
160    /// Too many custom keywords (max 1000)
161    TooManyCustomKeywords(usize),
162}
163
164impl std::fmt::Display for DlpConfigError {
165    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
166        match self {
167            Self::HashModeRequiresSalt => {
168                write!(f, "RedactionMode::Hash requires hash_salt to be configured")
169            }
170            Self::EmptyCustomKeyword => write!(f, "custom_keywords contains empty string"),
171            Self::CustomKeywordTooLong(len) => {
172                write!(f, "custom keyword exceeds max length 1024: {} chars", len)
173            }
174            Self::TooManyCustomKeywords(count) => {
175                write!(f, "too many custom keywords (max 1000): {}", count)
176            }
177        }
178    }
179}
180
181impl std::error::Error for DlpConfigError {}
182
183/// Builder for RedactionConfig with common presets
184#[derive(Debug, Clone, Default)]
185pub struct RedactionConfigBuilder {
186    default_mode: RedactionMode,
187    per_type: HashMap<SensitiveDataType, RedactionMode>,
188    hash_salt: Option<String>,
189}
190
191impl RedactionConfigBuilder {
192    /// Create a new builder with default partial redaction
193    pub fn new() -> Self {
194        Self::default()
195    }
196
197    /// Preset: Mask all data types with full redaction
198    pub fn mask_all() -> Self {
199        Self {
200            default_mode: RedactionMode::Full,
201            per_type: HashMap::new(),
202            hash_salt: None,
203        }
204    }
205
206    /// Preset: Hash PII (SSN, medical records), mask credentials
207    pub fn hash_pii_mask_credentials(salt: String) -> Self {
208        let mut per_type = HashMap::new();
209        per_type.insert(SensitiveDataType::Ssn, RedactionMode::Hash);
210        per_type.insert(SensitiveDataType::MedicalRecord, RedactionMode::Hash);
211        per_type.insert(SensitiveDataType::CreditCard, RedactionMode::Hash);
212        per_type.insert(SensitiveDataType::Iban, RedactionMode::Hash);
213        // Credentials use partial for debugging
214        per_type.insert(SensitiveDataType::Password, RedactionMode::Full);
215        per_type.insert(SensitiveDataType::ApiKey, RedactionMode::Partial);
216        per_type.insert(SensitiveDataType::AwsKey, RedactionMode::Partial);
217        Self {
218            default_mode: RedactionMode::Partial,
219            per_type,
220            hash_salt: Some(salt),
221        }
222    }
223
224    /// Set the default redaction mode for types not explicitly configured
225    pub fn with_default(mut self, mode: RedactionMode) -> Self {
226        self.default_mode = mode;
227        self
228    }
229
230    /// Set redaction mode for a specific data type
231    pub fn with_type(mut self, data_type: SensitiveDataType, mode: RedactionMode) -> Self {
232        self.per_type.insert(data_type, mode);
233        self
234    }
235
236    /// Set the hash salt (required if any type uses RedactionMode::Hash)
237    pub fn with_salt(mut self, salt: String) -> Self {
238        self.hash_salt = Some(salt);
239        self
240    }
241
242    /// Build the redaction configuration
243    pub fn build(self) -> (HashMap<SensitiveDataType, RedactionMode>, Option<String>) {
244        (self.per_type, self.hash_salt)
245    }
246}
247
248/// DLP configuration
249#[derive(Debug, Clone, Deserialize)]
250pub struct DlpConfig {
251    pub enabled: bool,
252    /// Maximum body size to accept for scanning (reject if larger)
253    pub max_scan_size: usize,
254    /// Maximum matches before stopping scan
255    pub max_matches: usize,
256    /// Only scan text-based content types
257    pub scan_text_only: bool,
258    /// Maximum bytes to inspect for DLP patterns (truncate if larger).
259    /// This bounds scan time for large payloads. Default 8KB.
260    /// Content beyond this limit is not scanned but the request continues.
261    pub max_body_inspection_bytes: usize,
262    /// Fast mode: Skip low-priority patterns (email, phone, IPv4) for better performance.
263    /// Only scans critical patterns: credit cards, SSN, AWS keys, API keys, passwords, private keys, JWT, IBAN, medical records.
264    /// Reduces scan time by ~30-40% for typical payloads.
265    pub fast_mode: bool,
266    /// List of custom keywords to detect (e.g., project codenames)
267    pub custom_keywords: Option<Vec<String>>,
268    /// Redaction settings per data type
269    #[serde(default)]
270    pub redaction: HashMap<SensitiveDataType, RedactionMode>,
271    /// Salt for hash-based redaction (REQUIRED if any type uses RedactionMode::Hash)
272    /// Should be a cryptographically random string, at least 32 bytes
273    pub hash_salt: Option<String>,
274}
275
276impl Default for DlpConfig {
277    fn default() -> Self {
278        Self {
279            enabled: true,
280            max_scan_size: 5 * 1024 * 1024, // 5MB max (reject if larger)
281            max_matches: 100,               // Stop after 100 matches
282            scan_text_only: true,
283            max_body_inspection_bytes: 8 * 1024, // 8KB inspection cap for performance
284            fast_mode: false,                    // Disabled by default for comprehensive scanning
285            custom_keywords: None,
286            redaction: HashMap::new(),
287            hash_salt: None,
288        }
289    }
290}
291
292impl DlpConfig {
293    /// Validate the configuration
294    /// Returns error if:
295    /// - Any data type uses Hash mode but no salt is configured
296    /// - Custom keywords contain empty strings or exceed length limits
297    pub fn validate(&self) -> Result<(), DlpConfigError> {
298        // Check if hash mode is used without salt
299        let uses_hash = self.redaction.values().any(|m| *m == RedactionMode::Hash);
300        if uses_hash && self.hash_salt.is_none() {
301            return Err(DlpConfigError::HashModeRequiresSalt);
302        }
303
304        // Validate custom keywords
305        if let Some(keywords) = &self.custom_keywords {
306            if keywords.len() > 1000 {
307                return Err(DlpConfigError::TooManyCustomKeywords(keywords.len()));
308            }
309            for kw in keywords {
310                if kw.is_empty() {
311                    return Err(DlpConfigError::EmptyCustomKeyword);
312                }
313                if kw.len() > 1024 {
314                    return Err(DlpConfigError::CustomKeywordTooLong(kw.len()));
315                }
316            }
317        }
318
319        Ok(())
320    }
321
322    /// Get the maximum pattern length for overlap calculation
323    /// Returns the longest pattern that could span chunk boundaries
324    pub fn max_pattern_length(&self) -> usize {
325        // Longest built-in patterns:
326        // - Private key headers: ~60 chars
327        // - JWT tokens: can be very long (1000+)
328        // - IBAN: 34 chars
329        // - Custom keywords: up to 1024 chars
330        let builtin_max = 100; // Conservative estimate for most patterns
331        let custom_max = self
332            .custom_keywords
333            .as_ref()
334            .map(|kws| kws.iter().map(|k| k.len()).max().unwrap_or(0))
335            .unwrap_or(0);
336        builtin_max.max(custom_max)
337    }
338}
339
340/// Internal pattern definition
341struct Pattern {
342    name: &'static str,
343    data_type: SensitiveDataType,
344    severity: PatternSeverity,
345    regex: &'static Regex,
346    validator: Option<fn(&str) -> bool>,
347}
348
349// ============================================================================
350// Validators
351// ============================================================================
352
353/// Validate credit card using Luhn algorithm (zero-allocation implementation)
354pub fn validate_credit_card(number: &str) -> bool {
355    let mut sum = 0u32;
356    let mut digit_count = 0usize;
357    let mut has_nonzero = false;
358    let mut is_even = false;
359
360    // Process digits from right to left without allocation
361    for c in number.chars().rev() {
362        if !c.is_ascii_digit() {
363            continue;
364        }
365
366        let mut digit = c.to_digit(10).unwrap_or(0);
367        digit_count += 1;
368
369        if digit != 0 {
370            has_nonzero = true;
371        }
372
373        if is_even {
374            digit *= 2;
375            if digit > 9 {
376                digit -= 9;
377            }
378        }
379
380        sum += digit;
381        is_even = !is_even;
382    }
383
384    // Valid card: 13-19 digits, not all zeros, Luhn checksum passes
385    (13..=19).contains(&digit_count) && has_nonzero && sum.is_multiple_of(10)
386}
387
388/// Validate SSN format (zero-allocation implementation)
389///
390/// Validates against SSA rules including:
391/// - Invalid area numbers (000, 666, 900-999 reserved for ITIN)
392/// - Invalid group numbers (00)
393/// - Invalid serial numbers (0000)
394/// - Advertising SSNs (987-65-4320 through 987-65-4329 used in commercials)
395pub fn validate_ssn(ssn: &str) -> bool {
396    // Parse digits in-place without allocation
397    let mut area: u32 = 0;
398    let mut group: u32 = 0;
399    let mut serial: u32 = 0;
400    let mut digit_count = 0;
401
402    for c in ssn.chars() {
403        if let Some(d) = c.to_digit(10) {
404            match digit_count {
405                0..=2 => area = area * 10 + d,
406                3..=4 => group = group * 10 + d,
407                5..=8 => serial = serial * 10 + d,
408                _ => return false, // Too many digits
409            }
410            digit_count += 1;
411        }
412    }
413
414    // Must have exactly 9 digits
415    if digit_count != 9 {
416        return false;
417    }
418
419    // Area cannot be 000, 666, or 900-999 (ITIN range)
420    if area == 0 || area == 666 || area >= 900 {
421        return false;
422    }
423
424    // Group cannot be 00
425    if group == 0 {
426        return false;
427    }
428
429    // Serial cannot be 0000
430    if serial == 0 {
431        return false;
432    }
433
434    // Reject advertising SSNs used in commercials (987-65-4320 to 987-65-4329)
435    if area == 987 && group == 65 && (4320..=4329).contains(&serial) {
436        return false;
437    }
438
439    true
440}
441
442/// Validate US phone number format (zero-allocation implementation)
443///
444/// Reduces false positives by checking:
445/// - Must be 10 or 11 digits (with country code)
446/// - If 11 digits, must start with 1
447/// - Area code cannot be N11 (e.g., 411, 911 - service codes)
448pub fn validate_phone(phone: &str) -> bool {
449    // Parse digits in-place without allocation
450    // Max 11 digits (with country code)
451    let mut digits = [0u8; 11];
452    let mut digit_count = 0;
453
454    for c in phone.chars() {
455        if let Some(d) = c.to_digit(10) {
456            if digit_count >= 11 {
457                return false; // Too many digits
458            }
459            digits[digit_count] = d as u8;
460            digit_count += 1;
461        }
462    }
463
464    // Must be 10 or 11 digits
465    if digit_count != 10 && digit_count != 11 {
466        return false;
467    }
468
469    // If 11 digits, must start with country code 1
470    if digit_count == 11 && digits[0] != 1 {
471        return false;
472    }
473
474    // Get area code (skip country code if present)
475    let area_start = if digit_count == 11 { 1 } else { 0 };
476    let area_code: u32 = (digits[area_start] as u32) * 100
477        + (digits[area_start + 1] as u32) * 10
478        + (digits[area_start + 2] as u32);
479
480    // Area code cannot be 0xx or 1xx
481    if area_code < 200 {
482        return false;
483    }
484
485    // Area code cannot be N11 (service codes like 411, 911)
486    if area_code % 100 == 11 {
487        return false;
488    }
489
490    true
491}
492
493/// Country-specific IBAN lengths (ISO 13616)
494const IBAN_LENGTHS: &[(&str, usize)] = &[
495    ("AL", 28),
496    ("AD", 24),
497    ("AT", 20),
498    ("AZ", 28),
499    ("BH", 22),
500    ("BY", 28),
501    ("BE", 16),
502    ("BA", 20),
503    ("BR", 29),
504    ("BG", 22),
505    ("CR", 22),
506    ("HR", 21),
507    ("CY", 28),
508    ("CZ", 24),
509    ("DK", 18),
510    ("DO", 28),
511    ("TL", 23),
512    ("EE", 20),
513    ("FO", 18),
514    ("FI", 18),
515    ("FR", 27),
516    ("GE", 22),
517    ("DE", 22),
518    ("GI", 23),
519    ("GR", 27),
520    ("GL", 18),
521    ("GT", 28),
522    ("HU", 28),
523    ("IS", 26),
524    ("IQ", 23),
525    ("IE", 22),
526    ("IL", 23),
527    ("IT", 27),
528    ("JO", 30),
529    ("KZ", 20),
530    ("XK", 20),
531    ("KW", 30),
532    ("LV", 21),
533    ("LB", 28),
534    ("LI", 21),
535    ("LT", 20),
536    ("LU", 20),
537    ("MK", 19),
538    ("MT", 31),
539    ("MR", 27),
540    ("MU", 30),
541    ("MC", 27),
542    ("MD", 24),
543    ("ME", 22),
544    ("NL", 18),
545    ("NO", 15),
546    ("PK", 24),
547    ("PS", 29),
548    ("PL", 28),
549    ("PT", 25),
550    ("QA", 29),
551    ("RO", 24),
552    ("SM", 27),
553    ("SA", 24),
554    ("RS", 22),
555    ("SC", 31),
556    ("SK", 24),
557    ("SI", 19),
558    ("ES", 24),
559    ("SE", 24),
560    ("CH", 21),
561    ("TN", 24),
562    ("TR", 26),
563    ("UA", 29),
564    ("AE", 23),
565    ("GB", 22),
566    ("VA", 22),
567    ("VG", 24),
568];
569
570/// Validate IBAN format using mod-97 check with country-specific length validation
571/// (zero-allocation implementation)
572///
573/// Uses in-place character iteration and direct mod-97 computation without
574/// building intermediate strings. Letters are converted to their numeric
575/// representation (A=10, B=11, etc.) and processed directly in the modulo chain.
576pub fn validate_iban(iban: &str) -> bool {
577    // First pass: count length, extract first 4 chars, validate basic format
578    // Store first 4 characters (country code + check digits) as bytes for later
579    let mut first_four = [0u8; 4];
580    let mut first_four_idx = 0;
581    let mut total_len = 0;
582
583    for c in iban.chars() {
584        if c.is_whitespace() {
585            continue;
586        }
587        let upper = c.to_ascii_uppercase();
588
589        if total_len < 4 {
590            // Validate format: first 2 must be letters, next 2 must be digits
591            match total_len {
592                0 | 1 if !upper.is_ascii_alphabetic() => {
593                    return false;
594                }
595                2 | 3 if !upper.is_ascii_digit() => {
596                    return false;
597                }
598                _ => {}
599            }
600            first_four[first_four_idx] = upper as u8;
601            first_four_idx += 1;
602        }
603        total_len += 1;
604    }
605
606    // IBAN must be 15-34 characters (after removing whitespace)
607    if !(15..=34).contains(&total_len) {
608        return false;
609    }
610
611    // Validate country-specific length if known
612    // Country code is first two characters
613    let country_code = [first_four[0], first_four[1]];
614    for &(code, expected_len) in IBAN_LENGTHS.iter() {
615        if code.as_bytes() == country_code {
616            if total_len != expected_len {
617                return false;
618            }
619            break;
620        }
621    }
622
623    // Second pass: compute mod-97 directly
624    // IBAN validation rearranges: BBAN (chars 4+) followed by country+check (chars 0-3)
625    // We process each character, converting letters to their numeric value (A=10..Z=35)
626    // For letters (2 digits), we do: remainder = (remainder * 100 + value) % 97
627    // For digits (1 digit), we do: remainder = (remainder * 10 + digit) % 97
628    let mut remainder: u64 = 0;
629    let mut char_idx = 0;
630
631    // Process BBAN first (skip first 4 characters)
632    for c in iban.chars() {
633        if c.is_whitespace() {
634            continue;
635        }
636        char_idx += 1;
637        if char_idx <= 4 {
638            continue; // Skip country code and check digits
639        }
640
641        let upper = c.to_ascii_uppercase();
642        if upper.is_ascii_alphabetic() {
643            // A=10, B=11, ..., Z=35 (two digits, so multiply by 100)
644            let value = (upper as u64) - ('A' as u64) + 10;
645            remainder = (remainder * 100 + value) % 97;
646        } else if let Some(d) = upper.to_digit(10) {
647            remainder = (remainder * 10 + d as u64) % 97;
648        }
649    }
650
651    // Now process the first 4 characters (country code + check digits)
652    for &byte in &first_four {
653        let c = byte as char;
654        if c.is_ascii_alphabetic() {
655            let value = (c as u64) - ('A' as u64) + 10;
656            remainder = (remainder * 100 + value) % 97;
657        } else if let Some(d) = c.to_digit(10) {
658            remainder = (remainder * 10 + d as u64) % 97;
659        }
660    }
661
662    remainder == 1
663}
664
665// ============================================================================
666// Compiled Patterns
667// ============================================================================
668
669lazy_static! {
670    // Credit Cards - handle mixed separators (spaces, dashes, or none)
671    // Pattern allows any combination: "1234 5678-9012 3456" or "1234-5678 9012-3456"
672    static ref RE_VISA: Regex = Regex::new(r"\b4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")
673        .expect("RE_VISA is a valid regex pattern");
674    static ref RE_MASTERCARD: Regex = Regex::new(r"\b5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")
675        .expect("RE_MASTERCARD is a valid regex pattern");
676    static ref RE_AMEX: Regex = Regex::new(r"\b3[47]\d{2}[\s-]?\d{6}[\s-]?\d{5}\b")
677        .expect("RE_AMEX is a valid regex pattern");
678    static ref RE_DISCOVER: Regex = Regex::new(r"\b6(?:011|5\d{2})[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")
679        .expect("RE_DISCOVER is a valid regex pattern");
680
681    // SSN
682    static ref RE_SSN_FORMATTED: Regex = Regex::new(r"\b\d{3}-\d{2}-\d{4}\b")
683        .expect("RE_SSN_FORMATTED is a valid regex pattern");
684    // Note: Using \b instead of lookaround (not supported by Rust regex)
685    // The SSN validator filters out false positives anyway
686    static ref RE_SSN_UNFORMATTED: Regex = Regex::new(r"\b\d{9}\b")
687        .expect("RE_SSN_UNFORMATTED is a valid regex pattern");
688
689    // Email - length limits prevent ReDoS via catastrophic backtracking
690    static ref RE_EMAIL: Regex = Regex::new(r"\b[a-zA-Z0-9._%+-]{1,64}@[a-zA-Z0-9.-]{1,253}\.[a-zA-Z]{2,10}\b")
691        .expect("RE_EMAIL is a valid regex pattern");
692
693    // Phone
694    static ref RE_US_PHONE: Regex = Regex::new(r"\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b")
695        .expect("RE_US_PHONE is a valid regex pattern");
696    static ref RE_INTL_PHONE: Regex = Regex::new(r"\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}")
697        .expect("RE_INTL_PHONE is a valid regex pattern");
698
699    // AWS
700    static ref RE_AWS_ACCESS_KEY: Regex = Regex::new(r"\b(AKIA[0-9A-Z]{16})\b")
701        .expect("RE_AWS_ACCESS_KEY is a valid regex pattern");
702    static ref RE_AWS_SECRET_KEY: Regex = Regex::new(r"\b([a-zA-Z0-9+/]{40})\b")
703        .expect("RE_AWS_SECRET_KEY is a valid regex pattern");
704    static ref RE_AWS_SESSION_TOKEN: Regex = Regex::new(r#"(?i)aws.{0,10}session.{0,10}token.{0,5}['"]?([A-Za-z0-9/+=]{100,})"#)
705        .expect("RE_AWS_SESSION_TOKEN is a valid regex pattern");
706
707    // API Keys
708    static ref RE_GENERIC_API_KEY: Regex = Regex::new(r"(?i)\b(?:api[_-]?key|apikey)[\s]*[=:]\s*['\x22]?([a-zA-Z0-9_-]{20,})['\x22]?")
709        .expect("RE_GENERIC_API_KEY is a valid regex pattern");
710    static ref RE_GITHUB_TOKEN: Regex = Regex::new(r"\b(gh[ps]_[a-zA-Z0-9]{36,})\b")
711        .expect("RE_GITHUB_TOKEN is a valid regex pattern");
712    static ref RE_GITHUB_FINE_GRAINED_PAT: Regex = Regex::new(r"\b(github_pat_[a-zA-Z0-9_]{22,})\b")
713        .expect("RE_GITHUB_FINE_GRAINED_PAT is a valid regex pattern");
714    static ref RE_STRIPE_KEY: Regex = Regex::new(r"\b((?:sk|pk|rk)_(?:live|test)_[a-zA-Z0-9]{24,})\b")
715        .expect("RE_STRIPE_KEY is a valid regex pattern");
716    static ref RE_GOOGLE_API_KEY: Regex = Regex::new(r"AIza[a-zA-Z0-9_-]{35}")
717        .expect("RE_GOOGLE_API_KEY is a valid regex pattern");
718
719    // Passwords
720    static ref RE_PASSWORD_URL: Regex = Regex::new(r"(?i)\b(?:password|passwd|pwd)=([^\s&]+)")
721        .expect("RE_PASSWORD_URL is a valid regex pattern");
722    static ref RE_PASSWORD_JSON: Regex = Regex::new(r#"(?i)"(?:password|passwd|pwd)"\s*:\s*"([^"]+)""#)
723        .expect("RE_PASSWORD_JSON is a valid regex pattern");
724
725    // IBAN
726    static ref RE_IBAN: Regex = Regex::new(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
727        .expect("RE_IBAN is a valid regex pattern");
728
729    // IP Address
730    static ref RE_IPV4: Regex = Regex::new(r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b")
731        .expect("RE_IPV4 is a valid regex pattern");
732
733    // Private Keys
734    static ref RE_RSA_PRIVATE_KEY: Regex = Regex::new(r"-----BEGIN (?:RSA )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA )?PRIVATE KEY-----")
735        .expect("RE_RSA_PRIVATE_KEY is a valid regex pattern");
736    static ref RE_EC_PRIVATE_KEY: Regex = Regex::new(r"-----BEGIN EC PRIVATE KEY-----[\s\S]*?-----END EC PRIVATE KEY-----")
737        .expect("RE_EC_PRIVATE_KEY is a valid regex pattern");
738
739    // JWT - minimum segment lengths reduce false positives on base64 data
740    static ref RE_JWT: Regex = Regex::new(r"\b(eyJ[a-zA-Z0-9_-]{10,}\.eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{20,})\b")
741        .expect("RE_JWT is a valid regex pattern");
742
743    // Medical Record
744    static ref RE_MEDICAL_RECORD: Regex = Regex::new(r"(?i)\b(?:MRN|medical[_\s-]?record[_\s-]?(?:number|#|num))[\s:]*([A-Z0-9]{6,})")
745        .expect("RE_MEDICAL_RECORD is a valid regex pattern");
746
747    /// All patterns for scanning
748    static ref PATTERNS: Vec<Pattern> = vec![
749        // Credit Cards
750        Pattern { name: "Visa Card", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_VISA, validator: Some(validate_credit_card) },
751        Pattern { name: "MasterCard", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_MASTERCARD, validator: Some(validate_credit_card) },
752        Pattern { name: "American Express", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_AMEX, validator: Some(validate_credit_card) },
753        Pattern { name: "Discover Card", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_DISCOVER, validator: Some(validate_credit_card) },
754
755        // SSN
756        Pattern { name: "SSN (formatted)", data_type: SensitiveDataType::Ssn, severity: PatternSeverity::Critical, regex: &RE_SSN_FORMATTED, validator: Some(validate_ssn) },
757        Pattern { name: "SSN (unformatted)", data_type: SensitiveDataType::Ssn, severity: PatternSeverity::Critical, regex: &RE_SSN_UNFORMATTED, validator: Some(validate_ssn) },
758
759        // Email
760        Pattern { name: "Email Address", data_type: SensitiveDataType::Email, severity: PatternSeverity::Medium, regex: &RE_EMAIL, validator: None },
761
762        // Phone
763        Pattern { name: "US Phone Number", data_type: SensitiveDataType::Phone, severity: PatternSeverity::Medium, regex: &RE_US_PHONE, validator: Some(validate_phone) },
764        Pattern { name: "International Phone", data_type: SensitiveDataType::Phone, severity: PatternSeverity::Medium, regex: &RE_INTL_PHONE, validator: None },
765
766        // AWS
767        Pattern { name: "AWS Access Key", data_type: SensitiveDataType::AwsKey, severity: PatternSeverity::Critical, regex: &RE_AWS_ACCESS_KEY, validator: None },
768        Pattern { name: "AWS Secret Key", data_type: SensitiveDataType::AwsKey, severity: PatternSeverity::Critical, regex: &RE_AWS_SECRET_KEY, validator: None },
769        Pattern { name: "AWS Session Token", data_type: SensitiveDataType::AwsKey, severity: PatternSeverity::Critical, regex: &RE_AWS_SESSION_TOKEN, validator: None },
770
771        // API Keys
772        Pattern { name: "Generic API Key", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::High, regex: &RE_GENERIC_API_KEY, validator: None },
773        Pattern { name: "GitHub Token", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::Critical, regex: &RE_GITHUB_TOKEN, validator: None },
774        Pattern { name: "GitHub Fine-grained PAT", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::Critical, regex: &RE_GITHUB_FINE_GRAINED_PAT, validator: None },
775        Pattern { name: "Stripe API Key", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::Critical, regex: &RE_STRIPE_KEY, validator: None },
776        Pattern { name: "Google API Key", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::High, regex: &RE_GOOGLE_API_KEY, validator: None },
777
778        // Passwords
779        Pattern { name: "Password in URL", data_type: SensitiveDataType::Password, severity: PatternSeverity::Critical, regex: &RE_PASSWORD_URL, validator: None },
780        Pattern { name: "Password in JSON", data_type: SensitiveDataType::Password, severity: PatternSeverity::Critical, regex: &RE_PASSWORD_JSON, validator: None },
781
782        // IBAN
783        Pattern { name: "IBAN", data_type: SensitiveDataType::Iban, severity: PatternSeverity::High, regex: &RE_IBAN, validator: Some(validate_iban) },
784
785        // IP Address
786        Pattern { name: "IPv4 Address", data_type: SensitiveDataType::IpAddress, severity: PatternSeverity::Low, regex: &RE_IPV4, validator: None },
787
788        // Private Keys
789        Pattern { name: "RSA Private Key", data_type: SensitiveDataType::PrivateKey, severity: PatternSeverity::Critical, regex: &RE_RSA_PRIVATE_KEY, validator: None },
790        Pattern { name: "EC Private Key", data_type: SensitiveDataType::PrivateKey, severity: PatternSeverity::Critical, regex: &RE_EC_PRIVATE_KEY, validator: None },
791
792        // JWT
793        Pattern { name: "JWT Token", data_type: SensitiveDataType::Jwt, severity: PatternSeverity::High, regex: &RE_JWT, validator: None },
794
795        // Medical Record
796        Pattern { name: "Medical Record Number", data_type: SensitiveDataType::MedicalRecord, severity: PatternSeverity::Critical, regex: &RE_MEDICAL_RECORD, validator: None },
797    ];
798
799    // ========================================================================
800    // Aho-Corasick Prefilter Automaton
801    // ========================================================================
802    //
803    // For patterns with reliable literal prefixes, we use Aho-Corasick for
804    // single-pass multi-pattern detection. This is O(n) in content length
805    // regardless of pattern count, vs O(n * patterns) for sequential regex.
806    //
807    // Strategy: AC finds candidate regions, then we validate with full regex.
808
809    /// Literal prefixes for Aho-Corasick prefiltering.
810    /// Each entry: (literal_prefix, pattern_index_in_PATTERNS)
811    ///
812    /// PATTERNS order (for reference):
813    ///  0: Visa, 1: MasterCard, 2: Amex, 3: Discover
814    ///  4: SSN formatted, 5: SSN unformatted
815    ///  6: Email, 7: US Phone, 8: Intl Phone
816    ///  9: AWS Access Key, 10: AWS Secret Key, 11: AWS Session Token
817    /// 12: Generic API Key, 13: GitHub Token, 14: GitHub Fine-grained PAT
818    /// 15: Stripe API Key, 16: Google API Key
819    /// 17: Password URL, 18: Password JSON
820    /// 19: IBAN, 20: IPv4
821    /// 21: RSA Private Key, 22: EC Private Key
822    /// 23: JWT Token, 24: Medical Record
823    static ref AC_PREFIXES: Vec<(&'static str, usize)> = vec![
824        // Credit cards (indices 0-3): digit prefixes
825        ("4", 0),      // Visa starts with 4
826        ("51", 1), ("52", 1), ("53", 1), ("54", 1), ("55", 1), // MasterCard 51-55
827        ("34", 2), ("37", 2), // Amex 34, 37
828        ("6011", 3), ("65", 3), // Discover
829
830        // AWS keys (indices 9-11)
831        ("AKIA", 9),   // AWS Access Key (index 9)
832        ("aws", 11), ("AWS", 11), // AWS Session Token (index 11)
833
834        // API Keys (indices 12-16)
835        ("api_key", 12), ("api-key", 12), ("apikey", 12), ("API_KEY", 12), // Generic API Key (12)
836        ("ghp_", 13), ("ghs_", 13), // GitHub Token (13)
837        ("github_pat_", 14), // GitHub Fine-grained PAT (14)
838        ("sk_live_", 15), ("sk_test_", 15), ("pk_live_", 15), ("pk_test_", 15), ("rk_live_", 15), // Stripe (15)
839        ("AIza", 16), // Google API Key (16)
840
841        // Passwords (indices 17-18)
842        ("password=", 17), ("passwd=", 17), ("pwd=", 17), // Password in URL (17)
843        ("\"password\"", 18), ("\"passwd\"", 18), ("\"pwd\"", 18), // Password in JSON (18)
844
845        // Private Keys (indices 21-22)
846        ("-----BEGIN RSA PRIVATE KEY", 21),
847        ("-----BEGIN PRIVATE KEY", 21),
848        ("-----BEGIN EC PRIVATE KEY", 22),
849
850        // JWT (index 23)
851        ("eyJ", 23),
852
853        // Email (index 6) - All emails contain @
854        ("@", 6),
855    ];
856
857    /// Pre-computed bitmask of pattern indices covered by Aho-Corasick prefiltering.
858    /// Each bit position corresponds to a PATTERNS index. If bit N is set, pattern N
859    /// has a literal prefix in AC_PREFIXES and can be skipped if AC finds no matches.
860    static ref AC_COVERED_MASK: u32 = {
861        let mut mask: u32 = 0;
862        for &(_, idx) in AC_PREFIXES.iter() {
863            mask |= 1 << idx;
864        }
865        mask
866    };
867
868    // ========================================================================
869    // RegexSet for Non-AC Patterns (Single-Pass Prefilter)
870    // ========================================================================
871    //
872    // Patterns without reliable literal prefixes for Aho-Corasick are grouped
873    // into a RegexSet for single-pass matching. This is O(n) vs O(n * patterns)
874    // for sequential regex execution.
875    //
876    // When RegexSet matches, we only run the individual pattern regex + validator.
877
878    /// Pattern indices that are NOT covered by Aho-Corasick prefiltering.
879    /// These patterns will use RegexSet for single-pass detection.
880    ///
881    /// Non-AC patterns:
882    ///  4: SSN formatted, 5: SSN unformatted
883    ///  7: US Phone, 8: Intl Phone
884    /// 10: AWS Secret Key
885    /// 19: IBAN, 20: IPv4
886    /// 24: Medical Record
887    static ref NON_AC_PATTERN_INDICES: Vec<usize> = vec![4, 5, 7, 8, 10, 19, 20, 24];
888
889    /// Pre-computed bitmask of pattern indices handled by RegexSet (non-AC patterns).
890    /// Each bit position corresponds to a PATTERNS index.
891    static ref NON_AC_PATTERN_MASK: u32 = {
892        let mut mask: u32 = 0;
893        for &idx in NON_AC_PATTERN_INDICES.iter() {
894            mask |= 1 << idx;
895        }
896        mask
897    };
898
899    /// RegexSet for non-AC patterns - single pass detects which patterns have potential matches
900    static ref NON_AC_REGEX_SET: RegexSet = RegexSet::new([
901        // Index 0 -> Pattern 4: SSN formatted
902        r"\b\d{3}-\d{2}-\d{4}\b",
903        // Index 1 -> Pattern 5: SSN unformatted
904        r"\b\d{9}\b",
905        // Index 2 -> Pattern 7: US Phone
906        r"\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b",
907        // Index 3 -> Pattern 8: International Phone
908        r"\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}",
909        // Index 4 -> Pattern 10: AWS Secret Key
910        r"\b[a-zA-Z0-9+/]{40}\b",
911        // Index 5 -> Pattern 19: IBAN
912        r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b",
913        // Index 6 -> Pattern 20: IPv4
914        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
915        // Index 7 -> Pattern 24: Medical Record
916        r"(?i)\b(?:MRN|medical[_\s-]?record[_\s-]?(?:number|#|num))[\s:]*[A-Z0-9]{6,}",
917    ]).expect("Failed to build non-AC RegexSet");
918
919    /// Binary content types that should be skipped for DLP scanning
920    static ref SKIP_CONTENT_TYPES: Vec<&'static str> = vec![
921        "image/",
922        "audio/",
923        "video/",
924        "application/octet-stream",
925        "application/zip",
926        "application/gzip",
927        "application/x-gzip",
928        "application/x-tar",
929        "application/pdf",
930        "application/x-7z-compressed",
931        "application/x-rar-compressed",
932        "font/",
933        "model/",
934    ];
935}
936
937// ============================================================================
938// DLP Scanner
939// ============================================================================
940
941/// Thread-safe DLP scanner
942pub struct DlpScanner {
943    config: DlpConfig,
944    total_scans: AtomicU64,
945    total_matches: AtomicU64,
946    automaton: AhoCorasick,
947    // Map automaton pattern ID to (is_custom, original_index)
948    // If is_custom is false, original_index refers to PATTERNS index
949    // If is_custom is true, original_index refers to custom_keywords index
950    pattern_map: Vec<(bool, usize)>,
951    /// Recent violations buffer (limited to last 100)
952    recent_violations: Arc<RwLock<VecDeque<DlpViolation>>>,
953}
954
955impl Default for DlpScanner {
956    fn default() -> Self {
957        Self::new(DlpConfig::default())
958    }
959}
960
961impl DlpScanner {
962    /// Create a new DLP scanner with the given configuration.
963    ///
964    /// # Panics
965    /// Panics if configuration validation fails (e.g., Hash mode without salt).
966    /// Use `try_new` for fallible construction.
967    pub fn new(config: DlpConfig) -> Self {
968        Self::try_new(config).expect("DLP config validation failed")
969    }
970
971    /// Create a new DLP scanner with validation.
972    /// Returns error if configuration is invalid.
973    pub fn try_new(config: DlpConfig) -> Result<Self, DlpConfigError> {
974        // Validate configuration
975        config.validate()?;
976
977        // Force lazy_static initialization to validate all patterns at construction time.
978        let pattern_count = PATTERNS.len();
979
980        // Build combined patterns list for Aho-Corasick
981        // 1. Standard prefixes
982        let mut patterns = Vec::new();
983        let mut pattern_map = Vec::new();
984
985        for (prefix, idx) in AC_PREFIXES.iter() {
986            patterns.push(prefix.to_string());
987            pattern_map.push((false, *idx));
988        }
989
990        // 2. Custom keywords
991        if let Some(keywords) = &config.custom_keywords {
992            for (i, keyword) in keywords.iter().enumerate() {
993                patterns.push(keyword.clone());
994                pattern_map.push((true, i));
995            }
996        }
997
998        let automaton = AhoCorasickBuilder::new()
999            .match_kind(MatchKind::LeftmostFirst)
1000            .build(&patterns)
1001            .expect("Failed to build Aho-Corasick automaton");
1002
1003        log::debug!(
1004            "DLP scanner initialized with {} standard patterns and {} custom keywords",
1005            pattern_count,
1006            config
1007                .custom_keywords
1008                .as_ref()
1009                .map(|k| k.len())
1010                .unwrap_or(0)
1011        );
1012
1013        Ok(Self {
1014            config,
1015            total_scans: AtomicU64::new(0),
1016            total_matches: AtomicU64::new(0),
1017            automaton,
1018            pattern_map,
1019            recent_violations: Arc::new(RwLock::new(VecDeque::with_capacity(100))),
1020        })
1021    }
1022
1023    /// Check if scanner is enabled
1024    pub fn is_enabled(&self) -> bool {
1025        self.config.enabled
1026    }
1027
1028    /// Record violations from a scan result
1029    pub fn record_violations(&self, result: &ScanResult, client_ip: Option<&str>, path: &str) {
1030        if !result.has_matches {
1031            return;
1032        }
1033
1034        let now = std::time::SystemTime::now()
1035            .duration_since(std::time::UNIX_EPOCH)
1036            .map(|d| d.as_millis() as u64)
1037            .unwrap_or(0);
1038
1039        let mut violations = match self.recent_violations.try_write() {
1040            Ok(guard) => guard,
1041            Err(_) => {
1042                // Best-effort capture. Avoid blocking in async contexts.
1043                return;
1044            }
1045        };
1046
1047        for m in &result.matches {
1048            if violations.len() >= 100 {
1049                violations.pop_front();
1050            }
1051
1052            violations.push_back(DlpViolation {
1053                timestamp: now,
1054                pattern_name: m.pattern_name.to_string(),
1055                data_type: m.data_type.as_str().to_string(),
1056                severity: m.severity.as_str().to_string(),
1057                masked_value: m.masked_value.clone(),
1058                client_ip: client_ip.map(|s| s.to_string()),
1059                path: path.to_string(),
1060            });
1061        }
1062    }
1063
1064    /// Get recent violations
1065    pub async fn get_recent_violations(&self) -> Vec<DlpViolation> {
1066        let violations = self.recent_violations.read().await;
1067        violations.iter().cloned().rev().collect()
1068    }
1069
1070    /// Scan content for sensitive data with optimizations:
1071    /// - Inspection depth cap (truncation for large payloads)
1072    /// - Aho-Corasick prefiltering for patterns with literal prefixes
1073    #[must_use = "scan results contain sensitive data findings that should be processed"]
1074    pub fn scan(&self, content: &str) -> ScanResult {
1075        if !self.config.enabled {
1076            return ScanResult::default();
1077        }
1078
1079        let start = Instant::now();
1080        let original_length = content.len();
1081
1082        // Check max size (hard limit - reject entirely)
1083        if original_length > self.config.max_scan_size {
1084            return ScanResult {
1085                scanned: false,
1086                content_length: original_length,
1087                ..Default::default()
1088            };
1089        }
1090
1091        // Apply inspection depth cap (soft limit - truncate and continue)
1092        let (scan_content, truncated) = if original_length > self.config.max_body_inspection_bytes {
1093            // Find a safe truncation point (don't cut in middle of UTF-8 char)
1094            let mut truncate_at = self.config.max_body_inspection_bytes;
1095            while truncate_at > 0 && !content.is_char_boundary(truncate_at) {
1096                truncate_at -= 1;
1097            }
1098            log::debug!(
1099                "DLP: Truncating {} bytes to {} for inspection",
1100                original_length,
1101                truncate_at
1102            );
1103            (&content[..truncate_at], true)
1104        } else {
1105            (content, false)
1106        };
1107
1108        let content_length = scan_content.len();
1109        let mut matches = Vec::new();
1110
1111        // Phase 1: Use Aho-Corasick to find candidate positions for patterns with literal prefixes
1112        // This is O(n) single-pass vs O(n * patterns) for sequential regex
1113        // Using u32 bitset instead of HashSet to eliminate heap allocation (25 patterns fit in 32 bits)
1114        let mut ac_candidates: u32 = 0;
1115        for ac_match in self.automaton.find_iter(scan_content) {
1116            let pattern_id = ac_match.pattern().as_usize();
1117            if let Some((is_custom, idx)) = self.pattern_map.get(pattern_id) {
1118                if *is_custom {
1119                    // Direct match for custom keyword
1120                    if matches.len() < self.config.max_matches {
1121                        if let Some(keywords) = self.config.custom_keywords.as_ref() {
1122                            if let Some(_keyword) = keywords.get(*idx) {
1123                                matches.push(DlpMatch {
1124                                    pattern_name: "Custom Keyword",
1125                                    data_type: SensitiveDataType::Custom,
1126                                    severity: PatternSeverity::High,
1127                                    masked_value: "***".to_string(), // Simple mask for custom keywords
1128                                    start: ac_match.start(),
1129                                    end: ac_match.end(),
1130                                    stream_offset: None,
1131                                });
1132                            }
1133                        }
1134                    }
1135                } else {
1136                    // Candidate for standard pattern
1137                    ac_candidates |= 1 << idx;
1138                }
1139            }
1140        }
1141
1142        // Phase 1b: Use RegexSet to find candidates for non-AC patterns (SSN, Phone, IBAN, IPv4, etc.)
1143        // This is O(n) single-pass for all non-AC patterns combined
1144        // Using u32 bitset instead of HashSet to eliminate heap allocation
1145        let regex_set_matches = NON_AC_REGEX_SET.matches(scan_content);
1146        let mut non_ac_candidates: u32 = 0;
1147        for (set_idx, &pattern_idx) in NON_AC_PATTERN_INDICES.iter().enumerate() {
1148            if regex_set_matches.matched(set_idx) {
1149                non_ac_candidates |= 1 << pattern_idx;
1150            }
1151        }
1152
1153        // Phase 2: Scan with patterns - only scan patterns that have candidates from prefilters
1154        // AC_COVERED_MASK and NON_AC_PATTERN_MASK are pre-computed lazy_static u32 bitmasks
1155
1156        // Fast mode skips low-priority patterns: Email(6), US Phone(7), Intl Phone(8), IPv4(20)
1157        const FAST_MODE_SKIP_PATTERNS: [usize; 4] = [6, 7, 8, 20];
1158
1159        'outer: for (pattern_idx, pattern) in PATTERNS.iter().enumerate() {
1160            // Early exit if we've hit max matches
1161            if matches.len() >= self.config.max_matches {
1162                break 'outer;
1163            }
1164
1165            // Fast mode: skip low-priority patterns (email, phone, IPv4)
1166            if self.config.fast_mode && FAST_MODE_SKIP_PATTERNS.contains(&pattern_idx) {
1167                continue;
1168            }
1169
1170            let pattern_bit = 1u32 << pattern_idx;
1171
1172            // Skip patterns covered by AC if AC didn't find any candidates
1173            // Using bitwise: check if pattern is in AC_COVERED_MASK but not in ac_candidates
1174            if (*AC_COVERED_MASK & pattern_bit) != 0 && (ac_candidates & pattern_bit) == 0 {
1175                continue;
1176            }
1177
1178            // Skip non-AC patterns if RegexSet didn't find any candidates
1179            // Using bitwise: check if pattern is in NON_AC_PATTERN_MASK but not in non_ac_candidates
1180            if (*NON_AC_PATTERN_MASK & pattern_bit) != 0 && (non_ac_candidates & pattern_bit) == 0 {
1181                continue;
1182            }
1183
1184            for m in pattern.regex.find_iter(scan_content) {
1185                // Check limit before processing each match
1186                if matches.len() >= self.config.max_matches {
1187                    break 'outer;
1188                }
1189
1190                let matched_value = m.as_str();
1191
1192                // Apply validator if present
1193                if let Some(validator) = pattern.validator {
1194                    if !validator(matched_value) {
1195                        continue;
1196                    }
1197                }
1198
1199                let masked = self.mask_value(matched_value, pattern.data_type);
1200
1201                matches.push(DlpMatch {
1202                    pattern_name: pattern.name,
1203                    data_type: pattern.data_type,
1204                    severity: pattern.severity,
1205                    masked_value: masked,
1206                    start: m.start(),
1207                    end: m.end(),
1208                    stream_offset: None,
1209                });
1210            }
1211        }
1212
1213        let scan_time_us = start.elapsed().as_micros() as u64;
1214        let match_count = matches.len();
1215
1216        // Update stats
1217        self.total_scans.fetch_add(1, Ordering::Relaxed);
1218        self.total_matches
1219            .fetch_add(match_count as u64, Ordering::Relaxed);
1220
1221        ScanResult {
1222            scanned: true,
1223            has_matches: !matches.is_empty(),
1224            matches,
1225            match_count,
1226            scan_time_us,
1227            content_length,
1228            truncated,
1229            original_length: if truncated { original_length } else { 0 },
1230        }
1231    }
1232
1233    /// Scan bytes as UTF-8 text
1234    #[must_use = "scan results contain sensitive data findings that should be processed"]
1235    pub fn scan_bytes(&self, data: &[u8]) -> ScanResult {
1236        match std::str::from_utf8(data) {
1237            Ok(content) => self.scan(content),
1238            Err(_) => ScanResult::default(),
1239        }
1240    }
1241
1242    /// Check if content type should be scanned.
1243    /// Returns false for binary types (images, audio, video, archives, etc.)
1244    /// Returns true for text-based types that may contain sensitive data.
1245    pub fn is_scannable_content_type(&self, content_type: &str) -> bool {
1246        let ct_lower = content_type.to_lowercase();
1247
1248        // First check skip list (binary types)
1249        for skip_type in SKIP_CONTENT_TYPES.iter() {
1250            if ct_lower.starts_with(skip_type) || ct_lower.contains(skip_type) {
1251                return false;
1252            }
1253        }
1254
1255        // Check for multipart/form-data with file uploads (skip files, scan form fields)
1256        // For now, skip all multipart to avoid scanning uploaded file contents
1257        if ct_lower.starts_with("multipart/") {
1258            return false;
1259        }
1260
1261        // Scannable text types
1262        let text_types = [
1263            "text/",
1264            "application/json",
1265            "application/xml",
1266            "application/x-www-form-urlencoded",
1267            "application/javascript",
1268            "application/ld+json",
1269        ];
1270
1271        text_types
1272            .iter()
1273            .any(|t| ct_lower.starts_with(t) || ct_lower.contains(t))
1274    }
1275
1276    /// Quick check if content type should skip DLP entirely (binary content)
1277    pub fn should_skip_content_type(&self, content_type: &str) -> bool {
1278        !self.is_scannable_content_type(content_type)
1279    }
1280
1281    /// Mask a sensitive value for logging
1282    fn mask_value(&self, value: &str, data_type: SensitiveDataType) -> String {
1283        let mode = self
1284            .config
1285            .redaction
1286            .get(&data_type)
1287            .copied()
1288            .unwrap_or_default();
1289
1290        match mode {
1291            RedactionMode::None => return value.to_string(),
1292            RedactionMode::Hash => {
1293                // Use configured salt (validated at construction time if Hash mode is used)
1294                let salt = self.config.hash_salt.as_deref().unwrap_or("");
1295                let mut hasher = Sha256::new();
1296                hasher.update(salt.as_bytes());
1297                hasher.update(value.as_bytes());
1298                return format!("sha256:{:x}", hasher.finalize());
1299            }
1300            RedactionMode::Full => return "*".repeat(value.len().min(20)),
1301            RedactionMode::Partial => {} // Fall through to partial masking logic
1302        }
1303
1304        match data_type {
1305            SensitiveDataType::CreditCard => {
1306                let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect();
1307                if digits.len() >= 4 {
1308                    format!("****-****-****-{}", &digits[digits.len() - 4..])
1309                } else {
1310                    "****-****-****-****".to_string()
1311                }
1312            }
1313            SensitiveDataType::Ssn => {
1314                let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect();
1315                if digits.len() >= 4 {
1316                    format!("***-**-{}", &digits[digits.len() - 4..])
1317                } else {
1318                    "***-**-****".to_string()
1319                }
1320            }
1321            SensitiveDataType::Email => {
1322                if let Some(at_idx) = value.find('@') {
1323                    let (local, domain) = value.split_at(at_idx);
1324                    let prefix = if local.len() >= 3 { &local[..3] } else { local };
1325                    format!("{}***{}", prefix, domain)
1326                } else {
1327                    "***@***.***".to_string()
1328                }
1329            }
1330            SensitiveDataType::Phone => {
1331                let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect();
1332                if digits.len() >= 4 {
1333                    format!("***-***-{}", &digits[digits.len() - 4..])
1334                } else {
1335                    "***-***-****".to_string()
1336                }
1337            }
1338            SensitiveDataType::Iban => {
1339                if value.len() >= 6 {
1340                    format!("{}************{}", &value[..2], &value[value.len() - 4..])
1341                } else {
1342                    "**************".to_string()
1343                }
1344            }
1345            SensitiveDataType::Jwt => "eyJ***.eyJ***.***".to_string(),
1346            SensitiveDataType::PrivateKey => "[PRIVATE KEY REDACTED]".to_string(),
1347            SensitiveDataType::AwsKey | SensitiveDataType::ApiKey => {
1348                if value.len() >= 4 {
1349                    format!("{}...{}", &value[..4], &value[value.len() - 4..])
1350                } else {
1351                    "********".to_string()
1352                }
1353            }
1354            SensitiveDataType::Password => "********".to_string(),
1355            SensitiveDataType::IpAddress => {
1356                // Mask middle octets
1357                let parts: Vec<&str> = value.split('.').collect();
1358                if parts.len() == 4 {
1359                    format!("{}.***.***.{}", parts[0], parts[3])
1360                } else {
1361                    "***.***.***.***".to_string()
1362                }
1363            }
1364            SensitiveDataType::MedicalRecord => "MRN: ********".to_string(),
1365            SensitiveDataType::Custom => "***".to_string(),
1366        }
1367    }
1368
1369    /// Get scanner statistics
1370    pub fn stats(&self) -> DlpStats {
1371        DlpStats {
1372            total_scans: self.total_scans.load(Ordering::Relaxed),
1373            total_matches: self.total_matches.load(Ordering::Relaxed),
1374            matches_by_type: HashMap::new(), // Would need per-type counters for this
1375            matches_by_severity: HashMap::new(),
1376        }
1377    }
1378
1379    /// Get pattern count
1380    pub fn pattern_count(&self) -> usize {
1381        PATTERNS.len()
1382    }
1383}
1384
1385// ============================================================================
1386// Tests
1387// ============================================================================
1388
1389#[cfg(test)]
1390mod tests {
1391    use super::*;
1392
1393    // ────────────────────────────────────────────────────────────────────────
1394    // Luhn Validation Tests
1395    // ────────────────────────────────────────────────────────────────────────
1396
1397    #[test]
1398    fn test_luhn_valid_visa() {
1399        assert!(validate_credit_card("4532015112830366"));
1400        assert!(validate_credit_card("4532-0151-1283-0366"));
1401        assert!(validate_credit_card("4532 0151 1283 0366"));
1402    }
1403
1404    #[test]
1405    fn test_luhn_valid_mastercard() {
1406        assert!(validate_credit_card("5425233430109903"));
1407    }
1408
1409    #[test]
1410    fn test_luhn_valid_amex() {
1411        assert!(validate_credit_card("374245455400126"));
1412    }
1413
1414    #[test]
1415    fn test_luhn_invalid() {
1416        assert!(!validate_credit_card("1234567890123456"));
1417        assert!(!validate_credit_card("0000000000000000"));
1418        assert!(!validate_credit_card("12345")); // Too short
1419    }
1420
1421    // ────────────────────────────────────────────────────────────────────────
1422    // SSN Validation Tests
1423    // ────────────────────────────────────────────────────────────────────────
1424
1425    #[test]
1426    fn test_ssn_valid() {
1427        assert!(validate_ssn("123-45-6789"));
1428        assert!(validate_ssn("123456789"));
1429    }
1430
1431    #[test]
1432    fn test_ssn_invalid_area() {
1433        assert!(!validate_ssn("000-45-6789")); // Area 000
1434        assert!(!validate_ssn("666-45-6789")); // Area 666
1435        assert!(!validate_ssn("900-45-6789")); // Area 900+
1436    }
1437
1438    #[test]
1439    fn test_ssn_invalid_group() {
1440        assert!(!validate_ssn("123-00-6789")); // Group 00
1441    }
1442
1443    #[test]
1444    fn test_ssn_invalid_serial() {
1445        assert!(!validate_ssn("123-45-0000")); // Serial 0000
1446    }
1447
1448    #[test]
1449    fn test_ssn_advertising_numbers() {
1450        // SSNs 987-65-4320 through 987-65-4329 are used in advertising/commercials
1451        // Note: These are also rejected by area >= 900 rule (ITIN range)
1452        // but we have explicit checks for documentation/defense-in-depth
1453        assert!(!validate_ssn("987-65-4320"));
1454        assert!(!validate_ssn("987-65-4325"));
1455        assert!(!validate_ssn("987-65-4329"));
1456        // All 9xx area codes are ITIN range and should be rejected
1457        assert!(!validate_ssn("987-65-4319"));
1458        assert!(!validate_ssn("987-65-4330"));
1459        assert!(!validate_ssn("900-12-3456"));
1460        assert!(!validate_ssn("999-99-9999"));
1461    }
1462
1463    // ────────────────────────────────────────────────────────────────────────
1464    // Phone Validation Tests
1465    // ────────────────────────────────────────────────────────────────────────
1466
1467    #[test]
1468    fn test_phone_valid() {
1469        assert!(validate_phone("212-555-1234"));
1470        assert!(validate_phone("(212) 555-1234"));
1471        assert!(validate_phone("1-212-555-1234"));
1472        assert!(validate_phone("12125551234"));
1473        assert!(validate_phone("2125551234"));
1474    }
1475
1476    #[test]
1477    fn test_phone_invalid_length() {
1478        assert!(!validate_phone("555-1234")); // Too short
1479        assert!(!validate_phone("212-555-12345")); // Too long
1480    }
1481
1482    #[test]
1483    fn test_phone_invalid_area_code() {
1484        assert!(!validate_phone("012-555-1234")); // 0xx area code
1485        assert!(!validate_phone("112-555-1234")); // 1xx area code
1486    }
1487
1488    #[test]
1489    fn test_phone_service_codes() {
1490        // N11 codes are service numbers, not valid phone numbers
1491        assert!(!validate_phone("411-555-1234")); // Directory assistance
1492        assert!(!validate_phone("911-555-1234")); // Emergency
1493        assert!(!validate_phone("611-555-1234")); // Repair service
1494    }
1495
1496    // ────────────────────────────────────────────────────────────────────────
1497    // IBAN Validation Tests
1498    // ────────────────────────────────────────────────────────────────────────
1499
1500    #[test]
1501    fn test_iban_valid_de() {
1502        assert!(validate_iban("DE89370400440532013000"));
1503    }
1504
1505    #[test]
1506    fn test_iban_valid_gb() {
1507        assert!(validate_iban("GB82WEST12345698765432"));
1508    }
1509
1510    #[test]
1511    fn test_iban_valid_with_spaces() {
1512        assert!(validate_iban("DE89 3704 0044 0532 0130 00"));
1513    }
1514
1515    #[test]
1516    fn test_iban_invalid_checksum() {
1517        assert!(!validate_iban("DE00370400440532013000")); // Wrong check digits
1518    }
1519
1520    #[test]
1521    fn test_iban_too_short() {
1522        assert!(!validate_iban("DE89370400")); // Too short
1523    }
1524
1525    // ────────────────────────────────────────────────────────────────────────
1526    // Scanner Tests
1527    // ────────────────────────────────────────────────────────────────────────
1528
1529    #[test]
1530    fn test_scanner_creation() {
1531        let scanner = DlpScanner::default();
1532        assert!(scanner.is_enabled());
1533        assert_eq!(scanner.pattern_count(), 25); // 24 base + 1 GitHub Fine-grained PAT
1534    }
1535
1536    #[test]
1537    fn test_scanner_disabled() {
1538        let config = DlpConfig {
1539            enabled: false,
1540            ..Default::default()
1541        };
1542        let scanner = DlpScanner::new(config);
1543        let result = scanner.scan("4532015112830366");
1544        assert!(!result.scanned);
1545    }
1546
1547    #[test]
1548    fn test_scan_credit_card() {
1549        let scanner = DlpScanner::default();
1550        let result = scanner.scan("My card is 4532015112830366");
1551
1552        assert!(result.scanned);
1553        assert!(result.has_matches);
1554        assert_eq!(result.match_count, 1);
1555        assert_eq!(result.matches[0].data_type, SensitiveDataType::CreditCard);
1556        assert_eq!(result.matches[0].severity, PatternSeverity::Critical);
1557    }
1558
1559    #[test]
1560    fn test_scan_ssn() {
1561        let scanner = DlpScanner::default();
1562        let result = scanner.scan("SSN: 123-45-6789");
1563
1564        assert!(result.has_matches);
1565        assert_eq!(result.matches[0].data_type, SensitiveDataType::Ssn);
1566    }
1567
1568    #[test]
1569    fn test_scan_email() {
1570        let scanner = DlpScanner::default();
1571        let result = scanner.scan("Contact: user@example.com");
1572
1573        assert!(result.has_matches);
1574        assert_eq!(result.matches[0].data_type, SensitiveDataType::Email);
1575        assert_eq!(result.matches[0].severity, PatternSeverity::Medium);
1576    }
1577
1578    #[test]
1579    fn test_scan_jwt() {
1580        let scanner = DlpScanner::default();
1581        let jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c";
1582        let result = scanner.scan(&format!("Token: {}", jwt));
1583
1584        assert!(result.has_matches);
1585        let jwt_match = result
1586            .matches
1587            .iter()
1588            .find(|m| m.data_type == SensitiveDataType::Jwt);
1589        assert!(jwt_match.is_some());
1590    }
1591
1592    #[test]
1593    fn test_scan_aws_key() {
1594        let scanner = DlpScanner::default();
1595        let result = scanner.scan("AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE");
1596
1597        assert!(result.has_matches);
1598        let aws_match = result
1599            .matches
1600            .iter()
1601            .find(|m| m.data_type == SensitiveDataType::AwsKey);
1602        assert!(aws_match.is_some());
1603    }
1604
1605    #[test]
1606    fn test_scan_github_token() {
1607        let scanner = DlpScanner::default();
1608        let result = scanner.scan("GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
1609
1610        assert!(result.has_matches);
1611        let gh_match = result
1612            .matches
1613            .iter()
1614            .find(|m| m.pattern_name == "GitHub Token");
1615        assert!(gh_match.is_some());
1616    }
1617
1618    #[test]
1619    fn test_scan_github_fine_grained_pat() {
1620        let scanner = DlpScanner::default();
1621        let result = scanner
1622            .scan("GITHUB_TOKEN=github_pat_11ABCDEFG0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
1623
1624        assert!(result.has_matches);
1625        let gh_match = result
1626            .matches
1627            .iter()
1628            .find(|m| m.pattern_name == "GitHub Fine-grained PAT");
1629        assert!(gh_match.is_some(), "Should detect GitHub fine-grained PAT");
1630    }
1631
1632    #[test]
1633    fn test_scan_stripe_keys() {
1634        let scanner = DlpScanner::default();
1635
1636        // Secret key
1637        let result = scanner.scan("STRIPE_SECRET_KEY=sk_live_51ABCdefGHI123456789012345");
1638        assert!(result.has_matches, "Should detect Stripe secret key");
1639
1640        // Publishable key
1641        let result = scanner.scan("STRIPE_PK=pk_test_51ABCdefGHI123456789012345");
1642        assert!(result.has_matches, "Should detect Stripe publishable key");
1643
1644        // Restricted key (new)
1645        let result = scanner.scan("STRIPE_RK=rk_live_51ABCdefGHI123456789012345");
1646        assert!(result.has_matches, "Should detect Stripe restricted key");
1647    }
1648
1649    #[test]
1650    fn test_scan_private_key() {
1651        let scanner = DlpScanner::default();
1652        let key =
1653            "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEA...\n-----END RSA PRIVATE KEY-----";
1654        let result = scanner.scan(key);
1655
1656        assert!(result.has_matches);
1657        assert_eq!(result.matches[0].data_type, SensitiveDataType::PrivateKey);
1658    }
1659
1660    #[test]
1661    fn test_scan_password_in_url() {
1662        let scanner = DlpScanner::default();
1663        let result = scanner.scan("https://api.example.com/login?password=secret123");
1664
1665        assert!(result.has_matches);
1666        let pwd_match = result
1667            .matches
1668            .iter()
1669            .find(|m| m.data_type == SensitiveDataType::Password);
1670        assert!(pwd_match.is_some());
1671    }
1672
1673    #[test]
1674    fn test_scan_password_in_json() {
1675        let scanner = DlpScanner::default();
1676        let result = scanner.scan(r#"{"username": "admin", "password": "secret123"}"#);
1677
1678        assert!(result.has_matches);
1679        let pwd_match = result
1680            .matches
1681            .iter()
1682            .find(|m| m.data_type == SensitiveDataType::Password);
1683        assert!(pwd_match.is_some());
1684    }
1685
1686    #[test]
1687    fn test_scan_no_matches() {
1688        let scanner = DlpScanner::default();
1689        let result = scanner.scan("This is just normal text with no sensitive data.");
1690
1691        assert!(result.scanned);
1692        assert!(!result.has_matches);
1693        assert_eq!(result.match_count, 0);
1694    }
1695
1696    #[test]
1697    fn test_scan_multiple_matches() {
1698        let scanner = DlpScanner::default();
1699        let content = "Card: 4532015112830366, SSN: 123-45-6789, Email: test@example.com";
1700        let result = scanner.scan(content);
1701
1702        assert!(result.has_matches);
1703        assert!(result.match_count >= 3);
1704    }
1705
1706    #[test]
1707    fn test_masking() {
1708        let scanner = DlpScanner::default();
1709        let result = scanner.scan("4532015112830366");
1710
1711        assert!(result.has_matches);
1712        assert!(result.matches[0].masked_value.contains("****"));
1713        assert!(result.matches[0].masked_value.ends_with("0366"));
1714    }
1715
1716    #[test]
1717    fn test_content_type_detection() {
1718        let scanner = DlpScanner::default();
1719
1720        assert!(scanner.is_scannable_content_type("text/html"));
1721        assert!(scanner.is_scannable_content_type("application/json"));
1722        assert!(scanner.is_scannable_content_type("application/xml"));
1723        assert!(!scanner.is_scannable_content_type("image/png"));
1724        assert!(!scanner.is_scannable_content_type("application/octet-stream"));
1725    }
1726
1727    #[test]
1728    fn test_stats() {
1729        let scanner = DlpScanner::default();
1730        scanner.scan("Card: 4532015112830366");
1731        scanner.scan("No sensitive data here");
1732
1733        let stats = scanner.stats();
1734        assert_eq!(stats.total_scans, 2);
1735        assert!(stats.total_matches >= 1);
1736    }
1737
1738    #[test]
1739    fn test_custom_keywords() {
1740        let config = DlpConfig {
1741            enabled: true,
1742            custom_keywords: Some(vec!["ProjectX".to_string(), "InternalID-123".to_string()]),
1743            ..Default::default()
1744        };
1745        let scanner = DlpScanner::new(config);
1746
1747        let result = scanner.scan("Confidential: ProjectX launch date is soon.");
1748        assert!(result.has_matches);
1749        assert_eq!(result.match_count, 1);
1750        assert_eq!(result.matches[0].data_type, SensitiveDataType::Custom);
1751
1752        let result2 = scanner.scan("User ID: InternalID-123");
1753        assert!(result2.has_matches);
1754        assert_eq!(result2.matches[0].data_type, SensitiveDataType::Custom);
1755    }
1756
1757    #[test]
1758    fn test_redaction_modes() {
1759        let mut redaction = HashMap::new();
1760        redaction.insert(SensitiveDataType::CreditCard, RedactionMode::Full);
1761        redaction.insert(SensitiveDataType::Email, RedactionMode::Hash);
1762
1763        let config = DlpConfig {
1764            enabled: true,
1765            redaction,
1766            hash_salt: Some("test-salt-for-hashing".to_string()), // Required for Hash mode
1767            ..Default::default()
1768        };
1769        let scanner = DlpScanner::new(config);
1770
1771        let result = scanner.scan("Card: 4532015112830366, Email: test@example.com");
1772
1773        // Full redaction for card
1774        let card = result
1775            .matches
1776            .iter()
1777            .find(|m| m.data_type == SensitiveDataType::CreditCard)
1778            .unwrap();
1779        assert!(card.masked_value.chars().all(|c| c == '*'));
1780        assert!(!card.masked_value.contains("0366")); // No partial reveal
1781
1782        // Hash redaction for email
1783        let email = result
1784            .matches
1785            .iter()
1786            .find(|m| m.data_type == SensitiveDataType::Email)
1787            .unwrap();
1788        assert!(email.masked_value.starts_with("sha256:"));
1789        assert!(!email.masked_value.contains("test@example.com"));
1790    }
1791
1792    // ────────────────────────────────────────────────────────────────────────
1793    // Performance Tests
1794    // ────────────────────────────────────────────────────────────────────────
1795
1796    #[test]
1797    fn test_scan_performance() {
1798        // Use a scanner with high inspection cap to test full 100KB scan
1799        let config = DlpConfig {
1800            enabled: true,
1801            max_scan_size: 5 * 1024 * 1024,
1802            max_matches: 100,
1803            scan_text_only: true,
1804            max_body_inspection_bytes: 200 * 1024, // 200KB cap for this test
1805            fast_mode: false,
1806            ..Default::default()
1807        };
1808        let scanner = DlpScanner::new(config);
1809
1810        // Generate 100KB of content with some sensitive data
1811        let mut content = String::with_capacity(100_000);
1812        for i in 0..1000 {
1813            content.push_str(&format!("Line {}: This is normal text content.\n", i));
1814            if i % 100 == 0 {
1815                content.push_str("Credit card: 4532015112830366\n");
1816            }
1817        }
1818
1819        let result = scanner.scan(&content);
1820
1821        // Should complete in reasonable time
1822        // Debug mode: up to 75ms (allows for 24 patterns + system load), Release mode: under 5ms
1823        #[cfg(debug_assertions)]
1824        let max_time_us = 75_000;
1825        #[cfg(not(debug_assertions))]
1826        let max_time_us = 5_000;
1827
1828        assert!(
1829            result.scan_time_us < max_time_us,
1830            "Scan took {}μs, expected < {}μs for 100KB",
1831            result.scan_time_us,
1832            max_time_us
1833        );
1834        assert!(result.match_count >= 10); // At least 10 credit cards
1835    }
1836
1837    #[test]
1838    fn test_truncation() {
1839        // Default scanner with 8KB cap
1840        let scanner = DlpScanner::default();
1841
1842        // Generate 20KB of content with credit card at the start
1843        let mut content = String::from("Credit card: 4532015112830366\n");
1844        for _ in 0..500 {
1845            content.push_str("Lorem ipsum dolor sit amet, consectetur adipiscing elit.\n");
1846        }
1847
1848        let result = scanner.scan(&content);
1849
1850        // Should be truncated and still find the credit card
1851        assert!(result.truncated);
1852        assert!(result.original_length > result.content_length);
1853        assert!(result.has_matches);
1854        assert_eq!(result.match_count, 1);
1855    }
1856
1857    #[test]
1858    fn test_fast_mode() {
1859        // Content with both critical and low-priority patterns
1860        let content = r#"
1861            Critical data:
1862            Credit card: 4532015112830366
1863            SSN: 123-45-6789
1864            AWS Key: AKIAIOSFODNN7EXAMPLE
1865
1866            Low-priority data (skipped in fast mode):
1867            Email: user@example.com
1868            Phone: (555) 123-4567
1869            IP: 192.168.1.1
1870        "#;
1871
1872        // Normal scanner should find all matches
1873        let normal_scanner = DlpScanner::default();
1874        let normal_result = normal_scanner.scan(content);
1875
1876        // Fast mode scanner should skip email, phone, IP
1877        let fast_config = DlpConfig {
1878            fast_mode: true,
1879            ..Default::default()
1880        };
1881        let fast_scanner = DlpScanner::new(fast_config);
1882        let fast_result = fast_scanner.scan(content);
1883
1884        // Normal mode finds more patterns (includes email, phone, IP)
1885        assert!(
1886            normal_result.match_count > fast_result.match_count,
1887            "Normal mode ({}) should find more matches than fast mode ({})",
1888            normal_result.match_count,
1889            fast_result.match_count
1890        );
1891
1892        // Fast mode should still find critical patterns (credit card, SSN, AWS key)
1893        assert!(
1894            fast_result.match_count >= 3,
1895            "Fast mode should find at least 3 critical matches, found {}",
1896            fast_result.match_count
1897        );
1898
1899        // Verify fast mode doesn't find email/phone/IP
1900        let fast_types: Vec<_> = fast_result.matches.iter().map(|m| m.data_type).collect();
1901        assert!(
1902            !fast_types.contains(&SensitiveDataType::Email),
1903            "Fast mode should not detect emails"
1904        );
1905        assert!(
1906            !fast_types.contains(&SensitiveDataType::Phone),
1907            "Fast mode should not detect phone numbers"
1908        );
1909        assert!(
1910            !fast_types.contains(&SensitiveDataType::IpAddress),
1911            "Fast mode should not detect IP addresses"
1912        );
1913    }
1914}