1use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
11use lazy_static::lazy_static;
12use regex::{Regex, RegexSet};
13use serde::{Deserialize, Serialize};
14use sha2::{Digest, Sha256};
15use std::collections::{HashMap, VecDeque};
16use std::sync::atomic::{AtomicU64, Ordering};
17use std::sync::Arc;
18use std::time::Instant;
19use tokio::sync::RwLock;
20
/// Category of sensitive data that a detection pattern reports.
///
/// Serialized by serde using `snake_case` variant names (for example
/// `credit_card`). Marked `#[non_exhaustive]`, so downstream crates must keep
/// a wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum SensitiveDataType {
    /// Payment card numbers.
    CreditCard,
    /// US Social Security numbers.
    Ssn,
    /// Email addresses.
    Email,
    /// Telephone numbers.
    Phone,
    /// API keys and access tokens.
    ApiKey,
    /// Passwords.
    Password,
    /// International Bank Account Numbers.
    Iban,
    /// IP addresses.
    IpAddress,
    /// AWS credentials.
    AwsKey,
    /// PEM-encoded private keys.
    PrivateKey,
    /// JSON Web Tokens.
    Jwt,
    /// Medical record numbers.
    MedicalRecord,
    /// Matches produced by user-configured custom keywords.
    Custom,
}
40
41impl SensitiveDataType {
42 pub fn as_str(&self) -> &'static str {
43 match self {
44 Self::CreditCard => "credit_card",
45 Self::Ssn => "ssn",
46 Self::Email => "email",
47 Self::Phone => "phone",
48 Self::ApiKey => "api_key",
49 Self::Password => "password",
50 Self::Iban => "iban",
51 Self::IpAddress => "ip_address",
52 Self::AwsKey => "aws_key",
53 Self::PrivateKey => "private_key",
54 Self::Jwt => "jwt",
55 Self::MedicalRecord => "medical_record",
56 Self::Custom => "custom",
57 }
58 }
59}
60
/// Severity attached to a detection pattern and copied onto its matches.
///
/// Variants are declared from least to most severe. No ordering traits are
/// derived, so values can only be compared for equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum PatternSeverity {
    /// Lowest severity.
    Low,
    /// Medium severity.
    Medium,
    /// High severity.
    High,
    /// Highest severity.
    Critical,
}
70
71impl PatternSeverity {
72 pub fn as_str(&self) -> &'static str {
73 match self {
74 Self::Low => "low",
75 Self::Medium => "medium",
76 Self::High => "high",
77 Self::Critical => "critical",
78 }
79 }
80}
81
/// A single sensitive-data finding produced by a scan.
#[derive(Debug, Clone)]
pub struct DlpMatch {
    /// Human-readable name of the pattern that matched
    /// (`"Custom Keyword"` for user-configured keywords).
    pub pattern_name: &'static str,
    /// Category of the detected data.
    pub data_type: SensitiveDataType,
    /// Severity of the pattern that matched.
    pub severity: PatternSeverity,
    /// Redacted rendering of the matched text. With `RedactionMode::None`
    /// this is the raw matched text.
    pub masked_value: String,
    /// Byte offset of the start of the match within the scanned content.
    pub start: usize,
    /// Byte offset one past the end of the match within the scanned content.
    pub end: usize,
    /// NOTE(review): always `None` in the scanner code visible here;
    /// presumably filled in by a streaming scan path — confirm.
    pub stream_offset: Option<usize>,
}
97
/// Outcome of scanning one piece of content.
///
/// The `Default` value (all fields zero/false/empty) is what a disabled
/// scanner returns.
#[derive(Debug, Clone, Default)]
pub struct ScanResult {
    /// `true` when the content was actually scanned; `false` when the scanner
    /// is disabled or the content exceeded the configured `max_scan_size`.
    pub scanned: bool,
    /// `true` when `matches` is non-empty.
    pub has_matches: bool,
    /// Findings, capped at the configured `max_matches`.
    pub matches: Vec<DlpMatch>,
    /// Number of entries in `matches`.
    pub match_count: usize,
    /// Wall-clock duration of the scan in microseconds.
    pub scan_time_us: u64,
    /// Number of bytes that were scanned (after truncation). When the content
    /// was skipped for exceeding `max_scan_size`, this is the full length.
    pub content_length: usize,
    /// `true` when only a leading portion of the content was inspected.
    pub truncated: bool,
    /// Length in bytes of the untruncated content; set only when `truncated`
    /// is `true`, otherwise `0`.
    pub original_length: usize,
}
112
/// Snapshot of scanner counters.
#[derive(Debug, Clone)]
pub struct DlpStats {
    /// Number of scans that ran to completion.
    pub total_scans: u64,
    /// Total number of matches reported across those scans.
    pub total_matches: u64,
    /// Match counts per data type.
    /// NOTE(review): `DlpScanner::stats` currently always returns this empty.
    pub matches_by_type: HashMap<SensitiveDataType, u64>,
    /// Match counts per severity.
    /// NOTE(review): `DlpScanner::stats` currently always returns this empty.
    pub matches_by_severity: HashMap<PatternSeverity, u64>,
}
121
/// Serializable record of one match, kept in the scanner's recent-violations
/// buffer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DlpViolation {
    /// Time the violation was recorded, in milliseconds since the Unix epoch
    /// (`0` if the system clock is before the epoch).
    pub timestamp: u64,
    /// Name of the pattern that matched.
    pub pattern_name: String,
    /// Data type identifier, as returned by `SensitiveDataType::as_str`.
    pub data_type: String,
    /// Severity identifier, as returned by `PatternSeverity::as_str`.
    pub severity: String,
    /// Redacted rendering of the matched text.
    pub masked_value: String,
    /// Client IP supplied by the caller, if any.
    pub client_ip: Option<String>,
    /// Request path supplied by the caller.
    pub path: String,
}
133
/// How a matched value is rendered in `DlpMatch::masked_value`.
///
/// Serialized by serde using `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
#[derive(Default)]
pub enum RedactionMode {
    /// Replace the value with asterisks (at most 20 of them).
    Full,
    /// Type-specific masking that keeps a small, recognisable part of the
    /// value. This is the default mode.
    #[default]
    Partial,
    /// Replace the value with `sha256:<hex>` of the configured salt followed
    /// by the value.
    Hash,
    /// Leave the value unredacted.
    None,
}
150
/// Reasons `DlpConfig::validate` rejects a configuration.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DlpConfigError {
    /// At least one data type uses `RedactionMode::Hash` but no `hash_salt`
    /// is configured.
    HashModeRequiresSalt,
    /// `custom_keywords` contains an empty string.
    EmptyCustomKeyword,
    /// A custom keyword is longer than 1024; carries the offending length
    /// (measured with `str::len`, i.e. in bytes).
    CustomKeywordTooLong(usize),
    /// More than 1000 custom keywords were supplied; carries the count.
    TooManyCustomKeywords(usize),
}
163
164impl std::fmt::Display for DlpConfigError {
165 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
166 match self {
167 Self::HashModeRequiresSalt => {
168 write!(f, "RedactionMode::Hash requires hash_salt to be configured")
169 }
170 Self::EmptyCustomKeyword => write!(f, "custom_keywords contains empty string"),
171 Self::CustomKeywordTooLong(len) => {
172 write!(f, "custom keyword exceeds max length 1024: {} chars", len)
173 }
174 Self::TooManyCustomKeywords(count) => {
175 write!(f, "too many custom keywords (max 1000): {}", count)
176 }
177 }
178 }
179}
180
// Marker impl: lets `DlpConfigError` be used as a `dyn std::error::Error`
// (for example with `?` into boxed error types). No source error is exposed.
impl std::error::Error for DlpConfigError {}
182
/// Builder for the redaction settings of a `DlpConfig`
/// (the per-type mode map and the hash salt).
#[derive(Debug, Clone, Default)]
pub struct RedactionConfigBuilder {
    /// Mode intended for data types without an explicit entry in `per_type`.
    default_mode: RedactionMode,
    /// Explicit per-type overrides.
    per_type: HashMap<SensitiveDataType, RedactionMode>,
    /// Salt mixed into hashes when `RedactionMode::Hash` is used.
    hash_salt: Option<String>,
}
190
191impl RedactionConfigBuilder {
192 pub fn new() -> Self {
194 Self::default()
195 }
196
197 pub fn mask_all() -> Self {
199 Self {
200 default_mode: RedactionMode::Full,
201 per_type: HashMap::new(),
202 hash_salt: None,
203 }
204 }
205
206 pub fn hash_pii_mask_credentials(salt: String) -> Self {
208 let mut per_type = HashMap::new();
209 per_type.insert(SensitiveDataType::Ssn, RedactionMode::Hash);
210 per_type.insert(SensitiveDataType::MedicalRecord, RedactionMode::Hash);
211 per_type.insert(SensitiveDataType::CreditCard, RedactionMode::Hash);
212 per_type.insert(SensitiveDataType::Iban, RedactionMode::Hash);
213 per_type.insert(SensitiveDataType::Password, RedactionMode::Full);
215 per_type.insert(SensitiveDataType::ApiKey, RedactionMode::Partial);
216 per_type.insert(SensitiveDataType::AwsKey, RedactionMode::Partial);
217 Self {
218 default_mode: RedactionMode::Partial,
219 per_type,
220 hash_salt: Some(salt),
221 }
222 }
223
224 pub fn with_default(mut self, mode: RedactionMode) -> Self {
226 self.default_mode = mode;
227 self
228 }
229
230 pub fn with_type(mut self, data_type: SensitiveDataType, mode: RedactionMode) -> Self {
232 self.per_type.insert(data_type, mode);
233 self
234 }
235
236 pub fn with_salt(mut self, salt: String) -> Self {
238 self.hash_salt = Some(salt);
239 self
240 }
241
242 pub fn build(self) -> (HashMap<SensitiveDataType, RedactionMode>, Option<String>) {
244 (self.per_type, self.hash_salt)
245 }
246}
247
/// Configuration for `DlpScanner`.
#[derive(Debug, Clone, Deserialize)]
pub struct DlpConfig {
    /// Master switch; a disabled scanner returns an empty result.
    pub enabled: bool,
    /// Content longer than this many bytes is not scanned at all.
    pub max_scan_size: usize,
    /// Upper bound on the number of matches reported per scan.
    pub max_matches: usize,
    /// NOTE(review): not read by the scanner code visible here; presumably
    /// consulted by callers deciding whether to scan non-text bodies — confirm.
    pub scan_text_only: bool,
    /// Content longer than this many bytes is truncated (on a UTF-8 character
    /// boundary) and only the leading portion is inspected.
    pub max_body_inspection_bytes: usize,
    /// When `true`, the scan skips the email, phone and IPv4 patterns.
    pub fast_mode: bool,
    /// Optional literal keywords reported as `SensitiveDataType::Custom`.
    pub custom_keywords: Option<Vec<String>>,
    /// Per-type redaction modes; types not listed use `RedactionMode::default()`.
    #[serde(default)]
    pub redaction: HashMap<SensitiveDataType, RedactionMode>,
    /// Salt for `RedactionMode::Hash`; required when any type uses that mode.
    pub hash_salt: Option<String>,
}
275
276impl Default for DlpConfig {
277 fn default() -> Self {
278 Self {
279 enabled: true,
280 max_scan_size: 5 * 1024 * 1024, max_matches: 100, scan_text_only: true,
283 max_body_inspection_bytes: 8 * 1024, fast_mode: false, custom_keywords: None,
286 redaction: HashMap::new(),
287 hash_salt: None,
288 }
289 }
290}
291
292impl DlpConfig {
293 pub fn validate(&self) -> Result<(), DlpConfigError> {
298 let uses_hash = self.redaction.values().any(|m| *m == RedactionMode::Hash);
300 if uses_hash && self.hash_salt.is_none() {
301 return Err(DlpConfigError::HashModeRequiresSalt);
302 }
303
304 if let Some(keywords) = &self.custom_keywords {
306 if keywords.len() > 1000 {
307 return Err(DlpConfigError::TooManyCustomKeywords(keywords.len()));
308 }
309 for kw in keywords {
310 if kw.is_empty() {
311 return Err(DlpConfigError::EmptyCustomKeyword);
312 }
313 if kw.len() > 1024 {
314 return Err(DlpConfigError::CustomKeywordTooLong(kw.len()));
315 }
316 }
317 }
318
319 Ok(())
320 }
321
322 pub fn max_pattern_length(&self) -> usize {
325 let builtin_max = 100; let custom_max = self
332 .custom_keywords
333 .as_ref()
334 .map(|kws| kws.iter().map(|k| k.len()).max().unwrap_or(0))
335 .unwrap_or(0);
336 builtin_max.max(custom_max)
337 }
338}
339
/// One built-in detection rule: a regex plus the metadata copied onto every
/// match it produces.
struct Pattern {
    /// Human-readable pattern name reported in `DlpMatch::pattern_name`.
    name: &'static str,
    /// Category of data this pattern detects.
    data_type: SensitiveDataType,
    /// Severity reported for matches of this pattern.
    severity: PatternSeverity,
    /// Compiled regex that finds candidate matches.
    regex: &'static Regex,
    /// Optional secondary check run on the matched text; a candidate is
    /// dropped when it returns `false`.
    validator: Option<fn(&str) -> bool>,
}
348
/// Validates a payment card number with the Luhn checksum.
///
/// Non-digit characters (spaces, dashes, …) are ignored. Returns `true` only
/// when the input contains between 13 and 19 ASCII digits, at least one of
/// them is non-zero, and the Luhn sum is a multiple of 10.
pub fn validate_credit_card(number: &str) -> bool {
    // Walk the digits right-to-left; every second digit (odd position from the
    // right, zero-based) is doubled, subtracting 9 when the result exceeds 9.
    let (checksum, digit_count, saw_nonzero) = number
        .chars()
        .rev()
        .filter_map(|c| c.to_digit(10))
        .enumerate()
        .fold(
            (0u32, 0usize, false),
            |(sum, count, nonzero), (position, digit)| {
                let contribution = if position % 2 == 1 {
                    let doubled = digit * 2;
                    if doubled > 9 {
                        doubled - 9
                    } else {
                        doubled
                    }
                } else {
                    digit
                };
                (sum + contribution, count + 1, nonzero || digit != 0)
            },
        );

    (13..=19).contains(&digit_count) && saw_nonzero && checksum.is_multiple_of(10)
}
387
/// Validates the structure of a US Social Security number.
///
/// Non-digit characters are ignored, so both `123-45-6789` and `123456789`
/// are accepted. The input must contain exactly nine ASCII digits, split as
/// area (3) / group (2) / serial (4), and:
///
/// * the area must not be `000`, `666`, or `900`–`999`;
/// * the group must not be `00`;
/// * the serial must not be `0000`.
///
/// The reserved advertising range `987-65-4320`..=`987-65-4329` needs no
/// separate check: its area (987) is already rejected by the `>= 900` rule.
pub fn validate_ssn(ssn: &str) -> bool {
    let mut area: u32 = 0;
    let mut group: u32 = 0;
    let mut serial: u32 = 0;
    let mut digit_count = 0usize;

    for digit in ssn.chars().filter_map(|c| c.to_digit(10)) {
        match digit_count {
            0..=2 => area = area * 10 + digit,
            3..=4 => group = group * 10 + digit,
            5..=8 => serial = serial * 10 + digit,
            // A tenth digit means this cannot be an SSN.
            _ => return false,
        }
        digit_count += 1;
    }

    if digit_count != 9 {
        return false;
    }

    let area_ok = area != 0 && area != 666 && area < 900;
    area_ok && group != 0 && serial != 0
}
441
/// Validates the digit structure of a US phone number.
///
/// Non-digit characters are ignored. The input must contain either 10 digits,
/// or 11 digits starting with the country code `1`. The three-digit area code
/// must be at least 200 and must not end in `11` (e.g. 411, 911).
pub fn validate_phone(phone: &str) -> bool {
    // Fixed-size buffer: anything beyond 11 digits is rejected immediately.
    let mut digits = [0u32; 11];
    let mut len = 0usize;

    for digit in phone.chars().filter_map(|c| c.to_digit(10)) {
        let Some(slot) = digits.get_mut(len) else {
            return false;
        };
        *slot = digit;
        len += 1;
    }

    // Strip the optional leading country code to get the national number.
    let national = match (len, digits[0]) {
        (10, _) => &digits[..10],
        (11, 1) => &digits[1..11],
        _ => return false,
    };

    let area_code = national[0] * 100 + national[1] * 10 + national[2];
    area_code >= 200 && area_code % 100 != 11
}
492
/// Expected total IBAN length (in characters, excluding whitespace) for each
/// listed two-letter country code.
///
/// `validate_iban` does a linear lookup here. A country code that is not
/// listed is not rejected; it is only subject to the generic 15–34 length
/// range.
const IBAN_LENGTHS: &[(&str, usize)] = &[
    ("AL", 28),
    ("AD", 24),
    ("AT", 20),
    ("AZ", 28),
    ("BH", 22),
    ("BY", 28),
    ("BE", 16),
    ("BA", 20),
    ("BR", 29),
    ("BG", 22),
    ("CR", 22),
    ("HR", 21),
    ("CY", 28),
    ("CZ", 24),
    ("DK", 18),
    ("DO", 28),
    ("TL", 23),
    ("EE", 20),
    ("FO", 18),
    ("FI", 18),
    ("FR", 27),
    ("GE", 22),
    ("DE", 22),
    ("GI", 23),
    ("GR", 27),
    ("GL", 18),
    ("GT", 28),
    ("HU", 28),
    ("IS", 26),
    ("IQ", 23),
    ("IE", 22),
    ("IL", 23),
    ("IT", 27),
    ("JO", 30),
    ("KZ", 20),
    ("XK", 20),
    ("KW", 30),
    ("LV", 21),
    ("LB", 28),
    ("LI", 21),
    ("LT", 20),
    ("LU", 20),
    ("MK", 19),
    ("MT", 31),
    ("MR", 27),
    ("MU", 30),
    ("MC", 27),
    ("MD", 24),
    ("ME", 22),
    ("NL", 18),
    ("NO", 15),
    ("PK", 24),
    ("PS", 29),
    ("PL", 28),
    ("PT", 25),
    ("QA", 29),
    ("RO", 24),
    ("SM", 27),
    ("SA", 24),
    ("RS", 22),
    ("SC", 31),
    ("SK", 24),
    ("SI", 19),
    ("ES", 24),
    ("SE", 24),
    ("CH", 21),
    ("TN", 24),
    ("TR", 26),
    ("UA", 29),
    ("AE", 23),
    ("GB", 22),
    ("VA", 22),
    ("VG", 24),
];
569
570pub fn validate_iban(iban: &str) -> bool {
577 let mut first_four = [0u8; 4];
580 let mut first_four_idx = 0;
581 let mut total_len = 0;
582
583 for c in iban.chars() {
584 if c.is_whitespace() {
585 continue;
586 }
587 let upper = c.to_ascii_uppercase();
588
589 if total_len < 4 {
590 match total_len {
592 0 | 1 if !upper.is_ascii_alphabetic() => {
593 return false;
594 }
595 2 | 3 if !upper.is_ascii_digit() => {
596 return false;
597 }
598 _ => {}
599 }
600 first_four[first_four_idx] = upper as u8;
601 first_four_idx += 1;
602 }
603 total_len += 1;
604 }
605
606 if !(15..=34).contains(&total_len) {
608 return false;
609 }
610
611 let country_code = [first_four[0], first_four[1]];
614 for &(code, expected_len) in IBAN_LENGTHS.iter() {
615 if code.as_bytes() == country_code {
616 if total_len != expected_len {
617 return false;
618 }
619 break;
620 }
621 }
622
623 let mut remainder: u64 = 0;
629 let mut char_idx = 0;
630
631 for c in iban.chars() {
633 if c.is_whitespace() {
634 continue;
635 }
636 char_idx += 1;
637 if char_idx <= 4 {
638 continue; }
640
641 let upper = c.to_ascii_uppercase();
642 if upper.is_ascii_alphabetic() {
643 let value = (upper as u64) - ('A' as u64) + 10;
645 remainder = (remainder * 100 + value) % 97;
646 } else if let Some(d) = upper.to_digit(10) {
647 remainder = (remainder * 10 + d as u64) % 97;
648 }
649 }
650
651 for &byte in &first_four {
653 let c = byte as char;
654 if c.is_ascii_alphabetic() {
655 let value = (c as u64) - ('A' as u64) + 10;
656 remainder = (remainder * 100 + value) % 97;
657 } else if let Some(d) = c.to_digit(10) {
658 remainder = (remainder * 10 + d as u64) % 97;
659 }
660 }
661
662 remainder == 1
663}
664
665lazy_static! {
670 static ref RE_VISA: Regex = Regex::new(r"\b4\d{3}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")
673 .expect("RE_VISA is a valid regex pattern");
674 static ref RE_MASTERCARD: Regex = Regex::new(r"\b5[1-5]\d{2}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")
675 .expect("RE_MASTERCARD is a valid regex pattern");
676 static ref RE_AMEX: Regex = Regex::new(r"\b3[47]\d{2}[\s-]?\d{6}[\s-]?\d{5}\b")
677 .expect("RE_AMEX is a valid regex pattern");
678 static ref RE_DISCOVER: Regex = Regex::new(r"\b6(?:011|5\d{2})[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")
679 .expect("RE_DISCOVER is a valid regex pattern");
680
681 static ref RE_SSN_FORMATTED: Regex = Regex::new(r"\b\d{3}-\d{2}-\d{4}\b")
683 .expect("RE_SSN_FORMATTED is a valid regex pattern");
684 static ref RE_SSN_UNFORMATTED: Regex = Regex::new(r"\b\d{9}\b")
687 .expect("RE_SSN_UNFORMATTED is a valid regex pattern");
688
689 static ref RE_EMAIL: Regex = Regex::new(r"\b[a-zA-Z0-9._%+-]{1,64}@[a-zA-Z0-9.-]{1,253}\.[a-zA-Z]{2,10}\b")
691 .expect("RE_EMAIL is a valid regex pattern");
692
693 static ref RE_US_PHONE: Regex = Regex::new(r"\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b")
695 .expect("RE_US_PHONE is a valid regex pattern");
696 static ref RE_INTL_PHONE: Regex = Regex::new(r"\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}")
697 .expect("RE_INTL_PHONE is a valid regex pattern");
698
699 static ref RE_AWS_ACCESS_KEY: Regex = Regex::new(r"\b(AKIA[0-9A-Z]{16})\b")
701 .expect("RE_AWS_ACCESS_KEY is a valid regex pattern");
702 static ref RE_AWS_SECRET_KEY: Regex = Regex::new(r"\b([a-zA-Z0-9+/]{40})\b")
703 .expect("RE_AWS_SECRET_KEY is a valid regex pattern");
704 static ref RE_AWS_SESSION_TOKEN: Regex = Regex::new(r#"(?i)aws.{0,10}session.{0,10}token.{0,5}['"]?([A-Za-z0-9/+=]{100,})"#)
705 .expect("RE_AWS_SESSION_TOKEN is a valid regex pattern");
706
707 static ref RE_GENERIC_API_KEY: Regex = Regex::new(r"(?i)\b(?:api[_-]?key|apikey)[\s]*[=:]\s*['\x22]?([a-zA-Z0-9_-]{20,})['\x22]?")
709 .expect("RE_GENERIC_API_KEY is a valid regex pattern");
710 static ref RE_GITHUB_TOKEN: Regex = Regex::new(r"\b(gh[ps]_[a-zA-Z0-9]{36,})\b")
711 .expect("RE_GITHUB_TOKEN is a valid regex pattern");
712 static ref RE_GITHUB_FINE_GRAINED_PAT: Regex = Regex::new(r"\b(github_pat_[a-zA-Z0-9_]{22,})\b")
713 .expect("RE_GITHUB_FINE_GRAINED_PAT is a valid regex pattern");
714 static ref RE_STRIPE_KEY: Regex = Regex::new(r"\b((?:sk|pk|rk)_(?:live|test)_[a-zA-Z0-9]{24,})\b")
715 .expect("RE_STRIPE_KEY is a valid regex pattern");
716 static ref RE_GOOGLE_API_KEY: Regex = Regex::new(r"AIza[a-zA-Z0-9_-]{35}")
717 .expect("RE_GOOGLE_API_KEY is a valid regex pattern");
718
719 static ref RE_PASSWORD_URL: Regex = Regex::new(r"(?i)\b(?:password|passwd|pwd)=([^\s&]+)")
721 .expect("RE_PASSWORD_URL is a valid regex pattern");
722 static ref RE_PASSWORD_JSON: Regex = Regex::new(r#"(?i)"(?:password|passwd|pwd)"\s*:\s*"([^"]+)""#)
723 .expect("RE_PASSWORD_JSON is a valid regex pattern");
724
725 static ref RE_IBAN: Regex = Regex::new(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
727 .expect("RE_IBAN is a valid regex pattern");
728
729 static ref RE_IPV4: Regex = Regex::new(r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b")
731 .expect("RE_IPV4 is a valid regex pattern");
732
733 static ref RE_RSA_PRIVATE_KEY: Regex = Regex::new(r"-----BEGIN (?:RSA )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA )?PRIVATE KEY-----")
735 .expect("RE_RSA_PRIVATE_KEY is a valid regex pattern");
736 static ref RE_EC_PRIVATE_KEY: Regex = Regex::new(r"-----BEGIN EC PRIVATE KEY-----[\s\S]*?-----END EC PRIVATE KEY-----")
737 .expect("RE_EC_PRIVATE_KEY is a valid regex pattern");
738
739 static ref RE_JWT: Regex = Regex::new(r"\b(eyJ[a-zA-Z0-9_-]{10,}\.eyJ[a-zA-Z0-9_-]{10,}\.[a-zA-Z0-9_-]{20,})\b")
741 .expect("RE_JWT is a valid regex pattern");
742
743 static ref RE_MEDICAL_RECORD: Regex = Regex::new(r"(?i)\b(?:MRN|medical[_\s-]?record[_\s-]?(?:number|#|num))[\s:]*([A-Z0-9]{6,})")
745 .expect("RE_MEDICAL_RECORD is a valid regex pattern");
746
747 static ref PATTERNS: Vec<Pattern> = vec![
749 Pattern { name: "Visa Card", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_VISA, validator: Some(validate_credit_card) },
751 Pattern { name: "MasterCard", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_MASTERCARD, validator: Some(validate_credit_card) },
752 Pattern { name: "American Express", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_AMEX, validator: Some(validate_credit_card) },
753 Pattern { name: "Discover Card", data_type: SensitiveDataType::CreditCard, severity: PatternSeverity::Critical, regex: &RE_DISCOVER, validator: Some(validate_credit_card) },
754
755 Pattern { name: "SSN (formatted)", data_type: SensitiveDataType::Ssn, severity: PatternSeverity::Critical, regex: &RE_SSN_FORMATTED, validator: Some(validate_ssn) },
757 Pattern { name: "SSN (unformatted)", data_type: SensitiveDataType::Ssn, severity: PatternSeverity::Critical, regex: &RE_SSN_UNFORMATTED, validator: Some(validate_ssn) },
758
759 Pattern { name: "Email Address", data_type: SensitiveDataType::Email, severity: PatternSeverity::Medium, regex: &RE_EMAIL, validator: None },
761
762 Pattern { name: "US Phone Number", data_type: SensitiveDataType::Phone, severity: PatternSeverity::Medium, regex: &RE_US_PHONE, validator: Some(validate_phone) },
764 Pattern { name: "International Phone", data_type: SensitiveDataType::Phone, severity: PatternSeverity::Medium, regex: &RE_INTL_PHONE, validator: None },
765
766 Pattern { name: "AWS Access Key", data_type: SensitiveDataType::AwsKey, severity: PatternSeverity::Critical, regex: &RE_AWS_ACCESS_KEY, validator: None },
768 Pattern { name: "AWS Secret Key", data_type: SensitiveDataType::AwsKey, severity: PatternSeverity::Critical, regex: &RE_AWS_SECRET_KEY, validator: None },
769 Pattern { name: "AWS Session Token", data_type: SensitiveDataType::AwsKey, severity: PatternSeverity::Critical, regex: &RE_AWS_SESSION_TOKEN, validator: None },
770
771 Pattern { name: "Generic API Key", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::High, regex: &RE_GENERIC_API_KEY, validator: None },
773 Pattern { name: "GitHub Token", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::Critical, regex: &RE_GITHUB_TOKEN, validator: None },
774 Pattern { name: "GitHub Fine-grained PAT", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::Critical, regex: &RE_GITHUB_FINE_GRAINED_PAT, validator: None },
775 Pattern { name: "Stripe API Key", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::Critical, regex: &RE_STRIPE_KEY, validator: None },
776 Pattern { name: "Google API Key", data_type: SensitiveDataType::ApiKey, severity: PatternSeverity::High, regex: &RE_GOOGLE_API_KEY, validator: None },
777
778 Pattern { name: "Password in URL", data_type: SensitiveDataType::Password, severity: PatternSeverity::Critical, regex: &RE_PASSWORD_URL, validator: None },
780 Pattern { name: "Password in JSON", data_type: SensitiveDataType::Password, severity: PatternSeverity::Critical, regex: &RE_PASSWORD_JSON, validator: None },
781
782 Pattern { name: "IBAN", data_type: SensitiveDataType::Iban, severity: PatternSeverity::High, regex: &RE_IBAN, validator: Some(validate_iban) },
784
785 Pattern { name: "IPv4 Address", data_type: SensitiveDataType::IpAddress, severity: PatternSeverity::Low, regex: &RE_IPV4, validator: None },
787
788 Pattern { name: "RSA Private Key", data_type: SensitiveDataType::PrivateKey, severity: PatternSeverity::Critical, regex: &RE_RSA_PRIVATE_KEY, validator: None },
790 Pattern { name: "EC Private Key", data_type: SensitiveDataType::PrivateKey, severity: PatternSeverity::Critical, regex: &RE_EC_PRIVATE_KEY, validator: None },
791
792 Pattern { name: "JWT Token", data_type: SensitiveDataType::Jwt, severity: PatternSeverity::High, regex: &RE_JWT, validator: None },
794
795 Pattern { name: "Medical Record Number", data_type: SensitiveDataType::MedicalRecord, severity: PatternSeverity::Critical, regex: &RE_MEDICAL_RECORD, validator: None },
797 ];
798
799 static ref AC_PREFIXES: Vec<(&'static str, usize)> = vec![
824 ("4", 0), ("51", 1), ("52", 1), ("53", 1), ("54", 1), ("55", 1), ("34", 2), ("37", 2), ("6011", 3), ("65", 3), ("AKIA", 9), ("aws", 11), ("AWS", 11), ("api_key", 12), ("api-key", 12), ("apikey", 12), ("API_KEY", 12), ("ghp_", 13), ("ghs_", 13), ("github_pat_", 14), ("sk_live_", 15), ("sk_test_", 15), ("pk_live_", 15), ("pk_test_", 15), ("rk_live_", 15), ("AIza", 16), ("password=", 17), ("passwd=", 17), ("pwd=", 17), ("\"password\"", 18), ("\"passwd\"", 18), ("\"pwd\"", 18), ("-----BEGIN RSA PRIVATE KEY", 21),
847 ("-----BEGIN PRIVATE KEY", 21),
848 ("-----BEGIN EC PRIVATE KEY", 22),
849
850 ("eyJ", 23),
852
853 ("@", 6),
855 ];
856
857 static ref AC_COVERED_MASK: u32 = {
861 let mut mask: u32 = 0;
862 for &(_, idx) in AC_PREFIXES.iter() {
863 mask |= 1 << idx;
864 }
865 mask
866 };
867
868 static ref NON_AC_PATTERN_INDICES: Vec<usize> = vec![4, 5, 7, 8, 10, 19, 20, 24];
888
889 static ref NON_AC_PATTERN_MASK: u32 = {
892 let mut mask: u32 = 0;
893 for &idx in NON_AC_PATTERN_INDICES.iter() {
894 mask |= 1 << idx;
895 }
896 mask
897 };
898
899 static ref NON_AC_REGEX_SET: RegexSet = RegexSet::new([
901 r"\b\d{3}-\d{2}-\d{4}\b",
903 r"\b\d{9}\b",
905 r"\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b",
907 r"\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}",
909 r"\b[a-zA-Z0-9+/]{40}\b",
911 r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b",
913 r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
915 r"(?i)\b(?:MRN|medical[_\s-]?record[_\s-]?(?:number|#|num))[\s:]*[A-Z0-9]{6,}",
917 ]).expect("Failed to build non-AC RegexSet");
918
919 static ref SKIP_CONTENT_TYPES: Vec<&'static str> = vec![
921 "image/",
922 "audio/",
923 "video/",
924 "application/octet-stream",
925 "application/zip",
926 "application/gzip",
927 "application/x-gzip",
928 "application/x-tar",
929 "application/pdf",
930 "application/x-7z-compressed",
931 "application/x-rar-compressed",
932 "font/",
933 "model/",
934 ];
935}
936
/// Scanner that detects sensitive data in text using the built-in patterns
/// plus optional custom keywords.
pub struct DlpScanner {
    /// Validated configuration the scanner was built with.
    config: DlpConfig,
    /// Number of completed scans.
    total_scans: AtomicU64,
    /// Total number of matches reported by completed scans.
    total_matches: AtomicU64,
    /// Aho-Corasick automaton over the built-in literal prefixes followed by
    /// the custom keywords.
    automaton: AhoCorasick,
    /// Maps each automaton pattern id to `(is_custom, index)`: for built-in
    /// prefixes `index` is a position in `PATTERNS`; for custom keywords it is
    /// a position in `config.custom_keywords`.
    pattern_map: Vec<(bool, usize)>,
    /// Bounded buffer of the most recently recorded violations (oldest first).
    recent_violations: Arc<RwLock<VecDeque<DlpViolation>>>,
}
954
955impl Default for DlpScanner {
956 fn default() -> Self {
957 Self::new(DlpConfig::default())
958 }
959}
960
961impl DlpScanner {
962 pub fn new(config: DlpConfig) -> Self {
968 Self::try_new(config).expect("DLP config validation failed")
969 }
970
971 pub fn try_new(config: DlpConfig) -> Result<Self, DlpConfigError> {
974 config.validate()?;
976
977 let pattern_count = PATTERNS.len();
979
980 let mut patterns = Vec::new();
983 let mut pattern_map = Vec::new();
984
985 for (prefix, idx) in AC_PREFIXES.iter() {
986 patterns.push(prefix.to_string());
987 pattern_map.push((false, *idx));
988 }
989
990 if let Some(keywords) = &config.custom_keywords {
992 for (i, keyword) in keywords.iter().enumerate() {
993 patterns.push(keyword.clone());
994 pattern_map.push((true, i));
995 }
996 }
997
998 let automaton = AhoCorasickBuilder::new()
999 .match_kind(MatchKind::LeftmostFirst)
1000 .build(&patterns)
1001 .expect("Failed to build Aho-Corasick automaton");
1002
1003 log::debug!(
1004 "DLP scanner initialized with {} standard patterns and {} custom keywords",
1005 pattern_count,
1006 config
1007 .custom_keywords
1008 .as_ref()
1009 .map(|k| k.len())
1010 .unwrap_or(0)
1011 );
1012
1013 Ok(Self {
1014 config,
1015 total_scans: AtomicU64::new(0),
1016 total_matches: AtomicU64::new(0),
1017 automaton,
1018 pattern_map,
1019 recent_violations: Arc::new(RwLock::new(VecDeque::with_capacity(100))),
1020 })
1021 }
1022
1023 pub fn is_enabled(&self) -> bool {
1025 self.config.enabled
1026 }
1027
1028 pub fn record_violations(&self, result: &ScanResult, client_ip: Option<&str>, path: &str) {
1030 if !result.has_matches {
1031 return;
1032 }
1033
1034 let now = std::time::SystemTime::now()
1035 .duration_since(std::time::UNIX_EPOCH)
1036 .map(|d| d.as_millis() as u64)
1037 .unwrap_or(0);
1038
1039 let mut violations = match self.recent_violations.try_write() {
1040 Ok(guard) => guard,
1041 Err(_) => {
1042 return;
1044 }
1045 };
1046
1047 for m in &result.matches {
1048 if violations.len() >= 100 {
1049 violations.pop_front();
1050 }
1051
1052 violations.push_back(DlpViolation {
1053 timestamp: now,
1054 pattern_name: m.pattern_name.to_string(),
1055 data_type: m.data_type.as_str().to_string(),
1056 severity: m.severity.as_str().to_string(),
1057 masked_value: m.masked_value.clone(),
1058 client_ip: client_ip.map(|s| s.to_string()),
1059 path: path.to_string(),
1060 });
1061 }
1062 }
1063
1064 pub async fn get_recent_violations(&self) -> Vec<DlpViolation> {
1066 let violations = self.recent_violations.read().await;
1067 violations.iter().cloned().rev().collect()
1068 }
1069
1070 #[must_use = "scan results contain sensitive data findings that should be processed"]
1074 pub fn scan(&self, content: &str) -> ScanResult {
1075 if !self.config.enabled {
1076 return ScanResult::default();
1077 }
1078
1079 let start = Instant::now();
1080 let original_length = content.len();
1081
1082 if original_length > self.config.max_scan_size {
1084 return ScanResult {
1085 scanned: false,
1086 content_length: original_length,
1087 ..Default::default()
1088 };
1089 }
1090
1091 let (scan_content, truncated) = if original_length > self.config.max_body_inspection_bytes {
1093 let mut truncate_at = self.config.max_body_inspection_bytes;
1095 while truncate_at > 0 && !content.is_char_boundary(truncate_at) {
1096 truncate_at -= 1;
1097 }
1098 log::debug!(
1099 "DLP: Truncating {} bytes to {} for inspection",
1100 original_length,
1101 truncate_at
1102 );
1103 (&content[..truncate_at], true)
1104 } else {
1105 (content, false)
1106 };
1107
1108 let content_length = scan_content.len();
1109 let mut matches = Vec::new();
1110
1111 let mut ac_candidates: u32 = 0;
1115 for ac_match in self.automaton.find_iter(scan_content) {
1116 let pattern_id = ac_match.pattern().as_usize();
1117 if let Some((is_custom, idx)) = self.pattern_map.get(pattern_id) {
1118 if *is_custom {
1119 if matches.len() < self.config.max_matches {
1121 if let Some(keywords) = self.config.custom_keywords.as_ref() {
1122 if let Some(_keyword) = keywords.get(*idx) {
1123 matches.push(DlpMatch {
1124 pattern_name: "Custom Keyword",
1125 data_type: SensitiveDataType::Custom,
1126 severity: PatternSeverity::High,
1127 masked_value: "***".to_string(), start: ac_match.start(),
1129 end: ac_match.end(),
1130 stream_offset: None,
1131 });
1132 }
1133 }
1134 }
1135 } else {
1136 ac_candidates |= 1 << idx;
1138 }
1139 }
1140 }
1141
1142 let regex_set_matches = NON_AC_REGEX_SET.matches(scan_content);
1146 let mut non_ac_candidates: u32 = 0;
1147 for (set_idx, &pattern_idx) in NON_AC_PATTERN_INDICES.iter().enumerate() {
1148 if regex_set_matches.matched(set_idx) {
1149 non_ac_candidates |= 1 << pattern_idx;
1150 }
1151 }
1152
1153 const FAST_MODE_SKIP_PATTERNS: [usize; 4] = [6, 7, 8, 20];
1158
1159 'outer: for (pattern_idx, pattern) in PATTERNS.iter().enumerate() {
1160 if matches.len() >= self.config.max_matches {
1162 break 'outer;
1163 }
1164
1165 if self.config.fast_mode && FAST_MODE_SKIP_PATTERNS.contains(&pattern_idx) {
1167 continue;
1168 }
1169
1170 let pattern_bit = 1u32 << pattern_idx;
1171
1172 if (*AC_COVERED_MASK & pattern_bit) != 0 && (ac_candidates & pattern_bit) == 0 {
1175 continue;
1176 }
1177
1178 if (*NON_AC_PATTERN_MASK & pattern_bit) != 0 && (non_ac_candidates & pattern_bit) == 0 {
1181 continue;
1182 }
1183
1184 for m in pattern.regex.find_iter(scan_content) {
1185 if matches.len() >= self.config.max_matches {
1187 break 'outer;
1188 }
1189
1190 let matched_value = m.as_str();
1191
1192 if let Some(validator) = pattern.validator {
1194 if !validator(matched_value) {
1195 continue;
1196 }
1197 }
1198
1199 let masked = self.mask_value(matched_value, pattern.data_type);
1200
1201 matches.push(DlpMatch {
1202 pattern_name: pattern.name,
1203 data_type: pattern.data_type,
1204 severity: pattern.severity,
1205 masked_value: masked,
1206 start: m.start(),
1207 end: m.end(),
1208 stream_offset: None,
1209 });
1210 }
1211 }
1212
1213 let scan_time_us = start.elapsed().as_micros() as u64;
1214 let match_count = matches.len();
1215
1216 self.total_scans.fetch_add(1, Ordering::Relaxed);
1218 self.total_matches
1219 .fetch_add(match_count as u64, Ordering::Relaxed);
1220
1221 ScanResult {
1222 scanned: true,
1223 has_matches: !matches.is_empty(),
1224 matches,
1225 match_count,
1226 scan_time_us,
1227 content_length,
1228 truncated,
1229 original_length: if truncated { original_length } else { 0 },
1230 }
1231 }
1232
1233 #[must_use = "scan results contain sensitive data findings that should be processed"]
1235 pub fn scan_bytes(&self, data: &[u8]) -> ScanResult {
1236 match std::str::from_utf8(data) {
1237 Ok(content) => self.scan(content),
1238 Err(_) => ScanResult::default(),
1239 }
1240 }
1241
1242 pub fn is_scannable_content_type(&self, content_type: &str) -> bool {
1246 let ct_lower = content_type.to_lowercase();
1247
1248 for skip_type in SKIP_CONTENT_TYPES.iter() {
1250 if ct_lower.starts_with(skip_type) || ct_lower.contains(skip_type) {
1251 return false;
1252 }
1253 }
1254
1255 if ct_lower.starts_with("multipart/") {
1258 return false;
1259 }
1260
1261 let text_types = [
1263 "text/",
1264 "application/json",
1265 "application/xml",
1266 "application/x-www-form-urlencoded",
1267 "application/javascript",
1268 "application/ld+json",
1269 ];
1270
1271 text_types
1272 .iter()
1273 .any(|t| ct_lower.starts_with(t) || ct_lower.contains(t))
1274 }
1275
1276 pub fn should_skip_content_type(&self, content_type: &str) -> bool {
1278 !self.is_scannable_content_type(content_type)
1279 }
1280
1281 fn mask_value(&self, value: &str, data_type: SensitiveDataType) -> String {
1283 let mode = self
1284 .config
1285 .redaction
1286 .get(&data_type)
1287 .copied()
1288 .unwrap_or_default();
1289
1290 match mode {
1291 RedactionMode::None => return value.to_string(),
1292 RedactionMode::Hash => {
1293 let salt = self.config.hash_salt.as_deref().unwrap_or("");
1295 let mut hasher = Sha256::new();
1296 hasher.update(salt.as_bytes());
1297 hasher.update(value.as_bytes());
1298 return format!("sha256:{:x}", hasher.finalize());
1299 }
1300 RedactionMode::Full => return "*".repeat(value.len().min(20)),
1301 RedactionMode::Partial => {} }
1303
1304 match data_type {
1305 SensitiveDataType::CreditCard => {
1306 let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect();
1307 if digits.len() >= 4 {
1308 format!("****-****-****-{}", &digits[digits.len() - 4..])
1309 } else {
1310 "****-****-****-****".to_string()
1311 }
1312 }
1313 SensitiveDataType::Ssn => {
1314 let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect();
1315 if digits.len() >= 4 {
1316 format!("***-**-{}", &digits[digits.len() - 4..])
1317 } else {
1318 "***-**-****".to_string()
1319 }
1320 }
1321 SensitiveDataType::Email => {
1322 if let Some(at_idx) = value.find('@') {
1323 let (local, domain) = value.split_at(at_idx);
1324 let prefix = if local.len() >= 3 { &local[..3] } else { local };
1325 format!("{}***{}", prefix, domain)
1326 } else {
1327 "***@***.***".to_string()
1328 }
1329 }
1330 SensitiveDataType::Phone => {
1331 let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect();
1332 if digits.len() >= 4 {
1333 format!("***-***-{}", &digits[digits.len() - 4..])
1334 } else {
1335 "***-***-****".to_string()
1336 }
1337 }
1338 SensitiveDataType::Iban => {
1339 if value.len() >= 6 {
1340 format!("{}************{}", &value[..2], &value[value.len() - 4..])
1341 } else {
1342 "**************".to_string()
1343 }
1344 }
1345 SensitiveDataType::Jwt => "eyJ***.eyJ***.***".to_string(),
1346 SensitiveDataType::PrivateKey => "[PRIVATE KEY REDACTED]".to_string(),
1347 SensitiveDataType::AwsKey | SensitiveDataType::ApiKey => {
1348 if value.len() >= 4 {
1349 format!("{}...{}", &value[..4], &value[value.len() - 4..])
1350 } else {
1351 "********".to_string()
1352 }
1353 }
1354 SensitiveDataType::Password => "********".to_string(),
1355 SensitiveDataType::IpAddress => {
1356 let parts: Vec<&str> = value.split('.').collect();
1358 if parts.len() == 4 {
1359 format!("{}.***.***.{}", parts[0], parts[3])
1360 } else {
1361 "***.***.***.***".to_string()
1362 }
1363 }
1364 SensitiveDataType::MedicalRecord => "MRN: ********".to_string(),
1365 SensitiveDataType::Custom => "***".to_string(),
1366 }
1367 }
1368
1369 pub fn stats(&self) -> DlpStats {
1371 DlpStats {
1372 total_scans: self.total_scans.load(Ordering::Relaxed),
1373 total_matches: self.total_matches.load(Ordering::Relaxed),
1374 matches_by_type: HashMap::new(), matches_by_severity: HashMap::new(),
1376 }
1377 }
1378
1379 pub fn pattern_count(&self) -> usize {
1381 PATTERNS.len()
1382 }
1383}
1384
#[cfg(test)]
mod tests {
    use super::*;

    /// First match in `result` whose data type equals `wanted`, if any.
    fn first_of_type(result: &ScanResult, wanted: SensitiveDataType) -> Option<&DlpMatch> {
        result.matches.iter().find(|m| m.data_type == wanted)
    }

    /// First match in `result` reported under the pattern called `name`, if any.
    fn first_of_pattern<'a>(result: &'a ScanResult, name: &str) -> Option<&'a DlpMatch> {
        result.matches.iter().find(|m| m.pattern_name == name)
    }

    // ---------------------------------------------------------------------
    // Credit card validation
    // ---------------------------------------------------------------------

    #[test]
    fn test_luhn_valid_visa() {
        // Same number written plain, dash-separated and space-separated.
        for number in [
            "4532015112830366",
            "4532-0151-1283-0366",
            "4532 0151 1283 0366",
        ] {
            assert!(validate_credit_card(number), "{} should be accepted", number);
        }
    }

    #[test]
    fn test_luhn_valid_mastercard() {
        let number = "5425233430109903";
        assert!(validate_credit_card(number));
    }

    #[test]
    fn test_luhn_valid_amex() {
        let number = "374245455400126";
        assert!(validate_credit_card(number));
    }

    #[test]
    fn test_luhn_invalid() {
        // The last entry has only five digits.
        for number in ["1234567890123456", "0000000000000000", "12345"] {
            assert!(!validate_credit_card(number), "{} should be rejected", number);
        }
    }

    // ---------------------------------------------------------------------
    // SSN validation
    // ---------------------------------------------------------------------

    #[test]
    fn test_ssn_valid() {
        for ssn in ["123-45-6789", "123456789"] {
            assert!(validate_ssn(ssn), "{} should be accepted", ssn);
        }
    }

    #[test]
    fn test_ssn_invalid_area() {
        // Area numbers 000, 666 and 900.
        for ssn in ["000-45-6789", "666-45-6789", "900-45-6789"] {
            assert!(!validate_ssn(ssn), "{} should be rejected", ssn);
        }
    }

    #[test]
    fn test_ssn_invalid_group() {
        // Group number 00.
        let ssn = "123-00-6789";
        assert!(!validate_ssn(ssn));
    }

    #[test]
    fn test_ssn_invalid_serial() {
        // Serial number 0000.
        let ssn = "123-45-0000";
        assert!(!validate_ssn(ssn));
    }

    #[test]
    fn test_ssn_advertising_numbers() {
        let rejected = [
            // Inside 987-65-4320 ..= 987-65-4329.
            "987-65-4320",
            "987-65-4325",
            "987-65-4329",
            // Immediately outside that range; expected to be rejected as well.
            "987-65-4319",
            "987-65-4330",
            // Other 9xx area numbers.
            "900-12-3456",
            "999-99-9999",
        ];
        for ssn in rejected {
            assert!(!validate_ssn(ssn), "{} should be rejected", ssn);
        }
    }

    // ---------------------------------------------------------------------
    // Phone validation
    // ---------------------------------------------------------------------

    #[test]
    fn test_phone_valid() {
        let accepted = [
            "212-555-1234",
            "(212) 555-1234",
            "1-212-555-1234",
            "12125551234",
            "2125551234",
        ];
        for phone in accepted {
            assert!(validate_phone(phone), "{} should be accepted", phone);
        }
    }

    #[test]
    fn test_phone_invalid_length() {
        // Seven digits, then eleven digits without a leading 1.
        for phone in ["555-1234", "212-555-12345"] {
            assert!(!validate_phone(phone), "{} should be rejected", phone);
        }
    }

    #[test]
    fn test_phone_invalid_area_code() {
        // Area codes beginning with 0 and with 1.
        for phone in ["012-555-1234", "112-555-1234"] {
            assert!(!validate_phone(phone), "{} should be rejected", phone);
        }
    }

    #[test]
    fn test_phone_service_codes() {
        // 411, 911 and 611 used as the area code.
        for phone in ["411-555-1234", "911-555-1234", "611-555-1234"] {
            assert!(!validate_phone(phone), "{} should be rejected", phone);
        }
    }

    // ---------------------------------------------------------------------
    // IBAN validation
    // ---------------------------------------------------------------------

    #[test]
    fn test_iban_valid_de() {
        let iban = "DE89370400440532013000";
        assert!(validate_iban(iban));
    }

    #[test]
    fn test_iban_valid_gb() {
        let iban = "GB82WEST12345698765432";
        assert!(validate_iban(iban));
    }

    #[test]
    fn test_iban_valid_with_spaces() {
        let iban = "DE89 3704 0044 0532 0130 00";
        assert!(validate_iban(iban));
    }

    #[test]
    fn test_iban_invalid_checksum() {
        // Same account as the valid German IBAN but with check digits 00.
        let iban = "DE00370400440532013000";
        assert!(!validate_iban(iban));
    }

    #[test]
    fn test_iban_too_short() {
        let iban = "DE89370400";
        assert!(!validate_iban(iban));
    }

    // ---------------------------------------------------------------------
    // Scanner behaviour
    // ---------------------------------------------------------------------

    #[test]
    fn test_scanner_creation() {
        let scanner = DlpScanner::default();

        assert!(scanner.is_enabled());
        assert_eq!(scanner.pattern_count(), 25);
    }

    #[test]
    fn test_scanner_disabled() {
        let scanner = DlpScanner::new(DlpConfig {
            enabled: false,
            ..Default::default()
        });

        let result = scanner.scan("4532015112830366");
        assert!(!result.scanned);
    }

    #[test]
    fn test_scan_credit_card() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("My card is 4532015112830366");

        assert!(result.scanned);
        assert!(result.has_matches);
        assert_eq!(result.match_count, 1);

        let first = &result.matches[0];
        assert_eq!(first.data_type, SensitiveDataType::CreditCard);
        assert_eq!(first.severity, PatternSeverity::Critical);
    }

    #[test]
    fn test_scan_ssn() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("SSN: 123-45-6789");

        assert!(result.has_matches);
        let first = &result.matches[0];
        assert_eq!(first.data_type, SensitiveDataType::Ssn);
    }

    #[test]
    fn test_scan_email() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("Contact: user@example.com");

        assert!(result.has_matches);
        let first = &result.matches[0];
        assert_eq!(first.data_type, SensitiveDataType::Email);
        assert_eq!(first.severity, PatternSeverity::Medium);
    }

    #[test]
    fn test_scan_jwt() {
        let jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c";
        let text = format!("Token: {}", jwt);

        let scanner = DlpScanner::default();
        let result = scanner.scan(&text);

        assert!(result.has_matches);
        assert!(first_of_type(&result, SensitiveDataType::Jwt).is_some());
    }

    #[test]
    fn test_scan_aws_key() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE");

        assert!(result.has_matches);
        assert!(first_of_type(&result, SensitiveDataType::AwsKey).is_some());
    }

    #[test]
    fn test_scan_github_token() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("GITHUB_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");

        assert!(result.has_matches);
        assert!(first_of_pattern(&result, "GitHub Token").is_some());
    }

    #[test]
    fn test_scan_github_fine_grained_pat() {
        let scanner = DlpScanner::default();
        let result = scanner
            .scan("GITHUB_TOKEN=github_pat_11ABCDEFG0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");

        assert!(result.has_matches);
        assert!(
            first_of_pattern(&result, "GitHub Fine-grained PAT").is_some(),
            "Should detect GitHub fine-grained PAT"
        );
    }

    #[test]
    fn test_scan_stripe_keys() {
        let scanner = DlpScanner::default();

        // (input, failure message) for the secret, publishable and restricted key.
        let cases = [
            (
                "STRIPE_SECRET_KEY=sk_live_51ABCdefGHI123456789012345",
                "Should detect Stripe secret key",
            ),
            (
                "STRIPE_PK=pk_test_51ABCdefGHI123456789012345",
                "Should detect Stripe publishable key",
            ),
            (
                "STRIPE_RK=rk_live_51ABCdefGHI123456789012345",
                "Should detect Stripe restricted key",
            ),
        ];

        for (text, message) in cases {
            let result = scanner.scan(text);
            assert!(result.has_matches, "{}", message);
        }
    }

    #[test]
    fn test_scan_private_key() {
        let pem =
            "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEA...\n-----END RSA PRIVATE KEY-----";

        let scanner = DlpScanner::default();
        let result = scanner.scan(pem);

        assert!(result.has_matches);
        let first = &result.matches[0];
        assert_eq!(first.data_type, SensitiveDataType::PrivateKey);
    }

    #[test]
    fn test_scan_password_in_url() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("https://api.example.com/login?password=secret123");

        assert!(result.has_matches);
        assert!(first_of_type(&result, SensitiveDataType::Password).is_some());
    }

    #[test]
    fn test_scan_password_in_json() {
        let scanner = DlpScanner::default();
        let result = scanner.scan(r#"{"username": "admin", "password": "secret123"}"#);

        assert!(result.has_matches);
        assert!(first_of_type(&result, SensitiveDataType::Password).is_some());
    }

    #[test]
    fn test_scan_no_matches() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("This is just normal text with no sensitive data.");

        assert!(result.scanned);
        assert!(!result.has_matches);
        assert_eq!(result.match_count, 0);
    }

    #[test]
    fn test_scan_multiple_matches() {
        let text = "Card: 4532015112830366, SSN: 123-45-6789, Email: test@example.com";

        let scanner = DlpScanner::default();
        let result = scanner.scan(text);

        assert!(result.has_matches);
        assert!(result.match_count >= 3);
    }

    #[test]
    fn test_masking() {
        let scanner = DlpScanner::default();
        let result = scanner.scan("4532015112830366");

        assert!(result.has_matches);
        let masked = &result.matches[0].masked_value;
        assert!(masked.contains("****"));
        assert!(masked.ends_with("0366"));
    }

    #[test]
    fn test_content_type_detection() {
        let scanner = DlpScanner::default();

        for accepted in ["text/html", "application/json", "application/xml"] {
            assert!(
                scanner.is_scannable_content_type(accepted),
                "{} should be scannable",
                accepted
            );
        }
        for rejected in ["image/png", "application/octet-stream"] {
            assert!(
                !scanner.is_scannable_content_type(rejected),
                "{} should not be scannable",
                rejected
            );
        }
    }

    #[test]
    fn test_stats() {
        let scanner = DlpScanner::default();
        for text in ["Card: 4532015112830366", "No sensitive data here"] {
            scanner.scan(text);
        }

        let stats = scanner.stats();
        assert_eq!(stats.total_scans, 2);
        assert!(stats.total_matches >= 1);
    }

    #[test]
    fn test_custom_keywords() {
        let keywords = vec!["ProjectX".to_string(), "InternalID-123".to_string()];
        let scanner = DlpScanner::new(DlpConfig {
            enabled: true,
            custom_keywords: Some(keywords),
            ..Default::default()
        });

        let first_scan = scanner.scan("Confidential: ProjectX launch date is soon.");
        assert!(first_scan.has_matches);
        assert_eq!(first_scan.match_count, 1);
        assert_eq!(first_scan.matches[0].data_type, SensitiveDataType::Custom);

        let second_scan = scanner.scan("User ID: InternalID-123");
        assert!(second_scan.has_matches);
        assert_eq!(second_scan.matches[0].data_type, SensitiveDataType::Custom);
    }

    #[test]
    fn test_redaction_modes() {
        let mut redaction = HashMap::new();
        redaction.insert(SensitiveDataType::CreditCard, RedactionMode::Full);
        redaction.insert(SensitiveDataType::Email, RedactionMode::Hash);

        let scanner = DlpScanner::new(DlpConfig {
            enabled: true,
            redaction,
            hash_salt: Some("test-salt-for-hashing".to_string()),
            ..Default::default()
        });

        let result = scanner.scan("Card: 4532015112830366, Email: test@example.com");

        // Full redaction: nothing but asterisks, so the last four digits are gone.
        let card = first_of_type(&result, SensitiveDataType::CreditCard).unwrap();
        assert!(card.masked_value.chars().all(|c| c == '*'));
        assert!(!card.masked_value.contains("0366"));

        // Hash redaction: tagged digest, original text absent.
        let email = first_of_type(&result, SensitiveDataType::Email).unwrap();
        assert!(email.masked_value.starts_with("sha256:"));
        assert!(!email.masked_value.contains("test@example.com"));
    }

    // ---------------------------------------------------------------------
    // Size limits, timing and fast mode
    // ---------------------------------------------------------------------

    #[test]
    fn test_scan_performance() {
        let scanner = DlpScanner::new(DlpConfig {
            enabled: true,
            max_scan_size: 5 * 1024 * 1024,
            max_matches: 100,
            scan_text_only: true,
            max_body_inspection_bytes: 200 * 1024,
            fast_mode: false,
            ..Default::default()
        });

        // 1000 filler lines with a card number after every 100th line (10 cards).
        let mut content = String::with_capacity(100_000);
        for line_no in 0..1000 {
            content.push_str(&format!(
                "Line {}: This is normal text content.\n",
                line_no
            ));
            if line_no % 100 == 0 {
                content.push_str("Credit card: 4532015112830366\n");
            }
        }

        let result = scanner.scan(&content);

        // Debug builds get a much looser time budget than optimized builds.
        let max_time_us = if cfg!(debug_assertions) {
            75_000
        } else {
            5_000
        };

        assert!(
            result.scan_time_us < max_time_us,
            "Scan took {}μs, expected < {}μs for 100KB",
            result.scan_time_us,
            max_time_us
        );
        assert!(result.match_count >= 10);
    }

    #[test]
    fn test_truncation() {
        let scanner = DlpScanner::default();

        // One card number up front, followed by 500 lines of filler.
        let filler = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.\n".repeat(500);
        let mut content = String::from("Credit card: 4532015112830366\n");
        content.push_str(&filler);

        let result = scanner.scan(&content);

        assert!(result.truncated);
        assert!(result.original_length > result.content_length);
        assert!(result.has_matches);
        assert_eq!(result.match_count, 1);
    }

    #[test]
    fn test_fast_mode() {
        let content = r#"
 Critical data:
 Credit card: 4532015112830366
 SSN: 123-45-6789
 AWS Key: AKIAIOSFODNN7EXAMPLE

 Low-priority data (skipped in fast mode):
 Email: user@example.com
 Phone: (555) 123-4567
 IP: 192.168.1.1
 "#;

        let normal_result = DlpScanner::default().scan(content);

        let fast_scanner = DlpScanner::new(DlpConfig {
            fast_mode: true,
            ..Default::default()
        });
        let fast_result = fast_scanner.scan(content);

        assert!(
            normal_result.match_count > fast_result.match_count,
            "Normal mode ({}) should find more matches than fast mode ({})",
            normal_result.match_count,
            fast_result.match_count
        );

        assert!(
            fast_result.match_count >= 3,
            "Fast mode should find at least 3 critical matches, found {}",
            fast_result.match_count
        );

        // Data types that must be absent from the fast-mode result.
        let fast_types: Vec<_> = fast_result.matches.iter().map(|m| m.data_type).collect();
        let must_be_absent = [
            (
                SensitiveDataType::Email,
                "Fast mode should not detect emails",
            ),
            (
                SensitiveDataType::Phone,
                "Fast mode should not detect phone numbers",
            ),
            (
                SensitiveDataType::IpAddress,
                "Fast mode should not detect IP addresses",
            ),
        ];
        for (data_type, message) in must_be_absent {
            assert!(!fast_types.contains(&data_type), "{}", message);
        }
    }
}