1use regex::{Regex, RegexSet};
71use serde::{Deserialize, Serialize};
72
73pub mod redaction;
74
/// Literal prefixes that introduce the secret formats known to this crate.
///
/// Each entry lines up with one alternative of the pattern compiled by
/// `SecretDetector::baseline`; `"xox"` covers the `xox[abpsr]-` alternative.
// NOTE(review): the consumers of this list live outside this file (presumably a
// cheap substring pre-check in `redaction`) — confirm before reordering or
// removing entries.
pub(crate) const SECRET_PREFIXES: &[&str] = &[
    "Bearer ",
    "sk-",
    "pk-",
    "xox",
    "ghp_",
    "gho_",
    "ghs_",
    "ghu_",
    "github_pat_",
    "AKIA",
    "AIza",
];
95
96pub use redaction::{
97 REDACTED_MARKER, RedactionLevel, RedactionPolicy, redact_error, redact_for_observability,
98 redact_string, redact_value,
99};
100
101#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
110#[serde(rename_all = "snake_case")]
111#[non_exhaustive]
112pub enum PiiCategory {
113 Secret,
115 Email,
117 Phone,
119 CreditCard,
121 Cpf,
123 Cnpj,
125 Rg,
127 Cnh,
129 PixKey,
131 IpAddress,
133 Jwt,
135 Custom(String),
137}
138
139impl PiiCategory {
140 #[must_use]
143 pub const fn as_tag(&self) -> &str {
144 match self {
145 Self::Secret => "secret",
146 Self::Email => "email",
147 Self::Phone => "phone",
148 Self::CreditCard => "credit_card",
149 Self::Cpf => "cpf",
150 Self::Cnpj => "cnpj",
151 Self::Rg => "rg",
152 Self::Cnh => "cnh",
153 Self::PixKey => "pix_key",
154 Self::IpAddress => "ip_address",
155 Self::Jwt => "jwt",
156 Self::Custom(name) => name.as_str(),
157 }
158 }
159}
160
161#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
164pub struct PiiSpan {
165 pub start: usize,
167 pub end: usize,
169 pub category: PiiCategory,
171}
172
173impl PiiSpan {
174 #[must_use]
175 pub const fn new(start: usize, end: usize, category: PiiCategory) -> Self {
176 Self {
177 start,
178 end,
179 category,
180 }
181 }
182
183 #[must_use]
184 pub const fn len(&self) -> usize {
185 self.end.saturating_sub(self.start)
186 }
187
188 #[must_use]
189 pub const fn is_empty(&self) -> bool {
190 self.end <= self.start
191 }
192
193 #[must_use]
196 pub const fn overlaps(&self, other: &Self) -> bool {
197 self.start < other.end && other.start < self.end
198 }
199}
200
201pub trait PiiDetector: Send + Sync + std::fmt::Debug {
215 fn detect(&self, text: &str) -> Vec<PiiSpan>;
218}
219
220impl<T: PiiDetector + ?Sized> PiiDetector for Box<T> {
221 fn detect(&self, text: &str) -> Vec<PiiSpan> {
222 (**self).detect(text)
223 }
224}
225
226impl<T: PiiDetector + ?Sized> PiiDetector for std::sync::Arc<T> {
227 fn detect(&self, text: &str) -> Vec<PiiSpan> {
228 (**self).detect(text)
229 }
230}
231
232#[derive(Clone, Copy, Debug, Default)]
239pub struct NoopDetector;
240
241impl PiiDetector for NoopDetector {
242 fn detect(&self, _text: &str) -> Vec<PiiSpan> {
243 Vec::new()
244 }
245}
246
247#[derive(Debug)]
258pub struct SecretDetector {
259 pattern: Regex,
260}
261
262impl SecretDetector {
263 pub fn baseline() -> Result<Self, regex::Error> {
270 let pattern = Regex::new(
271 r"(?x)
272 (?:
273 \bBearer\s+[A-Za-z0-9._~+/=\-]{8,}
274 | \bsk-[A-Za-z0-9_\-]{16,}
275 | \bpk-[A-Za-z0-9_\-]{16,}
276 | \bxox[abpsr]-[A-Za-z0-9\-]{8,}
277 | \bghp_[A-Za-z0-9]{20,}
278 | \bgho_[A-Za-z0-9]{20,}
279 | \bghs_[A-Za-z0-9]{20,}
280 | \bghu_[A-Za-z0-9]{20,}
281 | \bgithub_pat_[A-Za-z0-9_]{20,}
282 | \bAKIA[A-Z0-9]{16}
283 | \bAIza[A-Za-z0-9_\-]{30,}
284 )
285 ",
286 )?;
287 Ok(Self { pattern })
288 }
289}
290
291impl PiiDetector for SecretDetector {
292 fn detect(&self, text: &str) -> Vec<PiiSpan> {
293 self.pattern
294 .find_iter(text)
295 .map(|m| PiiSpan::new(m.start(), m.end(), PiiCategory::Secret))
296 .collect()
297 }
298}
299
/// Entity kinds that `EntityDetector` can be asked to scan for.
///
/// The discriminant is used both as the bit position inside `CategorySet`
/// and as the index into `ENTITY_PATTERNS`, so the values must stay dense
/// and in this order.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
#[repr(u8)]
pub enum DetectCategory {
    /// E-mail addresses.
    Email = 0,
    /// `+`-prefixed phone numbers.
    Phone = 1,
    /// Payment card numbers.
    CreditCard = 2,
    /// CPF numbers.
    Cpf = 3,
    /// CNPJ numbers.
    Cnpj = 4,
    /// UUID-shaped Pix keys.
    PixUuid = 5,
    /// IPv4 addresses.
    Ipv4 = 6,
    /// JSON Web Tokens.
    Jwt = 7,
}

impl DetectCategory {
    /// Single-bit mask for this category: bit `n` for discriminant `n`.
    const fn mask(self) -> u16 {
        let bit = self as u8;
        1_u16 << bit
    }
}
328
329#[derive(Clone, Copy, Debug, Eq, PartialEq)]
335pub struct CategorySet(u16);
336
337impl CategorySet {
338 const ALL_MASK: u16 = DetectCategory::Email.mask()
339 | DetectCategory::Phone.mask()
340 | DetectCategory::CreditCard.mask()
341 | DetectCategory::Cpf.mask()
342 | DetectCategory::Cnpj.mask()
343 | DetectCategory::PixUuid.mask()
344 | DetectCategory::Ipv4.mask()
345 | DetectCategory::Jwt.mask();
346
347 #[must_use]
349 pub const fn all() -> Self {
350 Self(Self::ALL_MASK)
351 }
352
353 #[must_use]
355 pub const fn none() -> Self {
356 Self(0)
357 }
358
359 #[must_use]
361 pub const fn with(mut self, category: DetectCategory) -> Self {
362 self.0 |= category.mask();
363 self
364 }
365
366 #[must_use]
368 pub const fn without(mut self, category: DetectCategory) -> Self {
369 self.0 &= !category.mask();
370 self
371 }
372
373 #[must_use]
375 pub const fn contains(self, category: DetectCategory) -> bool {
376 self.0 & category.mask() != 0
377 }
378}
379
380impl Default for CategorySet {
381 fn default() -> Self {
382 Self::all()
383 }
384}
385
/// Regex source for each entity kind, indexed by `DetectCategory as usize`.
///
/// `EntityDetector::new` compiles each entry individually and also feeds the
/// whole array to a `RegexSet`; `should_scan` then looks up set matches by the
/// same index. The order here must therefore mirror the `DetectCategory`
/// discriminants exactly.
const ENTITY_PATTERNS: [&str; 8] = [
    // 0 — Email: local part, `@`, domain, dot, alphabetic TLD of 2+ letters.
    r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}",
    // 1 — Phone: `+`, a non-zero digit, then 7 to 14 more digits.
    r"\+[1-9]\d{7,14}",
    // 2 — CreditCard candidate: 13 to 19 digits, each optionally followed by
    //     one space or hyphen. Luhn validation happens after matching.
    r"\b(?:\d[ \-]?){12,18}\d\b",
    // 3 — Cpf candidate: `ddd.ddd.ddd-dd` or 11 bare digits (check digits
    //     are validated after matching).
    r"\b(?:\d{3}\.\d{3}\.\d{3}-\d{2}|\d{11})\b",
    // 4 — Cnpj candidate: `dd.ddd.ddd/dddd-dd` or 14 bare digits (check
    //     digits are validated after matching).
    r"\b(?:\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{14})\b",
    // 5 — PixUuid: hexadecimal 8-4-4-4-12 UUID layout.
    r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b",
    // 6 — Ipv4: dotted quad with every octet limited to 0–255.
    r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|1?\d\d?)\b",
    // 7 — Jwt: three dot-separated base64url segments, the first starting `eyJ`.
    r"\beyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b",
];
402
/// Regex-based detector for structured entities (e-mail, phone, card numbers,
/// CPF/CNPJ, UUID Pix keys, IPv4 addresses and JWTs).
///
/// Holds one compiled regex per entity kind plus a `RegexSet` over the same
/// patterns that is consulted first, so only kinds that can match are scanned.
#[derive(Debug)]
pub struct EntityDetector {
    /// Compiled `ENTITY_PATTERNS[DetectCategory::Email]`.
    email: Regex,
    /// Compiled `ENTITY_PATTERNS[DetectCategory::Phone]`.
    phone: Regex,
    /// Compiled `ENTITY_PATTERNS[DetectCategory::CreditCard]`.
    credit_card: Regex,
    /// Compiled `ENTITY_PATTERNS[DetectCategory::Cpf]`.
    cpf: Regex,
    /// Compiled `ENTITY_PATTERNS[DetectCategory::Cnpj]`.
    cnpj: Regex,
    /// Compiled `ENTITY_PATTERNS[DetectCategory::PixUuid]`.
    pix_uuid: Regex,
    /// Compiled `ENTITY_PATTERNS[DetectCategory::Ipv4]`.
    ipv4: Regex,
    /// Compiled `ENTITY_PATTERNS[DetectCategory::Jwt]`.
    jwt: Regex,
    /// All patterns in one set, in `ENTITY_PATTERNS` order.
    prefilter: RegexSet,
    /// Categories this detector is allowed to report.
    enabled: CategorySet,
}
427
428impl EntityDetector {
429 pub fn new(enabled: CategorySet) -> Result<Self, regex::Error> {
435 Ok(Self {
436 email: Regex::new(ENTITY_PATTERNS[DetectCategory::Email as usize])?,
437 phone: Regex::new(ENTITY_PATTERNS[DetectCategory::Phone as usize])?,
438 credit_card: Regex::new(ENTITY_PATTERNS[DetectCategory::CreditCard as usize])?,
439 cpf: Regex::new(ENTITY_PATTERNS[DetectCategory::Cpf as usize])?,
440 cnpj: Regex::new(ENTITY_PATTERNS[DetectCategory::Cnpj as usize])?,
441 pix_uuid: Regex::new(ENTITY_PATTERNS[DetectCategory::PixUuid as usize])?,
442 ipv4: Regex::new(ENTITY_PATTERNS[DetectCategory::Ipv4 as usize])?,
443 jwt: Regex::new(ENTITY_PATTERNS[DetectCategory::Jwt as usize])?,
444 prefilter: RegexSet::new(ENTITY_PATTERNS)?,
445 enabled,
446 })
447 }
448
449 fn should_scan(&self, matches: ®ex::SetMatches, category: DetectCategory) -> bool {
452 self.enabled.contains(category) && matches.matched(category as usize)
453 }
454
455 pub fn baseline() -> Result<Self, regex::Error> {
460 Self::new(CategorySet::all())
461 }
462}
463
464impl PiiDetector for EntityDetector {
465 fn detect(&self, text: &str) -> Vec<PiiSpan> {
466 let mut spans = Vec::new();
467
468 let matches = self.prefilter.matches(text);
471 if !matches.matched_any() {
472 return spans;
473 }
474
475 if self.should_scan(&matches, DetectCategory::Email) {
476 for m in self.email.find_iter(text) {
477 spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Email));
478 }
479 }
480
481 if self.should_scan(&matches, DetectCategory::Phone) {
482 for m in self.phone.find_iter(text) {
483 spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Phone));
484 }
485 }
486
487 if self.should_scan(&matches, DetectCategory::CreditCard) {
488 for m in self.credit_card.find_iter(text) {
489 push_credit_card_spans(m.as_str(), m.start(), &mut spans);
490 }
491 }
492
493 if self.should_scan(&matches, DetectCategory::Cpf) {
494 for m in self.cpf.find_iter(text) {
495 if cpf_is_valid(m.as_str()) {
496 spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Cpf));
497 }
498 }
499 }
500
501 if self.should_scan(&matches, DetectCategory::Cnpj) {
502 for m in self.cnpj.find_iter(text) {
503 if cnpj_is_valid(m.as_str()) {
504 spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Cnpj));
505 }
506 }
507 }
508
509 if self.should_scan(&matches, DetectCategory::PixUuid) {
510 for m in self.pix_uuid.find_iter(text) {
511 spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::PixKey));
512 }
513 }
514
515 if self.should_scan(&matches, DetectCategory::Ipv4) {
516 for m in self.ipv4.find_iter(text) {
517 spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::IpAddress));
518 }
519 }
520
521 if self.should_scan(&matches, DetectCategory::Jwt) {
522 for m in self.jwt.find_iter(text) {
523 spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Jwt));
524 }
525 }
526
527 spans
528 }
529}
530
531fn push_credit_card_spans(matched: &str, base: usize, out: &mut Vec<PiiSpan>) {
547 if luhn_is_valid(matched) {
548 out.push(PiiSpan::new(
549 base,
550 base + matched.len(),
551 PiiCategory::CreditCard,
552 ));
553 return;
554 }
555
556 let bytes = matched.as_bytes();
557 let digit_offsets: Vec<usize> = matched
558 .bytes()
559 .enumerate()
560 .filter(|(_, b)| b.is_ascii_digit())
561 .map(|(i, _)| i)
562 .collect();
563 let n = digit_offsets.len();
564
565 let is_group_start = |di: usize| -> bool {
566 let off = digit_offsets[di];
567 off == 0 || !bytes[off - 1].is_ascii_digit()
568 };
569
570 let mut di = 0;
571 while di < n {
572 if !is_group_start(di) {
573 di += 1;
574 continue;
575 }
576 let max_len = (n - di).min(19);
577 let mut emitted = None;
578 if max_len >= 13 {
579 for len in (13..=max_len).rev() {
580 let start_off = digit_offsets[di];
581 let end_off = digit_offsets[di + len - 1] + 1;
583 if luhn_is_valid(&matched[start_off..end_off]) {
584 out.push(PiiSpan::new(
585 base + start_off,
586 base + end_off,
587 PiiCategory::CreditCard,
588 ));
589 emitted = Some(len);
590 break;
591 }
592 }
593 }
594 di += emitted.unwrap_or(1);
596 }
597}
598
599#[derive(Debug)]
606pub struct CompositeDetector {
607 detectors: Vec<Box<dyn PiiDetector>>,
608 dedup: bool,
609}
610
611impl CompositeDetector {
612 #[must_use]
613 pub fn new(detectors: Vec<Box<dyn PiiDetector>>) -> Self {
614 Self {
615 detectors,
616 dedup: true,
617 }
618 }
619
620 #[must_use]
623 pub const fn without_dedup(mut self) -> Self {
624 self.dedup = false;
625 self
626 }
627}
628
629impl PiiDetector for CompositeDetector {
630 fn detect(&self, text: &str) -> Vec<PiiSpan> {
631 let mut spans: Vec<PiiSpan> = self.detectors.iter().flat_map(|d| d.detect(text)).collect();
632 if self.dedup {
633 dedup_overlapping(&mut spans);
634 }
635 spans
636 }
637}
638
639#[derive(Debug)]
642pub struct BaselineDetector {
643 inner: CompositeDetector,
644}
645
646impl BaselineDetector {
647 pub fn new() -> Result<Self, regex::Error> {
652 let secrets: Box<dyn PiiDetector> = Box::new(SecretDetector::baseline()?);
653 let entities: Box<dyn PiiDetector> = Box::new(EntityDetector::baseline()?);
654 Ok(Self {
655 inner: CompositeDetector::new(vec![secrets, entities]),
656 })
657 }
658}
659
660impl PiiDetector for BaselineDetector {
661 fn detect(&self, text: &str) -> Vec<PiiSpan> {
662 self.inner.detect(text)
663 }
664}
665
666pub fn dedup_overlapping(spans: &mut Vec<PiiSpan>) {
683 spans.sort_by(|a, b| a.start.cmp(&b.start).then_with(|| b.len().cmp(&a.len())));
684 let mut kept: Vec<PiiSpan> = Vec::with_capacity(spans.len());
685 for span in spans.drain(..) {
686 if span.is_empty() {
687 continue;
688 }
689 match kept.last_mut() {
690 Some(prev) if prev.end > span.start => {
693 prev.end = prev.end.max(span.end);
694 }
695 _ => kept.push(span),
696 }
697 }
698 *spans = kept;
699}
700
701#[must_use]
707pub fn mask_spans(text: &str, spans: &[PiiSpan]) -> String {
708 mask_with(text, spans, |span, _matched| {
709 format!("[REDACTED:{}]", span.category.as_tag())
710 })
711}
712
713#[must_use]
723pub fn mask_with<F>(text: &str, spans: &[PiiSpan], f: F) -> String
724where
725 F: Fn(&PiiSpan, &str) -> String,
726{
727 if spans_are_clean(spans) {
731 return mask_sorted(text, spans, &f);
732 }
733 let mut sorted = spans.to_vec();
734 dedup_overlapping(&mut sorted);
735 mask_sorted(text, &sorted, &f)
736}
737
738fn spans_are_clean(spans: &[PiiSpan]) -> bool {
742 spans.iter().all(|s| !s.is_empty()) && spans.windows(2).all(|w| w[0].end <= w[1].start)
743}
744
745fn mask_sorted<F>(text: &str, sorted: &[PiiSpan], f: &F) -> String
752where
753 F: Fn(&PiiSpan, &str) -> String,
754{
755 let mut out = String::with_capacity(text.len());
756 let mut cursor = 0;
757 for span in sorted {
758 if span.start < cursor {
759 continue;
760 }
761 let (Some(prefix), Some(matched)) =
762 (text.get(cursor..span.start), text.get(span.start..span.end))
763 else {
764 continue;
765 };
766 out.push_str(prefix);
767 out.push_str(&f(span, matched));
768 cursor = span.end;
769 }
770 if let Some(suffix) = text.get(cursor..) {
771 out.push_str(suffix);
772 }
773 out
774}
775
776#[must_use]
779pub fn mask_pan(pan: &str) -> String {
780 let digits: Vec<char> = pan.chars().filter(char::is_ascii_digit).collect();
781 if digits.len() < 4 {
782 return format!("[REDACTED:{}]", PiiCategory::CreditCard.as_tag());
783 }
784 let last_four: String = digits.iter().rev().take(4).rev().copied().collect();
785 format!("****-****-****-{last_four}")
786}
787
/// Luhn checksum test for a card-number candidate.
///
/// Non-digit characters (spaces, hyphens, …) are ignored. Returns `false`
/// unless the input holds between 13 and 19 digits and their Luhn sum is a
/// multiple of 10.
fn luhn_is_valid(s: &str) -> bool {
    let digits: Vec<u32> = s.chars().filter_map(|ch| ch.to_digit(10)).collect();
    if !(13..=19).contains(&digits.len()) {
        return false;
    }

    let mut checksum = 0_u32;
    // Walking right to left, every second digit is doubled, starting with the
    // one just left of the check digit.
    let mut double_this = false;
    for &digit in digits.iter().rev() {
        checksum += if double_this {
            let twice = digit * 2;
            // Subtracting 9 equals summing the two decimal digits of `twice`.
            if twice > 9 { twice - 9 } else { twice }
        } else {
            digit
        };
        double_this = !double_this;
    }
    checksum.is_multiple_of(10)
}
812
/// Validates a CPF candidate by its two check digits.
///
/// Non-digit characters are ignored, so both `ddd.ddd.ddd-dd` and bare digits
/// are accepted. The input must hold exactly 11 digits, must not be a single
/// digit repeated, and digits 10 and 11 must equal the mod-11 check values
/// computed over the first 9 and first 10 digits respectively.
fn cpf_is_valid(s: &str) -> bool {
    let parsed: Vec<u32> = s.chars().filter_map(|ch| ch.to_digit(10)).collect();

    // Split into the nine base digits and the two check digits.
    let [base @ .., first_check, second_check] = parsed.as_slice() else {
        return false;
    };
    if base.len() != 9 {
        return false;
    }
    // Repeated-digit sequences satisfy the checksum but are rejected.
    if parsed.iter().all(|digit| *digit == parsed[0]) {
        return false;
    }
    if mod11_cpf_check(base, 10) != *first_check {
        return false;
    }
    mod11_cpf_check(&parsed[..10], 11) == *second_check
}

/// Computes one CPF check digit.
///
/// Digits are weighted `weight_start`, `weight_start - 1`, … (never below 0);
/// the weighted sum modulo 11 yields 0 when the remainder is below 2 and
/// `11 - remainder` otherwise.
fn mod11_cpf_check(slice: &[u32], weight_start: u32) -> u32 {
    let mut weight = weight_start;
    let mut total = 0_u32;
    for &digit in slice {
        total += digit * weight;
        weight = weight.saturating_sub(1);
    }
    match total % 11 {
        0 | 1 => 0,
        rem => 11 - rem,
    }
}
841
/// Validates a CNPJ candidate by its two check digits.
///
/// Non-digit characters are ignored, so both `dd.ddd.ddd/dddd-dd` and bare
/// digits are accepted. The input must hold exactly 14 digits, must not be a
/// single digit repeated, and digits 13 and 14 must equal the weighted mod-11
/// check values computed over the first 12 and first 13 digits respectively.
fn cnpj_is_valid(s: &str) -> bool {
    const WEIGHTS1: [u32; 12] = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
    const WEIGHTS2: [u32; 13] = [6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];

    let parsed: Vec<u32> = s.chars().filter_map(|ch| ch.to_digit(10)).collect();

    // Split into the twelve base digits and the two check digits.
    let [base @ .., first_check, second_check] = parsed.as_slice() else {
        return false;
    };
    if base.len() != 12 {
        return false;
    }
    // Repeated-digit sequences are rejected outright.
    if parsed.iter().all(|digit| *digit == parsed[0]) {
        return false;
    }
    if weighted_mod11(base, &WEIGHTS1) != *first_check {
        return false;
    }
    weighted_mod11(&parsed[..13], &WEIGHTS2) == *second_check
}

/// Computes a mod-11 check digit from digits paired with explicit weights.
///
/// Pairs beyond the shorter of the two slices are ignored. The weighted sum
/// modulo 11 yields 0 when the remainder is below 2 and `11 - remainder`
/// otherwise.
fn weighted_mod11(slice: &[u32], weights: &[u32]) -> u32 {
    let total = slice
        .iter()
        .zip(weights)
        .fold(0_u32, |acc, (digit, weight)| acc + digit * weight);
    match total % 11 {
        0 | 1 => 0,
        rem => 11 - rem,
    }
}
872
// Unit tests for the categories, spans, detectors, span normalisation, masking
// helpers and checksum validators defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    // Result type for tests that build regex-backed detectors, so compile
    // failures propagate with `?` instead of panicking.
    type TestResult = Result<(), regex::Error>;

    // --- PiiCategory ------------------------------------------------------

    // Tags are embedded in redaction markers, so the literals are pinned here.
    #[test]
    fn category_as_tag_returns_stable_strings() {
        assert_eq!(PiiCategory::Secret.as_tag(), "secret");
        assert_eq!(PiiCategory::Email.as_tag(), "email");
        assert_eq!(PiiCategory::Phone.as_tag(), "phone");
        assert_eq!(PiiCategory::CreditCard.as_tag(), "credit_card");
        assert_eq!(PiiCategory::Cpf.as_tag(), "cpf");
        assert_eq!(PiiCategory::Cnpj.as_tag(), "cnpj");
        assert_eq!(PiiCategory::PixKey.as_tag(), "pix_key");
        assert_eq!(PiiCategory::IpAddress.as_tag(), "ip_address");
        assert_eq!(PiiCategory::Jwt.as_tag(), "jwt");
        assert_eq!(PiiCategory::Custom("org_id".to_owned()).as_tag(), "org_id");
    }

    // A unit variant serialises as a bare snake_case JSON string and round-trips.
    #[test]
    fn unit_category_serialises_as_snake_case_string() -> serde_json::Result<()> {
        let json = serde_json::to_string(&PiiCategory::Email)?;
        assert_eq!(json, r#""email""#);
        let back: PiiCategory = serde_json::from_str(&json)?;
        assert_eq!(back, PiiCategory::Email);
        Ok(())
    }

    // The data-carrying `Custom` variant survives a serialise/deserialise cycle.
    #[test]
    fn custom_category_round_trips() -> serde_json::Result<()> {
        let original = PiiCategory::Custom("account_key".to_owned());
        let json = serde_json::to_string(&original)?;
        let back: PiiCategory = serde_json::from_str(&json)?;
        assert_eq!(back, original);
        Ok(())
    }

    // --- PiiSpan ----------------------------------------------------------

    // `len` is `end - start`; a zero-width span is empty.
    #[test]
    fn span_len_and_is_empty() {
        let s = PiiSpan::new(5, 10, PiiCategory::Email);
        assert_eq!(s.len(), 5);
        assert!(!s.is_empty());
        let z = PiiSpan::new(5, 5, PiiCategory::Email);
        assert!(z.is_empty());
    }

    // Spans are half-open: `a` and `c` touch at offset 5 but share no byte.
    #[test]
    fn span_overlaps_detects_shared_bytes_only() {
        let a = PiiSpan::new(0, 5, PiiCategory::Email);
        let b = PiiSpan::new(3, 8, PiiCategory::Email);
        let c = PiiSpan::new(5, 10, PiiCategory::Email);
        assert!(a.overlaps(&b));
        assert!(!a.overlaps(&c));
        // The relation is symmetric.
        assert!(!c.overlaps(&a));
    }

    // --- NoopDetector -----------------------------------------------------

    // Even text that other detectors would flag yields nothing.
    #[test]
    fn noop_detector_finds_nothing() {
        let d = NoopDetector;
        assert!(d.detect("sk-abc123 email a@b.co").is_empty());
    }

    // --- SecretDetector ---------------------------------------------------

    // One sample per token family; each must produce exactly one Secret span.
    #[test]
    fn secret_detector_detects_common_prefixes() -> TestResult {
        let d = SecretDetector::baseline()?;
        let cases = [
            "Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.payload.sig",
            "key=sk-abcdefghijklmnopqrstuv",
            "GH token ghp_abcdefghijklmnopqrstuvwxyz",
            "AWS AKIAIOSFODNN7EXAMPLE",
            "xoxb-1234567890-slack",
            "GOOGLE_KEY=AIzaSyA-abcdefghijklmnopqrstuvwxyz123",
        ];
        for text in cases {
            let spans = d.detect(text);
            assert_eq!(spans.len(), 1, "expected 1 span in {text:?}, got {spans:?}");
            assert_eq!(spans[0].category, PiiCategory::Secret);
        }
        Ok(())
    }

    #[test]
    fn secret_detector_ignores_non_secret_text() -> TestResult {
        let d = SecretDetector::baseline()?;
        assert!(d.detect("just some ordinary prose").is_empty());
        // Right prefix, but shorter than the 16 characters required after `sk-`.
        assert!(d.detect("sk-short").is_empty());
        Ok(())
    }

    // --- EntityDetector: email / phone -----------------------------------

    #[test]
    fn detects_email() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("please email me at ana.silva+tag@example.com tomorrow");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Email);
        Ok(())
    }

    #[test]
    fn detects_e164_phone() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("call +5511987654321 for support");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Phone);
        Ok(())
    }

    // Only the Phone category is enabled, so the bare digits cannot be picked
    // up by another pattern; without the leading `+` nothing matches.
    #[test]
    fn non_e164_phone_not_detected() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Phone))?;
        assert!(d.detect("call 11987654321").is_empty());
        Ok(())
    }

    // --- EntityDetector: credit cards ------------------------------------

    // Counts only CreditCard spans because the baseline runs every category.
    #[test]
    fn detects_luhn_valid_pan() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("card 4111 1111 1111 1111 expires soon");
        let pan_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::CreditCard)
            .count();
        assert_eq!(pan_count, 1);
        Ok(())
    }

    // Sixteen digits in card layout that fail the Luhn check must be ignored.
    #[test]
    fn rejects_luhn_invalid_pan() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        let spans = d.detect("card 1234 5678 9012 3456");
        assert!(spans.is_empty(), "Luhn-invalid PAN leaked: {spans:?}");
        Ok(())
    }

    // Hyphen-separated layout is accepted as well as space-separated.
    #[test]
    fn detects_mastercard_test_pan() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        let spans = d.detect("5500-0000-0000-0004");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::CreditCard);
        Ok(())
    }

    // --- EntityDetector: CPF ---------------------------------------------

    #[test]
    fn detects_valid_cpf_formatted() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("meu CPF é 111.444.777-35 ok?");
        let cpf_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::Cpf)
            .count();
        assert_eq!(cpf_count, 1);
        Ok(())
    }

    #[test]
    fn detects_valid_cpf_unformatted() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cpf))?;
        let spans = d.detect("cpf 11144477735 confere");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Cpf);
        Ok(())
    }

    #[test]
    fn rejects_invalid_cpf() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cpf))?;
        // Wrong check digits, unformatted.
        assert!(d.detect("cpf 12345678900").is_empty());
        // A single repeated digit is rejected by the validator.
        assert!(d.detect("cpf 11111111111").is_empty());
        // Wrong check digits, formatted.
        assert!(d.detect("cpf 123.456.789-00").is_empty());
        Ok(())
    }

    // --- EntityDetector: CNPJ --------------------------------------------

    #[test]
    fn detects_valid_cnpj_formatted() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("CNPJ 11.222.333/0001-81 registered");
        let cnpj_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::Cnpj)
            .count();
        assert_eq!(cnpj_count, 1);
        Ok(())
    }

    #[test]
    fn detects_valid_cnpj_unformatted() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cnpj))?;
        let spans = d.detect("cnpj 11222333000181 ok");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Cnpj);
        Ok(())
    }

    // Wrong check digits and a repeated-digit sequence are both rejected.
    #[test]
    fn rejects_invalid_cnpj() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cnpj))?;
        assert!(d.detect("cnpj 12345678000100").is_empty());
        assert!(d.detect("cnpj 11111111111111").is_empty());
        Ok(())
    }

    // --- EntityDetector: Pix UUID / IPv4 / JWT ---------------------------

    // UUID-shaped keys are reported under the PixKey category.
    #[test]
    fn detects_pix_uuid_key() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("pix key 123e4567-e89b-12d3-a456-426614174000 configurada");
        let pix_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::PixKey)
            .count();
        assert_eq!(pix_count, 1);
        Ok(())
    }

    #[test]
    fn detects_ipv4() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Ipv4))?;
        let spans = d.detect("request from 192.168.1.100 blocked");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::IpAddress);
        Ok(())
    }

    // Octets above 255 must not match, not even partially.
    #[test]
    fn rejects_out_of_range_ipv4_octets() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Ipv4))?;
        assert!(d.detect("999.999.999.999").is_empty());
        Ok(())
    }

    // The signature segment uses `-` and `_` to exercise the base64url alphabet.
    #[test]
    fn detects_jwt() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Jwt))?;
        let jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.abc-_def";
        let spans = d.detect(&format!("token: {jwt} here"));
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Jwt);
        Ok(())
    }

    // With an empty category set nothing is reported even though the
    // prefilter matches the text.
    #[test]
    fn disabled_categories_are_skipped() -> TestResult {
        let d = EntityDetector::new(CategorySet::none())?;
        assert!(d.detect("a@b.co and 111.444.777-35").is_empty());
        Ok(())
    }

    // --- CompositeDetector / BaselineDetector ----------------------------

    // Spans from both child detectors end up in the combined result.
    #[test]
    fn composite_merges_detectors() -> TestResult {
        let secrets: Box<dyn PiiDetector> = Box::new(SecretDetector::baseline()?);
        let entities: Box<dyn PiiDetector> = Box::new(EntityDetector::baseline()?);
        let composite = CompositeDetector::new(vec![secrets, entities]);
        let text = "login=a@b.co key=sk-abcdefghijklmnopqrstuv";
        let spans = composite.detect(text);
        assert_eq!(spans.len(), 2);
        let categories: Vec<&PiiCategory> = spans.iter().map(|s| &s.category).collect();
        // Checked by membership so the test does not depend on span order.
        assert!(categories.contains(&&PiiCategory::Email));
        assert!(categories.contains(&&PiiCategory::Secret));
        Ok(())
    }

    #[test]
    fn composite_dedupes_overlapping_spans() {
        // Stub detector that flags the entire input with a fixed category.
        #[derive(Debug)]
        struct Always(PiiCategory);
        impl PiiDetector for Always {
            fn detect(&self, text: &str) -> Vec<PiiSpan> {
                if text.is_empty() {
                    Vec::new()
                } else {
                    vec![PiiSpan::new(0, text.len(), self.0.clone())]
                }
            }
        }

        // Two identical full-text spans must collapse into one.
        let composite = CompositeDetector::new(vec![
            Box::new(Always(PiiCategory::Email)),
            Box::new(Always(PiiCategory::Secret)),
        ]);
        let spans = composite.detect("hello");
        assert_eq!(spans.len(), 1);
    }

    #[test]
    fn composite_without_dedup_preserves_overlaps() {
        // Stub detector that flags the entire input with a fixed category.
        #[derive(Debug)]
        struct Always(PiiCategory);
        impl PiiDetector for Always {
            fn detect(&self, text: &str) -> Vec<PiiSpan> {
                if text.is_empty() {
                    Vec::new()
                } else {
                    vec![PiiSpan::new(0, text.len(), self.0.clone())]
                }
            }
        }

        // With merging switched off both overlapping spans are returned.
        let composite = CompositeDetector::new(vec![
            Box::new(Always(PiiCategory::Email)),
            Box::new(Always(PiiCategory::Secret)),
        ])
        .without_dedup();
        let spans = composite.detect("hello");
        assert_eq!(spans.len(), 2);
    }

    // End-to-end check of the baseline pipeline; spans are sorted by start so
    // the expected categories follow their order of appearance in the text.
    #[test]
    fn baseline_finds_mixed_pii() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "email: a@b.co, CPF: 111.444.777-35, key: sk-abcdefghijklmnopqrstuv";
        let mut spans = d.detect(text);
        spans.sort_by_key(|s| s.start);
        let kinds: Vec<&PiiCategory> = spans.iter().map(|s| &s.category).collect();
        assert_eq!(
            kinds,
            vec![&PiiCategory::Email, &PiiCategory::Cpf, &PiiCategory::Secret,]
        );
        Ok(())
    }

    // --- dedup_overlapping ------------------------------------------------

    // Two spans start at 0; the longer one (Secret) is the survivor.
    #[test]
    fn dedup_keeps_longest_on_overlap() {
        let mut spans = vec![
            PiiSpan::new(0, 5, PiiCategory::Email),
            PiiSpan::new(0, 8, PiiCategory::Secret),
            PiiSpan::new(10, 15, PiiCategory::Phone),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].category, PiiCategory::Secret);
        assert_eq!(spans[1].category, PiiCategory::Phone);
    }

    #[test]
    fn dedup_drops_empty_spans() {
        let mut spans = vec![
            PiiSpan::new(5, 5, PiiCategory::Email),
            PiiSpan::new(10, 15, PiiCategory::Phone),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 1);
    }

    #[test]
    fn dedup_preserves_non_overlapping() {
        let mut spans = vec![
            PiiSpan::new(0, 3, PiiCategory::Email),
            PiiSpan::new(5, 8, PiiCategory::Phone),
            PiiSpan::new(10, 15, PiiCategory::Cpf),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 3);
    }

    // --- mask_spans / mask_with ------------------------------------------

    #[test]
    fn mask_spans_produces_type_tagged_markers() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "email a@b.co please";
        let spans = d.detect(text);
        let masked = mask_spans(text, &spans);
        assert_eq!(masked, "email [REDACTED:email] please");
        Ok(())
    }

    #[test]
    fn mask_spans_preserves_text_without_pii() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "no pii here, just prose";
        let masked = mask_spans(text, &d.detect(text));
        assert_eq!(masked, text);
        Ok(())
    }

    #[test]
    fn mask_spans_handles_multiple_spans_in_order() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "a@b.co then c@d.co";
        let masked = mask_spans(text, &d.detect(text));
        assert_eq!(masked, "[REDACTED:email] then [REDACTED:email]");
        Ok(())
    }

    // The callback sees the matched text, enabling a mask that keeps the
    // card's last four digits.
    #[test]
    fn mask_with_supports_format_preserving_pan_mask() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        let text = "card 4111 1111 1111 1111 thanks";
        let spans = d.detect(text);
        let masked = mask_with(text, &spans, |span, matched| {
            if span.category == PiiCategory::CreditCard {
                mask_pan(matched)
            } else {
                format!("[REDACTED:{}]", span.category.as_tag())
            }
        });
        assert!(masked.contains("****-****-****-1111"), "got: {masked}");
        Ok(())
    }

    #[test]
    fn mask_with_skips_non_boundary_spans_silently() {
        // Assuming a precomposed `é` (two UTF-8 bytes, offsets 0..2), offset 1
        // falls inside the character, so the span cannot be sliced.
        let text = "é abc";
        let spans = vec![PiiSpan::new(1, 3, PiiCategory::Email)];
        let masked = mask_with(text, &spans, |_, _| "X".to_owned());
        // The call must not panic and the untouched text must survive.
        assert!(masked.contains("abc"));
    }

    #[test]
    fn mask_with_skips_span_with_valid_start_invalid_end() {
        // Assuming a precomposed `é` at bytes 3..5: the start (1) is a valid
        // boundary, the end (4) is not.
        let text = "ab é";
        let spans = vec![PiiSpan::new(1, 4, PiiCategory::Email)];
        let masked = mask_with(text, &spans, |_, _| "X".to_owned());
        // The span is skipped as a whole; nothing is partially replaced.
        assert_eq!(masked, "ab é");
    }

    // --- Credit-card sub-window scanning ---------------------------------

    #[test]
    fn detects_pan_followed_by_trailing_digits() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        // The trailing "150" is absorbed into the regex candidate, and the
        // 19-digit run fails Luhn as a whole; the 16-digit card inside it must
        // still be found.
        let text = "card 4111 1111 1111 1111 150";
        let spans = d.detect(text);
        let pan_spans = spans
            .iter()
            .filter(|s| s.category == PiiCategory::CreditCard)
            .count();
        assert_eq!(pan_spans, 1, "expected the embedded PAN: {spans:?}");
        let masked = mask_spans(text, &spans);
        assert!(
            !masked.contains("4111 1111 1111 1111"),
            "PAN leaked: {masked}"
        );
        assert!(masked.contains("[REDACTED:credit_card]"), "got: {masked}");
        Ok(())
    }

    // The sub-window fallback must not turn a Luhn-invalid digit run into a
    // false positive.
    #[test]
    fn sequential_filler_digits_do_not_false_positive() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        assert!(d.detect("order 1234 5678 9012 3456 processed").is_empty());
        Ok(())
    }

    // Text matching none of the patterns takes the prefilter early exit.
    #[test]
    fn entity_detector_clean_string_via_prefilter() -> TestResult {
        let d = EntityDetector::baseline()?;
        assert!(
            d.detect("a perfectly ordinary sentence with no pii")
                .is_empty()
        );
        Ok(())
    }

    // --- Overlap merging must not expose the tail ------------------------

    #[test]
    fn dedup_merges_overlapping_tail_instead_of_dropping() {
        // The second span starts inside the first but extends past its end.
        let mut spans = vec![
            PiiSpan::new(0, 6, PiiCategory::Secret),
            PiiSpan::new(4, 12, PiiCategory::Email),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].start, 0);
        assert_eq!(spans[0].end, 12, "tail must be covered, not dropped");
        // The earlier span's category is the one that is kept.
        assert_eq!(spans[0].category, PiiCategory::Secret);
    }

    #[test]
    fn mask_with_overlapping_spans_leaks_no_tail() {
        let text = "Bearer abc123def.ana@example.com";
        // Hand-built overlapping spans: a secret over bytes 0..20 and an email
        // over bytes 17..32 (the text is 32 bytes long).
        let spans = vec![
            PiiSpan::new(0, 20, PiiCategory::Secret),
            PiiSpan::new(17, 32, PiiCategory::Email),
        ];
        let masked = mask_spans(text, &spans);
        assert!(!masked.contains("@example.com"), "tail leaked: {masked}");
    }

    // --- mask_pan ---------------------------------------------------------

    // Separators in the input do not affect the fixed output layout.
    #[test]
    fn mask_pan_keeps_last_four() {
        assert_eq!(mask_pan("4111 1111 1111 1111"), "****-****-****-1111");
        assert_eq!(mask_pan("4111111111111111"), "****-****-****-1111");
        assert_eq!(mask_pan("4111-1111-1111-1234"), "****-****-****-1234");
    }

    // Fewer than four digits: fall back to the generic marker.
    #[test]
    fn mask_pan_falls_back_for_too_few_digits() {
        assert_eq!(mask_pan("abc"), "[REDACTED:credit_card]");
        assert_eq!(mask_pan("12"), "[REDACTED:credit_card]");
    }

    // --- Checksum validators ---------------------------------------------

    // 10 digits is below the 13-digit minimum, 20 is above the 19-digit maximum.
    #[test]
    fn luhn_rejects_wrong_length() {
        assert!(!luhn_is_valid("1234567890"));
        assert!(!luhn_is_valid("12345678901234567890"));
    }

    #[test]
    fn cpf_validator_accepts_known_good() {
        assert!(cpf_is_valid("111.444.777-35"));
        assert!(cpf_is_valid("11144477735"));
    }

    #[test]
    fn cnpj_validator_accepts_known_good() {
        assert!(cnpj_is_valid("11.222.333/0001-81"));
        assert!(cnpj_is_valid("11222333000181"));
    }
}