Skip to main content

agent_sdk_foundation/
privacy.rs

1//! PII detection and masking primitives.
2//!
3//! This module defines the [`PiiDetector`] trait and baseline
4//! implementations that locate entity-level PII (emails, phone
5//! numbers, credit cards, Brazilian identifiers) in plain text.
6//! Detection returns byte-offset spans so callers can mask, tokenize,
7//! or flag content without coupling to a single masking strategy.
8//!
9//! # Relationship to audit redaction
10//!
11//! This module also ships a key-name based [`RedactionPolicy`]
12//! (in [`redaction`]) that matches JSON fields like `password` or
13//! `api_key` and masks their values. The two layers are complementary:
14//! the policy handles structural redaction by key, and a
15//! [`PiiDetector`] scans any remaining string values for freeform
16//! entities (a card number dropped into a prompt, a CPF mentioned
17//! inside a tool response, a bearer token leaking into an error
18//! message).
19//!
20//! Integrations typically compose the two via
21//! [`redact_for_observability`].
22//!
23//! # Categories
24//!
25//! [`PiiCategory`] enumerates the well-known types plus a
26//! [`Custom`](PiiCategory::Custom) escape hatch for project-specific
27//! categories. Detectors emit spans tagged with a category so
28//! downstream masking can preserve type information (e.g. render
29//! `[REDACTED:email]` instead of a generic marker).
30//!
31//! # Built-in detectors
32//!
33//! - [`NoopDetector`] — detects nothing.
//! - [`SecretDetector`] — recognises credential prefixes (`Bearer`,
//!   `sk-`/`pk-`, Slack `xox…`, GitHub PATs, AWS access keys, Google API
//!   keys).
36//! - [`EntityDetector`] — regex + check-digit validation for emails,
37//!   E.164 phones, credit card PANs (Luhn), Brazilian CPFs and CNPJs
38//!   (mod-11 check digits), Pix UUID keys, IPv4 addresses, JWTs.
39//! - [`CompositeDetector`] — wraps multiple detectors and
40//!   deduplicates overlapping spans.
41//! - [`BaselineDetector`] — convenience composite of
42//!   [`SecretDetector`] + [`EntityDetector`], suitable as the SDK
43//!   default detector.
44//!
45//! # Usage
46//!
47//! ```
48//! use agent_sdk_foundation::privacy::{BaselineDetector, PiiDetector, mask_spans};
49//!
50//! let detector = BaselineDetector::new()?;
51//! let text = "Pay Pix to CPF 111.444.777-35 or card 4111 1111 1111 1111.";
52//! let spans = detector.detect(text);
53//! let masked = mask_spans(text, &spans);
54//! assert!(masked.contains("[REDACTED:cpf]"));
55//! assert!(masked.contains("[REDACTED:credit_card]"));
56//! # Ok::<(), regex::Error>(())
57//! ```
58//!
59//! # Limitations
60//!
61//! - Detection is deterministic pattern + checksum based. It will
62//!   miss entities that require semantic understanding (person
63//!   names, postal addresses, document photos) — those need a
64//!   neural detector wired in through the same [`PiiDetector`]
65//!   trait.
66//! - Spans use UTF-8 byte offsets; passing non-boundary offsets to
67//!   [`mask_spans`] will silently skip those spans rather than
68//!   panic.
69
70use regex::{Regex, RegexSet};
71use serde::{Deserialize, Serialize};
72
73pub mod redaction;
74
/// Credential value prefixes recognised for *wholesale* redaction.
///
/// Single source of truth for the baseline [`RedactionPolicy`](redaction::RedactionPolicy)
/// `sensitive_value_prefixes` list, which is built from this slice. The
/// [`SecretDetector`] covers the same credential families through its richer
/// regex (each with per-prefix body and length rules); when adding a new
/// family, update both so they do not drift.
///
/// Entries are bare prefixes with no body or length rule, so they are looser
/// than the detector's pattern: `"xox"` carries no token-type letter or dash,
/// whereas the detector requires `xox[abpsr]-`.
pub(crate) const SECRET_PREFIXES: &[&str] = &[
    // Bearer tokens; the trailing space is part of the prefix.
    "Bearer ",
    // OpenAI-style keys.
    "sk-",
    "pk-",
    // Slack tokens.
    "xox",
    // GitHub PATs.
    "ghp_",
    "gho_",
    "ghs_",
    "ghu_",
    "github_pat_",
    // AWS access keys.
    "AKIA",
    // Google API keys.
    "AIza",
];
95
96pub use redaction::{
97    REDACTED_MARKER, RedactionLevel, RedactionPolicy, redact_error, redact_for_observability,
98    redact_string, redact_value,
99};
100
101// ─────────────────────────────────────────────────────────────────────
102// Category and span
103// ─────────────────────────────────────────────────────────────────────
104
/// A well-known PII category tag.
///
/// Detectors emit spans labelled with a category so downstream
/// masking and audit logic can preserve type information.
///
/// Unit variants serialise under their snake_case names (for example
/// `Email` becomes `"email"`), which are the same strings returned by
/// [`as_tag`](Self::as_tag). The enum is `#[non_exhaustive]`, so code in
/// other crates must keep a wildcard arm when matching on it.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum PiiCategory {
    /// Credential or secret (API key, bearer token, password-shaped value).
    Secret,
    /// Email address.
    Email,
    /// Phone number (E.164 international format).
    Phone,
    /// Credit card PAN, Luhn-validated.
    CreditCard,
    /// Brazilian individual taxpayer ID (CPF), mod-11 validated.
    Cpf,
    /// Brazilian corporate taxpayer ID (CNPJ), mod-11 validated.
    Cnpj,
    /// Brazilian national ID (RG) — shape only, no check-digit.
    ///
    /// NOTE(review): no detector visible in this file emits this
    /// category — confirm whether one exists elsewhere.
    Rg,
    /// Brazilian driver's license number (CNH).
    ///
    /// NOTE(review): no detector visible in this file emits this
    /// category — confirm whether one exists elsewhere.
    Cnh,
    /// Pix instant-payment key in UUID form.
    PixKey,
    /// IPv4 address.
    IpAddress,
    /// JSON Web Token.
    Jwt,
    /// Custom, project-specific category. The inner string is used
    /// verbatim as the tag returned by [`as_tag`](Self::as_tag).
    Custom(String),
}
138
139impl PiiCategory {
140    /// Stable machine-readable tag suitable for placeholder markers
141    /// such as `[REDACTED:<tag>]`.
142    #[must_use]
143    pub const fn as_tag(&self) -> &str {
144        match self {
145            Self::Secret => "secret",
146            Self::Email => "email",
147            Self::Phone => "phone",
148            Self::CreditCard => "credit_card",
149            Self::Cpf => "cpf",
150            Self::Cnpj => "cnpj",
151            Self::Rg => "rg",
152            Self::Cnh => "cnh",
153            Self::PixKey => "pix_key",
154            Self::IpAddress => "ip_address",
155            Self::Jwt => "jwt",
156            Self::Custom(name) => name.as_str(),
157        }
158    }
159}
160
/// A span of PII located within a text buffer, expressed as half-open
/// byte offsets.
///
/// The span covers bytes `start..end` of the text it was detected in.
/// Construction does not validate the offsets, so a span may be empty or
/// inverted (`end <= start`); [`PiiSpan::is_empty`] reports both cases.
#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
pub struct PiiSpan {
    /// Inclusive start byte offset.
    pub start: usize,
    /// Exclusive end byte offset.
    pub end: usize,
    /// Category label assigned by the detector.
    pub category: PiiCategory,
}
172
173impl PiiSpan {
174    #[must_use]
175    pub const fn new(start: usize, end: usize, category: PiiCategory) -> Self {
176        Self {
177            start,
178            end,
179            category,
180        }
181    }
182
183    #[must_use]
184    pub const fn len(&self) -> usize {
185        self.end.saturating_sub(self.start)
186    }
187
188    #[must_use]
189    pub const fn is_empty(&self) -> bool {
190        self.end <= self.start
191    }
192
193    /// Whether this span strictly overlaps another (shared bytes,
194    /// not merely touching).
195    #[must_use]
196    pub const fn overlaps(&self, other: &Self) -> bool {
197        self.start < other.end && other.start < self.end
198    }
199}
200
201// ─────────────────────────────────────────────────────────────────────
202// PiiDetector trait
203// ─────────────────────────────────────────────────────────────────────
204
/// Locates PII within plain text.
///
/// Implementations must be deterministic and side-effect free.
/// The order of returned spans is unspecified; callers that need
/// ordered output should sort by `start`.
///
/// [`Debug`] is a supertrait so detectors embedded in larger config
/// structs (e.g. an audit redaction policy) can derive `Debug`
/// without custom impls.
///
/// `Send + Sync` are also required, so a detector can be shared between
/// threads (for example behind an [`Arc`](std::sync::Arc), which
/// implements this trait by forwarding to the inner detector).
pub trait PiiDetector: Send + Sync + std::fmt::Debug {
    /// Find every PII span in `text`. Returned spans use UTF-8
    /// char-boundary-safe byte offsets.
    ///
    /// An empty vector means nothing was detected.
    fn detect(&self, text: &str) -> Vec<PiiSpan>;
}
219
220impl<T: PiiDetector + ?Sized> PiiDetector for Box<T> {
221    fn detect(&self, text: &str) -> Vec<PiiSpan> {
222        (**self).detect(text)
223    }
224}
225
226impl<T: PiiDetector + ?Sized> PiiDetector for std::sync::Arc<T> {
227    fn detect(&self, text: &str) -> Vec<PiiSpan> {
228        (**self).detect(text)
229    }
230}
231
232// ─────────────────────────────────────────────────────────────────────
233// NoopDetector
234// ─────────────────────────────────────────────────────────────────────
235
236/// Detector that never reports any spans — a sentinel for paths
237/// where PII detection is explicitly disabled.
238#[derive(Clone, Copy, Debug, Default)]
239pub struct NoopDetector;
240
241impl PiiDetector for NoopDetector {
242    fn detect(&self, _text: &str) -> Vec<PiiSpan> {
243        Vec::new()
244    }
245}
246
247// ─────────────────────────────────────────────────────────────────────
248// SecretDetector
249// ─────────────────────────────────────────────────────────────────────
250
251/// Detects credential-shaped tokens by prefix.
252///
253/// Covers Bearer tokens, OpenAI-style keys (`sk-`, `pk-`), Slack
254/// (`xox…`), GitHub PATs (`ghp_`, `gho_`, `ghs_`, `ghu_`,
255/// `github_pat_`), AWS access keys (`AKIA…`), and Google API keys
256/// (`AIza…`).
257#[derive(Debug)]
258pub struct SecretDetector {
259    pattern: Regex,
260}
261
262impl SecretDetector {
263    /// Baseline detector covering the token shapes listed above.
264    ///
265    /// # Errors
266    /// Returns a [`regex::Error`] if the internal pattern fails to
267    /// compile — this should only occur on a corrupted build of
268    /// the `regex` crate.
269    pub fn baseline() -> Result<Self, regex::Error> {
270        let pattern = Regex::new(
271            r"(?x)
272              (?:
273                  \bBearer\s+[A-Za-z0-9._~+/=\-]{8,}
274                | \bsk-[A-Za-z0-9_\-]{16,}
275                | \bpk-[A-Za-z0-9_\-]{16,}
276                | \bxox[abpsr]-[A-Za-z0-9\-]{8,}
277                | \bghp_[A-Za-z0-9]{20,}
278                | \bgho_[A-Za-z0-9]{20,}
279                | \bghs_[A-Za-z0-9]{20,}
280                | \bghu_[A-Za-z0-9]{20,}
281                | \bgithub_pat_[A-Za-z0-9_]{20,}
282                | \bAKIA[A-Z0-9]{16}
283                | \bAIza[A-Za-z0-9_\-]{30,}
284              )
285            ",
286        )?;
287        Ok(Self { pattern })
288    }
289}
290
291impl PiiDetector for SecretDetector {
292    fn detect(&self, text: &str) -> Vec<PiiSpan> {
293        self.pattern
294            .find_iter(text)
295            .map(|m| PiiSpan::new(m.start(), m.end(), PiiCategory::Secret))
296            .collect()
297    }
298}
299
300// ─────────────────────────────────────────────────────────────────────
301// EntityDetector
302// ─────────────────────────────────────────────────────────────────────
303
/// The entity kinds [`EntityDetector`] can be asked to look for.
///
/// A subset of [`PiiCategory`]: [`Secret`](PiiCategory::Secret) is left to
/// [`SecretDetector`] and [`Custom`](PiiCategory::Custom) to user-supplied
/// detectors, so neither appears here.
///
/// The explicit discriminants double as bit positions in [`CategorySet`]
/// and as indices into the entity pattern table, so they must stay dense
/// and stable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum DetectCategory {
    /// Email addresses.
    Email = 0,
    /// Phone numbers.
    Phone = 1,
    /// Credit card PANs.
    CreditCard = 2,
    /// Brazilian CPF numbers.
    Cpf = 3,
    /// Brazilian CNPJ numbers.
    Cnpj = 4,
    /// Pix keys in UUID form.
    PixUuid = 5,
    /// IPv4 addresses.
    Ipv4 = 6,
    /// JSON Web Tokens.
    Jwt = 7,
}

impl DetectCategory {
    /// Single-bit mask for this category: bit `n` for discriminant `n`.
    const fn mask(self) -> u16 {
        let bit = self as u8;
        1u16 << bit
    }
}
328
329/// Bitmask-packed set of entity categories to detect.
330///
331/// Construct with [`CategorySet::all`] or [`CategorySet::none`] and
332/// toggle individual categories with [`with`](Self::with) and
333/// [`without`](Self::without).
334#[derive(Clone, Copy, Debug, Eq, PartialEq)]
335pub struct CategorySet(u16);
336
337impl CategorySet {
338    const ALL_MASK: u16 = DetectCategory::Email.mask()
339        | DetectCategory::Phone.mask()
340        | DetectCategory::CreditCard.mask()
341        | DetectCategory::Cpf.mask()
342        | DetectCategory::Cnpj.mask()
343        | DetectCategory::PixUuid.mask()
344        | DetectCategory::Ipv4.mask()
345        | DetectCategory::Jwt.mask();
346
347    /// All categories enabled.
348    #[must_use]
349    pub const fn all() -> Self {
350        Self(Self::ALL_MASK)
351    }
352
353    /// All categories disabled.
354    #[must_use]
355    pub const fn none() -> Self {
356        Self(0)
357    }
358
359    /// Enable a single category, returning the updated set.
360    #[must_use]
361    pub const fn with(mut self, category: DetectCategory) -> Self {
362        self.0 |= category.mask();
363        self
364    }
365
366    /// Disable a single category, returning the updated set.
367    #[must_use]
368    pub const fn without(mut self, category: DetectCategory) -> Self {
369        self.0 &= !category.mask();
370        self
371    }
372
373    /// Whether the given category is enabled.
374    #[must_use]
375    pub const fn contains(self, category: DetectCategory) -> bool {
376        self.0 & category.mask() != 0
377    }
378}
379
380impl Default for CategorySet {
381    fn default() -> Self {
382        Self::all()
383    }
384}
385
/// Source patterns for each entity category, ordered to match the
/// [`DetectCategory`] discriminants (`Email = 0` … `Jwt = 7`).
///
/// Both the individual [`Regex`] fields and the [`RegexSet`] prefilter are
/// built from these so the two can never drift; the prefilter's match indices
/// line up 1:1 with `DetectCategory as usize`.
const ENTITY_PATTERNS: [&str; 8] = [
    // 0 Email: local part, `@`, domain, then a dot and a TLD of 2+ letters.
    r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}",
    // 1 Phone: `+`, a non-zero digit, then 7-14 more digits (8-15 in total).
    r"\+[1-9]\d{7,14}",
    // 2 CreditCard: 13-19 digits, each optionally followed by one space or
    //   dash. Shape only — candidates are Luhn-checked after matching.
    r"\b(?:\d[ \-]?){12,18}\d\b",
    // 3 Cpf: `ddd.ddd.ddd-dd` or 11 bare digits; check digits verified later.
    r"\b(?:\d{3}\.\d{3}\.\d{3}-\d{2}|\d{11})\b",
    // 4 Cnpj: `dd.ddd.ddd/dddd-dd` or 14 bare digits; check digits verified later.
    r"\b(?:\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}|\d{14})\b",
    // 5 PixUuid: hex groups of 8-4-4-4-12 separated by dashes.
    r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b",
    // 6 Ipv4: four dot-separated octets, each limited to 0-255.
    r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|1?\d\d?)\b",
    // 7 Jwt: three dot-separated base64url-alphabet segments, the first
    //   starting with `eyJ`.
    r"\beyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b",
];
402
403/// Entity-aware detector using regex patterns plus check-digit
404/// validation where applicable.
405///
406/// PANs are validated with Luhn; CPFs and CNPJs with the Brazilian
407/// mod-11 algorithm. This sharply reduces false positives on random
408/// numeric strings (invoice numbers, order IDs, etc.) that share
409/// the same shape as real identifiers.
410///
411/// A [`RegexSet`] prefilter runs first: one multi-pattern pass identifies
412/// which categories appear at all, so a clean string (the common case) pays a
413/// single scan instead of eight independent `find_iter` passes.
414#[derive(Debug)]
415pub struct EntityDetector {
416    email: Regex,
417    phone: Regex,
418    credit_card: Regex,
419    cpf: Regex,
420    cnpj: Regex,
421    pix_uuid: Regex,
422    ipv4: Regex,
423    jwt: Regex,
424    prefilter: RegexSet,
425    enabled: CategorySet,
426}
427
428impl EntityDetector {
429    /// Construct a detector with an explicit category toggle set.
430    ///
431    /// # Errors
432    /// Returns a [`regex::Error`] if any internal pattern fails to
433    /// compile.
434    pub fn new(enabled: CategorySet) -> Result<Self, regex::Error> {
435        Ok(Self {
436            email: Regex::new(ENTITY_PATTERNS[DetectCategory::Email as usize])?,
437            phone: Regex::new(ENTITY_PATTERNS[DetectCategory::Phone as usize])?,
438            credit_card: Regex::new(ENTITY_PATTERNS[DetectCategory::CreditCard as usize])?,
439            cpf: Regex::new(ENTITY_PATTERNS[DetectCategory::Cpf as usize])?,
440            cnpj: Regex::new(ENTITY_PATTERNS[DetectCategory::Cnpj as usize])?,
441            pix_uuid: Regex::new(ENTITY_PATTERNS[DetectCategory::PixUuid as usize])?,
442            ipv4: Regex::new(ENTITY_PATTERNS[DetectCategory::Ipv4 as usize])?,
443            jwt: Regex::new(ENTITY_PATTERNS[DetectCategory::Jwt as usize])?,
444            prefilter: RegexSet::new(ENTITY_PATTERNS)?,
445            enabled,
446        })
447    }
448
449    /// Whether `category` is both enabled and reported as present by the
450    /// prefilter pass — gates the per-category `find_iter`.
451    fn should_scan(&self, matches: &regex::SetMatches, category: DetectCategory) -> bool {
452        self.enabled.contains(category) && matches.matched(category as usize)
453    }
454
455    /// Baseline detector with all categories enabled.
456    ///
457    /// # Errors
458    /// See [`EntityDetector::new`].
459    pub fn baseline() -> Result<Self, regex::Error> {
460        Self::new(CategorySet::all())
461    }
462}
463
464impl PiiDetector for EntityDetector {
465    fn detect(&self, text: &str) -> Vec<PiiSpan> {
466        let mut spans = Vec::new();
467
468        // One multi-pattern pass tells us which categories appear at all;
469        // a clean string short-circuits here without any per-category scan.
470        let matches = self.prefilter.matches(text);
471        if !matches.matched_any() {
472            return spans;
473        }
474
475        if self.should_scan(&matches, DetectCategory::Email) {
476            for m in self.email.find_iter(text) {
477                spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Email));
478            }
479        }
480
481        if self.should_scan(&matches, DetectCategory::Phone) {
482            for m in self.phone.find_iter(text) {
483                spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Phone));
484            }
485        }
486
487        if self.should_scan(&matches, DetectCategory::CreditCard) {
488            for m in self.credit_card.find_iter(text) {
489                push_credit_card_spans(m.as_str(), m.start(), &mut spans);
490            }
491        }
492
493        if self.should_scan(&matches, DetectCategory::Cpf) {
494            for m in self.cpf.find_iter(text) {
495                if cpf_is_valid(m.as_str()) {
496                    spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Cpf));
497                }
498            }
499        }
500
501        if self.should_scan(&matches, DetectCategory::Cnpj) {
502            for m in self.cnpj.find_iter(text) {
503                if cnpj_is_valid(m.as_str()) {
504                    spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Cnpj));
505                }
506            }
507        }
508
509        if self.should_scan(&matches, DetectCategory::PixUuid) {
510            for m in self.pix_uuid.find_iter(text) {
511                spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::PixKey));
512            }
513        }
514
515        if self.should_scan(&matches, DetectCategory::Ipv4) {
516            for m in self.ipv4.find_iter(text) {
517                spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::IpAddress));
518            }
519        }
520
521        if self.should_scan(&matches, DetectCategory::Jwt) {
522            for m in self.jwt.find_iter(text) {
523                spans.push(PiiSpan::new(m.start(), m.end(), PiiCategory::Jwt));
524            }
525        }
526
527        spans
528    }
529}
530
531/// Emit credit-card spans for one regex match's text.
532///
533/// `matched` is the matched run (it starts and ends with a digit; digits may
534/// be separated by single spaces or dashes). `base` is the byte offset of
535/// `matched` within the source text.
536///
537/// A clean PAN — the whole run passes Luhn — yields one span. Otherwise the
538/// run carried extra digits (e.g. `4111 1111 1111 1111 150`, a PAN followed by
539/// an amount): the greedy regex grabbed all of them and a single Luhn check on
540/// the combined digits fails, which previously let the real PAN leak unmasked.
541/// Instead, slide a 13-19 digit window that begins at a digit-group boundary
542/// (the run start or just after a separator) and emit the leftmost-longest
543/// Luhn-valid, non-overlapping sub-windows. Anchoring window starts to group
544/// boundaries keeps sequential filler like `1234 5678 9012 3456` from matching
545/// a coincidental interior sub-window.
546fn push_credit_card_spans(matched: &str, base: usize, out: &mut Vec<PiiSpan>) {
547    if luhn_is_valid(matched) {
548        out.push(PiiSpan::new(
549            base,
550            base + matched.len(),
551            PiiCategory::CreditCard,
552        ));
553        return;
554    }
555
556    let bytes = matched.as_bytes();
557    let digit_offsets: Vec<usize> = matched
558        .bytes()
559        .enumerate()
560        .filter(|(_, b)| b.is_ascii_digit())
561        .map(|(i, _)| i)
562        .collect();
563    let n = digit_offsets.len();
564
565    let is_group_start = |di: usize| -> bool {
566        let off = digit_offsets[di];
567        off == 0 || !bytes[off - 1].is_ascii_digit()
568    };
569
570    let mut di = 0;
571    while di < n {
572        if !is_group_start(di) {
573            di += 1;
574            continue;
575        }
576        let max_len = (n - di).min(19);
577        let mut emitted = None;
578        if max_len >= 13 {
579            for len in (13..=max_len).rev() {
580                let start_off = digit_offsets[di];
581                // ASCII digits are one byte, so `+ 1` lands on a char boundary.
582                let end_off = digit_offsets[di + len - 1] + 1;
583                if luhn_is_valid(&matched[start_off..end_off]) {
584                    out.push(PiiSpan::new(
585                        base + start_off,
586                        base + end_off,
587                        PiiCategory::CreditCard,
588                    ));
589                    emitted = Some(len);
590                    break;
591                }
592            }
593        }
594        // Skip past an emitted window; otherwise advance one digit and retry.
595        di += emitted.unwrap_or(1);
596    }
597}
598
599// ─────────────────────────────────────────────────────────────────────
600// CompositeDetector and BaselineDetector
601// ─────────────────────────────────────────────────────────────────────
602
603/// Aggregates multiple detectors, optionally deduplicating
604/// overlapping spans.
605#[derive(Debug)]
606pub struct CompositeDetector {
607    detectors: Vec<Box<dyn PiiDetector>>,
608    dedup: bool,
609}
610
611impl CompositeDetector {
612    #[must_use]
613    pub fn new(detectors: Vec<Box<dyn PiiDetector>>) -> Self {
614        Self {
615            detectors,
616            dedup: true,
617        }
618    }
619
620    /// Disable overlap deduplication. Use when callers want every
621    /// detector's raw output (e.g. for metrics or debugging).
622    #[must_use]
623    pub const fn without_dedup(mut self) -> Self {
624        self.dedup = false;
625        self
626    }
627}
628
629impl PiiDetector for CompositeDetector {
630    fn detect(&self, text: &str) -> Vec<PiiSpan> {
631        let mut spans: Vec<PiiSpan> = self.detectors.iter().flat_map(|d| d.detect(text)).collect();
632        if self.dedup {
633            dedup_overlapping(&mut spans);
634        }
635        spans
636    }
637}
638
639/// Convenience composite of [`SecretDetector`] + [`EntityDetector`]
640/// using default settings. Suitable as the SDK default detector.
641#[derive(Debug)]
642pub struct BaselineDetector {
643    inner: CompositeDetector,
644}
645
646impl BaselineDetector {
647    /// Construct the baseline detector.
648    ///
649    /// # Errors
650    /// See [`SecretDetector::baseline`] / [`EntityDetector::baseline`].
651    pub fn new() -> Result<Self, regex::Error> {
652        let secrets: Box<dyn PiiDetector> = Box::new(SecretDetector::baseline()?);
653        let entities: Box<dyn PiiDetector> = Box::new(EntityDetector::baseline()?);
654        Ok(Self {
655            inner: CompositeDetector::new(vec![secrets, entities]),
656        })
657    }
658}
659
660impl PiiDetector for BaselineDetector {
661    fn detect(&self, text: &str) -> Vec<PiiSpan> {
662        self.inner.detect(text)
663    }
664}
665
666// ─────────────────────────────────────────────────────────────────────
667// Span utilities
668// ─────────────────────────────────────────────────────────────────────
669
670/// Sort spans by start (asc) then length (desc), drop empties, and **merge**
671/// overlapping spans into a single covering interval.
672///
673/// The earlier span's category wins. Non-overlapping spans are preserved in
674/// left-to-right order.
675///
676/// Merging (rather than dropping the later span, as a previous version did) is
677/// a masking-safety requirement: a span that starts inside a kept span but
678/// extends past its end would otherwise be discarded wholesale, leaving its
679/// non-overlapping tail (`kept.end..span.end`) in cleartext. With custom
680/// detectors composed via [`CompositeDetector`], that tail can be the bulk of
681/// a credential or email — so it must be covered, not dropped.
682pub fn dedup_overlapping(spans: &mut Vec<PiiSpan>) {
683    spans.sort_by(|a, b| a.start.cmp(&b.start).then_with(|| b.len().cmp(&a.len())));
684    let mut kept: Vec<PiiSpan> = Vec::with_capacity(spans.len());
685    for span in spans.drain(..) {
686        if span.is_empty() {
687            continue;
688        }
689        match kept.last_mut() {
690            // Overlaps the previously kept span: extend it to cover both so no
691            // tail leaks. The first span's category is retained.
692            Some(prev) if prev.end > span.start => {
693                prev.end = prev.end.max(span.end);
694            }
695            _ => kept.push(span),
696        }
697    }
698    *spans = kept;
699}
700
701// ─────────────────────────────────────────────────────────────────────
702// Masking helpers
703// ─────────────────────────────────────────────────────────────────────
704
705/// Replace every span in `text` with `[REDACTED:<category>]`.
706#[must_use]
707pub fn mask_spans(text: &str, spans: &[PiiSpan]) -> String {
708    mask_with(text, spans, |span, _matched| {
709        format!("[REDACTED:{}]", span.category.as_tag())
710    })
711}
712
713/// Replace every span in `text` using a caller-provided masker.
714///
715/// The closure receives the span metadata plus the original matched
716/// substring. Useful for format-preserving masking (e.g.
717/// `****-****-****-1234` on PANs) or for reversible tokenization.
718///
719/// Overlapping spans are deduplicated via [`dedup_overlapping`]
720/// before masking. Non-char-boundary offsets are silently skipped
721/// to avoid panics.
722#[must_use]
723pub fn mask_with<F>(text: &str, spans: &[PiiSpan], f: F) -> String
724where
725    F: Fn(&PiiSpan, &str) -> String,
726{
727    // Fast path: the common production callers (baseline / composite
728    // detectors) already hand us sorted, non-empty, non-overlapping spans, so
729    // mask straight from the borrowed slice and skip the clone + sort + dedup.
730    if spans_are_clean(spans) {
731        return mask_sorted(text, spans, &f);
732    }
733    let mut sorted = spans.to_vec();
734    dedup_overlapping(&mut sorted);
735    mask_sorted(text, &sorted, &f)
736}
737
738/// Whether `spans` are already sorted by start, non-empty, and
739/// non-overlapping (touching is allowed) — i.e. safe to mask without a
740/// dedup pass.
741fn spans_are_clean(spans: &[PiiSpan]) -> bool {
742    spans.iter().all(|s| !s.is_empty()) && spans.windows(2).all(|w| w[0].end <= w[1].start)
743}
744
745/// Mask `text` using pre-sorted, non-overlapping `spans`.
746///
747/// A span whose start *or* end is not a UTF-8 char boundary is skipped
748/// entirely: the matched slice is fetched **before** any output is written, so
749/// a valid-start / invalid-end span never duplicates the prefix nor leaks the
750/// bytes it was meant to mask.
751fn mask_sorted<F>(text: &str, sorted: &[PiiSpan], f: &F) -> String
752where
753    F: Fn(&PiiSpan, &str) -> String,
754{
755    let mut out = String::with_capacity(text.len());
756    let mut cursor = 0;
757    for span in sorted {
758        if span.start < cursor {
759            continue;
760        }
761        let (Some(prefix), Some(matched)) =
762            (text.get(cursor..span.start), text.get(span.start..span.end))
763        else {
764            continue;
765        };
766        out.push_str(prefix);
767        out.push_str(&f(span, matched));
768        cursor = span.end;
769    }
770    if let Some(suffix) = text.get(cursor..) {
771        out.push_str(suffix);
772    }
773    out
774}
775
776/// Format-preserving PAN mask: keeps the last four digits and
777/// replaces the preceding digits with `*` grouped in fours.
778#[must_use]
779pub fn mask_pan(pan: &str) -> String {
780    let digits: Vec<char> = pan.chars().filter(char::is_ascii_digit).collect();
781    if digits.len() < 4 {
782        return format!("[REDACTED:{}]", PiiCategory::CreditCard.as_tag());
783    }
784    let last_four: String = digits.iter().rev().take(4).rev().copied().collect();
785    format!("****-****-****-{last_four}")
786}
787
788// ─────────────────────────────────────────────────────────────────────
789// Check-digit validators (private)
790// ─────────────────────────────────────────────────────────────────────
791
/// Luhn check for a candidate card number.
///
/// Only the decimal digits of `s` are considered (separators are
/// ignored). Returns `false` unless there are between 13 and 19 digits
/// and their Luhn checksum is a multiple of ten.
fn luhn_is_valid(s: &str) -> bool {
    let digits: Vec<u32> = s.chars().filter_map(|c| c.to_digit(10)).collect();
    if !(13..=19).contains(&digits.len()) {
        return false;
    }

    // Walk right to left, doubling every second digit; a doubled value
    // above 9 contributes the sum of its own digits (value - 9).
    let mut checksum = 0_u32;
    let mut double_next = false;
    for &digit in digits.iter().rev() {
        checksum += if double_next {
            let twice = digit * 2;
            if twice > 9 { twice - 9 } else { twice }
        } else {
            digit
        };
        double_next = !double_next;
    }
    checksum.is_multiple_of(10)
}
812
/// Validates a CPF candidate by its two mod-11 check digits.
///
/// Only the decimal digits of `s` are considered, so both the formatted
/// (`ddd.ddd.ddd-dd`) and the bare form are accepted. Returns `false`
/// unless there are exactly 11 digits, they are not all identical, and
/// both check digits match.
fn cpf_is_valid(s: &str) -> bool {
    let collected: Vec<u32> = s.chars().filter_map(|c| c.to_digit(10)).collect();
    // A fixed-size array makes the constant indexing below infallible.
    let Ok(digits) = <[u32; 11]>::try_from(collected) else {
        return false;
    };

    // Repeated-digit sequences satisfy the checksum but are rejected.
    if digits.iter().all(|&d| d == digits[0]) {
        return false;
    }

    // The first check digit covers the nine base digits; the second
    // covers those nine plus the first check digit.
    mod11_cpf_check(&digits[..9], 10) == digits[9]
        && mod11_cpf_check(&digits[..10], 11) == digits[10]
}

/// Computes one CPF check digit: the digits of `slice` are weighted with
/// `weight_start`, `weight_start - 1`, … (never going below zero), summed,
/// and reduced mod 11. Remainders 0 and 1 map to check digit 0; any other
/// remainder `r` maps to `11 - r`.
fn mod11_cpf_check(slice: &[u32], weight_start: u32) -> u32 {
    let mut weight = weight_start;
    let mut total = 0_u32;
    for &digit in slice {
        total += digit * weight;
        weight = weight.saturating_sub(1);
    }
    match total % 11 {
        0 | 1 => 0,
        rem => 11 - rem,
    }
}
841
/// Validates a CNPJ candidate by its two mod-11 check digits.
///
/// Only the decimal digits of `s` are considered, so both the formatted
/// (`dd.ddd.ddd/dddd-dd`) and the bare form are accepted. Returns `false`
/// unless there are exactly 14 digits, they are not all identical, and
/// both check digits match.
fn cnpj_is_valid(s: &str) -> bool {
    // Weights for the first check digit (12 base digits) …
    const FIRST_WEIGHTS: [u32; 12] = [5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];
    // … and for the second (base digits plus the first check digit).
    const SECOND_WEIGHTS: [u32; 13] = [6, 5, 4, 3, 2, 9, 8, 7, 6, 5, 4, 3, 2];

    let collected: Vec<u32> = s.chars().filter_map(|c| c.to_digit(10)).collect();
    // A fixed-size array makes the constant indexing below infallible.
    let Ok(digits) = <[u32; 14]>::try_from(collected) else {
        return false;
    };

    // Repeated-digit sequences are rejected outright.
    if digits.iter().all(|&d| d == digits[0]) {
        return false;
    }

    weighted_mod11(&digits[..12], &FIRST_WEIGHTS) == digits[12]
        && weighted_mod11(&digits[..13], &SECOND_WEIGHTS) == digits[13]
}

/// Computes a mod-11 check digit from `slice` and position-wise `weights`
/// (pairs beyond the shorter of the two are ignored). Remainders 0 and 1
/// map to check digit 0; any other remainder `r` maps to `11 - r`.
fn weighted_mod11(slice: &[u32], weights: &[u32]) -> u32 {
    let mut total = 0_u32;
    for (digit, weight) in slice.iter().zip(weights) {
        total += digit * weight;
    }
    match total % 11 {
        0 | 1 => 0,
        rem => 11 - rem,
    }
}
872
873// ─────────────────────────────────────────────────────────────────────
874// Tests
875// ─────────────────────────────────────────────────────────────────────
876
// Unit tests for the items in this file: category/span helpers, the
// built-in detectors, span dedup, the masking helpers, and the
// check-digit validators.
#[cfg(test)]
mod tests {
    use super::*;

    // Tests that build a detector use `?` on the constructor, so they
    // return this alias; the error type is `regex::Error`.
    type TestResult = Result<(), regex::Error>;

    // ── PiiCategory ─────────────────────────────────────────────

    // Pins the tag string of every category; for `Custom` the tag is the
    // wrapped string itself.
    #[test]
    fn category_as_tag_returns_stable_strings() {
        assert_eq!(PiiCategory::Secret.as_tag(), "secret");
        assert_eq!(PiiCategory::Email.as_tag(), "email");
        assert_eq!(PiiCategory::Phone.as_tag(), "phone");
        assert_eq!(PiiCategory::CreditCard.as_tag(), "credit_card");
        assert_eq!(PiiCategory::Cpf.as_tag(), "cpf");
        assert_eq!(PiiCategory::Cnpj.as_tag(), "cnpj");
        assert_eq!(PiiCategory::PixKey.as_tag(), "pix_key");
        assert_eq!(PiiCategory::IpAddress.as_tag(), "ip_address");
        assert_eq!(PiiCategory::Jwt.as_tag(), "jwt");
        assert_eq!(PiiCategory::Custom("org_id".to_owned()).as_tag(), "org_id");
    }

    // A unit variant serialises to a bare JSON string and deserialises
    // back to the same variant.
    #[test]
    fn unit_category_serialises_as_snake_case_string() -> serde_json::Result<()> {
        let json = serde_json::to_string(&PiiCategory::Email)?;
        assert_eq!(json, r#""email""#);
        let back: PiiCategory = serde_json::from_str(&json)?;
        assert_eq!(back, PiiCategory::Email);
        Ok(())
    }

    // Only the round trip is checked here; the exact JSON shape of a
    // `Custom` category is not asserted.
    #[test]
    fn custom_category_round_trips() -> serde_json::Result<()> {
        let original = PiiCategory::Custom("account_key".to_owned());
        let json = serde_json::to_string(&original)?;
        let back: PiiCategory = serde_json::from_str(&json)?;
        assert_eq!(back, original);
        Ok(())
    }

    // ── PiiSpan ─────────────────────────────────────────────────

    #[test]
    fn span_len_and_is_empty() {
        let s = PiiSpan::new(5, 10, PiiCategory::Email);
        assert_eq!(s.len(), 5);
        assert!(!s.is_empty());
        // Equal start and end: the span is reported as empty.
        let z = PiiSpan::new(5, 5, PiiCategory::Email);
        assert!(z.is_empty());
    }

    #[test]
    fn span_overlaps_detects_shared_bytes_only() {
        let a = PiiSpan::new(0, 5, PiiCategory::Email);
        let b = PiiSpan::new(3, 8, PiiCategory::Email);
        let c = PiiSpan::new(5, 10, PiiCategory::Email);
        assert!(a.overlaps(&b));
        assert!(!a.overlaps(&c)); // Touching is not overlap
        assert!(!c.overlaps(&a));
    }

    // ── NoopDetector ────────────────────────────────────────────

    // The input contains secret-like and email-like text; the no-op
    // detector must still return no spans.
    #[test]
    fn noop_detector_finds_nothing() {
        let d = NoopDetector;
        assert!(d.detect("sk-abc123 email a@b.co").is_empty());
    }

    // ── SecretDetector ──────────────────────────────────────────

    // Each sample line is expected to yield exactly one span, tagged
    // `Secret`.
    #[test]
    fn secret_detector_detects_common_prefixes() -> TestResult {
        let d = SecretDetector::baseline()?;
        let cases = [
            "Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.payload.sig",
            "key=sk-abcdefghijklmnopqrstuv",
            "GH token ghp_abcdefghijklmnopqrstuvwxyz",
            "AWS AKIAIOSFODNN7EXAMPLE",
            "xoxb-1234567890-slack",
            "GOOGLE_KEY=AIzaSyA-abcdefghijklmnopqrstuvwxyz123",
        ];
        for text in cases {
            let spans = d.detect(text);
            assert_eq!(spans.len(), 1, "expected 1 span in {text:?}, got {spans:?}");
            assert_eq!(spans[0].category, PiiCategory::Secret);
        }
        Ok(())
    }

    #[test]
    fn secret_detector_ignores_non_secret_text() -> TestResult {
        let d = SecretDetector::baseline()?;
        assert!(d.detect("just some ordinary prose").is_empty());
        assert!(d.detect("sk-short").is_empty()); // Below min body length
        Ok(())
    }

    // ── EntityDetector: email ────────────────────────────────────

    #[test]
    fn detects_email() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("please email me at ana.silva+tag@example.com tomorrow");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Email);
        Ok(())
    }

    // ── EntityDetector: phone ────────────────────────────────────

    #[test]
    fn detects_e164_phone() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("call +5511987654321 for support");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Phone);
        Ok(())
    }

    // Only the phone category is enabled, so an empty result means the
    // phone matcher itself declined the input.
    #[test]
    fn non_e164_phone_not_detected() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Phone))?;
        // Missing leading '+' — not E.164.
        assert!(d.detect("call 11987654321").is_empty());
        Ok(())
    }

    // ── EntityDetector: credit card (Luhn) ───────────────────────

    // Uses the baseline detector, so the count is filtered by category
    // rather than asserting on the total number of spans.
    #[test]
    fn detects_luhn_valid_pan() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("card 4111 1111 1111 1111 expires soon");
        let pan_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::CreditCard)
            .count();
        assert_eq!(pan_count, 1);
        Ok(())
    }

    #[test]
    fn rejects_luhn_invalid_pan() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        // 16 digits but not Luhn-valid
        let spans = d.detect("card 1234 5678 9012 3456");
        assert!(spans.is_empty(), "Luhn-invalid PAN leaked: {spans:?}");
        Ok(())
    }

    // Same check as above with a hyphen-separated PAN.
    #[test]
    fn detects_mastercard_test_pan() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        let spans = d.detect("5500-0000-0000-0004");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::CreditCard);
        Ok(())
    }

    // ── EntityDetector: CPF ─────────────────────────────────────

    // Baseline detector: count only the CPF-tagged spans.
    #[test]
    fn detects_valid_cpf_formatted() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("meu CPF é 111.444.777-35 ok?");
        let cpf_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::Cpf)
            .count();
        assert_eq!(cpf_count, 1);
        Ok(())
    }

    #[test]
    fn detects_valid_cpf_unformatted() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cpf))?;
        let spans = d.detect("cpf 11144477735 confere");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Cpf);
        Ok(())
    }

    #[test]
    fn rejects_invalid_cpf() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cpf))?;
        // 11 digits but wrong check digits
        assert!(d.detect("cpf 12345678900").is_empty());
        // All-same digits — rejected
        assert!(d.detect("cpf 11111111111").is_empty());
        // Invalid formatted
        assert!(d.detect("cpf 123.456.789-00").is_empty());
        Ok(())
    }

    // ── EntityDetector: CNPJ ────────────────────────────────────

    // Baseline detector: count only the CNPJ-tagged spans.
    #[test]
    fn detects_valid_cnpj_formatted() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("CNPJ 11.222.333/0001-81 registered");
        let cnpj_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::Cnpj)
            .count();
        assert_eq!(cnpj_count, 1);
        Ok(())
    }

    #[test]
    fn detects_valid_cnpj_unformatted() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cnpj))?;
        let spans = d.detect("cnpj 11222333000181 ok");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Cnpj);
        Ok(())
    }

    #[test]
    fn rejects_invalid_cnpj() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Cnpj))?;
        // 14 digits but wrong check digits
        assert!(d.detect("cnpj 12345678000100").is_empty());
        // All-same digits
        assert!(d.detect("cnpj 11111111111111").is_empty());
        Ok(())
    }

    // ── EntityDetector: Pix UUID ─────────────────────────────────

    // Baseline detector: count only the Pix-key-tagged spans.
    #[test]
    fn detects_pix_uuid_key() -> TestResult {
        let d = EntityDetector::baseline()?;
        let spans = d.detect("pix key 123e4567-e89b-12d3-a456-426614174000 configurada");
        let pix_count = spans
            .iter()
            .filter(|s| s.category == PiiCategory::PixKey)
            .count();
        assert_eq!(pix_count, 1);
        Ok(())
    }

    // ── EntityDetector: IPv4 ────────────────────────────────────

    #[test]
    fn detects_ipv4() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Ipv4))?;
        let spans = d.detect("request from 192.168.1.100 blocked");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::IpAddress);
        Ok(())
    }

    // Dotted-quad shape, but every octet is above 255.
    #[test]
    fn rejects_out_of_range_ipv4_octets() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Ipv4))?;
        assert!(d.detect("999.999.999.999").is_empty());
        Ok(())
    }

    // ── EntityDetector: JWT ─────────────────────────────────────

    // The token is embedded in surrounding text and must still be found
    // as a single `Jwt` span.
    #[test]
    fn detects_jwt() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::Jwt))?;
        let jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjMifQ.abc-_def";
        let spans = d.detect(&format!("token: {jwt} here"));
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].category, PiiCategory::Jwt);
        Ok(())
    }

    // ── EntityDetector: toggle set ──────────────────────────────

    // With an empty category set nothing is reported, even for input
    // that other tests in this module show being detected.
    #[test]
    fn disabled_categories_are_skipped() -> TestResult {
        let d = EntityDetector::new(CategorySet::none())?;
        assert!(d.detect("a@b.co and 111.444.777-35").is_empty());
        Ok(())
    }

    // ── CompositeDetector ──────────────────────────────────────

    #[test]
    fn composite_merges_detectors() -> TestResult {
        let secrets: Box<dyn PiiDetector> = Box::new(SecretDetector::baseline()?);
        let entities: Box<dyn PiiDetector> = Box::new(EntityDetector::baseline()?);
        let composite = CompositeDetector::new(vec![secrets, entities]);
        let text = "login=a@b.co key=sk-abcdefghijklmnopqrstuv";
        let spans = composite.detect(text);
        assert_eq!(spans.len(), 2);
        // Categories present, order by start.
        let categories: Vec<&PiiCategory> = spans.iter().map(|s| &s.category).collect();
        assert!(categories.contains(&&PiiCategory::Email));
        assert!(categories.contains(&&PiiCategory::Secret));
        Ok(())
    }

    #[test]
    fn composite_dedupes_overlapping_spans() {
        // Two detectors that both match the same bytes produce a
        // single output span after dedup.
        // `Always` is a test double: it reports one span covering the
        // whole input (or nothing for empty input), tagged with the
        // category it was built with.
        #[derive(Debug)]
        struct Always(PiiCategory);
        impl PiiDetector for Always {
            fn detect(&self, text: &str) -> Vec<PiiSpan> {
                if text.is_empty() {
                    Vec::new()
                } else {
                    vec![PiiSpan::new(0, text.len(), self.0.clone())]
                }
            }
        }

        let composite = CompositeDetector::new(vec![
            Box::new(Always(PiiCategory::Email)),
            Box::new(Always(PiiCategory::Secret)),
        ]);
        let spans = composite.detect("hello");
        assert_eq!(spans.len(), 1);
    }

    #[test]
    fn composite_without_dedup_preserves_overlaps() {
        // Same whole-input test double as in the dedup test above; here
        // dedup is disabled, so both identical spans must be returned.
        #[derive(Debug)]
        struct Always(PiiCategory);
        impl PiiDetector for Always {
            fn detect(&self, text: &str) -> Vec<PiiSpan> {
                if text.is_empty() {
                    Vec::new()
                } else {
                    vec![PiiSpan::new(0, text.len(), self.0.clone())]
                }
            }
        }

        let composite = CompositeDetector::new(vec![
            Box::new(Always(PiiCategory::Email)),
            Box::new(Always(PiiCategory::Secret)),
        ])
        .without_dedup();
        let spans = composite.detect("hello");
        assert_eq!(spans.len(), 2);
    }

    // ── BaselineDetector ──────────────────────────────────────

    #[test]
    fn baseline_finds_mixed_pii() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "email: a@b.co, CPF: 111.444.777-35, key: sk-abcdefghijklmnopqrstuv";
        let mut spans = d.detect(text);
        // Sort by start offset so the expected category order below
        // follows the order of appearance in `text`.
        spans.sort_by_key(|s| s.start);
        let kinds: Vec<&PiiCategory> = spans.iter().map(|s| &s.category).collect();
        assert_eq!(
            kinds,
            vec![&PiiCategory::Email, &PiiCategory::Cpf, &PiiCategory::Secret,]
        );
        Ok(())
    }

    // ── dedup_overlapping ─────────────────────────────────────

    #[test]
    fn dedup_keeps_longest_on_overlap() {
        let mut spans = vec![
            PiiSpan::new(0, 5, PiiCategory::Email),
            PiiSpan::new(0, 8, PiiCategory::Secret), // longer, same start
            PiiSpan::new(10, 15, PiiCategory::Phone),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].category, PiiCategory::Secret);
        assert_eq!(spans[1].category, PiiCategory::Phone);
    }

    // The zero-length span (5..5) is expected to be removed.
    #[test]
    fn dedup_drops_empty_spans() {
        let mut spans = vec![
            PiiSpan::new(5, 5, PiiCategory::Email),
            PiiSpan::new(10, 15, PiiCategory::Phone),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 1);
    }

    #[test]
    fn dedup_preserves_non_overlapping() {
        let mut spans = vec![
            PiiSpan::new(0, 3, PiiCategory::Email),
            PiiSpan::new(5, 8, PiiCategory::Phone),
            PiiSpan::new(10, 15, PiiCategory::Cpf),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 3);
    }

    // ── mask_spans ────────────────────────────────────────────

    // The replacement marker embeds the category tag.
    #[test]
    fn mask_spans_produces_type_tagged_markers() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "email a@b.co please";
        let spans = d.detect(text);
        let masked = mask_spans(text, &spans);
        assert_eq!(masked, "email [REDACTED:email] please");
        Ok(())
    }

    #[test]
    fn mask_spans_preserves_text_without_pii() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "no pii here, just prose";
        let masked = mask_spans(text, &d.detect(text));
        assert_eq!(masked, text);
        Ok(())
    }

    #[test]
    fn mask_spans_handles_multiple_spans_in_order() -> TestResult {
        let d = BaselineDetector::new()?;
        let text = "a@b.co then c@d.co";
        let masked = mask_spans(text, &d.detect(text));
        assert_eq!(masked, "[REDACTED:email] then [REDACTED:email]");
        Ok(())
    }

    // ── mask_with ─────────────────────────────────────────────

    // The closure receives the span and the matched text, which lets a
    // caller pick a per-category replacement (here: `mask_pan` for
    // cards, a tagged marker otherwise).
    #[test]
    fn mask_with_supports_format_preserving_pan_mask() -> TestResult {
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        let text = "card 4111 1111 1111 1111 thanks";
        let spans = d.detect(text);
        let masked = mask_with(text, &spans, |span, matched| {
            if span.category == PiiCategory::CreditCard {
                mask_pan(matched)
            } else {
                format!("[REDACTED:{}]", span.category.as_tag())
            }
        });
        assert!(masked.contains("****-****-****-1111"), "got: {masked}");
        Ok(())
    }

    #[test]
    fn mask_with_skips_non_boundary_spans_silently() {
        // Multi-byte char at bytes 0..2 (é), span landing inside
        // that char should be skipped rather than panic.
        let text = "é abc";
        let spans = vec![PiiSpan::new(1, 3, PiiCategory::Email)];
        let masked = mask_with(text, &spans, |_, _| "X".to_owned());
        // Either unchanged or prefix kept — what matters is no panic.
        assert!(masked.contains("abc"));
    }

    #[test]
    fn mask_with_skips_span_with_valid_start_invalid_end() {
        // `span.start` is a char boundary (between 'a' and 'b') but `span.end`
        // lands inside the multi-byte 'é'. The span must be skipped without
        // duplicating the already-emitted prefix or leaking PII bytes.
        let text = "ab é"; // bytes: a=0 b=1 ' '=2 é=3..5
        let spans = vec![PiiSpan::new(1, 4, PiiCategory::Email)];
        let masked = mask_with(text, &spans, |_, _| "X".to_owned());
        assert_eq!(masked, "ab é");
    }

    // ── credit-card sliding window (PAN + trailing digits) ─────

    #[test]
    fn detects_pan_followed_by_trailing_digits() -> TestResult {
        // A valid 16-digit PAN followed by an amount: the greedy regex grabs
        // all 19 digits and a single Luhn check on the run fails. The real PAN
        // is a Luhn-valid sub-window and must still be masked, not leaked.
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        let text = "card 4111 1111 1111 1111 150";
        let spans = d.detect(text);
        let pan_spans = spans
            .iter()
            .filter(|s| s.category == PiiCategory::CreditCard)
            .count();
        assert_eq!(pan_spans, 1, "expected the embedded PAN: {spans:?}");
        let masked = mask_spans(text, &spans);
        assert!(
            !masked.contains("4111 1111 1111 1111"),
            "PAN leaked: {masked}"
        );
        assert!(masked.contains("[REDACTED:credit_card]"), "got: {masked}");
        Ok(())
    }

    #[test]
    fn sequential_filler_digits_do_not_false_positive() -> TestResult {
        // Sliding must not flag a coincidental interior Luhn-valid sub-window
        // in obviously-sequential filler that is not a PAN.
        let d = EntityDetector::new(CategorySet::none().with(DetectCategory::CreditCard))?;
        assert!(d.detect("order 1234 5678 9012 3456 processed").is_empty());
        Ok(())
    }

    #[test]
    fn entity_detector_clean_string_via_prefilter() -> TestResult {
        // The RegexSet prefilter short-circuits a string with no PII.
        let d = EntityDetector::baseline()?;
        assert!(
            d.detect("a perfectly ordinary sentence with no pii")
                .is_empty()
        );
        Ok(())
    }

    // ── dedup_overlapping: merge (no leaked tail) ──────────────

    #[test]
    fn dedup_merges_overlapping_tail_instead_of_dropping() {
        // A later span that starts inside the kept span but extends past its
        // end must be merged (covering interval), not dropped — otherwise the
        // tail bytes stay unmasked.
        let mut spans = vec![
            PiiSpan::new(0, 6, PiiCategory::Secret),
            PiiSpan::new(4, 12, PiiCategory::Email),
        ];
        dedup_overlapping(&mut spans);
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].start, 0);
        assert_eq!(spans[0].end, 12, "tail must be covered, not dropped");
        // The merged span keeps the category of the earlier span.
        assert_eq!(spans[0].category, PiiCategory::Secret);
    }

    #[test]
    fn mask_with_overlapping_spans_leaks_no_tail() {
        // Concrete leak case: a secret span overlapping an email span. The
        // overlapping tail ('@example.com') must not survive masking.
        let text = "Bearer abc123def.ana@example.com";
        // 'Bearer abc123def.ana' = bytes 0..20, 'ana@example.com' = 17..32.
        let spans = vec![
            PiiSpan::new(0, 20, PiiCategory::Secret),
            PiiSpan::new(17, 32, PiiCategory::Email),
        ];
        let masked = mask_spans(text, &spans);
        assert!(!masked.contains("@example.com"), "tail leaked: {masked}");
    }

    // ── mask_pan ──────────────────────────────────────────────

    // Space-separated, unseparated and hyphen-separated inputs all
    // produce the same hyphenated mask with the last four digits kept.
    #[test]
    fn mask_pan_keeps_last_four() {
        assert_eq!(mask_pan("4111 1111 1111 1111"), "****-****-****-1111");
        assert_eq!(mask_pan("4111111111111111"), "****-****-****-1111");
        assert_eq!(mask_pan("4111-1111-1111-1234"), "****-****-****-1234");
    }

    // Inputs with no digits or only two digits get the generic marker.
    #[test]
    fn mask_pan_falls_back_for_too_few_digits() {
        assert_eq!(mask_pan("abc"), "[REDACTED:credit_card]");
        assert_eq!(mask_pan("12"), "[REDACTED:credit_card]");
    }

    // ── Luhn, CPF, CNPJ validators (edge cases) ───────────────

    // A 10-digit and a 20-digit input are both rejected.
    #[test]
    fn luhn_rejects_wrong_length() {
        assert!(!luhn_is_valid("1234567890"));
        assert!(!luhn_is_valid("12345678901234567890"));
    }

    // The validator is called directly with both the punctuated and the
    // digits-only form of the same CPF.
    #[test]
    fn cpf_validator_accepts_known_good() {
        assert!(cpf_is_valid("111.444.777-35"));
        assert!(cpf_is_valid("11144477735"));
    }

    // The validator is called directly with both the punctuated and the
    // digits-only form of the same CNPJ.
    #[test]
    fn cnpj_validator_accepts_known_good() {
        assert!(cnpj_is_valid("11.222.333/0001-81"));
        assert!(cnpj_is_valid("11222333000181"));
    }
}