Skip to main content

llm_pii_redact/
lib.rs

1//! # llm-pii-redact
2//!
3//! Regex-based PII redaction for LLM prompts and tool outputs.
4//!
5//! Scans text for common PII (emails, phone numbers, SSNs, credit cards
6//! with Luhn validation, IPv4, IPv6, IBANs, URLs) and replaces each match
7//! with a stable placeholder like `<EMAIL_0>`. A mapping from placeholder
8//! to original value is returned so callers can [`reveal`] the original
9//! text after the LLM has responded.
10//!
11//! [`reveal`]: Redactor::reveal
12//!
13//! ## Quick example
14//!
15//! ```
16//! use llm_pii_redact::Redactor;
17//!
18//! let r = Redactor::default();
19//! let out = r.redact("Email me at ops@example.invalid or call 555-123-4567");
20//! assert!(!out.text.contains("ops@example.invalid"));
21//! assert!(!out.text.contains("555-123-4567"));
22//!
23//! let original = r.reveal(&out.text, &out.mapping);
24//! assert_eq!(original, "Email me at ops@example.invalid or call 555-123-4567");
25//! ```
26//!
27//! ## Custom patterns
28//!
29//! Start from an empty [`Redactor`] and add your own:
30//!
31//! ```
32//! use llm_pii_redact::Redactor;
33//!
34//! let r = Redactor::new()
35//!     .with_pattern("AWS_KEY", r"AKIA[0-9A-Z]{16}")
36//!     .unwrap();
37//! let out = r.redact("key=AKIAABCDEFGHIJKLMNOP ok");
38//! assert!(out.text.contains("<AWS_KEY_0>"));
39//! assert_eq!(out.mapping["<AWS_KEY_0>"], "AKIAABCDEFGHIJKLMNOP");
40//! ```
41//!
42//! Or take a built-in detector by itself:
43//!
44//! ```
45//! use llm_pii_redact::Redactor;
46//!
47//! let r = Redactor::email();
48//! let out = r.redact("ping ops@example.invalid and call 555-123-4567");
49//! assert!(out.text.contains("<EMAIL_0>"));
50//! assert!(out.text.contains("555-123-4567"));
51//! ```
52//!
53//! ## Companion crates
54//!
55//! - [`tool-secret-scrubber`](https://crates.io/crates/tool-secret-scrubber):
56//!   API keys, JWTs, bearer tokens, AWS keys. PII detectors live in this crate.
57
58#![deny(missing_docs)]
59
60use std::collections::HashMap;
61
62use regex::Regex;
63
/// Detector name for e-mail addresses; yields placeholders such as `<EMAIL_0>`.
pub const EMAIL: &str = "EMAIL";
/// Detector name for US phone numbers; yields placeholders such as `<PHONE_US_0>`.
pub const PHONE_US: &str = "PHONE_US";
/// Detector name for US Social Security numbers; yields placeholders such as `<SSN_0>`.
pub const SSN: &str = "SSN";
/// Detector name for Luhn-validated card numbers; yields placeholders such as
/// `<CREDIT_CARD_0>`.
pub const CREDIT_CARD: &str = "CREDIT_CARD";
/// Detector name for IPv4 addresses; yields placeholders such as `<IP_V4_0>`.
pub const IP_V4: &str = "IP_V4";
/// Detector name for IPv6 addresses; yields placeholders such as `<IP_V6_0>`.
pub const IP_V6: &str = "IP_V6";
/// Detector name for IBANs; yields placeholders such as `<IBAN_0>`.
pub const IBAN: &str = "IBAN";
/// Detector name for `http`/`https` URLs; yields placeholders such as `<URL_0>`.
pub const URL: &str = "URL";
80
// E-mail address delimited by word boundaries.
const EMAIL_RE: &str = concat!(
    r"\b[A-Za-z0-9._%+\-]+", // local part
    r"@",
    r"[A-Za-z0-9.\-]+",  // domain labels
    r"\.[A-Za-z]{2,}\b"  // alphabetic top-level domain, two letters or more
);

// The `regex` crate has no lookaround, so patterns that must not touch a
// neighbouring digit are wrapped in a consuming (?:^|\D) ... (?:\D|$) frame and
// the real value is captured as group 1. Detectors built from these patterns
// read group 1 instead of the whole match.
const PHONE_US_RE: &str = concat!(
    r"(?:^|\D)",                    // leading frame
    r"(",                           // group 1: the phone number itself
    r"(?:\+?1[\s.\-]?)?",           // optional country code
    r"(?:\(\d{3}\)|\d{3})[\s.\-]?", // area code, with or without parentheses
    r"\d{3}[\s.\-]?",               // exchange
    r"\d{4}",                       // line number
    r")",
    r"(?:\D|$)" // trailing frame
);

// Either the dashed 3-2-4 form or nine bare digits.
const SSN_RE: &str = r"\b(?:\d{3}-\d{2}-\d{4}|\d{9})\b";

// 13 to 19 digits, each optionally followed by a space or dash; same framing
// trick as the phone pattern, with the digits in group 1.
const CREDIT_CARD_RE: &str = concat!(
    r"(?:^|\D)",
    r"((?:\d[ \-]?){12,18}\d)",
    r"(?:\D|$)"
);

// Dotted quad where every octet is limited to 0-255.
const IP_V4_RE: &str = concat!(
    r"\b(?:(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}",
    r"(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\b"
);

// IPv6 address in group 1, framed so it cannot touch a word character or colon.
const IP_V6_RE: &str = concat!(
    r"(?:^|[^\w:])(",
    r"(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}", // all eight groups written out
    r"|(?:[A-Fa-f0-9]{1,4}:){1,7}:",             // leading groups, then a trailing `::`
    r"|:(?::[A-Fa-f0-9]{1,4}){1,7}",             // leading `::`, then groups
    r"|(?:[A-Fa-f0-9]{1,4}:){1,6}(?::[A-Fa-f0-9]{1,4}){1,6}", // `::` in the middle
    r")(?:[^\w:]|$)"
);

// Two-letter country code, two check digits, then 11-30 alphanumerics.
const IBAN_RE: &str = r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b";

// http/https URL running up to whitespace, an angle bracket, a quote or `)`.
const URL_RE: &str = r#"\bhttps?://[^\s<>"')]+"#;
109
/// A single PII match reported by [`Redactor::detect`].
///
/// `start..end` is the byte range of `value` inside the scanned text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Detection {
    /// Name of the detector that matched, e.g. `"EMAIL"`.
    pub kind: String,
    /// The matched text, copied out of the input.
    pub value: String,
    /// Byte offset of the first byte of the match (inclusive).
    pub start: usize,
    /// Byte offset just past the last byte of the match (exclusive).
    pub end: usize,
}
122
/// Output of [`Redactor::redact`]: the scrubbed text plus the key to undo it.
///
/// Every placeholder that appears in `text` (for example `"<EMAIL_0>"`) is a
/// key in `mapping` whose value is the original substring, which is what
/// [`Redactor::reveal`] needs to restore the input. A value that occurs more
/// than once for the same detector reuses one placeholder.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Redacted {
    /// The input with each detected span replaced by its placeholder.
    pub text: String,
    /// Lookup table from placeholder to the original value it stands for.
    pub mapping: HashMap<String, String>,
}
136
137/// One named detector: a label and its compiled regex.
138///
139/// For credit cards the matcher also runs a Luhn check; for phone numbers
140/// and credit cards the regex contains a capture group around the actual
141/// value because the `regex` crate has no lookaround.
142#[derive(Debug, Clone)]
143struct Detector {
144    name: String,
145    regex: Regex,
146    // Some patterns need to look at framing characters; group 1 holds the
147    // real match in that case. None means use the whole match.
148    capture_group: Option<usize>,
149    needs_luhn: bool,
150}
151
152impl Detector {
153    fn new(name: &str, pat: &str) -> Self {
154        Self {
155            name: name.to_string(),
156            regex: Regex::new(pat).expect("built-in pattern compiles"),
157            capture_group: None,
158            needs_luhn: false,
159        }
160    }
161}
162
163/// Configurable PII redactor.
164///
165/// Use [`Redactor::default`] for all built-in detectors, or [`Redactor::new`]
166/// for an empty redactor you build up with [`Redactor::with_pattern`].
167///
168/// Single-detector helpers ([`Redactor::email`], [`Redactor::phone`],
169/// [`Redactor::ssn`], [`Redactor::cc`], [`Redactor::ip`]) return a redactor
170/// configured for just that type.
171#[derive(Debug, Clone)]
172pub struct Redactor {
173    detectors: Vec<Detector>,
174}
175
176impl Default for Redactor {
177    /// All built-in detectors, registered in the order `EMAIL`, `PHONE_US`,
178    /// `SSN`, `CREDIT_CARD`, `IP_V4`, `IP_V6`, `IBAN`, `URL`. Registration
179    /// order matters on overlaps: the earlier detector wins.
180    fn default() -> Self {
181        Self {
182            detectors: default_detectors(),
183        }
184    }
185}
186
187fn default_detectors() -> Vec<Detector> {
188    vec![
189        Detector::new(EMAIL, EMAIL_RE),
190        Detector {
191            name: PHONE_US.to_string(),
192            regex: Regex::new(PHONE_US_RE).expect("phone pattern compiles"),
193            capture_group: Some(1),
194            needs_luhn: false,
195        },
196        Detector::new(SSN, SSN_RE),
197        Detector {
198            name: CREDIT_CARD.to_string(),
199            regex: Regex::new(CREDIT_CARD_RE).expect("cc pattern compiles"),
200            capture_group: Some(1),
201            needs_luhn: true,
202        },
203        Detector::new(IP_V4, IP_V4_RE),
204        Detector {
205            name: IP_V6.to_string(),
206            regex: Regex::new(IP_V6_RE).expect("ipv6 pattern compiles"),
207            capture_group: Some(1),
208            needs_luhn: false,
209        },
210        Detector::new(IBAN, IBAN_RE),
211        Detector::new(URL, URL_RE),
212    ]
213}
214
215impl Redactor {
216    /// Empty redactor with no detectors. Build it up with
217    /// [`Redactor::with_pattern`].
218    pub fn new() -> Self {
219        Self {
220            detectors: Vec::new(),
221        }
222    }
223
224    /// Register an additional named detector.
225    ///
226    /// `name` becomes the placeholder prefix (`<NAME_0>`, `<NAME_1>`, ...).
227    /// `pattern` is a `regex` crate-compatible regex source string.
228    ///
229    /// Returns the modified redactor on success. Returns
230    /// [`regex::Error`] if the pattern fails to compile.
231    ///
232    /// ```
233    /// use llm_pii_redact::Redactor;
234    ///
235    /// let r = Redactor::new()
236    ///     .with_pattern("AWS_KEY", r"AKIA[0-9A-Z]{16}")
237    ///     .unwrap();
238    /// let out = r.redact("AKIAABCDEFGHIJKLMNOP");
239    /// assert_eq!(out.mapping["<AWS_KEY_0>"], "AKIAABCDEFGHIJKLMNOP");
240    /// ```
241    pub fn with_pattern(mut self, name: &str, pattern: &str) -> Result<Self, regex::Error> {
242        if name.is_empty() {
243            return Err(regex::Error::Syntax("name must be non-empty".into()));
244        }
245        let regex = Regex::new(pattern)?;
246        self.detectors.push(Detector {
247            name: name.to_string(),
248            regex,
249            capture_group: None,
250            needs_luhn: false,
251        });
252        Ok(self)
253    }
254
255    /// Redactor with only the EMAIL detector.
256    pub fn email() -> Self {
257        Self {
258            detectors: vec![Detector::new(EMAIL, EMAIL_RE)],
259        }
260    }
261
262    /// Redactor with only the PHONE_US detector.
263    pub fn phone() -> Self {
264        Self {
265            detectors: vec![Detector {
266                name: PHONE_US.to_string(),
267                regex: Regex::new(PHONE_US_RE).expect("phone pattern compiles"),
268                capture_group: Some(1),
269                needs_luhn: false,
270            }],
271        }
272    }
273
274    /// Redactor with only the SSN detector.
275    pub fn ssn() -> Self {
276        Self {
277            detectors: vec![Detector::new(SSN, SSN_RE)],
278        }
279    }
280
281    /// Redactor with only the CREDIT_CARD detector. Matches pass the Luhn
282    /// checksum.
283    pub fn cc() -> Self {
284        Self {
285            detectors: vec![Detector {
286                name: CREDIT_CARD.to_string(),
287                regex: Regex::new(CREDIT_CARD_RE).expect("cc pattern compiles"),
288                capture_group: Some(1),
289                needs_luhn: true,
290            }],
291        }
292    }
293
294    /// Redactor with both IPv4 and IPv6 detectors.
295    pub fn ip() -> Self {
296        Self {
297            detectors: vec![
298                Detector::new(IP_V4, IP_V4_RE),
299                Detector {
300                    name: IP_V6.to_string(),
301                    regex: Regex::new(IP_V6_RE).expect("ipv6 pattern compiles"),
302                    capture_group: Some(1),
303                    needs_luhn: false,
304                },
305            ],
306        }
307    }
308
309    /// Names of the registered detectors, in registration order.
310    pub fn detector_names(&self) -> Vec<&str> {
311        self.detectors.iter().map(|d| d.name.as_str()).collect()
312    }
313
314    /// Return every PII match in `text` without modifying it.
315    ///
316    /// Matches are returned in document order. When two enabled detectors
317    /// overlap on the same span, the detector registered first wins so the
318    /// result is unambiguous.
319    pub fn detect(&self, text: &str) -> Vec<Detection> {
320        if text.is_empty() {
321            return Vec::new();
322        }
323        let mut raw: Vec<Detection> = Vec::new();
324        for det in &self.detectors {
325            for caps in det.regex.captures_iter(text) {
326                let m = match det.capture_group {
327                    Some(idx) => match caps.get(idx) {
328                        Some(m) => m,
329                        None => continue,
330                    },
331                    None => caps.get(0).expect("group 0 always present"),
332                };
333                let value = m.as_str();
334                if det.needs_luhn && !luhn_ok(value) {
335                    continue;
336                }
337                raw.push(Detection {
338                    kind: det.name.clone(),
339                    value: value.to_string(),
340                    start: m.start(),
341                    end: m.end(),
342                });
343            }
344        }
345        raw.sort_by_key(|d| (d.start, d.end));
346
347        // Earlier match wins on overlap. With equal start, the shorter range
348        // would come first in the sort; that is fine because the Python lib
349        // accepts a candidate when `start >= last_end`.
350        let mut accepted: Vec<Detection> = Vec::new();
351        let mut last_end: usize = 0;
352        let mut have_one = false;
353        for d in raw {
354            if !have_one || d.start >= last_end {
355                last_end = d.end;
356                have_one = true;
357                accepted.push(d);
358            }
359        }
360        accepted
361    }
362
363    /// Replace each detected PII span with a stable placeholder.
364    ///
365    /// Repeated values share a placeholder so the output is deterministic.
366    /// The returned [`Redacted::mapping`] lets [`Redactor::reveal`] restore
367    /// the original text.
368    pub fn redact(&self, text: &str) -> Redacted {
369        let detections = self.detect(text);
370        if detections.is_empty() {
371            return Redacted {
372                text: text.to_string(),
373                mapping: HashMap::new(),
374            };
375        }
376
377        let mut per_type_index: HashMap<String, usize> = HashMap::new();
378        let mut value_to_placeholder: HashMap<(String, String), String> = HashMap::new();
379        let mut mapping: HashMap<String, String> = HashMap::new();
380
381        for d in &detections {
382            let key = (d.kind.clone(), d.value.clone());
383            if !value_to_placeholder.contains_key(&key) {
384                let idx = per_type_index.entry(d.kind.clone()).or_insert(0);
385                let placeholder = format!("<{}_{}>", d.kind, *idx);
386                *idx += 1;
387                value_to_placeholder.insert(key.clone(), placeholder.clone());
388                mapping.insert(placeholder, d.value.clone());
389            }
390        }
391
392        let mut out = String::with_capacity(text.len());
393        let mut cursor = 0usize;
394        for d in &detections {
395            out.push_str(&text[cursor..d.start]);
396            let key = (d.kind.clone(), d.value.clone());
397            out.push_str(&value_to_placeholder[&key]);
398            cursor = d.end;
399        }
400        out.push_str(&text[cursor..]);
401
402        Redacted { text: out, mapping }
403    }
404
405    /// Reverse a [`Redactor::redact`] call by substituting placeholders
406    /// back to their original values.
407    ///
408    /// The mapping is applied longest-key-first to keep `<EMAIL_10>` from
409    /// colliding with `<EMAIL_1>`. Unknown placeholders are left alone.
410    pub fn reveal(&self, text: &str, mapping: &HashMap<String, String>) -> String {
411        if mapping.is_empty() {
412            return text.to_string();
413        }
414        let mut keys: Vec<&String> = mapping.keys().collect();
415        keys.sort_by(|a, b| b.len().cmp(&a.len()));
416        let mut out = text.to_string();
417        for k in keys {
418            if out.contains(k.as_str()) {
419                out = out.replace(k.as_str(), &mapping[k]);
420            }
421        }
422        out
423    }
424}
425
/// Luhn checksum test for credit-card candidates.
///
/// Only the decimal digits of `s` are considered; separators such as spaces
/// and dashes are skipped. Returns `true` when there are between 13 and 19
/// digits and their Luhn sum is a multiple of ten.
fn luhn_ok(s: &str) -> bool {
    let digits = || s.chars().filter_map(|c| c.to_digit(10));

    // Reject implausible lengths before doing any arithmetic.
    let count = digits().count();
    if !(13..=19).contains(&count) {
        return false;
    }

    // Walking from the rightmost digit, every second digit is doubled and
    // reduced by nine when the doubling overflows a single digit.
    let checksum: u32 = digits()
        .rev()
        .enumerate()
        .map(|(pos, digit)| match (pos % 2, digit * 2) {
            (1, doubled) if doubled > 9 => doubled - 9,
            (1, doubled) => doubled,
            _ => digit,
        })
        .sum();
    checksum % 10 == 0
}
446
#[cfg(test)]
mod luhn_tests {
    use super::luhn_ok;

    /// The widely used 16-digit Visa test number has a valid checksum.
    #[test]
    fn known_valid_visa_passes() {
        let visa_test_number = "4111111111111111";
        assert!(luhn_ok(visa_test_number));
    }

    /// Changing only the check digit must break the checksum.
    #[test]
    fn flipped_last_digit_fails() {
        let corrupted = "4111111111111112";
        assert!(!luhn_ok(corrupted));
    }

    /// Fewer than 13 digits is rejected regardless of the checksum.
    #[test]
    fn too_short_fails() {
        let six_digits = "411111";
        assert!(!luhn_ok(six_digits));
    }

    /// More than 19 digits is rejected regardless of the checksum.
    #[test]
    fn too_long_fails() {
        let twenty_digits = "41111111111111111111";
        assert!(!luhn_ok(twenty_digits));
    }

    /// Separators are skipped; only the digits take part in the check.
    #[test]
    fn ignores_non_digits() {
        let dashed = "4111-1111-1111-1111";
        assert!(luhn_ok(dashed));
    }
}
475}