llm_pii_redact/lib.rs
1//! # llm-pii-redact
2//!
3//! Regex-based PII redaction for LLM prompts and tool outputs.
4//!
5//! Scans text for common PII (emails, phone numbers, SSNs, credit cards
6//! with Luhn validation, IPv4, IPv6, IBANs, URLs) and replaces each match
7//! with a stable placeholder like `<EMAIL_0>`. A mapping from placeholder
8//! to original value is returned so callers can [`reveal`] the original
9//! text after the LLM has responded.
10//!
11//! [`reveal`]: Redactor::reveal
12//!
13//! ## Quick example
14//!
15//! ```
16//! use llm_pii_redact::Redactor;
17//!
18//! let r = Redactor::default();
19//! let out = r.redact("Email me at ops@example.invalid or call 555-123-4567");
20//! assert!(!out.text.contains("ops@example.invalid"));
21//! assert!(!out.text.contains("555-123-4567"));
22//!
23//! let original = r.reveal(&out.text, &out.mapping);
24//! assert_eq!(original, "Email me at ops@example.invalid or call 555-123-4567");
25//! ```
26//!
27//! ## Custom patterns
28//!
29//! Start from an empty [`Redactor`] and add your own:
30//!
31//! ```
32//! use llm_pii_redact::Redactor;
33//!
34//! let r = Redactor::new()
35//! .with_pattern("AWS_KEY", r"AKIA[0-9A-Z]{16}")
36//! .unwrap();
37//! let out = r.redact("key=AKIAABCDEFGHIJKLMNOP ok");
38//! assert!(out.text.contains("<AWS_KEY_0>"));
39//! assert_eq!(out.mapping["<AWS_KEY_0>"], "AKIAABCDEFGHIJKLMNOP");
40//! ```
41//!
42//! Or take a built-in detector by itself:
43//!
44//! ```
45//! use llm_pii_redact::Redactor;
46//!
47//! let r = Redactor::email();
48//! let out = r.redact("ping ops@example.invalid and call 555-123-4567");
49//! assert!(out.text.contains("<EMAIL_0>"));
50//! assert!(out.text.contains("555-123-4567"));
51//! ```
52//!
53//! ## Companion crates
54//!
55//! - [`tool-secret-scrubber`](https://crates.io/crates/tool-secret-scrubber):
56//! API keys, JWTs, bearer tokens, AWS keys. PII detectors live here.
57
58#![deny(missing_docs)]
59
60use std::collections::HashMap;
61
62use regex::Regex;
63
/// Built-in PII type label `"EMAIL"`, used as the detector name for
/// e-mail addresses.
pub const EMAIL: &str = "EMAIL";
/// Built-in PII type label `"PHONE_US"`, used as the detector name for
/// US-style phone numbers.
pub const PHONE_US: &str = "PHONE_US";
/// Built-in PII type label `"SSN"`, used as the detector name for US
/// Social Security numbers.
pub const SSN: &str = "SSN";
/// Built-in PII type label `"CREDIT_CARD"`, used as the detector name for
/// payment card numbers.
pub const CREDIT_CARD: &str = "CREDIT_CARD";
/// Built-in PII type label `"IP_V4"`, used as the detector name for IPv4
/// addresses.
pub const IP_V4: &str = "IP_V4";
/// Built-in PII type label `"IP_V6"`, used as the detector name for IPv6
/// addresses.
pub const IP_V6: &str = "IP_V6";
/// Built-in PII type label `"IBAN"`, used as the detector name for
/// international bank account numbers.
pub const IBAN: &str = "IBAN";
/// Built-in PII type label `"URL"`, used as the detector name for
/// `http`/`https` URLs.
pub const URL: &str = "URL";
80
// E-mail: local part, `@`, dotted domain, and a final label of two or more
// ASCII letters, delimited by word boundaries.
const EMAIL_RE: &str = r"\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b";
// The `regex` crate has no lookaround, so the number is surrounded with a
// (?:^|\D) / (?:\D|$) frame and the number itself is captured as group 1.
// The frame characters are part of the overall match but not of the value.
//
// Shape: optional `+1`/`1` country code, a three-digit area code with or
// without parentheses, then 3 + 4 digits; each part may be separated by a
// single whitespace, dot or hyphen.
const PHONE_US_RE: &str =
    r"(?:^|\D)((?:\+?1[\s.\-]?)?(?:\(\d{3}\)|\d{3})[\s.\-]?\d{3}[\s.\-]?\d{4})(?:\D|$)";
// SSN: either `ddd-dd-dddd` or nine consecutive digits, delimited by word
// boundaries. No range validation is done, so any bare nine-digit number
// matches.
const SSN_RE: &str = r"\b(?:\d{3}-\d{2}-\d{4}|\d{9})\b";
// Card number candidate: 13 to 19 digits where each digit may be followed
// by one space or hyphen. Framed like the phone pattern; group 1 holds the
// candidate. The pattern alone does not validate the checksum.
const CREDIT_CARD_RE: &str = r"(?:^|\D)((?:\d[ \-]?){12,18}\d)(?:\D|$)";
// IPv4: four dot-separated octets, each restricted to 0..=255.
const IP_V4_RE: &str = concat!(
    r"\b",
    r"(?:(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}",
    r"(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)",
    r"\b"
);
// IPv6: framed by a character that is neither a word character nor `:`
// (or by the text edge); group 1 holds the address. Alternatives, in order:
//   1. eight full hex groups;
//   2. one to seven groups followed by a trailing `::`;
//   3. a leading `::` followed by one to seven groups;
//   4. groups on both sides of a `::`.
const IP_V6_RE: &str = concat!(
    r"(?:^|[^\w:])",
    r"(",
    r"(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}",
    r"|",
    r"(?:[A-Fa-f0-9]{1,4}:){1,7}:",
    r"|",
    r":(?::[A-Fa-f0-9]{1,4}){1,7}",
    r"|",
    r"(?:[A-Fa-f0-9]{1,4}:){1,6}(?::[A-Fa-f0-9]{1,4}){1,6}",
    r")",
    r"(?:[^\w:]|$)"
);
// IBAN: two uppercase letters, two digits, then 11 to 30 uppercase
// alphanumerics. Purely structural; the pattern does no checksum check.
const IBAN_RE: &str = r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b";
// URL: `http://` or `https://` followed by everything up to the next
// whitespace, `<`, `>`, `"`, `'` or `)`.
const URL_RE: &str = r#"\bhttps?://[^\s<>"')]+"#;
109
/// One detected PII span.
///
/// `start..end` is a byte range into the text that was scanned, and
/// `value` is the text found in that range.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Detection {
    /// Detector name (e.g. `"EMAIL"`) that produced the match.
    pub kind: String,
    /// Matched substring.
    pub value: String,
    /// Inclusive byte offset where the match starts in the input.
    pub start: usize,
    /// Exclusive byte offset where the match ends in the input.
    pub end: usize,
}
122
/// Result of [`Redactor::redact`].
///
/// `text` is the redacted output. `mapping` sends each placeholder
/// (e.g. `"<EMAIL_0>"`) back to its original value so [`Redactor::reveal`]
/// can restore the input. Repeated values share a single placeholder.
///
/// With the `serde` feature enabled the type also derives
/// `serde::Serialize` and `serde::Deserialize`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Redacted {
    /// Redacted text with placeholders substituted in.
    pub text: String,
    /// Placeholder to original value.
    pub mapping: HashMap<String, String>,
}
136
/// One named detector: a label and its compiled regex.
///
/// For credit cards the matcher also runs a Luhn check; for phone numbers
/// and credit cards the regex contains a capture group around the actual
/// value because the `regex` crate has no lookaround.
#[derive(Debug, Clone)]
struct Detector {
    // Label reported as `Detection::kind` and used as the placeholder prefix.
    name: String,
    // Compiled pattern that is run over the input text.
    regex: Regex,
    // Some patterns need to look at framing characters; group 1 holds the
    // real match in that case. None means use the whole match.
    capture_group: Option<usize>,
    // When true, a candidate is kept only if it passes the Luhn checksum.
    needs_luhn: bool,
}
151
152impl Detector {
153 fn new(name: &str, pat: &str) -> Self {
154 Self {
155 name: name.to_string(),
156 regex: Regex::new(pat).expect("built-in pattern compiles"),
157 capture_group: None,
158 needs_luhn: false,
159 }
160 }
161}
162
/// Configurable PII redactor.
///
/// Use [`Redactor::default`] for all built-in detectors, or [`Redactor::new`]
/// for an empty redactor you build up with [`Redactor::with_pattern`].
///
/// Single-detector helpers ([`Redactor::email`], [`Redactor::phone`],
/// [`Redactor::ssn`], [`Redactor::cc`], [`Redactor::ip`]) return a redactor
/// configured for just that type.
#[derive(Debug, Clone)]
pub struct Redactor {
    // Registered detectors, in registration order.
    detectors: Vec<Detector>,
}
175
176impl Default for Redactor {
177 /// All built-in detectors, registered in the order `EMAIL`, `PHONE_US`,
178 /// `SSN`, `CREDIT_CARD`, `IP_V4`, `IP_V6`, `IBAN`, `URL`. Registration
179 /// order matters on overlaps: the earlier detector wins.
180 fn default() -> Self {
181 Self {
182 detectors: default_detectors(),
183 }
184 }
185}
186
187fn default_detectors() -> Vec<Detector> {
188 vec![
189 Detector::new(EMAIL, EMAIL_RE),
190 Detector {
191 name: PHONE_US.to_string(),
192 regex: Regex::new(PHONE_US_RE).expect("phone pattern compiles"),
193 capture_group: Some(1),
194 needs_luhn: false,
195 },
196 Detector::new(SSN, SSN_RE),
197 Detector {
198 name: CREDIT_CARD.to_string(),
199 regex: Regex::new(CREDIT_CARD_RE).expect("cc pattern compiles"),
200 capture_group: Some(1),
201 needs_luhn: true,
202 },
203 Detector::new(IP_V4, IP_V4_RE),
204 Detector {
205 name: IP_V6.to_string(),
206 regex: Regex::new(IP_V6_RE).expect("ipv6 pattern compiles"),
207 capture_group: Some(1),
208 needs_luhn: false,
209 },
210 Detector::new(IBAN, IBAN_RE),
211 Detector::new(URL, URL_RE),
212 ]
213}
214
215impl Redactor {
216 /// Empty redactor with no detectors. Build it up with
217 /// [`Redactor::with_pattern`].
218 pub fn new() -> Self {
219 Self {
220 detectors: Vec::new(),
221 }
222 }
223
224 /// Register an additional named detector.
225 ///
226 /// `name` becomes the placeholder prefix (`<NAME_0>`, `<NAME_1>`, ...).
227 /// `pattern` is a `regex` crate-compatible regex source string.
228 ///
229 /// Returns the modified redactor on success. Returns
230 /// [`regex::Error`] if the pattern fails to compile.
231 ///
232 /// ```
233 /// use llm_pii_redact::Redactor;
234 ///
235 /// let r = Redactor::new()
236 /// .with_pattern("AWS_KEY", r"AKIA[0-9A-Z]{16}")
237 /// .unwrap();
238 /// let out = r.redact("AKIAABCDEFGHIJKLMNOP");
239 /// assert_eq!(out.mapping["<AWS_KEY_0>"], "AKIAABCDEFGHIJKLMNOP");
240 /// ```
241 pub fn with_pattern(mut self, name: &str, pattern: &str) -> Result<Self, regex::Error> {
242 if name.is_empty() {
243 return Err(regex::Error::Syntax("name must be non-empty".into()));
244 }
245 let regex = Regex::new(pattern)?;
246 self.detectors.push(Detector {
247 name: name.to_string(),
248 regex,
249 capture_group: None,
250 needs_luhn: false,
251 });
252 Ok(self)
253 }
254
255 /// Redactor with only the EMAIL detector.
256 pub fn email() -> Self {
257 Self {
258 detectors: vec![Detector::new(EMAIL, EMAIL_RE)],
259 }
260 }
261
262 /// Redactor with only the PHONE_US detector.
263 pub fn phone() -> Self {
264 Self {
265 detectors: vec![Detector {
266 name: PHONE_US.to_string(),
267 regex: Regex::new(PHONE_US_RE).expect("phone pattern compiles"),
268 capture_group: Some(1),
269 needs_luhn: false,
270 }],
271 }
272 }
273
274 /// Redactor with only the SSN detector.
275 pub fn ssn() -> Self {
276 Self {
277 detectors: vec![Detector::new(SSN, SSN_RE)],
278 }
279 }
280
281 /// Redactor with only the CREDIT_CARD detector. Matches pass the Luhn
282 /// checksum.
283 pub fn cc() -> Self {
284 Self {
285 detectors: vec![Detector {
286 name: CREDIT_CARD.to_string(),
287 regex: Regex::new(CREDIT_CARD_RE).expect("cc pattern compiles"),
288 capture_group: Some(1),
289 needs_luhn: true,
290 }],
291 }
292 }
293
294 /// Redactor with both IPv4 and IPv6 detectors.
295 pub fn ip() -> Self {
296 Self {
297 detectors: vec![
298 Detector::new(IP_V4, IP_V4_RE),
299 Detector {
300 name: IP_V6.to_string(),
301 regex: Regex::new(IP_V6_RE).expect("ipv6 pattern compiles"),
302 capture_group: Some(1),
303 needs_luhn: false,
304 },
305 ],
306 }
307 }
308
309 /// Names of the registered detectors, in registration order.
310 pub fn detector_names(&self) -> Vec<&str> {
311 self.detectors.iter().map(|d| d.name.as_str()).collect()
312 }
313
314 /// Return every PII match in `text` without modifying it.
315 ///
316 /// Matches are returned in document order. When two enabled detectors
317 /// overlap on the same span, the detector registered first wins so the
318 /// result is unambiguous.
319 pub fn detect(&self, text: &str) -> Vec<Detection> {
320 if text.is_empty() {
321 return Vec::new();
322 }
323 let mut raw: Vec<Detection> = Vec::new();
324 for det in &self.detectors {
325 for caps in det.regex.captures_iter(text) {
326 let m = match det.capture_group {
327 Some(idx) => match caps.get(idx) {
328 Some(m) => m,
329 None => continue,
330 },
331 None => caps.get(0).expect("group 0 always present"),
332 };
333 let value = m.as_str();
334 if det.needs_luhn && !luhn_ok(value) {
335 continue;
336 }
337 raw.push(Detection {
338 kind: det.name.clone(),
339 value: value.to_string(),
340 start: m.start(),
341 end: m.end(),
342 });
343 }
344 }
345 raw.sort_by_key(|d| (d.start, d.end));
346
347 // Earlier match wins on overlap. With equal start, the shorter range
348 // would come first in the sort; that is fine because the Python lib
349 // accepts a candidate when `start >= last_end`.
350 let mut accepted: Vec<Detection> = Vec::new();
351 let mut last_end: usize = 0;
352 let mut have_one = false;
353 for d in raw {
354 if !have_one || d.start >= last_end {
355 last_end = d.end;
356 have_one = true;
357 accepted.push(d);
358 }
359 }
360 accepted
361 }
362
363 /// Replace each detected PII span with a stable placeholder.
364 ///
365 /// Repeated values share a placeholder so the output is deterministic.
366 /// The returned [`Redacted::mapping`] lets [`Redactor::reveal`] restore
367 /// the original text.
368 pub fn redact(&self, text: &str) -> Redacted {
369 let detections = self.detect(text);
370 if detections.is_empty() {
371 return Redacted {
372 text: text.to_string(),
373 mapping: HashMap::new(),
374 };
375 }
376
377 let mut per_type_index: HashMap<String, usize> = HashMap::new();
378 let mut value_to_placeholder: HashMap<(String, String), String> = HashMap::new();
379 let mut mapping: HashMap<String, String> = HashMap::new();
380
381 for d in &detections {
382 let key = (d.kind.clone(), d.value.clone());
383 if !value_to_placeholder.contains_key(&key) {
384 let idx = per_type_index.entry(d.kind.clone()).or_insert(0);
385 let placeholder = format!("<{}_{}>", d.kind, *idx);
386 *idx += 1;
387 value_to_placeholder.insert(key.clone(), placeholder.clone());
388 mapping.insert(placeholder, d.value.clone());
389 }
390 }
391
392 let mut out = String::with_capacity(text.len());
393 let mut cursor = 0usize;
394 for d in &detections {
395 out.push_str(&text[cursor..d.start]);
396 let key = (d.kind.clone(), d.value.clone());
397 out.push_str(&value_to_placeholder[&key]);
398 cursor = d.end;
399 }
400 out.push_str(&text[cursor..]);
401
402 Redacted { text: out, mapping }
403 }
404
405 /// Reverse a [`Redactor::redact`] call by substituting placeholders
406 /// back to their original values.
407 ///
408 /// The mapping is applied longest-key-first to keep `<EMAIL_10>` from
409 /// colliding with `<EMAIL_1>`. Unknown placeholders are left alone.
410 pub fn reveal(&self, text: &str, mapping: &HashMap<String, String>) -> String {
411 if mapping.is_empty() {
412 return text.to_string();
413 }
414 let mut keys: Vec<&String> = mapping.keys().collect();
415 keys.sort_by(|a, b| b.len().cmp(&a.len()));
416 let mut out = text.to_string();
417 for k in keys {
418 if out.contains(k.as_str()) {
419 out = out.replace(k.as_str(), &mapping[k]);
420 }
421 }
422 out
423 }
424}
425
/// Luhn checksum test over the decimal digits found in `s`.
///
/// Every character that is not a decimal digit (spaces, hyphens, ...) is
/// skipped. Returns `false` unless between 13 and 19 digits remain, and
/// otherwise `true` exactly when the Luhn sum of those digits is a
/// multiple of 10.
fn luhn_ok(s: &str) -> bool {
    let digits: Vec<u32> = s.chars().filter_map(|ch| ch.to_digit(10)).collect();
    if !(13..=19).contains(&digits.len()) {
        return false;
    }
    // Walk from the rightmost digit; every second digit is doubled and, if
    // the result has two digits, reduced by 9 (same as summing its digits).
    let checksum: u32 = digits
        .iter()
        .rev()
        .enumerate()
        .map(|(pos, &digit)| {
            if pos % 2 == 0 {
                digit
            } else {
                let doubled = digit * 2;
                if doubled > 9 {
                    doubled - 9
                } else {
                    doubled
                }
            }
        })
        .sum();
    checksum % 10 == 0
}
446
// Unit tests for the private `luhn_ok` helper.
#[cfg(test)]
mod luhn_tests {
    use super::luhn_ok;

    // A 16-digit number with a valid check digit is accepted.
    #[test]
    fn known_valid_visa_passes() {
        assert!(luhn_ok("4111111111111111"));
    }

    // Changing only the check digit breaks the checksum.
    #[test]
    fn flipped_last_digit_fails() {
        assert!(!luhn_ok("4111111111111112"));
    }

    // Fewer than 13 digits is rejected before the checksum is computed.
    #[test]
    fn too_short_fails() {
        assert!(!luhn_ok("411111"));
    }

    // More than 19 digits (20 here) is rejected as well.
    #[test]
    fn too_long_fails() {
        assert!(!luhn_ok("41111111111111111111"));
    }

    // Separators are skipped; only the digits take part in the check.
    #[test]
    fn ignores_non_digits() {
        assert!(luhn_ok("4111-1111-1111-1111"));
    }
}