Skip to main content

regex_pii_rs/
lib.rs

1//! # regex-pii-rs
2//!
3//! Detect and redact common PII (emails, NA-format phones, US SSNs,
4//! credit-card-shaped numbers, prefixed API keys) without pulling in
5//! the `regex` crate. Hand-rolled scanners, zero deps.
6//!
7//! ## Example
8//!
9//! ```
10//! use regex_pii_rs::{find, redact};
11//! let s = "Contact jane.doe@example.com or 555-123-4567.";
12//! let hits = find(s);
13//! assert!(hits.iter().any(|f| f.kind == "email"));
14//! assert!(!redact(s).contains("jane.doe"));
15//! ```
16
17#![deny(missing_docs)]
18
/// One detection reported by a scanner.
///
/// Each category is scanned independently, so two findings from different
/// categories may cover overlapping byte ranges of the same input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Finding {
    /// Category: `email`, `phone`, `ssn`, `credit_card`, `api_key`.
    pub kind: &'static str,
    /// Matched text, copied out of the input. For `credit_card` findings any
    /// trailing space/dash separators are trimmed from the value.
    pub value: String,
    /// Byte offset in the input at which the match starts.
    pub byte_pos: usize,
}
29
30/// Return every detection in `s`, sorted by position.
31pub fn find(s: &str) -> Vec<Finding> {
32    let mut out = Vec::new();
33    out.extend(scan_emails(s));
34    out.extend(scan_phones(s));
35    out.extend(scan_ssns(s));
36    out.extend(scan_cards(s));
37    out.extend(scan_api_keys(s));
38    out.sort_by_key(|f| f.byte_pos);
39    out
40}
41
42/// Replace every finding with `[REDACTED:<kind>]`.
43pub fn redact(s: &str) -> String {
44    let findings = find(s);
45    if findings.is_empty() {
46        return s.to_string();
47    }
48    let mut out = String::with_capacity(s.len());
49    let mut cursor = 0;
50    for f in &findings {
51        if f.byte_pos < cursor {
52            continue; // overlap (e.g. credit_card sub-matches phone)
53        }
54        out.push_str(&s[cursor..f.byte_pos]);
55        out.push_str(&format!("[REDACTED:{}]", f.kind));
56        cursor = f.byte_pos + f.value.len();
57    }
58    out.push_str(&s[cursor..]);
59    out
60}
61
62// --- per-kind scanners ---------------------------------------------------
63
64fn scan_emails(s: &str) -> Vec<Finding> {
65    let mut out = Vec::new();
66    let bytes = s.as_bytes();
67    for (i, &b) in bytes.iter().enumerate() {
68        if b == b'@' {
69            // walk left for local-part
70            let mut start = i;
71            while start > 0 && is_email_local(bytes[start - 1]) {
72                start -= 1;
73            }
74            // walk right for domain
75            let mut end = i + 1;
76            while end < bytes.len() && is_email_domain(bytes[end]) {
77                end += 1;
78            }
79            if start < i && end > i + 1 && s[i + 1..end].contains('.') {
80                out.push(Finding {
81                    kind: "email",
82                    value: s[start..end].to_string(),
83                    byte_pos: start,
84                });
85            }
86        }
87    }
88    out
89}
90
/// True when `c` is accepted in the local part (left of `@`): ASCII letters
/// and digits plus `.`, `_`, `%`, `+`, `-`.
fn is_email_local(c: u8) -> bool {
    match c {
        b'.' | b'_' | b'%' | b'+' | b'-' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
/// True when `c` is accepted in the domain part (right of `@`): ASCII letters
/// and digits plus `.` and `-`.
fn is_email_domain(c: u8) -> bool {
    match c {
        b'.' | b'-' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
97
98fn scan_phones(s: &str) -> Vec<Finding> {
99    // Matches `(NNN) NNN-NNNN`, `NNN-NNN-NNNN`, `NNN.NNN.NNNN`,
100    // `+1 NNN-NNN-NNNN`. Implemented as a small state machine over
101    // digit-or-separator tokens.
102    let bytes = s.as_bytes();
103    let mut out = Vec::new();
104    let mut i = 0;
105    while i < bytes.len() {
106        let start = i;
107        let digit_chunk = |i: usize| {
108            let mut j = i;
109            while j < bytes.len() && bytes[j].is_ascii_digit() {
110                j += 1;
111            }
112            j - i
113        };
114        // Optional +1
115        if bytes[i] == b'+' && i + 1 < bytes.len() && bytes[i + 1] == b'1' {
116            i += 2;
117            while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'-' || bytes[i] == b'.') {
118                i += 1;
119            }
120        }
121        // Optional `(NNN) `
122        if i < bytes.len() && bytes[i] == b'(' && i + 4 < bytes.len() && bytes[i + 4] == b')' {
123            let in_paren = &bytes[i + 1..i + 4];
124            if in_paren.iter().all(|b| b.is_ascii_digit()) {
125                i += 5;
126                while i < bytes.len() && (bytes[i] == b' ') {
127                    i += 1;
128                }
129                let mid = digit_chunk(i);
130                if mid == 3
131                    && i + 3 < bytes.len()
132                    && matches!(bytes[i + 3], b'-' | b'.' | b' ')
133                {
134                    let last_start = i + 4;
135                    if digit_chunk(last_start) == 4 {
136                        out.push(Finding {
137                            kind: "phone",
138                            value: s[start..last_start + 4].to_string(),
139                            byte_pos: start,
140                        });
141                        i = last_start + 4;
142                        continue;
143                    }
144                }
145            }
146        }
147        // `NNN-NNN-NNNN` or `NNN.NNN.NNNN`
148        if digit_chunk(i) == 3
149            && i + 3 < bytes.len()
150            && matches!(bytes[i + 3], b'-' | b'.')
151        {
152            let sep = bytes[i + 3];
153            let mid_start = i + 4;
154            if digit_chunk(mid_start) == 3
155                && mid_start + 3 < bytes.len()
156                && bytes[mid_start + 3] == sep
157            {
158                let last_start = mid_start + 4;
159                if digit_chunk(last_start) == 4 {
160                    out.push(Finding {
161                        kind: "phone",
162                        value: s[start..last_start + 4].to_string(),
163                        byte_pos: start,
164                    });
165                    i = last_start + 4;
166                    continue;
167                }
168            }
169        }
170        i = start + 1;
171    }
172    out
173}
174
175fn scan_ssns(s: &str) -> Vec<Finding> {
176    let bytes = s.as_bytes();
177    let mut out = Vec::new();
178    let mut i = 0;
179    while i + 11 <= bytes.len() {
180        let slice = &bytes[i..i + 11];
181        if slice.iter().enumerate().all(|(k, c)| match k {
182            3 | 6 => *c == b'-',
183            _ => c.is_ascii_digit(),
184        }) {
185            // Boundary check: avoid taking part of a longer digit run.
186            let left_ok = i == 0 || !bytes[i - 1].is_ascii_digit();
187            let right_ok = i + 11 == bytes.len() || !bytes[i + 11].is_ascii_digit();
188            if left_ok && right_ok {
189                out.push(Finding {
190                    kind: "ssn",
191                    value: s[i..i + 11].to_string(),
192                    byte_pos: i,
193                });
194                i += 11;
195                continue;
196            }
197        }
198        i += 1;
199    }
200    out
201}
202
203fn scan_cards(s: &str) -> Vec<Finding> {
204    // 13–19 digits, optionally separated by spaces or dashes. We don't
205    // Luhn-check (false positives on phone-like sequences would be
206    // worse than missing a few rejects).
207    let bytes = s.as_bytes();
208    let mut out = Vec::new();
209    let mut i = 0;
210    while i < bytes.len() {
211        let start = i;
212        let mut digits = 0;
213        let mut seps = 0;
214        while i < bytes.len() {
215            if bytes[i].is_ascii_digit() {
216                digits += 1;
217                i += 1;
218            } else if matches!(bytes[i], b' ' | b'-') && digits > 0 {
219                seps += 1;
220                i += 1;
221            } else {
222                break;
223            }
224        }
225        let span_len = i - start;
226        if (13..=19).contains(&digits) && (4..=span_len).contains(&span_len) {
227            let value = &s[start..i];
228            // Trim trailing separator if any.
229            let trimmed = value.trim_end_matches(|c: char| c == ' ' || c == '-');
230            // Avoid taking what we'd also match as a phone (10 digits + seps).
231            if digits >= 13 {
232                out.push(Finding {
233                    kind: "credit_card",
234                    value: trimmed.to_string(),
235                    byte_pos: start,
236                });
237                continue;
238            }
239        }
240        if i == start {
241            i += 1;
242        }
243        let _ = seps;
244    }
245    out
246}
247
248fn scan_api_keys(s: &str) -> Vec<Finding> {
249    let prefixes: &[&str] = &["sk-", "sk_live_", "sk_test_", "ghp_", "xoxb-", "rk_live_"];
250    let mut out = Vec::new();
251    for p in prefixes {
252        let mut start = 0;
253        while let Some(pos) = s[start..].find(p) {
254            let abs = start + pos;
255            // Greedy match across [A-Za-z0-9_-].
256            let bytes = s.as_bytes();
257            let mut end = abs + p.len();
258            while end < bytes.len()
259                && (bytes[end].is_ascii_alphanumeric() || matches!(bytes[end], b'_' | b'-'))
260            {
261                end += 1;
262            }
263            let tail = end - (abs + p.len());
264            if tail >= 16 {
265                out.push(Finding {
266                    kind: "api_key",
267                    value: s[abs..end].to_string(),
268                    byte_pos: abs,
269                });
270            }
271            start = end.max(abs + 1);
272        }
273    }
274    out
275}