// agx_core/pii.rs

//! Heuristic PII / credential scanner for `--scan-pii`. Reports
//! matches, does not mutate — pair with `--redact` when the intent is
//! to scrub.
//!
//! Pattern coverage is intentionally narrow: well-known credential
//! shapes where a false positive is extremely unlikely, plus the
//! public email / IPv4 shapes that come up in agent traces. (Phone
//! numbers are not detected; there is no phone `Category` yet.)
//! We stay prefix-based (no `regex` crate) because:
//!
//! 1. The patterns we care about all have unambiguous prefixes.
//! 2. A runtime regex dep adds ~500KB to the default binary and a
//!    build-time hit most users don't need. `--redact` already sticks
//!    to literal-substring masking for the same reason.
//! 3. False negatives on unusual shapes are acceptable for v1; users
//!    who need regex-powered detection can grep the `--export json`
//!    output themselves.
//!
//! Categories land as `Category` enum variants so JSON output stays
//! stable when we add new patterns (new variants, not renames).
20
use serde::Serialize;
22
23/// One PII match. `offset` is a char-based index into the input
24/// string so callers showing a snippet can safely slice.
25#[derive(Debug, Clone, Serialize)]
26pub struct Match {
27    pub category: Category,
28    /// 0-based step index (when scanning a step); synthesized as 0
29    /// for free-text scans.
30    pub step_index: usize,
31    /// Short excerpt of the match plus a few chars on each side, for
32    /// human-readable output. Length capped to keep summaries terse.
33    pub snippet: String,
34}
35
36/// Known PII / credential categories. Serialized as snake_case so the
37/// JSON shape is downstream-friendly.
38#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq, Hash)]
39#[serde(rename_all = "snake_case")]
40pub enum Category {
41    Email,
42    Ipv4,
43    AwsAccessKey,
44    StripeSecretKey,
45    StripePublishableKey,
46    GithubToken,
47    OpenaiKey,
48    AnthropicKey,
49    SshPrivateKeyHeader,
50    JwtToken,
51}
52
53impl Category {
54    pub fn label(self) -> &'static str {
55        match self {
56            Category::Email => "email",
57            Category::Ipv4 => "ipv4",
58            Category::AwsAccessKey => "aws_access_key",
59            Category::StripeSecretKey => "stripe_secret_key",
60            Category::StripePublishableKey => "stripe_publishable_key",
61            Category::GithubToken => "github_token",
62            Category::OpenaiKey => "openai_key",
63            Category::AnthropicKey => "anthropic_key",
64            Category::SshPrivateKeyHeader => "ssh_private_key_header",
65            Category::JwtToken => "jwt_token",
66        }
67    }
68}
69
70/// Scan a single string for all known PII shapes.
71#[must_use]
72pub fn scan(text: &str) -> Vec<Match> {
73    scan_with_step(text, 0)
74}
75
76/// Scan a string associated with a specific step index. The step
77/// index is copied into every emitted Match so corpus-level summaries
78/// can rank by step position.
79#[must_use]
80pub fn scan_with_step(text: &str, step_index: usize) -> Vec<Match> {
81    let mut out = Vec::new();
82    // Prefix patterns — most credential shapes live here. Each entry
83    // is (category, prefix, min_tail_chars). A match starts at the
84    // prefix and includes the prefix + `min_tail_chars` following
85    // alphanumeric / hyphen / underscore bytes. Prefixes are distinct
86    // enough across real outputs that this is a very low-false-
87    // positive shape.
88    const PREFIXES: &[(Category, &str, usize)] = &[
89        (Category::AwsAccessKey, "AKIA", 16),
90        (Category::AwsAccessKey, "ASIA", 16),
91        (Category::StripeSecretKey, "sk_live_", 24),
92        (Category::StripeSecretKey, "sk_test_", 24),
93        (Category::StripePublishableKey, "pk_live_", 24),
94        (Category::StripePublishableKey, "pk_test_", 24),
95        (Category::GithubToken, "ghp_", 36),
96        (Category::GithubToken, "gho_", 36),
97        (Category::GithubToken, "ghu_", 36),
98        (Category::GithubToken, "ghs_", 36),
99        (Category::GithubToken, "ghr_", 36),
100        (Category::AnthropicKey, "sk-ant-", 32),
101    ];
102    for &(cat, prefix, min_tail) in PREFIXES {
103        scan_prefix(text, step_index, cat, prefix, min_tail, &mut out);
104    }
105
106    // OpenAI key: `sk-` followed by ≥32 chars, but must NOT start
107    // with `sk-ant-` (that's the Anthropic key handled above).
108    scan_openai_key(text, step_index, &mut out);
109
110    // Email addresses — minimal heuristic that avoids the regex dep.
111    scan_email(text, step_index, &mut out);
112
113    // IPv4 — 4 groups of 1-3 digits separated by dots, each group 0-255.
114    scan_ipv4(text, step_index, &mut out);
115
116    // SSH private-key armor strings. Exact markers.
117    const SSH_HEADERS: &[&str] = &[
118        "-----BEGIN OPENSSH PRIVATE KEY-----",
119        "-----BEGIN RSA PRIVATE KEY-----",
120        "-----BEGIN DSA PRIVATE KEY-----",
121        "-----BEGIN EC PRIVATE KEY-----",
122        "-----BEGIN PRIVATE KEY-----",
123    ];
124    for header in SSH_HEADERS {
125        if text.contains(header) {
126            out.push(Match {
127                category: Category::SshPrivateKeyHeader,
128                step_index,
129                snippet: (*header).to_string(),
130            });
131        }
132    }
133
134    // JWT tokens: three base64url groups joined by `.`, starting with
135    // `eyJ` (the base64 of `{"`). Common in agent tool outputs that
136    // call authenticated APIs.
137    scan_jwt(text, step_index, &mut out);
138
139    out
140}
141
142/// Scan every step's `detail` and `label`, returning all matches
143/// indexed by step position. Convenience wrapper for the CLI
144/// dispatcher in main.rs.
145#[must_use]
146pub fn scan_steps(steps: &[crate::timeline::Step]) -> Vec<Match> {
147    let mut all = Vec::new();
148    for (i, step) in steps.iter().enumerate() {
149        all.extend(scan_with_step(&step.detail, i));
150        all.extend(scan_with_step(&step.label, i));
151    }
152    all
153}
154
// ---------- internal helpers ----------
156
/// Returns `true` for bytes that may appear in a credential token
/// body: ASCII letters, ASCII digits, `_` and `-`.
fn is_token_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | b'-')
}
160
161fn scan_prefix(
162    text: &str,
163    step_index: usize,
164    cat: Category,
165    prefix: &str,
166    min_tail: usize,
167    out: &mut Vec<Match>,
168) {
169    let bytes = text.as_bytes();
170    let prefix_bytes = prefix.as_bytes();
171    let mut i = 0;
172    while i + prefix_bytes.len() <= bytes.len() {
173        if &bytes[i..i + prefix_bytes.len()] == prefix_bytes {
174            // Count trailing token bytes. If enough, emit a match.
175            let tail_start = i + prefix_bytes.len();
176            let mut tail = 0;
177            while tail_start + tail < bytes.len() && is_token_byte(bytes[tail_start + tail]) {
178                tail += 1;
179            }
180            if tail >= min_tail {
181                let end = tail_start + tail;
182                let snippet = snippet_around(text, i, end);
183                out.push(Match {
184                    category: cat,
185                    step_index,
186                    snippet,
187                });
188                i = end;
189                continue;
190            }
191        }
192        i += 1;
193    }
194}
195
196fn scan_openai_key(text: &str, step_index: usize, out: &mut Vec<Match>) {
197    // Match `sk-` + ≥32 token chars, skip `sk-ant-`.
198    let bytes = text.as_bytes();
199    let mut i = 0;
200    while i + 3 <= bytes.len() {
201        if &bytes[i..i + 3] == b"sk-" {
202            // Reject the Anthropic prefix.
203            if bytes[i..].starts_with(b"sk-ant-") {
204                i += 1;
205                continue;
206            }
207            let tail_start = i + 3;
208            let mut tail = 0;
209            while tail_start + tail < bytes.len() && is_token_byte(bytes[tail_start + tail]) {
210                tail += 1;
211            }
212            if tail >= 32 {
213                let end = tail_start + tail;
214                out.push(Match {
215                    category: Category::OpenaiKey,
216                    step_index,
217                    snippet: snippet_around(text, i, end),
218                });
219                i = end;
220                continue;
221            }
222        }
223        i += 1;
224    }
225}
226
227fn scan_email(text: &str, step_index: usize, out: &mut Vec<Match>) {
228    // Heuristic: find every `@`, check there's a local-part before
229    // and a domain-with-dot after. Not RFC-compliant but catches the
230    // shapes that actually show up in agent traces.
231    let bytes = text.as_bytes();
232    for (i, &b) in bytes.iter().enumerate() {
233        if b != b'@' {
234            continue;
235        }
236        // Walk backwards for the local part.
237        let mut start = i;
238        while start > 0 && is_email_local_byte(bytes[start - 1]) {
239            start -= 1;
240        }
241        if start == i {
242            continue;
243        }
244        // Walk forwards for the domain.
245        let mut end = i + 1;
246        while end < bytes.len() && is_email_domain_byte(bytes[end]) {
247            end += 1;
248        }
249        if end == i + 1 {
250            continue;
251        }
252        // The domain portion must contain a `.`.
253        let domain = &text[i + 1..end];
254        if !domain.contains('.') {
255            continue;
256        }
257        out.push(Match {
258            category: Category::Email,
259            step_index,
260            snippet: text[start..end].to_string(),
261        });
262    }
263}
264
/// Returns `true` for bytes accepted in the local part (before `@`)
/// of an email: ASCII letters and digits plus `.`, `_`, `-` and `+`.
fn is_email_local_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'.' | b'_' | b'-' | b'+')
}
268
/// Returns `true` for bytes accepted in the domain part (after `@`)
/// of an email: ASCII letters and digits plus `.` and `-`.
fn is_email_domain_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'.' | b'-')
}
272
273fn scan_ipv4(text: &str, step_index: usize, out: &mut Vec<Match>) {
274    let bytes = text.as_bytes();
275    let mut i = 0;
276    while i < bytes.len() {
277        if !bytes[i].is_ascii_digit() {
278            i += 1;
279            continue;
280        }
281        // Try to parse up to 4 dot-separated octets starting at i.
282        if let Some(end) = parse_ipv4_at(bytes, i) {
283            out.push(Match {
284                category: Category::Ipv4,
285                step_index,
286                snippet: text[i..end].to_string(),
287            });
288            i = end;
289        } else {
290            // Skip past this run of digits.
291            while i < bytes.len() && bytes[i].is_ascii_digit() {
292                i += 1;
293            }
294        }
295    }
296}
297
/// Tries to parse a dotted-quad IPv4 address beginning at `start`.
///
/// Returns the index one past the last digit on success. Each of the
/// four groups must be 1-3 digits with a value of 0-255, and groups
/// are separated by single `.` bytes.
///
/// The candidate is rejected when what follows shows it is part of a
/// longer number:
///
/// * another digit right after the fourth group (`12.34.56.789` must
///   not read as `12.34.56.78` with a leftover `9`);
/// * a `.` followed by a digit (`1.3.6.1.4.1` is a longer dotted
///   sequence such as an OID, not an address). A lone trailing `.`,
///   as at the end of a sentence, is still accepted.
fn parse_ipv4_at(bytes: &[u8], start: usize) -> Option<usize> {
    const GROUPS: usize = 4;
    const MAX_GROUP_DIGITS: usize = 3;

    let mut pos = start;
    for group in 0..GROUPS {
        // Consume at most three digits; a fourth digit is caught by the
        // separator check below or by the trailing check at the end.
        let digits = bytes
            .get(pos..)?
            .iter()
            .take(MAX_GROUP_DIGITS)
            .take_while(|b| b.is_ascii_digit())
            .count();
        if digits == 0 {
            return None;
        }
        let value = bytes[pos..pos + digits]
            .iter()
            .fold(0u32, |acc, &b| acc * 10 + u32::from(b - b'0'));
        if value > 255 {
            return None;
        }
        pos += digits;

        if group + 1 < GROUPS {
            if bytes.get(pos) != Some(&b'.') {
                return None;
            }
            pos += 1;
        }
    }

    let next = bytes.get(pos).copied();
    let after_next = bytes.get(pos + 1).copied();
    let next_is_digit = matches!(next, Some(b) if b.is_ascii_digit());
    let continues_dotted =
        next == Some(b'.') && matches!(after_next, Some(b) if b.is_ascii_digit());
    if next_is_digit || continues_dotted {
        return None;
    }
    Some(pos)
}
328
329fn scan_jwt(text: &str, step_index: usize, out: &mut Vec<Match>) {
330    // `eyJ` is the base64url of `{"` — the standard JWT header start.
331    // Three groups of base64url chars separated by `.`, each ≥16.
332    let bytes = text.as_bytes();
333    let mut i = 0;
334    while i + 3 <= bytes.len() {
335        if &bytes[i..i + 3] == b"eyJ" {
336            if let Some(end) = parse_jwt_at(bytes, i) {
337                out.push(Match {
338                    category: Category::JwtToken,
339                    step_index,
340                    snippet: text[i..end].to_string(),
341                });
342                i = end;
343                continue;
344            }
345        }
346        i += 1;
347    }
348}
349
350fn parse_jwt_at(bytes: &[u8], start: usize) -> Option<usize> {
351    let mut pos = start;
352    for seg in 0..3 {
353        let seg_start = pos;
354        while pos < bytes.len() && is_base64url_byte(bytes[pos]) {
355            pos += 1;
356        }
357        if pos - seg_start < 16 {
358            return None;
359        }
360        if seg < 2 {
361            if pos >= bytes.len() || bytes[pos] != b'.' {
362                return None;
363            }
364            pos += 1;
365        }
366    }
367    Some(pos)
368}
369
/// Returns `true` for bytes of the base64url alphabet: ASCII letters,
/// ASCII digits, `_` and `-`. The `=` padding byte is not accepted.
fn is_base64url_byte(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | b'-')
}
373
/// Returns the matched byte range `start..end` of `text` as an owned
/// string, with no surrounding context.
///
/// Callers can wrap the result in `…` or truncate it if they want a
/// different presentation; keeping this tight means JSON output stays
/// small.
///
/// # Panics
///
/// Panics if `start..end` is out of range or does not fall on char
/// boundaries of `text`.
fn snippet_around(text: &str, start: usize, end: usize) -> String {
    text[start..end].to_string()
}
380
// Unit tests: one positive case per category plus the negative cases
// the heuristics are expected to reject.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn finds_aws_access_key() {
        let m = scan("export AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE");
        assert!(m.iter().any(|x| x.category == Category::AwsAccessKey));
    }

    #[test]
    fn finds_stripe_keys() {
        let m = scan("key: sk_live_aaaaaaaaaaaaaaaaaaaaaaaaaa");
        assert!(m.iter().any(|x| x.category == Category::StripeSecretKey));
        let m = scan("pub: pk_test_bbbbbbbbbbbbbbbbbbbbbbbbbb");
        assert!(
            m.iter()
                .any(|x| x.category == Category::StripePublishableKey)
        );
    }

    #[test]
    fn finds_github_tokens() {
        let tok = "ghp_".to_string() + &"a".repeat(36);
        let m = scan(&tok);
        assert!(m.iter().any(|x| x.category == Category::GithubToken));
    }

    #[test]
    fn distinguishes_openai_from_anthropic() {
        let openai = "sk-".to_string() + &"a".repeat(48);
        let anthropic = "sk-ant-".to_string() + &"a".repeat(40);
        let m_o = scan(&openai);
        assert!(m_o.iter().any(|x| x.category == Category::OpenaiKey));
        assert!(!m_o.iter().any(|x| x.category == Category::AnthropicKey));
        let m_a = scan(&anthropic);
        assert!(m_a.iter().any(|x| x.category == Category::AnthropicKey));
        assert!(!m_a.iter().any(|x| x.category == Category::OpenaiKey));
    }

    #[test]
    fn finds_emails() {
        let m = scan("contact alice+test@example.com and bob@x.io");
        let emails: Vec<_> = m.iter().filter(|x| x.category == Category::Email).collect();
        assert_eq!(emails.len(), 2);
    }

    #[test]
    fn rejects_bare_at_without_domain_dot() {
        let m = scan("twitter handle @alice here");
        assert!(!m.iter().any(|x| x.category == Category::Email));
    }

    #[test]
    fn finds_ipv4_but_rejects_out_of_range() {
        let m = scan("connect to 10.0.0.1 and 192.168.1.50");
        let ips: Vec<_> = m.iter().filter(|x| x.category == Category::Ipv4).collect();
        assert_eq!(ips.len(), 2);
        let m = scan("fake 999.999.999.999 and 300.1.1.1");
        assert!(!m.iter().any(|x| x.category == Category::Ipv4));
    }

    #[test]
    fn finds_ssh_private_key_header() {
        let text = "-----BEGIN OPENSSH PRIVATE KEY-----\nfake";
        let m = scan(text);
        assert!(
            m.iter()
                .any(|x| x.category == Category::SshPrivateKeyHeader)
        );
    }

    #[test]
    fn finds_jwt() {
        // Three base64url groups ≥16 chars each, joined by dots.
        let jwt = format!(
            "{}.{}.{}",
            "eyJ".to_string() + &"a".repeat(20),
            "a".repeat(20),
            "a".repeat(20)
        );
        let m = scan(&jwt);
        assert!(m.iter().any(|x| x.category == Category::JwtToken));
    }

    #[test]
    fn rejects_non_jwt_starting_with_eyj() {
        // eyJ followed by non-base64 chars → no match.
        let m = scan("eyJ{not a jwt");
        assert!(!m.iter().any(|x| x.category == Category::JwtToken));
    }

    #[test]
    fn scan_steps_indexes_by_step_position() {
        use crate::timeline::{tool_result_step, user_text_step};
        let steps = vec![
            user_text_step("clean input"),
            tool_result_step(
                "t1",
                "secret AKIAIOSFODNN7EXAMPLE found",
                Some("Bash"),
                None,
            ),
        ];
        let matches = scan_steps(&steps);
        assert!(
            matches
                .iter()
                .any(|m| m.step_index == 1 && m.category == Category::AwsAccessKey)
        );
    }

    #[test]
    fn empty_input_returns_no_matches() {
        assert!(scan("").is_empty());
    }
}
497}