rtb_redact/
lib.rs

1//! Free-form secret redaction for log lines, telemetry events, and
2//! diagnostic surfaces.
3//!
4//! See `docs/development/specs/2026-04-23-rtb-redact-v0.1.md` for the
5//! full design and the seven-pass rule set.
6//!
7//! ```
8//! use rtb_redact::string;
9//!
10//! let scrubbed = string("connect to postgres://app:hunter2@db/mydb");
11//! assert!(scrubbed.contains("[redacted]"));
12//! assert!(!scrubbed.contains("hunter2"));
13//! ```
14
15#![forbid(unsafe_code)]
16
17use std::borrow::Cow;
18use std::sync::LazyLock;
19
20use regex::Regex;
21
/// Header names whose values must be redacted at DEBUG / TRACE log
/// levels. Case-insensitive match via
/// [`is_sensitive_header`]. `phf::Set` keeps lookup O(1) as the list
/// grows.
///
/// Entries must be written in lowercase: [`is_sensitive_header`]
/// ASCII-lowercases the candidate name before the lookup, so an entry
/// containing an uppercase letter could never match.
pub static SENSITIVE_HEADERS: phf::Set<&'static str> = phf::phf_set! {
    "authorization",
    "proxy-authorization",
    "cookie",
    "set-cookie",
    "x-api-key",
    "x-auth-token",
    "x-amz-security-token",
    "x-goog-api-key",
    "x-anthropic-api-key",
    "x-openai-api-key",
};
38
/// Marker text written in place of a redacted secret.
const REDACTED: &str = "[redacted]";
40
41/// Redact secrets in `input`, returning a borrowed [`Cow`] when no
42/// redactions apply and an owned `String` otherwise.
43///
44/// The rule set is applied in the documented order (see the spec).
45/// `input` is expected to be UTF-8 (it is already typed as `&str`);
46/// callers holding `&[u8]` must convert themselves.
47#[must_use]
48pub fn string(input: &str) -> Cow<'_, str> {
49    if input.is_empty() {
50        return Cow::Borrowed(input);
51    }
52
53    // Fast path: if none of the anchor characters or keywords appear
54    // in the string, no rule can match. Avoids allocating.
55    if !fast_has_sensitive_anchor(input) {
56        return Cow::Borrowed(input);
57    }
58
59    let mut out = input.to_string();
60    apply_rules(&mut out);
61
62    // If nothing actually changed, return Borrowed so callers don't
63    // pay for the clone on false positives of the fast-path check.
64    if out == input {
65        Cow::Borrowed(input)
66    } else {
67        // SAFETY of correctness: `out` was derived from the input via
68        // regex replacements; there's no way to get back to &input
69        // from here without the Owned wrapper.
70        Cow::Owned(out)
71    }
72}
73
74/// Same as [`string`], but writes into a caller-supplied `String`.
75/// Useful for hot loops that want to reuse a buffer.
76pub fn string_into(input: &str, out: &mut String) {
77    out.clear();
78    if input.is_empty() {
79        return;
80    }
81    if !fast_has_sensitive_anchor(input) {
82        out.push_str(input);
83        return;
84    }
85    out.push_str(input);
86    apply_rules(out);
87}
88
89/// Case-insensitive membership check against [`SENSITIVE_HEADERS`].
90#[must_use]
91pub fn is_sensitive_header(name: &str) -> bool {
92    // `phf::Set` is case-sensitive. We lowercase the name into a small
93    // stack-friendly buffer; realistic header names are < 64 chars.
94    let mut buf = [0u8; 128];
95    let bytes = name.as_bytes();
96    if bytes.len() > buf.len() {
97        // Oversized header names can't be in the known-sensitive list.
98        return false;
99    }
100    for (i, &b) in bytes.iter().enumerate() {
101        buf[i] = b.to_ascii_lowercase();
102    }
103    // Safe: ASCII-lowercased ASCII stays ASCII stays UTF-8.
104    let lower = std::str::from_utf8(&buf[..bytes.len()]).unwrap_or("");
105    SENSITIVE_HEADERS.contains(lower)
106}
107
108/// Unconditionally redact a header value. Callers invoke this for
109/// any header name matching [`is_sensitive_header`].
110#[must_use]
111pub fn redact_header_value(value: &str) -> String {
112    if value.is_empty() {
113        String::new()
114    } else {
115        REDACTED.to_string()
116    }
117}
118
119// ---------------------------------------------------------------------
120// Internal: fast-path pre-check.
121// ---------------------------------------------------------------------
122
/// Returns `true` if `input` contains any character or substring that
/// could plausibly trigger a redaction rule. This lets clean strings
/// skip the allocation and the regex passes entirely.
///
/// The check is intentionally loose — a false positive only costs one
/// allocation and the regex runs on a small string — but it must never
/// be narrower than the rules: a `false` here means the input is
/// returned unredacted.
fn fast_has_sensitive_anchor(input: &str) -> bool {
    // '@' URL userinfo; '=' / '?' query parameters; '-' / '_' token
    // prefixes (and the `-----BEGIN ` PEM armor); '.' JWT segments.
    const ANCHOR_CHARS: [char; 6] = ['@', '=', '?', '-', '_', '.'];
    // Provider prefixes matched by `RE_NAMED_PREFIX` whose tokens can
    // consist purely of letters and digits and be shorter than the
    // long-run threshold, so none of the other checks would see them.
    const BARE_PREFIXES: [&str; 3] = ["AKIA", "ASIA", "AIza"];

    input.contains(ANCHOR_CHARS)
        || BARE_PREFIXES.iter().any(|&prefix| input.contains(prefix))
        || has_auth_scheme(input)
        || has_long_run(input)
}

/// Crude check for "contains a run of 40+ consecutive token-like bytes"
/// (ASCII alphanumerics plus `+ / = _ -`). Any other byte, not just
/// whitespace, ends the current run.
fn has_long_run(input: &str) -> bool {
    const MIN_RUN: usize = 40;

    let mut run = 0usize;
    input.bytes().any(|b| {
        if b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'=' | b'_' | b'-') {
            run += 1;
        } else {
            run = 0;
        }
        run >= MIN_RUN
    })
}

/// Returns `true` if `input` contains `bearer`, `basic` or `token`
/// (ASCII case-insensitive) immediately followed by a whitespace
/// character.
///
/// Any whitespace counts, not only a plain space, because the
/// auth-scheme rule separates scheme and credential with `\s+`; a tab
/// or newline after the keyword must not let the input bypass it.
fn has_auth_scheme(input: &str) -> bool {
    const SCHEMES: [&[u8]; 3] = [b"bearer", b"basic", b"token"];

    // Simple byte scan; we're avoiding a regex here.
    let bytes = input.as_bytes();
    (0..bytes.len()).any(|start| {
        SCHEMES.iter().any(|&scheme| {
            eq_ignore_ascii_case(&bytes[start..], scheme)
                // The keyword is pure ASCII, so the offset just past it
                // is a char boundary; `get` keeps this panic-free anyway.
                && input
                    .get(start + scheme.len()..)
                    .and_then(|rest| rest.chars().next())
                    .is_some_and(char::is_whitespace)
        })
    })
}

/// Returns `true` if `a` starts with `b`, comparing bytes without
/// regard to ASCII case. A slice `a` shorter than `b` never matches.
fn eq_ignore_ascii_case(a: &[u8], b: &[u8]) -> bool {
    a.get(..b.len()).is_some_and(|head| head.eq_ignore_ascii_case(b))
}
186
187// ---------------------------------------------------------------------
188// Internal: rule application.
189// ---------------------------------------------------------------------
190
191fn apply_rules(out: &mut String) {
192    // 1. URL userinfo
193    replace_all(out, &RE_URL_USERINFO, |caps| format!("{}://{REDACTED}@", &caps[1]));
194    // 2. Authorization-header-style values.
195    replace_all(out, &RE_AUTH_SCHEME, |caps| format!("{} {REDACTED}", &caps[1]));
196    // 3. Query-parameter sensitive keys.
197    replace_all(out, &RE_QUERY_SENSITIVE, |caps| format!("{}={REDACTED}", &caps[1]));
198    // 7. PEM private key blocks — run before token rules so the
199    //    key body (which can otherwise match the long-run rule) is
200    //    collapsed into a single REDACTED marker.
201    replace_all(out, &RE_PEM_BLOCK, |_caps| {
202        "-----BEGIN PRIVATE KEY-----\n[redacted]\n-----END PRIVATE KEY-----".to_string()
203    });
204    // 4. Well-known credential prefixes.
205    replace_all(out, &RE_NAMED_PREFIX, |caps| {
206        let matched = &caps[0];
207        if matched.len() >= 20 {
208            REDACTED.to_string()
209        } else {
210            matched.to_string()
211        }
212    });
213    // 6. JWT-shaped tokens. (Run before the generic long-run rule so
214    //    partial overlaps don't produce a half-masked JWT.) The spec
215    //    requires total length >= 100 chars — short "eyJ.x.y.z"
216    //    strings pass through unchanged.
217    replace_all(out, &RE_JWT, |caps| {
218        let matched = &caps[0];
219        if matched.len() >= 100 {
220            REDACTED.to_string()
221        } else {
222            matched.to_string()
223        }
224    });
225    // 5. Long opaque tokens. The captured boundary chars (leading /
226    //    trailing whitespace, or start / end of input) are re-emitted
227    //    verbatim so the word spacing around the redaction is preserved.
228    replace_all(out, &RE_LONG_OPAQUE, |caps| format!("{}{REDACTED}{}", &caps[1], &caps[3]));
229}
230
231fn replace_all<F>(buf: &mut String, re: &Regex, mut f: F)
232where
233    F: FnMut(&regex::Captures<'_>) -> String,
234{
235    // Build a new string if any match exists; otherwise leave `buf`
236    // untouched. We don't use `Regex::replace_all` with a closure
237    // directly because it takes a Replacer by value and we want to
238    // keep the API on `&mut String` rather than allocating a Cow.
239    if !re.is_match(buf) {
240        return;
241    }
242    let mut out = String::with_capacity(buf.len());
243    let mut last = 0;
244    for caps in re.captures_iter(buf) {
245        let whole = caps.get(0).expect("captures always include group 0");
246        out.push_str(&buf[last..whole.start()]);
247        out.push_str(&f(&caps));
248        last = whole.end();
249    }
250    out.push_str(&buf[last..]);
251    *buf = out;
252}
253
254// ---------------------------------------------------------------------
255// Internal: compiled patterns. All literal; compiled once.
256// ---------------------------------------------------------------------
257
258static RE_URL_USERINFO: LazyLock<Regex> = LazyLock::new(|| {
259    // Matches scheme://user:pass@ (userinfo). Captures the scheme
260    // so we can re-emit it.
261    Regex::new(r"([a-zA-Z][a-zA-Z0-9+.-]*)://[^:\s/?#]+:[^@\s]+@").expect("valid regex")
262});
263
264static RE_AUTH_SCHEME: LazyLock<Regex> = LazyLock::new(|| {
265    // Bearer / Basic / Token <credential>. The credential is whatever
266    // follows the scheme until whitespace or end.
267    Regex::new(r"(?i)\b(Bearer|Basic|Token)\s+[A-Za-z0-9_\-.+/=]+").expect("valid regex")
268});
269
270static RE_QUERY_SENSITIVE: LazyLock<Regex> = LazyLock::new(|| {
271    // Match `<sensitive-key>=<value>` in query strings. Value runs up
272    // to `&`, whitespace, or end. Case-insensitive key match.
273    Regex::new(
274        r"(?i)([?&]?(?:api[_-]?key|access[_-]?token|refresh[_-]?token|token|password|passwd|secret|signature|sig|auth|x[_-]?api[_-]?key))=[^&\s#]+"
275    )
276    .expect("valid regex")
277});
278
279static RE_NAMED_PREFIX: LazyLock<Regex> = LazyLock::new(|| {
280    // Well-known provider tokens. Matches the prefix + trailing
281    // allowed characters. The closure in `apply_rules` checks for
282    // total length >= 20 before redacting.
283    Regex::new(
284        r"(?x)
285        \b(
286            sk-ant-[A-Za-z0-9_\-]+
287          | sk-[A-Za-z0-9_\-]+
288          | (?:ghp|gho|ghs|ghu)_[A-Za-z0-9]+
289          | glpat-[A-Za-z0-9_\-]+
290          | AIza[A-Za-z0-9_\-]+
291          | (?:AKIA|ASIA)[A-Z0-9]+
292          | xox[baprs]-[A-Za-z0-9\-]+
293          | SG\.[A-Za-z0-9_\-]{22,}\.[A-Za-z0-9_\-]{43,}
294        )
295        ",
296    )
297    .expect("valid regex")
298});
299
300static RE_JWT: LazyLock<Regex> = LazyLock::new(|| {
301    // eyJ... . ... . ... totalling >= 100 chars.
302    Regex::new(r"eyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+").expect("valid regex")
303});
304
305static RE_LONG_OPAQUE: LazyLock<Regex> = LazyLock::new(|| {
306    // Whitespace-bounded run of 40+ base64/hex-ish chars. The boundary
307    // chars are explicit capture groups so the replacement closure can
308    // preserve the surrounding whitespace.
309    Regex::new(r"(^|\s)([A-Za-z0-9+/=_\-]{40,})(\s|$)").expect("valid regex")
310});
311
312static RE_PEM_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
313    // DOTALL via (?s) so `.` crosses newlines.
314    Regex::new(r"(?s)-----BEGIN [A-Z ]*PRIVATE KEY-----.*?-----END [A-Z ]*PRIVATE KEY-----")
315        .expect("valid regex")
316});
rtb_redact/lib.rs

rtb_redact/
lib.rs