devboy_skills/trace/redact.rs
1//! Redaction of sensitive values before traces hit disk.
2//!
3//! Two mechanisms are layered:
4//!
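//! 1. Known credential shapes are masked regardless of where they
//!    appear in the JSON tree. A string value that starts with a known
//!    prefix is replaced wholesale. Currently: `ghp_`, `glpat-`, `pk_`,
//!    `sk_`, `sk-`, `xoxb-` / `xoxa-` / `xoxp-` / `xapp-`, `Bearer ` /
//!    `Basic ` (case-insensitive), plus a few other common prefixes.
//!    A `Bearer ` / `Basic ` credential embedded inside a longer string
//!    is masked in place; provider-prefixed tokens are only recognised
//!    at the start of a string. This pass works without knowing the
//!    configured credential set — useful when a token leaks into an
//!    error message, a git URL, or a user-supplied prompt.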
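//! 2. Values of any non-empty, string-valued environment variable whose
//!    name looks sensitive (for example `*_TOKEN` / `*_SECRET` /
//!    `*_KEY` / `*_PASSWORD` / `*_PASSPHRASE` / `AUTHORIZATION` /
//!    `COOKIE`) are masked when a string in the tree equals such a
//!    value exactly — the redactor snapshots those values at call
//!    time. A secret embedded inside a longer string is not detected
//!    by this pass.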
16//!
//! Setting `DEVBOY_TRACE_REDACTION=off` in the environment (`0`,
//! `false` and `no` are accepted as well, case-insensitively) disables
//! all redaction, including both passes above, for local debugging.
//! Never default to off.
19//!
20//! ## Amortizing the env snapshot
21//!
22//! The top-level [`sanitize`] helper walks `std::env::vars()` on every
23//! call — fine for one-shot CLI invocations but wasteful inside a
24//! long-running producer like [`super::SessionTracer`] that writes
25//! many events. Build a [`Redactor`] once with
26//! [`Redactor::snapshot`] and reuse it for every event in the same
27//! session to pay the env scan just once.
28
29use std::collections::HashSet;
30
31use serde_json::Value;
32
33/// Redact sensitive data in `value`. Recursively walks maps and
34/// arrays. Strings are rewritten; numbers / bools / null pass through
35/// unchanged.
36///
37/// Each call snapshots `*_TOKEN` / `*_SECRET` / … env vars afresh so
38/// that tests using `temp_env::with_var` (and production callers that
39/// legitimately mutate the environment) see up-to-date state. Inside
40/// a long session, prefer [`Redactor::snapshot`] + [`Redactor::sanitize`].
41pub fn sanitize(value: Value) -> Value {
42 Redactor::snapshot().sanitize(value)
43}
44
/// Reusable redaction state captured from the process environment.
///
/// Obtain one through [`Redactor::snapshot`] and keep it for the
/// lifetime of a long-running producer (for instance one per
/// `SessionTracer`), so the environment is scanned once instead of
/// once per event.
#[derive(Debug, Clone)]
pub struct Redactor {
    /// `false` when redaction was switched off at snapshot time; values
    /// then pass through [`Redactor::sanitize`] unchanged.
    enabled: bool,
    /// Values of the sensitive environment variables captured at
    /// snapshot time. Left empty when redaction is disabled.
    secrets: HashSet<String>,
}
54
55impl Redactor {
56 /// Capture the current set of sensitive env-var values and the
57 /// `DEVBOY_TRACE_REDACTION=off` opt-out state. Cheap to clone.
58 pub fn snapshot() -> Self {
59 if redaction_disabled() {
60 Self {
61 enabled: false,
62 secrets: HashSet::new(),
63 }
64 } else {
65 Self {
66 enabled: true,
67 secrets: known_env_secrets(),
68 }
69 }
70 }
71
72 /// Sanitize a single value using the captured env-var snapshot.
73 pub fn sanitize(&self, value: Value) -> Value {
74 if !self.enabled {
75 return value;
76 }
77 sanitize_with(&self.secrets, value)
78 }
79}
80
/// Report whether `DEVBOY_TRACE_REDACTION` asks for redaction to be
/// turned off.
///
/// The variable's value is lowercased and compared against `off`, `0`,
/// `false` and `no`. Any other value, an unset variable, or a value
/// that cannot be read as Unicode leaves redaction enabled.
fn redaction_disabled() -> bool {
    const OFF_VALUES: [&str; 4] = ["off", "0", "false", "no"];
    std::env::var("DEVBOY_TRACE_REDACTION")
        .map(|raw| OFF_VALUES.contains(&raw.to_lowercase().as_str()))
        .unwrap_or(false)
}
87
88fn sanitize_with(secrets: &HashSet<String>, value: Value) -> Value {
89 match value {
90 Value::String(s) => Value::String(redact_string(secrets, &s)),
91 Value::Array(xs) => {
92 Value::Array(xs.into_iter().map(|x| sanitize_with(secrets, x)).collect())
93 }
94 Value::Object(map) => {
95 let mut out = serde_json::Map::with_capacity(map.len());
96 for (k, v) in map {
97 // If the key itself hints at a secret, redact the whole
98 // value regardless of its type. This prevents structured
99 // leaks like `{"authorization": {"scheme": "Bearer",
100 // "value": "…"}}` where nested field names may not
101 // themselves trip the secret-key heuristic.
102 let new_val = if key_looks_secret(&k) {
103 Value::String("<redacted:secret-field>".to_string())
104 } else {
105 sanitize_with(secrets, v)
106 };
107 out.insert(k, new_val);
108 }
109 Value::Object(out)
110 }
111 other => other,
112 }
113}
114
115fn redact_string(secrets: &HashSet<String>, s: &str) -> String {
116 // 1. Exact env-var match.
117 if !s.is_empty() && secrets.contains(s) {
118 return "<redacted:credential>".to_string();
119 }
120 // 2. Known token prefixes. We search case-sensitively because every
121 // supported prefix is case-sensitive in practice.
122 if has_known_prefix(s) {
123 return "<redacted:token-pattern>".to_string();
124 }
125 // 3. Bearer / Basic schemes embedded inside a larger string. Don't
126 // rewrite the whole string — replace only the credential segment.
127 if let Some(rewritten) = mask_auth_header_segment(s) {
128 return rewritten;
129 }
130 s.to_string()
131}
132
/// Tell whether `s` starts with a recognised credential prefix and is
/// long enough to plausibly carry a credential.
///
/// A prefix only counts when more than eight bytes follow it, so bare
/// literals such as `ghp_` or `pk_` (documentation, examples) are left
/// alone. Provider prefixes are matched case-sensitively; the HTTP auth
/// schemes `bearer ` / `basic ` are matched ASCII-case-insensitively.
fn has_known_prefix(s: &str) -> bool {
    // Provider-issued tokens are case-sensitive in the wild; a strict
    // match avoids redacting ordinary text that merely shares the
    // letters (e.g. a sentence starting with "Ghp").
    const PROVIDER_PREFIXES: &[&str] = &[
        // GitHub PATs
        "ghp_",
        "github_pat_",
        "gho_",
        "ghu_",
        "ghs_",
        "ghr_",
        // GitLab PATs
        "glpat-",
        // Generic publishable / secret key families (Stripe, ClickUp, …).
        // ADR-015 treats these as a single `pk_` / `sk_` group.
        "pk_",
        "sk_",
        // OpenAI-style keys; `sk-` also covers `sk-ant-…`.
        "sk-",
        // Slack
        "xoxb-",
        "xoxa-",
        "xoxp-",
        "xapp-",
    ];
    // HTTP auth-scheme tokens are case-insensitive per RFC 7235, so
    // `Bearer`, `bearer` and `BEARER` must all be recognised.
    const AUTH_SCHEMES: &[&str] = &["bearer ", "basic "];
    const MIN_PAYLOAD_LEN: usize = 8;

    let clears_length_guard = |prefix: &str| s.len() > prefix.len() + MIN_PAYLOAD_LEN;

    let provider_hit = PROVIDER_PREFIXES
        .iter()
        .any(|&prefix| clears_length_guard(prefix) && s.starts_with(prefix));
    if provider_hit {
        return true;
    }

    // The length guard guarantees `s` is longer than the scheme, so the
    // byte slice below is always in bounds.
    let bytes = s.as_bytes();
    AUTH_SCHEMES.iter().any(|&scheme| {
        clears_length_guard(scheme)
            && bytes[..scheme.len()].eq_ignore_ascii_case(scheme.as_bytes())
    })
}
176
/// Mask every `Bearer ` / `Basic ` credential embedded in `s`, e.g.
/// `"Authorization: Bearer ghp_…"` inside a log line.
///
/// HTTP auth schemes are case-insensitive (RFC 7235), so the scheme is
/// located in an ASCII-lowercased copy while the output keeps the
/// original casing of everything except the credential. A credential
/// runs from the end of the scheme word up to the next whitespace,
/// comma or semicolon, and is replaced by `<redacted:auth>` only when
/// it is at least eight bytes long. Shorter runs are treated as
/// ordinary prose and skipped.
///
/// All occurrences are inspected from left to right, so an earlier
/// harmless mention ("the bearer is bob") cannot hide a real
/// credential later in the string, and several credentials in one
/// string are all masked.
///
/// Returns `None` when nothing was masked, so the caller can keep the
/// original string.
fn mask_auth_header_segment(s: &str) -> Option<String> {
    const SCHEMES: [&str; 2] = ["bearer ", "basic "];
    const MIN_CREDENTIAL_LEN: usize = 8;
    const MARKER: &str = "<redacted:auth>";

    // ASCII lowercasing never changes byte lengths, so every offset
    // found in `lower` is a valid offset (and char boundary) in `s`.
    let lower = s.to_ascii_lowercase();
    let mut out = String::new();
    // Number of leading bytes of `s` already written to `out`.
    let mut copied = 0;
    // Where the search for the next scheme word resumes.
    let mut cursor = 0;

    loop {
        // Earliest scheme word at or after `cursor`, as (start, length).
        let next_hit = SCHEMES
            .iter()
            .filter_map(|&scheme| {
                lower[cursor..]
                    .find(scheme)
                    .map(|offset| (cursor + offset, scheme.len()))
            })
            .min();
        let (scheme_start, scheme_len) = match next_hit {
            Some(hit) => hit,
            None => break,
        };

        let credential_start = scheme_start + scheme_len;
        let rest = &s[credential_start..];
        let credential_len = rest
            .find(|c: char| c.is_whitespace() || c == ',' || c == ';')
            .unwrap_or(rest.len());

        if credential_len >= MIN_CREDENTIAL_LEN {
            // Keep everything up to and including the scheme word, then
            // substitute the marker for the credential itself.
            out.push_str(&s[copied..credential_start]);
            out.push_str(MARKER);
            copied = credential_start + credential_len;
            cursor = copied;
        } else {
            // Too short to be a credential; keep scanning after the
            // scheme word so a later occurrence is still examined.
            cursor = credential_start;
        }
    }

    if out.is_empty() {
        return None;
    }
    out.push_str(&s[copied..]);
    Some(out)
}
201
/// Decide whether a field or environment-variable name suggests that
/// its value is a secret.
///
/// The name is ASCII-uppercased and `-` is treated like `_`, so
/// header-style names (`x-api-key`, `Proxy-Authorization`,
/// `Set-Cookie`) are judged the same way as their snake-case
/// equivalents. A name is considered secret when it
///
/// * equals one of the exact names (`AUTHORIZATION`, `COOKIE`, …),
/// * ends with a sensitive suffix (`_TOKEN`, `_KEY`, `_AUTH`, …), or
/// * contains `PASSWORD`, `SECRET` or `TOKEN` anywhere.
///
/// Every name the un-normalised check accepted is still accepted; the
/// normalisation and the two extra suffixes only add matches.
fn key_looks_secret(key: &str) -> bool {
    const EXACT: &[&str] = &["AUTHORIZATION", "COOKIE", "TOKEN", "SECRET", "PASSWORD"];
    const SUFFIXES: &[&str] = &[
        "_TOKEN",
        "_SECRET",
        "_KEY",
        "_PASSWORD",
        "_PASSPHRASE",
        "_AUTH",
        // Header-style variants such as `Proxy-Authorization` / `Set-Cookie`.
        "_AUTHORIZATION",
        "_COOKIE",
    ];
    // Substring heuristic, applied to the upper-cased name so that
    // mixed-case keys like `Password` or `refreshToken` are caught.
    const FRAGMENTS: &[&str] = &["PASSWORD", "SECRET", "TOKEN"];

    let normalized: String = key
        .chars()
        .map(|c| match c {
            '-' => '_',
            other => other.to_ascii_uppercase(),
        })
        .collect();
    let name = normalized.as_str();

    EXACT.contains(&name)
        || SUFFIXES.iter().any(|&suffix| name.ends_with(suffix))
        || FRAGMENTS.iter().any(|&fragment| name.contains(fragment))
}
228
229fn known_env_secrets() -> HashSet<String> {
230 let mut out = HashSet::new();
231 for (name, value) in std::env::vars() {
232 if value.is_empty() {
233 continue;
234 }
235 if key_looks_secret(&name) {
236 out.insert(value);
237 }
238 }
239 out
240}
241
/// Test-only helpers, shared with sibling modules (notably
/// `trace::tests`) that also touch `DEVBOY_TRACE_REDACTION`.
///
/// The mutex has to be the same one across modules: otherwise a
/// `temp_env::with_var(.., "off")` in one test leaks into an assertion
/// running concurrently elsewhere in the crate and silently disables
/// redaction in the middle of that test.
#[cfg(test)]
pub(crate) mod test_support {
    use std::sync::Mutex;

    /// Serialises every test in `devboy-skills` that mutates the
    /// process-wide environment.
    ///
    /// Two tests legitimately set `DEVBOY_TRACE_REDACTION=off` through
    /// `temp_env::with_var` while `cargo test` runs the remaining tests
    /// concurrently. Without this lock the `off` setting can leak into
    /// an unrelated test for as long as the variable is held, which made
    /// `events_are_redacted_before_writing` and
    /// `masks_bare_bearer_value_case_insensitive` flake on arm64-Linux
    /// CI. The lock is only contended during tests, so the production
    /// code path carries no overhead. Together with `temp_env`'s own
    /// save/restore logic it gives the whole crate deterministic env
    /// state.
    pub(crate) static ENV_TEST_MUTEX: Mutex<()> = Mutex::new(());

    /// Run `f` while holding the crate-wide env lock, inside a
    /// `temp_env` guard that explicitly UNsets `DEVBOY_TRACE_REDACTION`.
    ///
    /// Intended for every test that expects the default (enabled)
    /// redactor. A lock poisoned by an earlier panicking test is
    /// recovered rather than propagated.
    pub(crate) fn with_clean_env<R>(f: impl FnOnce() -> R) -> R {
        let _serialised = match ENV_TEST_MUTEX.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        temp_env::with_var("DEVBOY_TRACE_REDACTION", None::<&str>, f)
    }
}
274
#[cfg(test)]
mod tests {
    use super::test_support::{ENV_TEST_MUTEX, with_clean_env};
    use super::*;
    use serde_json::json;

    /// GitHub-PAT-shaped dummy value, long enough to clear the length guard.
    const FAKE_PAT: &str = "ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";

    /// Name of the opt-out switch several tests toggle.
    const SWITCH: &str = "DEVBOY_TRACE_REDACTION";

    /// Take the crate-wide env lock, recovering it if an earlier test
    /// panicked while holding it.
    fn lock_env() -> std::sync::MutexGuard<'static, ()> {
        ENV_TEST_MUTEX
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    #[test]
    fn masks_github_pat() {
        with_clean_env(|| {
            let sanitized = sanitize(json!({ "args": { "token": FAKE_PAT } }));
            let rendered = serde_json::to_string(&sanitized).unwrap();
            assert!(!rendered.contains("ghp_aaaaaaaa"));
            assert!(rendered.contains("<redacted"));
        });
    }

    #[test]
    fn masks_bearer_scheme_in_header_string() {
        with_clean_env(|| {
            let out = sanitize(json!("Authorization: Bearer xxxxxxxxxxxxyyyyyyyyyyyy"));
            let s = out.as_str().unwrap();
            assert!(!s.contains("xxxxxxxxxxxxyyyyyyyyyyyy"), "got: {s}");
            assert!(s.contains("<redacted"), "got: {s}");
        });
    }

    #[test]
    fn masks_by_key_name_even_when_value_looks_harmless() {
        with_clean_env(|| {
            // The value matches no known prefix, but it sits under a key
            // named `password`, so it must be redacted anyway.
            let sanitized = sanitize(json!({ "password": "not-a-prefix" }));
            assert_eq!(
                sanitized["password"].as_str(),
                Some("<redacted:secret-field>")
            );
        });
    }

    #[test]
    fn env_var_values_are_redacted_when_they_match_exactly() {
        const SECRET: &str = "super-secret-value-nothing-matches";

        let _env = lock_env();
        temp_env::with_vars(
            [
                (SWITCH, None::<&str>),
                ("DEVBOY_TEST_TOKEN", Some(SECRET)),
            ],
            || {
                // Exact-match replacement only fires when the value IS
                // the secret. A secret embedded in a larger string is
                // the DLP case this module does not attempt to solve.
                let embedded = format!("leaked: {}", SECRET);
                let sanitized = sanitize(json!({ "note": embedded.as_str() }));
                assert_eq!(sanitized["note"].as_str(), Some(embedded.as_str()));

                let sanitized = sanitize(json!({ "raw": SECRET }));
                assert_eq!(
                    sanitized["raw"].as_str(),
                    Some("<redacted:credential>")
                );
            },
        );
    }

    #[test]
    fn short_strings_are_not_redacted_by_prefix_check() {
        with_clean_env(|| {
            // A bare `ghp_` must survive: only long PAT-shaped strings
            // are redacted. This matters for documentation and for the
            // redaction marker itself.
            assert_eq!(sanitize(json!("ghp_")), json!("ghp_"));
        });
    }

    #[test]
    fn redaction_can_be_disabled_via_env() {
        let _env = lock_env();
        temp_env::with_var(SWITCH, Some("off"), || {
            let input = json!({ "token": FAKE_PAT });
            assert_eq!(sanitize(input.clone()), input);
        });
    }

    #[test]
    fn masks_bearer_scheme_case_insensitive() {
        with_clean_env(|| {
            // HTTP schemes are case-insensitive per RFC 7235, so every
            // casing below has to be redacted.
            let headers = [
                "Authorization: Bearer xxxxxxxxxxxxyyyyyyyyyyyy",
                "authorization: bearer xxxxxxxxxxxxyyyyyyyyyyyy",
                "AUTHORIZATION: BEARER xxxxxxxxxxxxyyyyyyyyyyyy",
                "authorization: BeArEr xxxxxxxxxxxxyyyyyyyyyyyy",
            ];
            for header in headers {
                let out = sanitize(json!(header));
                let s = out.as_str().unwrap();
                assert!(
                    !s.contains("xxxxxxxxxxxxyyyyyyyyyyyy"),
                    "token leaked for header `{header}` → `{s}`"
                );
                assert!(
                    s.contains("<redacted"),
                    "no redaction marker for header `{header}` → `{s}`"
                );
            }
        });
    }

    #[test]
    fn masks_bare_bearer_value_case_insensitive() {
        with_clean_env(|| {
            // A standalone `Bearer <token>` value is caught by the prefix
            // check rather than the header scanner; that path must be
            // case-insensitive as well.
            let samples = [
                "Bearer abcdefghijklmnopqrstuvwx",
                "bearer abcdefghijklmnopqrstuvwx",
                "BEARER abcdefghijklmnopqrstuvwx",
                "Basic YWxpY2U6aHVudGVyMjpkcmFnb24=",
            ];
            for raw in samples {
                let out = sanitize(json!(raw));
                let s = out.as_str().unwrap();
                assert!(s.contains("<redacted"), "not redacted: `{raw}` → `{s}`");
            }
        });
    }

    #[test]
    fn masks_generic_pk_prefix() {
        with_clean_env(|| {
            // ADR-015 names a generic `pk_` prefix (not only `pk_live_` /
            // `pk_test_`). The sample has enough bytes after the prefix
            // to clear the length guard.
            let sanitized = sanitize(json!({ "clickup_pk": "pk_abcdefghijklmnop" }));
            assert_eq!(
                sanitized["clickup_pk"].as_str(),
                Some("<redacted:token-pattern>"),
                "generic pk_ prefix should redact"
            );

            // A short `pk_` literal (e.g. in docs) stays untouched.
            assert_eq!(sanitize(json!("pk_")), json!("pk_"));
        });
    }

    #[test]
    fn redactor_snapshot_amortizes_env_scan() {
        let _env = lock_env();
        let redactor = temp_env::with_vars(
            [
                (SWITCH, None::<&str>),
                ("DEVBOY_REDACTOR_CACHE_TOKEN", Some("cached-token-zzzzzzzz")),
            ],
            || Redactor::snapshot(),
        );
        // The variable no longer exists here, yet the snapshot still
        // remembers its value.
        let sanitized = redactor.sanitize(json!({ "raw": "cached-token-zzzzzzzz" }));
        assert_eq!(
            sanitized["raw"].as_str(),
            Some("<redacted:credential>")
        );
    }

    #[test]
    fn redactor_snapshot_respects_disable_env() {
        let _env = lock_env();
        temp_env::with_var(SWITCH, Some("off"), || {
            let redactor = Redactor::snapshot();
            let input = json!({ "token": FAKE_PAT });
            assert_eq!(redactor.sanitize(input.clone()), input);
        });
    }
}
460}