Skip to main content

harn_vm/redact/
mod.rs

1//! Unified redaction policy for persisted and rendered operational data.
2//!
3//! Harn writes transcripts, receipts, event logs, portal JSON, connector
4//! status snapshots, and workflow artifacts. Each of those surfaces was
5//! previously responsible for its own ad-hoc scrubbing of HTTP headers,
6//! URL query parameters, JSON tokens, and free-form strings. This module
7//! is the single source of truth for "what is sensitive" so the same
8//! representative secret cannot leak through two surfaces by accident.
9//!
10//! # Categories
11//!
12//! - **Auth headers, cookies, signature/proxy tokens** — covered by
13//!   [`RedactionPolicy::redact_headers`].
14//! - **URLs with credentials in userinfo or sensitive query parameters**
15//!   — covered by [`RedactionPolicy::redact_url`].
16//! - **JSON fields whose name is auth/credential-shaped** — covered by
17//!   [`RedactionPolicy::redact_json_in_place`].
18//! - **Free-form strings carrying high-confidence secret patterns**
19//!   (Stripe `sk_live_…`, GitHub `ghp_…`, AWS `AKIA…`, Bearer tokens,
20//!   `-----BEGIN … PRIVATE KEY-----`) — covered by
21//!   [`RedactionPolicy::redact_string`] and applied recursively by
22//!   [`RedactionPolicy::redact_json_in_place`].
23//!
24//! # Host configuration
25//!
//! Hosts compose policies via the builder methods (`with_safe_header`,
//! `with_deny_header_substring`, `with_extra_field`,
//! `with_extra_url_param`, `disable_string_scan`).
28//! Active policies are pushed onto a thread-local stack the same way
29//! approval policies are, so a single orchestrator startup site can
30//! install host overrides for every persistence path that calls
31//! [`current_policy`].
32
33mod patterns;
34
35use std::borrow::Cow;
36use std::cell::RefCell;
37use std::collections::{BTreeMap, BTreeSet};
38
39use serde_json::Value as JsonValue;
40use url::Url;
41
42pub use patterns::scan_secret_patterns;
43
/// Placeholder string used everywhere a redacted value would otherwise
/// appear. Kept as a single constant so portal CSS, downstream parsers,
/// and humans grepping logs can rely on one form.
///
/// When this value ends up inside a URL query string it is
/// percent-encoded by the URL serializer (`%5Bredacted%5D`).
pub const REDACTED_PLACEHOLDER: &str = "[redacted]";

/// Header value for redacted HTTP headers. Identical to
/// [`REDACTED_PLACEHOLDER`] today, exposed as a separate symbol so the
/// trigger/event tests that pre-date the unified module remain readable.
pub const REDACTED_HEADER_VALUE: &str = REDACTED_PLACEHOLDER;
53
/// Rules deciding which header values, JSON fields, URL parts, and
/// string contents are replaced with [`REDACTED_PLACEHOLDER`].
///
/// All name sets hold lower-cased entries; the builder methods and the
/// `*_is_sensitive` predicates lower-case their input before comparing.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RedactionPolicy {
    /// Exact header names that are kept verbatim unless a host deny
    /// substring matches first.
    safe_headers: BTreeSet<String>,
    /// Built-in substrings that mark a header name as sensitive when
    /// the header is not on the safe-list.
    deny_header_substrings: BTreeSet<String>,
    /// Host-supplied substrings that mark a header name as sensitive;
    /// checked before the safe-list, so they always win.
    extra_deny_header_substrings: BTreeSet<String>,
    /// Host-supplied JSON field names (exact match) to redact in
    /// addition to the built-in list.
    extra_field_names: BTreeSet<String>,
    /// Host-supplied URL query parameter names (exact match) to redact
    /// in addition to the built-in list.
    extra_url_params: BTreeSet<String>,
    /// When `false`, `redact_string` returns its input untouched.
    scan_strings: bool,
    /// When `true`, `redact_url` strips username/password from URLs and
    /// `redact_string` attempts URL redaction on URL-shaped strings.
    redact_url_userinfo: bool,
}
64
65impl Default for RedactionPolicy {
66    fn default() -> Self {
67        Self {
68            safe_headers: default_safe_headers(),
69            deny_header_substrings: default_deny_header_substrings(),
70            extra_deny_header_substrings: BTreeSet::new(),
71            extra_field_names: BTreeSet::new(),
72            extra_url_params: BTreeSet::new(),
73            scan_strings: true,
74            redact_url_userinfo: true,
75        }
76    }
77}
78
79impl RedactionPolicy {
80    /// Permissive policy used by tests that need raw data. No headers,
81    /// fields, or strings are scrubbed.
82    pub fn passthrough() -> Self {
83        Self {
84            safe_headers: BTreeSet::new(),
85            deny_header_substrings: BTreeSet::new(),
86            extra_deny_header_substrings: BTreeSet::new(),
87            extra_field_names: BTreeSet::new(),
88            extra_url_params: BTreeSet::new(),
89            scan_strings: false,
90            redact_url_userinfo: false,
91        }
92    }
93
94    /// Add a header (case-insensitive) to the safe-list. Header
95    /// redaction will leave its value untouched even if the name would
96    /// otherwise look auth-shaped (e.g. an `x-…-key` header that is
97    /// actually a request-id).
98    pub fn with_safe_header(mut self, name: impl Into<String>) -> Self {
99        self.safe_headers.insert(name.into().to_ascii_lowercase());
100        self
101    }
102
103    /// Add a substring (case-insensitive) that always forces a header
104    /// to be treated as sensitive. Useful for product-specific token
105    /// header names that the default `cookie`/`authorization`/`token`/`secret`/`key`
106    /// substring set would miss.
107    pub fn with_deny_header_substring(mut self, fragment: impl Into<String>) -> Self {
108        self.extra_deny_header_substrings
109            .insert(fragment.into().to_ascii_lowercase());
110        self
111    }
112
113    /// Add a JSON field name (case-insensitive, exact match) that should
114    /// always be redacted regardless of value contents. Useful when a
115    /// host knows it stores `internal_audit_token` or similar.
116    pub fn with_extra_field(mut self, name: impl Into<String>) -> Self {
117        self.extra_field_names
118            .insert(name.into().to_ascii_lowercase());
119        self
120    }
121
122    /// Add an extra URL query parameter name to redact.
123    pub fn with_extra_url_param(mut self, name: impl Into<String>) -> Self {
124        self.extra_url_params
125            .insert(name.into().to_ascii_lowercase());
126        self
127    }
128
129    /// Disable the heuristic free-form string scanner. The scanner adds
130    /// a small but non-zero cost to every JSON payload walk; turn it off
131    /// for performance-critical paths that have already been audited.
132    pub fn disable_string_scan(mut self) -> Self {
133        self.scan_strings = false;
134        self
135    }
136
137    fn header_is_safe(&self, lower_name: &str) -> bool {
138        // Exact-name allowlist is one source of truth in `safe_headers`;
139        // suffix/substring rules below cover the families of debugging
140        // headers that providers emit with arbitrary suffixes.
141        if self.safe_headers.contains(lower_name) {
142            return true;
143        }
144        lower_name.ends_with("-event")
145            || lower_name.ends_with("-delivery")
146            || lower_name.contains("timestamp")
147            || lower_name.contains("request-id")
148    }
149
150    /// Whether a given HTTP header name should have its value replaced
151    /// with [`REDACTED_HEADER_VALUE`].
152    ///
153    /// Host-explicit deny substrings always win, even over the built-in
154    /// safe-list — that is how a host says "treat my own webhook
155    /// delivery header as sensitive even though Harn would normally
156    /// keep it for debugging."
157    pub fn header_is_sensitive(&self, name: &str) -> bool {
158        let lower = name.to_ascii_lowercase();
159        if self
160            .extra_deny_header_substrings
161            .iter()
162            .any(|fragment| lower.contains(fragment))
163        {
164            return true;
165        }
166        if self.header_is_safe(&lower) {
167            return false;
168        }
169        self.deny_header_substrings
170            .iter()
171            .any(|fragment| lower.contains(fragment))
172    }
173
174    /// Whether a JSON object field name should be replaced with the
175    /// redacted placeholder before the value is even inspected.
176    pub fn field_is_sensitive(&self, name: &str) -> bool {
177        let lower = name.to_ascii_lowercase();
178        if self.extra_field_names.contains(&lower) {
179            return true;
180        }
181        is_default_sensitive_field(&lower)
182    }
183
184    /// Whether a URL query parameter name should have its value
185    /// replaced.
186    pub fn url_param_is_sensitive(&self, name: &str) -> bool {
187        let lower = name.to_ascii_lowercase();
188        if self.extra_url_params.contains(&lower) {
189            return true;
190        }
191        is_default_sensitive_url_param(&lower)
192    }
193
194    /// Returns a [`BTreeMap`] of headers with sensitive values replaced
195    /// by [`REDACTED_HEADER_VALUE`].
196    pub fn redact_headers(&self, headers: &BTreeMap<String, String>) -> BTreeMap<String, String> {
197        headers
198            .iter()
199            .map(|(name, value)| {
200                if self.header_is_sensitive(name) {
201                    (name.clone(), REDACTED_HEADER_VALUE.to_string())
202                } else {
203                    (name.clone(), value.clone())
204                }
205            })
206            .collect()
207    }
208
209    /// Redact sensitive query parameters and credentials in URL
210    /// userinfo. Returns the input unchanged if nothing matches or the
211    /// URL fails to parse.
212    pub fn redact_url(&self, url: &str) -> String {
213        let Ok(mut parsed) = Url::parse(url) else {
214            return self.redact_string(url).into_owned();
215        };
216        let mut changed = false;
217
218        if self.redact_url_userinfo
219            && (!parsed.username().is_empty() || parsed.password().is_some())
220        {
221            // url::Url returns Err only when the URL cannot have a
222            // password (e.g. cannot-be-a-base). Treat that as a no-op.
223            if parsed.set_username("").is_ok() {
224                changed = true;
225            }
226            if parsed.set_password(None).is_ok() {
227                changed = true;
228            }
229        }
230
231        let pairs: Vec<(String, String)> = parsed
232            .query_pairs()
233            .map(|(key, value)| {
234                if self.url_param_is_sensitive(&key) {
235                    changed = true;
236                    (key.into_owned(), REDACTED_PLACEHOLDER.to_string())
237                } else {
238                    (key.into_owned(), value.into_owned())
239                }
240            })
241            .collect();
242        let original_query = parsed.query().map(str::to_string);
243        if !pairs.is_empty() {
244            parsed.set_query(None);
245            let mut query = parsed.query_pairs_mut();
246            for (key, value) in &pairs {
247                query.append_pair(key, value);
248            }
249        }
250        // `query_pairs_mut` always re-encodes; restore the original
251        // query string when nothing was actually redacted so we don't
252        // perturb otherwise stable URLs.
253        if !changed {
254            parsed.set_query(original_query.as_deref());
255            return parsed.to_string();
256        }
257        parsed.to_string()
258    }
259
260    /// Returns a redacted string. Cheap (`Cow::Borrowed`) when nothing
261    /// matched. Applies, in order: URL-shaped string detection (so the
262    /// userinfo or sensitive query params on `https://user:pw@…?api_key=…`
263    /// are scrubbed), then high-confidence secret pattern replacement.
264    pub fn redact_string<'a>(&self, value: &'a str) -> Cow<'a, str> {
265        if !self.scan_strings {
266            return Cow::Borrowed(value);
267        }
268        match self.redact_url_in_string(value) {
269            Cow::Borrowed(_) => scan_secret_patterns(value, REDACTED_PLACEHOLDER),
270            Cow::Owned(url_scrubbed) => {
271                let pattern_scrubbed =
272                    scan_secret_patterns(&url_scrubbed, REDACTED_PLACEHOLDER).into_owned();
273                Cow::Owned(pattern_scrubbed)
274            }
275        }
276    }
277
278    /// If `value` is a single URL with credentials or sensitive query
279    /// params, return the redacted form. Standalone URLs are common in
280    /// logged request envelopes; we don't try to walk arbitrary text
281    /// for embedded URLs because that turns into ad-hoc tokenization.
282    fn redact_url_in_string<'a>(&self, value: &'a str) -> Cow<'a, str> {
283        if !self.redact_url_userinfo
284            || !(value.starts_with("http://") || value.starts_with("https://"))
285        {
286            return Cow::Borrowed(value);
287        }
288        let trimmed = value.trim();
289        if trimmed.contains(char::is_whitespace) {
290            return Cow::Borrowed(value);
291        }
292        let redacted = self.redact_url(trimmed);
293        if redacted == trimmed {
294            Cow::Borrowed(value)
295        } else {
296            Cow::Owned(redacted)
297        }
298    }
299
300    /// Recursively walk a JSON value, redacting sensitive object fields
301    /// and string contents in place.
302    pub fn redact_json_in_place(&self, value: &mut JsonValue) {
303        match value {
304            JsonValue::Object(map) => {
305                let mut keys_to_redact: Vec<String> = Vec::new();
306                for (key, child) in map.iter_mut() {
307                    if self.field_is_sensitive(key) {
308                        keys_to_redact.push(key.clone());
309                    } else {
310                        self.redact_json_in_place(child);
311                    }
312                }
313                for key in keys_to_redact {
314                    map.insert(key, JsonValue::String(REDACTED_PLACEHOLDER.to_string()));
315                }
316            }
317            JsonValue::Array(items) => {
318                for item in items.iter_mut() {
319                    self.redact_json_in_place(item);
320                }
321            }
322            JsonValue::String(s) => {
323                let redacted = self.redact_string(s);
324                if let Cow::Owned(replacement) = redacted {
325                    *s = replacement;
326                }
327            }
328            _ => {}
329        }
330    }
331
332    /// Convenience for callers that have an immutable JSON value: clone
333    /// once and redact.
334    pub fn redact_json(&self, value: &JsonValue) -> JsonValue {
335        let mut clone = value.clone();
336        self.redact_json_in_place(&mut clone);
337        clone
338    }
339}
340
/// Lower-cased header names that the default policy keeps verbatim.
fn default_safe_headers() -> BTreeSet<String> {
    const SAFE_HEADER_NAMES: &[&str] = &[
        "content-length",
        "content-type",
        "request-id",
        "user-agent",
        "x-a2a-delivery",
        "x-a2a-signature",
        "x-correlation-id",
        "x-github-delivery",
        "x-github-event",
        "x-github-hook-id",
        "x-hub-signature-256",
        "x-linear-signature",
        "x-notion-signature",
        "x-request-id",
        "x-slack-request-timestamp",
        "x-slack-signature",
    ];
    SAFE_HEADER_NAMES
        .iter()
        .map(|name| (*name).to_string())
        .collect()
}
361
/// Substrings that mark a header name as sensitive under the default
/// policy.
fn default_deny_header_substrings() -> BTreeSet<String> {
    ["authorization", "cookie", "secret", "token", "key"]
        .iter()
        .map(|fragment| (*fragment).to_string())
        .collect()
}
371
/// Built-in check for URL query parameter names whose values are
/// redacted. `lower` must already be lower-cased.
fn is_default_sensitive_url_param(lower: &str) -> bool {
    const EXACT_NAMES: &[&str] = &[
        "api_key",
        "apikey",
        "access_token",
        "refresh_token",
        "id_token",
        "client_secret",
        "password",
        "secret",
        "token",
        "auth",
        "bearer",
        "sig",
        "signature",
    ];
    const SUFFIXES: &[&str] = &["_token", "_secret", "_password"];

    if EXACT_NAMES.contains(&lower) {
        return true;
    }
    SUFFIXES.iter().any(|suffix| lower.ends_with(*suffix))
}
392
/// Built-in check for JSON field names whose values are redacted.
/// `lower` must already be lower-cased.
fn is_default_sensitive_field(lower: &str) -> bool {
    const EXACT_NAMES: &[&str] = &[
        "authorization",
        "proxy-authorization",
        "cookie",
        "set-cookie",
        "api_key",
        "apikey",
        "api-key",
        "x-api-key",
        "x-auth-token",
        "x-csrf-token",
        "x-xsrf-token",
        "access_token",
        "refresh_token",
        "id_token",
        "bearer_token",
        "client_secret",
        "secret",
        "password",
        "passwd",
        "private_key",
        "session_token",
    ];
    const SUFFIXES: &[&str] = &["_token", "_secret", "_password", "_apikey", "_api_key"];

    if EXACT_NAMES.contains(&lower) {
        return true;
    }
    SUFFIXES.iter().any(|suffix| lower.ends_with(*suffix))
}
423
thread_local! {
    /// Per-thread stack of installed policies. The last element is the
    /// active one; an empty stack means the default policy applies.
    static REDACTION_POLICY_STACK: RefCell<Vec<RedactionPolicy>> = const { RefCell::new(Vec::new()) };
}
427
428/// Push a policy onto the thread-local stack. Pair every push with a
429/// [`pop_policy`] call (or use [`PolicyGuard`]).
430pub fn push_policy(policy: RedactionPolicy) {
431    REDACTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
432}
433
434/// Pop the most recently pushed policy. Safe to call when the stack is
435/// empty.
436pub fn pop_policy() {
437    REDACTION_POLICY_STACK.with(|stack| {
438        stack.borrow_mut().pop();
439    });
440}
441
442/// Drop all installed policies. Used by `reset_thread_local_state` so
443/// test runs that share a thread cannot leak policy overrides into
444/// each other.
445pub fn clear_policy_stack() {
446    REDACTION_POLICY_STACK.with(|stack| stack.borrow_mut().clear());
447}
448
449/// Return the currently installed policy, falling back to
450/// [`RedactionPolicy::default`] when the stack is empty. Always returns
451/// an owned clone so callers can drop the borrow before recursing.
452pub fn current_policy() -> RedactionPolicy {
453    REDACTION_POLICY_STACK.with(|stack| {
454        stack
455            .borrow()
456            .last()
457            .cloned()
458            .unwrap_or_else(RedactionPolicy::default)
459    })
460}
461
462/// RAII guard that pushes a policy on construction and pops it on drop.
463///
464/// ```ignore
465/// let _guard = harn_vm::redact::PolicyGuard::new(RedactionPolicy::default());
466/// // … emit receipts, transcripts, etc.
467/// ```
468pub struct PolicyGuard;
469
470impl PolicyGuard {
471    pub fn new(policy: RedactionPolicy) -> Self {
472        push_policy(policy);
473        Self
474    }
475}
476
477impl Drop for PolicyGuard {
478    fn drop(&mut self) {
479        pop_policy();
480    }
481}
482
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Mixed set of sensitive and harmless headers shared by the
    /// header tests.
    fn sample_headers() -> BTreeMap<String, String> {
        BTreeMap::from([
            ("Authorization".to_string(), "Bearer secret123".to_string()),
            ("Cookie".to_string(), "session=abc".to_string()),
            ("Content-Type".to_string(), "application/json".to_string()),
            ("X-Webhook-Token".to_string(), "tok-xyz".to_string()),
            ("User-Agent".to_string(), "Harn/1.0".to_string()),
            ("X-GitHub-Delivery".to_string(), "delivery-123".to_string()),
        ])
    }

    #[test]
    fn default_policy_redacts_auth_headers_and_keeps_safe_ones() {
        let policy = RedactionPolicy::default();
        let redacted = policy.redact_headers(&sample_headers());
        assert_eq!(
            redacted.get("Authorization").unwrap(),
            REDACTED_HEADER_VALUE
        );
        assert_eq!(redacted.get("Cookie").unwrap(), REDACTED_HEADER_VALUE);
        assert_eq!(
            redacted.get("X-Webhook-Token").unwrap(),
            REDACTED_HEADER_VALUE
        );
        assert_eq!(redacted.get("User-Agent").unwrap(), "Harn/1.0");
        assert_eq!(redacted.get("X-GitHub-Delivery").unwrap(), "delivery-123");
        assert_eq!(redacted.get("Content-Type").unwrap(), "application/json");
    }

    #[test]
    fn passthrough_policy_redacts_nothing() {
        let policy = RedactionPolicy::passthrough();
        let redacted = policy.redact_headers(&sample_headers());
        assert_eq!(redacted.get("Authorization").unwrap(), "Bearer secret123");
    }

    #[test]
    fn host_can_extend_safe_and_deny_headers() {
        let policy = RedactionPolicy::default()
            .with_safe_header("X-Webhook-Token")
            .with_deny_header_substring("delivery");
        let redacted = policy.redact_headers(&sample_headers());
        assert_eq!(redacted.get("X-Webhook-Token").unwrap(), "tok-xyz");
        assert_eq!(
            redacted.get("X-GitHub-Delivery").unwrap(),
            REDACTED_HEADER_VALUE,
            "host explicitly forced delivery to be sensitive"
        );
    }

    #[test]
    fn host_can_extend_fields_and_url_params() {
        // Builder input is mixed-case on purpose: matching must be
        // case-insensitive on both the configured and the checked name.
        let policy = RedactionPolicy::default()
            .with_extra_field("Internal_Audit")
            .with_extra_url_param("Session");

        let redacted = policy.redact_json(&json!({
            "internal_audit": "keep-out",
            "name": "alice",
        }));
        assert_eq!(redacted["internal_audit"], REDACTED_PLACEHOLDER);
        assert_eq!(redacted["name"], "alice");

        let url = policy.redact_url("https://example.com/cb?session=abc&page=2");
        assert!(url.contains("session=%5Bredacted%5D"));
        assert!(url.contains("page=2"));
        assert!(!url.contains("abc"));
    }

    #[test]
    fn redact_url_strips_userinfo_and_sensitive_query_params() {
        let policy = RedactionPolicy::default();
        let redacted =
            policy.redact_url("https://user:pw@api.example.com/v1?api_key=abcdef&page=2");
        assert!(redacted.contains("api_key=%5Bredacted%5D"));
        assert!(redacted.contains("page=2"));
        assert!(!redacted.contains("user:pw@"));
    }

    #[test]
    fn redact_url_leaves_clean_urls_alone() {
        let policy = RedactionPolicy::default();
        let url = "https://api.example.com/v1?page=2";
        assert_eq!(policy.redact_url(url), url);
    }

    #[test]
    fn redact_json_strips_sensitive_field_names_recursively() {
        let policy = RedactionPolicy::default();
        let mut value = json!({
            "headers": {
                "authorization": "Bearer abc",
                "x-trace-id": "trace_1",
            },
            "list": [
                { "auth_token": "tok_secret", "name": "alice" },
                { "name": "bob" },
            ],
            "free_form": "Bearer ghp_abcdefghijklmnopqrstuvwxyz0123456789ABCD",
            "url": "https://api.example.com/v1?api_key=hideme",
        });
        policy.redact_json_in_place(&mut value);
        assert_eq!(value["headers"]["authorization"], REDACTED_PLACEHOLDER);
        assert_eq!(value["headers"]["x-trace-id"], "trace_1");
        assert_eq!(value["list"][0]["auth_token"], REDACTED_PLACEHOLDER);
        assert_eq!(value["list"][0]["name"], "alice");
        assert_eq!(value["list"][1]["name"], "bob");
        let free_form = value["free_form"].as_str().unwrap();
        assert!(free_form.contains(REDACTED_PLACEHOLDER));
        assert!(!free_form.contains("ghp_abcdefghijklmnopqrstuvwxyz0123456789ABCD"));
        // URL-shaped string values must have their query secrets
        // scrubbed even though the field name itself is harmless.
        let url = value["url"].as_str().unwrap();
        assert!(!url.contains("hideme"));
    }

    #[test]
    fn policy_guard_pushes_and_pops_thread_local() {
        clear_policy_stack();
        assert_eq!(current_policy(), RedactionPolicy::default());
        {
            let policy = RedactionPolicy::default().with_extra_field("custom_token");
            let _guard = PolicyGuard::new(policy.clone());
            assert_eq!(current_policy(), policy);
        }
        assert_eq!(current_policy(), RedactionPolicy::default());
    }

    #[test]
    fn redact_string_replaces_known_secret_patterns() {
        let policy = RedactionPolicy::default();
        let input =
            "use sk-proj-abcdefghijklmnopqrstuvwxyz0123456789ABCD or AKIAABCDEFGHIJKLMNOP for now";
        let out = policy.redact_string(input);
        assert!(out.contains(REDACTED_PLACEHOLDER));
        assert!(!out.contains("AKIAABCDEFGHIJKLMNOP"));
        assert!(!out.contains("sk-proj-abcdefghijklmnopqrstuvwxyz0123456789ABCD"));
    }
}