Skip to main content

harn_vm/redact/
mod.rs

1//! Unified redaction policy for persisted and rendered operational data.
2//!
3//! Harn writes transcripts, receipts, event logs, portal JSON, connector
4//! status snapshots, and workflow artifacts. Each of those surfaces was
5//! previously responsible for its own ad-hoc scrubbing of HTTP headers,
6//! URL query parameters, JSON tokens, and free-form strings. This module
7//! is the single source of truth for "what is sensitive" so the same
8//! representative secret cannot leak through two surfaces by accident.
9//!
10//! # Categories
11//!
12//! - **Auth headers, cookies, signature/proxy tokens** — covered by
13//!   [`RedactionPolicy::redact_headers`].
14//! - **URLs with credentials in userinfo or sensitive query parameters**
15//!   — covered by [`RedactionPolicy::redact_url`].
16//! - **JSON fields whose name is auth/credential-shaped** — covered by
17//!   [`RedactionPolicy::redact_json_in_place`].
18//! - **Free-form strings carrying high-confidence secret patterns**
19//!   (Stripe `sk_live_…`, GitHub `ghp_…`, AWS `AKIA…`, Bearer tokens,
20//!   `-----BEGIN … PRIVATE KEY-----`) — covered by
21//!   [`RedactionPolicy::redact_string`] and applied recursively by
22//!   [`RedactionPolicy::redact_json_in_place`].
23//!
24//! # Host configuration
25//!
//! Hosts compose policies via the builder methods (`with_safe_header`,
//! `with_deny_header_substring`, `with_extra_field`,
//! `with_extra_url_param`, `disable_string_scan`). Active policies are
//! pushed onto a thread-local stack the same way approval policies are,
//! so a single orchestrator startup site can install host overrides for
//! every persistence path on that thread that calls [`current_policy`].
32
33mod patterns;
34
35use std::borrow::Cow;
36use std::cell::RefCell;
37use std::collections::{BTreeMap, BTreeSet};
38
39use serde_json::Value as JsonValue;
40use url::Url;
41
42pub use patterns::{
43    clear_audit_ring, clear_custom_patterns, custom_pattern_names, default_pattern_names,
44    drain_audit_ring, install_audit_sink, register_custom_pattern, scan_secret_patterns, AuditSink,
45    NamedPattern, RedactionEvent, TOKEN_REDACTION_AUDIT_TOPIC, TOKEN_REDACTION_DIAGNOSTIC,
46};
47
/// Placeholder string used wherever this module replaces a whole value
/// (sensitive JSON fields, header values, URL query parameter values).
/// Kept as a single constant so portal CSS, downstream parsers, and
/// humans grepping logs can rely on one form.
///
/// Inside a URL query string the brackets are percent-encoded, so the
/// placeholder shows up there as `%5Bredacted%5D`.
pub const REDACTED_PLACEHOLDER: &str = "[redacted]";

/// Header value for redacted HTTP headers. Identical to
/// [`REDACTED_PLACEHOLDER`] today, exposed as a separate symbol so the
/// trigger/event tests that pre-date the unified module remain readable.
pub const REDACTED_HEADER_VALUE: &str = REDACTED_PLACEHOLDER;
57
/// Set of rules deciding which HTTP header names, JSON field names, URL
/// query parameters, and free-form strings get redacted.
///
/// Start from [`RedactionPolicy::default`] (or
/// [`RedactionPolicy::passthrough`]) and refine it with the `with_*` /
/// `disable_*` builder methods. The builders lowercase every name they
/// store, and the lookup methods lowercase the incoming name, which is
/// what makes matching case-insensitive.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RedactionPolicy {
    /// Lowercased header names that are exempt from the built-in deny
    /// fragments (exact match).
    safe_headers: BTreeSet<String>,
    /// Built-in lowercased fragments: a header whose name contains one
    /// is sensitive unless it is considered safe.
    deny_header_substrings: BTreeSet<String>,
    /// Host-supplied lowercased fragments; checked first, so they win
    /// even over the safe-list.
    extra_deny_header_substrings: BTreeSet<String>,
    /// Host-supplied lowercased JSON field names that are always
    /// redacted (exact match), in addition to the built-in rules.
    extra_field_names: BTreeSet<String>,
    /// Host-supplied lowercased URL query parameter names that are
    /// always redacted (exact match), in addition to the built-in rules.
    extra_url_params: BTreeSet<String>,
    /// When `false`, `redact_string` returns its input untouched.
    scan_strings: bool,
    /// When `true`, `redact_url` strips username/password from URLs and
    /// `redact_string` treats standalone `http(s)://` strings as URLs.
    redact_url_userinfo: bool,
}
68
69impl Default for RedactionPolicy {
70    fn default() -> Self {
71        Self {
72            safe_headers: default_safe_headers(),
73            deny_header_substrings: default_deny_header_substrings(),
74            extra_deny_header_substrings: BTreeSet::new(),
75            extra_field_names: BTreeSet::new(),
76            extra_url_params: BTreeSet::new(),
77            scan_strings: true,
78            redact_url_userinfo: true,
79        }
80    }
81}
82
83impl RedactionPolicy {
84    /// Permissive policy used by tests that need raw data. No headers,
85    /// fields, or strings are scrubbed.
86    pub fn passthrough() -> Self {
87        Self {
88            safe_headers: BTreeSet::new(),
89            deny_header_substrings: BTreeSet::new(),
90            extra_deny_header_substrings: BTreeSet::new(),
91            extra_field_names: BTreeSet::new(),
92            extra_url_params: BTreeSet::new(),
93            scan_strings: false,
94            redact_url_userinfo: false,
95        }
96    }
97
98    /// Add a header (case-insensitive) to the safe-list. Header
99    /// redaction will leave its value untouched even if the name would
100    /// otherwise look auth-shaped (e.g. an `x-…-key` header that is
101    /// actually a request-id).
102    pub fn with_safe_header(mut self, name: impl Into<String>) -> Self {
103        self.safe_headers.insert(name.into().to_ascii_lowercase());
104        self
105    }
106
107    /// Add a substring (case-insensitive) that always forces a header
108    /// to be treated as sensitive. Useful for product-specific token
109    /// header names that the default `cookie`/`authorization`/`token`/`secret`/`key`
110    /// substring set would miss.
111    pub fn with_deny_header_substring(mut self, fragment: impl Into<String>) -> Self {
112        self.extra_deny_header_substrings
113            .insert(fragment.into().to_ascii_lowercase());
114        self
115    }
116
117    /// Add a JSON field name (case-insensitive, exact match) that should
118    /// always be redacted regardless of value contents. Useful when a
119    /// host knows it stores `internal_audit_token` or similar.
120    pub fn with_extra_field(mut self, name: impl Into<String>) -> Self {
121        self.extra_field_names
122            .insert(name.into().to_ascii_lowercase());
123        self
124    }
125
126    /// Add an extra URL query parameter name to redact.
127    pub fn with_extra_url_param(mut self, name: impl Into<String>) -> Self {
128        self.extra_url_params
129            .insert(name.into().to_ascii_lowercase());
130        self
131    }
132
133    /// Disable the heuristic free-form string scanner. The scanner adds
134    /// a small but non-zero cost to every JSON payload walk; turn it off
135    /// for performance-critical paths that have already been audited.
136    pub fn disable_string_scan(mut self) -> Self {
137        self.scan_strings = false;
138        self
139    }
140
141    fn header_is_safe(&self, lower_name: &str) -> bool {
142        // Exact-name allowlist is one source of truth in `safe_headers`;
143        // suffix/substring rules below cover the families of debugging
144        // headers that providers emit with arbitrary suffixes.
145        if self.safe_headers.contains(lower_name) {
146            return true;
147        }
148        lower_name.ends_with("-event")
149            || lower_name.ends_with("-delivery")
150            || lower_name.contains("timestamp")
151            || lower_name.contains("request-id")
152    }
153
154    /// Whether a given HTTP header name should have its value replaced
155    /// with [`REDACTED_HEADER_VALUE`].
156    ///
157    /// Host-explicit deny substrings always win, even over the built-in
158    /// safe-list — that is how a host says "treat my own webhook
159    /// delivery header as sensitive even though Harn would normally
160    /// keep it for debugging."
161    pub fn header_is_sensitive(&self, name: &str) -> bool {
162        let lower = name.to_ascii_lowercase();
163        if self
164            .extra_deny_header_substrings
165            .iter()
166            .any(|fragment| lower.contains(fragment))
167        {
168            return true;
169        }
170        if self.header_is_safe(&lower) {
171            return false;
172        }
173        self.deny_header_substrings
174            .iter()
175            .any(|fragment| lower.contains(fragment))
176    }
177
178    /// Whether a JSON object field name should be replaced with the
179    /// redacted placeholder before the value is even inspected.
180    pub fn field_is_sensitive(&self, name: &str) -> bool {
181        let lower = name.to_ascii_lowercase();
182        if self.extra_field_names.contains(&lower) {
183            return true;
184        }
185        is_default_sensitive_field(&lower)
186    }
187
188    /// Whether a URL query parameter name should have its value
189    /// replaced.
190    pub fn url_param_is_sensitive(&self, name: &str) -> bool {
191        let lower = name.to_ascii_lowercase();
192        if self.extra_url_params.contains(&lower) {
193            return true;
194        }
195        is_default_sensitive_url_param(&lower)
196    }
197
198    /// Returns a [`BTreeMap`] of headers with sensitive values replaced
199    /// by [`REDACTED_HEADER_VALUE`].
200    pub fn redact_headers(&self, headers: &BTreeMap<String, String>) -> BTreeMap<String, String> {
201        headers
202            .iter()
203            .map(|(name, value)| {
204                if self.header_is_sensitive(name) {
205                    (name.clone(), REDACTED_HEADER_VALUE.to_string())
206                } else {
207                    (name.clone(), value.clone())
208                }
209            })
210            .collect()
211    }
212
213    /// Redact sensitive query parameters and credentials in URL
214    /// userinfo. Returns the input unchanged if nothing matches or the
215    /// URL fails to parse.
216    pub fn redact_url(&self, url: &str) -> String {
217        let Ok(mut parsed) = Url::parse(url) else {
218            return self.redact_string(url).into_owned();
219        };
220        let mut changed = false;
221
222        if self.redact_url_userinfo
223            && (!parsed.username().is_empty() || parsed.password().is_some())
224        {
225            // url::Url returns Err only when the URL cannot have a
226            // password (e.g. cannot-be-a-base). Treat that as a no-op.
227            if parsed.set_username("").is_ok() {
228                changed = true;
229            }
230            if parsed.set_password(None).is_ok() {
231                changed = true;
232            }
233        }
234
235        let pairs: Vec<(String, String)> = parsed
236            .query_pairs()
237            .map(|(key, value)| {
238                if self.url_param_is_sensitive(&key) {
239                    changed = true;
240                    (key.into_owned(), REDACTED_PLACEHOLDER.to_string())
241                } else {
242                    (key.into_owned(), value.into_owned())
243                }
244            })
245            .collect();
246        let original_query = parsed.query().map(str::to_string);
247        if !pairs.is_empty() {
248            parsed.set_query(None);
249            let mut query = parsed.query_pairs_mut();
250            for (key, value) in &pairs {
251                query.append_pair(key, value);
252            }
253        }
254        // `query_pairs_mut` always re-encodes; restore the original
255        // query string when nothing was actually redacted so we don't
256        // perturb otherwise stable URLs.
257        if !changed {
258            parsed.set_query(original_query.as_deref());
259            return parsed.to_string();
260        }
261        parsed.to_string()
262    }
263
264    /// Returns a redacted string. Cheap (`Cow::Borrowed`) when nothing
265    /// matched. Applies, in order: URL-shaped string detection (so the
266    /// userinfo or sensitive query params on `https://user:pw@…?api_key=…`
267    /// are scrubbed), then high-confidence secret pattern replacement.
268    pub fn redact_string<'a>(&self, value: &'a str) -> Cow<'a, str> {
269        if !self.scan_strings {
270            return Cow::Borrowed(value);
271        }
272        match self.redact_url_in_string(value) {
273            Cow::Borrowed(_) => scan_secret_patterns(value, REDACTED_PLACEHOLDER),
274            Cow::Owned(url_scrubbed) => {
275                let pattern_scrubbed =
276                    scan_secret_patterns(&url_scrubbed, REDACTED_PLACEHOLDER).into_owned();
277                Cow::Owned(pattern_scrubbed)
278            }
279        }
280    }
281
282    /// If `value` is a single URL with credentials or sensitive query
283    /// params, return the redacted form. Standalone URLs are common in
284    /// logged request envelopes; we don't try to walk arbitrary text
285    /// for embedded URLs because that turns into ad-hoc tokenization.
286    fn redact_url_in_string<'a>(&self, value: &'a str) -> Cow<'a, str> {
287        if !self.redact_url_userinfo
288            || !(value.starts_with("http://") || value.starts_with("https://"))
289        {
290            return Cow::Borrowed(value);
291        }
292        let trimmed = value.trim();
293        if trimmed.contains(char::is_whitespace) {
294            return Cow::Borrowed(value);
295        }
296        let redacted = self.redact_url(trimmed);
297        if redacted == trimmed {
298            Cow::Borrowed(value)
299        } else {
300            Cow::Owned(redacted)
301        }
302    }
303
304    /// Recursively walk a JSON value, redacting sensitive object fields
305    /// and string contents in place.
306    pub fn redact_json_in_place(&self, value: &mut JsonValue) {
307        match value {
308            JsonValue::Object(map) => {
309                let mut keys_to_redact: Vec<String> = Vec::new();
310                for (key, child) in map.iter_mut() {
311                    if self.field_is_sensitive(key) {
312                        keys_to_redact.push(key.clone());
313                    } else {
314                        self.redact_json_in_place(child);
315                    }
316                }
317                for key in keys_to_redact {
318                    map.insert(key, JsonValue::String(REDACTED_PLACEHOLDER.to_string()));
319                }
320            }
321            JsonValue::Array(items) => {
322                for item in items.iter_mut() {
323                    self.redact_json_in_place(item);
324                }
325            }
326            JsonValue::String(s) => {
327                let redacted = self.redact_string(s);
328                if let Cow::Owned(replacement) = redacted {
329                    *s = replacement;
330                }
331            }
332            _ => {}
333        }
334    }
335
336    /// Convenience for callers that have an immutable JSON value: clone
337    /// once and redact.
338    pub fn redact_json(&self, value: &JsonValue) -> JsonValue {
339        let mut clone = value.clone();
340        self.redact_json_in_place(&mut clone);
341        clone
342    }
343}
344
/// Built-in safe-list: lowercased header names whose values are kept
/// for debugging even if a deny fragment would otherwise match.
fn default_safe_headers() -> BTreeSet<String> {
    const SAFE_HEADER_NAMES: &[&str] = &[
        "content-length",
        "content-type",
        "request-id",
        "user-agent",
        "x-a2a-delivery",
        "x-correlation-id",
        "x-github-delivery",
        "x-github-event",
        "x-github-hook-id",
        "x-request-id",
        "x-slack-request-timestamp",
    ];
    SAFE_HEADER_NAMES
        .iter()
        .map(|name| name.to_string())
        .collect()
}
360
/// Built-in deny fragments: a header whose lowercased name contains any
/// of these is treated as sensitive unless it is considered safe.
fn default_deny_header_substrings() -> BTreeSet<String> {
    const DENY_FRAGMENTS: &[&str] = &[
        "authorization",
        "cookie",
        "secret",
        "signature",
        "token",
        "key",
    ];
    DENY_FRAGMENTS
        .iter()
        .map(|fragment| fragment.to_string())
        .collect()
}
371
372fn is_default_sensitive_url_param(lower: &str) -> bool {
373    let compact = compact_secret_name(lower);
374    matches!(
375        compact.as_str(),
376        "apikey"
377            | "accesstoken"
378            | "refreshtoken"
379            | "idtoken"
380            | "clientsecret"
381            | "password"
382            | "secret"
383            | "token"
384            | "auth"
385            | "bearer"
386            | "sig"
387            | "signature"
388    ) || compact.ends_with("token")
389        || compact.ends_with("secret")
390        || compact.ends_with("password")
391}
392
393fn is_default_sensitive_field(lower: &str) -> bool {
394    let compact = compact_secret_name(lower);
395    matches!(
396        compact.as_str(),
397        "authorization"
398            | "proxyauthorization"
399            | "cookie"
400            | "setcookie"
401            | "apikey"
402            | "xamzsecuritytoken"
403            | "xapikey"
404            | "xauthtoken"
405            | "xcsrftoken"
406            | "xxsrftoken"
407            | "accesstoken"
408            | "refreshtoken"
409            | "idtoken"
410            | "bearertoken"
411            | "clientsecret"
412            | "password"
413            | "secret"
414            | "passwd"
415            | "privatekey"
416            | "sessiontoken"
417    ) || compact.ends_with("token")
418        || compact.ends_with("secret")
419        || compact.ends_with("password")
420        || compact.ends_with("apikey")
421}
422
/// Strip every `_` and `-` from `lower` so that snake_case, kebab-case,
/// and run-together spellings of a name compare equal. No other
/// character is changed; the caller is responsible for lowercasing.
fn compact_secret_name(lower: &str) -> String {
    lower.split(['_', '-']).collect()
}
429
thread_local! {
    // Per-thread stack of installed policies. `push_policy` appends,
    // `pop_policy` removes the last entry, and `current_policy` reads
    // the last entry (falling back to the default policy when empty).
    // Starts out empty on every thread.
    static REDACTION_POLICY_STACK: RefCell<Vec<RedactionPolicy>> = const { RefCell::new(Vec::new()) };
}
433
434/// Push a policy onto the thread-local stack. Pair every push with a
435/// [`pop_policy`] call (or use [`PolicyGuard`]).
436pub fn push_policy(policy: RedactionPolicy) {
437    REDACTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
438}
439
440/// Pop the most recently pushed policy. Safe to call when the stack is
441/// empty.
442pub fn pop_policy() {
443    REDACTION_POLICY_STACK.with(|stack| {
444        stack.borrow_mut().pop();
445    });
446}
447
448/// Drop all installed policies, custom token-redaction patterns, the
449/// audit sink, and the per-thread audit ring. Used by
450/// `reset_thread_local_state` so test runs that share a thread cannot
451/// leak policy overrides into each other.
452pub fn clear_policy_stack() {
453    REDACTION_POLICY_STACK.with(|stack| stack.borrow_mut().clear());
454    patterns::clear_custom_patterns();
455    let _ = patterns::install_audit_sink(None);
456    patterns::clear_audit_ring();
457}
458
459/// Return the currently installed policy, falling back to
460/// [`RedactionPolicy::default`] when the stack is empty. Always returns
461/// an owned clone so callers can drop the borrow before recursing.
462pub fn current_policy() -> RedactionPolicy {
463    REDACTION_POLICY_STACK.with(|stack| {
464        stack
465            .borrow()
466            .last()
467            .cloned()
468            .unwrap_or_else(RedactionPolicy::default)
469    })
470}
471
472/// RAII guard that pushes a policy on construction and pops it on drop.
473///
474/// ```ignore
475/// let _guard = harn_vm::redact::PolicyGuard::new(RedactionPolicy::default());
476/// // … emit receipts, transcripts, etc.
477/// ```
478pub struct PolicyGuard;
479
480impl PolicyGuard {
481    pub fn new(policy: RedactionPolicy) -> Self {
482        push_policy(policy);
483        Self
484    }
485}
486
487impl Drop for PolicyGuard {
488    fn drop(&mut self) {
489        pop_policy();
490    }
491}
492
// Unit tests for the redaction policy: header, URL, JSON, and free-form
// string redaction plus the thread-local policy stack.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Header fixture mixing credential-bearing headers (authorization,
    // cookie, token, signature) with headers that should stay readable.
    fn sample_headers() -> BTreeMap<String, String> {
        BTreeMap::from([
            ("Authorization".to_string(), "Bearer secret123".to_string()),
            ("Cookie".to_string(), "session=abc".to_string()),
            ("Content-Type".to_string(), "application/json".to_string()),
            ("X-Webhook-Token".to_string(), "tok-xyz".to_string()),
            (
                "X-Slack-Signature".to_string(),
                "v0=abcdef123456".to_string(),
            ),
            ("User-Agent".to_string(), "Harn/1.0".to_string()),
            ("X-GitHub-Delivery".to_string(), "delivery-123".to_string()),
        ])
    }

    // The default policy replaces credential headers and keeps the
    // safe-listed ones; header names keep their original casing.
    #[test]
    fn default_policy_redacts_auth_headers_and_keeps_safe_ones() {
        let policy = RedactionPolicy::default();
        let redacted = policy.redact_headers(&sample_headers());
        assert_eq!(
            redacted.get("Authorization").unwrap(),
            REDACTED_HEADER_VALUE
        );
        assert_eq!(redacted.get("Cookie").unwrap(), REDACTED_HEADER_VALUE);
        assert_eq!(
            redacted.get("X-Webhook-Token").unwrap(),
            REDACTED_HEADER_VALUE
        );
        assert_eq!(
            redacted.get("X-Slack-Signature").unwrap(),
            REDACTED_HEADER_VALUE
        );
        assert_eq!(redacted.get("User-Agent").unwrap(), "Harn/1.0");
        assert_eq!(redacted.get("X-GitHub-Delivery").unwrap(), "delivery-123");
        assert_eq!(redacted.get("Content-Type").unwrap(), "application/json");
    }

    // Only header redaction is exercised here: the passthrough policy
    // must leave the Authorization value intact.
    #[test]
    fn passthrough_policy_redacts_nothing() {
        let policy = RedactionPolicy::passthrough();
        let redacted = policy.redact_headers(&sample_headers());
        assert_eq!(redacted.get("Authorization").unwrap(), "Bearer secret123");
    }

    // A host safe-list entry un-redacts a token-shaped header, while a
    // host deny fragment overrides the built-in safe-list.
    #[test]
    fn host_can_extend_safe_and_deny_headers() {
        let policy = RedactionPolicy::default()
            .with_safe_header("X-Webhook-Token")
            .with_deny_header_substring("delivery");
        let redacted = policy.redact_headers(&sample_headers());
        assert_eq!(redacted.get("X-Webhook-Token").unwrap(), "tok-xyz");
        assert_eq!(
            redacted.get("X-GitHub-Delivery").unwrap(),
            REDACTED_HEADER_VALUE,
            "host explicitly forced delivery to be sensitive"
        );
    }

    // Userinfo is removed and sensitive query values are replaced; the
    // placeholder appears percent-encoded inside the query string.
    #[test]
    fn redact_url_strips_userinfo_and_sensitive_query_params() {
        let policy = RedactionPolicy::default();
        let redacted = policy.redact_url(
            "https://user:pw@api.example.com/v1?api_key=abcdef&clientSecret=hidden&page=2",
        );
        assert!(redacted.contains("api_key=%5Bredacted%5D"));
        assert!(redacted.contains("clientSecret=%5Bredacted%5D"));
        assert!(redacted.contains("page=2"));
        assert!(!redacted.contains("user:pw@"));
    }

    // A URL with nothing sensitive must come back byte-for-byte.
    #[test]
    fn redact_url_leaves_clean_urls_alone() {
        let policy = RedactionPolicy::default();
        let url = "https://api.example.com/v1?page=2";
        assert_eq!(policy.redact_url(url), url);
    }

    // Sensitive keys are redacted at every nesting level (objects inside
    // objects and inside arrays), regardless of key casing or
    // separators; free-form string values are scanned as well.
    #[test]
    fn redact_json_strips_sensitive_field_names_recursively() {
        let policy = RedactionPolicy::default();
        let mut value = json!({
            "headers": {
                "authorization": "Bearer abc",
                "X-Amz-Security-Token": "session",
                "x-trace-id": "trace_1",
            },
            "list": [
                { "auth_token": "tok_secret", "accessToken": "camel", "name": "alice" },
                { "name": "bob" },
            ],
            "clientSecret": "camel-secret",
            "free_form": "Bearer ghp_abcdefghijklmnopqrstuvwxyz0123456789ABCD",
            "url": "https://api.example.com/v1?api_key=hideme",
        });
        policy.redact_json_in_place(&mut value);
        assert_eq!(value["headers"]["authorization"], REDACTED_PLACEHOLDER);
        assert_eq!(
            value["headers"]["X-Amz-Security-Token"],
            REDACTED_PLACEHOLDER
        );
        assert_eq!(value["headers"]["x-trace-id"], "trace_1");
        assert_eq!(value["list"][0]["auth_token"], REDACTED_PLACEHOLDER);
        assert_eq!(value["list"][0]["accessToken"], REDACTED_PLACEHOLDER);
        assert_eq!(value["list"][0]["name"], "alice");
        assert_eq!(value["clientSecret"], REDACTED_PLACEHOLDER);
        let free_form = value["free_form"].as_str().unwrap();
        // Free-form pattern matches now produce the OA-06 named
        // placeholder `<redacted:<pattern>:<len>>` so audit logs can
        // attribute leaks to a specific provider.
        assert!(
            free_form.contains("<redacted:"),
            "expected named placeholder, got: {free_form}"
        );
        assert!(!free_form.contains("ghp_abcdefghijklmnopqrstuvwxyz0123456789ABCD"));
    }

    // The guard installs the policy for the enclosed scope only; the
    // default policy is active again once the guard is dropped.
    #[test]
    fn policy_guard_pushes_and_pops_thread_local() {
        clear_policy_stack();
        assert_eq!(current_policy(), RedactionPolicy::default());
        {
            let policy = RedactionPolicy::default().with_extra_field("custom_token");
            let _guard = PolicyGuard::new(policy.clone());
            assert_eq!(current_policy(), policy);
        }
        assert_eq!(current_policy(), RedactionPolicy::default());
    }

    // Secrets embedded in ordinary prose are replaced by a named
    // placeholder per provider pattern.
    #[test]
    fn redact_string_replaces_known_secret_patterns() {
        let policy = RedactionPolicy::default();
        let input =
            "use sk-proj-abcdefghijklmnopqrstuvwxyz0123456789ABCD or AKIAABCDEFGHIJKLMNOP for now";
        let out = policy.redact_string(input);
        // Each provider pattern emits its own `<redacted:<name>:<len>>`
        // placeholder so audit logs can attribute the leak.
        assert!(out.contains("<redacted:openai_key:"));
        assert!(out.contains("<redacted:aws_access_key:"));
        assert!(!out.contains("AKIAABCDEFGHIJKLMNOP"));
        assert!(!out.contains("sk-proj-abcdefghijklmnopqrstuvwxyz0123456789ABCD"));
    }
}