Skip to main content

harn_vm/redact/
mod.rs

1//! Unified redaction policy for persisted and rendered operational data.
2//!
3//! Harn writes transcripts, receipts, event logs, portal JSON, connector
4//! status snapshots, and workflow artifacts. Each of those surfaces was
5//! previously responsible for its own ad-hoc scrubbing of HTTP headers,
6//! URL query parameters, JSON tokens, and free-form strings. This module
7//! is the single source of truth for "what is sensitive" so the same
8//! representative secret cannot leak through two surfaces by accident.
9//!
10//! # Categories
11//!
12//! - **Auth headers, cookies, signature/proxy tokens** — covered by
13//!   [`RedactionPolicy::redact_headers`].
14//! - **URLs with credentials in userinfo or sensitive query parameters**
15//!   — covered by [`RedactionPolicy::redact_url`].
16//! - **JSON fields whose name is auth/credential-shaped** — covered by
17//!   [`RedactionPolicy::redact_json_in_place`].
18//! - **Free-form strings carrying high-confidence secret patterns**
19//!   (Stripe `sk_live_…`, GitHub `ghp_…`, AWS `AKIA…`, Bearer tokens,
20//!   `-----BEGIN … PRIVATE KEY-----`) — covered by
21//!   [`RedactionPolicy::redact_string`] and applied recursively by
22//!   [`RedactionPolicy::redact_json_in_place`].
23//!
24//! # Host configuration
25//!
26//! Hosts compose policies via the builder methods (`with_safe_header`,
27//! `with_extra_field`, `with_extra_url_param`, `disable_string_scan`).
28//! Active policies are pushed onto a thread-local stack the same way
29//! approval policies are, so a single orchestrator startup site can
30//! install host overrides for every persistence path that calls
31//! [`current_policy`].
32
33mod patterns;
34
35use std::borrow::Cow;
36use std::cell::RefCell;
37use std::collections::{BTreeMap, BTreeSet};
38
39use serde_json::Value as JsonValue;
40use url::Url;
41
42pub use patterns::{
43    clear_audit_ring, clear_custom_patterns, custom_pattern_names, default_pattern_names,
44    drain_audit_ring, install_audit_sink, register_custom_pattern, scan_secret_patterns, AuditSink,
45    NamedPattern, RedactionEvent, TOKEN_REDACTION_AUDIT_TOPIC, TOKEN_REDACTION_DIAGNOSTIC,
46};
47
/// The one literal written wherever a redacted value would otherwise
/// appear. Sharing a single constant lets portal CSS, downstream
/// parsers, and people grepping logs all match the same form.
pub const REDACTED_PLACEHOLDER: &str = "[redacted]";

/// Replacement value for redacted HTTP headers. Currently an alias of
/// [`REDACTED_PLACEHOLDER`]; it stays a separate symbol so the
/// trigger/event tests written before the unified module keep reading
/// naturally.
pub const REDACTED_HEADER_VALUE: &str = REDACTED_PLACEHOLDER;
57
/// Describes which headers, JSON fields, URL parts, and free-form
/// strings are treated as sensitive.
///
/// Every name or fragment stored here is kept in ASCII lower case; the
/// lookup methods lower-case their input before comparing.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RedactionPolicy {
    /// Exact header names whose values are kept as-is.
    safe_headers: BTreeSet<String>,
    /// Built-in fragments that mark a header as sensitive unless the
    /// header is on the safe-list.
    deny_header_substrings: BTreeSet<String>,
    /// Host-supplied fragments that mark a header as sensitive even when
    /// the safe-list would otherwise keep it.
    extra_deny_header_substrings: BTreeSet<String>,
    /// Host-supplied JSON field names redacted in addition to the
    /// built-in list.
    extra_field_names: BTreeSet<String>,
    /// Host-supplied URL query parameter names redacted in addition to
    /// the built-in list.
    extra_url_params: BTreeSet<String>,
    /// Whether free-form strings are scanned at all.
    scan_strings: bool,
    /// Whether credentials in URL userinfo are stripped.
    redact_url_userinfo: bool,
}
68
69impl Default for RedactionPolicy {
70    fn default() -> Self {
71        Self {
72            safe_headers: default_safe_headers(),
73            deny_header_substrings: default_deny_header_substrings(),
74            extra_deny_header_substrings: BTreeSet::new(),
75            extra_field_names: BTreeSet::new(),
76            extra_url_params: BTreeSet::new(),
77            scan_strings: true,
78            redact_url_userinfo: true,
79        }
80    }
81}
82
83impl RedactionPolicy {
84    /// Permissive policy used by tests that need raw data. No headers,
85    /// fields, or strings are scrubbed.
86    pub fn passthrough() -> Self {
87        Self {
88            safe_headers: BTreeSet::new(),
89            deny_header_substrings: BTreeSet::new(),
90            extra_deny_header_substrings: BTreeSet::new(),
91            extra_field_names: BTreeSet::new(),
92            extra_url_params: BTreeSet::new(),
93            scan_strings: false,
94            redact_url_userinfo: false,
95        }
96    }
97
98    /// Add a header (case-insensitive) to the safe-list. Header
99    /// redaction will leave its value untouched even if the name would
100    /// otherwise look auth-shaped (e.g. an `x-…-key` header that is
101    /// actually a request-id).
102    pub fn with_safe_header(mut self, name: impl Into<String>) -> Self {
103        self.safe_headers.insert(name.into().to_ascii_lowercase());
104        self
105    }
106
107    /// Add a substring (case-insensitive) that always forces a header
108    /// to be treated as sensitive. Useful for product-specific token
109    /// header names that the default `cookie`/`authorization`/`token`/`secret`/`key`
110    /// substring set would miss.
111    pub fn with_deny_header_substring(mut self, fragment: impl Into<String>) -> Self {
112        self.extra_deny_header_substrings
113            .insert(fragment.into().to_ascii_lowercase());
114        self
115    }
116
117    /// Add a JSON field name (case-insensitive, exact match) that should
118    /// always be redacted regardless of value contents. Useful when a
119    /// host knows it stores `internal_audit_token` or similar.
120    pub fn with_extra_field(mut self, name: impl Into<String>) -> Self {
121        self.extra_field_names
122            .insert(name.into().to_ascii_lowercase());
123        self
124    }
125
126    /// Add an extra URL query parameter name to redact.
127    pub fn with_extra_url_param(mut self, name: impl Into<String>) -> Self {
128        self.extra_url_params
129            .insert(name.into().to_ascii_lowercase());
130        self
131    }
132
133    /// Disable the heuristic free-form string scanner. The scanner adds
134    /// a small but non-zero cost to every JSON payload walk; turn it off
135    /// for performance-critical paths that have already been audited.
136    pub fn disable_string_scan(mut self) -> Self {
137        self.scan_strings = false;
138        self
139    }
140
141    fn header_is_safe(&self, lower_name: &str) -> bool {
142        // Exact-name allowlist is one source of truth in `safe_headers`;
143        // suffix/substring rules below cover the families of debugging
144        // headers that providers emit with arbitrary suffixes.
145        if self.safe_headers.contains(lower_name) {
146            return true;
147        }
148        lower_name.ends_with("-event")
149            || lower_name.ends_with("-delivery")
150            || lower_name.contains("timestamp")
151            || lower_name.contains("request-id")
152    }
153
154    /// Whether a given HTTP header name should have its value replaced
155    /// with [`REDACTED_HEADER_VALUE`].
156    ///
157    /// Host-explicit deny substrings always win, even over the built-in
158    /// safe-list — that is how a host says "treat my own webhook
159    /// delivery header as sensitive even though Harn would normally
160    /// keep it for debugging."
161    pub fn header_is_sensitive(&self, name: &str) -> bool {
162        let lower = name.to_ascii_lowercase();
163        if self
164            .extra_deny_header_substrings
165            .iter()
166            .any(|fragment| lower.contains(fragment))
167        {
168            return true;
169        }
170        if self.header_is_safe(&lower) {
171            return false;
172        }
173        self.deny_header_substrings
174            .iter()
175            .any(|fragment| lower.contains(fragment))
176    }
177
178    /// Whether a JSON object field name should be replaced with the
179    /// redacted placeholder before the value is even inspected.
180    pub fn field_is_sensitive(&self, name: &str) -> bool {
181        let lower = name.to_ascii_lowercase();
182        if self.extra_field_names.contains(&lower) {
183            return true;
184        }
185        is_default_sensitive_field(&lower)
186    }
187
188    /// Whether a URL query parameter name should have its value
189    /// replaced.
190    pub fn url_param_is_sensitive(&self, name: &str) -> bool {
191        let lower = name.to_ascii_lowercase();
192        if self.extra_url_params.contains(&lower) {
193            return true;
194        }
195        is_default_sensitive_url_param(&lower)
196    }
197
198    /// Returns a [`BTreeMap`] of headers with sensitive values replaced
199    /// by [`REDACTED_HEADER_VALUE`].
200    pub fn redact_headers(&self, headers: &BTreeMap<String, String>) -> BTreeMap<String, String> {
201        headers
202            .iter()
203            .map(|(name, value)| {
204                if self.header_is_sensitive(name) {
205                    (name.clone(), REDACTED_HEADER_VALUE.to_string())
206                } else {
207                    (name.clone(), value.clone())
208                }
209            })
210            .collect()
211    }
212
213    /// Redact sensitive query parameters and credentials in URL
214    /// userinfo. Returns the input unchanged if nothing matches or the
215    /// URL fails to parse.
216    pub fn redact_url(&self, url: &str) -> String {
217        let Ok(mut parsed) = Url::parse(url) else {
218            return self.redact_string(url).into_owned();
219        };
220        let mut changed = false;
221
222        if self.redact_url_userinfo
223            && (!parsed.username().is_empty() || parsed.password().is_some())
224        {
225            // url::Url returns Err only when the URL cannot have a
226            // password (e.g. cannot-be-a-base). Treat that as a no-op.
227            if parsed.set_username("").is_ok() {
228                changed = true;
229            }
230            if parsed.set_password(None).is_ok() {
231                changed = true;
232            }
233        }
234
235        let pairs: Vec<(String, String)> = parsed
236            .query_pairs()
237            .map(|(key, value)| {
238                if self.url_param_is_sensitive(&key) {
239                    changed = true;
240                    (key.into_owned(), REDACTED_PLACEHOLDER.to_string())
241                } else {
242                    (key.into_owned(), value.into_owned())
243                }
244            })
245            .collect();
246        let original_query = parsed.query().map(str::to_string);
247        if !pairs.is_empty() {
248            parsed.set_query(None);
249            let mut query = parsed.query_pairs_mut();
250            for (key, value) in &pairs {
251                query.append_pair(key, value);
252            }
253        }
254        // `query_pairs_mut` always re-encodes; restore the original
255        // query string when nothing was actually redacted so we don't
256        // perturb otherwise stable URLs.
257        if !changed {
258            parsed.set_query(original_query.as_deref());
259            return parsed.to_string();
260        }
261        parsed.to_string()
262    }
263
264    /// Returns a redacted string. Cheap (`Cow::Borrowed`) when nothing
265    /// matched. Applies, in order: URL-shaped string detection (so the
266    /// userinfo or sensitive query params on `https://user:pw@…?api_key=…`
267    /// are scrubbed), then high-confidence secret pattern replacement.
268    pub fn redact_string<'a>(&self, value: &'a str) -> Cow<'a, str> {
269        if !self.scan_strings {
270            return Cow::Borrowed(value);
271        }
272        match self.redact_url_in_string(value) {
273            Cow::Borrowed(_) => scan_secret_patterns(value, REDACTED_PLACEHOLDER),
274            Cow::Owned(url_scrubbed) => {
275                let pattern_scrubbed =
276                    scan_secret_patterns(&url_scrubbed, REDACTED_PLACEHOLDER).into_owned();
277                Cow::Owned(pattern_scrubbed)
278            }
279        }
280    }
281
282    /// If `value` is a single URL with credentials or sensitive query
283    /// params, return the redacted form. Standalone URLs are common in
284    /// logged request envelopes; we don't try to walk arbitrary text
285    /// for embedded URLs because that turns into ad-hoc tokenization.
286    fn redact_url_in_string<'a>(&self, value: &'a str) -> Cow<'a, str> {
287        if !self.redact_url_userinfo
288            || !(value.starts_with("http://") || value.starts_with("https://"))
289        {
290            return Cow::Borrowed(value);
291        }
292        let trimmed = value.trim();
293        if trimmed.contains(char::is_whitespace) {
294            return Cow::Borrowed(value);
295        }
296        let redacted = self.redact_url(trimmed);
297        if redacted == trimmed {
298            Cow::Borrowed(value)
299        } else {
300            Cow::Owned(redacted)
301        }
302    }
303
304    /// Recursively walk a JSON value, redacting sensitive object fields
305    /// and string contents in place.
306    pub fn redact_json_in_place(&self, value: &mut JsonValue) {
307        match value {
308            JsonValue::Object(map) => {
309                let mut keys_to_redact: Vec<String> = Vec::new();
310                for (key, child) in map.iter_mut() {
311                    if self.field_is_sensitive(key) {
312                        keys_to_redact.push(key.clone());
313                    } else {
314                        self.redact_json_in_place(child);
315                    }
316                }
317                for key in keys_to_redact {
318                    map.insert(key, JsonValue::String(REDACTED_PLACEHOLDER.to_string()));
319                }
320            }
321            JsonValue::Array(items) => {
322                for item in items.iter_mut() {
323                    self.redact_json_in_place(item);
324                }
325            }
326            JsonValue::String(s) => {
327                let redacted = self.redact_string(s);
328                if let Cow::Owned(replacement) = redacted {
329                    *s = replacement;
330                }
331            }
332            _ => {}
333        }
334    }
335
336    /// Convenience for callers that have an immutable JSON value: clone
337    /// once and redact.
338    pub fn redact_json(&self, value: &JsonValue) -> JsonValue {
339        let mut clone = value.clone();
340        self.redact_json_in_place(&mut clone);
341        clone
342    }
343}
344
/// Built-in exact-match safe-list: lower-case header names whose values
/// the default policy keeps verbatim.
fn default_safe_headers() -> BTreeSet<String> {
    const SAFE_HEADER_NAMES: &[&str] = &[
        "content-length",
        "content-type",
        "request-id",
        "user-agent",
        "x-a2a-delivery",
        "x-a2a-signature",
        "x-correlation-id",
        "x-github-delivery",
        "x-github-event",
        "x-github-hook-id",
        "x-hub-signature-256",
        "x-linear-signature",
        "x-notion-signature",
        "x-request-id",
        "x-slack-request-timestamp",
        "x-slack-signature",
    ];
    SAFE_HEADER_NAMES
        .iter()
        .map(|name| (*name).to_string())
        .collect()
}
365
/// Built-in lower-case fragments; a header whose name contains any of
/// them is treated as sensitive by the default policy unless it is on
/// the safe-list.
fn default_deny_header_substrings() -> BTreeSet<String> {
    const DENY_FRAGMENTS: &[&str] = &["authorization", "cookie", "secret", "token", "key"];
    DENY_FRAGMENTS
        .iter()
        .map(|fragment| (*fragment).to_string())
        .collect()
}
375
/// Built-in check for URL query parameter names. `lower` must already
/// be lower-cased. A name matches when it equals one of the listed
/// names exactly or ends in `_token`, `_secret`, or `_password`.
fn is_default_sensitive_url_param(lower: &str) -> bool {
    const EXACT_NAMES: &[&str] = &[
        "api_key",
        "apikey",
        "access_token",
        "refresh_token",
        "id_token",
        "client_secret",
        "password",
        "secret",
        "token",
        "auth",
        "bearer",
        "sig",
        "signature",
    ];
    const SUFFIXES: &[&str] = &["_token", "_secret", "_password"];

    if EXACT_NAMES.iter().any(|name| *name == lower) {
        return true;
    }
    SUFFIXES.iter().any(|suffix| lower.ends_with(*suffix))
}
396
/// Built-in check for JSON field names. `lower` must already be
/// lower-cased. A name matches when it equals one of the listed names
/// exactly or ends in `_token`, `_secret`, `_password`, `_apikey`, or
/// `_api_key`.
fn is_default_sensitive_field(lower: &str) -> bool {
    const EXACT_NAMES: &[&str] = &[
        "authorization",
        "proxy-authorization",
        "cookie",
        "set-cookie",
        "api_key",
        "apikey",
        "api-key",
        "x-amz-security-token",
        "x-api-key",
        "x-auth-token",
        "x-csrf-token",
        "x-xsrf-token",
        "access_token",
        "refresh_token",
        "id_token",
        "bearer_token",
        "client_secret",
        "secret",
        "password",
        "passwd",
        "private_key",
        "session_token",
    ];
    const SUFFIXES: &[&str] = &["_token", "_secret", "_password", "_apikey", "_api_key"];

    if EXACT_NAMES.iter().any(|name| *name == lower) {
        return true;
    }
    SUFFIXES.iter().any(|suffix| lower.ends_with(*suffix))
}
428
429thread_local! {
430    static REDACTION_POLICY_STACK: RefCell<Vec<RedactionPolicy>> = const { RefCell::new(Vec::new()) };
431}
432
433/// Push a policy onto the thread-local stack. Pair every push with a
434/// [`pop_policy`] call (or use [`PolicyGuard`]).
435pub fn push_policy(policy: RedactionPolicy) {
436    REDACTION_POLICY_STACK.with(|stack| stack.borrow_mut().push(policy));
437}
438
439/// Pop the most recently pushed policy. Safe to call when the stack is
440/// empty.
441pub fn pop_policy() {
442    REDACTION_POLICY_STACK.with(|stack| {
443        stack.borrow_mut().pop();
444    });
445}
446
447/// Drop all installed policies, custom token-redaction patterns, the
448/// audit sink, and the per-thread audit ring. Used by
449/// `reset_thread_local_state` so test runs that share a thread cannot
450/// leak policy overrides into each other.
451pub fn clear_policy_stack() {
452    REDACTION_POLICY_STACK.with(|stack| stack.borrow_mut().clear());
453    patterns::clear_custom_patterns();
454    let _ = patterns::install_audit_sink(None);
455    patterns::clear_audit_ring();
456}
457
458/// Return the currently installed policy, falling back to
459/// [`RedactionPolicy::default`] when the stack is empty. Always returns
460/// an owned clone so callers can drop the borrow before recursing.
461pub fn current_policy() -> RedactionPolicy {
462    REDACTION_POLICY_STACK.with(|stack| {
463        stack
464            .borrow()
465            .last()
466            .cloned()
467            .unwrap_or_else(RedactionPolicy::default)
468    })
469}
470
471/// RAII guard that pushes a policy on construction and pops it on drop.
472///
473/// ```ignore
474/// let _guard = harn_vm::redact::PolicyGuard::new(RedactionPolicy::default());
475/// // … emit receipts, transcripts, etc.
476/// ```
477pub struct PolicyGuard;
478
479impl PolicyGuard {
480    pub fn new(policy: RedactionPolicy) -> Self {
481        push_policy(policy);
482        Self
483    }
484}
485
486impl Drop for PolicyGuard {
487    fn drop(&mut self) {
488        pop_policy();
489    }
490}
491
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Header fixture mixing credential-bearing headers with ones that
    /// are safe to keep for debugging.
    fn sample_headers() -> BTreeMap<String, String> {
        [
            ("Authorization", "Bearer secret123"),
            ("Cookie", "session=abc"),
            ("Content-Type", "application/json"),
            ("X-Webhook-Token", "tok-xyz"),
            ("User-Agent", "Harn/1.0"),
            ("X-GitHub-Delivery", "delivery-123"),
        ]
        .iter()
        .map(|(name, value)| (name.to_string(), value.to_string()))
        .collect()
    }

    #[test]
    fn default_policy_redacts_auth_headers_and_keeps_safe_ones() {
        let scrubbed = RedactionPolicy::default().redact_headers(&sample_headers());

        // Credential-bearing headers are replaced.
        assert_eq!(
            scrubbed.get("Authorization").unwrap(),
            REDACTED_HEADER_VALUE
        );
        assert_eq!(scrubbed.get("Cookie").unwrap(), REDACTED_HEADER_VALUE);
        assert_eq!(
            scrubbed.get("X-Webhook-Token").unwrap(),
            REDACTED_HEADER_VALUE
        );

        // Debugging headers survive untouched.
        assert_eq!(scrubbed.get("User-Agent").unwrap(), "Harn/1.0");
        assert_eq!(scrubbed.get("X-GitHub-Delivery").unwrap(), "delivery-123");
        assert_eq!(scrubbed.get("Content-Type").unwrap(), "application/json");
    }

    #[test]
    fn passthrough_policy_redacts_nothing() {
        let scrubbed = RedactionPolicy::passthrough().redact_headers(&sample_headers());
        assert_eq!(scrubbed.get("Authorization").unwrap(), "Bearer secret123");
    }

    #[test]
    fn host_can_extend_safe_and_deny_headers() {
        let customised = RedactionPolicy::default()
            .with_safe_header("X-Webhook-Token")
            .with_deny_header_substring("delivery");
        let scrubbed = customised.redact_headers(&sample_headers());

        assert_eq!(scrubbed.get("X-Webhook-Token").unwrap(), "tok-xyz");
        assert_eq!(
            scrubbed.get("X-GitHub-Delivery").unwrap(),
            REDACTED_HEADER_VALUE,
            "host explicitly forced delivery to be sensitive"
        );
    }

    #[test]
    fn redact_url_strips_userinfo_and_sensitive_query_params() {
        let input = "https://user:pw@api.example.com/v1?api_key=abcdef&page=2";
        let scrubbed = RedactionPolicy::default().redact_url(input);

        assert!(scrubbed.contains("api_key=%5Bredacted%5D"));
        assert!(scrubbed.contains("page=2"));
        assert!(!scrubbed.contains("user:pw@"));
    }

    #[test]
    fn redact_url_leaves_clean_urls_alone() {
        let clean = "https://api.example.com/v1?page=2";
        assert_eq!(RedactionPolicy::default().redact_url(clean), clean);
    }

    #[test]
    fn redact_json_strips_sensitive_field_names_recursively() {
        let mut doc = json!({
            "headers": {
                "authorization": "Bearer abc",
                "X-Amz-Security-Token": "session",
                "x-trace-id": "trace_1",
            },
            "list": [
                { "auth_token": "tok_secret", "name": "alice" },
                { "name": "bob" },
            ],
            "free_form": "Bearer ghp_abcdefghijklmnopqrstuvwxyz0123456789ABCD",
            "url": "https://api.example.com/v1?api_key=hideme",
        });
        RedactionPolicy::default().redact_json_in_place(&mut doc);

        let headers = &doc["headers"];
        assert_eq!(headers["authorization"], REDACTED_PLACEHOLDER);
        assert_eq!(headers["X-Amz-Security-Token"], REDACTED_PLACEHOLDER);
        assert_eq!(headers["x-trace-id"], "trace_1");

        let first_entry = &doc["list"][0];
        assert_eq!(first_entry["auth_token"], REDACTED_PLACEHOLDER);
        assert_eq!(first_entry["name"], "alice");

        // Free-form pattern hits are rendered as the OA-06 named
        // placeholder `<redacted:<pattern>:<len>>`, which lets audit
        // logs attribute a leak to a specific provider.
        let free_form = doc["free_form"].as_str().unwrap();
        assert!(
            free_form.contains("<redacted:"),
            "expected named placeholder, got: {free_form}"
        );
        assert!(!free_form.contains("ghp_abcdefghijklmnopqrstuvwxyz0123456789ABCD"));
    }

    #[test]
    fn policy_guard_pushes_and_pops_thread_local() {
        clear_policy_stack();
        assert_eq!(current_policy(), RedactionPolicy::default());

        let custom = RedactionPolicy::default().with_extra_field("custom_token");
        let guard = PolicyGuard::new(custom.clone());
        assert_eq!(current_policy(), custom);
        drop(guard);

        assert_eq!(current_policy(), RedactionPolicy::default());
    }

    #[test]
    fn redact_string_replaces_known_secret_patterns() {
        let text =
            "use sk-proj-abcdefghijklmnopqrstuvwxyz0123456789ABCD or AKIAABCDEFGHIJKLMNOP for now";
        let scrubbed = RedactionPolicy::default().redact_string(text);

        // Every provider pattern yields its own
        // `<redacted:<name>:<len>>` placeholder for audit attribution.
        assert!(scrubbed.contains("<redacted:openai_key:"));
        assert!(scrubbed.contains("<redacted:aws_access_key:"));
        assert!(!scrubbed.contains("AKIAABCDEFGHIJKLMNOP"));
        assert!(!scrubbed.contains("sk-proj-abcdefghijklmnopqrstuvwxyz0123456789ABCD"));
    }
}
625}