Skip to main content

harn_vm/redact/
patterns.rs

1//! Free-form string secret patterns reused for redaction.
2//!
3//! Each pattern is named so the replacement placeholder is
4//! `<redacted:<pattern_name>:<len>>` and audit events can attribute the
5//! redaction to a specific provider. The shared
6//! [`crate::secret_patterns`] catalog is also used by the
7//! `secret_scan` builtin, so a string that scanning reports is also a
8//! string that redaction scrubs.
9//!
10//! # Custom patterns
11//!
12//! Hosts and scripts can register additional named patterns through
13//! [`register_custom_pattern`]. Custom patterns live on a thread-local
14//! stack so test pollution stays contained and so a per-orchestrator
15//! override can be installed alongside the existing
16//! [`crate::redact::PolicyGuard`].
17//!
18//! # Audit
19//!
20//! Every redaction synchronously records a [`RedactionEvent`] in a
21//! per-thread ring drainable via [`drain_audit_ring`], and also fires
22//! an optional [`AuditSink`] callback. The default sink installed by
23//! the [`crate::stdlib::token_redaction`] stdlib forwards events to
24//! the live events pipeline and, on a multi-threaded Tokio runtime,
25//! to the `audit.token_redaction` event-log topic. Audit entries
26//! carry the diagnostic identifier `HARN-OAU-001` from the OA-06
27//! epic — they never include the raw token.
28
29use std::borrow::Cow;
30use std::cell::RefCell;
31use std::collections::BTreeMap;
32use std::sync::LazyLock;
33
34use regex::Regex;
35
36use crate::secret_patterns::DEFAULT_SECRET_PATTERN_SPECS;
37
/// Stable identifier emitted in audit logs for every token-redaction
/// event. Part of the OA-06 epic's compliance contract.
pub const TOKEN_REDACTION_DIAGNOSTIC: &str = "HARN-OAU-001";

/// Event-log topic used for token-redaction audit events.
pub const TOKEN_REDACTION_AUDIT_TOPIC: &str = "audit.token_redaction";

/// Upper bound on input length scanned by the secret detector. Inputs
/// above this size short-circuit to "no redaction" so a pathological
/// caller cannot trigger catastrophic regex behavior on the persistence
/// hot path. Persisted JSON payloads larger than this are already
/// abnormal; the receipt/event-log layers already cap message sizes
/// well below this in practice.
///
/// The comparison in `scan_secret_patterns` is strictly "greater
/// than", so an input of exactly this many bytes is still scanned.
///
/// NOTE(review): the cap is fail-open — an oversized input is returned
/// unchanged, i.e. with any secrets it contains still present. Callers
/// that cannot tolerate that must bound their input before scanning.
const MAX_SCAN_INPUT_BYTES: usize = 256 * 1024;
52
/// One redaction pattern with a stable display name.
///
/// Cloning copies the `'static` name reference and clones the compiled
/// regex.
#[derive(Clone)]
pub struct NamedPattern {
    /// Short, snake_case identifier (e.g. `"github_pat_fine"`).
    /// Stable across versions — emitted in audit events and in the
    /// `<redacted:name:len>` placeholder.
    pub name: &'static str,
    /// Compiled regex. Default-catalog patterns are expected to be
    /// anchored on `\b` or non-word boundaries so they do not chew
    /// unrelated identifiers (TODO confirm against the catalog);
    /// custom patterns are whatever source the caller registered and
    /// carry no such guarantee.
    pub regex: Regex,
}
64
65impl std::fmt::Debug for NamedPattern {
66    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
67        f.debug_struct("NamedPattern")
68            .field("name", &self.name)
69            .field("regex", &self.regex.as_str())
70            .finish()
71    }
72}
73
74/// Default token patterns shipped with Harn. Order matters only for
75/// audit attribution when multiple patterns would match the same
76/// substring — earlier patterns win.
77pub static DEFAULT_PATTERNS: LazyLock<Vec<NamedPattern>> = LazyLock::new(|| {
78    DEFAULT_SECRET_PATTERN_SPECS
79        .iter()
80        .map(|spec| NamedPattern {
81            name: spec.redaction_name,
82            regex: Regex::new(spec.regex).unwrap_or_else(|error| {
83                panic!("invalid {} secret regex: {error}", spec.redaction_name)
84            }),
85        })
86        .collect()
87});
88
thread_local! {
    /// Custom token patterns installed by stdlib callers. Stored on a
    /// per-thread stack the same way [`crate::redact::PolicyGuard`]
    /// stores active policies; `reset_thread_local_state` clears them.
    /// Appended to by [`register_custom_pattern`] and emptied by
    /// [`clear_custom_patterns`]; scanned after the default catalog.
    static CUSTOM_PATTERNS: RefCell<Vec<NamedPattern>> = const { RefCell::new(Vec::new()) };

    /// Callback that receives one entry per pattern that matched.
    /// Set by callers that want to audit redactions
    /// (`stdlib::token_redaction` installs a default sink that
    /// forwards to the event log when a runtime is available).
    /// `None` means "no extra audit collection on this thread".
    /// Every redaction also lands in [`AUDIT_RING`] regardless of
    /// whether a sink is installed.
    static AUDIT_SINK: RefCell<Option<AuditSink>> = const { RefCell::new(None) };

    /// Authoritative per-thread audit ring. Populated whenever a scan
    /// actually rewrote its input, so the synchronous compliance
    /// contract holds in every execution context (sync host calls,
    /// single-threaded LocalSet, multi-thread runtime). Drained by
    /// stdlib via [`drain_audit_ring`].
    ///
    /// Bounded: `emit_audit` keeps at most 1024 entries and evicts the
    /// oldest first, so events can be lost if the ring is never drained.
    static AUDIT_RING: RefCell<Vec<RedactionEvent>> = const { RefCell::new(Vec::new()) };
}
111
/// Per-redaction event passed to an installed [`AuditSink`] and stored
/// in the per-thread audit ring. One event describes all matches of a
/// single pattern within one `scan_secret_patterns` call; it carries
/// counts only, never the matched text.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RedactionEvent {
    /// Name of the [`NamedPattern`] that matched.
    pub pattern_name: String,
    /// Number of matches that pattern produced in the text it scanned.
    pub match_count: usize,
    /// Total bytes redacted across all matches of this pattern.
    pub bytes_redacted: usize,
}
120
/// Thread-local callback invoked once per pattern that matched during a
/// single `scan_secret_patterns` call.
///
/// The sink is reference-counted with `Rc`, so it is neither `Send`
/// nor `Sync` and stays on the thread that installed it.
pub type AuditSink = std::rc::Rc<dyn Fn(&RedactionEvent)>;
124
125/// Register a custom named pattern on the calling thread. Returns an
126/// error if the regex fails to compile. The pattern is appended after
127/// the default catalog, so default patterns still win when multiple
128/// would match the same substring.
129pub fn register_custom_pattern(name: impl Into<String>, regex_source: &str) -> Result<(), String> {
130    let regex = Regex::new(regex_source).map_err(|error| format!("invalid regex: {error}"))?;
131    // Leak the name to `'static` so the pattern's name field stays
132    // borrow-free and serialization can carry the same lifetime as
133    // the default catalog. Custom patterns are rare and never freed
134    // — the leak is bounded by the number of distinct user-supplied
135    // names per process.
136    let name_static: &'static str = Box::leak(name.into().into_boxed_str());
137    CUSTOM_PATTERNS.with(|cell| {
138        cell.borrow_mut().push(NamedPattern {
139            name: name_static,
140            regex,
141        });
142    });
143    Ok(())
144}
145
146/// Drop all custom patterns installed via [`register_custom_pattern`]
147/// on the calling thread. Idempotent.
148pub fn clear_custom_patterns() {
149    CUSTOM_PATTERNS.with(|cell| cell.borrow_mut().clear());
150}
151
152/// Return the names of every default pattern, in catalog order.
153pub fn default_pattern_names() -> Vec<&'static str> {
154    DEFAULT_PATTERNS.iter().map(|p| p.name).collect()
155}
156
157/// Return the names of every custom pattern currently installed on the
158/// calling thread.
159pub fn custom_pattern_names() -> Vec<String> {
160    CUSTOM_PATTERNS.with(|cell| cell.borrow().iter().map(|p| p.name.to_string()).collect())
161}
162
163/// Install a per-thread audit sink. The previous sink (if any) is
164/// returned so callers can chain or restore.
165pub fn install_audit_sink(sink: Option<AuditSink>) -> Option<AuditSink> {
166    AUDIT_SINK.with(|cell| std::mem::replace(&mut *cell.borrow_mut(), sink))
167}
168
169fn emit_audit(events: &[RedactionEvent]) {
170    if events.is_empty() {
171        return;
172    }
173    // Always push to the per-thread ring so a synchronous
174    // `drain_audit_ring` call returns every event recorded since
175    // the last drain, regardless of whether an extra sink is
176    // installed on this thread.
177    AUDIT_RING.with(|ring| {
178        let mut ring = ring.borrow_mut();
179        for event in events {
180            // Bounded cap: 1024 entries is well above any realistic
181            // per-step audit pressure but small enough to be a
182            // no-op for normal workloads and to keep a runaway
183            // sink from OOMing the process.
184            if ring.len() >= 1024 {
185                ring.remove(0);
186            }
187            ring.push(event.clone());
188        }
189    });
190    let sink = AUDIT_SINK.with(|cell| cell.borrow().clone());
191    if let Some(sink) = sink {
192        for event in events {
193            sink(event);
194        }
195    }
196}
197
198/// Drain every audit event recorded on the calling thread since the
199/// last drain. The returned vec is in the order events fired.
200pub fn drain_audit_ring() -> Vec<RedactionEvent> {
201    AUDIT_RING.with(|ring| std::mem::take(&mut *ring.borrow_mut()))
202}
203
204/// Clear the per-thread audit ring without returning its contents.
205/// Used by `clear_policy_stack` so tests sharing a thread cannot
206/// leak audit events into each other.
207pub fn clear_audit_ring() {
208    AUDIT_RING.with(|ring| ring.borrow_mut().clear());
209}
210
/// Render the canonical `<redacted:<name>:<len>>` placeholder for one
/// match. `<len>` is the length of `matched` in UTF-8 bytes, not in
/// characters.
fn replacement_for(name: &str, matched: &str) -> String {
    let byte_len = matched.len();
    format!("<redacted:{name}:{byte_len}>")
}
217
218/// Replace any high-confidence secret matches in `input` with the
219/// canonical `<redacted:<pattern_name>:<len>>` placeholder. Returns
220/// `Cow::Borrowed` when nothing matched, so callers paying for a clone
221/// only pay when there was real work.
222///
223/// The legacy `placeholder` argument is kept for callers that want a
224/// flat `[redacted]` form (e.g. headers and URL params). When the
225/// placeholder is the canonical `[redacted]` constant the named form
226/// is used; any other placeholder is substituted verbatim so callers
227/// that need a specific marker (URL-param escaping, etc.) still get
228/// it byte-for-byte.
229pub fn scan_secret_patterns<'a>(input: &'a str, placeholder: &str) -> Cow<'a, str> {
230    if input.is_empty() {
231        return Cow::Borrowed(input);
232    }
233    // Length cap is defense-in-depth against catastrophic regex
234    // behavior. None of the default patterns have nested
235    // quantifiers, but custom patterns can be arbitrary so the cap
236    // keeps a malicious script from blocking the persistence path.
237    if input.len() > MAX_SCAN_INPUT_BYTES {
238        return Cow::Borrowed(input);
239    }
240    let use_named_placeholder = placeholder == crate::redact::REDACTED_PLACEHOLDER;
241
242    let mut owned: Option<String> = None;
243    let mut audit_events: BTreeMap<&'static str, RedactionEvent> = BTreeMap::new();
244
245    // Drive defaults then custom patterns. We collect custom
246    // patterns into a Vec so the closure does not borrow the
247    // thread-local across the regex calls.
248    let custom: Vec<NamedPattern> = CUSTOM_PATTERNS.with(|cell| cell.borrow().clone());
249    let all_patterns = DEFAULT_PATTERNS.iter().chain(custom.iter());
250
251    for pattern in all_patterns {
252        let target: &str = owned.as_deref().unwrap_or(input);
253        let matches: Vec<(usize, usize)> = pattern
254            .regex
255            .find_iter(target)
256            .map(|m| (m.start(), m.end()))
257            .collect();
258        if matches.is_empty() {
259            continue;
260        }
261        let total_bytes: usize = matches.iter().map(|(s, e)| e - s).sum();
262        audit_events.insert(
263            pattern.name,
264            RedactionEvent {
265                pattern_name: pattern.name.to_string(),
266                match_count: matches.len(),
267                bytes_redacted: total_bytes,
268            },
269        );
270
271        // Walk matches in reverse so we can splice without
272        // recomputing offsets after each cut.
273        let mut buffer = target.to_string();
274        for (start, end) in matches.into_iter().rev() {
275            let matched_slice = &buffer[start..end];
276            let replacement = if use_named_placeholder {
277                replacement_for(pattern.name, matched_slice)
278            } else {
279                placeholder.to_string()
280            };
281            buffer.replace_range(start..end, &replacement);
282        }
283        owned = Some(buffer);
284    }
285
286    let result = match owned {
287        Some(value) if value == input => Cow::Borrowed(input),
288        Some(value) => Cow::Owned(value),
289        None => Cow::Borrowed(input),
290    };
291
292    if matches!(result, Cow::Owned(_)) {
293        let events: Vec<RedactionEvent> = audit_events.into_values().collect();
294        emit_audit(&events);
295    }
296
297    result
298}
299
// Unit tests for the pattern catalog, custom-pattern registration,
// placeholder rendering, and the per-thread audit ring/sink.
#[cfg(test)]
mod tests {
    use super::*;

    /// Reset every piece of thread-local state this module owns so a
    /// test starts with no custom patterns, no sink, and an empty ring.
    fn run_clean() {
        clear_custom_patterns();
        install_audit_sink(None);
        clear_audit_ring();
    }

    // Input with no secret-shaped text must come back borrowed.
    #[test]
    fn returns_borrowed_when_clean() {
        run_clean();
        let out = scan_secret_patterns("just plain text", crate::redact::REDACTED_PLACEHOLDER);
        assert!(matches!(out, Cow::Borrowed(_)));
    }

    // With the canonical placeholder, each match is rendered in the
    // named `<redacted:name:len>` form, with `len` the match's byte length.
    #[test]
    fn replaces_aws_and_github_tokens_with_named_placeholder() {
        run_clean();
        let input = "AKIAABCDEFGHIJKLMNOP and ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let out = scan_secret_patterns(input, crate::redact::REDACTED_PLACEHOLDER);
        let rendered = out.into_owned();
        assert!(rendered.contains("<redacted:aws_access_key:20>"));
        assert!(rendered.contains("<redacted:github_token:40>"));
        assert!(!rendered.contains("AKIAABCDEFGHIJKLMNOP"));
    }

    #[test]
    fn legacy_placeholder_path_still_works_for_url_param_values() {
        run_clean();
        let input = "AKIAABCDEFGHIJKLMNOP";
        // A non-`[redacted]` placeholder is used verbatim — this is
        // the URL-param escaping path.
        let out = scan_secret_patterns(input, "%5Bredacted%5D");
        assert!(out.contains("%5Bredacted%5D"));
        assert!(!out.contains("AKIAABCDEFGHIJKLMNOP"));
    }

    // The bearer token is scrubbed while unrelated trailing text survives.
    #[test]
    fn replaces_bearer_token_inside_text() {
        run_clean();
        let input = "header: Authorization: Bearer abcDEFghi123_-+/=xyz tail";
        let out = scan_secret_patterns(input, crate::redact::REDACTED_PLACEHOLDER);
        assert!(out.contains("<redacted:bearer_token:"));
        assert!(!out.contains("abcDEFghi123_-+/=xyz"));
        assert!(out.contains("tail"));
    }

    // A three-segment `eyJ….eyJ….…` token is redacted as `jwt`.
    #[test]
    fn replaces_jwt_tokens() {
        run_clean();
        let input = "token=eyJabcd.eyJefgh.signature_pad here";
        let out = scan_secret_patterns(input, crate::redact::REDACTED_PLACEHOLDER);
        assert!(out.contains("<redacted:jwt:"));
        assert!(!out.contains("eyJabcd.eyJefgh.signature_pad"));
    }

    // A multi-line PEM-style block is redacted including its body.
    #[test]
    fn replaces_private_key_blocks() {
        run_clean();
        let input =
            "-----BEGIN OPENSSH PRIVATE KEY-----\nsecret-material\n-----END OPENSSH PRIVATE KEY-----";
        let out = scan_secret_patterns(input, crate::redact::REDACTED_PLACEHOLDER);
        assert!(out.contains("<redacted:private_key_block:"));
        assert!(!out.contains("secret-material"));
    }

    // Registered custom patterns are listed, applied by the scanner,
    // and removed again by `clear_custom_patterns`.
    #[test]
    fn custom_pattern_redacts_and_is_introspectable() {
        run_clean();
        register_custom_pattern("acme_token", r"\bACME-[A-Z0-9]{8}\b").unwrap();
        assert_eq!(custom_pattern_names(), vec!["acme_token".to_string()]);
        let out = scan_secret_patterns(
            "header ACME-12345678 trailer",
            crate::redact::REDACTED_PLACEHOLDER,
        );
        assert!(
            out.contains("<redacted:acme_token:13>"),
            "expected acme_token redaction, got: {out}"
        );
        clear_custom_patterns();
        assert!(custom_pattern_names().is_empty());
    }

    // Two AWS keys and one GitHub token produce two events (one per
    // pattern), delivered both to the sink and to the ring.
    #[test]
    fn audit_sink_receives_one_event_per_matching_pattern() {
        use std::cell::RefCell;
        use std::rc::Rc;
        run_clean();
        let captured: Rc<RefCell<Vec<RedactionEvent>>> = Rc::new(RefCell::new(Vec::new()));
        let sink_captured = captured.clone();
        install_audit_sink(Some(Rc::new(move |event| {
            sink_captured.borrow_mut().push(event.clone());
        })));
        let input =
            "AKIAABCDEFGHIJKLMNOP AKIA0000000000000000 ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let out = scan_secret_patterns(input, crate::redact::REDACTED_PLACEHOLDER);
        assert!(matches!(out, Cow::Owned(_)));
        let events = captured.borrow();
        assert_eq!(events.len(), 2);
        let by_name: BTreeMap<&str, &RedactionEvent> = events
            .iter()
            .map(|event| (event.pattern_name.as_str(), event))
            .collect();
        assert_eq!(by_name.get("aws_access_key").unwrap().match_count, 2);
        assert_eq!(by_name.get("github_token").unwrap().match_count, 1);
        // The synchronous ring captures the same events so a
        // compliance drain returns them regardless of which sink
        // (if any) is installed.
        drop(events);
        install_audit_sink(None);
        let ring = drain_audit_ring();
        assert_eq!(ring.len(), 2);
    }

    // The ring is written even when no sink has been installed.
    #[test]
    fn audit_ring_records_events_even_without_a_sink() {
        run_clean();
        let _ = scan_secret_patterns("AKIAABCDEFGHIJKLMNOP", crate::redact::REDACTED_PLACEHOLDER);
        let ring = drain_audit_ring();
        assert_eq!(ring.len(), 1);
        assert_eq!(ring[0].pattern_name, "aws_access_key");
        // Drain is destructive.
        assert!(drain_audit_ring().is_empty());
    }

    // Pins the size cap's fail-open behaviour: input just over
    // `MAX_SCAN_INPUT_BYTES` is returned borrowed even though it is
    // made entirely of AWS-key-shaped text.
    #[test]
    fn input_above_cap_is_passthrough() {
        run_clean();
        let huge = "AKIAABCDEFGHIJKLMNOP".repeat(MAX_SCAN_INPUT_BYTES / 20 + 1);
        let out = scan_secret_patterns(&huge, crate::redact::REDACTED_PLACEHOLDER);
        assert!(matches!(out, Cow::Borrowed(_)));
    }

    // Guards the pattern names that audit consumers and placeholders
    // rely on; only presence is checked, not order.
    #[test]
    fn default_pattern_names_are_stable() {
        let names = default_pattern_names();
        assert!(names.contains(&"jwt"));
        assert!(names.contains(&"github_token"));
        assert!(names.contains(&"github_pat_fine"));
        assert!(names.contains(&"slack_token"));
        assert!(names.contains(&"aws_access_key"));
        assert!(names.contains(&"private_key_block"));
        assert!(names.contains(&"bearer_token"));
    }
}