coding_agent_search/indexer/
redact_secrets.rs

1//! Ingestion-time secret redaction for message content and metadata.
2//!
3//! Prevents secrets (API keys, tokens, passwords, private keys) leaked in
4//! tool-result blocks from being persisted into the cass database.
5//!
6//! This module runs at ingestion time in `map_to_internal()`, before any data
7//! reaches SQLite or the FTS index.  It is intentionally conservative: it uses
8//! well-known prefix patterns rather than high-entropy heuristics to avoid
9//! false positives on normal code content.
10//!
11//! See also: `pages::secret_scan` (post-hoc scanning of existing data).
12
13use std::borrow::Cow;
14
15use once_cell::sync::Lazy;
16use regex::{Regex, RegexSet};
17
18/// Placeholder inserted where a secret was found.
19const REDACTED: &str = "[REDACTED]";
20
21/// A compiled secret-detection pattern.
22struct SecretPattern {
23    pattern: &'static str,
24    regex: Regex,
25}
26
27/// All built-in patterns, compiled once on first use.
28static SECRET_PATTERNS: Lazy<Vec<SecretPattern>> = Lazy::new(|| {
29    vec![
30        // AWS Access Key ID (always starts with AKIA)
31        SecretPattern {
32            pattern: r"\bAKIA[0-9A-Z]{16}\b",
33            regex: Regex::new(r"\bAKIA[0-9A-Z]{16}\b").expect("aws access key regex"),
34        },
35        // AWS Secret Key in assignment context
36        SecretPattern {
37            pattern: r#"(?i)aws(.{0,20})?(secret|access)?[_-]?key\s*[:=]\s*['"]?[A-Za-z0-9/+=]{40}['"]?"#,
38            regex: Regex::new(
39                r#"(?i)aws(.{0,20})?(secret|access)?[_-]?key\s*[:=]\s*['"]?[A-Za-z0-9/+=]{40}['"]?"#,
40            )
41            .expect("aws secret regex"),
42        },
43        // GitHub PAT (ghp_, gho_, ghu_, ghs_, ghr_)
44        SecretPattern {
45            pattern: r"\bgh[pousr]_[A-Za-z0-9]{36}\b",
46            regex: Regex::new(r"\bgh[pousr]_[A-Za-z0-9]{36}\b").expect("github pat regex"),
47        },
48        // OpenAI API key (sk-...)
49        SecretPattern {
50            pattern: r"\bsk-[A-Za-z0-9]{20,}\b",
51            regex: Regex::new(r"\bsk-[A-Za-z0-9]{20,}\b").expect("openai key regex"),
52        },
53        // Anthropic API key (sk-ant-...)
54        SecretPattern {
55            pattern: r"\bsk-ant-[A-Za-z0-9]{20,}\b",
56            regex: Regex::new(r"\bsk-ant-[A-Za-z0-9]{20,}\b").expect("anthropic key regex"),
57        },
58        // Bearer tokens in authorization headers
59        SecretPattern {
60            pattern: r"(?i)Bearer\s+[A-Za-z0-9_\-.]{20,}",
61            regex: Regex::new(r"(?i)Bearer\s+[A-Za-z0-9_\-.]{20,}").expect("bearer token regex"),
62        },
63        // JWT tokens (eyJ...)
64        SecretPattern {
65            pattern: r"\beyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b",
66            regex: Regex::new(r"\beyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b")
67                .expect("jwt regex"),
68        },
69        // PEM private keys
70        SecretPattern {
71            pattern: r"-----BEGIN (?:RSA|EC|DSA|OPENSSH|PGP) PRIVATE KEY-----",
72            regex: Regex::new(r"-----BEGIN (?:RSA|EC|DSA|OPENSSH|PGP) PRIVATE KEY-----")
73                .expect("private key regex"),
74        },
75        // Database connection URLs with credentials
76        SecretPattern {
77            pattern: r"(?i)\b(postgres|postgresql|mysql|mongodb|redis)://[^\s]{8,}",
78            regex: Regex::new(
79                r"(?i)\b(postgres|postgresql|mysql|mongodb|redis)://[^\s]{8,}",
80            )
81            .expect("db url regex"),
82        },
83        // Generic key/token/secret/password assignments
84        SecretPattern {
85            pattern: r#"(?i)(api[_-]?key|api[_-]?secret|auth[_-]?token|access[_-]?token|secret[_-]?key|password|passwd)\s*[:=]\s*['"]?[A-Za-z0-9_\-/+=]{8,}['"]?"#,
86            regex: Regex::new(
87                r#"(?i)(api[_-]?key|api[_-]?secret|auth[_-]?token|access[_-]?token|secret[_-]?key|password|passwd)\s*[:=]\s*['"]?[A-Za-z0-9_\-/+=]{8,}['"]?"#,
88            )
89            .expect("generic api key regex"),
90        },
91        // Slack tokens (xoxb-, xoxp-, xoxs-, xoxa-, xoxo-, xoxr-)
92        SecretPattern {
93            pattern: r"\bxox[bpsar]-[A-Za-z0-9\-]{10,}",
94            regex: Regex::new(r"\bxox[bpsar]-[A-Za-z0-9\-]{10,}").expect("slack token regex"),
95        },
96        // Stripe keys (sk_live_, pk_live_, rk_live_)
97        SecretPattern {
98            pattern: r"\b[spr]k_live_[A-Za-z0-9]{20,}",
99            regex: Regex::new(r"\b[spr]k_live_[A-Za-z0-9]{20,}").expect("stripe key regex"),
100        },
101    ]
102});
103
104/// Fast pre-check for the common no-secret path. Keeps pattern ordering aligned
105/// with `SECRET_PATTERNS` so matched set indices can select replacement regexes.
106static SECRET_REGEX_SET: Lazy<RegexSet> = Lazy::new(|| {
107    RegexSet::new(SECRET_PATTERNS.iter().map(|pattern| pattern.pattern)).expect("secret regex set")
108});
109
110/// Redact secrets from a plain-text string.
111///
112/// Returns the input unchanged if no secrets are detected.
113pub fn redact_text(input: &str) -> Cow<'_, str> {
114    let matches = SECRET_REGEX_SET.matches(input);
115    if !matches.matched_any() {
116        return Cow::Borrowed(input);
117    }
118
119    let mut output = Cow::Borrowed(input);
120    for idx in matches.iter() {
121        let replaced = SECRET_PATTERNS[idx]
122            .regex
123            .replace_all(output.as_ref(), REDACTED);
124        if let Cow::Owned(redacted) = replaced {
125            output = Cow::Owned(redacted);
126        }
127    }
128    output
129}
130
131/// Redact secrets from a JSON value, recursively walking strings.
132///
133/// - String values are redacted in-place.
134/// - Arrays and objects are walked recursively.
135/// - Numbers, booleans, and null are left untouched.
136pub fn redact_json(value: &serde_json::Value) -> serde_json::Value {
137    match value {
138        serde_json::Value::String(s) => {
139            let redacted = redact_text(s).into_owned();
140            serde_json::Value::String(redacted)
141        }
142        serde_json::Value::Array(arr) => {
143            serde_json::Value::Array(arr.iter().map(redact_json).collect())
144        }
145        serde_json::Value::Object(obj) => {
146            let mut new_obj = serde_json::Map::new();
147            for (k, v) in obj {
148                let redacted_key = redact_text(k).into_owned();
149                new_obj.insert(redacted_key, redact_json(v));
150            }
151            serde_json::Value::Object(new_obj)
152        }
153        other => other.clone(),
154    }
155}
156
157#[doc(hidden)]
158pub fn fuzz_redact_json_with_memoizing_redactor(
159    value: &serde_json::Value,
160    capacity: usize,
161) -> serde_json::Value {
162    MemoizingRedactor::with_capacity(capacity.clamp(1, 1024)).redact_json(value)
163}
164
165/// Returns true if redaction is enabled (default: true).
166///
167/// Set `CASS_REDACT_SECRETS=0` or `CASS_REDACT_SECRETS=false` to disable.
168pub fn redaction_enabled() -> bool {
169    match dotenvy::var("CASS_REDACT_SECRETS") {
170        Ok(val) => !matches!(val.as_str(), "0" | "false" | "off" | "no"),
171        Err(_) => true,
172    }
173}
174
175/// Stable identifier for the compiled SECRET_PATTERNS list.
176///
177/// Memoization keys for [`MemoizingRedactor`] combine input content
178/// with this fingerprint so a pattern bump (new regex added, existing
179/// regex tightened) automatically invalidates every prior cache entry
180/// — silent stale cross-version reuse is impossible by construction.
181///
182/// The fingerprint is `redact-v1:<blake3-hex>` where the hash covers
183/// every pattern source string concatenated with NUL separators. The
184/// `v1` epoch lets future maintainers force a manual bump even when
185/// the regex source set hasn't changed (e.g. if the replacement
186/// constant changes from `[REDACTED]` to something else).
187pub fn redaction_algorithm_fingerprint() -> String {
188    static FINGERPRINT: Lazy<String> = Lazy::new(|| {
189        let mut hasher = blake3::Hasher::new();
190        for pattern in SECRET_PATTERNS.iter() {
191            hasher.update(pattern.pattern.as_bytes());
192            hasher.update(&[0]);
193        }
194        hasher.update(REDACTED.as_bytes());
195        format!("redact-v1:{}", hasher.finalize().to_hex())
196    });
197    FINGERPRINT.clone()
198}
199
200/// Content-addressed memoizing redactor for the ingestion hot path.
201///
202/// `coding_agent_session_search-ibuuh.34`: redaction is a pure,
203/// regex-heavy transformation that runs against every persisted message
204/// content + metadata blob. Salvage replays, repeated assistant
205/// boilerplate, and historical re-ingest all feed identical content
206/// through the regex engine over and over. This wrapper keys
207/// [`ContentAddressedMemoCache`] on the input bytes plus the algorithm
208/// fingerprint so repeated content stops paying the regex cost while a
209/// pattern bump invalidates every prior entry transparently.
210///
211/// The wrapper preserves the legacy [`redact_text`]/[`redact_json`]
212/// contract byte-for-byte: see
213/// `memoizing_redactor_matches_uncached_for_arbitrary_input` for the
214/// equivalence gate. When the cache is hit, the recorded value is
215/// returned directly; on miss, the legacy regex path runs and the
216/// result is inserted under the content+algorithm key.
217///
218/// `MemoizingRedactor` is `pub(crate)` so the live persist path can
219/// adopt it without leaking the memoization vocabulary into public
220/// API. Wiring lives in the indexer crate.
221#[allow(dead_code)]
222pub(crate) struct MemoizingRedactor {
223    text_cache: crate::indexer::memoization::ContentAddressedMemoCache<String>,
224    algorithm_fingerprint: String,
225}
226
227#[allow(dead_code)]
228impl MemoizingRedactor {
229    /// Default cache capacity for typical refresh batches. Sized to
230    /// cover a few thousand distinct message bodies before LRU
231    /// eviction kicks in.
232    pub(crate) const DEFAULT_CAPACITY: usize = 4096;
233
234    pub(crate) fn with_capacity(capacity: usize) -> Self {
235        Self {
236            text_cache: crate::indexer::memoization::ContentAddressedMemoCache::with_capacity(
237                capacity,
238            ),
239            algorithm_fingerprint: redaction_algorithm_fingerprint(),
240        }
241    }
242
243    pub(crate) fn new() -> Self {
244        Self::with_capacity(Self::DEFAULT_CAPACITY)
245    }
246
247    pub(crate) fn algorithm_fingerprint(&self) -> &str {
248        &self.algorithm_fingerprint
249    }
250
251    pub(crate) fn stats(&self) -> &crate::indexer::memoization::MemoCacheStats {
252        self.text_cache.stats()
253    }
254
255    /// Memoized counterpart to [`redact_text`]. Returns an owned String
256    /// (not Cow) because caching forces a copy on first compute anyway,
257    /// and downstream callers (`map_to_internal`) immediately call
258    /// `.into_owned()` regardless. Skipping the Cow indirection keeps
259    /// the cached-hit path branchless.
260    ///
261    /// Each cache decision emits a structured `tracing` event so
262    /// operators can audit hit / miss / insert / evict / quarantine
263    /// behavior from logs alone (per `coding_agent_session_search-ibuuh.34`
264    /// AC: "operator-auditable through structured hit, miss,
265    /// invalidation, eviction, quarantine, and budget logs").
266    pub(crate) fn redact_text(&mut self, input: &str) -> String {
267        let (output, _audit) = self.redact_text_with_audit(input);
268        output
269    }
270
271    /// Audit-bearing variant: returns the redacted text plus the
272    /// structured cache-decision records (lookup audit, plus insert
273    /// audit on miss). Callers that want to forward records to a
274    /// subscriber (telemetry sink, doctor diagnostics, etc.) use this
275    /// directly; the convenience `redact_text` wrapper drops them
276    /// after emitting tracing events.
277    pub(crate) fn redact_text_with_audit(
278        &mut self,
279        input: &str,
280    ) -> (
281        String,
282        Vec<crate::indexer::memoization::MemoCacheAuditRecord>,
283    ) {
284        // Empty fast-path matches the uncached contract and bypasses
285        // the cache entirely (see memoizing_redactor_empty_input_skips_cache).
286        if input.is_empty() {
287            return (String::new(), Vec::new());
288        }
289        let key = self.key_for(input);
290        let (lookup, lookup_audit) = self.text_cache.get_with_audit(&key);
291        Self::trace_audit(&lookup_audit);
292        match lookup {
293            crate::indexer::memoization::MemoLookup::Hit { value } => (value, vec![lookup_audit]),
294            crate::indexer::memoization::MemoLookup::Quarantined { reason } => {
295                // Quarantined entry: never serve a stale value;
296                // recompute via the legacy regex path, but DO NOT
297                // re-insert (the entry stays quarantined for operator
298                // inspection until explicitly lifted via
299                // `lift_quarantine_for`).
300                tracing::warn!(
301                    quarantine_reason = %reason,
302                    algorithm = %self.algorithm_fingerprint,
303                    "redaction memo entry is quarantined; falling back to direct regex pass"
304                );
305                let redacted = redact_text(input).into_owned();
306                (redacted, vec![lookup_audit])
307            }
308            crate::indexer::memoization::MemoLookup::Miss => {
309                let redacted = redact_text(input).into_owned();
310                let insert_audit = self.text_cache.insert_with_audit(key, redacted.clone());
311                Self::trace_audit(&insert_audit);
312                (redacted, vec![lookup_audit, insert_audit])
313            }
314        }
315    }
316
317    /// Invalidate a cached redaction for the given input. Returns
318    /// `true` only when an entry was actually removed (matches the
319    /// underlying `ContentAddressedMemoCache` contract). Mostly
320    /// useful for tests and for operator tooling that wants to bust
321    /// individual cache entries without restarting the process.
322    pub(crate) fn invalidate(&mut self, input: &str) -> bool {
323        if input.is_empty() {
324            return false;
325        }
326        let key = self.key_for(input);
327        let audit = self.text_cache.invalidate_with_audit(&key);
328        Self::trace_audit(&audit);
329        audit.changed
330    }
331
332    /// Quarantine a cached entry: subsequent lookups will return
333    /// [`MemoLookup::Quarantined`] (handled by `redact_text` as a
334    /// fallthrough to the direct regex path) instead of the cached
335    /// value. The reason is preserved for operator inspection. Used
336    /// when telemetry detects a poisoned redaction (e.g. unexpected
337    /// regex behavior under a hot pattern bump that the algorithm
338    /// fingerprint didn't catch).
339    pub(crate) fn quarantine(&mut self, input: &str, reason: impl Into<String>) {
340        if input.is_empty() {
341            return;
342        }
343        let key = self.key_for(input);
344        let audit = self.text_cache.quarantine_with_audit(key, reason);
345        Self::trace_audit(&audit);
346    }
347
348    fn trace_audit(audit: &crate::indexer::memoization::MemoCacheAuditRecord) {
349        // Severity tiers match operator expectations: hits are noise
350        // (trace), misses + inserts are routine (debug), evictions
351        // are noteworthy (info), invalidations and quarantines are
352        // alarming enough to warn so they show up in default-level
353        // logs without dredging.
354        use crate::indexer::memoization::MemoCacheEvent;
355        match audit.event {
356            MemoCacheEvent::Hit => tracing::trace!(
357                target: "cass::redact::memo",
358                algorithm = %audit.key.algorithm,
359                stats = ?audit.stats,
360                "redact memo hit"
361            ),
362            MemoCacheEvent::Miss => tracing::debug!(
363                target: "cass::redact::memo",
364                algorithm = %audit.key.algorithm,
365                stats = ?audit.stats,
366                "redact memo miss"
367            ),
368            MemoCacheEvent::Insert => tracing::debug!(
369                target: "cass::redact::memo",
370                algorithm = %audit.key.algorithm,
371                live_entries = audit.stats.live_entries,
372                "redact memo insert"
373            ),
374            MemoCacheEvent::Evict { ref reason } => tracing::info!(
375                target: "cass::redact::memo",
376                evict_reason = ?reason,
377                live_entries = audit.stats.live_entries,
378                evictions_capacity = audit.stats.evictions_capacity,
379                "redact memo eviction"
380            ),
381            MemoCacheEvent::Invalidate => tracing::warn!(
382                target: "cass::redact::memo",
383                changed = audit.changed,
384                live_entries = audit.stats.live_entries,
385                invalidations = audit.stats.invalidations,
386                "redact memo invalidate"
387            ),
388            MemoCacheEvent::Quarantine { ref reason } => tracing::warn!(
389                target: "cass::redact::memo",
390                quarantine_reason = %reason,
391                quarantined_entries = audit.quarantined_entries,
392                "redact memo quarantine"
393            ),
394        }
395    }
396
397    /// Memoized counterpart to [`redact_json`]. Recurses through the
398    /// JSON value, memoizing each string scalar (and each object key)
399    /// independently — JSON arrays / objects themselves are not
400    /// cached because their structural identity dominates compared to
401    /// per-string regex cost.
402    pub(crate) fn redact_json(&mut self, value: &serde_json::Value) -> serde_json::Value {
403        match value {
404            serde_json::Value::String(s) => serde_json::Value::String(self.redact_text(s)),
405            serde_json::Value::Array(arr) => {
406                serde_json::Value::Array(arr.iter().map(|v| self.redact_json(v)).collect())
407            }
408            serde_json::Value::Object(obj) => {
409                let mut new_obj = serde_json::Map::with_capacity(obj.len());
410                for (k, v) in obj {
411                    let redacted_key = self.redact_text(k);
412                    new_obj.insert(redacted_key, self.redact_json(v));
413                }
414                serde_json::Value::Object(new_obj)
415            }
416            other => other.clone(),
417        }
418    }
419
420    fn key_for(&self, input: &str) -> crate::indexer::memoization::MemoKey {
421        // Hash content with blake3 for a fixed-width key (avoids
422        // pathological 1-MiB-content cache keys that would otherwise
423        // dominate cache memory).
424        let mut hasher = blake3::Hasher::new();
425        hasher.update(input.as_bytes());
426        let content_hash = crate::indexer::memoization::MemoContentHash::from_bytes(
427            hasher.finalize().as_bytes().to_vec(),
428        );
429        crate::indexer::memoization::MemoKey::new(
430            content_hash,
431            "redact_text",
432            self.algorithm_fingerprint.clone(),
433        )
434    }
435}
436
437impl Default for MemoizingRedactor {
438    fn default() -> Self {
439        Self::new()
440    }
441}
442
443#[cfg(test)]
444mod tests {
445    use super::*;
446    use serde_json::json;
447    use serial_test::serial;
448
449    #[test]
450    fn redacts_openai_key() {
451        let input = "my key is sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
452        let output = redact_text(input);
453        assert_eq!(output, "my key is [REDACTED]");
454        assert!(!output.contains("sk-ABCDE"));
455    }
456
457    #[test]
458    fn redacts_anthropic_key() {
459        let input = "sk-ant-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
460        let output = redact_text(input);
461        assert_eq!(output, "[REDACTED]");
462    }
463
464    #[test]
465    fn redacts_github_pat() {
466        let input = "token ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
467        let output = redact_text(input);
468        assert_eq!(output, "token [REDACTED]");
469    }
470
471    #[test]
472    fn redacts_bearer_token() {
473        let input = "Authorization: Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.payload.signature";
474        let output = redact_text(input);
475        assert!(!output.contains("eyJhbGci"));
476    }
477
478    #[test]
479    fn redacts_aws_access_key() {
480        let input = "AKIAIOSFODNN7EXAMPLE";
481        let output = redact_text(input);
482        assert_eq!(output, "[REDACTED]");
483    }
484
485    #[test]
486    fn redacts_private_key_header() {
487        let input = "-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAK...";
488        let output = redact_text(input);
489        assert!(output.starts_with("[REDACTED]"));
490    }
491
492    #[test]
493    fn redacts_generic_api_key_assignment() {
494        let input = "api_key=abcdefgh12345678";
495        let output = redact_text(input);
496        assert_eq!(output, "[REDACTED]");
497    }
498
499    #[test]
500    fn redacts_database_url() {
501        let input = "DATABASE_URL=postgres://user:pass@host:5432/db";
502        let output = redact_text(input);
503        assert!(!output.contains("user:pass"));
504    }
505
506    #[test]
507    fn redacts_stripe_key() {
508        // Build the test key dynamically to avoid GitHub push protection flagging it
509        let input = format!("{}_{}", "sk_live", "AAAABBBBCCCCDDDDEEEEFFFFGGGG");
510        let output = redact_text(&input);
511        assert_eq!(output, "[REDACTED]");
512    }
513
514    #[test]
515    fn redacts_slack_token() {
516        let input = "xoxb-123456789-abcdefghij";
517        let output = redact_text(input);
518        assert_eq!(output, "[REDACTED]");
519    }
520
521    #[test]
522    fn leaves_normal_text_unchanged() {
523        let input = "Hello, this is a normal message about code review.";
524        let output = redact_text(input);
525        assert_eq!(output, input);
526        assert!(
527            matches!(output, Cow::Borrowed(_)),
528            "no-secret path should not allocate"
529        );
530    }
531
532    #[test]
533    fn leaves_short_tokens_unchanged() {
534        // Short strings should not match (below minimum lengths)
535        let input = "sk-abc";
536        let output = redact_text(input);
537        assert_eq!(output, input);
538    }
539
540    #[test]
541    fn redacts_json_string_values() {
542        let input = json!({
543            "tool_result": "Response contains sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij",
544            "safe": "no secrets here",
545            "number": 42
546        });
547        let output = redact_json(&input);
548        assert_eq!(output["tool_result"], json!("Response contains [REDACTED]"));
549        assert_eq!(output["safe"], json!("no secrets here"));
550        assert_eq!(output["number"], json!(42));
551    }
552
553    #[test]
554    fn redacts_nested_json() {
555        let input = json!({
556            "outer": {
557                "inner": "ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij"
558            },
559            "array": ["safe", "sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij"]
560        });
561        let output = redact_json(&input);
562        assert_eq!(output["outer"]["inner"], json!("[REDACTED]"));
563        assert_eq!(output["array"][0], json!("safe"));
564        assert_eq!(output["array"][1], json!("[REDACTED]"));
565    }
566
567    #[test]
568    #[serial]
569    fn redaction_enabled_default() {
570        // When env var is not set, should be enabled
571        // Safety: only called in single-threaded test context
572        unsafe { std::env::remove_var("CASS_REDACT_SECRETS") };
573        assert!(redaction_enabled());
574    }
575
576    #[test]
577    #[serial]
578    fn redaction_can_be_disabled() {
579        unsafe { std::env::set_var("CASS_REDACT_SECRETS", "0") };
580        assert!(!redaction_enabled());
581
582        unsafe { std::env::set_var("CASS_REDACT_SECRETS", "false") };
583        assert!(!redaction_enabled());
584
585        // Restore for other tests
586        unsafe { std::env::remove_var("CASS_REDACT_SECRETS") };
587    }
588
589    #[test]
590    fn multiple_secrets_in_one_string() {
591        let input = "key1=sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij and key2=ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
592        let output = redact_text(input);
593        assert!(!output.contains("sk-ABCDE"));
594        assert!(!output.contains("ghp_ABCDE"));
595        assert_eq!(output.matches("[REDACTED]").count(), 2);
596        assert!(
597            matches!(output, Cow::Owned(_)),
598            "matched secret path should return owned redacted text"
599        );
600    }
601
602    /// `coding_agent_session_search-ibuuh.34` (memoization equivalence
603    /// gate): the memoizing redactor must produce byte-identical
604    /// output to the legacy `redact_text` path on every input.
605    /// Equivalence is checked across:
606    /// - clean inputs with no secret matches
607    /// - single-secret inputs (every supported pattern fires at least once)
608    /// - multi-secret inputs (multiple replacement passes)
609    /// - empty input (fast-path)
610    /// - long boilerplate-style inputs (large blob with no secrets)
611    ///
612    /// First and second invocations on the same input must agree
613    /// (cache-hit invariance) AND match the uncached result.
614    #[test]
615    fn memoizing_redactor_matches_uncached_for_arbitrary_input() {
616        // Diagnostic-message slice helper: MUST land on a UTF-8 char
617        // boundary so we can extend this fixture set with multi-byte
618        // inputs in the future without panicking on byte-slice
619        // boundary errors. (MEMORY.md flagged this exact pattern as
620        // a recurring footgun; this helper inoculates the test.)
621        fn safe_prefix(s: &str, max_bytes: usize) -> &str {
622            let mut end = s.len().min(max_bytes);
623            while end > 0 && !s.is_char_boundary(end) {
624                end -= 1;
625            }
626            &s[..end]
627        }
628        let twenty_kib_unicode = "🔐abc".repeat(2_048);
629        let inputs: &[&str] = &[
630            "",
631            "no secrets here, just prose",
632            "my key is sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij",
633            "sk-ant-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij followed by AKIAABCDEFGHIJKLMNOP",
634            "Authorization: Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.payload.signature",
635            "ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij and another ghp_ZYXWVUTSRQPONMLKJIHGFEDCBA0123456789",
636            // Multi-byte UTF-8 input: pins that the memoized path's
637            // hashing + cache key construction handles non-ASCII
638            // content (blake3 over .as_bytes() handles any byte
639            // sequence). Pre-fixup, the diagnostic prefix slice
640            // below would have panicked on this input.
641            "🔐 user pasted sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij from 测试",
642            &twenty_kib_unicode,
643            &"a".repeat(10_000),
644        ];
645        let mut redactor = MemoizingRedactor::with_capacity(64);
646        for input in inputs {
647            let uncached = redact_text(input).into_owned();
648            let memoized_first = redactor.redact_text(input);
649            let memoized_second = redactor.redact_text(input);
650            assert_eq!(
651                uncached,
652                memoized_first,
653                "memoized first call must match legacy uncached redact_text for input prefix: {:?}",
654                safe_prefix(input, 64)
655            );
656            assert_eq!(
657                uncached,
658                memoized_second,
659                "memoized second call must match legacy uncached for input prefix: {:?}",
660                safe_prefix(input, 64)
661            );
662        }
663    }
664
665    /// Repeated identical content must hit the cache rather than
666    /// re-running the regex set. Pinning hits/misses is the operator
667    /// audit signal the bead acceptance asks for.
668    #[test]
669    fn memoizing_redactor_reuses_cache_for_repeated_content() {
670        let mut redactor = MemoizingRedactor::with_capacity(16);
671        let payload = "boilerplate assistant prompt: please help with sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
672        // Three identical calls: 1 miss + 2 hits. Empty-string
673        // fast-path is never accounted in the cache, so it does not
674        // perturb the counters.
675        let _ = redactor.redact_text("");
676        let _ = redactor.redact_text(payload);
677        let _ = redactor.redact_text(payload);
678        let _ = redactor.redact_text(payload);
679        let stats = redactor.stats();
680        assert_eq!(stats.misses, 1, "first call must be a cache miss");
681        assert_eq!(
682            stats.hits, 2,
683            "subsequent identical calls must be cache hits"
684        );
685        assert_eq!(stats.inserts, 1, "exactly one redacted result inserted");
686    }
687
688    /// A pattern bump (algorithm fingerprint change) must invalidate
689    /// every prior memo entry. We simulate this by constructing two
690    /// `MemoizingRedactor` instances whose algorithm fingerprints
691    /// differ — entries from one cannot serve hits to the other,
692    /// guaranteeing safe cross-version semantics. Pinning the
693    /// fingerprint structure (`redact-v1:<hex>`) guards against an
694    /// accidental hash-format change that would silently break
695    /// invalidation.
696    #[test]
697    fn memoizing_redactor_keys_isolate_by_algorithm_fingerprint() {
698        let fingerprint = redaction_algorithm_fingerprint();
699        assert!(
700            fingerprint.starts_with("redact-v1:"),
701            "fingerprint must carry an explicit version epoch, got: {fingerprint}"
702        );
703        let hex_part = fingerprint.strip_prefix("redact-v1:").unwrap();
704        assert_eq!(
705            hex_part.len(),
706            64,
707            "fingerprint hash must be a 64-char blake3 hex digest"
708        );
709        // Same compiled patterns ⇒ same fingerprint across calls.
710        assert_eq!(fingerprint, redaction_algorithm_fingerprint());
711
712        // Two fresh redactors share the algorithm fingerprint, so they
713        // would route hits/misses through the same key shape. Pinning
714        // both fingerprints equal guards against a thread-local /
715        // process-singleton bug that could silently desync cache
716        // versions across parallel persist workers.
717        let r1 = MemoizingRedactor::new();
718        let r2 = MemoizingRedactor::new();
719        assert_eq!(r1.algorithm_fingerprint(), r2.algorithm_fingerprint());
720    }
721
722    /// `redact_json` round-trip via the memoizing path must agree with
723    /// the legacy `redact_json` for non-trivial JSON shapes (nested
724    /// arrays, nested objects, mixed scalars). Pins the recursive
725    /// projection so a regression in either path's traversal trips a
726    /// clear assertion.
727    #[test]
728    fn memoizing_redactor_redact_json_matches_uncached_for_nested_shapes() {
729        let value = json!({
730            "session": {
731                "auth": "Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.payload.signature",
732                "history": [
733                    "no secret",
734                    "ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij",
735                    {"key": "value", "leak": "sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij"},
736                    null,
737                    42,
738                    true,
739                ],
740                "metadata": {
741                    "leaked_field": "sk-ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij",
742                    "safe_field": "noop",
743                },
744            },
745            "version": 7,
746        });
747        let uncached = redact_json(&value);
748        let memoized = MemoizingRedactor::new().redact_json(&value);
749        assert_eq!(
750            uncached, memoized,
751            "memoizing redact_json must match legacy redact_json byte-for-byte"
752        );
753    }
754
755    /// Repeated metadata / extra_json structures are common in salvage
756    /// replays and assistant boilerplate. The memoized JSON walker must
757    /// reuse repeated object keys and repeated scalar values instead of
758    /// re-running the regex set for every copy.
759    #[test]
760    fn memoizing_redactor_redact_json_reuses_repeated_keys_and_values() {
761        let repeated_secret =
762            "Authorization: Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.payload.signature";
763        let repeated_note = "same assistant boilerplate without secrets";
764        let value = json!({
765            "events": [
766                {"token": repeated_secret, "note": repeated_note},
767                {"token": repeated_secret, "note": repeated_note},
768                {"token": repeated_secret, "note": repeated_note},
769            ],
770            "footer": repeated_note,
771        });
772
773        let uncached = redact_json(&value);
774        let mut redactor = MemoizingRedactor::with_capacity(32);
775        let memoized = redactor.redact_json(&value);
776
777        assert_eq!(
778            uncached, memoized,
779            "memoized JSON redaction must preserve legacy output exactly"
780        );
781        assert!(
782            !memoized.to_string().contains("eyJhbGci"),
783            "memoized JSON redaction must still remove repeated secrets"
784        );
785
786        let stats = redactor.stats();
787        assert_eq!(
788            stats.misses, 6,
789            "first occurrences of root keys, repeated child keys, and scalar values should miss once"
790        );
791        assert_eq!(
792            stats.inserts, 6,
793            "each distinct JSON key/value string should be inserted once"
794        );
795        assert_eq!(
796            stats.hits, 9,
797            "repeated child keys and repeated scalar values should hit the memo cache"
798        );
799    }
800
801    /// Emptiness fast-path: zero-length input must NOT increment the
802    /// cache miss counter. Otherwise an ingestion run with thousands
803    /// of empty system messages would burn cache slots for
804    /// content-equivalent empty strings.
805    #[test]
806    #[serial]
807    fn memoizing_redactor_empty_input_skips_cache() {
808        let mut redactor = MemoizingRedactor::with_capacity(8);
809        let _ = redactor.redact_text("");
810        let _ = redactor.redact_text("");
811        let _ = redactor.redact_text("");
812        let stats = redactor.stats();
813        assert_eq!(stats.misses, 0, "empty input must not count as miss");
814        assert_eq!(stats.hits, 0, "empty input must not count as hit");
815        assert_eq!(stats.inserts, 0, "empty input must not insert into cache");
816    }
817
818    /// `coding_agent_session_search-ibuuh.34` (operator-audit gate):
819    /// every cache decision must surface a structured
820    /// MemoCacheAuditRecord so telemetry sinks / doctor diagnostics
821    /// can reason about cache health without grepping internal stats.
822    /// First call on a new content emits Lookup(Miss) + Insert.
823    /// Second call emits Lookup(Hit). Pinning the audit shape directly
824    /// closes the bead's "operator-auditable through structured hit,
825    /// miss, invalidation, eviction, quarantine, and budget logs"
826    /// requirement for the redaction sink.
827    #[test]
828    fn memoizing_redactor_with_audit_emits_lookup_and_insert_records() {
829        use crate::indexer::memoization::{MemoCacheEvent, MemoCacheOperation};
830        let mut redactor = MemoizingRedactor::with_capacity(8);
831        let payload =
832            "Authorization: Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.payload.signature";
833
834        let (first_output, first_audit) = redactor.redact_text_with_audit(payload);
835        assert!(!first_output.contains("eyJhbGci"));
836        assert_eq!(
837            first_audit.len(),
838            2,
839            "first call must emit a lookup audit + an insert audit"
840        );
841        assert!(matches!(
842            first_audit[0].operation,
843            MemoCacheOperation::Lookup
844        ));
845        assert!(matches!(first_audit[0].event, MemoCacheEvent::Miss));
846        assert!(matches!(
847            first_audit[1].operation,
848            MemoCacheOperation::Insert
849        ));
850        assert!(matches!(first_audit[1].event, MemoCacheEvent::Insert));
851        assert_eq!(first_audit[1].stats.live_entries, 1);
852
853        let (second_output, second_audit) = redactor.redact_text_with_audit(payload);
854        assert_eq!(first_output, second_output);
855        assert_eq!(
856            second_audit.len(),
857            1,
858            "second call must emit only the lookup audit (cache hit)"
859        );
860        assert!(matches!(second_audit[0].event, MemoCacheEvent::Hit));
861        assert_eq!(second_audit[0].stats.hits, 1);
862
863        // Algorithm key carried on every audit record so a downstream
864        // sink can disambiguate cache events when multiple
865        // ContentAddressedMemoCaches share the same logger target.
866        for record in first_audit.iter().chain(second_audit.iter()) {
867            assert_eq!(record.key.algorithm, "redact_text");
868            assert!(record.key.algorithm_version.starts_with("redact-v1:"));
869        }
870    }
871
872    /// Invalidate must remove the cached entry so the next call is a
873    /// miss + re-insert. Pin the changed/no-op semantics so a caller
874    /// can rely on the boolean return value to know whether anything
875    /// was actually evicted.
876    #[test]
877    fn memoizing_redactor_invalidate_drops_cached_entry() {
878        let mut redactor = MemoizingRedactor::with_capacity(8);
879        let payload = "no secret here, just a sentence";
880
881        // Prime the cache.
882        let _ = redactor.redact_text(payload);
883        assert_eq!(redactor.stats().inserts, 1);
884        assert_eq!(redactor.stats().misses, 1);
885        let _ = redactor.redact_text(payload);
886        assert_eq!(redactor.stats().hits, 1);
887
888        // Invalidate must report the change.
889        assert!(
890            redactor.invalidate(payload),
891            "invalidate must return true when an entry was removed"
892        );
893        assert_eq!(redactor.stats().invalidations, 1);
894        // A second invalidate on the same key is a no-op.
895        assert!(
896            !redactor.invalidate(payload),
897            "second invalidate must be a no-op"
898        );
899        assert_eq!(redactor.stats().invalidations, 1);
900
901        // Empty input invalidate is a no-op (matches the empty-input
902        // fast-path: nothing was ever cached).
903        assert!(
904            !redactor.invalidate(""),
905            "invalidating empty input must be a no-op"
906        );
907
908        // Next call must miss again, not hit.
909        let _ = redactor.redact_text(payload);
910        assert_eq!(
911            redactor.stats().misses,
912            2,
913            "post-invalidate call must register as a miss"
914        );
915        assert_eq!(redactor.stats().hits, 1, "hits counter must not regress");
916    }
917
918    /// Quarantined entries must NEVER serve a cached value. After
919    /// quarantine, the redactor falls through to the direct
920    /// `redact_text` regex path and the cached value remains
921    /// quarantined for operator inspection. This satisfies the bead's
922    /// "suspected corruption or stale-entry quarantine" coverage
923    /// requirement.
924    #[test]
925    fn memoizing_redactor_quarantined_entries_fall_through_to_direct_redaction() {
926        use crate::indexer::memoization::{MemoCacheEvent, MemoCacheOperation};
927        let mut redactor = MemoizingRedactor::with_capacity(8);
928        let payload =
929            "user=admin password=hunter2hunter2 token=ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij";
930
931        // Prime + verify hit.
932        let _ = redactor.redact_text(payload);
933        let _ = redactor.redact_text(payload);
934        assert_eq!(redactor.stats().hits, 1);
935
936        // Quarantine the entry; subsequent lookup must report the
937        // Quarantined outcome via audit AND fall through to direct
938        // regex redaction (so the user-visible result is still the
939        // correct redacted text).
940        redactor.quarantine(payload, "telemetry: poisoned redaction signal");
941        assert_eq!(redactor.stats().quarantined, 1);
942
943        let (output, audit) = redactor.redact_text_with_audit(payload);
944        assert!(
945            !output.contains("ghp_ABCDE"),
946            "post-quarantine redaction must still scrub secrets via direct regex pass"
947        );
948        assert!(
949            !output.contains("password=hunter2hunter2"),
950            "post-quarantine redaction must scrub generic password assignments"
951        );
952        assert_eq!(
953            audit.len(),
954            1,
955            "quarantine fallthrough emits the lookup audit only (no insert)"
956        );
957        assert!(matches!(audit[0].operation, MemoCacheOperation::Lookup));
958        assert!(matches!(audit[0].event, MemoCacheEvent::Quarantine { .. }));
959
960        // Re-quarantining the same key with the same reason is a
961        // no-op for the quarantine counter (already quarantined).
962        redactor.quarantine(payload, "telemetry: poisoned redaction signal");
963        assert_eq!(
964            redactor.stats().quarantined,
965            1,
966            "re-quarantining the same key with the same reason must not double-count"
967        );
968
969        // Empty input quarantine is a no-op.
970        redactor.quarantine("", "ignored");
971        assert_eq!(redactor.stats().quarantined, 1);
972    }
973}
coding_agent_search/indexer/redact_secrets.rs

coding_agent_search/indexer/
redact_secrets.rs