// spool-memory 0.1.0
//
// Local-first developer memory system — persistent, structured knowledge for AI coding tools
// Documentation
//! Heuristic secrets redaction.
//!
//! ## Threat model
//! Hooks (post-tool-use, stop, pre-compact) capture raw conversation
//! / tool-call text into the distill queue. That text occasionally
//! contains:
//! - API keys (`sk-…`, `pk-…`, `xai-…`, `xoxb-…`)
//! - Bearer / JWT tokens (`Bearer ey…`, `eyJ…` standalone)
//! - GitHub tokens (`ghp_…`, `gho_…`, `ghu_…`, `ghr_…`, `ghs_…`)
//! - Long opaque base64 / hex blobs that *might* be tokens
//!
//! We deliberately use simple regex heuristics — NOT a perfect parser.
//! The goal is "strip the obvious 90%, surface a flag for the rest".
//! The flag is consumed by the Stop hook to decide whether the signal
//! is safe to write or should be dropped entirely.
//!
//! ## What this module does NOT do
//! - It does NOT promise zero-secret output. Format-specific tokens
//!   (custom envs, internal API formats) will slip through.
//! - It does NOT ship the redacted payload anywhere — the caller
//!   decides whether to record / drop / send to sampling.
//! - It does NOT redact in place; we always return a new String so
//!   callers can keep the original for audit / forensic logs.
//!
//! ## Output
//! [`redact`] returns a [`RedactReport`] with:
//! - `redacted` — the cleaned text
//! - `hits` — list of [`RedactHit`] entries (`kind`, `count`), one for
//!   each pattern that matched. The Stop hook uses this to decide:
//!   - 0 hits → write payload as-is
//!   - 1+ hits → write the redacted version + record `redacted_kinds`
//!     in metadata; or drop entirely if a strict policy is enabled
//!     (R4 may add this).

use once_cell::sync::Lazy;
use regex::Regex;

const REPLACEMENT: &str = "[REDACTED]";

/// Outcome of a redaction pass: the scrubbed text plus a tally of
/// which patterns fired.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RedactReport {
    /// The text after redaction.
    pub redacted: String,
    /// Recorded hits; see [`RedactHit`].
    pub hits: Vec<RedactHit>,
}

impl RedactReport {
    /// Returns `true` when no hit was recorded.
    pub fn is_clean(&self) -> bool {
        self.hits.is_empty()
    }

    /// Unique secret kinds present in `hits`, in ascending
    /// lexicographic order so the output is deterministic.
    pub fn kinds(&self) -> Vec<String> {
        // Sort and dedup borrowed labels first; only the survivors are
        // turned into owned strings.
        let mut labels: Vec<&str> = self.hits.iter().map(|hit| hit.kind.as_str()).collect();
        labels.sort_unstable();
        labels.dedup();
        labels.into_iter().map(str::to_owned).collect()
    }
}

/// Tally for a single pattern that matched during [`redact`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RedactHit {
    /// Name of the pattern that matched (e.g. `"github_token"`).
    pub kind: String,
    /// Number of matches that pattern found in the text as it stood
    /// when the pattern was applied.
    pub count: usize,
}

/// One detection rule: a secret-kind label paired with the compiled
/// regex that recognises it.
struct Pattern {
    /// Label copied into [`RedactHit::kind`] when this rule matches.
    kind: &'static str,
    /// Compiled matcher; each matched span is replaced wholesale.
    regex: Regex,
}

// Patterns are intentionally narrow to limit false positives. We
// require obvious anchors (a leading word boundary, prefix tokens,
// length floors) so generic alphanumeric blobs don't accidentally
// match.
//
// Order matters: `redact` applies the patterns one after another to
// the progressively redacted text, so when two patterns can match the
// same token the earlier entry claims it. Specific prefixes must come
// before the generic ones they overlap with.
static PATTERNS: Lazy<Vec<Pattern>> = Lazy::new(|| {
    // Every expression below is a fixed literal, so a compile failure
    // is a programming error rather than a runtime condition.
    let rule = |kind: &'static str, expr: &str| Pattern {
        kind,
        regex: Regex::new(expr).expect("built-in redaction regex must compile"),
    };

    vec![
        // Must precede `openai_api_key`: every `sk-ant-…` token also
        // satisfies the generic `sk-…` expression, which would
        // otherwise consume it first and report the wrong kind.
        rule("anthropic_api_key", r"\bsk-ant-[A-Za-z0-9_\-]{20,}"),
        // sk-… followed by at least 20 alphanumerics / `-` / `_`;
        // matches both the legacy and the newer `sk-proj-…` format.
        // The leading `\b` keeps ordinary hyphenated words that merely
        // contain "sk-" (`risk-…`, `task-…`) from being clipped.
        rule("openai_api_key", r"\bsk-(?:proj-)?[A-Za-z0-9_\-]{20,}"),
        rule("openai_pk_key", r"\bpk-[A-Za-z0-9_\-]{20,}"),
        rule("github_token", r"\bgh[pousr]_[A-Za-z0-9]{30,}"),
        rule("slack_bot_token", r"\bxox[baprs]-[A-Za-z0-9-]{10,}"),
        rule("google_api_key", r"\bAIza[0-9A-Za-z_\-]{35}"),
        rule("aws_access_key_id", r"\bAKIA[0-9A-Z]{16}\b"),
        // Authorization: Bearer <token> — the match covers the literal
        // "Bearer" word, the whitespace and the token, so the whole
        // span collapses to the replacement marker while the header
        // name in front of it stays readable.
        rule("bearer_token", r"(?i)Bearer\s+[A-Za-z0-9_\-\.=]{20,}"),
        // Standalone JWT: header.payload.sig — three base64url
        // segments separated by `.`, each ≥10 chars to dodge most
        // version strings.
        rule(
            "jwt",
            r"\beyJ[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}\b",
        ),
    ]
});

/// Redact known secret patterns from `text`. Returns the cleaned text
/// alongside per-pattern hit counts.
pub fn redact(text: &str) -> RedactReport {
    let mut current = text.to_string();
    let mut hits = Vec::new();
    for pattern in PATTERNS.iter() {
        let count = pattern.regex.find_iter(&current).count();
        if count == 0 {
            continue;
        }
        current = pattern
            .regex
            .replace_all(&current, REPLACEMENT)
            .into_owned();
        hits.push(RedactHit {
            kind: pattern.kind.to_string(),
            count,
        });
    }
    RedactReport {
        redacted: current,
        hits,
    }
}

// Unit tests for `redact` and `RedactReport`. Wherever the input is
// delimited by whitespace or `=`, the tests pin the exact redacted
// output so a partially-matched secret (a leaked head or tail) fails
// the test instead of hiding behind a "contains the marker" check.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn redact_returns_unchanged_text_when_clean() {
        let report = redact("hello world");
        assert_eq!(report.redacted, "hello world");
        assert!(report.is_clean());
        assert!(report.kinds().is_empty());
    }

    #[test]
    fn redact_strips_openai_key() {
        let raw = "use sk-abc1234567890DEFGHIJ for the call";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("use {} for the call", REPLACEMENT));
        assert_eq!(report.kinds(), vec!["openai_api_key"]);
        assert_eq!(report.hits[0].count, 1);
    }

    #[test]
    fn redact_strips_anthropic_key_first_when_overlap() {
        // An `sk-ant-…` token satisfies both the Anthropic pattern and
        // the generic `sk-…` pattern. Patterns run sequentially over
        // the already-redacted text, so whichever of the two runs
        // first consumes the whole token and the other has nothing
        // left to match. We pin what must hold regardless of which
        // label wins: the secret is gone and it is counted exactly
        // once.
        let raw = "key=sk-ant-abc1234567890DEFGHIJ done";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("key={} done", REPLACEMENT));
        assert_eq!(report.hits.len(), 1, "{:?}", report.hits);
        assert_eq!(report.hits[0].count, 1);
        assert_eq!(report.kinds().len(), 1, "{:?}", report.kinds());
    }

    #[test]
    fn redact_strips_github_token() {
        let raw = "token=ghp_abcdefghijklmnop1234567890ABCDEF rest";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("token={} rest", REPLACEMENT));
        assert_eq!(report.kinds(), vec!["github_token"]);
    }

    #[test]
    fn redact_strips_bearer_token_keeping_surrounding_text() {
        let raw = "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9 ok";
        let report = redact(raw);
        // The literal "Bearer " word is part of the matched span by
        // design, so the cleaned text reads "Authorization: [REDACTED]
        // ok": the header name and trailing text survive, the token
        // does not.
        assert_eq!(report.redacted, format!("Authorization: {} ok", REPLACEMENT));
        assert!(!report.redacted.contains("eyJhbGc"));
        // The token here is a lone JWT header segment (no `.`), so the
        // three-segment JWT pattern cannot match it; only the bearer
        // pattern fires.
        assert_eq!(report.kinds(), vec!["bearer_token"]);
    }

    #[test]
    fn redact_strips_slack_token() {
        let raw = "send to xoxb-12345678901-aBcDeFgHiJkLmN done";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("send to {} done", REPLACEMENT));
        assert_eq!(report.kinds(), vec!["slack_bot_token"]);
    }

    #[test]
    fn redact_strips_google_api_key() {
        // 39 characters: "AIza" + 35, the length the pattern expects.
        // A longer fixture would leave its tail behind after the
        // marker without the test noticing.
        let raw = "key=AIzaSy0123456789ABCDEFGHIJ0123456789abc rest";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("key={} rest", REPLACEMENT));
        assert_eq!(report.kinds(), vec!["google_api_key"]);
    }

    #[test]
    fn redact_strips_aws_access_key() {
        let raw = "AKIAIOSFODNN7EXAMPLE is the key";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("{} is the key", REPLACEMENT));
        assert_eq!(report.kinds(), vec!["aws_access_key_id"]);
    }

    #[test]
    fn redact_strips_jwt_when_standalone() {
        // Three-segment standalone JWT (no Bearer prefix).
        let raw = "tok=eyJabc1234567890.eyJpYXQiOjE3MDA.signaturE12345 done";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("tok={} done", REPLACEMENT));
        assert_eq!(report.kinds(), vec!["jwt"]);
    }

    #[test]
    fn redact_records_repeat_count_for_same_pattern() {
        let raw = "first=sk-aaa1234567890ABCDEFGHIJ second=sk-bbb1234567890ABCDEFGHIJ";
        let report = redact(raw);
        assert_eq!(
            report.redacted,
            format!("first={0} second={0}", REPLACEMENT)
        );
        assert_eq!(report.hits.len(), 1);
        assert_eq!(report.hits[0].kind, "openai_api_key");
        assert_eq!(report.hits[0].count, 2);
    }

    #[test]
    fn redact_handles_multiple_kinds_in_one_text() {
        let raw = "use sk-abc1234567890ABCDEFGHIJ and ghp_abcdefghijklmnop1234567890ABCDEF";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("use {0} and {0}", REPLACEMENT));
        // `kinds()` is sorted, so the order is independent of the
        // order in which the patterns ran.
        assert_eq!(report.kinds(), vec!["github_token", "openai_api_key"]);
        assert_eq!(report.redacted.matches(REPLACEMENT).count(), 2);
    }

    #[test]
    fn redact_does_not_match_short_obvious_non_secrets() {
        // Below the length floor of any pattern.
        let raw = "id=sk-abc1 short=ghp_abc";
        let report = redact(raw);
        assert!(report.is_clean());
        assert_eq!(report.redacted, raw);
    }

    #[test]
    fn redact_does_not_corrupt_unicode() {
        let raw = "测试 sk-abc1234567890DEFGHIJ 完成";
        let report = redact(raw);
        assert_eq!(report.redacted, format!("测试 {} 完成", REPLACEMENT));
    }

    #[test]
    fn kinds_sorted_and_deduped() {
        let report = RedactReport {
            redacted: "[REDACTED]".into(),
            hits: vec![
                RedactHit {
                    kind: "github_token".into(),
                    count: 1,
                },
                RedactHit {
                    kind: "github_token".into(),
                    count: 2,
                },
                RedactHit {
                    kind: "openai_api_key".into(),
                    count: 1,
                },
            ],
        };
        assert_eq!(report.kinds(), vec!["github_token", "openai_api_key"]);
    }
}