// aura_redact/lib.rs
//! # aura-redact
//!
//! Two-pass secret and PII scrubber for arbitrary text.
//!
//! 1. **Pattern pass** — regex matches for emails, IPv4 addresses, and common
//!    API token prefixes (`sk-`, `ghp_`, `xoxb-`, `AIza...`).
//! 2. **Entropy pass** — Shannon-entropy analysis catches high-entropy tokens
//!    (random keys, base64 payloads) that pattern matching misses.
//!
//! Designed to be called on any string before it leaves the local machine —
//! LLM logs, error reports, telemetry, crash dumps.
//!
//! ## Example
//!
//! ```
//! let dirty = "Email admin@example.com — token sk-proj-1234567890abcdefghij";
//! let clean = aura_redact::Redactor::scrub(dirty);
//! assert!(clean.contains("[REDACTED_EMAIL]"));
//! assert!(clean.contains("[REDACTED_TOKEN]"));
//! ```
//!
//! Originally extracted from [Aura](https://auravcs.com), the semantic
//! version control engine.

use std::collections::HashMap;
use std::sync::LazyLock;

use regex::Regex;
27
28/// The Semantic Scrubber: Uses Information Theory and Regex to protect data.
29pub struct Redactor;
30
31impl Redactor {
32    /// Main entry point to scrub a string before it leaves the local machine.
33    pub fn scrub(text: &str) -> String {
34        // Pass 1: Pattern-based heuristic scrubbing (Regex)
35        let mut scrubbed = Self::scrub_patterns(text);
36
37        // Pass 2: Information Theory scrubbing (Shannon Entropy)
38        scrubbed = Self::scrub_high_entropy(&scrubbed);
39
40        scrubbed
41    }
42
43    /// PASS 1: Scrub known patterns like emails, IPs, and common token prefixes.
44    fn scrub_patterns(text: &str) -> String {
45        let mut result = text.to_string();
46
47        // Email Pattern
48        let email_re = Regex::new(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}").unwrap();
49        result = email_re
50            .replace_all(&result, "[REDACTED_EMAIL]")
51            .to_string();
52
53        // IPv4 Pattern
54        let ip_re = Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap();
55        result = ip_re.replace_all(&result, "[REDACTED_IP]").to_string();
56
57        // Common API Key Prefixes (e.g., sk-..., ghp_..., xoxb-...)
58        let token_re = Regex::new(r"(?i)(sk-|ghp_|xoxb-|AIza)[a-zA-Z0-9_\-]+").unwrap();
59        result = token_re
60            .replace_all(&result, "[REDACTED_TOKEN]")
61            .to_string();
62
63        result
64    }
65
66    /// PASS 2: Scrub tokens with high Shannon Entropy (cryptographic keys, base64 payloads).
67    fn scrub_high_entropy(text: &str) -> String {
68        // IMPROVEMENT: Split by common code delimiters to avoid treating long lines as single tokens
69        let mut result = text.to_string();
70        let delimiters = [
71            ' ', '\t', '\n', '\r', '(', ')', '{', '}', '[', ']', ',', ';', ':', '=', '+', '-', '*',
72            '/',
73        ];
74
75        // We temporarily replace delimiters with spaces for easy tokenization
76        let mut tokenizable = text.to_string();
77        for d in delimiters {
78            tokenizable = tokenizable.replace(d, " ");
79        }
80
81        let tokens: Vec<&str> = tokenizable.split_whitespace().collect();
82
83        for token in tokens {
84            // Only analyze tokens of significant length (e.g., > 20 chars for real secrets)
85            if token.len() > 20 {
86                let entropy = Self::calculate_shannon_entropy(token);
87                // Threshold: 5.2 bits/char is a more reliable indicator for random keys in code.
88                // Standard code density usually sits between 4.0 and 4.8.
89                if entropy > 5.2 {
90                    result = result.replace(token, "[REDACTED_HIGH_ENTROPY]");
91                }
92            }
93        }
94
95        result
96    }
97
98    /// Calculates Shannon Entropy in bits per character.
99    /// Formula: H = -sum(p_i * log2(p_i))
100    fn calculate_shannon_entropy(s: &str) -> f64 {
101        if s.is_empty() {
102            return 0.0;
103        }
104
105        let mut frequencies = HashMap::new();
106        for c in s.chars() {
107            *frequencies.entry(c).or_insert(0) += 1;
108        }
109
110        let len = s.chars().count() as f64;
111        let mut entropy = 0.0;
112
113        for &count in frequencies.values() {
114            let p = count as f64 / len;
115            entropy -= p * p.log2();
116        }
117
118        entropy
119    }
120}
121
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_email_redaction() {
        let input = "Contact me at admin@example.com for keys.";
        let output = Redactor::scrub(input);
        assert!(output.contains("[REDACTED_EMAIL]"));
        assert!(!output.contains("admin@example.com"));
    }

    #[test]
    fn test_high_entropy_redaction() {
        // API tokens with known prefixes should be caught by pattern matching.
        let input = "Token: sk-proj-1234567890abcdefghij";
        let output = Redactor::scrub(input);
        assert!(
            output.contains("[REDACTED_TOKEN]"),
            "Expected token redaction for sk- prefix. Got: {}",
            output
        );

        // GitHub token prefix.
        let input2 = "ghp_aBcDeFgHiJkLmNoPqRsTuVwXyZ12345";
        let output2 = Redactor::scrub(input2);
        assert!(
            output2.contains("[REDACTED_TOKEN]"),
            "Expected token redaction for ghp_ prefix. Got: {}",
            output2
        );
    }

    #[test]
    fn test_entropy_pass_catches_unprefixed_keys() {
        // 50 distinct characters -> entropy log2(50) ≈ 5.64 bits/char, with
        // no known prefix for the pattern pass to match, so only the entropy
        // pass can catch it.
        let input = "key abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0123456789 end";
        let output = Redactor::scrub(input);
        assert!(
            output.contains("[REDACTED_HIGH_ENTROPY]"),
            "Expected high-entropy redaction. Got: {}",
            output
        );
    }

    #[test]
    fn test_low_entropy_preservation() {
        // A normal English sentence (low entropy) must pass through untouched.
        let input = "This is a perfectly normal sentence that should not be redacted.";
        let output = Redactor::scrub(input);
        assert!(!output.contains("[REDACTED]"));
        assert_eq!(input, output);
    }
}