Skip to main content

hematite/agent/
edge_redact.rs

1// Edge Redaction — privacy-preserving filter for MCP server mode.
2//
3// Runs after inspect_host() and before the response crosses the wire to any
4// cloud agent. Strips identifiers that should never leave the machine:
5//   - Usernames embedded in file paths
6//   - MAC addresses
7//   - Hardware serial numbers
8//   - Hostnames / computer names
9//   - Credential-shaped values (API keys, tokens, passwords)
10//   - AWS access key IDs
11//
12// Each category is tracked separately so the cloud model receives a clear
13// redaction receipt explaining what was sanitized and how much, without
14// revealing the original values.
15//
16// Enable: hematite --mcp-server --edge-redact
17
18use lazy_static::lazy_static;
19use regex::Regex;
20use std::collections::BTreeMap;
21
/// Outcome of a redaction pass: the scrubbed text plus a receipt describing
/// how many substitutions were made in each category.
///
/// The receipt holds counts and category labels only; it has no field for the
/// original values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RedactResult {
    /// Sanitized text safe to send to a cloud model.
    pub text: String,
    /// Total number of individual substitutions made (sum of `tier1_hits`).
    pub redaction_count: usize,
    /// Human-readable one-line summary of the pass; `apply` places it ahead
    /// of the sanitized text.
    pub summary_header: String,
    /// Per-category hit counts, keyed by pattern label — used by the audit trail.
    pub tier1_hits: BTreeMap<&'static str, usize>,
}
32
33struct Pattern {
34    re: Regex,
35    label: &'static str,
36    replacement: &'static str,
37}
38
39lazy_static! {
40    static ref PATTERNS: Vec<Pattern> = vec![
41        // Windows username in paths: C:\Users\<name>\ or C:/Users/<name>/
42        Pattern {
43            re: Regex::new(r"(?i)(C:[/\\]Users[/\\])([^/\\\r\n\t ]+)([/\\])").unwrap(),
44            label: "username-path",
45            replacement: "${1}[USER]${3}",
46        },
47        // Linux/macOS home paths: /home/<name>/ or /Users/<name>/
48        Pattern {
49            re: Regex::new(r"(/(?:home|Users)/)([^/\r\n\t ]+)(/)").unwrap(),
50            label: "username-path",
51            replacement: "${1}[USER]${3}",
52        },
53        // MAC addresses (colon or hyphen-separated)
54        Pattern {
55            re: Regex::new(r"\b([0-9A-Fa-f]{2}[:\-]){5}[0-9A-Fa-f]{2}\b").unwrap(),
56            label: "mac-address",
57            replacement: "[MAC]",
58        },
59        // Hardware / disk serial numbers
60        Pattern {
61            re: Regex::new(r"(?i)(serial\s*(?:number)?[:=]\s*)([^\s\r\n]{4,})").unwrap(),
62            label: "serial-number",
63            replacement: "${1}[SERIAL]",
64        },
65        // Computer / hostname labels
66        Pattern {
67            re: Regex::new(
68                r"(?i)((?:hostname|computer\s*name|machine\s*name|device\s*name|netbios\s*name)\s*[:=]\s*)([^\s\r\n]+)"
69            ).unwrap(),
70            label: "hostname",
71            replacement: "${1}[HOSTNAME]",
72        },
73        // AWS access key IDs
74        Pattern {
75            re: Regex::new(r"\bAKIA[0-9A-Z]{16}\b").unwrap(),
76            label: "aws-key",
77            replacement: "[AWS-KEY]",
78        },
79        // Generic credential values: KEY=xxx, TOKEN=xxx, PASSWORD=xxx, etc.
80        // Only fires when the label looks credential-shaped and value is ≥8 chars.
81        Pattern {
82            re: Regex::new(
83                r"(?i)((?:api[_\-]?key|secret[_\-]?key|access[_\-]?token|auth[_\-]?token|password|passwd|pwd|private[_\-]?key|client[_\-]?secret)[^\s=:]*\s*[:=]\s*)(\S{8,})"
84            ).unwrap(),
85            label: "credential",
86            replacement: "${1}[REDACTED]",
87        },
88    ];
89}
90
91/// Apply all redaction patterns to `input`.
92/// Returns the sanitized text plus a receipt of what was removed.
93pub fn redact(input: &str) -> RedactResult {
94    let mut text = input.to_string();
95    let mut counts: BTreeMap<&'static str, usize> = BTreeMap::new();
96
97    for pattern in PATTERNS.iter() {
98        let hits = pattern.re.find_iter(&text).count();
99        if hits > 0 {
100            *counts.entry(pattern.label).or_insert(0) += hits;
101            text = pattern
102                .re
103                .replace_all(&text, pattern.replacement)
104                .into_owned();
105        }
106    }
107
108    let total: usize = counts.values().sum();
109
110    let summary_header = if total == 0 {
111        String::from("[edge-redact: no sensitive patterns detected]")
112    } else {
113        let detail: Vec<String> = counts
114            .iter()
115            .map(|(label, n)| format!("{label} \u{00d7}{n}"))
116            .collect();
117        format!(
118            "[edge-redact: {total} substitution(s) — {} — values replaced before leaving this machine]",
119            detail.join(", ")
120        )
121    };
122
123    RedactResult {
124        text,
125        redaction_count: total,
126        summary_header,
127        tier1_hits: counts,
128    }
129}
130
131/// Wrap a tool result with the edge-redact header so the cloud model
132/// always sees a clear privacy receipt at the top of the response.
133pub fn apply(raw: &str) -> String {
134    let result = redact(raw);
135    format!("{}\n\n{}", result.summary_header, result.text)
136}
137
#[cfg(test)]
mod tests {
    use super::*;

    /// Redacts `input`, then checks that `placeholder` is present and
    /// `secret` is gone. Returns the result for any further assertions.
    #[track_caller]
    fn scrub_and_check(
        input: &str,
        placeholder: &str,
        placeholder_msg: &str,
        secret: &str,
        secret_msg: &str,
    ) -> RedactResult {
        let outcome = redact(input);
        assert!(outcome.text.contains(placeholder), "{placeholder_msg}");
        assert!(!outcome.text.contains(secret), "{secret_msg}");
        outcome
    }

    #[test]
    fn redacts_windows_username_path() {
        let outcome = scrub_and_check(
            "path: C:\\Users\\johndoe\\Documents\\project",
            "[USER]",
            "should redact username",
            "johndoe",
            "should not contain raw username",
        );
        assert!(outcome.redaction_count > 0);
    }

    #[test]
    fn redacts_mac_address() {
        scrub_and_check(
            "MAC: 00:1A:2B:3C:4D:5E adapter connected",
            "[MAC]",
            "should redact MAC",
            "00:1A:2B:3C:4D:5E",
            "raw MAC must not appear",
        );
    }

    #[test]
    fn redacts_serial_number() {
        scrub_and_check(
            "SerialNumber: WD-WX12345678",
            "[SERIAL]",
            "should redact serial",
            "WD-WX12345678",
            "raw serial must not appear",
        );
    }

    #[test]
    fn redacts_hostname_label() {
        scrub_and_check(
            "ComputerName: CORP-LAPTOP-007",
            "[HOSTNAME]",
            "should redact hostname",
            "CORP-LAPTOP-007",
            "raw hostname must not appear",
        );
    }

    #[test]
    fn redacts_aws_key() {
        scrub_and_check(
            "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE found in env",
            "[AWS-KEY]",
            "should redact AWS key",
            "AKIAIOSFODNN7EXAMPLE",
            "raw key must not appear",
        );
    }

    #[test]
    fn redacts_credential_value() {
        scrub_and_check(
            "API_KEY=sk-supersecretvalue123 exported",
            "[REDACTED]",
            "should redact credential value",
            "sk-supersecretvalue123",
            "raw secret must not appear",
        );
    }

    #[test]
    fn clean_input_passes_through_unchanged() {
        let input = "Processes: 42 running\nCPU: 12%\nRAM: 8.1 GB / 32 GB";
        let outcome = redact(input);
        assert_eq!(outcome.redaction_count, 0);
        assert_eq!(outcome.text, input);
        assert!(outcome.summary_header.contains("no sensitive patterns"));
    }

    #[test]
    fn apply_always_prepends_header() {
        let wrapped = apply("CPU: 15%");
        assert!(wrapped.starts_with("[edge-redact:"), "header must be first");
    }
}