Skip to main content

trace_share_core/
sanitize.rs

1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use std::{
4    fs,
5    path::PathBuf,
6    process::{Command, Stdio},
7};
8
9use crate::models::CanonicalEvent;
10
11#[derive(Debug, Clone, Default, Serialize, Deserialize)]
12pub struct SanitizationReport {
13    pub total_redactions: usize,
14    pub secret_redactions: usize,
15    pub email_redactions: usize,
16    pub ip_redactions: usize,
17    pub path_redactions: usize,
18    pub sample_redacted: Vec<String>,
19}
20
21pub fn sanitize_events(events: &[CanonicalEvent]) -> (Vec<CanonicalEvent>, SanitizationReport) {
22    let mut report = SanitizationReport::default();
23    let mut out = events.to_vec();
24    apply_gitleaks_if_available(&mut out, &mut report);
25
26    for event in &mut out {
27        let before = event.text.clone();
28        event.text = redact_text(&event.text, &mut report);
29        if before != event.text && report.sample_redacted.len() < 5 {
30            report.sample_redacted.push(event.text.clone());
31        }
32    }
33
34    (out, report)
35}
36
37pub fn contains_sensitive_patterns(text: &str) -> bool {
38    let mut probe = text.to_string();
39    for marker in [
40        "[REDACTED]",
41        "[REDACTED_EMAIL]",
42        "[REDACTED_IP]",
43        "[REDACTED_PATH]",
44        "[REDACTED_QUERY]",
45        "[REDACTED_GITLEAKS]",
46        "[REDACTED_JWT]",
47        "[REDACTED_PEM]",
48        "[REDACTED_USERHOST]",
49        "[REDACTED_ENTROPY]",
50    ] {
51        probe = probe.replace(marker, "");
52    }
53
54    let token_re = token_regex();
55    let bearer_re = bearer_regex();
56    let jwt_re = jwt_regex();
57    let pem_re = pem_private_key_regex();
58    let email_re = email_regex();
59    let ip_re = ip_regex();
60    let url_query_re = url_query_regex();
61    let user_host_re = user_host_regex();
62    let host_assign_re = host_assignment_regex();
63    let path_re = path_regex();
64
65    token_re.is_match(&probe)
66        || bearer_re.is_match(&probe)
67        || jwt_re.is_match(&probe)
68        || pem_re.is_match(&probe)
69        || email_re.is_match(&probe)
70        || ip_re.is_match(&probe)
71        || user_host_re.is_match(&probe)
72        || host_assign_re.is_match(&probe)
73        || path_re.is_match(&probe)
74        || url_query_re.is_match(&probe)
75        || contains_high_entropy_token(&probe)
76}
77
78fn redact_text(input: &str, report: &mut SanitizationReport) -> String {
79    let token_re = token_regex();
80    let bearer_re = bearer_regex();
81    let jwt_re = jwt_regex();
82    let pem_re = pem_private_key_regex();
83    let email_re = email_regex();
84    let ip_re = ip_regex();
85    let path_re = path_regex();
86    let url_query_re = url_query_regex();
87    let user_host_re = user_host_regex();
88    let host_assign_re = host_assignment_regex();
89
90    let mut text = input.to_string();
91
92    let n = token_re.find_iter(&text).count();
93    if n > 0 {
94        text = token_re.replace_all(&text, "$1=[REDACTED]").to_string();
95        report.secret_redactions += n;
96        report.total_redactions += n;
97    }
98
99    let n = bearer_re.find_iter(&text).count();
100    if n > 0 {
101        text = bearer_re.replace_all(&text, "$1 [REDACTED]").to_string();
102        report.secret_redactions += n;
103        report.total_redactions += n;
104    }
105
106    let n = jwt_re.find_iter(&text).count();
107    if n > 0 {
108        text = jwt_re.replace_all(&text, "[REDACTED_JWT]").to_string();
109        report.secret_redactions += n;
110        report.total_redactions += n;
111    }
112
113    let n = pem_re.find_iter(&text).count();
114    if n > 0 {
115        text = pem_re.replace_all(&text, "[REDACTED_PEM]").to_string();
116        report.secret_redactions += n;
117        report.total_redactions += n;
118    }
119
120    let n = email_re.find_iter(&text).count();
121    if n > 0 {
122        text = email_re.replace_all(&text, "[REDACTED_EMAIL]").to_string();
123        report.email_redactions += n;
124        report.total_redactions += n;
125    }
126
127    let n = ip_re.find_iter(&text).count();
128    if n > 0 {
129        text = ip_re.replace_all(&text, "[REDACTED_IP]").to_string();
130        report.ip_redactions += n;
131        report.total_redactions += n;
132    }
133
134    let n = path_re.find_iter(&text).count();
135    if n > 0 {
136        text = path_re.replace_all(&text, "[REDACTED_PATH]").to_string();
137        report.path_redactions += n;
138        report.total_redactions += n;
139    }
140
141    let n = user_host_re.find_iter(&text).count();
142    if n > 0 {
143        text = user_host_re
144            .replace_all(&text, "[REDACTED_USERHOST]")
145            .to_string();
146        report.secret_redactions += n;
147        report.total_redactions += n;
148    }
149
150    let n = host_assign_re.find_iter(&text).count();
151    if n > 0 {
152        text = host_assign_re
153            .replace_all(&text, "$1=[REDACTED_USERHOST]")
154            .to_string();
155        report.secret_redactions += n;
156        report.total_redactions += n;
157    }
158
159    let n = url_query_re.find_iter(&text).count();
160    if n > 0 {
161        text = url_query_re
162            .replace_all(&text, "$1?[REDACTED_QUERY]")
163            .to_string();
164        report.secret_redactions += n;
165        report.total_redactions += n;
166    }
167
168    if contains_high_entropy_token(&text) {
169        text = redact_high_entropy_tokens(&text, report);
170    }
171
172    text
173}
174
175fn token_regex() -> Regex {
176    Regex::new(
177        r#"(?i)(api[_-]?key|access[_-]?key|token|secret|authorization|password|passwd)\s*[:=]\s*[^\s,"']+"#,
178    )
179    .unwrap()
180}
181
182fn bearer_regex() -> Regex {
183    Regex::new(r#"(?i)\b(authorization:?\s*bearer)\s+[A-Za-z0-9\-._~+/=]{8,}"#).unwrap()
184}
185
186fn jwt_regex() -> Regex {
187    Regex::new(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b").unwrap()
188}
189
190fn pem_private_key_regex() -> Regex {
191    Regex::new(r"(?s)-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----")
192        .unwrap()
193}
194
195fn email_regex() -> Regex {
196    Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b").unwrap()
197}
198
199fn ip_regex() -> Regex {
200    Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap()
201}
202
203fn path_regex() -> Regex {
204    Regex::new(
205        r"(?i)(?:/Users/[^/\s]+|/home/[^/\s]+|/root/[^/\s]*|[A-Za-z]:[\\/](?:[^\\/\s]+[\\/])*[^\\/\s]+)",
206    )
207    .unwrap()
208}
209
210fn url_query_regex() -> Regex {
211    Regex::new(r"(https?://[^\s?]+)\?[^\s]+").unwrap()
212}
213
214fn user_host_regex() -> Regex {
215    Regex::new(r"\b[A-Za-z0-9._-]{2,32}@[A-Za-z0-9._-]{2,64}\b").unwrap()
216}
217
218fn host_assignment_regex() -> Regex {
219    Regex::new(r#"(?i)\b(hostname|host|user|username)\s*[:=]\s*([A-Za-z0-9._-]{2,64})"#).unwrap()
220}
221
222fn contains_high_entropy_token(text: &str) -> bool {
223    text.split(|c: char| {
224        c.is_whitespace() || matches!(c, '"' | '\'' | ',' | ';' | '(' | ')' | '[' | ']')
225    })
226    .any(is_high_entropy_token)
227}
228
229fn redact_high_entropy_tokens(text: &str, report: &mut SanitizationReport) -> String {
230    let mut out = String::with_capacity(text.len());
231    for token in text.split_inclusive(|c: char| c.is_whitespace()) {
232        let trimmed = token.trim();
233        if is_high_entropy_token(trimmed) {
234            out.push_str(&token.replace(trimmed, "[REDACTED_ENTROPY]"));
235            report.secret_redactions += 1;
236            report.total_redactions += 1;
237        } else {
238            out.push_str(token);
239        }
240    }
241    out
242}
243
/// Heuristic for "this token looks like a random secret".
///
/// A token qualifies when all of the following hold:
/// * it is at least 24 bytes long;
/// * it is not made up solely of hexadecimal digits;
/// * every character is ASCII alphanumeric or one of `-_~+/=`;
/// * it either mixes upper case, lower case and digits, or is at least
///   32 bytes long.
fn is_high_entropy_token(token: &str) -> bool {
    const MIN_LEN: usize = 24;
    const LONG_LEN: usize = 32;
    const EXTRA_SYMBOLS: &[u8] = b"-_~+/=";

    let len = token.len();
    if len < MIN_LEN {
        return false;
    }

    // One pass over the bytes gathers every property needed below. Bytes of
    // non-ASCII characters fail both the hex and the charset test, exactly as
    // the characters themselves would.
    let mut only_hex = true;
    let mut in_charset = true;
    let mut saw_upper = false;
    let mut saw_lower = false;
    let mut saw_digit = false;
    for byte in token.bytes() {
        only_hex &= byte.is_ascii_hexdigit();
        in_charset &= byte.is_ascii_alphanumeric() || EXTRA_SYMBOLS.contains(&byte);
        saw_upper |= byte.is_ascii_uppercase();
        saw_lower |= byte.is_ascii_lowercase();
        saw_digit |= byte.is_ascii_digit();
    }

    if only_hex || !in_charset {
        return false;
    }
    (saw_upper && saw_lower && saw_digit) || len >= LONG_LEN
}
262
263fn apply_gitleaks_if_available(events: &mut [CanonicalEvent], report: &mut SanitizationReport) {
264    let Some(gitleaks_bin) = find_gitleaks_binary() else {
265        return;
266    };
267
268    let temp_dir =
269        std::env::temp_dir().join(format!("trace-share-gitleaks-{}", uuid::Uuid::new_v4()));
270    if fs::create_dir_all(&temp_dir).is_err() {
271        return;
272    }
273
274    let mut file_map = Vec::new();
275    for (i, event) in events.iter().enumerate() {
276        let file_path = temp_dir.join(format!("event-{i}.txt"));
277        if fs::write(&file_path, &event.text).is_ok() {
278            file_map.push((i, file_path));
279        }
280    }
281
282    if file_map.is_empty() {
283        let _ = fs::remove_dir_all(&temp_dir);
284        return;
285    }
286
287    let report_path = temp_dir.join("gitleaks-report.json");
288    let output = Command::new(gitleaks_bin)
289        .arg("detect")
290        .arg("--no-git")
291        .arg("--source")
292        .arg(&temp_dir)
293        .arg("--report-format")
294        .arg("json")
295        .arg("--report-path")
296        .arg(&report_path)
297        .stdout(Stdio::null())
298        .stderr(Stdio::piped())
299        .output();
300
301    let Ok(output) = output else {
302        let _ = fs::remove_dir_all(&temp_dir);
303        return;
304    };
305
306    // gitleaks exits with non-zero when leaks are found. We still parse report.
307    if !report_path.exists() && !output.status.success() {
308        let _ = fs::remove_dir_all(&temp_dir);
309        return;
310    }
311
312    let report_text = fs::read_to_string(&report_path).unwrap_or_default();
313    if report_text.trim().is_empty() {
314        let _ = fs::remove_dir_all(&temp_dir);
315        return;
316    }
317
318    let leaks = serde_json::from_str::<Vec<GitleaksFinding>>(&report_text).unwrap_or_default();
319    for finding in leaks {
320        if let Some(idx) = finding
321            .file
322            .as_deref()
323            .and_then(extract_event_index)
324            .filter(|idx| *idx < events.len())
325        {
326            if let Some(secret) = finding.secret.as_deref() {
327                if !secret.is_empty() && events[idx].text.contains(secret) {
328                    events[idx].text = events[idx].text.replace(secret, "[REDACTED_GITLEAKS]");
329                    report.secret_redactions += 1;
330                    report.total_redactions += 1;
331                }
332            }
333        }
334    }
335
336    let _ = fs::remove_dir_all(&temp_dir);
337}
338
/// Recovers the event index from a path whose file name has the form
/// `event-<index>.txt`.
///
/// Returns `None` when the path has no UTF-8 file name, the name does not
/// have that shape, or `<index>` does not parse as a `usize`.
fn extract_event_index(path_text: &str) -> Option<usize> {
    let path = PathBuf::from(path_text);
    path.file_name()
        .and_then(|os_name| os_name.to_str())
        .and_then(|name| name.strip_prefix("event-"))
        .and_then(|rest| rest.strip_suffix(".txt"))
        .and_then(|digits| digits.parse::<usize>().ok())
}
349
/// Searches the directories listed in `PATH`, in order, for a gitleaks
/// executable and returns the first hit.
///
/// `gitleaks` is tried in every directory; on Windows `gitleaks.exe` is tried
/// as well. Only regular files count: a directory that happens to be named
/// `gitleaks` (for example a source checkout) must not shadow a real binary
/// that appears later on `PATH`.
///
/// Returns `None` when `PATH` is unset or no candidate is found.
fn find_gitleaks_binary() -> Option<PathBuf> {
    let file_names: &[&str] = if cfg!(windows) {
        &["gitleaks", "gitleaks.exe"]
    } else {
        &["gitleaks"]
    };

    let path = std::env::var_os("PATH")?;
    std::env::split_paths(&path).find_map(|dir| {
        file_names
            .iter()
            .map(|name| dir.join(name))
            .find(|candidate| candidate.is_file())
    })
}
367
368#[derive(Debug, Clone, Default, Deserialize)]
369struct GitleaksFinding {
370    #[serde(rename = "File")]
371    file: Option<String>,
372    #[serde(rename = "Secret")]
373    secret: Option<String>,
374}
375
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use crate::models::CanonicalEvent;

    use super::{contains_sensitive_patterns, extract_event_index, sanitize_events};

    /// Builds a minimal user-message event carrying `text`.
    fn user_event(text: String) -> CanonicalEvent {
        CanonicalEvent {
            source: "x".to_string(),
            session_id: "s".to_string(),
            ts: Utc::now(),
            kind: "user_msg".to_string(),
            text,
            tool: None,
            meta: None,
        }
    }

    #[test]
    fn redacts_known_patterns() {
        let input = vec![user_event(
            "token=abc123 email me at a@b.com from 127.0.0.1 /home/user/repo C:\\Users\\alice\\repo authorization: bearer ABCDEFGHIJ".to_string(),
        )];

        let (sanitized, report) = sanitize_events(&input);
        let text = &sanitized[0].text;

        for marker in [
            "[REDACTED]",
            "[REDACTED_EMAIL]",
            "[REDACTED_IP]",
            "[REDACTED_PATH]",
        ] {
            assert!(text.contains(marker), "missing {marker} in {text}");
        }
        let lowered = text.to_ascii_lowercase();
        assert!(lowered.contains("authorization=[redacted]"));
        assert!(report.total_redactions >= 4);
    }

    #[test]
    fn redacts_jwt_pem_and_entropy() {
        let jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.cGF5bG9hZC12YWx1ZS0xMjM0NTY3ODkw.sigvalue1234567890ABCD";
        let pem = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC\n-----END PRIVATE KEY-----";
        let entropy = "AbCDef1234567890GhIjKlMnOpQrStUv";
        let input = vec![user_event(format!("{jwt}\n{pem}\nsecret:{entropy}"))];

        let (sanitized, _) = sanitize_events(&input);
        let out = &sanitized[0].text;

        assert!(out.contains("[REDACTED_JWT]"));
        assert!(out.contains("[REDACTED_PEM]"));
        assert!(out.contains("[REDACTED]") || out.contains("[REDACTED_ENTROPY]"));
    }

    #[test]
    fn extracts_gitleaks_event_index() {
        let cases = [
            ("/tmp/x/event-12.txt", Some(12)),
            ("event-2.txt", Some(2)),
            ("random.txt", None),
        ];
        for (path, expected) in cases {
            assert_eq!(extract_event_index(path), expected, "path: {path}");
        }
    }

    #[test]
    fn detects_sensitive_patterns() {
        let sensitive = [
            "token=abc123",
            "email is test@example.com",
            "visit https://x.y/z?a=1",
            "cwd C:\\Users\\evang\\work\\trace-share",
            "eyJhbGciOiJIUzI1NiJ9.abc1234567.zyx0987654",
            "-----BEGIN PRIVATE KEY-----abc-----END PRIVATE KEY-----",
        ];
        for sample in sensitive {
            assert!(contains_sensitive_patterns(sample), "not flagged: {sample}");
        }

        let clean = ["clean text only", "token=[REDACTED]"];
        for sample in clean {
            assert!(!contains_sensitive_patterns(sample), "flagged: {sample}");
        }
    }
}