Skip to main content

trace_share_core/
sanitize.rs

1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use std::{
4    fs,
5    path::PathBuf,
6    process::{Command, Stdio},
7};
8
9use crate::models::CanonicalEvent;
10
11#[derive(Debug, Clone, Default, Serialize, Deserialize)]
12pub struct SanitizationReport {
13    pub total_redactions: usize,
14    pub secret_redactions: usize,
15    pub email_redactions: usize,
16    pub ip_redactions: usize,
17    pub path_redactions: usize,
18    pub sample_redacted: Vec<String>,
19}
20
21pub fn sanitize_events(events: &[CanonicalEvent]) -> (Vec<CanonicalEvent>, SanitizationReport) {
22    let mut report = SanitizationReport::default();
23    let mut out = events.to_vec();
24    apply_gitleaks_if_available(&mut out, &mut report);
25
26    for event in &mut out {
27        let before = event.text.clone();
28        event.text = redact_text(&event.text, &mut report);
29        if before != event.text && report.sample_redacted.len() < 5 {
30            report.sample_redacted.push(event.text.clone());
31        }
32    }
33
34    (out, report)
35}
36
37pub fn contains_sensitive_patterns(text: &str) -> bool {
38    let mut probe = text.to_string();
39    for marker in [
40        "[REDACTED]",
41        "[REDACTED_EMAIL]",
42        "[REDACTED_IP]",
43        "[REDACTED_PATH]",
44        "[REDACTED_QUERY]",
45        "[REDACTED_GITLEAKS]",
46        "[REDACTED_JWT]",
47        "[REDACTED_PEM]",
48        "[REDACTED_USERHOST]",
49        "[REDACTED_ENTROPY]",
50    ] {
51        probe = probe.replace(marker, "");
52    }
53
54    let token_re = token_regex();
55    let bearer_re = bearer_regex();
56    let jwt_re = jwt_regex();
57    let pem_re = pem_private_key_regex();
58    let email_re = email_regex();
59    let ip_re = ip_regex();
60    let url_query_re = url_query_regex();
61    let user_host_re = user_host_regex();
62    let host_assign_re = host_assignment_regex();
63    let path_re = path_regex();
64
65    token_re.is_match(&probe)
66        || bearer_re.is_match(&probe)
67        || jwt_re.is_match(&probe)
68        || pem_re.is_match(&probe)
69        || email_re.is_match(&probe)
70        || ip_re.is_match(&probe)
71        || user_host_re.is_match(&probe)
72        || host_assign_re.is_match(&probe)
73        || path_re.is_match(&probe)
74        || url_query_re.is_match(&probe)
75        || contains_high_entropy_token(&probe)
76}
77
78fn redact_text(input: &str, report: &mut SanitizationReport) -> String {
79    let token_re = token_regex();
80    let bearer_re = bearer_regex();
81    let jwt_re = jwt_regex();
82    let pem_re = pem_private_key_regex();
83    let email_re = email_regex();
84    let ip_re = ip_regex();
85    let path_re = path_regex();
86    let url_query_re = url_query_regex();
87    let user_host_re = user_host_regex();
88    let host_assign_re = host_assignment_regex();
89
90    let mut text = input.to_string();
91
92    let n = token_re.find_iter(&text).count();
93    if n > 0 {
94        text = token_re.replace_all(&text, "$1=[REDACTED]").to_string();
95        report.secret_redactions += n;
96        report.total_redactions += n;
97    }
98
99    let n = bearer_re.find_iter(&text).count();
100    if n > 0 {
101        text = bearer_re.replace_all(&text, "$1 [REDACTED]").to_string();
102        report.secret_redactions += n;
103        report.total_redactions += n;
104    }
105
106    let n = jwt_re.find_iter(&text).count();
107    if n > 0 {
108        text = jwt_re.replace_all(&text, "[REDACTED_JWT]").to_string();
109        report.secret_redactions += n;
110        report.total_redactions += n;
111    }
112
113    let n = pem_re.find_iter(&text).count();
114    if n > 0 {
115        text = pem_re.replace_all(&text, "[REDACTED_PEM]").to_string();
116        report.secret_redactions += n;
117        report.total_redactions += n;
118    }
119
120    let n = email_re.find_iter(&text).count();
121    if n > 0 {
122        text = email_re.replace_all(&text, "[REDACTED_EMAIL]").to_string();
123        report.email_redactions += n;
124        report.total_redactions += n;
125    }
126
127    let n = ip_re.find_iter(&text).count();
128    if n > 0 {
129        text = ip_re.replace_all(&text, "[REDACTED_IP]").to_string();
130        report.ip_redactions += n;
131        report.total_redactions += n;
132    }
133
134    let n = path_re.find_iter(&text).count();
135    if n > 0 {
136        text = path_re.replace_all(&text, "[REDACTED_PATH]").to_string();
137        report.path_redactions += n;
138        report.total_redactions += n;
139    }
140
141    let n = user_host_re.find_iter(&text).count();
142    if n > 0 {
143        text = user_host_re
144            .replace_all(&text, "[REDACTED_USERHOST]")
145            .to_string();
146        report.secret_redactions += n;
147        report.total_redactions += n;
148    }
149
150    let n = host_assign_re.find_iter(&text).count();
151    if n > 0 {
152        text = host_assign_re
153            .replace_all(&text, "$1=[REDACTED_USERHOST]")
154            .to_string();
155        report.secret_redactions += n;
156        report.total_redactions += n;
157    }
158
159    let n = url_query_re.find_iter(&text).count();
160    if n > 0 {
161        text = url_query_re
162            .replace_all(&text, "$1?[REDACTED_QUERY]")
163            .to_string();
164        report.secret_redactions += n;
165        report.total_redactions += n;
166    }
167
168    if contains_high_entropy_token(&text) {
169        text = redact_high_entropy_tokens(&text, report);
170    }
171
172    text
173}
174
175fn token_regex() -> Regex {
176    Regex::new(
177        r#"(?i)(api[_-]?key|access[_-]?key|token|secret|authorization|password|passwd)\s*[:=]\s*[^\s,"']+"#,
178    )
179    .unwrap()
180}
181
182fn bearer_regex() -> Regex {
183    Regex::new(r#"(?i)\b(authorization:?\s*bearer)\s+[A-Za-z0-9\-._~+/=]{8,}"#).unwrap()
184}
185
186fn jwt_regex() -> Regex {
187    Regex::new(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b").unwrap()
188}
189
190fn pem_private_key_regex() -> Regex {
191    Regex::new(r"(?s)-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----")
192        .unwrap()
193}
194
195fn email_regex() -> Regex {
196    Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b").unwrap()
197}
198
199fn ip_regex() -> Regex {
200    Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap()
201}
202
203fn path_regex() -> Regex {
204    Regex::new(r"(?:/Users/[^/\s]+|/home/[^/\s]+|/root/[^/\s]*|[A-Za-z]:\\\\Users\\\\[^\\\s]+)")
205        .unwrap()
206}
207
208fn url_query_regex() -> Regex {
209    Regex::new(r"(https?://[^\s?]+)\?[^\s]+").unwrap()
210}
211
212fn user_host_regex() -> Regex {
213    Regex::new(r"\b[A-Za-z0-9._-]{2,32}@[A-Za-z0-9._-]{2,64}\b").unwrap()
214}
215
216fn host_assignment_regex() -> Regex {
217    Regex::new(r#"(?i)\b(hostname|host|user|username)\s*[:=]\s*([A-Za-z0-9._-]{2,64})"#).unwrap()
218}
219
220fn contains_high_entropy_token(text: &str) -> bool {
221    text.split(|c: char| {
222        c.is_whitespace() || matches!(c, '"' | '\'' | ',' | ';' | '(' | ')' | '[' | ']')
223    })
224    .any(is_high_entropy_token)
225}
226
227fn redact_high_entropy_tokens(text: &str, report: &mut SanitizationReport) -> String {
228    let mut out = String::with_capacity(text.len());
229    for token in text.split_inclusive(|c: char| c.is_whitespace()) {
230        let trimmed = token.trim();
231        if is_high_entropy_token(trimmed) {
232            out.push_str(&token.replace(trimmed, "[REDACTED_ENTROPY]"));
233            report.secret_redactions += 1;
234            report.total_redactions += 1;
235        } else {
236            out.push_str(token);
237        }
238    }
239    out
240}
241
/// Heuristic for "looks like an opaque credential".
///
/// A token qualifies when all of the following hold:
/// * it is at least 24 bytes long;
/// * it is not made up solely of hexadecimal digits;
/// * it consists only of ASCII letters, digits and `-_~+/=`;
/// * it either mixes upper-case, lower-case and digit characters, or is at
///   least 32 bytes long.
fn is_high_entropy_token(token: &str) -> bool {
    let bytes = token.as_bytes();
    let in_alphabet = |b: &u8| {
        b.is_ascii_alphanumeric() || matches!(*b, b'-' | b'_' | b'~' | b'+' | b'/' | b'=')
    };

    let plausible = bytes.len() >= 24
        && !bytes.iter().all(u8::is_ascii_hexdigit)
        && bytes.iter().all(in_alphabet);
    if !plausible {
        return false;
    }

    let mixed_classes = bytes.iter().any(u8::is_ascii_uppercase)
        && bytes.iter().any(u8::is_ascii_lowercase)
        && bytes.iter().any(u8::is_ascii_digit);
    mixed_classes || bytes.len() >= 32
}
260
261fn apply_gitleaks_if_available(events: &mut [CanonicalEvent], report: &mut SanitizationReport) {
262    let Some(gitleaks_bin) = find_gitleaks_binary() else {
263        return;
264    };
265
266    let temp_dir =
267        std::env::temp_dir().join(format!("trace-share-gitleaks-{}", uuid::Uuid::new_v4()));
268    if fs::create_dir_all(&temp_dir).is_err() {
269        return;
270    }
271
272    let mut file_map = Vec::new();
273    for (i, event) in events.iter().enumerate() {
274        let file_path = temp_dir.join(format!("event-{i}.txt"));
275        if fs::write(&file_path, &event.text).is_ok() {
276            file_map.push((i, file_path));
277        }
278    }
279
280    if file_map.is_empty() {
281        let _ = fs::remove_dir_all(&temp_dir);
282        return;
283    }
284
285    let report_path = temp_dir.join("gitleaks-report.json");
286    let output = Command::new(gitleaks_bin)
287        .arg("detect")
288        .arg("--no-git")
289        .arg("--source")
290        .arg(&temp_dir)
291        .arg("--report-format")
292        .arg("json")
293        .arg("--report-path")
294        .arg(&report_path)
295        .stdout(Stdio::null())
296        .stderr(Stdio::piped())
297        .output();
298
299    let Ok(output) = output else {
300        let _ = fs::remove_dir_all(&temp_dir);
301        return;
302    };
303
304    // gitleaks exits with non-zero when leaks are found. We still parse report.
305    if !report_path.exists() && !output.status.success() {
306        let _ = fs::remove_dir_all(&temp_dir);
307        return;
308    }
309
310    let report_text = fs::read_to_string(&report_path).unwrap_or_default();
311    if report_text.trim().is_empty() {
312        let _ = fs::remove_dir_all(&temp_dir);
313        return;
314    }
315
316    let leaks = serde_json::from_str::<Vec<GitleaksFinding>>(&report_text).unwrap_or_default();
317    for finding in leaks {
318        if let Some(idx) = finding
319            .file
320            .as_deref()
321            .and_then(extract_event_index)
322            .filter(|idx| *idx < events.len())
323        {
324            if let Some(secret) = finding.secret.as_deref() {
325                if !secret.is_empty() && events[idx].text.contains(secret) {
326                    events[idx].text = events[idx].text.replace(secret, "[REDACTED_GITLEAKS]");
327                    report.secret_redactions += 1;
328                    report.total_redactions += 1;
329                }
330            }
331        }
332    }
333
334    let _ = fs::remove_dir_all(&temp_dir);
335}
336
/// Recovers the event index from a path whose file name has the form
/// `event-<n>.txt`.
///
/// Returns `None` when the path has no UTF-8 file name, the name does not have
/// that shape, or `<n>` does not parse as a `usize`.
fn extract_event_index(path_text: &str) -> Option<usize> {
    let path = PathBuf::from(path_text);
    let file_name = path.file_name().and_then(|name| name.to_str())?;
    let digits = file_name.strip_prefix("event-")?.strip_suffix(".txt")?;
    digits.parse::<usize>().ok()
}
347
/// Searches the directories listed in `PATH`, in order, for a `gitleaks`
/// executable (also `gitleaks.exe` on Windows) and returns the first hit.
///
/// Only regular files count: a directory that merely happens to be named
/// `gitleaks` is skipped instead of ending the search with an entry that
/// cannot be spawned. Returns `None` when `PATH` is unset or nothing matches.
fn find_gitleaks_binary() -> Option<PathBuf> {
    let search_path = std::env::var_os("PATH")?;
    std::env::split_paths(&search_path).find_map(|dir| {
        let names: &[&str] = if cfg!(windows) {
            &["gitleaks", "gitleaks.exe"]
        } else {
            &["gitleaks"]
        };
        names
            .iter()
            .map(|name| dir.join(name))
            .find(|candidate| candidate.is_file())
    })
}
365
366#[derive(Debug, Clone, Default, Deserialize)]
367struct GitleaksFinding {
368    #[serde(rename = "File")]
369    file: Option<String>,
370    #[serde(rename = "Secret")]
371    secret: Option<String>,
372}
373
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use crate::models::CanonicalEvent;

    use super::{contains_sensitive_patterns, extract_event_index, sanitize_events};

    /// Builds a minimal user-message event carrying `text`.
    fn event_with_text(text: impl Into<String>) -> CanonicalEvent {
        CanonicalEvent {
            source: "x".to_string(),
            session_id: "s".to_string(),
            ts: Utc::now(),
            kind: "user_msg".to_string(),
            text: text.into(),
            tool: None,
            meta: None,
        }
    }

    #[test]
    fn redacts_known_patterns() {
        let input = vec![event_with_text(
            "token=abc123 email me at a@b.com from 127.0.0.1 /home/user/repo authorization: bearer ABCDEFGHIJ",
        )];

        let (sanitized, report) = sanitize_events(&input);
        let out = &sanitized[0].text;

        for marker in [
            "[REDACTED]",
            "[REDACTED_EMAIL]",
            "[REDACTED_IP]",
            "[REDACTED_PATH]",
        ] {
            assert!(out.contains(marker));
        }
        assert!(
            out.to_ascii_lowercase()
                .contains("authorization=[redacted]")
        );
        assert!(report.total_redactions >= 4);
    }

    #[test]
    fn redacts_jwt_pem_and_entropy() {
        let jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.cGF5bG9hZC12YWx1ZS0xMjM0NTY3ODkw.sigvalue1234567890ABCD";
        let pem = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC\n-----END PRIVATE KEY-----";
        let entropy = "AbCDef1234567890GhIjKlMnOpQrStUv";
        let input = vec![event_with_text(format!("{jwt}\n{pem}\nsecret:{entropy}"))];

        let (sanitized, _) = sanitize_events(&input);
        let out = &sanitized[0].text;

        assert!(out.contains("[REDACTED_JWT]"));
        assert!(out.contains("[REDACTED_PEM]"));
        assert!(out.contains("[REDACTED]") || out.contains("[REDACTED_ENTROPY]"));
    }

    #[test]
    fn extracts_gitleaks_event_index() {
        assert_eq!(extract_event_index("/tmp/x/event-12.txt"), Some(12));
        assert_eq!(extract_event_index("event-2.txt"), Some(2));
        assert_eq!(extract_event_index("random.txt"), None);
    }

    #[test]
    fn detects_sensitive_patterns() {
        let sensitive = [
            "token=abc123",
            "email is test@example.com",
            "visit https://x.y/z?a=1",
            "eyJhbGciOiJIUzI1NiJ9.abc1234567.zyx0987654",
            "-----BEGIN PRIVATE KEY-----abc-----END PRIVATE KEY-----",
        ];
        for text in sensitive {
            assert!(contains_sensitive_patterns(text));
        }

        let clean = ["clean text only", "token=[REDACTED]"];
        for text in clean {
            assert!(!contains_sensitive_patterns(text));
        }
    }
}