1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use std::{
4 fs,
5 path::PathBuf,
6 process::{Command, Stdio},
7};
8
9use crate::models::CanonicalEvent;
10
/// Running tally of what the sanitizer redacted.
///
/// Every redaction recorded by this module bumps `total_redactions` together
/// with exactly one of the per-category counters below.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct SanitizationReport {
    /// Number of redactions of any category.
    pub total_redactions: usize,
    /// Redactions of credential-like material: key/value secrets, bearer
    /// credentials, JWTs, PEM private keys, `user@host` and host/user
    /// assignments, URL query strings, high-entropy tokens and gitleaks hits.
    pub secret_redactions: usize,
    /// Redactions of e-mail addresses.
    pub email_redactions: usize,
    /// Redactions of dotted-quad IP addresses.
    pub ip_redactions: usize,
    /// Redactions of home-directory and Windows drive paths.
    pub path_redactions: usize,
    /// Sanitized (post-redaction) texts of a few events that were changed;
    /// `sanitize_events` stores at most five.
    pub sample_redacted: Vec<String>,
}
20
21pub fn sanitize_events(events: &[CanonicalEvent]) -> (Vec<CanonicalEvent>, SanitizationReport) {
22 let mut report = SanitizationReport::default();
23 let mut out = events.to_vec();
24 apply_gitleaks_if_available(&mut out, &mut report);
25
26 for event in &mut out {
27 let before = event.text.clone();
28 event.text = redact_text(&event.text, &mut report);
29 if before != event.text && report.sample_redacted.len() < 5 {
30 report.sample_redacted.push(event.text.clone());
31 }
32 }
33
34 (out, report)
35}
36
37pub fn contains_sensitive_patterns(text: &str) -> bool {
38 let mut probe = text.to_string();
39 for marker in [
40 "[REDACTED]",
41 "[REDACTED_EMAIL]",
42 "[REDACTED_IP]",
43 "[REDACTED_PATH]",
44 "[REDACTED_QUERY]",
45 "[REDACTED_GITLEAKS]",
46 "[REDACTED_JWT]",
47 "[REDACTED_PEM]",
48 "[REDACTED_USERHOST]",
49 "[REDACTED_ENTROPY]",
50 ] {
51 probe = probe.replace(marker, "");
52 }
53
54 let token_re = token_regex();
55 let bearer_re = bearer_regex();
56 let jwt_re = jwt_regex();
57 let pem_re = pem_private_key_regex();
58 let email_re = email_regex();
59 let ip_re = ip_regex();
60 let url_query_re = url_query_regex();
61 let user_host_re = user_host_regex();
62 let host_assign_re = host_assignment_regex();
63 let path_re = path_regex();
64
65 token_re.is_match(&probe)
66 || bearer_re.is_match(&probe)
67 || jwt_re.is_match(&probe)
68 || pem_re.is_match(&probe)
69 || email_re.is_match(&probe)
70 || ip_re.is_match(&probe)
71 || user_host_re.is_match(&probe)
72 || host_assign_re.is_match(&probe)
73 || path_re.is_match(&probe)
74 || url_query_re.is_match(&probe)
75 || contains_high_entropy_token(&probe)
76}
77
78fn redact_text(input: &str, report: &mut SanitizationReport) -> String {
79 let token_re = token_regex();
80 let bearer_re = bearer_regex();
81 let jwt_re = jwt_regex();
82 let pem_re = pem_private_key_regex();
83 let email_re = email_regex();
84 let ip_re = ip_regex();
85 let path_re = path_regex();
86 let url_query_re = url_query_regex();
87 let user_host_re = user_host_regex();
88 let host_assign_re = host_assignment_regex();
89
90 let mut text = input.to_string();
91
92 let n = token_re.find_iter(&text).count();
93 if n > 0 {
94 text = token_re.replace_all(&text, "$1=[REDACTED]").to_string();
95 report.secret_redactions += n;
96 report.total_redactions += n;
97 }
98
99 let n = bearer_re.find_iter(&text).count();
100 if n > 0 {
101 text = bearer_re.replace_all(&text, "$1 [REDACTED]").to_string();
102 report.secret_redactions += n;
103 report.total_redactions += n;
104 }
105
106 let n = jwt_re.find_iter(&text).count();
107 if n > 0 {
108 text = jwt_re.replace_all(&text, "[REDACTED_JWT]").to_string();
109 report.secret_redactions += n;
110 report.total_redactions += n;
111 }
112
113 let n = pem_re.find_iter(&text).count();
114 if n > 0 {
115 text = pem_re.replace_all(&text, "[REDACTED_PEM]").to_string();
116 report.secret_redactions += n;
117 report.total_redactions += n;
118 }
119
120 let n = email_re.find_iter(&text).count();
121 if n > 0 {
122 text = email_re.replace_all(&text, "[REDACTED_EMAIL]").to_string();
123 report.email_redactions += n;
124 report.total_redactions += n;
125 }
126
127 let n = ip_re.find_iter(&text).count();
128 if n > 0 {
129 text = ip_re.replace_all(&text, "[REDACTED_IP]").to_string();
130 report.ip_redactions += n;
131 report.total_redactions += n;
132 }
133
134 let n = path_re.find_iter(&text).count();
135 if n > 0 {
136 text = path_re.replace_all(&text, "[REDACTED_PATH]").to_string();
137 report.path_redactions += n;
138 report.total_redactions += n;
139 }
140
141 let n = user_host_re.find_iter(&text).count();
142 if n > 0 {
143 text = user_host_re
144 .replace_all(&text, "[REDACTED_USERHOST]")
145 .to_string();
146 report.secret_redactions += n;
147 report.total_redactions += n;
148 }
149
150 let n = host_assign_re.find_iter(&text).count();
151 if n > 0 {
152 text = host_assign_re
153 .replace_all(&text, "$1=[REDACTED_USERHOST]")
154 .to_string();
155 report.secret_redactions += n;
156 report.total_redactions += n;
157 }
158
159 let n = url_query_re.find_iter(&text).count();
160 if n > 0 {
161 text = url_query_re
162 .replace_all(&text, "$1?[REDACTED_QUERY]")
163 .to_string();
164 report.secret_redactions += n;
165 report.total_redactions += n;
166 }
167
168 if contains_high_entropy_token(&text) {
169 text = redact_high_entropy_tokens(&text, report);
170 }
171
172 text
173}
174
175fn token_regex() -> Regex {
176 Regex::new(
177 r#"(?i)(api[_-]?key|access[_-]?key|token|secret|authorization|password|passwd)\s*[:=]\s*[^\s,"']+"#,
178 )
179 .unwrap()
180}
181
182fn bearer_regex() -> Regex {
183 Regex::new(r#"(?i)\b(authorization:?\s*bearer)\s+[A-Za-z0-9\-._~+/=]{8,}"#).unwrap()
184}
185
186fn jwt_regex() -> Regex {
187 Regex::new(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b").unwrap()
188}
189
190fn pem_private_key_regex() -> Regex {
191 Regex::new(r"(?s)-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----")
192 .unwrap()
193}
194
195fn email_regex() -> Regex {
196 Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b").unwrap()
197}
198
199fn ip_regex() -> Regex {
200 Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap()
201}
202
203fn path_regex() -> Regex {
204 Regex::new(
205 r"(?i)(?:/Users/[^/\s]+|/home/[^/\s]+|/root/[^/\s]*|[A-Za-z]:[\\/](?:[^\\/\s]+[\\/])*[^\\/\s]+)",
206 )
207 .unwrap()
208}
209
210fn url_query_regex() -> Regex {
211 Regex::new(r"(https?://[^\s?]+)\?[^\s]+").unwrap()
212}
213
214fn user_host_regex() -> Regex {
215 Regex::new(r"\b[A-Za-z0-9._-]{2,32}@[A-Za-z0-9._-]{2,64}\b").unwrap()
216}
217
218fn host_assignment_regex() -> Regex {
219 Regex::new(r#"(?i)\b(hostname|host|user|username)\s*[:=]\s*([A-Za-z0-9._-]{2,64})"#).unwrap()
220}
221
222fn contains_high_entropy_token(text: &str) -> bool {
223 text.split(|c: char| {
224 c.is_whitespace() || matches!(c, '"' | '\'' | ',' | ';' | '(' | ')' | '[' | ']')
225 })
226 .any(is_high_entropy_token)
227}
228
229fn redact_high_entropy_tokens(text: &str, report: &mut SanitizationReport) -> String {
230 let mut out = String::with_capacity(text.len());
231 for token in text.split_inclusive(|c: char| c.is_whitespace()) {
232 let trimmed = token.trim();
233 if is_high_entropy_token(trimmed) {
234 out.push_str(&token.replace(trimmed, "[REDACTED_ENTROPY]"));
235 report.secret_redactions += 1;
236 report.total_redactions += 1;
237 } else {
238 out.push_str(token);
239 }
240 }
241 out
242}
243
/// Heuristic for "looks like an opaque credential".
///
/// A token qualifies when all of the following hold:
/// - it is at least 24 bytes long;
/// - it consists only of ASCII letters, digits and `-_~+/=`;
/// - it is not made up purely of hexadecimal digits;
/// - it either mixes upper-case, lower-case and digit characters, or is at
///   least 32 bytes long.
fn is_high_entropy_token(token: &str) -> bool {
    const MIN_LEN: usize = 24;
    const LONG_LEN: usize = 32;
    const EXTRA_SYMBOLS: &[u8] = b"-_~+/=";

    if token.len() < MIN_LEN {
        return false;
    }

    let mut only_hex = true;
    let mut saw_upper = false;
    let mut saw_lower = false;
    let mut saw_digit = false;

    // Every accepted character is ASCII, so scanning bytes is equivalent to
    // scanning chars: any byte of a multi-byte character is rejected here.
    for byte in token.bytes() {
        if !byte.is_ascii_alphanumeric() && !EXTRA_SYMBOLS.contains(&byte) {
            return false;
        }
        only_hex &= byte.is_ascii_hexdigit();
        saw_upper |= byte.is_ascii_uppercase();
        saw_lower |= byte.is_ascii_lowercase();
        saw_digit |= byte.is_ascii_digit();
    }

    if only_hex {
        return false;
    }
    (saw_upper && saw_lower && saw_digit) || token.len() >= LONG_LEN
}
262
263fn apply_gitleaks_if_available(events: &mut [CanonicalEvent], report: &mut SanitizationReport) {
264 let Some(gitleaks_bin) = find_gitleaks_binary() else {
265 return;
266 };
267
268 let temp_dir =
269 std::env::temp_dir().join(format!("trace-share-gitleaks-{}", uuid::Uuid::new_v4()));
270 if fs::create_dir_all(&temp_dir).is_err() {
271 return;
272 }
273
274 let mut file_map = Vec::new();
275 for (i, event) in events.iter().enumerate() {
276 let file_path = temp_dir.join(format!("event-{i}.txt"));
277 if fs::write(&file_path, &event.text).is_ok() {
278 file_map.push((i, file_path));
279 }
280 }
281
282 if file_map.is_empty() {
283 let _ = fs::remove_dir_all(&temp_dir);
284 return;
285 }
286
287 let report_path = temp_dir.join("gitleaks-report.json");
288 let output = Command::new(gitleaks_bin)
289 .arg("detect")
290 .arg("--no-git")
291 .arg("--source")
292 .arg(&temp_dir)
293 .arg("--report-format")
294 .arg("json")
295 .arg("--report-path")
296 .arg(&report_path)
297 .stdout(Stdio::null())
298 .stderr(Stdio::piped())
299 .output();
300
301 let Ok(output) = output else {
302 let _ = fs::remove_dir_all(&temp_dir);
303 return;
304 };
305
306 if !report_path.exists() && !output.status.success() {
308 let _ = fs::remove_dir_all(&temp_dir);
309 return;
310 }
311
312 let report_text = fs::read_to_string(&report_path).unwrap_or_default();
313 if report_text.trim().is_empty() {
314 let _ = fs::remove_dir_all(&temp_dir);
315 return;
316 }
317
318 let leaks = serde_json::from_str::<Vec<GitleaksFinding>>(&report_text).unwrap_or_default();
319 for finding in leaks {
320 if let Some(idx) = finding
321 .file
322 .as_deref()
323 .and_then(extract_event_index)
324 .filter(|idx| *idx < events.len())
325 {
326 if let Some(secret) = finding.secret.as_deref() {
327 if !secret.is_empty() && events[idx].text.contains(secret) {
328 events[idx].text = events[idx].text.replace(secret, "[REDACTED_GITLEAKS]");
329 report.secret_redactions += 1;
330 report.total_redactions += 1;
331 }
332 }
333 }
334 }
335
336 let _ = fs::remove_dir_all(&temp_dir);
337}
338
/// Recovers the event index from a path whose file name has the form
/// `event-<index>.txt`.
///
/// Only the final path component is inspected. Returns `None` when that
/// component is missing, is not valid UTF-8, does not follow the naming
/// scheme, or the index does not parse as `usize`.
fn extract_event_index(path_text: &str) -> Option<usize> {
    std::path::Path::new(path_text)
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(|name| name.strip_prefix("event-"))
        .and_then(|rest| rest.strip_suffix(".txt"))
        .and_then(|digits| digits.parse::<usize>().ok())
}
349
/// Looks for a `gitleaks` executable in the directories listed in `PATH`.
///
/// Directories are searched in `PATH` order; on Windows `gitleaks.exe` is
/// tried after the bare name in each directory. Only regular files count: a
/// directory that happens to be called `gitleaks` is skipped, so it can
/// neither be handed to the process launcher nor shadow a real binary further
/// down the search path. Returns `None` when `PATH` is unset or nothing is
/// found.
fn find_gitleaks_binary() -> Option<PathBuf> {
    let candidate_names: &[&str] = if cfg!(windows) {
        &["gitleaks", "gitleaks.exe"]
    } else {
        &["gitleaks"]
    };

    let search_path = std::env::var_os("PATH")?;
    std::env::split_paths(&search_path).find_map(|dir| {
        candidate_names
            .iter()
            .map(|name| dir.join(name))
            .find(|candidate| candidate.is_file())
    })
}
367
368#[derive(Debug, Clone, Default, Deserialize)]
369struct GitleaksFinding {
370 #[serde(rename = "File")]
371 file: Option<String>,
372 #[serde(rename = "Secret")]
373 secret: Option<String>,
374}
375
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use crate::models::CanonicalEvent;

    use super::{contains_sensitive_patterns, extract_event_index, sanitize_events};

    /// Builds a minimal user-message event carrying `text`.
    fn user_event(text: impl Into<String>) -> CanonicalEvent {
        CanonicalEvent {
            source: "x".to_string(),
            session_id: "s".to_string(),
            ts: Utc::now(),
            kind: "user_msg".to_string(),
            text: text.into(),
            tool: None,
            meta: None,
        }
    }

    // NOTE(review): `sanitize_events` also runs gitleaks when a binary is on
    // PATH, so the marker expectations below presumably assume the built-in
    // rules produced the redactions — confirm on machines with gitleaks
    // installed.

    #[test]
    fn redacts_known_patterns() {
        let input = vec![user_event(
            "token=abc123 email me at a@b.com from 127.0.0.1 /home/user/repo C:\\Users\\alice\\repo authorization: bearer ABCDEFGHIJ",
        )];

        let (sanitized, report) = sanitize_events(&input);
        let text = &sanitized[0].text;

        for marker in [
            "[REDACTED]",
            "[REDACTED_EMAIL]",
            "[REDACTED_IP]",
            "[REDACTED_PATH]",
        ] {
            assert!(text.contains(marker));
        }
        assert!(
            text.to_ascii_lowercase()
                .contains("authorization=[redacted]")
        );
        assert!(report.total_redactions >= 4);
    }

    #[test]
    fn redacts_jwt_pem_and_entropy() {
        let jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.cGF5bG9hZC12YWx1ZS0xMjM0NTY3ODkw.sigvalue1234567890ABCD";
        let pem = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC\n-----END PRIVATE KEY-----";
        let entropy = "AbCDef1234567890GhIjKlMnOpQrStUv";
        let input = vec![user_event(format!("{jwt}\n{pem}\nsecret:{entropy}"))];

        let (sanitized, _) = sanitize_events(&input);
        let out = &sanitized[0].text;

        assert!(out.contains("[REDACTED_JWT]"));
        assert!(out.contains("[REDACTED_PEM]"));
        assert!(out.contains("[REDACTED]") || out.contains("[REDACTED_ENTROPY]"));
    }

    #[test]
    fn extracts_gitleaks_event_index() {
        assert_eq!(extract_event_index("/tmp/x/event-12.txt"), Some(12));
        assert_eq!(extract_event_index("event-2.txt"), Some(2));
        assert_eq!(extract_event_index("random.txt"), None);
    }

    #[test]
    fn detects_sensitive_patterns() {
        let sensitive = [
            "token=abc123",
            "email is test@example.com",
            "visit https://x.y/z?a=1",
            "cwd C:\\Users\\evang\\work\\trace-share",
            "eyJhbGciOiJIUzI1NiJ9.abc1234567.zyx0987654",
            "-----BEGIN PRIVATE KEY-----abc-----END PRIVATE KEY-----",
        ];
        for text in sensitive {
            assert!(contains_sensitive_patterns(text));
        }

        let clean = ["clean text only", "token=[REDACTED]"];
        for text in clean {
            assert!(!contains_sensitive_patterns(text));
        }
    }
}