1use regex::Regex;
2use serde::{Deserialize, Serialize};
3use std::{
4 fs,
5 path::PathBuf,
6 process::{Command, Stdio},
7};
8
9use crate::models::CanonicalEvent;
10
11#[derive(Debug, Clone, Default, Serialize, Deserialize)]
12pub struct SanitizationReport {
13 pub total_redactions: usize,
14 pub secret_redactions: usize,
15 pub email_redactions: usize,
16 pub ip_redactions: usize,
17 pub path_redactions: usize,
18 pub sample_redacted: Vec<String>,
19}
20
21pub fn sanitize_events(events: &[CanonicalEvent]) -> (Vec<CanonicalEvent>, SanitizationReport) {
22 let mut report = SanitizationReport::default();
23 let mut out = events.to_vec();
24 apply_gitleaks_if_available(&mut out, &mut report);
25
26 for event in &mut out {
27 let before = event.text.clone();
28 event.text = redact_text(&event.text, &mut report);
29 if before != event.text && report.sample_redacted.len() < 5 {
30 report.sample_redacted.push(event.text.clone());
31 }
32 }
33
34 (out, report)
35}
36
37pub fn contains_sensitive_patterns(text: &str) -> bool {
38 let mut probe = text.to_string();
39 for marker in [
40 "[REDACTED]",
41 "[REDACTED_EMAIL]",
42 "[REDACTED_IP]",
43 "[REDACTED_PATH]",
44 "[REDACTED_QUERY]",
45 "[REDACTED_GITLEAKS]",
46 "[REDACTED_JWT]",
47 "[REDACTED_PEM]",
48 "[REDACTED_USERHOST]",
49 "[REDACTED_ENTROPY]",
50 ] {
51 probe = probe.replace(marker, "");
52 }
53
54 let token_re = token_regex();
55 let bearer_re = bearer_regex();
56 let jwt_re = jwt_regex();
57 let pem_re = pem_private_key_regex();
58 let email_re = email_regex();
59 let ip_re = ip_regex();
60 let url_query_re = url_query_regex();
61 let user_host_re = user_host_regex();
62 let host_assign_re = host_assignment_regex();
63 let path_re = path_regex();
64
65 token_re.is_match(&probe)
66 || bearer_re.is_match(&probe)
67 || jwt_re.is_match(&probe)
68 || pem_re.is_match(&probe)
69 || email_re.is_match(&probe)
70 || ip_re.is_match(&probe)
71 || user_host_re.is_match(&probe)
72 || host_assign_re.is_match(&probe)
73 || path_re.is_match(&probe)
74 || url_query_re.is_match(&probe)
75 || contains_high_entropy_token(&probe)
76}
77
78fn redact_text(input: &str, report: &mut SanitizationReport) -> String {
79 let token_re = token_regex();
80 let bearer_re = bearer_regex();
81 let jwt_re = jwt_regex();
82 let pem_re = pem_private_key_regex();
83 let email_re = email_regex();
84 let ip_re = ip_regex();
85 let path_re = path_regex();
86 let url_query_re = url_query_regex();
87 let user_host_re = user_host_regex();
88 let host_assign_re = host_assignment_regex();
89
90 let mut text = input.to_string();
91
92 let n = token_re.find_iter(&text).count();
93 if n > 0 {
94 text = token_re.replace_all(&text, "$1=[REDACTED]").to_string();
95 report.secret_redactions += n;
96 report.total_redactions += n;
97 }
98
99 let n = bearer_re.find_iter(&text).count();
100 if n > 0 {
101 text = bearer_re.replace_all(&text, "$1 [REDACTED]").to_string();
102 report.secret_redactions += n;
103 report.total_redactions += n;
104 }
105
106 let n = jwt_re.find_iter(&text).count();
107 if n > 0 {
108 text = jwt_re.replace_all(&text, "[REDACTED_JWT]").to_string();
109 report.secret_redactions += n;
110 report.total_redactions += n;
111 }
112
113 let n = pem_re.find_iter(&text).count();
114 if n > 0 {
115 text = pem_re.replace_all(&text, "[REDACTED_PEM]").to_string();
116 report.secret_redactions += n;
117 report.total_redactions += n;
118 }
119
120 let n = email_re.find_iter(&text).count();
121 if n > 0 {
122 text = email_re.replace_all(&text, "[REDACTED_EMAIL]").to_string();
123 report.email_redactions += n;
124 report.total_redactions += n;
125 }
126
127 let n = ip_re.find_iter(&text).count();
128 if n > 0 {
129 text = ip_re.replace_all(&text, "[REDACTED_IP]").to_string();
130 report.ip_redactions += n;
131 report.total_redactions += n;
132 }
133
134 let n = path_re.find_iter(&text).count();
135 if n > 0 {
136 text = path_re.replace_all(&text, "[REDACTED_PATH]").to_string();
137 report.path_redactions += n;
138 report.total_redactions += n;
139 }
140
141 let n = user_host_re.find_iter(&text).count();
142 if n > 0 {
143 text = user_host_re
144 .replace_all(&text, "[REDACTED_USERHOST]")
145 .to_string();
146 report.secret_redactions += n;
147 report.total_redactions += n;
148 }
149
150 let n = host_assign_re.find_iter(&text).count();
151 if n > 0 {
152 text = host_assign_re
153 .replace_all(&text, "$1=[REDACTED_USERHOST]")
154 .to_string();
155 report.secret_redactions += n;
156 report.total_redactions += n;
157 }
158
159 let n = url_query_re.find_iter(&text).count();
160 if n > 0 {
161 text = url_query_re
162 .replace_all(&text, "$1?[REDACTED_QUERY]")
163 .to_string();
164 report.secret_redactions += n;
165 report.total_redactions += n;
166 }
167
168 if contains_high_entropy_token(&text) {
169 text = redact_high_entropy_tokens(&text, report);
170 }
171
172 text
173}
174
175fn token_regex() -> Regex {
176 Regex::new(
177 r#"(?i)(api[_-]?key|access[_-]?key|token|secret|authorization|password|passwd)\s*[:=]\s*[^\s,"']+"#,
178 )
179 .unwrap()
180}
181
182fn bearer_regex() -> Regex {
183 Regex::new(r#"(?i)\b(authorization:?\s*bearer)\s+[A-Za-z0-9\-._~+/=]{8,}"#).unwrap()
184}
185
186fn jwt_regex() -> Regex {
187 Regex::new(r"\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b").unwrap()
188}
189
190fn pem_private_key_regex() -> Regex {
191 Regex::new(r"(?s)-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----.*?-----END [A-Z0-9 ]*PRIVATE KEY-----")
192 .unwrap()
193}
194
195fn email_regex() -> Regex {
196 Regex::new(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b").unwrap()
197}
198
199fn ip_regex() -> Regex {
200 Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap()
201}
202
203fn path_regex() -> Regex {
204 Regex::new(r"(?:/Users/[^/\s]+|/home/[^/\s]+|/root/[^/\s]*|[A-Za-z]:\\\\Users\\\\[^\\\s]+)")
205 .unwrap()
206}
207
208fn url_query_regex() -> Regex {
209 Regex::new(r"(https?://[^\s?]+)\?[^\s]+").unwrap()
210}
211
212fn user_host_regex() -> Regex {
213 Regex::new(r"\b[A-Za-z0-9._-]{2,32}@[A-Za-z0-9._-]{2,64}\b").unwrap()
214}
215
216fn host_assignment_regex() -> Regex {
217 Regex::new(r#"(?i)\b(hostname|host|user|username)\s*[:=]\s*([A-Za-z0-9._-]{2,64})"#).unwrap()
218}
219
220fn contains_high_entropy_token(text: &str) -> bool {
221 text.split(|c: char| {
222 c.is_whitespace() || matches!(c, '"' | '\'' | ',' | ';' | '(' | ')' | '[' | ']')
223 })
224 .any(is_high_entropy_token)
225}
226
227fn redact_high_entropy_tokens(text: &str, report: &mut SanitizationReport) -> String {
228 let mut out = String::with_capacity(text.len());
229 for token in text.split_inclusive(|c: char| c.is_whitespace()) {
230 let trimmed = token.trim();
231 if is_high_entropy_token(trimmed) {
232 out.push_str(&token.replace(trimmed, "[REDACTED_ENTROPY]"));
233 report.secret_redactions += 1;
234 report.total_redactions += 1;
235 } else {
236 out.push_str(token);
237 }
238 }
239 out
240}
241
/// Heuristically decides whether `token` looks like a random secret.
///
/// A token qualifies when all of the following hold:
/// * it is at least 24 bytes long;
/// * it consists only of ASCII letters, digits and `- _ ~ + / =`;
/// * it is not purely hexadecimal;
/// * it contains at least one letter or digit — runs of punctuation such as a
///   `--------…` or `========…` separator line are not secrets;
/// * it either mixes upper-case letters, lower-case letters and digits, or is
///   at least 32 bytes long.
fn is_high_entropy_token(token: &str) -> bool {
    const MIN_LEN: usize = 24;
    const LONG_LEN: usize = 32;
    const EXTRA_CHARS: &str = "-_~+/=";

    if token.len() < MIN_LEN {
        return false;
    }

    let in_alphabet = token
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || EXTRA_CHARS.contains(c));
    if !in_alphabet {
        return false;
    }

    if token.chars().all(|c| c.is_ascii_hexdigit()) {
        return false;
    }

    // Without this check any long run of `-`, `=`, `_`, `/` … passed the
    // length rule below and was redacted as a secret.
    if !token.chars().any(|c| c.is_ascii_alphanumeric()) {
        return false;
    }

    let mixed_classes = token.chars().any(|c| c.is_ascii_uppercase())
        && token.chars().any(|c| c.is_ascii_lowercase())
        && token.chars().any(|c| c.is_ascii_digit());

    mixed_classes || token.len() >= LONG_LEN
}
260
261fn apply_gitleaks_if_available(events: &mut [CanonicalEvent], report: &mut SanitizationReport) {
262 let Some(gitleaks_bin) = find_gitleaks_binary() else {
263 return;
264 };
265
266 let temp_dir =
267 std::env::temp_dir().join(format!("trace-share-gitleaks-{}", uuid::Uuid::new_v4()));
268 if fs::create_dir_all(&temp_dir).is_err() {
269 return;
270 }
271
272 let mut file_map = Vec::new();
273 for (i, event) in events.iter().enumerate() {
274 let file_path = temp_dir.join(format!("event-{i}.txt"));
275 if fs::write(&file_path, &event.text).is_ok() {
276 file_map.push((i, file_path));
277 }
278 }
279
280 if file_map.is_empty() {
281 let _ = fs::remove_dir_all(&temp_dir);
282 return;
283 }
284
285 let report_path = temp_dir.join("gitleaks-report.json");
286 let output = Command::new(gitleaks_bin)
287 .arg("detect")
288 .arg("--no-git")
289 .arg("--source")
290 .arg(&temp_dir)
291 .arg("--report-format")
292 .arg("json")
293 .arg("--report-path")
294 .arg(&report_path)
295 .stdout(Stdio::null())
296 .stderr(Stdio::piped())
297 .output();
298
299 let Ok(output) = output else {
300 let _ = fs::remove_dir_all(&temp_dir);
301 return;
302 };
303
304 if !report_path.exists() && !output.status.success() {
306 let _ = fs::remove_dir_all(&temp_dir);
307 return;
308 }
309
310 let report_text = fs::read_to_string(&report_path).unwrap_or_default();
311 if report_text.trim().is_empty() {
312 let _ = fs::remove_dir_all(&temp_dir);
313 return;
314 }
315
316 let leaks = serde_json::from_str::<Vec<GitleaksFinding>>(&report_text).unwrap_or_default();
317 for finding in leaks {
318 if let Some(idx) = finding
319 .file
320 .as_deref()
321 .and_then(extract_event_index)
322 .filter(|idx| *idx < events.len())
323 {
324 if let Some(secret) = finding.secret.as_deref() {
325 if !secret.is_empty() && events[idx].text.contains(secret) {
326 events[idx].text = events[idx].text.replace(secret, "[REDACTED_GITLEAKS]");
327 report.secret_redactions += 1;
328 report.total_redactions += 1;
329 }
330 }
331 }
332 }
333
334 let _ = fs::remove_dir_all(&temp_dir);
335}
336
/// Recovers the event index from a path whose file name has the form
/// `event-<index>.txt`.
///
/// Returns `None` when the path has no UTF-8 file name, the name does not
/// have that form, or the index does not parse as a `usize`.
fn extract_event_index(path_text: &str) -> Option<usize> {
    let path = PathBuf::from(path_text);
    path.file_name()
        .and_then(|name| name.to_str())
        .and_then(|name| name.strip_prefix("event-"))
        .and_then(|rest| rest.strip_suffix(".txt"))
        .and_then(|digits| digits.parse::<usize>().ok())
}
347
/// Searches the directories listed in `PATH`, in order, for a gitleaks
/// executable and returns the first hit.
///
/// Each directory is checked for a file named `gitleaks`; on Windows
/// `gitleaks.exe` is checked as well. Only regular files count: a *directory*
/// called `gitleaks` (for instance a source checkout) is skipped, so it
/// can no longer shadow a real binary that appears later on `PATH`.
///
/// Returns `None` when `PATH` is not set or no candidate is found.
fn find_gitleaks_binary() -> Option<PathBuf> {
    let path = std::env::var_os("PATH")?;
    std::env::split_paths(&path).find_map(|dir| {
        let candidate = dir.join("gitleaks");
        if candidate.is_file() {
            return Some(candidate);
        }
        #[cfg(windows)]
        {
            let candidate_exe = dir.join("gitleaks.exe");
            if candidate_exe.is_file() {
                return Some(candidate_exe);
            }
        }
        None
    })
}
365
366#[derive(Debug, Clone, Default, Deserialize)]
367struct GitleaksFinding {
368 #[serde(rename = "File")]
369 file: Option<String>,
370 #[serde(rename = "Secret")]
371 secret: Option<String>,
372}
373
#[cfg(test)]
mod tests {
    use chrono::Utc;

    use crate::models::CanonicalEvent;

    use super::{contains_sensitive_patterns, extract_event_index, sanitize_events};

    /// Builds a minimal user-message event whose text is `text`.
    fn user_event(text: impl Into<String>) -> CanonicalEvent {
        CanonicalEvent {
            source: "x".to_string(),
            session_id: "s".to_string(),
            ts: Utc::now(),
            kind: "user_msg".to_string(),
            text: text.into(),
            tool: None,
            meta: None,
        }
    }

    #[test]
    fn redacts_known_patterns() {
        let events = [user_event(
            "token=abc123 email me at a@b.com from 127.0.0.1 /home/user/repo authorization: bearer ABCDEFGHIJ",
        )];

        let (sanitized, report) = sanitize_events(&events);
        let redacted = &sanitized[0].text;

        for marker in [
            "[REDACTED]",
            "[REDACTED_EMAIL]",
            "[REDACTED_IP]",
            "[REDACTED_PATH]",
        ] {
            assert!(redacted.contains(marker));
        }
        let lowered = redacted.to_ascii_lowercase();
        assert!(lowered.contains("authorization=[redacted]"));
        assert!(report.total_redactions >= 4);
    }

    #[test]
    fn redacts_jwt_pem_and_entropy() {
        let jwt = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.cGF5bG9hZC12YWx1ZS0xMjM0NTY3ODkw.sigvalue1234567890ABCD";
        let pem = "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASC\n-----END PRIVATE KEY-----";
        let entropy = "AbCDef1234567890GhIjKlMnOpQrStUv";
        let events = [user_event(format!("{jwt}\n{pem}\nsecret:{entropy}"))];

        let (sanitized, _) = sanitize_events(&events);
        let redacted = &sanitized[0].text;

        assert!(redacted.contains("[REDACTED_JWT]"));
        assert!(redacted.contains("[REDACTED_PEM]"));
        let secret_gone =
            redacted.contains("[REDACTED]") || redacted.contains("[REDACTED_ENTROPY]");
        assert!(secret_gone);
    }

    #[test]
    fn extracts_gitleaks_event_index() {
        let cases = [
            ("/tmp/x/event-12.txt", Some(12)),
            ("event-2.txt", Some(2)),
            ("random.txt", None),
        ];
        for (path, expected) in cases {
            assert_eq!(extract_event_index(path), expected);
        }
    }

    #[test]
    fn detects_sensitive_patterns() {
        let flagged = [
            "token=abc123",
            "email is test@example.com",
            "visit https://x.y/z?a=1",
            "eyJhbGciOiJIUzI1NiJ9.abc1234567.zyx0987654",
            "-----BEGIN PRIVATE KEY-----abc-----END PRIVATE KEY-----",
        ];
        for text in flagged {
            assert!(contains_sensitive_patterns(text));
        }

        let clean = ["clean text only", "token=[REDACTED]"];
        for text in clean {
            assert!(!contains_sensitive_patterns(text));
        }
    }
}