1use regex::Regex;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
10#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum ReplacementKind {
13 FilePath,
14 AuthorName,
15 CompanyName,
16 ApiKey,
17 Email,
18 Url,
19 Custom,
20}
21
22#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct Replacement {
25 pub original: String,
26 pub replacement: String,
27 pub kind: ReplacementKind,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct AnonymizationResult {
33 pub text: String,
34 pub replacements: Vec<Replacement>,
35}
36
/// Redacts sensitive data (keys, author names, e-mails, paths, token URLs) from free text.
///
/// `Debug` and `Clone` are derived so the type can be logged and duplicated
/// like the other public types in this file.
#[derive(Debug, Clone)]
pub struct Anonymizer {
    /// Caller-supplied regular-expression sources, kept as strings and in the
    /// order they were given.
    custom_patterns: Vec<String>,
}
41
42impl Anonymizer {
43 pub fn new() -> Self {
45 Self {
46 custom_patterns: Vec::new(),
47 }
48 }
49
50 pub fn with_custom_patterns(patterns: Vec<String>) -> Self {
52 Self {
53 custom_patterns: patterns,
54 }
55 }
56
57 pub fn anonymize(&self, text: &str) -> AnonymizationResult {
67 let mut result = text.to_string();
68 let mut replacements: Vec<Replacement> = Vec::new();
69 let mut file_counter: u32 = 0;
71 let mut author_counter: u32 = 0;
72 let mut email_counter: u32 = 0;
73 let mut seen: HashMap<String, String> = HashMap::new();
75
76 result = self.anonymize_api_keys(&result, &mut replacements, &mut seen);
78
79 result = self.anonymize_authors(
81 &result,
82 &mut replacements,
83 &mut author_counter,
84 &mut email_counter,
85 &mut seen,
86 );
87
88 result = self.anonymize_emails(&result, &mut replacements, &mut email_counter, &mut seen);
90
91 result = self.anonymize_file_paths(&result, &mut replacements, &mut file_counter, &mut seen);
93
94 result = self.anonymize_urls(&result, &mut replacements, &mut seen);
96
97 result = self.anonymize_custom(&result, &mut replacements, &mut seen);
99
100 AnonymizationResult {
101 text: result,
102 replacements,
103 }
104 }
105
106 pub fn deanonymize(text: &str, replacements: &[Replacement]) -> String {
108 let mut result = text.to_string();
109 for r in replacements.iter().rev() {
111 result = result.replace(&r.replacement, &r.original);
112 }
113 result
114 }
115
116 fn anonymize_api_keys(
119 &self,
120 text: &str,
121 replacements: &mut Vec<Replacement>,
122 seen: &mut HashMap<String, String>,
123 ) -> String {
124 let mut result = text.to_string();
125
126 let sk_re = Regex::new(r"sk-[A-Za-z0-9_-]{20,}").unwrap();
128 result = self.replace_pattern(&result, &sk_re, "[REDACTED_KEY]", ReplacementKind::ApiKey, replacements, seen);
129
130 let ghp_re = Regex::new(r"ghp_[A-Za-z0-9]{36,}").unwrap();
132 result = self.replace_pattern(&result, &ghp_re, "[REDACTED_KEY]", ReplacementKind::ApiKey, replacements, seen);
133
134 let akia_re = Regex::new(r"AKIA[A-Z0-9]{16,}").unwrap();
136 result = self.replace_pattern(&result, &akia_re, "[REDACTED_KEY]", ReplacementKind::ApiKey, replacements, seen);
137
138 let env_re = Regex::new(r#"(?i)([\w]*(?:KEY|SECRET|TOKEN|PASSWORD|CREDENTIAL)[\w]*)[\s]*=[\s]*["']?([^\s"']+)["']?"#).unwrap();
140 for caps in env_re.captures_iter(&result.clone()) {
141 let full_match = caps.get(0).unwrap().as_str().to_string();
142 let var_name = caps.get(1).unwrap().as_str();
143 if !seen.contains_key(&full_match) {
144 let replacement_text = format!("{}=[REDACTED_KEY]", var_name);
145 seen.insert(full_match.clone(), replacement_text.clone());
146 replacements.push(Replacement {
147 original: full_match.clone(),
148 replacement: replacement_text.clone(),
149 kind: ReplacementKind::ApiKey,
150 });
151 }
152 let rep = seen.get(&full_match).unwrap().clone();
153 result = result.replacen(&full_match, &rep, 1);
154 }
155
156 result
157 }
158
159 fn anonymize_authors(
160 &self,
161 text: &str,
162 replacements: &mut Vec<Replacement>,
163 author_counter: &mut u32,
164 email_counter: &mut u32,
165 seen: &mut HashMap<String, String>,
166 ) -> String {
167 let mut result = text.to_string();
168
169 let git_author_re = Regex::new(r"(Author:\s*)([^<\n]+?)\s*<([^>]+)>").unwrap();
171 for caps in git_author_re.captures_iter(&result.clone()) {
172 let prefix = caps.get(1).unwrap().as_str();
173 let name = caps.get(2).unwrap().as_str().trim().to_string();
174 let email = caps.get(3).unwrap().as_str().to_string();
175 let full_match = caps.get(0).unwrap().as_str().to_string();
176
177 let name_rep = if let Some(r) = seen.get(&name) {
178 r.clone()
179 } else {
180 *author_counter += 1;
181 let r = format!("[AUTHOR_{}]", author_counter);
182 seen.insert(name.clone(), r.clone());
183 replacements.push(Replacement {
184 original: name.clone(),
185 replacement: r.clone(),
186 kind: ReplacementKind::AuthorName,
187 });
188 r
189 };
190
191 let email_rep = if let Some(r) = seen.get(&email) {
192 r.clone()
193 } else {
194 *email_counter += 1;
195 let r = format!("[EMAIL_{}]", email_counter);
196 seen.insert(email.clone(), r.clone());
197 replacements.push(Replacement {
198 original: email.clone(),
199 replacement: r.clone(),
200 kind: ReplacementKind::Email,
201 });
202 r
203 };
204
205 let replacement_text = format!("{}{} <{}>", prefix, name_rep, email_rep);
206 result = result.replacen(&full_match, &replacement_text, 1);
207 }
208
209 let jsdoc_re = Regex::new(r"(@author\s+)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)").unwrap();
211 for caps in jsdoc_re.captures_iter(&result.clone()) {
212 let prefix = caps.get(1).unwrap().as_str();
213 let name = caps.get(2).unwrap().as_str().to_string();
214 let full_match = caps.get(0).unwrap().as_str().to_string();
215
216 let name_rep = if let Some(r) = seen.get(&name) {
217 r.clone()
218 } else {
219 *author_counter += 1;
220 let r = format!("[AUTHOR_{}]", author_counter);
221 seen.insert(name.clone(), r.clone());
222 replacements.push(Replacement {
223 original: name.clone(),
224 replacement: r.clone(),
225 kind: ReplacementKind::AuthorName,
226 });
227 r
228 };
229
230 let replacement_text = format!("{}{}", prefix, name_rep);
231 result = result.replacen(&full_match, &replacement_text, 1);
232 }
233
234 result
235 }
236
237 fn anonymize_emails(
238 &self,
239 text: &str,
240 replacements: &mut Vec<Replacement>,
241 email_counter: &mut u32,
242 seen: &mut HashMap<String, String>,
243 ) -> String {
244 let email_re = Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap();
245 let mut result = text.to_string();
246
247 for m in email_re.find_iter(&result.clone()) {
248 let email = m.as_str().to_string();
249 if !seen.contains_key(&email) {
250 *email_counter += 1;
251 let rep = format!("[EMAIL_{}]", email_counter);
252 seen.insert(email.clone(), rep.clone());
253 replacements.push(Replacement {
254 original: email.clone(),
255 replacement: rep,
256 kind: ReplacementKind::Email,
257 });
258 }
259 let rep = seen.get(&email).unwrap().clone();
260 result = result.replacen(&email, &rep, 1);
261 }
262
263 result
264 }
265
266 fn anonymize_file_paths(
267 &self,
268 text: &str,
269 replacements: &mut Vec<Replacement>,
270 file_counter: &mut u32,
271 seen: &mut HashMap<String, String>,
272 ) -> String {
273 let path_re = Regex::new(
276 r"(?:(?:/[a-zA-Z_][a-zA-Z0-9._-]*/)+[a-zA-Z0-9._-]+\.[a-zA-Z]{1,10}|\.{0,2}/(?:[a-zA-Z0-9._-]+/)+[a-zA-Z0-9._-]+\.[a-zA-Z]{1,10})"
277 ).unwrap();
278
279 let mut result = text.to_string();
280
281 for m in path_re.find_iter(&result.clone()) {
282 let path = m.as_str().to_string();
283 if seen.contains_key(&path) {
284 let rep = seen.get(&path).unwrap().clone();
285 result = result.replacen(&path, &rep, 1);
286 continue;
287 }
288
289 let ext = path
291 .rsplit('.')
292 .next()
293 .unwrap_or("txt");
294
295 *file_counter += 1;
296 let rep = format!("[PROJECT]/src/[FILE_{:03}].{}", file_counter, ext);
297 seen.insert(path.clone(), rep.clone());
298 replacements.push(Replacement {
299 original: path.clone(),
300 replacement: rep.clone(),
301 kind: ReplacementKind::FilePath,
302 });
303 result = result.replacen(&path, &rep, 1);
304 }
305
306 result
307 }
308
309 fn anonymize_urls(
310 &self,
311 text: &str,
312 replacements: &mut Vec<Replacement>,
313 seen: &mut HashMap<String, String>,
314 ) -> String {
315 let url_re = Regex::new(
317 r#"https?://[^\s<>"']+[?&](?:token|key|secret|access_token|api_key|auth)=[^\s<>"'&]+"#
318 ).unwrap();
319
320 let mut result = text.to_string();
321
322 for m in url_re.find_iter(&result.clone()) {
323 let url_str = m.as_str().to_string();
324 if seen.contains_key(&url_str) {
325 let rep = seen.get(&url_str).unwrap().clone();
326 result = result.replacen(&url_str, &rep, 1);
327 continue;
328 }
329
330 if let Ok(parsed) = url::Url::parse(&url_str) {
332 let domain = parsed.host_str().unwrap_or("unknown");
333 let path = parsed.path();
334 let rep = format!("https://{}{}?[TOKEN_REDACTED]", domain, path);
335 seen.insert(url_str.clone(), rep.clone());
336 replacements.push(Replacement {
337 original: url_str.clone(),
338 replacement: rep.clone(),
339 kind: ReplacementKind::Url,
340 });
341 result = result.replacen(&url_str, &rep, 1);
342 }
343 }
344
345 result
346 }
347
348 fn anonymize_custom(
349 &self,
350 text: &str,
351 replacements: &mut Vec<Replacement>,
352 seen: &mut HashMap<String, String>,
353 ) -> String {
354 let mut result = text.to_string();
355
356 for (i, pattern) in self.custom_patterns.iter().enumerate() {
357 if let Ok(re) = Regex::new(pattern) {
358 let placeholder = format!("[CUSTOM_{}]", i + 1);
359 result = self.replace_pattern(
360 &result,
361 &re,
362 &placeholder,
363 ReplacementKind::Custom,
364 replacements,
365 seen,
366 );
367 }
368 }
369
370 result
371 }
372
373 fn replace_pattern(
375 &self,
376 text: &str,
377 re: &Regex,
378 placeholder: &str,
379 kind: ReplacementKind,
380 replacements: &mut Vec<Replacement>,
381 seen: &mut HashMap<String, String>,
382 ) -> String {
383 let mut result = text.to_string();
384
385 for m in re.find_iter(&result.clone()) {
386 let original = m.as_str().to_string();
387 if !seen.contains_key(&original) {
388 seen.insert(original.clone(), placeholder.to_string());
389 replacements.push(Replacement {
390 original: original.clone(),
391 replacement: placeholder.to_string(),
392 kind: kind.clone(),
393 });
394 }
395 let rep = seen.get(&original).unwrap().clone();
396 result = result.replacen(&original, &rep, 1);
397 }
398
399 result
400 }
401}
402
403impl Default for Anonymizer {
404 fn default() -> Self {
405 Self::new()
406 }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs an anonymizer without custom patterns over `input`.
    fn scrub(input: &str) -> AnonymizationResult {
        Anonymizer::new().anonymize(input)
    }

    // Absolute paths become a numbered project-relative placeholder that keeps the extension.
    #[test]
    fn test_anonymize_absolute_file_paths() {
        let outcome = scrub("Error in /Users/john/project/src/main.ts at line 42");
        assert!(outcome.text.contains("[PROJECT]/src/[FILE_001].ts"));
        assert!(!outcome.text.contains("/Users/john"));
        assert!(outcome
            .replacements
            .iter()
            .any(|entry| entry.kind == ReplacementKind::FilePath));
    }

    // Paths starting with `./` are treated the same way as absolute ones.
    #[test]
    fn test_anonymize_relative_file_paths() {
        let outcome = scrub("Check ./src/components/header.tsx for issues");
        assert!(outcome.text.contains("[PROJECT]/src/[FILE_001].tsx"));
        assert!(!outcome.text.contains("./src/components/header.tsx"));
    }

    // An `Author:` header has both the name and the e-mail replaced.
    #[test]
    fn test_anonymize_git_author() {
        let outcome = scrub("Author: John Smith <john.smith@example.com>");
        assert!(outcome.text.contains("[AUTHOR_1]"));
        assert!(outcome.text.contains("[EMAIL_1]"));
        assert!(!outcome.text.contains("John Smith"));
        assert!(!outcome.text.contains("john.smith@example.com"));
    }

    // An `@author` tag has the name replaced.
    #[test]
    fn test_anonymize_jsdoc_author() {
        let outcome = scrub("/** @author Jane Doe */");
        assert!(outcome.text.contains("[AUTHOR_1]"));
        assert!(!outcome.text.contains("Jane Doe"));
    }

    // `sk-` prefixed keys are redacted.
    #[test]
    fn test_anonymize_openai_api_key() {
        let outcome = scrub("OPENAI_API_KEY=sk-abc123def456ghi789jkl012mno345pqr678");
        assert!(outcome.text.contains("[REDACTED_KEY]"));
        assert!(!outcome
            .text
            .contains("sk-abc123def456ghi789jkl012mno345pqr678"));
    }

    // `ghp_` prefixed tokens are redacted.
    #[test]
    fn test_anonymize_github_token() {
        let outcome = scrub("token: ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmn");
        assert!(outcome.text.contains("[REDACTED_KEY]"));
        assert!(!outcome.text.contains("ghp_"));
    }

    // `AKIA` prefixed keys are redacted.
    #[test]
    fn test_anonymize_aws_key() {
        let outcome = scrub("aws_access_key = AKIAIOSFODNN7EXAMPLE1");
        assert!(outcome.text.contains("[REDACTED_KEY]"));
        assert!(!outcome.text.contains("AKIAIOSFODNN7EXAMPLE1"));
    }

    // A quoted value assigned to a secret-looking variable name is redacted.
    #[test]
    fn test_anonymize_env_var_assignment() {
        let outcome = scrub(r#"DATABASE_PASSWORD="my_super_secret""#);
        assert!(outcome.text.contains("[REDACTED_KEY]"));
        assert!(!outcome.text.contains("my_super_secret"));
    }

    // An e-mail address outside any author header is still replaced.
    #[test]
    fn test_anonymize_email_standalone() {
        let outcome = scrub("Contact us at support@kardo.dev for help");
        assert!(outcome.text.contains("[EMAIL_1]"));
        assert!(!outcome.text.contains("support@kardo.dev"));
    }

    // A token in a URL query string is removed while the host stays visible.
    #[test]
    fn test_anonymize_url_with_token() {
        let outcome = scrub("Webhook: https://api.example.com/callback?token=abc123secret");
        assert!(outcome.text.contains("[TOKEN_REDACTED]"));
        assert!(!outcome.text.contains("abc123secret"));
        assert!(outcome.text.contains("api.example.com"));
    }

    // Anonymizing and then de-anonymizing yields the input again.
    #[test]
    fn test_roundtrip_deanonymize() {
        let source = "Author: Alice Johnson <alice@corp.com> modified /Users/alice/project/src/app.ts";
        let outcome = scrub(source);
        assert!(!outcome.text.contains("Alice Johnson"));
        assert!(!outcome.text.contains("alice@corp.com"));
        let round_tripped = Anonymizer::deanonymize(&outcome.text, &outcome.replacements);
        assert_eq!(round_tripped, source);
    }

    // Matches of a caller-supplied pattern are replaced by `[CUSTOM_1]`.
    #[test]
    fn test_custom_patterns() {
        let patterns = vec![r"PROJ-\d{4}".to_string()];
        let outcome = Anonymizer::with_custom_patterns(patterns)
            .anonymize("Issue PROJ-1234 is related to PROJ-5678");
        assert!(outcome.text.contains("[CUSTOM_1]"));
        assert!(!outcome.text.contains("PROJ-1234"));
    }

    // Empty input produces empty output and no replacements.
    #[test]
    fn test_empty_string() {
        let outcome = scrub("");
        assert_eq!(outcome.text, "");
        assert!(outcome.replacements.is_empty());
    }

    // Text without anything sensitive passes through unchanged.
    #[test]
    fn test_no_sensitive_data() {
        let plain = "This is a normal text with no sensitive data.";
        let outcome = scrub(plain);
        assert_eq!(outcome.text, plain);
        assert!(outcome.replacements.is_empty());
    }

    // Distinct paths get distinct, increasing file numbers.
    #[test]
    fn test_multiple_paths_increment_counter() {
        let outcome = scrub("Files: /home/user/project/src/a.ts and /home/user/project/src/b.rs");
        assert!(outcome.text.contains("[FILE_001]"));
        assert!(outcome.text.contains("[FILE_002]"));
    }

    // A repeated e-mail shares one placeholder and is recorded once.
    #[test]
    fn test_same_email_reuses_placeholder() {
        let outcome = scrub("Send to user@test.com and also user@test.com again");
        let occurrences = outcome.text.matches("[EMAIL_1]").count();
        assert_eq!(occurrences, 2);
        let recorded_emails = outcome
            .replacements
            .iter()
            .filter(|entry| entry.kind == ReplacementKind::Email)
            .count();
        assert_eq!(recorded_emails, 1);
    }
}