Skip to main content

kardo_core/anonymize/
mod.rs

1//! Anonymization engine — strips sensitive data before sending to cloud AI.
2//!
3//! Detects and replaces file paths, author names, API keys, emails,
4//! URLs with tokens, and custom user-defined patterns.
5
6use regex::Regex;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
10/// The kind of sensitive data that was replaced.
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum ReplacementKind {
13    FilePath,
14    AuthorName,
15    CompanyName,
16    ApiKey,
17    Email,
18    Url,
19    Custom,
20}
21
22/// A single replacement made during anonymization.
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct Replacement {
25    pub original: String,
26    pub replacement: String,
27    pub kind: ReplacementKind,
28}
29
30/// Result of anonymizing text: the cleaned text and a list of replacements.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct AnonymizationResult {
33    pub text: String,
34    pub replacements: Vec<Replacement>,
35}
36
/// Main anonymizer that detects and strips sensitive data from text.
///
/// Holds only the user-defined patterns; all per-run state lives inside
/// `anonymize`, so one instance can be reused for many inputs.
#[derive(Debug, Clone)]
pub struct Anonymizer {
    /// User-supplied regex sources, applied after the built-in detectors.
    custom_patterns: Vec<String>,
}
41
42impl Anonymizer {
43    /// Create an anonymizer with no custom patterns.
44    pub fn new() -> Self {
45        Self {
46            custom_patterns: Vec::new(),
47        }
48    }
49
50    /// Create an anonymizer with additional user-defined regex patterns.
51    pub fn with_custom_patterns(patterns: Vec<String>) -> Self {
52        Self {
53            custom_patterns: patterns,
54        }
55    }
56
57    /// Anonymize text, returning the cleaned text and a replacement map.
58    ///
59    /// Applies detection in this order:
60    /// 1. API keys (highest priority — avoid partial matches)
61    /// 2. Emails
62    /// 3. Author names (`@author`, `Author: Name <email>`)
63    /// 4. File paths
64    /// 5. URLs with auth tokens
65    /// 6. Custom patterns
66    pub fn anonymize(&self, text: &str) -> AnonymizationResult {
67        let mut result = text.to_string();
68        let mut replacements: Vec<Replacement> = Vec::new();
69        // Track counters for each replacement kind
70        let mut file_counter: u32 = 0;
71        let mut author_counter: u32 = 0;
72        let mut email_counter: u32 = 0;
73        // Dedup map: original -> replacement string (so the same original always maps the same)
74        let mut seen: HashMap<String, String> = HashMap::new();
75
76        // 1. API keys
77        result = self.anonymize_api_keys(&result, &mut replacements, &mut seen);
78
79        // 2. Author names (before emails, since author lines contain emails)
80        result = self.anonymize_authors(
81            &result,
82            &mut replacements,
83            &mut author_counter,
84            &mut email_counter,
85            &mut seen,
86        );
87
88        // 3. Emails (standalone, not already caught by author)
89        result = self.anonymize_emails(&result, &mut replacements, &mut email_counter, &mut seen);
90
91        // 4. File paths
92        result = self.anonymize_file_paths(&result, &mut replacements, &mut file_counter, &mut seen);
93
94        // 5. URLs with auth tokens
95        result = self.anonymize_urls(&result, &mut replacements, &mut seen);
96
97        // 6. Custom patterns
98        result = self.anonymize_custom(&result, &mut replacements, &mut seen);
99
100        AnonymizationResult {
101            text: result,
102            replacements,
103        }
104    }
105
106    /// De-anonymize text by reversing replacements.
107    pub fn deanonymize(text: &str, replacements: &[Replacement]) -> String {
108        let mut result = text.to_string();
109        // Apply replacements in reverse order to handle overlapping placeholders
110        for r in replacements.iter().rev() {
111            result = result.replace(&r.replacement, &r.original);
112        }
113        result
114    }
115
116    // ── Private helpers ──
117
118    fn anonymize_api_keys(
119        &self,
120        text: &str,
121        replacements: &mut Vec<Replacement>,
122        seen: &mut HashMap<String, String>,
123    ) -> String {
124        let mut result = text.to_string();
125
126        // Pattern: OpenAI sk-... keys
127        let sk_re = Regex::new(r"sk-[A-Za-z0-9_-]{20,}").unwrap();
128        result = self.replace_pattern(&result, &sk_re, "[REDACTED_KEY]", ReplacementKind::ApiKey, replacements, seen);
129
130        // Pattern: GitHub personal access tokens ghp_...
131        let ghp_re = Regex::new(r"ghp_[A-Za-z0-9]{36,}").unwrap();
132        result = self.replace_pattern(&result, &ghp_re, "[REDACTED_KEY]", ReplacementKind::ApiKey, replacements, seen);
133
134        // Pattern: AWS access keys AKIA...
135        let akia_re = Regex::new(r"AKIA[A-Z0-9]{16,}").unwrap();
136        result = self.replace_pattern(&result, &akia_re, "[REDACTED_KEY]", ReplacementKind::ApiKey, replacements, seen);
137
138        // Pattern: env var assignments like API_KEY=xxx, SECRET_KEY="xxx", TOKEN='xxx'
139        let env_re = Regex::new(r#"(?i)([\w]*(?:KEY|SECRET|TOKEN|PASSWORD|CREDENTIAL)[\w]*)[\s]*=[\s]*["']?([^\s"']+)["']?"#).unwrap();
140        for caps in env_re.captures_iter(&result.clone()) {
141            let full_match = caps.get(0).unwrap().as_str().to_string();
142            let var_name = caps.get(1).unwrap().as_str();
143            if !seen.contains_key(&full_match) {
144                let replacement_text = format!("{}=[REDACTED_KEY]", var_name);
145                seen.insert(full_match.clone(), replacement_text.clone());
146                replacements.push(Replacement {
147                    original: full_match.clone(),
148                    replacement: replacement_text.clone(),
149                    kind: ReplacementKind::ApiKey,
150                });
151            }
152            let rep = seen.get(&full_match).unwrap().clone();
153            result = result.replacen(&full_match, &rep, 1);
154        }
155
156        result
157    }
158
159    fn anonymize_authors(
160        &self,
161        text: &str,
162        replacements: &mut Vec<Replacement>,
163        author_counter: &mut u32,
164        email_counter: &mut u32,
165        seen: &mut HashMap<String, String>,
166    ) -> String {
167        let mut result = text.to_string();
168
169        // Pattern: `Author: Name <email>` (git commit style)
170        let git_author_re = Regex::new(r"(Author:\s*)([^<\n]+?)\s*<([^>]+)>").unwrap();
171        for caps in git_author_re.captures_iter(&result.clone()) {
172            let prefix = caps.get(1).unwrap().as_str();
173            let name = caps.get(2).unwrap().as_str().trim().to_string();
174            let email = caps.get(3).unwrap().as_str().to_string();
175            let full_match = caps.get(0).unwrap().as_str().to_string();
176
177            let name_rep = if let Some(r) = seen.get(&name) {
178                r.clone()
179            } else {
180                *author_counter += 1;
181                let r = format!("[AUTHOR_{}]", author_counter);
182                seen.insert(name.clone(), r.clone());
183                replacements.push(Replacement {
184                    original: name.clone(),
185                    replacement: r.clone(),
186                    kind: ReplacementKind::AuthorName,
187                });
188                r
189            };
190
191            let email_rep = if let Some(r) = seen.get(&email) {
192                r.clone()
193            } else {
194                *email_counter += 1;
195                let r = format!("[EMAIL_{}]", email_counter);
196                seen.insert(email.clone(), r.clone());
197                replacements.push(Replacement {
198                    original: email.clone(),
199                    replacement: r.clone(),
200                    kind: ReplacementKind::Email,
201                });
202                r
203            };
204
205            let replacement_text = format!("{}{} <{}>", prefix, name_rep, email_rep);
206            result = result.replacen(&full_match, &replacement_text, 1);
207        }
208
209        // Pattern: `@author Name` (JSDoc style)
210        let jsdoc_re = Regex::new(r"(@author\s+)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)").unwrap();
211        for caps in jsdoc_re.captures_iter(&result.clone()) {
212            let prefix = caps.get(1).unwrap().as_str();
213            let name = caps.get(2).unwrap().as_str().to_string();
214            let full_match = caps.get(0).unwrap().as_str().to_string();
215
216            let name_rep = if let Some(r) = seen.get(&name) {
217                r.clone()
218            } else {
219                *author_counter += 1;
220                let r = format!("[AUTHOR_{}]", author_counter);
221                seen.insert(name.clone(), r.clone());
222                replacements.push(Replacement {
223                    original: name.clone(),
224                    replacement: r.clone(),
225                    kind: ReplacementKind::AuthorName,
226                });
227                r
228            };
229
230            let replacement_text = format!("{}{}", prefix, name_rep);
231            result = result.replacen(&full_match, &replacement_text, 1);
232        }
233
234        result
235    }
236
237    fn anonymize_emails(
238        &self,
239        text: &str,
240        replacements: &mut Vec<Replacement>,
241        email_counter: &mut u32,
242        seen: &mut HashMap<String, String>,
243    ) -> String {
244        let email_re = Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap();
245        let mut result = text.to_string();
246
247        for m in email_re.find_iter(&result.clone()) {
248            let email = m.as_str().to_string();
249            if !seen.contains_key(&email) {
250                *email_counter += 1;
251                let rep = format!("[EMAIL_{}]", email_counter);
252                seen.insert(email.clone(), rep.clone());
253                replacements.push(Replacement {
254                    original: email.clone(),
255                    replacement: rep,
256                    kind: ReplacementKind::Email,
257                });
258            }
259            let rep = seen.get(&email).unwrap().clone();
260            result = result.replacen(&email, &rep, 1);
261        }
262
263        result
264    }
265
266    fn anonymize_file_paths(
267        &self,
268        text: &str,
269        replacements: &mut Vec<Replacement>,
270        file_counter: &mut u32,
271        seen: &mut HashMap<String, String>,
272    ) -> String {
273        // Match absolute paths like /Users/name/project/src/file.ts
274        // or relative paths like src/components/file.tsx, ./lib/utils.ts
275        let path_re = Regex::new(
276            r"(?:(?:/[a-zA-Z_][a-zA-Z0-9._-]*/)+[a-zA-Z0-9._-]+\.[a-zA-Z]{1,10}|\.{0,2}/(?:[a-zA-Z0-9._-]+/)+[a-zA-Z0-9._-]+\.[a-zA-Z]{1,10})"
277        ).unwrap();
278
279        let mut result = text.to_string();
280
281        for m in path_re.find_iter(&result.clone()) {
282            let path = m.as_str().to_string();
283            if seen.contains_key(&path) {
284                let rep = seen.get(&path).unwrap().clone();
285                result = result.replacen(&path, &rep, 1);
286                continue;
287            }
288
289            // Extract the extension
290            let ext = path
291                .rsplit('.')
292                .next()
293                .unwrap_or("txt");
294
295            *file_counter += 1;
296            let rep = format!("[PROJECT]/src/[FILE_{:03}].{}", file_counter, ext);
297            seen.insert(path.clone(), rep.clone());
298            replacements.push(Replacement {
299                original: path.clone(),
300                replacement: rep.clone(),
301                kind: ReplacementKind::FilePath,
302            });
303            result = result.replacen(&path, &rep, 1);
304        }
305
306        result
307    }
308
309    fn anonymize_urls(
310        &self,
311        text: &str,
312        replacements: &mut Vec<Replacement>,
313        seen: &mut HashMap<String, String>,
314    ) -> String {
315        // Match URLs that contain tokens/keys in query params
316        let url_re = Regex::new(
317            r#"https?://[^\s<>"']+[?&](?:token|key|secret|access_token|api_key|auth)=[^\s<>"'&]+"#
318        ).unwrap();
319
320        let mut result = text.to_string();
321
322        for m in url_re.find_iter(&result.clone()) {
323            let url_str = m.as_str().to_string();
324            if seen.contains_key(&url_str) {
325                let rep = seen.get(&url_str).unwrap().clone();
326                result = result.replacen(&url_str, &rep, 1);
327                continue;
328            }
329
330            // Strip the token parameter but keep the domain
331            if let Ok(parsed) = url::Url::parse(&url_str) {
332                let domain = parsed.host_str().unwrap_or("unknown");
333                let path = parsed.path();
334                let rep = format!("https://{}{}?[TOKEN_REDACTED]", domain, path);
335                seen.insert(url_str.clone(), rep.clone());
336                replacements.push(Replacement {
337                    original: url_str.clone(),
338                    replacement: rep.clone(),
339                    kind: ReplacementKind::Url,
340                });
341                result = result.replacen(&url_str, &rep, 1);
342            }
343        }
344
345        result
346    }
347
348    fn anonymize_custom(
349        &self,
350        text: &str,
351        replacements: &mut Vec<Replacement>,
352        seen: &mut HashMap<String, String>,
353    ) -> String {
354        let mut result = text.to_string();
355
356        for (i, pattern) in self.custom_patterns.iter().enumerate() {
357            if let Ok(re) = Regex::new(pattern) {
358                let placeholder = format!("[CUSTOM_{}]", i + 1);
359                result = self.replace_pattern(
360                    &result,
361                    &re,
362                    &placeholder,
363                    ReplacementKind::Custom,
364                    replacements,
365                    seen,
366                );
367            }
368        }
369
370        result
371    }
372
373    /// Generic helper: replace all matches of `re` with `placeholder`, recording replacements.
374    fn replace_pattern(
375        &self,
376        text: &str,
377        re: &Regex,
378        placeholder: &str,
379        kind: ReplacementKind,
380        replacements: &mut Vec<Replacement>,
381        seen: &mut HashMap<String, String>,
382    ) -> String {
383        let mut result = text.to_string();
384
385        for m in re.find_iter(&result.clone()) {
386            let original = m.as_str().to_string();
387            if !seen.contains_key(&original) {
388                seen.insert(original.clone(), placeholder.to_string());
389                replacements.push(Replacement {
390                    original: original.clone(),
391                    replacement: placeholder.to_string(),
392                    kind: kind.clone(),
393                });
394            }
395            let rep = seen.get(&original).unwrap().clone();
396            result = result.replacen(&original, &rep, 1);
397        }
398
399        result
400    }
401}
402
403impl Default for Anonymizer {
404    fn default() -> Self {
405        Self::new()
406    }
407}
408
#[cfg(test)]
mod tests {
    //! Unit tests for the anonymizer: one test per detector, plus round-trip,
    //! de-duplication and edge-case checks.

    use super::*;

    #[test]
    fn test_anonymize_absolute_file_paths() {
        let anon = Anonymizer::new();
        let text = "Error in /Users/john/project/src/main.ts at line 42";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[PROJECT]/src/[FILE_001].ts"));
        assert!(!result.text.contains("/Users/john"));
        assert!(result.replacements.iter().any(|r| r.kind == ReplacementKind::FilePath));
    }

    #[test]
    fn test_anonymize_relative_file_paths() {
        let anon = Anonymizer::new();
        let text = "Check ./src/components/header.tsx for issues";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[PROJECT]/src/[FILE_001].tsx"));
        assert!(!result.text.contains("./src/components/header.tsx"));
    }

    #[test]
    fn test_anonymize_git_author() {
        let anon = Anonymizer::new();
        let text = "Author: John Smith <john.smith@example.com>";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[AUTHOR_1]"));
        assert!(result.text.contains("[EMAIL_1]"));
        assert!(!result.text.contains("John Smith"));
        assert!(!result.text.contains("john.smith@example.com"));
    }

    #[test]
    fn test_anonymize_jsdoc_author() {
        let anon = Anonymizer::new();
        let text = "/** @author Jane Doe */";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[AUTHOR_1]"));
        assert!(!result.text.contains("Jane Doe"));
    }

    #[test]
    fn test_anonymize_openai_api_key() {
        let anon = Anonymizer::new();
        let text = "OPENAI_API_KEY=sk-abc123def456ghi789jkl012mno345pqr678";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[REDACTED_KEY]"));
        assert!(!result.text.contains("sk-abc123def456ghi789jkl012mno345pqr678"));
    }

    #[test]
    fn test_anonymize_github_token() {
        let anon = Anonymizer::new();
        let text = "token: ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmn";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[REDACTED_KEY]"));
        assert!(!result.text.contains("ghp_"));
    }

    #[test]
    fn test_anonymize_aws_key() {
        let anon = Anonymizer::new();
        let text = "aws_access_key = AKIAIOSFODNN7EXAMPLE1";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[REDACTED_KEY]"));
        assert!(!result.text.contains("AKIAIOSFODNN7EXAMPLE1"));
    }

    #[test]
    fn test_anonymize_env_var_assignment() {
        let anon = Anonymizer::new();
        let text = r#"DATABASE_PASSWORD="my_super_secret""#;
        let result = anon.anonymize(text);
        assert!(result.text.contains("[REDACTED_KEY]"));
        assert!(!result.text.contains("my_super_secret"));
        // The variable name itself stays readable.
        assert!(result.text.contains("DATABASE_PASSWORD="));
        assert!(result.replacements.iter().any(|r| r.kind == ReplacementKind::ApiKey));
    }

    #[test]
    fn test_anonymize_email_standalone() {
        let anon = Anonymizer::new();
        let text = "Contact us at support@kardo.dev for help";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[EMAIL_1]"));
        assert!(!result.text.contains("support@kardo.dev"));
    }

    #[test]
    fn test_anonymize_url_with_token() {
        let anon = Anonymizer::new();
        let text = "Webhook: https://api.example.com/callback?token=abc123secret";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[TOKEN_REDACTED]"));
        assert!(!result.text.contains("abc123secret"));
        assert!(result.text.contains("api.example.com"));
        assert!(result.replacements.iter().any(|r| r.kind == ReplacementKind::Url));
        // The redaction must be reversible.
        let restored = Anonymizer::deanonymize(&result.text, &result.replacements);
        assert_eq!(restored, text);
    }

    #[test]
    fn test_roundtrip_deanonymize() {
        let anon = Anonymizer::new();
        let original = "Author: Alice Johnson <alice@corp.com> modified /Users/alice/project/src/app.ts";
        let result = anon.anonymize(original);
        // Ensure sensitive data is gone
        assert!(!result.text.contains("Alice Johnson"));
        assert!(!result.text.contains("alice@corp.com"));
        // Deanonymize
        let restored = Anonymizer::deanonymize(&result.text, &result.replacements);
        assert_eq!(restored, original);
    }

    #[test]
    fn test_deanonymize_without_replacements_is_identity() {
        let restored = Anonymizer::deanonymize("plain text", &[]);
        assert_eq!(restored, "plain text");
    }

    #[test]
    fn test_custom_patterns() {
        let anon = Anonymizer::with_custom_patterns(vec![
            r"PROJ-\d{4}".to_string(),
        ]);
        let text = "Issue PROJ-1234 is related to PROJ-5678";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[CUSTOM_1]"));
        assert!(!result.text.contains("PROJ-1234"));
        // Every match of the pattern is replaced, not just the first one.
        assert!(!result.text.contains("PROJ-5678"));
        assert_eq!(result.text.matches("[CUSTOM_1]").count(), 2);
    }

    #[test]
    fn test_invalid_custom_pattern_is_ignored() {
        let anon = Anonymizer::with_custom_patterns(vec!["(".to_string()]);
        let text = "nothing to see here";
        let result = anon.anonymize(text);
        assert_eq!(result.text, text);
        assert!(result.replacements.is_empty());
    }

    #[test]
    fn test_empty_string() {
        let anon = Anonymizer::new();
        let result = anon.anonymize("");
        assert_eq!(result.text, "");
        assert!(result.replacements.is_empty());
    }

    #[test]
    fn test_no_sensitive_data() {
        let anon = Anonymizer::new();
        let text = "This is a normal text with no sensitive data.";
        let result = anon.anonymize(text);
        assert_eq!(result.text, text);
        assert!(result.replacements.is_empty());
    }

    #[test]
    fn test_multiple_paths_increment_counter() {
        let anon = Anonymizer::new();
        let text = "Files: /home/user/project/src/a.ts and /home/user/project/src/b.rs";
        let result = anon.anonymize(text);
        assert!(result.text.contains("[FILE_001]"));
        assert!(result.text.contains("[FILE_002]"));
        // Each placeholder keeps the extension of the path it replaced.
        assert!(result.text.contains("[PROJECT]/src/[FILE_001].ts"));
        assert!(result.text.contains("[PROJECT]/src/[FILE_002].rs"));
    }

    #[test]
    fn test_same_email_reuses_placeholder() {
        let anon = Anonymizer::new();
        let text = "Send to user@test.com and also user@test.com again";
        let result = anon.anonymize(text);
        // Both occurrences should use the same placeholder
        let count = result.text.matches("[EMAIL_1]").count();
        assert_eq!(count, 2);
        // Only one replacement entry
        let email_replacements: Vec<_> = result.replacements.iter()
            .filter(|r| r.kind == ReplacementKind::Email)
            .collect();
        assert_eq!(email_replacements.len(), 1);
    }
}