infiniloom_engine/
security.rs

1//! Security scanning for secrets and sensitive data
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
// Helper regex for word-boundary "example" detection (to skip documentation lines)
static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
    // Match "example" (case-insensitively) as a standalone word so that whole
    // lines of documentation/tutorial content can be skipped, e.g. "# Example:"
    // or "// example usage".
    //
    // "Standalone" means the character on each side is either the start/end of
    // the input or something other than an ASCII letter, digit, or dot:
    //   - AWS-style sample keys such as AKIAIOSFODNN7EXAMPLE do NOT match,
    //     because "EXAMPLE" is directly preceded by an alphanumeric character.
    //     Lines containing only such a key are therefore not skipped and are
    //     still checked against the secret patterns.
    //   - Dots count as word characters, so domain names such as
    //     db.example.com do NOT match either.
    //   - Underscores and hyphens are NOT word characters here, so identifiers
    //     such as `example_key` or `my-example` DO match.
    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)")
        .expect("RE_EXAMPLE_WORD: invalid regex pattern")
});
22
// Pre-compiled regex patterns (each is built lazily on first use and then
// shared by every scanner instance through a `&'static` reference).

// AWS access key ID: the literal prefix "AKIA" followed by exactly 16
// uppercase letters or digits.
static RE_AWS_KEY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").expect("RE_AWS_KEY: invalid regex pattern"));
// AWS secret access key assignment: a case-insensitive
// `aws_secret_access_key` name (separators optional), then `:` or `=`, then a
// 40-character value. The value is captured in group 1.
static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
        .expect("RE_AWS_SECRET: invalid regex pattern")
});
// GitHub Personal Access Token (classic) - 36 alphanumeric chars after prefix
static RE_GITHUB_PAT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"ghp_[A-Za-z0-9]{36}").expect("RE_GITHUB_PAT: invalid regex pattern")
});
// GitHub fine-grained PAT: `github_pat_`, 22 alphanumerics, `_`, 59 alphanumerics
static RE_GITHUB_FINE_PAT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}")
        .expect("RE_GITHUB_FINE_PAT: invalid regex pattern")
});
// GitHub OAuth, user-to-server, server-to-server, and refresh tokens
// (prefixes gho_, ghu_, ghs_, ghr_) with at least 36 alphanumeric characters.
static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").expect("RE_GITHUB_OTHER_TOKENS: invalid regex pattern")
});
// PEM private key header, with an optional RSA/EC/DSA/OPENSSH algorithm label.
// Only the BEGIN line is matched, not the key body.
// NOTE(review): headers such as "-----BEGIN ENCRYPTED PRIVATE KEY-----" are not
// covered by the optional label list - confirm whether that is intended.
static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
        .expect("RE_PRIVATE_KEY: invalid regex pattern")
});
// Generic API key assignment: `api_key`, `api-key` or `apikey`
// (case-insensitive), then `:` or `=`, then a value of at least 20 characters
// from [A-Za-z0-9_-]. The value is captured in group 1.
static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_API_KEY: invalid regex pattern")
});
// Generic secret/token assignment: a name ending in `secret` or `token`
// (case-insensitive), then `:` or `=`, then a value of at least 20 characters
// from [A-Za-z0-9_-]. The value is captured in group 1.
static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_SECRET_TOKEN: invalid regex pattern")
});
// Password assignment: `password` (case-insensitive), then `:` or `=`, then at
// least 8 characters that are not quotes or whitespace. The value class is a
// negated set, so it can include non-ASCII characters.
static RE_PASSWORD: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#)
        .expect("RE_PASSWORD: invalid regex pattern")
});
// Database connection URL: a known scheme followed by `://` and everything up
// to the next whitespace or quote character.
static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
    // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
    Regex::new(
        r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#,
    )
    .expect("RE_CONN_STRING: invalid regex pattern")
});
// JSON Web Token shape: three dot-separated base64url segments where the first
// two start with "eyJ". Every segment body uses `*`, so segments may be empty
// after the "eyJ" prefix (the third segment may be entirely empty).
static RE_JWT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*")
        .expect("RE_JWT: invalid regex pattern")
});
// Slack token: `xox` + one of b/a/p/r/s, two numeric groups of 10-13 digits,
// and a 24-character alphanumeric tail, all separated by `-`.
static RE_SLACK: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}")
        .expect("RE_SLACK: invalid regex pattern")
});
// Stripe key: `sk_` or `pk_`, then `test_` or `live_`, then at least 24
// alphanumeric characters.
static RE_STRIPE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}")
        .expect("RE_STRIPE: invalid regex pattern")
});
// OpenAI API keys (sk-... followed by alphanumeric characters)
// NOTE(review): the value class excludes `-` and `_`, so keys containing those
// characters after the `sk-` prefix would only match if a 32+ alphanumeric run
// follows `sk-` directly - confirm this covers the key formats of interest.
static RE_OPENAI: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"sk-[A-Za-z0-9]{32,}").expect("RE_OPENAI: invalid regex pattern")
});
// Anthropic API keys (sk-ant-...), at least 40 alphanumeric or `-` characters
// after the prefix.
static RE_ANTHROPIC: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").expect("RE_ANTHROPIC: invalid regex pattern")
});
86
/// A detected secret or sensitive data
///
/// One value is produced per regex match that survives the allowlist check in
/// [`SecurityScanner::scan`].
#[derive(Debug, Clone)]
pub struct SecretFinding {
    /// Type of secret
    pub kind: SecretKind,
    /// File path, copied verbatim from the `file_path` argument given to the scanner
    pub file: String,
    /// Line number (1-based) of the line on which the match was found
    pub line: u32,
    /// Matched text after masking by `redact` (never the raw secret)
    pub pattern: String,
    /// Severity level
    pub severity: Severity,
    /// Whether the secret was found in a comment (may be example/documentation).
    ///
    /// The scanner in this file skips comment lines entirely, so it always
    /// sets this to `false`.
    pub in_comment: bool,
}
103
/// Category assigned to a detected secret.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// An API key (generic or service-specific)
    ApiKey,
    /// An access token
    AccessToken,
    /// A private key
    PrivateKey,
    /// A password
    Password,
    /// A database connection string
    ConnectionString,
    /// AWS credentials
    AwsCredential,
    /// A GitHub token
    GitHubToken,
    /// A secret that fits no more specific category
    Generic,
}

impl SecretKind {
    /// Returns the display label for this kind of secret.
    pub fn name(&self) -> &'static str {
        // Matching on the dereferenced value keeps the arms free of `Self::`
        // noise; the match stays exhaustive so a new variant fails to compile
        // until it is given a label.
        match *self {
            SecretKind::ApiKey => "API Key",
            SecretKind::AccessToken => "Access Token",
            SecretKind::PrivateKey => "Private Key",
            SecretKind::Password => "Password",
            SecretKind::ConnectionString => "Connection String",
            SecretKind::AwsCredential => "AWS Credential",
            SecretKind::GitHubToken => "GitHub Token",
            SecretKind::Generic => "Generic Secret",
        }
    }
}
140
/// Severity level
///
/// Variants are declared from least to most severe; the derived `Ord`
/// follows declaration order, so `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Lowest severity
    Low,
    /// Below the `High` threshold used by `is_safe` and redaction
    Medium,
    /// Threshold at which content is treated as unsafe and matches are redacted
    High,
    /// Highest severity
    Critical,
}
149
/// Security scanner
///
/// Holds the built-in detection patterns, any user-supplied custom patterns,
/// and an allowlist of substrings that suppress a match.
pub struct SecurityScanner {
    // Built-in patterns, checked in order for every scanned line.
    patterns: Vec<SecretPattern>,
    // User-defined patterns, checked after the built-in ones.
    custom_patterns: Vec<CustomSecretPattern>,
    // A match is ignored when its text contains any of these substrings.
    allowlist: HashSet<String>,
}
156
/// Built-in detection rule: a shared static regex plus the kind and severity
/// reported for each of its matches.
struct SecretPattern {
    // Category recorded on findings produced by this rule.
    kind: SecretKind,
    // Reference to one of the module-level lazily-initialised regexes.
    regex: &'static Lazy<Regex>,
    // Severity recorded on findings produced by this rule.
    severity: Severity,
}
162
/// Custom user-defined secret pattern
///
/// Unlike the built-in rules, the regex is owned by the scanner instance.
/// Matches are reported as `SecretKind::Generic`.
struct CustomSecretPattern {
    // Compiled user-supplied regex.
    regex: Regex,
    // Severity recorded on findings produced by this pattern.
    severity: Severity,
}
168
169impl Default for SecurityScanner {
170    fn default() -> Self {
171        Self::new()
172    }
173}
174
175impl SecurityScanner {
176    /// Create a new security scanner with default patterns
177    /// Uses pre-compiled static regex patterns for optimal performance
178    ///
179    /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
180    /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
181    /// and redaction.
182    pub fn new() -> Self {
183        let patterns = vec![
184            // === Critical: Specific cloud credentials (most specific patterns first) ===
185            // AWS
186            SecretPattern {
187                kind: SecretKind::AwsCredential,
188                regex: &RE_AWS_KEY,
189                severity: Severity::Critical,
190            },
191            SecretPattern {
192                kind: SecretKind::AwsCredential,
193                regex: &RE_AWS_SECRET,
194                severity: Severity::Critical,
195            },
196            // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
197            SecretPattern {
198                kind: SecretKind::GitHubToken,
199                regex: &RE_GITHUB_PAT,
200                severity: Severity::Critical,
201            },
202            SecretPattern {
203                kind: SecretKind::GitHubToken,
204                regex: &RE_GITHUB_FINE_PAT,
205                severity: Severity::Critical,
206            },
207            SecretPattern {
208                kind: SecretKind::GitHubToken,
209                regex: &RE_GITHUB_OTHER_TOKENS,
210                severity: Severity::Critical,
211            },
212            // Private keys
213            SecretPattern {
214                kind: SecretKind::PrivateKey,
215                regex: &RE_PRIVATE_KEY,
216                severity: Severity::Critical,
217            },
218            // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
219            SecretPattern {
220                kind: SecretKind::ApiKey,
221                regex: &RE_ANTHROPIC,
222                severity: Severity::Critical,
223            },
224            // OpenAI API keys (must come before Stripe since sk- is more general)
225            SecretPattern {
226                kind: SecretKind::ApiKey,
227                regex: &RE_OPENAI,
228                severity: Severity::Critical,
229            },
230            // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
231            SecretPattern {
232                kind: SecretKind::ApiKey,
233                regex: &RE_STRIPE,
234                severity: Severity::Critical,
235            },
236            // === High: Specific service tokens (must come before generic patterns) ===
237            // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
238            SecretPattern {
239                kind: SecretKind::AccessToken,
240                regex: &RE_SLACK,
241                severity: Severity::High,
242            },
243            // JWT tokens (specific pattern: eyJ...eyJ...signature)
244            SecretPattern {
245                kind: SecretKind::AccessToken,
246                regex: &RE_JWT,
247                severity: Severity::High,
248            },
249            // Connection strings (specific pattern: mongodb://, postgres://, etc.)
250            SecretPattern {
251                kind: SecretKind::ConnectionString,
252                regex: &RE_CONN_STRING,
253                severity: Severity::High,
254            },
255            // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
256            // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
257            SecretPattern {
258                kind: SecretKind::ApiKey,
259                regex: &RE_API_KEY,
260                severity: Severity::High,
261            },
262            // Generic secrets (matches secret=xxx, token=xxx, etc.)
263            SecretPattern {
264                kind: SecretKind::Generic,
265                regex: &RE_SECRET_TOKEN,
266                severity: Severity::High,
267            },
268            // Passwords
269            SecretPattern {
270                kind: SecretKind::Password,
271                regex: &RE_PASSWORD,
272                severity: Severity::High,
273            },
274        ];
275
276        Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
277    }
278
279    /// Add a pattern to allowlist
280    pub fn allowlist(&mut self, pattern: &str) {
281        self.allowlist.insert(pattern.to_owned());
282    }
283
284    /// Add a custom regex pattern for secret detection
285    ///
286    /// Custom patterns are matched as generic secrets with High severity.
287    /// Invalid regex patterns are silently ignored.
288    ///
289    /// # Example
290    /// ```
291    /// use infiniloom_engine::security::SecurityScanner;
292    ///
293    /// let mut scanner = SecurityScanner::new();
294    /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}");
295    /// ```
296    pub fn add_custom_pattern(&mut self, pattern: &str) {
297        if let Ok(regex) = Regex::new(pattern) {
298            self.custom_patterns
299                .push(CustomSecretPattern { regex, severity: Severity::High });
300        }
301    }
302
303    /// Add multiple custom patterns at once
304    pub fn add_custom_patterns(&mut self, patterns: &[String]) {
305        for pattern in patterns {
306            self.add_custom_pattern(pattern);
307        }
308    }
309
310    /// Scan content for secrets
311    pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
312        let mut findings = Vec::new();
313
314        for (line_num, line) in content.lines().enumerate() {
315            let trimmed = line.trim();
316
317            // Detect if line is likely a comment - skip entirely to reduce false positives
318            // Real secrets shouldn't be in comments anyway
319            let is_jsdoc_continuation =
320                trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
321            let is_comment = trimmed.starts_with("//")
322                || trimmed.starts_with('#')
323                || trimmed.starts_with("/*")
324                || trimmed.starts_with("*")
325                || is_jsdoc_continuation;
326
327            // Skip obvious false positives (example docs, placeholders, comments)
328            let is_obvious_false_positive = is_comment
329                || RE_EXAMPLE_WORD.is_match(trimmed)
330                || trimmed.to_lowercase().contains("placeholder")
331                || trimmed.contains("xxxxx");
332
333            if is_obvious_false_positive {
334                continue;
335            }
336
337            for pattern in &self.patterns {
338                // Use find_iter to catch ALL matches on a line, not just the first
339                for m in pattern.regex.find_iter(line) {
340                    let matched = m.as_str();
341
342                    // Check allowlist
343                    if self.allowlist.iter().any(|a| matched.contains(a)) {
344                        continue;
345                    }
346
347                    findings.push(SecretFinding {
348                        kind: pattern.kind,
349                        file: file_path.to_owned(),
350                        line: (line_num + 1) as u32,
351                        pattern: redact(matched),
352                        severity: pattern.severity,
353                        in_comment: false, // Non-comment lines only now
354                    });
355                }
356            }
357
358            // Check custom patterns
359            for custom in &self.custom_patterns {
360                for m in custom.regex.find_iter(line) {
361                    let matched = m.as_str();
362
363                    // Check allowlist
364                    if self.allowlist.iter().any(|a| matched.contains(a)) {
365                        continue;
366                    }
367
368                    findings.push(SecretFinding {
369                        kind: SecretKind::Generic,
370                        file: file_path.to_owned(),
371                        line: (line_num + 1) as u32,
372                        pattern: redact(matched),
373                        severity: custom.severity,
374                        in_comment: false,
375                    });
376                }
377            }
378        }
379
380        findings
381    }
382
383    /// Scan a file and return whether it's safe to include
384    pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
385        let findings = self.scan(content, file_path);
386        findings.iter().all(|f| f.severity < Severity::High)
387    }
388
389    /// Get summary of findings
390    pub fn summarize(findings: &[SecretFinding]) -> String {
391        if findings.is_empty() {
392            return "No secrets detected".to_owned();
393        }
394
395        let critical = findings
396            .iter()
397            .filter(|f| f.severity == Severity::Critical)
398            .count();
399        let high = findings
400            .iter()
401            .filter(|f| f.severity == Severity::High)
402            .count();
403
404        format!(
405            "Found {} potential secrets ({} critical, {} high severity)",
406            findings.len(),
407            critical,
408            high
409        )
410    }
411
412    /// Redact secrets from content, returning the redacted content
413    /// This replaces detected secrets with redacted versions in the actual content
414    pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
415        let mut result = content.to_owned();
416
417        for (line_num, line) in content.lines().enumerate() {
418            let trimmed = line.trim();
419
420            // Skip obvious false positives (example docs, placeholders)
421            let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
422                || trimmed.to_lowercase().contains("placeholder")
423                || trimmed.contains("xxxxx");
424
425            if is_obvious_false_positive {
426                continue;
427            }
428
429            for pattern in &self.patterns {
430                // Use find_iter to catch ALL matches on a line, not just the first
431                for m in pattern.regex.find_iter(line) {
432                    let matched = m.as_str();
433
434                    // Check allowlist
435                    if self.allowlist.iter().any(|a| matched.contains(a)) {
436                        continue;
437                    }
438
439                    // Only redact high severity and above
440                    if pattern.severity >= Severity::High {
441                        let redacted = redact(matched);
442                        // Replace in result - use line number to find the right occurrence
443                        let line_start = result
444                            .lines()
445                            .take(line_num)
446                            .map(|l| l.len() + 1)
447                            .sum::<usize>();
448                        if let Some(pos) = result[line_start..].find(matched) {
449                            let abs_pos = line_start + pos;
450                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
451                        }
452                    }
453                }
454            }
455
456            // Check custom patterns for redaction
457            for custom in &self.custom_patterns {
458                for m in custom.regex.find_iter(line) {
459                    let matched = m.as_str();
460
461                    // Check allowlist
462                    if self.allowlist.iter().any(|a| matched.contains(a)) {
463                        continue;
464                    }
465
466                    // Only redact high severity and above
467                    if custom.severity >= Severity::High {
468                        let redacted = redact(matched);
469                        let line_start = result
470                            .lines()
471                            .take(line_num)
472                            .map(|l| l.len() + 1)
473                            .sum::<usize>();
474                        if let Some(pos) = result[line_start..].find(matched) {
475                            let abs_pos = line_start + pos;
476                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
477                        }
478                    }
479                }
480            }
481        }
482
483        result
484    }
485
    /// Scan and redact secrets from content.
    ///
    /// Returns a tuple of (redacted_content, findings) where:
    /// - `redacted_content` is the result of `redact_content`: detected secrets
    ///   are masked with `*` characters (a few leading and trailing characters
    ///   of longer matches are kept), not replaced by a fixed marker
    /// - `findings` is the result of `scan` on the original content
    ///
    /// The two passes are independent. In particular, `scan` skips comment
    /// lines while `redact_content` does not, so the redacted content may mask
    /// text for which there is no corresponding finding.
    ///
    /// # Important
    ///
    /// Always check the findings list to understand what was redacted and whether
    /// the file should be excluded from context entirely.
    #[must_use = "security findings should be reviewed"]
    pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
        let findings = self.scan(content, file_path);
        let redacted = self.redact_content(content, file_path);
        (redacted, findings)
    }
502}
503
/// Redact a matched secret for display
///
/// Strings of at most 8 characters are replaced entirely by `*`. Longer
/// strings keep `min(4, len / 4)` characters at each end and have everything
/// in between replaced by one `*` per character.
///
/// Lengths are counted in characters rather than bytes, and slicing happens
/// only at character boundaries, so input containing multibyte UTF-8 (for
/// example a non-ASCII password) cannot cause a slicing panic. For pure ASCII
/// input the result is identical to byte-based masking.
fn redact(s: &str) -> String {
    let char_count = s.chars().count();
    if char_count <= 8 {
        return "*".repeat(char_count);
    }

    // char_count >= 9 here, so `keep` is at least 2.
    let keep = 4.min(char_count / 4);

    // Byte offset just past the `keep`-th character from the start.
    let prefix_end = s.char_indices().nth(keep).map_or(s.len(), |(idx, _)| idx);
    // Byte offset of the `keep`-th character from the end.
    let suffix_start = s
        .char_indices()
        .rev()
        .nth(keep - 1)
        .map_or(0, |(idx, _)| idx);

    format!(
        "{}{}{}",
        &s[..prefix_end],
        "*".repeat(char_count - 2 * keep),
        &s[suffix_start..]
    )
}
520
// Unit tests for the built-in patterns, the allowlist, masking, comment
// skipping, and custom patterns.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        // "EXAMPLE" is preceded by an alphanumeric character here, so the
        // example-line filter does not skip this line.
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
    }

    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
    }

    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        // Only the BEGIN header line needs to be present for detection.
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::PrivateKey));
    }

    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        // Both the AWS key match and the generic api_key match contain
        // "EXAMPLE", so every match on this line is suppressed.
        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    #[test]
    fn test_redact() {
        // Long input keeps 4 characters at each end; short input is fully masked.
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}");

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(findings.iter().any(|f| f.kind == SecretKind::Generic));
    }

    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_patterns(&[
            r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
            r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
        ]);

        // Only the first of the two custom patterns matches this content.
        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    #[test]
    fn test_invalid_custom_pattern_ignored() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // Should not panic, invalid patterns are ignored
        let content = "INVALID_[PATTERN here";
        let _findings = scanner.scan(content, "test.py");
    }
}