infiniloom_engine/
security.rs

1//! Security scanning for secrets and sensitive data
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
7// Helper regex for word-boundary "example" detection (to skip documentation lines)
8static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
9    // Match "example" as a standalone word to skip documentation/tutorial content.
10    // This helps reduce false positives in example code and documentation.
11    //
12    // Note: This does NOT prevent detection of AWS keys containing "EXAMPLE" like
13    // AKIAIOSFODNN7EXAMPLE - those are detected by the AWS key pattern (RE_AWS_KEY)
14    // which runs separately. This regex is only used to skip entire lines that
15    // appear to be documentation examples (e.g., "# Example:" or "// example usage").
16    //
17    // The regex allows dots in word boundaries to handle domain examples like
18    // db.example.com without matching.
19    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)")
20        .expect("RE_EXAMPLE_WORD: invalid regex pattern")
21});
22
23// Pre-compiled regex patterns (compiled once, reused across all scanner instances)
24static RE_AWS_KEY: Lazy<Regex> =
25    Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").expect("RE_AWS_KEY: invalid regex pattern"));
26static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
27    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
28        .expect("RE_AWS_SECRET: invalid regex pattern")
29});
30// GitHub Personal Access Token (classic) - 36 alphanumeric chars after prefix
31static RE_GITHUB_PAT: Lazy<Regex> =
32    Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").expect("RE_GITHUB_PAT: invalid regex pattern"));
33// GitHub fine-grained PAT
34static RE_GITHUB_FINE_PAT: Lazy<Regex> = Lazy::new(|| {
35    Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}")
36        .expect("RE_GITHUB_FINE_PAT: invalid regex pattern")
37});
38// GitHub OAuth, user-to-server, server-to-server, and refresh tokens
39static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> = Lazy::new(|| {
40    Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").expect("RE_GITHUB_OTHER_TOKENS: invalid regex pattern")
41});
42static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
43    Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
44        .expect("RE_PRIVATE_KEY: invalid regex pattern")
45});
46static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
47    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
48        .expect("RE_API_KEY: invalid regex pattern")
49});
50static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
51    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
52        .expect("RE_SECRET_TOKEN: invalid regex pattern")
53});
54static RE_PASSWORD: Lazy<Regex> = Lazy::new(|| {
55    Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#)
56        .expect("RE_PASSWORD: invalid regex pattern")
57});
58static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
59    // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
60    Regex::new(
61        r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#,
62    )
63    .expect("RE_CONN_STRING: invalid regex pattern")
64});
65static RE_JWT: Lazy<Regex> = Lazy::new(|| {
66    Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*")
67        .expect("RE_JWT: invalid regex pattern")
68});
69static RE_SLACK: Lazy<Regex> = Lazy::new(|| {
70    Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}")
71        .expect("RE_SLACK: invalid regex pattern")
72});
73static RE_STRIPE: Lazy<Regex> = Lazy::new(|| {
74    Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}")
75        .expect("RE_STRIPE: invalid regex pattern")
76});
77// OpenAI API keys (sk-... followed by alphanumeric characters)
78static RE_OPENAI: Lazy<Regex> =
79    Lazy::new(|| Regex::new(r"sk-[A-Za-z0-9]{32,}").expect("RE_OPENAI: invalid regex pattern"));
80// Anthropic API keys (sk-ant-...)
81static RE_ANTHROPIC: Lazy<Regex> = Lazy::new(|| {
82    Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").expect("RE_ANTHROPIC: invalid regex pattern")
83});
84
85/// A detected secret or sensitive data
86#[derive(Debug, Clone)]
87pub struct SecretFinding {
88    /// Type of secret
89    pub kind: SecretKind,
90    /// File path
91    pub file: String,
92    /// Line number
93    pub line: u32,
94    /// Matched pattern (redacted)
95    pub pattern: String,
96    /// Severity level
97    pub severity: Severity,
98    /// Whether the secret was found in a comment (may be example/documentation)
99    pub in_comment: bool,
100}
101
/// Category of a detected secret.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// An API key (generic or provider-specific).
    ApiKey,
    /// An access token.
    AccessToken,
    /// A private key block.
    PrivateKey,
    /// A password assignment.
    Password,
    /// A database connection string.
    ConnectionString,
    /// AWS credentials.
    AwsCredential,
    /// A GitHub token.
    GitHubToken,
    /// Any other secret, including user-defined custom patterns.
    Generic,
}

impl SecretKind {
    /// Returns the human-readable label for this kind.
    pub fn name(&self) -> &'static str {
        match *self {
            SecretKind::ApiKey => "API Key",
            SecretKind::AccessToken => "Access Token",
            SecretKind::PrivateKey => "Private Key",
            SecretKind::Password => "Password",
            SecretKind::ConnectionString => "Connection String",
            SecretKind::AwsCredential => "AWS Credential",
            SecretKind::GitHubToken => "GitHub Token",
            SecretKind::Generic => "Generic Secret",
        }
    }
}
138
/// Severity of a finding.
///
/// Variants are declared from least to most severe, so the derived ordering
/// satisfies `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Least severe.
    Low,
    /// Moderately severe.
    Medium,
    /// Severe.
    High,
    /// Most severe.
    Critical,
}
147
148/// Security scanner
149pub struct SecurityScanner {
150    patterns: Vec<SecretPattern>,
151    custom_patterns: Vec<CustomSecretPattern>,
152    allowlist: HashSet<String>,
153}
154
155struct SecretPattern {
156    kind: SecretKind,
157    regex: &'static Lazy<Regex>,
158    severity: Severity,
159}
160
161/// Custom user-defined secret pattern
162struct CustomSecretPattern {
163    regex: Regex,
164    severity: Severity,
165}
166
167impl Default for SecurityScanner {
168    fn default() -> Self {
169        Self::new()
170    }
171}
172
173impl SecurityScanner {
174    /// Create a new security scanner with default patterns
175    /// Uses pre-compiled static regex patterns for optimal performance
176    ///
177    /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
178    /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
179    /// and redaction.
180    pub fn new() -> Self {
181        let patterns = vec![
182            // === Critical: Specific cloud credentials (most specific patterns first) ===
183            // AWS
184            SecretPattern {
185                kind: SecretKind::AwsCredential,
186                regex: &RE_AWS_KEY,
187                severity: Severity::Critical,
188            },
189            SecretPattern {
190                kind: SecretKind::AwsCredential,
191                regex: &RE_AWS_SECRET,
192                severity: Severity::Critical,
193            },
194            // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
195            SecretPattern {
196                kind: SecretKind::GitHubToken,
197                regex: &RE_GITHUB_PAT,
198                severity: Severity::Critical,
199            },
200            SecretPattern {
201                kind: SecretKind::GitHubToken,
202                regex: &RE_GITHUB_FINE_PAT,
203                severity: Severity::Critical,
204            },
205            SecretPattern {
206                kind: SecretKind::GitHubToken,
207                regex: &RE_GITHUB_OTHER_TOKENS,
208                severity: Severity::Critical,
209            },
210            // Private keys
211            SecretPattern {
212                kind: SecretKind::PrivateKey,
213                regex: &RE_PRIVATE_KEY,
214                severity: Severity::Critical,
215            },
216            // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
217            SecretPattern {
218                kind: SecretKind::ApiKey,
219                regex: &RE_ANTHROPIC,
220                severity: Severity::Critical,
221            },
222            // OpenAI API keys (must come before Stripe since sk- is more general)
223            SecretPattern {
224                kind: SecretKind::ApiKey,
225                regex: &RE_OPENAI,
226                severity: Severity::Critical,
227            },
228            // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
229            SecretPattern {
230                kind: SecretKind::ApiKey,
231                regex: &RE_STRIPE,
232                severity: Severity::Critical,
233            },
234            // === High: Specific service tokens (must come before generic patterns) ===
235            // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
236            SecretPattern {
237                kind: SecretKind::AccessToken,
238                regex: &RE_SLACK,
239                severity: Severity::High,
240            },
241            // JWT tokens (specific pattern: eyJ...eyJ...signature)
242            SecretPattern {
243                kind: SecretKind::AccessToken,
244                regex: &RE_JWT,
245                severity: Severity::High,
246            },
247            // Connection strings (specific pattern: mongodb://, postgres://, etc.)
248            SecretPattern {
249                kind: SecretKind::ConnectionString,
250                regex: &RE_CONN_STRING,
251                severity: Severity::High,
252            },
253            // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
254            // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
255            SecretPattern {
256                kind: SecretKind::ApiKey,
257                regex: &RE_API_KEY,
258                severity: Severity::High,
259            },
260            // Generic secrets (matches secret=xxx, token=xxx, etc.)
261            SecretPattern {
262                kind: SecretKind::Generic,
263                regex: &RE_SECRET_TOKEN,
264                severity: Severity::High,
265            },
266            // Passwords
267            SecretPattern {
268                kind: SecretKind::Password,
269                regex: &RE_PASSWORD,
270                severity: Severity::High,
271            },
272        ];
273
274        Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
275    }
276
277    /// Add a pattern to allowlist
278    pub fn allowlist(&mut self, pattern: &str) {
279        self.allowlist.insert(pattern.to_owned());
280    }
281
282    /// Add a custom regex pattern for secret detection
283    ///
284    /// Custom patterns are matched as generic secrets with High severity.
285    /// Invalid regex patterns are silently ignored.
286    ///
287    /// # Example
288    /// ```
289    /// use infiniloom_engine::security::SecurityScanner;
290    ///
291    /// let mut scanner = SecurityScanner::new();
292    /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}");
293    /// ```
294    pub fn add_custom_pattern(&mut self, pattern: &str) {
295        if let Ok(regex) = Regex::new(pattern) {
296            self.custom_patterns
297                .push(CustomSecretPattern { regex, severity: Severity::High });
298        }
299    }
300
301    /// Add multiple custom patterns at once
302    pub fn add_custom_patterns(&mut self, patterns: &[String]) {
303        for pattern in patterns {
304            self.add_custom_pattern(pattern);
305        }
306    }
307
308    /// Scan content for secrets
309    pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
310        let mut findings = Vec::new();
311
312        for (line_num, line) in content.lines().enumerate() {
313            let trimmed = line.trim();
314
315            // Detect if line is likely a comment - skip entirely to reduce false positives
316            // Real secrets shouldn't be in comments anyway
317            let is_jsdoc_continuation =
318                trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
319            let is_comment = trimmed.starts_with("//")
320                || trimmed.starts_with('#')
321                || trimmed.starts_with("/*")
322                || trimmed.starts_with("*")
323                || is_jsdoc_continuation;
324
325            // Skip obvious false positives (example docs, placeholders, comments)
326            let is_obvious_false_positive = is_comment
327                || RE_EXAMPLE_WORD.is_match(trimmed)
328                || trimmed.to_lowercase().contains("placeholder")
329                || trimmed.contains("xxxxx");
330
331            if is_obvious_false_positive {
332                continue;
333            }
334
335            for pattern in &self.patterns {
336                // Use find_iter to catch ALL matches on a line, not just the first
337                for m in pattern.regex.find_iter(line) {
338                    let matched = m.as_str();
339
340                    // Check allowlist
341                    if self.allowlist.iter().any(|a| matched.contains(a)) {
342                        continue;
343                    }
344
345                    findings.push(SecretFinding {
346                        kind: pattern.kind,
347                        file: file_path.to_owned(),
348                        line: (line_num + 1) as u32,
349                        pattern: redact(matched),
350                        severity: pattern.severity,
351                        in_comment: false, // Non-comment lines only now
352                    });
353                }
354            }
355
356            // Check custom patterns
357            for custom in &self.custom_patterns {
358                for m in custom.regex.find_iter(line) {
359                    let matched = m.as_str();
360
361                    // Check allowlist
362                    if self.allowlist.iter().any(|a| matched.contains(a)) {
363                        continue;
364                    }
365
366                    findings.push(SecretFinding {
367                        kind: SecretKind::Generic,
368                        file: file_path.to_owned(),
369                        line: (line_num + 1) as u32,
370                        pattern: redact(matched),
371                        severity: custom.severity,
372                        in_comment: false,
373                    });
374                }
375            }
376        }
377
378        findings
379    }
380
381    /// Scan a file and return whether it's safe to include
382    pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
383        let findings = self.scan(content, file_path);
384        findings.iter().all(|f| f.severity < Severity::High)
385    }
386
387    /// Get summary of findings
388    pub fn summarize(findings: &[SecretFinding]) -> String {
389        if findings.is_empty() {
390            return "No secrets detected".to_owned();
391        }
392
393        let critical = findings
394            .iter()
395            .filter(|f| f.severity == Severity::Critical)
396            .count();
397        let high = findings
398            .iter()
399            .filter(|f| f.severity == Severity::High)
400            .count();
401
402        format!(
403            "Found {} potential secrets ({} critical, {} high severity)",
404            findings.len(),
405            critical,
406            high
407        )
408    }
409
410    /// Redact secrets from content, returning the redacted content
411    /// This replaces detected secrets with redacted versions in the actual content
412    pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
413        let mut result = content.to_owned();
414
415        for (line_num, line) in content.lines().enumerate() {
416            let trimmed = line.trim();
417
418            // Skip obvious false positives (example docs, placeholders)
419            let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
420                || trimmed.to_lowercase().contains("placeholder")
421                || trimmed.contains("xxxxx");
422
423            if is_obvious_false_positive {
424                continue;
425            }
426
427            for pattern in &self.patterns {
428                // Use find_iter to catch ALL matches on a line, not just the first
429                for m in pattern.regex.find_iter(line) {
430                    let matched = m.as_str();
431
432                    // Check allowlist
433                    if self.allowlist.iter().any(|a| matched.contains(a)) {
434                        continue;
435                    }
436
437                    // Only redact high severity and above
438                    if pattern.severity >= Severity::High {
439                        let redacted = redact(matched);
440                        // Replace in result - use line number to find the right occurrence
441                        let line_start = result
442                            .lines()
443                            .take(line_num)
444                            .map(|l| l.len() + 1)
445                            .sum::<usize>();
446                        if let Some(pos) = result[line_start..].find(matched) {
447                            let abs_pos = line_start + pos;
448                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
449                        }
450                    }
451                }
452            }
453
454            // Check custom patterns for redaction
455            for custom in &self.custom_patterns {
456                for m in custom.regex.find_iter(line) {
457                    let matched = m.as_str();
458
459                    // Check allowlist
460                    if self.allowlist.iter().any(|a| matched.contains(a)) {
461                        continue;
462                    }
463
464                    // Only redact high severity and above
465                    if custom.severity >= Severity::High {
466                        let redacted = redact(matched);
467                        let line_start = result
468                            .lines()
469                            .take(line_num)
470                            .map(|l| l.len() + 1)
471                            .sum::<usize>();
472                        if let Some(pos) = result[line_start..].find(matched) {
473                            let abs_pos = line_start + pos;
474                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
475                        }
476                    }
477                }
478            }
479        }
480
481        result
482    }
483
484    /// Scan and redact all secrets from content.
485    ///
486    /// Returns a tuple of (redacted_content, findings) where:
487    /// - `redacted_content` has all detected secrets replaced with `[REDACTED]`
488    /// - `findings` is a list of all detected secrets with metadata
489    ///
490    /// # Important
491    ///
492    /// Always check the findings list to understand what was redacted and whether
493    /// the file should be excluded from context entirely.
494    #[must_use = "security findings should be reviewed"]
495    pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
496        let findings = self.scan(content, file_path);
497        let redacted = self.redact_content(content, file_path);
498        (redacted, findings)
499    }
500}
501
/// Mask a matched secret for display.
///
/// Inputs of at most 8 characters are replaced entirely by `*`. Longer inputs
/// keep `min(4, len / 4)` characters at each end and mask everything between.
///
/// All lengths are counted in characters, not bytes, so the function never
/// slices inside a multi-byte UTF-8 sequence. Matches can contain non-ASCII
/// text (for example password values), and byte-based slicing would panic on
/// them.
fn redact(s: &str) -> String {
    let total = s.chars().count();
    if total <= 8 {
        return "*".repeat(total);
    }

    // Characters kept on each side: a quarter of the input, capped at four.
    let keep = (total / 4).min(4);

    // Byte offsets where the kept prefix ends and the kept suffix begins;
    // `char_indices` guarantees both fall on character boundaries.
    let prefix_end = s.char_indices().nth(keep).map_or(s.len(), |(offset, _)| offset);
    let suffix_start = s
        .char_indices()
        .nth(total - keep)
        .map_or(s.len(), |(offset, _)| offset);

    format!(
        "{}{}{}",
        &s[..prefix_end],
        "*".repeat(total - 2 * keep),
        &s[suffix_start..]
    )
}
518
#[cfg(test)]
mod tests {
    use super::*;

    /// Scan `content` with a freshly constructed default scanner.
    fn scan_with_defaults(content: &str, path: &str) -> Vec<SecretFinding> {
        SecurityScanner::new().scan(content, path)
    }

    /// True when at least one finding has the given kind.
    fn has_kind(findings: &[SecretFinding], kind: SecretKind) -> bool {
        findings.iter().any(|f| f.kind == kind)
    }

    #[test]
    fn test_aws_key_detection() {
        let findings =
            scan_with_defaults(r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#, "config.py");

        assert!(!findings.is_empty());
        assert!(has_kind(&findings, SecretKind::AwsCredential));
    }

    #[test]
    fn test_github_token_detection() {
        let findings = scan_with_defaults(
            r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#,
            ".env",
        );

        assert!(!findings.is_empty());
        assert!(has_kind(&findings, SecretKind::GitHubToken));
    }

    #[test]
    fn test_private_key_detection() {
        let findings =
            scan_with_defaults("-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...", "key.pem");

        assert!(!findings.is_empty());
        assert!(has_kind(&findings, SecretKind::PrivateKey));
    }

    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        let findings = scanner.scan(r#"api_key = "AKIAIOSFODNN7EXAMPLE""#, "test.py");

        assert!(findings.is_empty());
    }

    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    #[test]
    fn test_comments_are_skipped() {
        let findings = scan_with_defaults(
            "# api_key = 'some_secret_key_12345678901234567890'",
            "test.py",
        );

        // Comment lines are dropped wholesale to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    #[test]
    fn test_non_comment_detected() {
        let findings = scan_with_defaults(
            "api_key = 'some_secret_key_12345678901234567890'",
            "test.py",
        );

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            !findings.iter().any(|f| f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}");

        let findings = scanner.scan("my_secret = CUSTOM_SECRET_ABCD1234EFGH5678", "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(has_kind(&findings, SecretKind::Generic));
    }

    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        let custom = [
            r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
            r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
        ];
        scanner.add_custom_patterns(&custom);

        let findings =
            scanner.scan("key = MYAPP_KEY_0123456789abcdef0123456789abcdef", "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    #[test]
    fn test_invalid_custom_pattern_ignored() {
        let mut scanner = SecurityScanner::new();
        // Unclosed bracket: not a valid regex
        scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // Must not panic; the invalid pattern was never registered
        let _findings = scanner.scan("INVALID_[PATTERN here", "test.py");
    }
}