infiniloom_engine/
security.rs

1//! Security scanning for secrets and sensitive data
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
7// Helper regex for word-boundary "example" detection (to skip documentation lines)
8static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
9    // Match "example" as a standalone word to skip documentation/tutorial content.
10    // This helps reduce false positives in example code and documentation.
11    //
12    // Note: This does NOT prevent detection of AWS keys containing "EXAMPLE" like
13    // AKIAIOSFODNN7EXAMPLE - those are detected by the AWS key pattern (RE_AWS_KEY)
14    // which runs separately. This regex is only used to skip entire lines that
15    // appear to be documentation examples (e.g., "# Example:" or "// example usage").
16    //
17    // The regex allows dots in word boundaries to handle domain examples like
18    // db.example.com without matching.
19    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)").unwrap()
20});
21
22// Pre-compiled regex patterns (compiled once, reused across all scanner instances)
23static RE_AWS_KEY: Lazy<Regex> = Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").unwrap());
24static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
25    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
26        .unwrap()
27});
28// GitHub Personal Access Token (classic) - 36 alphanumeric chars after prefix
29static RE_GITHUB_PAT: Lazy<Regex> = Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").unwrap());
30// GitHub fine-grained PAT
31static RE_GITHUB_FINE_PAT: Lazy<Regex> =
32    Lazy::new(|| Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}").unwrap());
33// GitHub OAuth, user-to-server, server-to-server, and refresh tokens
34static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> =
35    Lazy::new(|| Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").unwrap());
36static RE_PRIVATE_KEY: Lazy<Regex> =
37    Lazy::new(|| Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----").unwrap());
38static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
39    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#).unwrap()
40});
41static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
42    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#).unwrap()
43});
44static RE_PASSWORD: Lazy<Regex> =
45    Lazy::new(|| Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#).unwrap());
46static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
47    // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
48    Regex::new(r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#)
49        .unwrap()
50});
51static RE_JWT: Lazy<Regex> =
52    Lazy::new(|| Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*").unwrap());
53static RE_SLACK: Lazy<Regex> =
54    Lazy::new(|| Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}").unwrap());
55static RE_STRIPE: Lazy<Regex> =
56    Lazy::new(|| Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}").unwrap());
57// OpenAI API keys (sk-... followed by alphanumeric characters)
58static RE_OPENAI: Lazy<Regex> =
59    Lazy::new(|| Regex::new(r"sk-[A-Za-z0-9]{32,}").unwrap());
60// Anthropic API keys (sk-ant-...)
61static RE_ANTHROPIC: Lazy<Regex> =
62    Lazy::new(|| Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").unwrap());
63
64/// A detected secret or sensitive data
65#[derive(Debug, Clone)]
66pub struct SecretFinding {
67    /// Type of secret
68    pub kind: SecretKind,
69    /// File path
70    pub file: String,
71    /// Line number
72    pub line: u32,
73    /// Matched pattern (redacted)
74    pub pattern: String,
75    /// Severity level
76    pub severity: Severity,
77    /// Whether the secret was found in a comment (may be example/documentation)
78    pub in_comment: bool,
79}
80
/// Category of a detected secret.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// Service API key (generic `api_key=...` assignments and vendor-specific keys).
    ApiKey,
    /// Bearer-style access token (for example Slack tokens or JWTs).
    AccessToken,
    /// PEM-encoded private key.
    PrivateKey,
    /// Password assignment.
    Password,
    /// Database connection string.
    ConnectionString,
    /// AWS access key id or secret access key.
    AwsCredential,
    /// Any of the GitHub token formats.
    GitHubToken,
    /// Anything else: generic `secret=` / `token=` assignments and custom patterns.
    Generic,
}

impl SecretKind {
    /// Human-readable label for this kind, suitable for reports.
    pub fn name(&self) -> &'static str {
        match *self {
            SecretKind::ApiKey => "API Key",
            SecretKind::AccessToken => "Access Token",
            SecretKind::PrivateKey => "Private Key",
            SecretKind::Password => "Password",
            SecretKind::ConnectionString => "Connection String",
            SecretKind::AwsCredential => "AWS Credential",
            SecretKind::GitHubToken => "GitHub Token",
            SecretKind::Generic => "Generic Secret",
        }
    }
}
117
/// Severity of a finding.
///
/// Variants are declared from least to most severe; the derived `PartialOrd`/`Ord`
/// follow declaration order, so comparisons such as `severity >= Severity::High`
/// depend on this ordering being kept.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Least severe level.
    Low,
    /// Above `Low`, below `High`.
    Medium,
    /// Findings at this level or above are redacted and make a file unsafe.
    High,
    /// Most severe level.
    Critical,
}
126
127/// Security scanner
128pub struct SecurityScanner {
129    patterns: Vec<SecretPattern>,
130    custom_patterns: Vec<CustomSecretPattern>,
131    allowlist: HashSet<String>,
132}
133
134struct SecretPattern {
135    kind: SecretKind,
136    regex: &'static Lazy<Regex>,
137    severity: Severity,
138}
139
140/// Custom user-defined secret pattern
141struct CustomSecretPattern {
142    regex: Regex,
143    severity: Severity,
144}
145
146impl Default for SecurityScanner {
147    fn default() -> Self {
148        Self::new()
149    }
150}
151
152impl SecurityScanner {
153    /// Create a new security scanner with default patterns
154    /// Uses pre-compiled static regex patterns for optimal performance
155    ///
156    /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
157    /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
158    /// and redaction.
159    pub fn new() -> Self {
160        let patterns = vec![
161            // === Critical: Specific cloud credentials (most specific patterns first) ===
162            // AWS
163            SecretPattern {
164                kind: SecretKind::AwsCredential,
165                regex: &RE_AWS_KEY,
166                severity: Severity::Critical,
167            },
168            SecretPattern {
169                kind: SecretKind::AwsCredential,
170                regex: &RE_AWS_SECRET,
171                severity: Severity::Critical,
172            },
173            // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
174            SecretPattern {
175                kind: SecretKind::GitHubToken,
176                regex: &RE_GITHUB_PAT,
177                severity: Severity::Critical,
178            },
179            SecretPattern {
180                kind: SecretKind::GitHubToken,
181                regex: &RE_GITHUB_FINE_PAT,
182                severity: Severity::Critical,
183            },
184            SecretPattern {
185                kind: SecretKind::GitHubToken,
186                regex: &RE_GITHUB_OTHER_TOKENS,
187                severity: Severity::Critical,
188            },
189            // Private keys
190            SecretPattern {
191                kind: SecretKind::PrivateKey,
192                regex: &RE_PRIVATE_KEY,
193                severity: Severity::Critical,
194            },
195            // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
196            SecretPattern {
197                kind: SecretKind::ApiKey,
198                regex: &RE_ANTHROPIC,
199                severity: Severity::Critical,
200            },
201            // OpenAI API keys (must come before Stripe since sk- is more general)
202            SecretPattern {
203                kind: SecretKind::ApiKey,
204                regex: &RE_OPENAI,
205                severity: Severity::Critical,
206            },
207            // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
208            SecretPattern {
209                kind: SecretKind::ApiKey,
210                regex: &RE_STRIPE,
211                severity: Severity::Critical,
212            },
213            // === High: Specific service tokens (must come before generic patterns) ===
214            // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
215            SecretPattern {
216                kind: SecretKind::AccessToken,
217                regex: &RE_SLACK,
218                severity: Severity::High,
219            },
220            // JWT tokens (specific pattern: eyJ...eyJ...signature)
221            SecretPattern {
222                kind: SecretKind::AccessToken,
223                regex: &RE_JWT,
224                severity: Severity::High,
225            },
226            // Connection strings (specific pattern: mongodb://, postgres://, etc.)
227            SecretPattern {
228                kind: SecretKind::ConnectionString,
229                regex: &RE_CONN_STRING,
230                severity: Severity::High,
231            },
232            // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
233            // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
234            SecretPattern {
235                kind: SecretKind::ApiKey,
236                regex: &RE_API_KEY,
237                severity: Severity::High,
238            },
239            // Generic secrets (matches secret=xxx, token=xxx, etc.)
240            SecretPattern {
241                kind: SecretKind::Generic,
242                regex: &RE_SECRET_TOKEN,
243                severity: Severity::High,
244            },
245            // Passwords
246            SecretPattern {
247                kind: SecretKind::Password,
248                regex: &RE_PASSWORD,
249                severity: Severity::High,
250            },
251        ];
252
253        Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
254    }
255
256    /// Add a pattern to allowlist
257    pub fn allowlist(&mut self, pattern: &str) {
258        self.allowlist.insert(pattern.to_owned());
259    }
260
261    /// Add a custom regex pattern for secret detection
262    ///
263    /// Custom patterns are matched as generic secrets with High severity.
264    /// Invalid regex patterns are silently ignored.
265    ///
266    /// # Example
267    /// ```
268    /// use infiniloom_engine::security::SecurityScanner;
269    ///
270    /// let mut scanner = SecurityScanner::new();
271    /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}");
272    /// ```
273    pub fn add_custom_pattern(&mut self, pattern: &str) {
274        if let Ok(regex) = Regex::new(pattern) {
275            self.custom_patterns
276                .push(CustomSecretPattern { regex, severity: Severity::High });
277        }
278    }
279
280    /// Add multiple custom patterns at once
281    pub fn add_custom_patterns(&mut self, patterns: &[String]) {
282        for pattern in patterns {
283            self.add_custom_pattern(pattern);
284        }
285    }
286
287    /// Scan content for secrets
288    pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
289        let mut findings = Vec::new();
290
291        for (line_num, line) in content.lines().enumerate() {
292            let trimmed = line.trim();
293
294            // Detect if line is likely a comment - skip entirely to reduce false positives
295            // Real secrets shouldn't be in comments anyway
296            let is_jsdoc_continuation =
297                trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
298            let is_comment = trimmed.starts_with("//")
299                || trimmed.starts_with('#')
300                || trimmed.starts_with("/*")
301                || trimmed.starts_with("*")
302                || is_jsdoc_continuation;
303
304            // Skip obvious false positives (example docs, placeholders, comments)
305            let is_obvious_false_positive = is_comment
306                || RE_EXAMPLE_WORD.is_match(trimmed)
307                || trimmed.to_lowercase().contains("placeholder")
308                || trimmed.contains("xxxxx");
309
310            if is_obvious_false_positive {
311                continue;
312            }
313
314            for pattern in &self.patterns {
315                // Use find_iter to catch ALL matches on a line, not just the first
316                for m in pattern.regex.find_iter(line) {
317                    let matched = m.as_str();
318
319                    // Check allowlist
320                    if self.allowlist.iter().any(|a| matched.contains(a)) {
321                        continue;
322                    }
323
324                    findings.push(SecretFinding {
325                        kind: pattern.kind,
326                        file: file_path.to_owned(),
327                        line: (line_num + 1) as u32,
328                        pattern: redact(matched),
329                        severity: pattern.severity,
330                        in_comment: false, // Non-comment lines only now
331                    });
332                }
333            }
334
335            // Check custom patterns
336            for custom in &self.custom_patterns {
337                for m in custom.regex.find_iter(line) {
338                    let matched = m.as_str();
339
340                    // Check allowlist
341                    if self.allowlist.iter().any(|a| matched.contains(a)) {
342                        continue;
343                    }
344
345                    findings.push(SecretFinding {
346                        kind: SecretKind::Generic,
347                        file: file_path.to_owned(),
348                        line: (line_num + 1) as u32,
349                        pattern: redact(matched),
350                        severity: custom.severity,
351                        in_comment: false,
352                    });
353                }
354            }
355        }
356
357        findings
358    }
359
360    /// Scan a file and return whether it's safe to include
361    pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
362        let findings = self.scan(content, file_path);
363        findings.iter().all(|f| f.severity < Severity::High)
364    }
365
366    /// Get summary of findings
367    pub fn summarize(findings: &[SecretFinding]) -> String {
368        if findings.is_empty() {
369            return "No secrets detected".to_owned();
370        }
371
372        let critical = findings
373            .iter()
374            .filter(|f| f.severity == Severity::Critical)
375            .count();
376        let high = findings
377            .iter()
378            .filter(|f| f.severity == Severity::High)
379            .count();
380
381        format!(
382            "Found {} potential secrets ({} critical, {} high severity)",
383            findings.len(),
384            critical,
385            high
386        )
387    }
388
389    /// Redact secrets from content, returning the redacted content
390    /// This replaces detected secrets with redacted versions in the actual content
391    pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
392        let mut result = content.to_owned();
393
394        for (line_num, line) in content.lines().enumerate() {
395            let trimmed = line.trim();
396
397            // Skip obvious false positives (example docs, placeholders)
398            let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
399                || trimmed.to_lowercase().contains("placeholder")
400                || trimmed.contains("xxxxx");
401
402            if is_obvious_false_positive {
403                continue;
404            }
405
406            for pattern in &self.patterns {
407                // Use find_iter to catch ALL matches on a line, not just the first
408                for m in pattern.regex.find_iter(line) {
409                    let matched = m.as_str();
410
411                    // Check allowlist
412                    if self.allowlist.iter().any(|a| matched.contains(a)) {
413                        continue;
414                    }
415
416                    // Only redact high severity and above
417                    if pattern.severity >= Severity::High {
418                        let redacted = redact(matched);
419                        // Replace in result - use line number to find the right occurrence
420                        let line_start = result
421                            .lines()
422                            .take(line_num)
423                            .map(|l| l.len() + 1)
424                            .sum::<usize>();
425                        if let Some(pos) = result[line_start..].find(matched) {
426                            let abs_pos = line_start + pos;
427                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
428                        }
429                    }
430                }
431            }
432
433            // Check custom patterns for redaction
434            for custom in &self.custom_patterns {
435                for m in custom.regex.find_iter(line) {
436                    let matched = m.as_str();
437
438                    // Check allowlist
439                    if self.allowlist.iter().any(|a| matched.contains(a)) {
440                        continue;
441                    }
442
443                    // Only redact high severity and above
444                    if custom.severity >= Severity::High {
445                        let redacted = redact(matched);
446                        let line_start = result
447                            .lines()
448                            .take(line_num)
449                            .map(|l| l.len() + 1)
450                            .sum::<usize>();
451                        if let Some(pos) = result[line_start..].find(matched) {
452                            let abs_pos = line_start + pos;
453                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
454                        }
455                    }
456                }
457            }
458        }
459
460        result
461    }
462
463    /// Scan and redact all secrets from content.
464    ///
465    /// Returns a tuple of (redacted_content, findings) where:
466    /// - `redacted_content` has all detected secrets replaced with `[REDACTED]`
467    /// - `findings` is a list of all detected secrets with metadata
468    ///
469    /// # Important
470    ///
471    /// Always check the findings list to understand what was redacted and whether
472    /// the file should be excluded from context entirely.
473    #[must_use = "security findings should be reviewed"]
474    pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
475        let findings = self.scan(content, file_path);
476        let redacted = self.redact_content(content, file_path);
477        (redacted, findings)
478    }
479}
480
/// Mask a matched secret for display.
///
/// Strings of up to 8 characters are replaced entirely by `*`. Longer strings keep
/// their first and last `min(4, len / 4)` characters and have everything in between
/// replaced by `*`, one `*` per masked character.
///
/// Lengths are counted in characters rather than bytes, so matches containing
/// non-ASCII text (for example a password with accented or CJK characters) are
/// never sliced in the middle of a UTF-8 sequence. For ASCII input, characters and
/// bytes coincide.
fn redact(s: &str) -> String {
    let total = s.chars().count();
    if total <= 8 {
        return "*".repeat(total);
    }

    // total > 8, so `keep` is at least 2 and `2 * keep` never exceeds `total`.
    let keep = 4.min(total / 4);
    let tail_start = total - keep;

    let mut masked = String::with_capacity(s.len());
    for (idx, ch) in s.chars().enumerate() {
        if idx < keep || idx >= tail_start {
            masked.push(ch);
        } else {
            masked.push('*');
        }
    }
    masked
}
497
/// Unit tests for the scanner: built-in detectors, allowlisting, comment skipping,
/// custom patterns and redaction.
#[cfg(test)]
mod tests {
    use super::*;

    /// An AWS access key id is reported even though it ends in "EXAMPLE".
    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
    }

    /// A classic GitHub personal access token is reported as a GitHub token.
    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
    }

    /// A PEM private-key header is reported.
    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::PrivateKey));
    }

    /// Matches containing an allowlisted substring are suppressed.
    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    /// Long strings keep 4 characters on each side; short ones are fully masked.
    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    /// Lines that start like a comment are not scanned.
    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    /// The same assignment outside a comment is reported, with `in_comment == false`.
    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    /// A custom pattern produces a `Generic` finding.
    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}");

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(findings.iter().any(|f| f.kind == SecretKind::Generic));
    }

    /// Several custom patterns can be registered in one call.
    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_patterns(&[
            r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
            r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
        ]);

        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    /// An invalid regex is dropped instead of panicking or being registered.
    #[test]
    fn test_invalid_custom_pattern_ignored() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        scanner.add_custom_pattern(r"INVALID_[PATTERN");

        assert!(
            scanner.custom_patterns.is_empty(),
            "Invalid patterns must not be registered"
        );

        // Should not panic, and nothing in this text matches a built-in pattern
        let content = "INVALID_[PATTERN here";
        let findings = scanner.scan(content, "test.py");
        assert!(findings.is_empty());
    }

    /// Redaction removes the raw secret while leaving the surrounding text intact.
    #[test]
    fn test_redact_content_masks_secret() {
        let scanner = SecurityScanner::new();
        let secret = "ghp_abcdefghijklmnopqrstuvwxyz1234567890";
        let content = format!("GITHUB_TOKEN = \"{}\"", secret);

        let redacted = scanner.redact_content(&content, ".env");

        assert!(!redacted.contains(secret), "raw secret must not survive redaction");
        assert!(redacted.starts_with("GITHUB_TOKEN = \"ghp_"));
        assert!(redacted.ends_with("7890\""));
    }
}