infiniloom_engine/
security.rs

1//! Security scanning for secrets and sensitive data
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5use std::collections::HashSet;
6
// Helper regex that detects "example" as a standalone word, case-insensitively.
// Lines on which it matches are treated as documentation and skipped by the scanner.
static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
    // "example" counts as standalone when it is delimited on each side by the
    // start/end of the text or by a character that is not an ASCII letter,
    // digit, or dot (e.g. "# Example:" or "// example usage").
    //
    // Letters and digits are excluded as delimiters so that the word embedded
    // in a longer token does not match: in AKIAIOSFODNN7EXAMPLE the word is
    // preceded by '7', so the line is not skipped and the AWS key pattern
    // (RE_AWS_KEY) can still report it.
    //
    // Dots are excluded as delimiters so that host names such as
    // db.example.com do not cause the whole line to be skipped.
    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)").unwrap()
});
21
22// Pre-compiled regex patterns (compiled once, reused across all scanner instances)
23static RE_AWS_KEY: Lazy<Regex> = Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").unwrap());
24static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
25    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
26        .unwrap()
27});
28static RE_GITHUB_PAT: Lazy<Regex> = Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").unwrap());
29static RE_GITHUB_FINE_PAT: Lazy<Regex> =
30    Lazy::new(|| Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}").unwrap());
31static RE_PRIVATE_KEY: Lazy<Regex> =
32    Lazy::new(|| Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----").unwrap());
33static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
34    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#).unwrap()
35});
36static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
37    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#).unwrap()
38});
39static RE_PASSWORD: Lazy<Regex> =
40    Lazy::new(|| Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#).unwrap());
41static RE_CONN_STRING: Lazy<Regex> =
42    Lazy::new(|| Regex::new(r#"(?i)(?:mongodb|postgres|mysql|redis)://[^\s'"]+"#).unwrap());
43static RE_JWT: Lazy<Regex> =
44    Lazy::new(|| Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*").unwrap());
45static RE_SLACK: Lazy<Regex> =
46    Lazy::new(|| Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}").unwrap());
47static RE_STRIPE: Lazy<Regex> =
48    Lazy::new(|| Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}").unwrap());
49
/// A single secret-looking match reported by [`SecurityScanner::scan`].
#[derive(Debug, Clone)]
pub struct SecretFinding {
    /// Category of the pattern that produced the match
    pub kind: SecretKind,
    /// File path, copied verbatim from the `file_path` argument of the scan
    pub file: String,
    /// 1-based line number of the match within the scanned content
    pub line: u32,
    /// The matched text after masking (never the raw secret)
    pub pattern: String,
    /// Severity assigned by the pattern that produced the match
    pub severity: Severity,
    /// Whether the secret was found in a comment (may be example/documentation).
    /// The scanner in this file skips comment-like lines, so it always sets this to `false`.
    pub in_comment: bool,
}
66
/// Category of a detected secret.
///
/// Several built-in patterns can share one kind (for example Stripe keys and
/// generic `api_key = ...` assignments are both reported as [`SecretKind::ApiKey`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// API key (Stripe keys and generic `api_key` assignments)
    ApiKey,
    /// Access token (Slack tokens and JWTs)
    AccessToken,
    /// Private key (PEM header)
    PrivateKey,
    /// Password assignment
    Password,
    /// Database connection string
    ConnectionString,
    /// AWS credentials (access key ID or secret access key)
    AwsCredential,
    /// GitHub token
    GitHubToken,
    /// Generic secret (`secret`/`token` assignments and all custom patterns)
    Generic,
}
87
88impl SecretKind {
89    /// Get human-readable name
90    pub fn name(&self) -> &'static str {
91        match self {
92            Self::ApiKey => "API Key",
93            Self::AccessToken => "Access Token",
94            Self::PrivateKey => "Private Key",
95            Self::Password => "Password",
96            Self::ConnectionString => "Connection String",
97            Self::AwsCredential => "AWS Credential",
98            Self::GitHubToken => "GitHub Token",
99            Self::Generic => "Generic Secret",
100        }
101    }
102}
103
/// Severity level of a finding.
///
/// Variants are declared from least to most severe, so the derived ordering
/// gives `Low < Medium < High < Critical`; comparisons such as
/// `severity >= Severity::High` rely on that declaration order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Lowest severity
    Low,
    /// Below the `High` threshold used by `is_safe` and redaction
    Medium,
    /// Makes a file unsafe to include; matches at this level are redacted
    High,
    /// Highest severity (specific cloud/service credentials and private keys)
    Critical,
}
112
/// Security scanner that searches text line by line for secret-looking matches.
pub struct SecurityScanner {
    // Built-in patterns, tried in order (specific patterns before generic ones)
    patterns: Vec<SecretPattern>,
    // User-supplied patterns, tried after the built-in ones
    custom_patterns: Vec<CustomSecretPattern>,
    // Substrings that suppress a match when they occur inside the matched text
    allowlist: HashSet<String>,
}
119
// A built-in detection rule backed by one of the pre-compiled static regexes.
struct SecretPattern {
    // Kind reported for matches of this rule
    kind: SecretKind,
    // Shared, lazily compiled regex
    regex: &'static Lazy<Regex>,
    // Severity reported for matches of this rule
    severity: Severity,
}
125
/// Custom user-defined secret pattern; matches are reported as `SecretKind::Generic`.
struct CustomSecretPattern {
    // Regex compiled from the user's pattern string
    regex: Regex,
    // Severity reported for matches of this pattern
    severity: Severity,
}
131
132impl Default for SecurityScanner {
133    fn default() -> Self {
134        Self::new()
135    }
136}
137
138impl SecurityScanner {
139    /// Create a new security scanner with default patterns
140    /// Uses pre-compiled static regex patterns for optimal performance
141    ///
142    /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
143    /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
144    /// and redaction.
145    pub fn new() -> Self {
146        let patterns = vec![
147            // === Critical: Specific cloud credentials (most specific patterns first) ===
148            // AWS
149            SecretPattern {
150                kind: SecretKind::AwsCredential,
151                regex: &RE_AWS_KEY,
152                severity: Severity::Critical,
153            },
154            SecretPattern {
155                kind: SecretKind::AwsCredential,
156                regex: &RE_AWS_SECRET,
157                severity: Severity::Critical,
158            },
159            // GitHub
160            SecretPattern {
161                kind: SecretKind::GitHubToken,
162                regex: &RE_GITHUB_PAT,
163                severity: Severity::Critical,
164            },
165            SecretPattern {
166                kind: SecretKind::GitHubToken,
167                regex: &RE_GITHUB_FINE_PAT,
168                severity: Severity::Critical,
169            },
170            // Private keys
171            SecretPattern {
172                kind: SecretKind::PrivateKey,
173                regex: &RE_PRIVATE_KEY,
174                severity: Severity::Critical,
175            },
176            // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
177            SecretPattern {
178                kind: SecretKind::ApiKey,
179                regex: &RE_STRIPE,
180                severity: Severity::Critical,
181            },
182            // === High: Specific service tokens (must come before generic patterns) ===
183            // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
184            SecretPattern {
185                kind: SecretKind::AccessToken,
186                regex: &RE_SLACK,
187                severity: Severity::High,
188            },
189            // JWT tokens (specific pattern: eyJ...eyJ...signature)
190            SecretPattern {
191                kind: SecretKind::AccessToken,
192                regex: &RE_JWT,
193                severity: Severity::High,
194            },
195            // Connection strings (specific pattern: mongodb://, postgres://, etc.)
196            SecretPattern {
197                kind: SecretKind::ConnectionString,
198                regex: &RE_CONN_STRING,
199                severity: Severity::High,
200            },
201            // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
202            // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
203            SecretPattern {
204                kind: SecretKind::ApiKey,
205                regex: &RE_API_KEY,
206                severity: Severity::High,
207            },
208            // Generic secrets (matches secret=xxx, token=xxx, etc.)
209            SecretPattern {
210                kind: SecretKind::Generic,
211                regex: &RE_SECRET_TOKEN,
212                severity: Severity::High,
213            },
214            // Passwords
215            SecretPattern {
216                kind: SecretKind::Password,
217                regex: &RE_PASSWORD,
218                severity: Severity::High,
219            },
220        ];
221
222        Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
223    }
224
225    /// Add a pattern to allowlist
226    pub fn allowlist(&mut self, pattern: &str) {
227        self.allowlist.insert(pattern.to_owned());
228    }
229
230    /// Add a custom regex pattern for secret detection
231    ///
232    /// Custom patterns are matched as generic secrets with High severity.
233    /// Invalid regex patterns are silently ignored.
234    ///
235    /// # Example
236    /// ```
237    /// use infiniloom_engine::security::SecurityScanner;
238    ///
239    /// let mut scanner = SecurityScanner::new();
240    /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}");
241    /// ```
242    pub fn add_custom_pattern(&mut self, pattern: &str) {
243        if let Ok(regex) = Regex::new(pattern) {
244            self.custom_patterns
245                .push(CustomSecretPattern { regex, severity: Severity::High });
246        }
247    }
248
249    /// Add multiple custom patterns at once
250    pub fn add_custom_patterns(&mut self, patterns: &[String]) {
251        for pattern in patterns {
252            self.add_custom_pattern(pattern);
253        }
254    }
255
256    /// Scan content for secrets
257    pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
258        let mut findings = Vec::new();
259
260        for (line_num, line) in content.lines().enumerate() {
261            let trimmed = line.trim();
262
263            // Detect if line is likely a comment - skip entirely to reduce false positives
264            // Real secrets shouldn't be in comments anyway
265            let is_jsdoc_continuation =
266                trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
267            let is_comment = trimmed.starts_with("//")
268                || trimmed.starts_with('#')
269                || trimmed.starts_with("/*")
270                || trimmed.starts_with("*")
271                || is_jsdoc_continuation;
272
273            // Skip obvious false positives (example docs, placeholders, comments)
274            let is_obvious_false_positive = is_comment
275                || RE_EXAMPLE_WORD.is_match(trimmed)
276                || trimmed.to_lowercase().contains("placeholder")
277                || trimmed.contains("xxxxx");
278
279            if is_obvious_false_positive {
280                continue;
281            }
282
283            for pattern in &self.patterns {
284                // Use find_iter to catch ALL matches on a line, not just the first
285                for m in pattern.regex.find_iter(line) {
286                    let matched = m.as_str();
287
288                    // Check allowlist
289                    if self.allowlist.iter().any(|a| matched.contains(a)) {
290                        continue;
291                    }
292
293                    findings.push(SecretFinding {
294                        kind: pattern.kind,
295                        file: file_path.to_owned(),
296                        line: (line_num + 1) as u32,
297                        pattern: redact(matched),
298                        severity: pattern.severity,
299                        in_comment: false, // Non-comment lines only now
300                    });
301                }
302            }
303
304            // Check custom patterns
305            for custom in &self.custom_patterns {
306                for m in custom.regex.find_iter(line) {
307                    let matched = m.as_str();
308
309                    // Check allowlist
310                    if self.allowlist.iter().any(|a| matched.contains(a)) {
311                        continue;
312                    }
313
314                    findings.push(SecretFinding {
315                        kind: SecretKind::Generic,
316                        file: file_path.to_owned(),
317                        line: (line_num + 1) as u32,
318                        pattern: redact(matched),
319                        severity: custom.severity,
320                        in_comment: false,
321                    });
322                }
323            }
324        }
325
326        findings
327    }
328
329    /// Scan a file and return whether it's safe to include
330    pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
331        let findings = self.scan(content, file_path);
332        findings.iter().all(|f| f.severity < Severity::High)
333    }
334
335    /// Get summary of findings
336    pub fn summarize(findings: &[SecretFinding]) -> String {
337        if findings.is_empty() {
338            return "No secrets detected".to_owned();
339        }
340
341        let critical = findings
342            .iter()
343            .filter(|f| f.severity == Severity::Critical)
344            .count();
345        let high = findings
346            .iter()
347            .filter(|f| f.severity == Severity::High)
348            .count();
349
350        format!(
351            "Found {} potential secrets ({} critical, {} high severity)",
352            findings.len(),
353            critical,
354            high
355        )
356    }
357
358    /// Redact secrets from content, returning the redacted content
359    /// This replaces detected secrets with redacted versions in the actual content
360    pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
361        let mut result = content.to_owned();
362
363        for (line_num, line) in content.lines().enumerate() {
364            let trimmed = line.trim();
365
366            // Skip obvious false positives (example docs, placeholders)
367            let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
368                || trimmed.to_lowercase().contains("placeholder")
369                || trimmed.contains("xxxxx");
370
371            if is_obvious_false_positive {
372                continue;
373            }
374
375            for pattern in &self.patterns {
376                // Use find_iter to catch ALL matches on a line, not just the first
377                for m in pattern.regex.find_iter(line) {
378                    let matched = m.as_str();
379
380                    // Check allowlist
381                    if self.allowlist.iter().any(|a| matched.contains(a)) {
382                        continue;
383                    }
384
385                    // Only redact high severity and above
386                    if pattern.severity >= Severity::High {
387                        let redacted = redact(matched);
388                        // Replace in result - use line number to find the right occurrence
389                        let line_start = result
390                            .lines()
391                            .take(line_num)
392                            .map(|l| l.len() + 1)
393                            .sum::<usize>();
394                        if let Some(pos) = result[line_start..].find(matched) {
395                            let abs_pos = line_start + pos;
396                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
397                        }
398                    }
399                }
400            }
401
402            // Check custom patterns for redaction
403            for custom in &self.custom_patterns {
404                for m in custom.regex.find_iter(line) {
405                    let matched = m.as_str();
406
407                    // Check allowlist
408                    if self.allowlist.iter().any(|a| matched.contains(a)) {
409                        continue;
410                    }
411
412                    // Only redact high severity and above
413                    if custom.severity >= Severity::High {
414                        let redacted = redact(matched);
415                        let line_start = result
416                            .lines()
417                            .take(line_num)
418                            .map(|l| l.len() + 1)
419                            .sum::<usize>();
420                        if let Some(pos) = result[line_start..].find(matched) {
421                            let abs_pos = line_start + pos;
422                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
423                        }
424                    }
425                }
426            }
427        }
428
429        result
430    }
431
432    /// Scan and redact all secrets from content.
433    ///
434    /// Returns a tuple of (redacted_content, findings) where:
435    /// - `redacted_content` has all detected secrets replaced with `[REDACTED]`
436    /// - `findings` is a list of all detected secrets with metadata
437    ///
438    /// # Important
439    ///
440    /// Always check the findings list to understand what was redacted and whether
441    /// the file should be excluded from context entirely.
442    #[must_use = "security findings should be reviewed"]
443    pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
444        let findings = self.scan(content, file_path);
445        let redacted = self.redact_content(content, file_path);
446        (redacted, findings)
447    }
448}
449
/// Mask a matched secret for display.
///
/// Strings of up to 8 characters are masked completely. Longer strings keep
/// `min(4, len / 4)` characters at each end, with every character in between
/// replaced by `*`.
///
/// Lengths are counted in characters rather than bytes, so a multi-byte UTF-8
/// character near either end is kept or masked as a whole instead of causing a
/// slicing panic.
fn redact(s: &str) -> String {
    let total = s.chars().count();
    if total <= 8 {
        return "*".repeat(total);
    }

    // total > 8 guarantees 2 <= keep <= 4, so prefix and suffix never overlap.
    let keep = 4.min(total / 4);

    let mut masked = String::with_capacity(s.len());
    masked.extend(s.chars().take(keep));
    masked.extend(std::iter::repeat('*').take(total - 2 * keep));
    masked.extend(s.chars().skip(total - keep));
    masked
}
466
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
    }

    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
    }

    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::PrivateKey));
    }

    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        // Both the AWS key match and the generic api_key match contain "EXAMPLE"
        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}");

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(findings.iter().any(|f| f.kind == SecretKind::Generic));
    }

    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_patterns(&[
            r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
            r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
        ]);

        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    #[test]
    fn test_invalid_custom_pattern_ignored() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // The invalid pattern must have been dropped rather than stored
        assert!(
            scanner.custom_patterns.is_empty(),
            "Invalid custom pattern should not be registered"
        );

        // Scanning must not panic, and nothing in this text matches a built-in pattern
        let content = "INVALID_[PATTERN here";
        let findings = scanner.scan(content, "test.py");
        assert!(findings.is_empty(), "Ignored pattern should not produce findings");
    }
}