infiniloom_engine/
security.rs

1//! Security scanning for secrets and sensitive data
2//!
3//! This module provides automatic detection and redaction of secrets, API keys,
4//! tokens, and other sensitive data before sharing code with LLMs or external services.
5//!
6//! # Quick Start
7//!
8//! ```rust
9//! use infiniloom_engine::security::SecurityScanner;
10//!
11//! let scanner = SecurityScanner::new();
12//! let code = r#"
13//!     const AWS_KEY = "AKIAIOSFODNN7EXAMPLE";
14//!     const API_TOKEN = "sk-proj-abc123xyz789";
15//! "#;
16//!
17//! // Scan for secrets
18//! let findings = scanner.scan(code, "config.rs");
19//!
20//! if !findings.is_empty() {
21//!     println!("⚠️  Found {} secrets!", findings.len());
22//!     for finding in &findings {
23//!         println!("  {} on line {}: {}",
24//!             finding.kind.name(),
25//!             finding.line,
26//!             finding.pattern);  // Already redacted: "AKIA************MPLE"
27//!     }
28//! }
29//! ```
30//!
31//! # Scanning with Detailed Results
32//!
33//! The scanner returns structured findings with metadata:
34//!
35//! ```rust
36//! use infiniloom_engine::security::{SecurityScanner, Severity};
37//!
38//! let scanner = SecurityScanner::new();
39//! let findings = scanner.scan(r#"
40//!     DB_URL = "postgresql://user:pass@localhost/db"
41//!     STRIPE_KEY = "sk_live_abc123xyz789"
42//! "#, ".env");
43//!
44//! for finding in findings {
45//!     match finding.severity {
46//!         Severity::Critical => println!("🔴 CRITICAL: {}", finding.pattern),
47//!         Severity::High => println!("🟠 HIGH: {}", finding.pattern),
48//!         Severity::Medium => println!("🟡 MEDIUM: {}", finding.pattern),
49//!         Severity::Low => println!("🟢 LOW: {}", finding.pattern),
50//!     }
51//! }
52//! ```
53//!
54//! # Automatic Redaction
55//!
//! Replace detected secrets with masked versions (a few leading and trailing
//! characters are kept, everything in between becomes `*`):
57//!
58//! ```rust
59//! use infiniloom_engine::security::SecurityScanner;
60//!
61//! let scanner = SecurityScanner::new();
62//! let code = r#"
63//!     const apiKey = "sk-proj-secret123";
64//!     const githubToken = "ghp_abcdefghijklmnopqrstuvwxyz1234567890";
65//! "#;
66//!
67//! // Scan and redact in one operation
68//! let (redacted, findings) = scanner.scan_and_redact(code, "api.ts");
69//!
70//! println!("Original had {} secrets", findings.len());
71//! println!("Redacted version:\n{}", redacted);
//! // Matched text is masked in place, keeping at most 4 leading and 4 trailing
//! // characters; e.g. the GitHub token above becomes
//! // "ghp_********************************7890".
74//! ```
75//!
76//! # Custom Patterns
77//!
78//! Add organization-specific secret patterns:
79//!
80//! ```rust
81//! use infiniloom_engine::security::SecurityScanner;
82//!
83//! let mut scanner = SecurityScanner::new();
84//!
85//! // Add custom patterns for internal systems
86//! scanner.add_custom_pattern(r"MYCOMPANY_API_[A-Z0-9]{32}");
87//! scanner.add_custom_pattern(r"INTERNAL_TOKEN_[a-f0-9]{64}");
88//!
89//! // Or add multiple at once
90//! scanner.add_custom_patterns(&[
91//!     "ORG_SECRET_[A-Z0-9]{16}".to_string(),
92//!     "DEPLOY_KEY_[a-z0-9]{40}".to_string(),
93//! ]);
94//!
95//! // Now scan with both built-in and custom patterns
96//! let findings = scanner.scan(r#"
//!     MYCOMPANY_API_ABCD1234EFGH5678IJKL9012MNOP3456
98//! "#, "internal.rs");
99//!
100//! assert!(!findings.is_empty());
101//! ```
102//!
103//! # Allowlist for Test Data
104//!
105//! Mark known test/example secrets as safe:
106//!
107//! ```rust
108//! use infiniloom_engine::security::SecurityScanner;
109//!
110//! let mut scanner = SecurityScanner::new();
111//!
112//! // Allowlist test keys that are intentionally public
113//! scanner.allowlist("EXAMPLE");
114//! scanner.allowlist("test_key");
115//! scanner.allowlist("mock_secret");
116//!
117//! // This won't trigger detection (contains "EXAMPLE")
118//! let test_code = r#"
119//!     AWS_KEY = "AKIAIOSFODNN7EXAMPLE"  // Official AWS test key
120//! "#;
121//!
122//! let findings = scanner.scan(test_code, "test.rs");
123//! assert!(findings.is_empty(), "Test keys should be allowed");
124//!
125//! // But this WILL trigger (real key format)
126//! let prod_code = r#"
127//!     AWS_KEY = "AKIAIOSFODNN7PRODKEY"
128//! "#;
129//!
130//! let findings = scanner.scan(prod_code, "prod.rs");
131//! assert!(!findings.is_empty(), "Real keys should be detected");
132//! ```
133//!
134//! # Repository Integration
135//!
136//! Scan all files in a repository:
137//!
138//! ```rust,ignore
139//! use infiniloom_engine::security::SecurityScanner;
140//!
141//! let scanner = SecurityScanner::new();
142//! let mut all_findings = Vec::new();
143//!
144//! for file in repository.files {
145//!     let findings = scanner.scan(&file.content, &file.relative_path);
146//!     all_findings.extend(findings);
147//! }
148//!
149//! if !all_findings.is_empty() {
150//!     eprintln!("⚠️  Security scan found {} secrets across {} files",
151//!         all_findings.len(),
152//!         all_findings.iter()
153//!             .map(|f| &f.file)
154//!             .collect::<std::collections::HashSet<_>>()
155//!             .len()
156//!     );
157//!
158//!     // Exit with error in CI/CD
159//!     std::process::exit(1);
160//! }
161//! ```
162//!
163//! # Severity-Based Filtering
164//!
165//! Work with different severity levels:
166//!
167//! ```rust
168//! use infiniloom_engine::security::{SecurityScanner, Severity};
169//!
170//! let scanner = SecurityScanner::new();
171//! let findings = scanner.scan(r#"
172//!     AWS_KEY = "AKIAIOSFODNN7PRODKEY"      # Critical
//!     password = "weak1234"                 # High
174//! "#, ".env");
175//!
176//! // Count by severity
177//! let critical_count = findings.iter()
178//!     .filter(|f| f.severity == Severity::Critical)
179//!     .count();
180//!
181//! let high_count = findings.iter()
182//!     .filter(|f| f.severity == Severity::High)
183//!     .count();
184//!
185//! println!("Critical: {}, High: {}", critical_count, high_count);
186//!
187//! // Check if safe to proceed (only low/medium severity)
188//! let is_safe = findings.iter()
189//!     .all(|f| f.severity < Severity::High);
190//!
191//! if !is_safe {
192//!     eprintln!("⛔ Cannot proceed - high/critical secrets detected");
193//! }
194//! ```
195//!
196//! # Supported Secret Types
197//!
198//! ## Cloud Credentials (Critical Severity)
199//! - **AWS**: Access keys (AKIA...), Secret access keys
200//! - **GitHub**: Personal access tokens (ghp_..., github_pat_...), OAuth tokens
201//! - **Private Keys**: RSA, EC, DSA, OpenSSH private keys
202//!
203//! ## API Keys (Critical Severity)
204//! - **OpenAI**: sk-... API keys
205//! - **Anthropic**: sk-ant-... API keys
206//! - **Stripe**: sk_live_..., pk_test_... keys
207//!
208//! ## Service Tokens (High Severity)
209//! - **Slack**: xoxb-..., xoxa-... tokens
210//! - **JWT**: Encoded JSON Web Tokens
211//! - **Database**: Connection strings (PostgreSQL, MongoDB, MySQL, Redis, etc.)
212//!
213//! ## Generic Secrets (High Severity)
214//! - Generic API keys (api_key=...)
215//! - Access tokens (token=..., secret=...)
216//! - Passwords (password=...)
217//!
218//! # Why Pre-compiled Patterns?
219//!
220//! The module uses `once_cell::sync::Lazy` for regex patterns:
221//!
222//! ```rust,ignore
223//! static RE_AWS_KEY: Lazy<Regex> =
224//!     Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").unwrap());
225//! ```
226//!
227//! **Benefits**:
228//! - Compiled once at first use
229//! - Reused across all scanner instances
230//! - Thread-safe sharing
231//! - Zero runtime compilation overhead
232//!
233//! **Pattern Order**: More specific patterns (Stripe, Slack, JWT) come BEFORE
234//! generic patterns (api_key, secret) to ensure accurate detection and avoid
235//! masking by broader patterns.
236//!
237//! # False Positive Reduction
238//!
239//! The scanner automatically skips:
240//! - **Comments**: Lines starting with //, #, /*, *
241//! - **Documentation**: Lines containing "example" as a word
242//! - **Placeholders**: Lines with "xxxxx" or "placeholder"
243//! - **Allowlisted patterns**: User-configured safe patterns
244//!
245//! This reduces false positives in documentation, test files, and examples
246//! while catching real secrets in code.
247
248use once_cell::sync::Lazy;
249use regex::Regex;
250use std::collections::HashSet;
251
// Helper regex for word-boundary "example" detection (to skip documentation lines).
//
// Used by the scanner to drop whole lines that look like documentation or
// tutorial content before any secret pattern is tried.
static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
    // Match "example" (case-insensitive) only when it stands alone: the
    // character on each side must be the start/end of the text or something
    // other than an ASCII letter, digit, or `.`.
    //
    // Consequences of that boundary definition:
    // - "# Example:" and "// example usage" match, so those lines are skipped.
    // - AKIAIOSFODNN7EXAMPLE does NOT match (the word is preceded by `7`), so a
    //   line holding that key is still scanned by the AWS key pattern.
    // - db.example.com does NOT match, because `.` is not treated as a
    //   boundary; this keeps connection strings with example domains scannable.
    // - `_` and `-` are treated as boundaries, so "my_example_key" matches.
    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)")
        .expect("RE_EXAMPLE_WORD: invalid regex pattern")
});
267
// Pre-compiled regex patterns (compiled once, reused across all scanner instances).
//
// Each `expect` can only fire if the literal pattern itself is malformed, i.e. a
// programming error in this file rather than a runtime condition.
//
// Several patterns below contain a capture group around the secret value. The
// scanner code in this file reads the whole match (`find_iter` + `as_str`), so
// the key name and separator are part of what gets reported and masked.

// AWS access key ID: literal "AKIA" followed by 16 uppercase letters or digits.
static RE_AWS_KEY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").expect("RE_AWS_KEY: invalid regex pattern"));
// AWS secret access key assignment: the name "aws_secret_access_key" (case-insensitive,
// `_`/`-` separators optional), then `:` or `=`, then a 40-character value drawn from
// letters, digits, `/`, `+`, `=`. Quotes around the name and value are optional.
static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
        .expect("RE_AWS_SECRET: invalid regex pattern")
});
// GitHub Personal Access Token (classic) - 36 alphanumeric chars after prefix
static RE_GITHUB_PAT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").expect("RE_GITHUB_PAT: invalid regex pattern"));
// GitHub fine-grained PAT: "github_pat_", 22 alphanumerics, `_`, then 59 alphanumerics.
static RE_GITHUB_FINE_PAT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}")
        .expect("RE_GITHUB_FINE_PAT: invalid regex pattern")
});
// GitHub OAuth, user-to-server, server-to-server, and refresh tokens
// (prefixes gho_, ghu_, ghr_, ghs_ followed by at least 36 alphanumerics).
static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").expect("RE_GITHUB_OTHER_TOKENS: invalid regex pattern")
});
// PEM private key header: plain "PRIVATE KEY" or the RSA / EC / DSA / OPENSSH variants.
// Only the BEGIN line is matched, not the key body.
static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
        .expect("RE_PRIVATE_KEY: invalid regex pattern")
});
// Generic API key assignment: "api_key" / "api-key" / "apikey" (case-insensitive),
// `:` or `=`, then a value of 20+ letters, digits, `_` or `-`.
static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_API_KEY: invalid regex pattern")
});
// Generic secret/token assignment: a name ending in "secret" or "token"
// (case-insensitive), `:` or `=`, then a value of 20+ letters, digits, `_` or `-`.
static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_SECRET_TOKEN: invalid regex pattern")
});
// Password assignment: "password" (case-insensitive), `:` or `=`, then at least
// 8 characters that are not quotes or whitespace. Shorter values are not matched.
static RE_PASSWORD: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#)
        .expect("RE_PASSWORD: invalid regex pattern")
});
// Database connection URL: a known scheme followed by `://` and everything up to
// the next whitespace or quote. The URL is matched whether or not it embeds
// credentials.
static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
    // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
    Regex::new(
        r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#,
    )
    .expect("RE_CONN_STRING: invalid regex pattern")
});
// JWT-shaped token: three dot-separated base64url segments, the first two
// starting with "eyJ". Segment contents are not decoded or validated.
static RE_JWT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*")
        .expect("RE_JWT: invalid regex pattern")
});
// Slack token: "xox" + one of b/a/p/r/s, then two 10-13 digit groups and a
// 24-character alphanumeric tail, all separated by `-`.
static RE_SLACK: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}")
        .expect("RE_SLACK: invalid regex pattern")
});
// Stripe key: sk_/pk_ prefix, test/live mode, then at least 24 alphanumerics.
static RE_STRIPE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}")
        .expect("RE_STRIPE: invalid regex pattern")
});
// OpenAI API keys (sk-... followed by alphanumeric characters).
// Requires 32+ alphanumerics directly after "sk-"; keys containing `-` or `_`
// inside that run are not matched by this pattern.
static RE_OPENAI: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"sk-[A-Za-z0-9]{32,}").expect("RE_OPENAI: invalid regex pattern"));
// Anthropic API keys (sk-ant-...): at least 40 letters, digits or `-` after the prefix.
static RE_ANTHROPIC: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").expect("RE_ANTHROPIC: invalid regex pattern")
});
329
/// A detected secret or sensitive data
///
/// One value is produced per regex match. The raw secret is never stored;
/// only a masked rendering is kept in [`SecretFinding::pattern`].
#[derive(Debug, Clone)]
pub struct SecretFinding {
    /// Type of secret
    pub kind: SecretKind,
    /// File path, exactly as passed to the scanner
    pub file: String,
    /// Line number (1-based) of the line containing the match
    pub line: u32,
    /// Matched text in masked form: the middle characters are replaced by `*`
    pub pattern: String,
    /// Severity level
    pub severity: Severity,
    /// Whether the secret was found in a comment (may be example/documentation).
    ///
    /// The scanner in this file skips comment lines entirely, so it currently
    /// always sets this to `false`.
    pub in_comment: bool,
}
346
/// Category assigned to a detected secret.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// A provider or generic API key
    ApiKey,
    /// A bearer-style access token
    AccessToken,
    /// A PEM-encoded private key
    PrivateKey,
    /// A password assignment
    Password,
    /// A database connection URL
    ConnectionString,
    /// An AWS access key or secret access key
    AwsCredential,
    /// A GitHub token of any flavor
    GitHubToken,
    /// Anything that does not fit a more specific category
    Generic,
}

impl SecretKind {
    /// Returns the display label for this kind, suitable for reports and logs.
    pub fn name(&self) -> &'static str {
        match *self {
            SecretKind::ApiKey => "API Key",
            SecretKind::AccessToken => "Access Token",
            SecretKind::PrivateKey => "Private Key",
            SecretKind::Password => "Password",
            SecretKind::ConnectionString => "Connection String",
            SecretKind::AwsCredential => "AWS Credential",
            SecretKind::GitHubToken => "GitHub Token",
            SecretKind::Generic => "Generic Secret",
        }
    }
}
383
/// Severity level
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so
/// `Low < Medium < High < Critical`. Comparisons such as
/// `severity < Severity::High` rely on that order; do not reorder the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Lowest level
    Low,
    /// Below the `High` threshold used for safety checks and redaction
    Medium,
    /// Assigned to service tokens, generic secrets, and custom patterns
    High,
    /// Assigned to provider-specific credentials and private keys
    Critical,
}
392
/// Security scanner
///
/// Holds the built-in patterns, any user-supplied patterns, and the allowlist
/// that suppresses matches.
pub struct SecurityScanner {
    /// Built-in patterns, in the order they are tried on each line
    patterns: Vec<SecretPattern>,
    /// User-defined patterns, tried after all built-in patterns
    custom_patterns: Vec<CustomSecretPattern>,
    /// Substrings that mark a match as safe; a match containing any of them is ignored
    allowlist: HashSet<String>,
}
399
/// Built-in secret pattern backed by one of the shared static regexes
struct SecretPattern {
    /// Category reported for matches of this pattern
    kind: SecretKind,
    /// Reference to the lazily compiled static regex
    regex: &'static Lazy<Regex>,
    /// Severity reported for matches of this pattern
    severity: Severity,
}
405
/// Custom user-defined secret pattern
///
/// Unlike the built-in patterns, the compiled regex is owned by the scanner
/// instance that registered it.
struct CustomSecretPattern {
    /// Regex compiled from the user-supplied pattern string
    regex: Regex,
    /// Severity reported for matches of this pattern
    severity: Severity,
}
411
412impl Default for SecurityScanner {
413    fn default() -> Self {
414        Self::new()
415    }
416}
417
418impl SecurityScanner {
419    /// Create a new security scanner with default patterns
420    /// Uses pre-compiled static regex patterns for optimal performance
421    ///
422    /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
423    /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
424    /// and redaction.
425    pub fn new() -> Self {
426        let patterns = vec![
427            // === Critical: Specific cloud credentials (most specific patterns first) ===
428            // AWS
429            SecretPattern {
430                kind: SecretKind::AwsCredential,
431                regex: &RE_AWS_KEY,
432                severity: Severity::Critical,
433            },
434            SecretPattern {
435                kind: SecretKind::AwsCredential,
436                regex: &RE_AWS_SECRET,
437                severity: Severity::Critical,
438            },
439            // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
440            SecretPattern {
441                kind: SecretKind::GitHubToken,
442                regex: &RE_GITHUB_PAT,
443                severity: Severity::Critical,
444            },
445            SecretPattern {
446                kind: SecretKind::GitHubToken,
447                regex: &RE_GITHUB_FINE_PAT,
448                severity: Severity::Critical,
449            },
450            SecretPattern {
451                kind: SecretKind::GitHubToken,
452                regex: &RE_GITHUB_OTHER_TOKENS,
453                severity: Severity::Critical,
454            },
455            // Private keys
456            SecretPattern {
457                kind: SecretKind::PrivateKey,
458                regex: &RE_PRIVATE_KEY,
459                severity: Severity::Critical,
460            },
461            // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
462            SecretPattern {
463                kind: SecretKind::ApiKey,
464                regex: &RE_ANTHROPIC,
465                severity: Severity::Critical,
466            },
467            // OpenAI API keys (must come before Stripe since sk- is more general)
468            SecretPattern {
469                kind: SecretKind::ApiKey,
470                regex: &RE_OPENAI,
471                severity: Severity::Critical,
472            },
473            // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
474            SecretPattern {
475                kind: SecretKind::ApiKey,
476                regex: &RE_STRIPE,
477                severity: Severity::Critical,
478            },
479            // === High: Specific service tokens (must come before generic patterns) ===
480            // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
481            SecretPattern {
482                kind: SecretKind::AccessToken,
483                regex: &RE_SLACK,
484                severity: Severity::High,
485            },
486            // JWT tokens (specific pattern: eyJ...eyJ...signature)
487            SecretPattern {
488                kind: SecretKind::AccessToken,
489                regex: &RE_JWT,
490                severity: Severity::High,
491            },
492            // Connection strings (specific pattern: mongodb://, postgres://, etc.)
493            SecretPattern {
494                kind: SecretKind::ConnectionString,
495                regex: &RE_CONN_STRING,
496                severity: Severity::High,
497            },
498            // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
499            // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
500            SecretPattern {
501                kind: SecretKind::ApiKey,
502                regex: &RE_API_KEY,
503                severity: Severity::High,
504            },
505            // Generic secrets (matches secret=xxx, token=xxx, etc.)
506            SecretPattern {
507                kind: SecretKind::Generic,
508                regex: &RE_SECRET_TOKEN,
509                severity: Severity::High,
510            },
511            // Passwords
512            SecretPattern {
513                kind: SecretKind::Password,
514                regex: &RE_PASSWORD,
515                severity: Severity::High,
516            },
517        ];
518
519        Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
520    }
521
522    /// Add a pattern to allowlist
523    pub fn allowlist(&mut self, pattern: &str) {
524        self.allowlist.insert(pattern.to_owned());
525    }
526
527    /// Add a custom regex pattern for secret detection
528    ///
529    /// Custom patterns are matched as generic secrets with High severity.
530    /// Invalid regex patterns are silently ignored.
531    ///
532    /// # Example
533    /// ```
534    /// use infiniloom_engine::security::SecurityScanner;
535    ///
536    /// let mut scanner = SecurityScanner::new();
537    /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}");
538    /// ```
539    pub fn add_custom_pattern(&mut self, pattern: &str) {
540        if let Ok(regex) = Regex::new(pattern) {
541            self.custom_patterns
542                .push(CustomSecretPattern { regex, severity: Severity::High });
543        }
544    }
545
546    /// Add multiple custom patterns at once
547    pub fn add_custom_patterns(&mut self, patterns: &[String]) {
548        for pattern in patterns {
549            self.add_custom_pattern(pattern);
550        }
551    }
552
553    /// Scan content for secrets
554    pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
555        let mut findings = Vec::new();
556
557        for (line_num, line) in content.lines().enumerate() {
558            let trimmed = line.trim();
559
560            // Detect if line is likely a comment - skip entirely to reduce false positives
561            // Real secrets shouldn't be in comments anyway
562            let is_jsdoc_continuation =
563                trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
564            let is_comment = trimmed.starts_with("//")
565                || trimmed.starts_with('#')
566                || trimmed.starts_with("/*")
567                || trimmed.starts_with("*")
568                || is_jsdoc_continuation;
569
570            // Skip obvious false positives (example docs, placeholders, comments)
571            let is_obvious_false_positive = is_comment
572                || RE_EXAMPLE_WORD.is_match(trimmed)
573                || trimmed.to_lowercase().contains("placeholder")
574                || trimmed.contains("xxxxx");
575
576            if is_obvious_false_positive {
577                continue;
578            }
579
580            for pattern in &self.patterns {
581                // Use find_iter to catch ALL matches on a line, not just the first
582                for m in pattern.regex.find_iter(line) {
583                    let matched = m.as_str();
584
585                    // Check allowlist
586                    if self.allowlist.iter().any(|a| matched.contains(a)) {
587                        continue;
588                    }
589
590                    findings.push(SecretFinding {
591                        kind: pattern.kind,
592                        file: file_path.to_owned(),
593                        line: (line_num + 1) as u32,
594                        pattern: redact(matched),
595                        severity: pattern.severity,
596                        in_comment: false, // Non-comment lines only now
597                    });
598                }
599            }
600
601            // Check custom patterns
602            for custom in &self.custom_patterns {
603                for m in custom.regex.find_iter(line) {
604                    let matched = m.as_str();
605
606                    // Check allowlist
607                    if self.allowlist.iter().any(|a| matched.contains(a)) {
608                        continue;
609                    }
610
611                    findings.push(SecretFinding {
612                        kind: SecretKind::Generic,
613                        file: file_path.to_owned(),
614                        line: (line_num + 1) as u32,
615                        pattern: redact(matched),
616                        severity: custom.severity,
617                        in_comment: false,
618                    });
619                }
620            }
621        }
622
623        findings
624    }
625
626    /// Scan a file and return whether it's safe to include
627    pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
628        let findings = self.scan(content, file_path);
629        findings.iter().all(|f| f.severity < Severity::High)
630    }
631
632    /// Get summary of findings
633    pub fn summarize(findings: &[SecretFinding]) -> String {
634        if findings.is_empty() {
635            return "No secrets detected".to_owned();
636        }
637
638        let critical = findings
639            .iter()
640            .filter(|f| f.severity == Severity::Critical)
641            .count();
642        let high = findings
643            .iter()
644            .filter(|f| f.severity == Severity::High)
645            .count();
646
647        format!(
648            "Found {} potential secrets ({} critical, {} high severity)",
649            findings.len(),
650            critical,
651            high
652        )
653    }
654
655    /// Redact secrets from content, returning the redacted content
656    /// This replaces detected secrets with redacted versions in the actual content
657    pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
658        let mut result = content.to_owned();
659
660        for (line_num, line) in content.lines().enumerate() {
661            let trimmed = line.trim();
662
663            // Skip obvious false positives (example docs, placeholders)
664            let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
665                || trimmed.to_lowercase().contains("placeholder")
666                || trimmed.contains("xxxxx");
667
668            if is_obvious_false_positive {
669                continue;
670            }
671
672            for pattern in &self.patterns {
673                // Use find_iter to catch ALL matches on a line, not just the first
674                for m in pattern.regex.find_iter(line) {
675                    let matched = m.as_str();
676
677                    // Check allowlist
678                    if self.allowlist.iter().any(|a| matched.contains(a)) {
679                        continue;
680                    }
681
682                    // Only redact high severity and above
683                    if pattern.severity >= Severity::High {
684                        let redacted = redact(matched);
685                        // Replace in result - use line number to find the right occurrence
686                        let line_start = result
687                            .lines()
688                            .take(line_num)
689                            .map(|l| l.len() + 1)
690                            .sum::<usize>();
691                        if let Some(pos) = result[line_start..].find(matched) {
692                            let abs_pos = line_start + pos;
693                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
694                        }
695                    }
696                }
697            }
698
699            // Check custom patterns for redaction
700            for custom in &self.custom_patterns {
701                for m in custom.regex.find_iter(line) {
702                    let matched = m.as_str();
703
704                    // Check allowlist
705                    if self.allowlist.iter().any(|a| matched.contains(a)) {
706                        continue;
707                    }
708
709                    // Only redact high severity and above
710                    if custom.severity >= Severity::High {
711                        let redacted = redact(matched);
712                        let line_start = result
713                            .lines()
714                            .take(line_num)
715                            .map(|l| l.len() + 1)
716                            .sum::<usize>();
717                        if let Some(pos) = result[line_start..].find(matched) {
718                            let abs_pos = line_start + pos;
719                            result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
720                        }
721                    }
722                }
723            }
724        }
725
726        result
727    }
728
729    /// Scan and redact all secrets from content.
730    ///
731    /// Returns a tuple of (redacted_content, findings) where:
732    /// - `redacted_content` has all detected secrets replaced with `[REDACTED]`
733    /// - `findings` is a list of all detected secrets with metadata
734    ///
735    /// # Important
736    ///
737    /// Always check the findings list to understand what was redacted and whether
738    /// the file should be excluded from context entirely.
739    #[must_use = "security findings should be reviewed"]
740    pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
741        let findings = self.scan(content, file_path);
742        let redacted = self.redact_content(content, file_path);
743        (redacted, findings)
744    }
745}
746
/// Mask a matched secret so it can be shown safely.
///
/// Inputs of 8 characters or fewer are replaced entirely by `*`. Longer inputs
/// keep `min(4, len / 4)` characters at each end and have everything in between
/// replaced by `*`, so the output has the same number of characters as the input.
///
/// All positions are counted in characters, never bytes, so multi-byte UTF-8
/// input cannot cause a slicing panic.
fn redact(s: &str) -> String {
    let chars: Vec<char> = s.chars().collect();
    let total = chars.len();

    // Too short to reveal anything safely: mask it completely.
    if total <= 8 {
        return "*".repeat(total);
    }

    // Same number of visible characters on both ends, capped at 4.
    let visible = (total / 4).min(4);
    let hidden = total - 2 * visible;

    let mut masked = String::with_capacity(s.len());
    masked.extend(chars[..visible].iter());
    masked.extend(std::iter::repeat('*').take(hidden));
    masked.extend(chars[total - visible..].iter());
    masked
}
771
// Unit tests for the scanner and the `redact` helper.
#[cfg(test)]
mod tests {
    use super::*;

    // "EXAMPLE" is glued to the key (preceded by `7`), so the example-word
    // line skip does not apply and the AWS key pattern still fires.
    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
    }

    // Classic PAT: `ghp_` followed by exactly 36 alphanumeric characters.
    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
    }

    // Only the PEM header line needs to be present for a finding.
    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::PrivateKey));
    }

    // Both the AWS key match and the generic api_key match contain "EXAMPLE",
    // so the allowlist suppresses every finding on the line.
    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    // Long input keeps 4 characters at each end; short input is fully masked.
    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    // `redact` must count characters, not bytes, for multi-byte input.
    #[test]
    fn test_redact_unicode_safety() {
        // Test with Chinese characters (3 bytes each)
        // Should not panic when slicing
        let chinese_secret = "密钥ABCDEFGHIJKLMNOP密钥";
        let result = redact(chinese_secret);
        // Should produce valid UTF-8
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        // Should contain asterisks
        assert!(result.contains('*'));

        // Test with emoji (4 bytes each)
        let emoji_secret = "🔑ABCDEFGHIJKLMNOP🔒";
        let result = redact(emoji_secret);
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(result.contains('*'));

        // Test with mixed multi-byte characters
        let mixed_secret = "абвгдежзийклмноп"; // Cyrillic (2 bytes each)
        let result = redact(mixed_secret);
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(result.contains('*'));

        // Test short Unicode strings (should all be asterisks)
        let short_chinese = "密钥";
        let result = redact(short_chinese);
        assert_eq!(result, "**"); // 2 characters
    }

    // Boundary behavior around the 8-character full-mask threshold.
    #[test]
    fn test_redact_edge_cases() {
        // Empty string
        assert_eq!(redact(""), "");

        // Single character
        assert_eq!(redact("x"), "*");

        // Exactly 8 characters (boundary)
        assert_eq!(redact("12345678"), "********");

        // 9 characters (first to show prefix/suffix)
        let result = redact("123456789");
        assert!(result.contains('*'));
        assert!(result.starts_with('1') || result.starts_with('*'));
    }

    // A line starting with `#` is treated as a comment and never scanned.
    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    // Same assignment as above without the leading `#`: now it is reported.
    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    // Custom patterns are reported with kind `Generic`.
    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}");

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(findings.iter().any(|f| f.kind == SecretKind::Generic));
    }

    // Bulk registration: only the first of the two patterns matches the content.
    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_patterns(&[
            r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
            r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
        ]);

        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    // Smoke test: registering an uncompilable regex must not panic, and
    // scanning afterwards must still work. No findings are asserted.
    #[test]
    fn test_invalid_custom_pattern_ignored() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // Should not panic, invalid patterns are ignored
        let content = "INVALID_[PATTERN here";
        let _findings = scanner.scan(content, "test.py");
    }
}