Skip to main content

infiniloom_engine/
security.rs

1//! Security scanning for secrets and sensitive data
2//!
3//! This module provides automatic detection and redaction of secrets, API keys,
4//! tokens, and other sensitive data before sharing code with LLMs or external services.
5//!
6//! # Quick Start
7//!
8//! ```rust
9//! use infiniloom_engine::security::SecurityScanner;
10//!
11//! let scanner = SecurityScanner::new();
12//! let code = r#"
13//!     const AWS_KEY = "AKIAIOSFODNN7EXAMPLE";
14//!     const API_TOKEN = "sk-proj-abc123xyz789";
15//! "#;
16//!
17//! // Scan for secrets
18//! let findings = scanner.scan(code, "config.rs");
19//!
20//! if !findings.is_empty() {
21//!     println!("⚠️  Found {} secrets!", findings.len());
22//!     for finding in &findings {
23//!         println!("  {} on line {}: {}",
24//!             finding.kind.name(),
25//!             finding.line,
26//!             finding.pattern);  // Already redacted: "AKIA************MPLE"
27//!     }
28//! }
29//! ```
30//!
31//! # Scanning with Detailed Results
32//!
33//! The scanner returns structured findings with metadata:
34//!
35//! ```rust
36//! use infiniloom_engine::security::{SecurityScanner, Severity};
37//!
38//! let scanner = SecurityScanner::new();
39//! let findings = scanner.scan(r#"
40//!     DB_URL = "postgresql://user:pass@localhost/db"
41//!     STRIPE_KEY = "sk_live_abc123xyz789"
42//! "#, ".env");
43//!
44//! for finding in findings {
45//!     match finding.severity {
46//!         Severity::Critical => println!("🔴 CRITICAL: {}", finding.pattern),
47//!         Severity::High => println!("🟠 HIGH: {}", finding.pattern),
48//!         Severity::Medium => println!("🟡 MEDIUM: {}", finding.pattern),
49//!         Severity::Low => println!("🟢 LOW: {}", finding.pattern),
50//!     }
51//! }
52//! ```
53//!
54//! # Automatic Redaction
55//!
//! Replace detected secrets with masked versions (the first and last few
//! characters are kept and everything in between becomes `*`):
57//!
58//! ```rust
59//! use infiniloom_engine::security::SecurityScanner;
60//!
61//! let scanner = SecurityScanner::new();
62//! let code = r#"
63//!     const apiKey = "sk-proj-secret123";
64//!     const githubToken = "ghp_abcdefghijklmnopqrstuvwxyz1234567890";
65//! "#;
66//!
67//! // Scan and redact in one operation
68//! let (redacted, findings) = scanner.scan_and_redact(code, "api.ts");
69//!
70//! println!("Original had {} secrets", findings.len());
71//! println!("Redacted version:\n{}", redacted);
72//! // Output: const apiKey = "sk-p****ect123";
73//! //         const githubToken = "ghp_****7890";
74//! ```
75//!
76//! # Custom Patterns
77//!
78//! Add organization-specific secret patterns:
79//!
80//! ```rust,no_run
81//! use infiniloom_engine::security::SecurityScanner;
82//!
83//! let mut scanner = SecurityScanner::new();
84//!
85//! // Add custom patterns for internal systems
//! scanner.add_custom_pattern(r"MYCOMPANY_API_[A-Z0-9]{32}").unwrap();
//! scanner.add_custom_pattern(r"INTERNAL_TOKEN_[a-f0-9]{64}").unwrap();
//!
//! // Or add multiple at once
//! scanner.add_custom_patterns(&[
//!     "ORG_SECRET_[A-Z0-9]{16}".to_string(),
//!     "DEPLOY_KEY_[a-z0-9]{40}".to_string(),
//! ]).unwrap();
94//!
95//! // Now scan with both built-in and custom patterns
96//! let findings = scanner.scan(r#"
//!     MYCOMPANY_API_ABCD1234EFGH5678IJKL9012MNOP3456
98//! "#, "internal.rs");
99//!
100//! assert!(!findings.is_empty());
101//! ```
102//!
103//! # Allowlist for Test Data
104//!
105//! Mark known test/example secrets as safe:
106//!
107//! ```rust
108//! use infiniloom_engine::security::SecurityScanner;
109//!
110//! let mut scanner = SecurityScanner::new();
111//!
112//! // Allowlist test keys that are intentionally public
113//! scanner.allowlist("EXAMPLE");
114//! scanner.allowlist("test_key");
115//! scanner.allowlist("mock_secret");
116//!
117//! // This won't trigger detection (contains "EXAMPLE")
118//! let test_code = r#"
119//!     AWS_KEY = "AKIAIOSFODNN7EXAMPLE"  // Official AWS test key
120//! "#;
121//!
122//! let findings = scanner.scan(test_code, "test.rs");
123//! assert!(findings.is_empty(), "Test keys should be allowed");
124//!
125//! // But this WILL trigger (real key format)
126//! let prod_code = r#"
127//!     AWS_KEY = "AKIAIOSFODNN7PRODKEY"
128//! "#;
129//!
130//! let findings = scanner.scan(prod_code, "prod.rs");
131//! assert!(!findings.is_empty(), "Real keys should be detected");
132//! ```
133//!
134//! # Repository Integration
135//!
136//! Scan all files in a repository:
137//!
138//! ```rust,ignore
139//! use infiniloom_engine::security::SecurityScanner;
140//!
141//! let scanner = SecurityScanner::new();
142//! let mut all_findings = Vec::new();
143//!
144//! for file in repository.files {
145//!     let findings = scanner.scan(&file.content, &file.relative_path);
146//!     all_findings.extend(findings);
147//! }
148//!
149//! if !all_findings.is_empty() {
150//!     eprintln!("⚠️  Security scan found {} secrets across {} files",
151//!         all_findings.len(),
152//!         all_findings.iter()
153//!             .map(|f| &f.file)
154//!             .collect::<std::collections::HashSet<_>>()
155//!             .len()
156//!     );
157//!
158//!     // Exit with error in CI/CD
159//!     std::process::exit(1);
160//! }
161//! ```
162//!
163//! # Severity-Based Filtering
164//!
165//! Work with different severity levels:
166//!
167//! ```rust
168//! use infiniloom_engine::security::{SecurityScanner, Severity};
169//!
170//! let scanner = SecurityScanner::new();
171//! let findings = scanner.scan(r#"
172//!     AWS_KEY = "AKIAIOSFODNN7PRODKEY"      # Critical
//!     password = "weakpass123"              # High
174//! "#, ".env");
175//!
176//! // Count by severity
177//! let critical_count = findings.iter()
178//!     .filter(|f| f.severity == Severity::Critical)
179//!     .count();
180//!
181//! let high_count = findings.iter()
182//!     .filter(|f| f.severity == Severity::High)
183//!     .count();
184//!
185//! println!("Critical: {}, High: {}", critical_count, high_count);
186//!
187//! // Check if safe to proceed (only low/medium severity)
188//! let is_safe = findings.iter()
189//!     .all(|f| f.severity < Severity::High);
190//!
191//! if !is_safe {
192//!     eprintln!("⛔ Cannot proceed - high/critical secrets detected");
193//! }
194//! ```
195//!
196//! # Supported Secret Types
197//!
198//! ## Cloud Credentials (Critical Severity)
199//! - **AWS**: Access keys (AKIA...), Secret access keys
200//! - **GitHub**: Personal access tokens (ghp_..., github_pat_...), OAuth tokens
201//! - **Private Keys**: RSA, EC, DSA, OpenSSH private keys
202//!
203//! ## API Keys (Critical Severity)
204//! - **OpenAI**: sk-... API keys
205//! - **Anthropic**: sk-ant-... API keys
206//! - **Stripe**: sk_live_..., pk_test_... keys
207//!
208//! ## Service Tokens (High Severity)
209//! - **Slack**: xoxb-..., xoxa-... tokens
210//! - **JWT**: Encoded JSON Web Tokens
211//! - **Database**: Connection strings (PostgreSQL, MongoDB, MySQL, Redis, etc.)
212//!
213//! ## Generic Secrets (High Severity)
214//! - Generic API keys (api_key=...)
215//! - Access tokens (token=..., secret=...)
216//! - Passwords (password=...)
217//!
218//! # Why Pre-compiled Patterns?
219//!
220//! The module uses `once_cell::sync::Lazy` for regex patterns:
221//!
222//! ```rust,ignore
223//! static RE_AWS_KEY: Lazy<Regex> =
224//!     Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").unwrap());
225//! ```
226//!
227//! **Benefits**:
228//! - Compiled once at first use
229//! - Reused across all scanner instances
230//! - Thread-safe sharing
231//! - Zero runtime compilation overhead
232//!
233//! **Pattern Order**: More specific patterns (Stripe, Slack, JWT) come BEFORE
234//! generic patterns (api_key, secret) to ensure accurate detection and avoid
235//! masking by broader patterns.
236//!
237//! # False Positive Reduction
238//!
239//! The scanner automatically skips:
240//! - **Comments**: Lines starting with //, #, /*, *
241//! - **Documentation**: Lines containing "example" as a word
242//! - **Placeholders**: Lines with "xxxxx" or "placeholder"
243//! - **Allowlisted patterns**: User-configured safe patterns
244//!
245//! This reduces false positives in documentation, test files, and examples
246//! while catching real secrets in code.
247
248use once_cell::sync::Lazy;
249use regex::Regex;
250use std::collections::HashSet;
251
// Helper regex that recognises "example" used as a standalone word. Lines that
// match are treated as documentation and skipped before any secret pattern runs.
static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
    // Case-insensitive match for "example" that is neither preceded nor followed
    // by a letter, digit, or dot (or that sits at the start/end of the text),
    // e.g. "# Example:" or "// example usage".
    //
    // Because letters and digits count as part of the surrounding word, a token
    // such as AKIAIOSFODNN7EXAMPLE does not match here, so a line containing it
    // is still handed to the secret patterns (RE_AWS_KEY in that case).
    //
    // Dots are also treated as part of the surrounding word, so host names such
    // as db.example.com do not cause the line to be skipped.
    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)")
        .expect("RE_EXAMPLE_WORD: invalid regex pattern")
});
267
// Built-in detection patterns. Each regex lives in a `Lazy` static, so it is
// compiled on first use and then shared by every scanner instance.

// AWS access key ID: the literal prefix `AKIA` followed by 16 uppercase letters or digits.
static RE_AWS_KEY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").expect("RE_AWS_KEY: invalid regex pattern"));
// AWS secret access key assignment (case-insensitive): `aws_secret_access_key` with
// optional `_`/`-` separators, then `:` or `=`, then a 40-character value, with optional
// quotes around the name and value. The scanner uses the whole match, not the capture group.
static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
        .expect("RE_AWS_SECRET: invalid regex pattern")
});
// GitHub personal access token (classic): `ghp_` followed by 36 alphanumeric characters.
static RE_GITHUB_PAT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").expect("RE_GITHUB_PAT: invalid regex pattern"));
// GitHub fine-grained personal access token: `github_pat_`, 22 alphanumeric characters,
// an underscore, then 59 more alphanumeric characters.
static RE_GITHUB_FINE_PAT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}")
        .expect("RE_GITHUB_FINE_PAT: invalid regex pattern")
});
// GitHub OAuth, user-to-server, server-to-server, and refresh tokens: one of the
// prefixes `gho_`, `ghu_`, `ghr_`, `ghs_` followed by at least 36 alphanumeric characters.
static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").expect("RE_GITHUB_OTHER_TOKENS: invalid regex pattern")
});
// PEM private key header: `-----BEGIN PRIVATE KEY-----`, optionally qualified with
// `RSA`, `EC`, `DSA`, or `OPENSSH`. Only the header line is matched, not the key body.
static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
        .expect("RE_PRIVATE_KEY: invalid regex pattern")
});
// Generic API key assignment (case-insensitive): `api_key`, `api-key`, or `apikey`, then
// `:` or `=`, then a value of at least 20 letters, digits, underscores, or hyphens.
// The scanner uses the whole match, so the key name is part of what gets reported.
static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_API_KEY: invalid regex pattern")
});
// Generic secret/token assignment (case-insensitive): `secret` or `token`, then `:` or `=`,
// then a value of at least 20 letters, digits, underscores, or hyphens. The keyword is not
// anchored, so it also matches at the end of longer names (e.g. `githubToken = ...`).
static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_SECRET_TOKEN: invalid regex pattern")
});
// Password assignment (case-insensitive): `password`, then `:` or `=`, then at least
// 8 characters that are not quotes or whitespace. The value may contain non-ASCII text.
static RE_PASSWORD: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#)
        .expect("RE_PASSWORD: invalid regex pattern")
});
// Database connection URL (case-insensitive scheme): one of the listed schemes followed
// by `://` and everything up to the next whitespace or quote character.
static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
    // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
    Regex::new(
        r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#,
    )
    .expect("RE_CONN_STRING: invalid regex pattern")
});
// JSON Web Token shape: three dot-separated segments of letters, digits, `_`, or `-`,
// where the first two segments both start with `eyJ`.
static RE_JWT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*")
        .expect("RE_JWT: invalid regex pattern")
});
// Slack token: `xoxb-`, `xoxa-`, `xoxp-`, `xoxr-`, or `xoxs-`, two groups of 10-13 digits,
// then 24 alphanumeric characters, all separated by hyphens.
static RE_SLACK: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}")
        .expect("RE_SLACK: invalid regex pattern")
});
// Stripe key: `sk_` or `pk_`, then `test_` or `live_`, then at least 24 alphanumeric characters.
static RE_STRIPE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}")
        .expect("RE_STRIPE: invalid regex pattern")
});
// OpenAI API keys: `sk-` followed by at least 32 letters, digits, underscores, or hyphens.
// NOTE: this character class also accepts the `ant-...` tail of an Anthropic key, and the
// scan loop runs every pattern rather than stopping at the first match, so a key matched
// by RE_ANTHROPIC is also matched by this pattern.
static RE_OPENAI: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"sk-[A-Za-z0-9_-]{32,}").expect("RE_OPENAI: invalid regex pattern"));
// Anthropic API keys: `sk-ant-` followed by at least 40 letters, digits, or hyphens.
static RE_ANTHROPIC: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").expect("RE_ANTHROPIC: invalid regex pattern")
});
332
/// Errors reported while configuring the security scanner.
#[derive(Debug, Clone)]
pub enum SecurityError {
    /// A custom secret-detection pattern could not be compiled as a regex.
    InvalidPattern {
        /// The pattern text that was rejected.
        pattern: String,
        /// The regex compiler's description of the problem.
        message: String,
    },
}

impl std::fmt::Display for SecurityError {
    /// Renders the error as `Invalid regex pattern '<pattern>': <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The enum has a single variant, so it can be destructured directly.
        let Self::InvalidPattern { pattern, message } = self;
        f.write_fmt(format_args!("Invalid regex pattern '{}': {}", pattern, message))
    }
}

impl std::error::Error for SecurityError {}
356
/// A single secret or piece of sensitive data reported by the scanner.
#[derive(Debug, Clone)]
pub struct SecretFinding {
    /// Category of the secret, taken from the pattern that matched.
    pub kind: SecretKind,
    /// File path, exactly as it was passed to the scan call.
    pub file: String,
    /// 1-based line number of the match within the scanned content.
    pub line: u32,
    /// The matched text after masking: short matches are fully replaced with `*`,
    /// longer ones keep a few leading and trailing characters.
    pub pattern: String,
    /// Severity assigned by the pattern that matched.
    pub severity: Severity,
    /// Whether the secret was found in a comment (may be example/documentation).
    /// NOTE(review): `SecurityScanner::scan` skips comment lines entirely and always
    /// sets this to `false`.
    pub in_comment: bool,
}
373
/// Category assigned to a detected secret.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// An API key (provider-specific or a generic `api_key = ...` assignment).
    ApiKey,
    /// An access token.
    AccessToken,
    /// A private key header.
    PrivateKey,
    /// A password assignment.
    Password,
    /// A database connection string.
    ConnectionString,
    /// An AWS credential.
    AwsCredential,
    /// A GitHub token.
    GitHubToken,
    /// A secret that fits no more specific category.
    Generic,
}

impl SecretKind {
    /// Returns the human-readable label for this kind of secret.
    pub fn name(&self) -> &'static str {
        match *self {
            SecretKind::AwsCredential => "AWS Credential",
            SecretKind::GitHubToken => "GitHub Token",
            SecretKind::PrivateKey => "Private Key",
            SecretKind::ApiKey => "API Key",
            SecretKind::AccessToken => "Access Token",
            SecretKind::ConnectionString => "Connection String",
            SecretKind::Password => "Password",
            SecretKind::Generic => "Generic Secret",
        }
    }
}
410
/// Severity level of a finding.
///
/// Variants are declared from least to most severe. The derived `PartialOrd`/`Ord`
/// follow declaration order, which is what makes threshold comparisons such as
/// `severity < Severity::High` meaningful; reordering the variants would change them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Lowest severity; not assigned by any built-in pattern.
    Low,
    /// Below the `High` threshold; not assigned by any built-in pattern.
    Medium,
    /// Assigned to service tokens, connection strings, generic secrets, and custom patterns.
    High,
    /// Most severe; assigned to provider-specific credentials and private keys.
    Critical,
}
419
/// Scanner that detects secrets in text and can mask them.
pub struct SecurityScanner {
    /// Built-in detection patterns, evaluated in this order for every scanned line.
    patterns: Vec<SecretPattern>,
    /// User-supplied patterns, evaluated after the built-in ones.
    custom_patterns: Vec<CustomSecretPattern>,
    /// Substrings that mark a match as safe: a match containing any entry is dropped.
    allowlist: HashSet<String>,
}
426
/// A built-in detection pattern backed by one of the module's shared regex statics.
struct SecretPattern {
    /// Category reported for matches of this pattern.
    kind: SecretKind,
    /// Reference to the lazily compiled static regex.
    regex: &'static Lazy<Regex>,
    /// Severity reported for matches of this pattern.
    severity: Severity,
}
432
/// Custom user-defined secret pattern; matches are reported as `SecretKind::Generic`.
struct CustomSecretPattern {
    /// Regex compiled from the user's pattern, owned by the scanner instance.
    regex: Regex,
    /// Severity reported for matches of this pattern.
    severity: Severity,
}
438
439impl Default for SecurityScanner {
440    fn default() -> Self {
441        Self::new()
442    }
443}
444
445impl SecurityScanner {
446    /// Create a new security scanner with default patterns
447    /// Uses pre-compiled static regex patterns for optimal performance
448    ///
449    /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
450    /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
451    /// and redaction.
452    pub fn new() -> Self {
453        let patterns = vec![
454            // === Critical: Specific cloud credentials (most specific patterns first) ===
455            // AWS
456            SecretPattern {
457                kind: SecretKind::AwsCredential,
458                regex: &RE_AWS_KEY,
459                severity: Severity::Critical,
460            },
461            SecretPattern {
462                kind: SecretKind::AwsCredential,
463                regex: &RE_AWS_SECRET,
464                severity: Severity::Critical,
465            },
466            // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
467            SecretPattern {
468                kind: SecretKind::GitHubToken,
469                regex: &RE_GITHUB_PAT,
470                severity: Severity::Critical,
471            },
472            SecretPattern {
473                kind: SecretKind::GitHubToken,
474                regex: &RE_GITHUB_FINE_PAT,
475                severity: Severity::Critical,
476            },
477            SecretPattern {
478                kind: SecretKind::GitHubToken,
479                regex: &RE_GITHUB_OTHER_TOKENS,
480                severity: Severity::Critical,
481            },
482            // Private keys
483            SecretPattern {
484                kind: SecretKind::PrivateKey,
485                regex: &RE_PRIVATE_KEY,
486                severity: Severity::Critical,
487            },
488            // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
489            SecretPattern {
490                kind: SecretKind::ApiKey,
491                regex: &RE_ANTHROPIC,
492                severity: Severity::Critical,
493            },
494            // OpenAI API keys (must come before Stripe since sk- is more general)
495            SecretPattern {
496                kind: SecretKind::ApiKey,
497                regex: &RE_OPENAI,
498                severity: Severity::Critical,
499            },
500            // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
501            SecretPattern {
502                kind: SecretKind::ApiKey,
503                regex: &RE_STRIPE,
504                severity: Severity::Critical,
505            },
506            // === High: Specific service tokens (must come before generic patterns) ===
507            // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
508            SecretPattern {
509                kind: SecretKind::AccessToken,
510                regex: &RE_SLACK,
511                severity: Severity::High,
512            },
513            // JWT tokens (specific pattern: eyJ...eyJ...signature)
514            SecretPattern {
515                kind: SecretKind::AccessToken,
516                regex: &RE_JWT,
517                severity: Severity::High,
518            },
519            // Connection strings (specific pattern: mongodb://, postgres://, etc.)
520            SecretPattern {
521                kind: SecretKind::ConnectionString,
522                regex: &RE_CONN_STRING,
523                severity: Severity::High,
524            },
525            // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
526            // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
527            SecretPattern {
528                kind: SecretKind::ApiKey,
529                regex: &RE_API_KEY,
530                severity: Severity::High,
531            },
532            // Generic secrets (matches secret=xxx, token=xxx, etc.)
533            SecretPattern {
534                kind: SecretKind::Generic,
535                regex: &RE_SECRET_TOKEN,
536                severity: Severity::High,
537            },
538            // Passwords
539            SecretPattern {
540                kind: SecretKind::Password,
541                regex: &RE_PASSWORD,
542                severity: Severity::High,
543            },
544        ];
545
546        Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
547    }
548
549    /// Add a pattern to allowlist
550    pub fn allowlist(&mut self, pattern: &str) {
551        self.allowlist.insert(pattern.to_owned());
552    }
553
554    /// Add a custom regex pattern for secret detection
555    ///
556    /// Custom patterns are matched as generic secrets with High severity.
557    /// Returns an error if the regex pattern is invalid.
558    ///
559    /// # Example
560    /// ```
561    /// use infiniloom_engine::security::SecurityScanner;
562    ///
563    /// let mut scanner = SecurityScanner::new();
564    /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}").unwrap();
565    /// ```
566    ///
567    /// # Errors
568    /// Returns `SecurityError::InvalidPattern` if the regex pattern is invalid.
569    pub fn add_custom_pattern(&mut self, pattern: &str) -> Result<(), SecurityError> {
570        let regex = Regex::new(pattern).map_err(|e| SecurityError::InvalidPattern {
571            pattern: pattern.to_owned(),
572            message: e.to_string(),
573        })?;
574        self.custom_patterns
575            .push(CustomSecretPattern { regex, severity: Severity::High });
576        Ok(())
577    }
578
579    /// Add a custom regex pattern, ignoring invalid patterns
580    ///
581    /// This is a convenience method that silently ignores invalid patterns.
582    /// Use [`add_custom_pattern`] if you need to handle errors.
583    pub fn add_custom_pattern_unchecked(&mut self, pattern: &str) {
584        let _ = self.add_custom_pattern(pattern);
585    }
586
587    /// Add multiple custom patterns at once
588    ///
589    /// Returns the first error encountered, if any. Patterns before the error
590    /// will have been added successfully.
591    ///
592    /// # Errors
593    /// Returns `SecurityError::InvalidPattern` if any regex pattern is invalid.
594    pub fn add_custom_patterns(&mut self, patterns: &[String]) -> Result<(), SecurityError> {
595        for pattern in patterns {
596            self.add_custom_pattern(pattern)?;
597        }
598        Ok(())
599    }
600
601    /// Add multiple custom patterns, ignoring invalid patterns
602    ///
603    /// This is a convenience method that silently ignores invalid patterns.
604    /// Use [`add_custom_patterns`] if you need to handle errors.
605    pub fn add_custom_patterns_unchecked(&mut self, patterns: &[String]) {
606        for pattern in patterns {
607            self.add_custom_pattern_unchecked(pattern);
608        }
609    }
610
611    /// Scan content for secrets
612    pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
613        let mut findings = Vec::new();
614
615        for (line_num, line) in content.lines().enumerate() {
616            let trimmed = line.trim();
617
618            // Detect if line is likely a comment - skip entirely to reduce false positives
619            // Real secrets shouldn't be in comments anyway
620            let is_jsdoc_continuation =
621                trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
622            let is_comment = trimmed.starts_with("//")
623                || trimmed.starts_with('#')
624                || trimmed.starts_with("/*")
625                || trimmed.starts_with('*')
626                || is_jsdoc_continuation;
627
628            // Skip obvious false positives (example docs, placeholders, comments)
629            let is_obvious_false_positive = is_comment
630                || RE_EXAMPLE_WORD.is_match(trimmed)
631                || trimmed.to_lowercase().contains("placeholder")
632                || trimmed.contains("xxxxx");
633
634            if is_obvious_false_positive {
635                continue;
636            }
637
638            for pattern in &self.patterns {
639                // Use find_iter to catch ALL matches on a line, not just the first
640                for m in pattern.regex.find_iter(line) {
641                    let matched = m.as_str();
642
643                    // Check allowlist
644                    if self.allowlist.iter().any(|a| matched.contains(a)) {
645                        continue;
646                    }
647
648                    findings.push(SecretFinding {
649                        kind: pattern.kind,
650                        file: file_path.to_owned(),
651                        line: (line_num + 1) as u32,
652                        pattern: redact(matched),
653                        severity: pattern.severity,
654                        in_comment: false, // Non-comment lines only now
655                    });
656                }
657            }
658
659            // Check custom patterns
660            for custom in &self.custom_patterns {
661                for m in custom.regex.find_iter(line) {
662                    let matched = m.as_str();
663
664                    // Check allowlist
665                    if self.allowlist.iter().any(|a| matched.contains(a)) {
666                        continue;
667                    }
668
669                    findings.push(SecretFinding {
670                        kind: SecretKind::Generic,
671                        file: file_path.to_owned(),
672                        line: (line_num + 1) as u32,
673                        pattern: redact(matched),
674                        severity: custom.severity,
675                        in_comment: false,
676                    });
677                }
678            }
679        }
680
681        findings
682    }
683
684    /// Scan a file and return whether it's safe to include
685    pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
686        let findings = self.scan(content, file_path);
687        findings.iter().all(|f| f.severity < Severity::High)
688    }
689
690    /// Get summary of findings
691    pub fn summarize(findings: &[SecretFinding]) -> String {
692        if findings.is_empty() {
693            return "No secrets detected".to_owned();
694        }
695
696        let critical = findings
697            .iter()
698            .filter(|f| f.severity == Severity::Critical)
699            .count();
700        let high = findings
701            .iter()
702            .filter(|f| f.severity == Severity::High)
703            .count();
704
705        format!(
706            "Found {} potential secrets ({} critical, {} high severity)",
707            findings.len(),
708            critical,
709            high
710        )
711    }
712
713    /// Redact secrets from content, returning the redacted content
714    /// This replaces detected secrets with redacted versions in the actual content
715    ///
716    /// # Implementation Note
717    /// Uses a two-pass approach to handle multiple secrets on the same line correctly:
718    /// 1. First pass: collect all matches with their positions
719    /// 2. Second pass: replace in reverse order (right to left) so positions don't shift
720    pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
721        // Collect all matches that need redaction: (start_byte, end_byte, redacted_text)
722        let mut replacements: Vec<(usize, usize, String)> = Vec::new();
723
724        let mut current_byte_offset = 0usize;
725        for line in content.lines() {
726            let trimmed = line.trim();
727
728            // Skip obvious false positives (example docs, placeholders)
729            let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
730                || trimmed.to_lowercase().contains("placeholder")
731                || trimmed.contains("xxxxx");
732
733            if !is_obvious_false_positive {
734                // Check built-in patterns
735                for pattern in &self.patterns {
736                    if pattern.severity >= Severity::High {
737                        for m in pattern.regex.find_iter(line) {
738                            let matched = m.as_str();
739
740                            // Check allowlist
741                            if self.allowlist.iter().any(|a| matched.contains(a)) {
742                                continue;
743                            }
744
745                            let start = current_byte_offset + m.start();
746                            let end = current_byte_offset + m.end();
747                            replacements.push((start, end, redact(matched)));
748                        }
749                    }
750                }
751
752                // Check custom patterns
753                for custom in &self.custom_patterns {
754                    if custom.severity >= Severity::High {
755                        for m in custom.regex.find_iter(line) {
756                            let matched = m.as_str();
757
758                            // Check allowlist
759                            if self.allowlist.iter().any(|a| matched.contains(a)) {
760                                continue;
761                            }
762
763                            let start = current_byte_offset + m.start();
764                            let end = current_byte_offset + m.end();
765                            replacements.push((start, end, redact(matched)));
766                        }
767                    }
768                }
769            }
770
771            // Move to next line (+1 for newline character)
772            current_byte_offset += line.len() + 1;
773        }
774
775        // Sort replacements by length first (shorter = more specific), then by position
776        // This ensures more specific patterns (Stripe key) are preferred over
777        // generic patterns (api_key=xxx) that might include the key name
778        replacements.sort_by(|a, b| {
779            let a_len = a.1 - a.0;
780            let b_len = b.1 - b.0;
781            a_len.cmp(&b_len).then(a.0.cmp(&b.0))
782        });
783
784        // Remove overlapping ranges, keeping the more specific (shorter) match
785        // Since we sorted by length first, shorter matches are processed first
786        let mut filtered: Vec<(usize, usize, String)> = Vec::new();
787        for replacement in replacements {
788            // Check if this overlaps with any existing replacement
789            let overlaps = filtered.iter().any(|(start, end, _)| {
790                // Two ranges overlap if one starts before the other ends and vice versa
791                replacement.0 < *end && *start < replacement.1
792            });
793
794            if !overlaps {
795                filtered.push(replacement);
796            }
797            // If overlaps, skip this one (we already have the shorter/more specific match)
798        }
799
800        // Apply replacements in reverse order so positions don't shift
801        let mut result = content.to_owned();
802        for (start, end, redacted) in filtered.into_iter().rev() {
803            if end <= result.len() {
804                result.replace_range(start..end, &redacted);
805            }
806        }
807
808        result
809    }
810
811    /// Scan and redact all secrets from content.
812    ///
813    /// Returns a tuple of (redacted_content, findings) where:
814    /// - `redacted_content` has all detected secrets replaced with `[REDACTED]`
815    /// - `findings` is a list of all detected secrets with metadata
816    ///
817    /// # Important
818    ///
819    /// Always check the findings list to understand what was redacted and whether
820    /// the file should be excluded from context entirely.
821    #[must_use = "security findings should be reviewed"]
822    pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
823        let findings = self.scan(content, file_path);
824        let redacted = self.redact_content(content, file_path);
825        (redacted, findings)
826    }
827}
828
/// Mask a matched secret so it can be displayed without revealing it.
///
/// Inputs of eight characters or fewer are replaced entirely by asterisks.
/// Longer inputs keep a few characters at each end (a quarter of the length,
/// capped at four) and every character in between becomes `*`.
///
/// All positions are counted in characters rather than bytes, so secrets that
/// contain multi-byte UTF-8 sequences are never split inside a code point.
fn redact(s: &str) -> String {
    let total = s.chars().count();

    // Too short to reveal anything safely: mask the whole value.
    if total <= 8 {
        return "*".repeat(total);
    }

    // The same number of characters stays visible at the start and at the end.
    let visible = (total / 4).min(4);
    let tail_start = total - visible;

    s.chars()
        .enumerate()
        .map(|(idx, ch)| {
            if idx < visible || idx >= tail_start {
                ch
            } else {
                '*'
            }
        })
        .collect()
}
853
/// Unit tests for the secret scanner and the `redact` display helper.
#[cfg(test)]
mod tests {
    use super::*;

    /// An AWS access key id assigned in a Python file is reported as an AWS credential.
    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
    }

    /// A `ghp_` token in an env file is reported as a GitHub token.
    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
    }

    /// A PEM private-key header is reported as a private key.
    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::PrivateKey));
    }

    /// After allowlisting "EXAMPLE", content containing that text yields no findings.
    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    /// Long values keep four characters at each end; short values are fully masked.
    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    /// `redact` must count characters, not bytes, for multi-byte input.
    #[test]
    fn test_redact_unicode_safety() {
        // `redact` returns a `String`, which is valid UTF-8 by construction, so
        // re-validating its bytes can never fail. Pin the exact output instead:
        // a regression to byte-based slicing would either panic or change it.

        // Chinese characters (3 bytes each): 20 chars -> 4 kept each side, 12 masked
        assert_eq!(
            redact("密钥ABCDEFGHIJKLMNOP密钥"),
            format!("密钥AB{}OP密钥", "*".repeat(12))
        );

        // Emoji (4 bytes each): 18 chars -> 4 kept each side, 10 masked
        assert_eq!(
            redact("🔑ABCDEFGHIJKLMNOP🔒"),
            format!("🔑ABC{}NOP🔒", "*".repeat(10))
        );

        // Cyrillic (2 bytes each): 16 chars -> 4 kept each side, 8 masked
        assert_eq!(
            redact("абвгдежзийклмноп"),
            format!("абвг{}мноп", "*".repeat(8))
        );

        // Short Unicode strings are masked entirely, one asterisk per character
        assert_eq!(redact("密钥"), "**"); // 2 characters
    }

    /// Boundary behaviour of `redact` around the eight-character threshold.
    #[test]
    fn test_redact_edge_cases() {
        // Empty string
        assert_eq!(redact(""), "");

        // Single character
        assert_eq!(redact("x"), "*");

        // Exactly 8 characters (boundary)
        assert_eq!(redact("12345678"), "********");

        // 9 characters is the first length that reveals anything:
        // 9 / 4 = 2 characters are kept at each end, the other 5 are masked.
        assert_eq!(redact("123456789"), "12*****89");
    }

    /// A secret-looking assignment on a `#` comment line produces no findings.
    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    /// The same assignment outside a comment is reported with `in_comment == false`.
    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    /// A match of a user-supplied pattern is reported with the generic kind.
    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner
            .add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}")
            .unwrap();

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(findings.iter().any(|f| f.kind == SecretKind::Generic));
    }

    /// Several custom patterns can be registered in one call and then match.
    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner
            .add_custom_patterns(&[
                r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
                r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
            ])
            .unwrap();

        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    /// An invalid regex is rejected with an error carrying the pattern and a message.
    #[test]
    fn test_invalid_custom_pattern_returns_error() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        let result = scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // Should return an error with details
        assert!(result.is_err(), "Invalid regex should return error");
        let err = result.unwrap_err();
        match err {
            SecurityError::InvalidPattern { pattern, message } => {
                assert_eq!(pattern, r"INVALID_[PATTERN");
                assert!(!message.is_empty(), "Error message should not be empty");
            },
        }
    }

    /// The `_unchecked` variant accepts an invalid regex without panicking.
    #[test]
    fn test_invalid_custom_pattern_unchecked() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket (silently ignored with _unchecked)
        scanner.add_custom_pattern_unchecked(r"INVALID_[PATTERN");

        // Should not panic, invalid patterns are ignored
        let content = "INVALID_[PATTERN here";
        let _findings = scanner.scan(content, "test.py");
    }

    /// Two tokens on one line are both found and both removed by redaction.
    #[test]
    fn test_multiple_secrets_same_line() {
        let scanner = SecurityScanner::new();

        // Two GitHub tokens on the same line
        let content = r#"TOKEN1="ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" TOKEN2="ghp_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb""#;

        let findings = scanner.scan(content, "test.env");
        assert_eq!(findings.len(), 2, "Should detect both tokens on the same line");

        // Test redaction of multiple secrets on same line
        let (redacted, _) = scanner.scan_and_redact(content, "test.env");

        // Both tokens should be redacted
        assert!(
            !redacted.contains("ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
            "First token should be redacted"
        );
        assert!(
            !redacted.contains("ghp_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
            "Second token should be redacted"
        );
        assert!(redacted.contains('*'), "Redacted content should contain asterisks");
    }

    /// Redaction leaves the line count and the untouched lines intact.
    #[test]
    fn test_redaction_preserves_structure() {
        let scanner = SecurityScanner::new();
        let content = "line1\napi_key = 'secret_key_12345678901234567890'\nline3";

        let (redacted, _) = scanner.scan_and_redact(content, "test.py");

        // Should preserve newlines and structure
        let lines: Vec<&str> = redacted.lines().collect();
        assert_eq!(lines.len(), 3, "Should preserve line count");
        assert_eq!(lines[0], "line1");
        assert_eq!(lines[2], "line3");
    }
}