infiniloom_engine/security.rs
1//! Security scanning for secrets and sensitive data
2//!
3//! This module provides automatic detection and redaction of secrets, API keys,
4//! tokens, and other sensitive data before sharing code with LLMs or external services.
5//!
6//! # Quick Start
7//!
8//! ```rust
9//! use infiniloom_engine::security::SecurityScanner;
10//!
11//! let scanner = SecurityScanner::new();
12//! let code = r#"
13//! const AWS_KEY = "AKIAIOSFODNN7EXAMPLE";
14//! const API_TOKEN = "sk-proj-abc123xyz789";
15//! "#;
16//!
17//! // Scan for secrets
18//! let findings = scanner.scan(code, "config.rs");
19//!
20//! if !findings.is_empty() {
21//! println!("⚠️ Found {} secrets!", findings.len());
22//! for finding in &findings {
23//! println!(" {} on line {}: {}",
24//! finding.kind.name(),
25//! finding.line,
26//! finding.pattern); // Already redacted: "AKIA************MPLE"
27//! }
28//! }
29//! ```
30//!
31//! # Scanning with Detailed Results
32//!
33//! The scanner returns structured findings with metadata:
34//!
35//! ```rust
36//! use infiniloom_engine::security::{SecurityScanner, Severity};
37//!
38//! let scanner = SecurityScanner::new();
39//! let findings = scanner.scan(r#"
40//! DB_URL = "postgresql://user:pass@localhost/db"
41//! STRIPE_KEY = "sk_live_abc123xyz789"
42//! "#, ".env");
43//!
44//! for finding in findings {
45//! match finding.severity {
46//! Severity::Critical => println!("🔴 CRITICAL: {}", finding.pattern),
47//! Severity::High => println!("🟠 HIGH: {}", finding.pattern),
48//! Severity::Medium => println!("🟡 MEDIUM: {}", finding.pattern),
49//! Severity::Low => println!("🟢 LOW: {}", finding.pattern),
50//! }
51//! }
52//! ```
53//!
54//! # Automatic Redaction
55//!
//! Replace detected secrets with masked versions (the middle of each secret becomes `*`):
57//!
58//! ```rust
59//! use infiniloom_engine::security::SecurityScanner;
60//!
61//! let scanner = SecurityScanner::new();
62//! let code = r#"
63//! const apiKey = "sk-proj-secret123";
64//! const githubToken = "ghp_abcdefghijklmnopqrstuvwxyz1234567890";
65//! "#;
66//!
67//! // Scan and redact in one operation
68//! let (redacted, findings) = scanner.scan_and_redact(code, "api.ts");
69//!
70//! println!("Original had {} secrets", findings.len());
71//! println!("Redacted version:\n{}", redacted);
72//! // Output: const apiKey = "sk-p****ect123";
73//! // const githubToken = "ghp_****7890";
74//! ```
75//!
76//! # Custom Patterns
77//!
78//! Add organization-specific secret patterns:
79//!
80//! ```rust,no_run
81//! use infiniloom_engine::security::SecurityScanner;
82//!
83//! let mut scanner = SecurityScanner::new();
84//!
85//! // Add custom patterns for internal systems
//! scanner.add_custom_pattern(r"MYCOMPANY_API_[A-Z0-9]{32}").unwrap();
//! scanner.add_custom_pattern(r"INTERNAL_TOKEN_[a-f0-9]{64}").unwrap();
//!
//! // Or add multiple at once
//! scanner.add_custom_patterns(&[
//!     "ORG_SECRET_[A-Z0-9]{16}".to_string(),
//!     "DEPLOY_KEY_[a-z0-9]{40}".to_string(),
//! ]).unwrap();
//!
//! // Now scan with both built-in and custom patterns
//! let findings = scanner.scan(r#"
//! MYCOMPANY_API_ABCD1234EFGH5678IJKL9012MNOP3456
//! "#, "internal.rs");
99//!
100//! assert!(!findings.is_empty());
101//! ```
102//!
103//! # Allowlist for Test Data
104//!
105//! Mark known test/example secrets as safe:
106//!
107//! ```rust
108//! use infiniloom_engine::security::SecurityScanner;
109//!
110//! let mut scanner = SecurityScanner::new();
111//!
112//! // Allowlist test keys that are intentionally public
113//! scanner.allowlist("EXAMPLE");
114//! scanner.allowlist("test_key");
115//! scanner.allowlist("mock_secret");
116//!
117//! // This won't trigger detection (contains "EXAMPLE")
118//! let test_code = r#"
119//! AWS_KEY = "AKIAIOSFODNN7EXAMPLE" // Official AWS test key
120//! "#;
121//!
122//! let findings = scanner.scan(test_code, "test.rs");
123//! assert!(findings.is_empty(), "Test keys should be allowed");
124//!
125//! // But this WILL trigger (real key format)
126//! let prod_code = r#"
127//! AWS_KEY = "AKIAIOSFODNN7PRODKEY"
128//! "#;
129//!
130//! let findings = scanner.scan(prod_code, "prod.rs");
131//! assert!(!findings.is_empty(), "Real keys should be detected");
132//! ```
133//!
134//! # Repository Integration
135//!
136//! Scan all files in a repository:
137//!
138//! ```rust,ignore
139//! use infiniloom_engine::security::SecurityScanner;
140//!
141//! let scanner = SecurityScanner::new();
142//! let mut all_findings = Vec::new();
143//!
144//! for file in repository.files {
145//! let findings = scanner.scan(&file.content, &file.relative_path);
146//! all_findings.extend(findings);
147//! }
148//!
149//! if !all_findings.is_empty() {
150//! eprintln!("⚠️ Security scan found {} secrets across {} files",
151//! all_findings.len(),
152//! all_findings.iter()
153//! .map(|f| &f.file)
154//! .collect::<std::collections::HashSet<_>>()
155//! .len()
156//! );
157//!
158//! // Exit with error in CI/CD
159//! std::process::exit(1);
160//! }
161//! ```
162//!
163//! # Severity-Based Filtering
164//!
165//! Work with different severity levels:
166//!
167//! ```rust
168//! use infiniloom_engine::security::{SecurityScanner, Severity};
169//!
170//! let scanner = SecurityScanner::new();
171//! let findings = scanner.scan(r#"
172//! AWS_KEY = "AKIAIOSFODNN7PRODKEY" # Critical
//! password = "weak1234" # High
174//! "#, ".env");
175//!
176//! // Count by severity
177//! let critical_count = findings.iter()
178//! .filter(|f| f.severity == Severity::Critical)
179//! .count();
180//!
181//! let high_count = findings.iter()
182//! .filter(|f| f.severity == Severity::High)
183//! .count();
184//!
185//! println!("Critical: {}, High: {}", critical_count, high_count);
186//!
187//! // Check if safe to proceed (only low/medium severity)
188//! let is_safe = findings.iter()
189//! .all(|f| f.severity < Severity::High);
190//!
191//! if !is_safe {
192//! eprintln!("⛔ Cannot proceed - high/critical secrets detected");
193//! }
194//! ```
195//!
196//! # Supported Secret Types
197//!
198//! ## Cloud Credentials (Critical Severity)
199//! - **AWS**: Access keys (AKIA...), Secret access keys
200//! - **GitHub**: Personal access tokens (ghp_..., github_pat_...), OAuth tokens
201//! - **Private Keys**: RSA, EC, DSA, OpenSSH private keys
202//!
203//! ## API Keys (Critical Severity)
204//! - **OpenAI**: sk-... API keys
205//! - **Anthropic**: sk-ant-... API keys
206//! - **Stripe**: sk_live_..., pk_test_... keys
207//!
208//! ## Service Tokens (High Severity)
209//! - **Slack**: xoxb-..., xoxa-... tokens
210//! - **JWT**: Encoded JSON Web Tokens
211//! - **Database**: Connection strings (PostgreSQL, MongoDB, MySQL, Redis, etc.)
212//!
213//! ## Generic Secrets (High Severity)
214//! - Generic API keys (api_key=...)
215//! - Access tokens (token=..., secret=...)
216//! - Passwords (password=...)
217//!
218//! # Why Pre-compiled Patterns?
219//!
220//! The module uses `once_cell::sync::Lazy` for regex patterns:
221//!
222//! ```rust,ignore
223//! static RE_AWS_KEY: Lazy<Regex> =
224//! Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").unwrap());
225//! ```
226//!
227//! **Benefits**:
228//! - Compiled once at first use
229//! - Reused across all scanner instances
230//! - Thread-safe sharing
//! - No recompilation after the first use
232//!
233//! **Pattern Order**: More specific patterns (Stripe, Slack, JWT) come BEFORE
234//! generic patterns (api_key, secret) to ensure accurate detection and avoid
235//! masking by broader patterns.
236//!
237//! # False Positive Reduction
238//!
239//! The scanner automatically skips:
240//! - **Comments**: Lines starting with //, #, /*, *
241//! - **Documentation**: Lines containing "example" as a word
242//! - **Placeholders**: Lines with "xxxxx" or "placeholder"
243//! - **Allowlisted patterns**: User-configured safe patterns
244//!
245//! This reduces false positives in documentation, test files, and examples
246//! while catching real secrets in code.
247
248use once_cell::sync::Lazy;
249use regex::Regex;
250use std::collections::HashSet;
251
// Helper regex for standalone-word "example" detection. The scanner uses it to skip
// whole lines that look like documentation or tutorial content.
static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
    // Case-insensitive match of "example" only when the character on each side is
    // either the start/end of the input or something other than an ASCII letter,
    // digit, or dot.
    //
    // Consequences of that character class:
    // - "AKIAIOSFODNN7EXAMPLE" does NOT match (a digit precedes "EXAMPLE"), so such
    //   keys still reach the AWS key pattern (RE_AWS_KEY).
    // - "db.example.com" does NOT match (dots count as word characters here), so
    //   lines containing example domains are still scanned.
    // - "# Example:" or "// example usage" DO match and the line is skipped.
    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)")
        .expect("RE_EXAMPLE_WORD: invalid regex pattern")
});
267
// Pre-compiled regex patterns (compiled lazily on first use, then shared by every
// scanner instance through `&'static` references).

// AWS access key ID: literal "AKIA" followed by 16 uppercase letters/digits.
static RE_AWS_KEY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").expect("RE_AWS_KEY: invalid regex pattern"));
// AWS secret access key assignment: the key name (case-insensitive, `_`/`-` optional),
// a `:` or `=`, optional quotes, then exactly 40 base64-style characters.
static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
        .expect("RE_AWS_SECRET: invalid regex pattern")
});
// GitHub Personal Access Token (classic) - 36 alphanumeric chars after the `ghp_` prefix
static RE_GITHUB_PAT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").expect("RE_GITHUB_PAT: invalid regex pattern"));
// GitHub fine-grained PAT: `github_pat_` + 22 alphanumerics + `_` + 59 alphanumerics
static RE_GITHUB_FINE_PAT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}")
        .expect("RE_GITHUB_FINE_PAT: invalid regex pattern")
});
// GitHub OAuth, user-to-server, server-to-server, and refresh tokens
// (prefixes `gho_`, `ghu_`, `ghr_`, `ghs_`) with at least 36 alphanumerics.
static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").expect("RE_GITHUB_OTHER_TOKENS: invalid regex pattern")
});
// PEM private key header (generic, RSA, EC, DSA, or OpenSSH). Only the BEGIN line is
// matched, not the key body.
static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
        .expect("RE_PRIVATE_KEY: invalid regex pattern")
});
// Generic API key assignment: `api_key` / `api-key` / `apikey` (case-insensitive),
// `:` or `=`, optional quotes, then at least 20 characters from [A-Za-z0-9_-].
static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_API_KEY: invalid regex pattern")
});
// Generic secret/token assignment: a name ending in `secret` or `token`
// (case-insensitive), `:` or `=`, optional quotes, then at least 20 characters.
static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_SECRET_TOKEN: invalid regex pattern")
});
// Password assignment: `password` (case-insensitive), `:` or `=`, optional quotes, then
// at least 8 characters that are not quotes or whitespace. Shorter values do not match.
static RE_PASSWORD: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#)
        .expect("RE_PASSWORD: invalid regex pattern")
});
// Database connection URL: a known scheme followed by `://` and everything up to the
// next whitespace or quote.
static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
    // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
    Regex::new(
        r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#,
    )
    .expect("RE_CONN_STRING: invalid regex pattern")
});
// JSON Web Token: three dot-separated base64url segments where the first two start
// with `eyJ` (the base64url encoding of `{"`).
static RE_JWT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*")
        .expect("RE_JWT: invalid regex pattern")
});
// Slack token: `xoxb-` / `xoxa-` / `xoxp-` / `xoxr-` / `xoxs-`, two numeric segments of
// 10-13 digits, then a 24-character alphanumeric segment.
static RE_SLACK: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}")
        .expect("RE_SLACK: invalid regex pattern")
});
// Stripe key: `sk_` or `pk_`, `test_` or `live_`, then at least 24 alphanumerics.
static RE_STRIPE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}")
        .expect("RE_STRIPE: invalid regex pattern")
});
// OpenAI API keys: `sk-` followed by at least 32 letters, digits, underscores, or hyphens.
//
// NOTE: the character class includes `-`, so a sufficiently long Anthropic key
// (`sk-ant-...`) also satisfies this pattern. `SecurityScanner::scan` reports the
// matches of every pattern (it does not stop at the first matching pattern), so such a
// key can produce one finding from RE_ANTHROPIC and another from RE_OPENAI.
static RE_OPENAI: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"sk-[A-Za-z0-9_-]{32,}").expect("RE_OPENAI: invalid regex pattern"));
// Anthropic API keys: `sk-ant-` followed by at least 40 letters, digits, or hyphens.
static RE_ANTHROPIC: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").expect("RE_ANTHROPIC: invalid regex pattern")
});
332
/// Error type for security scanning operations.
///
/// Returned by [`SecurityScanner::add_custom_pattern`] and
/// [`SecurityScanner::add_custom_patterns`] when a user-supplied regex does not compile.
#[derive(Debug, Clone)]
pub enum SecurityError {
    /// Invalid regex pattern for custom secret detection
    InvalidPattern {
        /// The pattern text exactly as supplied by the caller
        pattern: String,
        /// The error message produced by regex compilation
        message: String,
    },
}
344
345impl std::fmt::Display for SecurityError {
346 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
347 match self {
348 Self::InvalidPattern { pattern, message } => {
349 write!(f, "Invalid regex pattern '{}': {}", pattern, message)
350 },
351 }
352 }
353}
354
// Empty impl: `SecurityError` relies on the trait's default methods, with `Debug` and
// `Display` supplying the required supertraits.
impl std::error::Error for SecurityError {}
356
/// A detected secret or sensitive data item, as produced by [`SecurityScanner::scan`].
#[derive(Debug, Clone)]
pub struct SecretFinding {
    /// Type of secret
    pub kind: SecretKind,
    /// File path, copied from the `file_path` argument passed to the scanner
    pub file: String,
    /// 1-based line number within the scanned content
    pub line: u32,
    /// Matched text in masked form (never the raw secret)
    pub pattern: String,
    /// Severity level
    pub severity: Severity,
    /// Whether the secret was found in a comment (may be example/documentation).
    ///
    /// `SecurityScanner::scan` skips comment lines entirely, so it always sets this
    /// to `false`.
    pub in_comment: bool,
}
373
/// Kind of secret detected.
///
/// Several built-in patterns can map to the same kind (for example, every GitHub token
/// pattern reports [`SecretKind::GitHubToken`]); custom patterns always report
/// [`SecretKind::Generic`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// API key
    ApiKey,
    /// Access token
    AccessToken,
    /// Private key
    PrivateKey,
    /// Password
    Password,
    /// Database connection string
    ConnectionString,
    /// AWS credentials
    AwsCredential,
    /// GitHub token
    GitHubToken,
    /// Generic secret
    Generic,
}
394
395impl SecretKind {
396 /// Get human-readable name
397 pub fn name(&self) -> &'static str {
398 match self {
399 Self::ApiKey => "API Key",
400 Self::AccessToken => "Access Token",
401 Self::PrivateKey => "Private Key",
402 Self::Password => "Password",
403 Self::ConnectionString => "Connection String",
404 Self::AwsCredential => "AWS Credential",
405 Self::GitHubToken => "GitHub Token",
406 Self::Generic => "Generic Secret",
407 }
408 }
409}
410
/// Severity level of a finding.
///
/// The derived `PartialOrd`/`Ord` follow declaration order, so
/// `Low < Medium < High < Critical`. Code in this module relies on that ordering
/// (e.g. `severity < Severity::High`); do not reorder the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Lowest severity
    Low,
    /// Above `Low`, below `High`
    Medium,
    /// Used by the service-token, generic, and custom patterns
    High,
    /// Highest severity; used by the cloud-credential and provider API key patterns
    Critical,
}
419
/// Security scanner that detects and masks secrets in text content.
pub struct SecurityScanner {
    /// Built-in patterns, in the order they are applied to each line
    patterns: Vec<SecretPattern>,
    /// User-supplied patterns, applied after the built-in ones
    custom_patterns: Vec<CustomSecretPattern>,
    /// Substrings that suppress a match when the matched text contains them
    allowlist: HashSet<String>,
}
426
/// Built-in secret pattern backed by one of the module's pre-compiled static regexes.
struct SecretPattern {
    /// Kind reported for matches of this pattern
    kind: SecretKind,
    /// Reference to the shared, lazily compiled regex
    regex: &'static Lazy<Regex>,
    /// Severity reported for matches of this pattern
    severity: Severity,
}
432
/// Custom user-defined secret pattern.
///
/// Unlike the built-in patterns, the regex is owned by the scanner instance that
/// compiled it. Matches are reported as `SecretKind::Generic`.
struct CustomSecretPattern {
    /// Regex compiled from the caller's pattern string
    regex: Regex,
    /// Severity reported for matches of this pattern
    severity: Severity,
}
438
439impl Default for SecurityScanner {
440 fn default() -> Self {
441 Self::new()
442 }
443}
444
445impl SecurityScanner {
446 /// Create a new security scanner with default patterns
447 /// Uses pre-compiled static regex patterns for optimal performance
448 ///
449 /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
450 /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
451 /// and redaction.
452 pub fn new() -> Self {
453 let patterns = vec![
454 // === Critical: Specific cloud credentials (most specific patterns first) ===
455 // AWS
456 SecretPattern {
457 kind: SecretKind::AwsCredential,
458 regex: &RE_AWS_KEY,
459 severity: Severity::Critical,
460 },
461 SecretPattern {
462 kind: SecretKind::AwsCredential,
463 regex: &RE_AWS_SECRET,
464 severity: Severity::Critical,
465 },
466 // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
467 SecretPattern {
468 kind: SecretKind::GitHubToken,
469 regex: &RE_GITHUB_PAT,
470 severity: Severity::Critical,
471 },
472 SecretPattern {
473 kind: SecretKind::GitHubToken,
474 regex: &RE_GITHUB_FINE_PAT,
475 severity: Severity::Critical,
476 },
477 SecretPattern {
478 kind: SecretKind::GitHubToken,
479 regex: &RE_GITHUB_OTHER_TOKENS,
480 severity: Severity::Critical,
481 },
482 // Private keys
483 SecretPattern {
484 kind: SecretKind::PrivateKey,
485 regex: &RE_PRIVATE_KEY,
486 severity: Severity::Critical,
487 },
488 // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
489 SecretPattern {
490 kind: SecretKind::ApiKey,
491 regex: &RE_ANTHROPIC,
492 severity: Severity::Critical,
493 },
494 // OpenAI API keys (must come before Stripe since sk- is more general)
495 SecretPattern {
496 kind: SecretKind::ApiKey,
497 regex: &RE_OPENAI,
498 severity: Severity::Critical,
499 },
500 // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
501 SecretPattern {
502 kind: SecretKind::ApiKey,
503 regex: &RE_STRIPE,
504 severity: Severity::Critical,
505 },
506 // === High: Specific service tokens (must come before generic patterns) ===
507 // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
508 SecretPattern {
509 kind: SecretKind::AccessToken,
510 regex: &RE_SLACK,
511 severity: Severity::High,
512 },
513 // JWT tokens (specific pattern: eyJ...eyJ...signature)
514 SecretPattern {
515 kind: SecretKind::AccessToken,
516 regex: &RE_JWT,
517 severity: Severity::High,
518 },
519 // Connection strings (specific pattern: mongodb://, postgres://, etc.)
520 SecretPattern {
521 kind: SecretKind::ConnectionString,
522 regex: &RE_CONN_STRING,
523 severity: Severity::High,
524 },
525 // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
526 // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
527 SecretPattern {
528 kind: SecretKind::ApiKey,
529 regex: &RE_API_KEY,
530 severity: Severity::High,
531 },
532 // Generic secrets (matches secret=xxx, token=xxx, etc.)
533 SecretPattern {
534 kind: SecretKind::Generic,
535 regex: &RE_SECRET_TOKEN,
536 severity: Severity::High,
537 },
538 // Passwords
539 SecretPattern {
540 kind: SecretKind::Password,
541 regex: &RE_PASSWORD,
542 severity: Severity::High,
543 },
544 ];
545
546 Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
547 }
548
549 /// Add a pattern to allowlist
550 pub fn allowlist(&mut self, pattern: &str) {
551 self.allowlist.insert(pattern.to_owned());
552 }
553
554 /// Add a custom regex pattern for secret detection
555 ///
556 /// Custom patterns are matched as generic secrets with High severity.
557 /// Returns an error if the regex pattern is invalid.
558 ///
559 /// # Example
560 /// ```
561 /// use infiniloom_engine::security::SecurityScanner;
562 ///
563 /// let mut scanner = SecurityScanner::new();
564 /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}").unwrap();
565 /// ```
566 ///
567 /// # Errors
568 /// Returns `SecurityError::InvalidPattern` if the regex pattern is invalid.
569 pub fn add_custom_pattern(&mut self, pattern: &str) -> Result<(), SecurityError> {
570 let regex = Regex::new(pattern).map_err(|e| SecurityError::InvalidPattern {
571 pattern: pattern.to_owned(),
572 message: e.to_string(),
573 })?;
574 self.custom_patterns
575 .push(CustomSecretPattern { regex, severity: Severity::High });
576 Ok(())
577 }
578
579 /// Add a custom regex pattern, ignoring invalid patterns
580 ///
581 /// This is a convenience method that silently ignores invalid patterns.
582 /// Use [`add_custom_pattern`] if you need to handle errors.
583 pub fn add_custom_pattern_unchecked(&mut self, pattern: &str) {
584 let _ = self.add_custom_pattern(pattern);
585 }
586
587 /// Add multiple custom patterns at once
588 ///
589 /// Returns the first error encountered, if any. Patterns before the error
590 /// will have been added successfully.
591 ///
592 /// # Errors
593 /// Returns `SecurityError::InvalidPattern` if any regex pattern is invalid.
594 pub fn add_custom_patterns(&mut self, patterns: &[String]) -> Result<(), SecurityError> {
595 for pattern in patterns {
596 self.add_custom_pattern(pattern)?;
597 }
598 Ok(())
599 }
600
601 /// Add multiple custom patterns, ignoring invalid patterns
602 ///
603 /// This is a convenience method that silently ignores invalid patterns.
604 /// Use [`add_custom_patterns`] if you need to handle errors.
605 pub fn add_custom_patterns_unchecked(&mut self, patterns: &[String]) {
606 for pattern in patterns {
607 self.add_custom_pattern_unchecked(pattern);
608 }
609 }
610
611 /// Scan content for secrets
612 pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
613 let mut findings = Vec::new();
614
615 for (line_num, line) in content.lines().enumerate() {
616 let trimmed = line.trim();
617
618 // Detect if line is likely a comment - skip entirely to reduce false positives
619 // Real secrets shouldn't be in comments anyway
620 let is_jsdoc_continuation =
621 trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
622 let is_comment = trimmed.starts_with("//")
623 || trimmed.starts_with('#')
624 || trimmed.starts_with("/*")
625 || trimmed.starts_with('*')
626 || is_jsdoc_continuation;
627
628 // Skip obvious false positives (example docs, placeholders, comments)
629 let is_obvious_false_positive = is_comment
630 || RE_EXAMPLE_WORD.is_match(trimmed)
631 || trimmed.to_lowercase().contains("placeholder")
632 || trimmed.contains("xxxxx");
633
634 if is_obvious_false_positive {
635 continue;
636 }
637
638 for pattern in &self.patterns {
639 // Use find_iter to catch ALL matches on a line, not just the first
640 for m in pattern.regex.find_iter(line) {
641 let matched = m.as_str();
642
643 // Check allowlist
644 if self.allowlist.iter().any(|a| matched.contains(a)) {
645 continue;
646 }
647
648 findings.push(SecretFinding {
649 kind: pattern.kind,
650 file: file_path.to_owned(),
651 line: (line_num + 1) as u32,
652 pattern: redact(matched),
653 severity: pattern.severity,
654 in_comment: false, // Non-comment lines only now
655 });
656 }
657 }
658
659 // Check custom patterns
660 for custom in &self.custom_patterns {
661 for m in custom.regex.find_iter(line) {
662 let matched = m.as_str();
663
664 // Check allowlist
665 if self.allowlist.iter().any(|a| matched.contains(a)) {
666 continue;
667 }
668
669 findings.push(SecretFinding {
670 kind: SecretKind::Generic,
671 file: file_path.to_owned(),
672 line: (line_num + 1) as u32,
673 pattern: redact(matched),
674 severity: custom.severity,
675 in_comment: false,
676 });
677 }
678 }
679 }
680
681 findings
682 }
683
684 /// Scan a file and return whether it's safe to include
685 pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
686 let findings = self.scan(content, file_path);
687 findings.iter().all(|f| f.severity < Severity::High)
688 }
689
690 /// Get summary of findings
691 pub fn summarize(findings: &[SecretFinding]) -> String {
692 if findings.is_empty() {
693 return "No secrets detected".to_owned();
694 }
695
696 let critical = findings
697 .iter()
698 .filter(|f| f.severity == Severity::Critical)
699 .count();
700 let high = findings
701 .iter()
702 .filter(|f| f.severity == Severity::High)
703 .count();
704
705 format!(
706 "Found {} potential secrets ({} critical, {} high severity)",
707 findings.len(),
708 critical,
709 high
710 )
711 }
712
713 /// Redact secrets from content, returning the redacted content
714 /// This replaces detected secrets with redacted versions in the actual content
715 ///
716 /// # Implementation Note
717 /// Uses a two-pass approach to handle multiple secrets on the same line correctly:
718 /// 1. First pass: collect all matches with their positions
719 /// 2. Second pass: replace in reverse order (right to left) so positions don't shift
720 pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
721 // Collect all matches that need redaction: (start_byte, end_byte, redacted_text)
722 let mut replacements: Vec<(usize, usize, String)> = Vec::new();
723
724 let mut current_byte_offset = 0usize;
725 for line in content.lines() {
726 let trimmed = line.trim();
727
728 // Skip obvious false positives (example docs, placeholders)
729 let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
730 || trimmed.to_lowercase().contains("placeholder")
731 || trimmed.contains("xxxxx");
732
733 if !is_obvious_false_positive {
734 // Check built-in patterns
735 for pattern in &self.patterns {
736 if pattern.severity >= Severity::High {
737 for m in pattern.regex.find_iter(line) {
738 let matched = m.as_str();
739
740 // Check allowlist
741 if self.allowlist.iter().any(|a| matched.contains(a)) {
742 continue;
743 }
744
745 let start = current_byte_offset + m.start();
746 let end = current_byte_offset + m.end();
747 replacements.push((start, end, redact(matched)));
748 }
749 }
750 }
751
752 // Check custom patterns
753 for custom in &self.custom_patterns {
754 if custom.severity >= Severity::High {
755 for m in custom.regex.find_iter(line) {
756 let matched = m.as_str();
757
758 // Check allowlist
759 if self.allowlist.iter().any(|a| matched.contains(a)) {
760 continue;
761 }
762
763 let start = current_byte_offset + m.start();
764 let end = current_byte_offset + m.end();
765 replacements.push((start, end, redact(matched)));
766 }
767 }
768 }
769 }
770
771 // Move to next line (+1 for newline character)
772 current_byte_offset += line.len() + 1;
773 }
774
775 // Sort replacements by length first (shorter = more specific), then by position
776 // This ensures more specific patterns (Stripe key) are preferred over
777 // generic patterns (api_key=xxx) that might include the key name
778 replacements.sort_by(|a, b| {
779 let a_len = a.1 - a.0;
780 let b_len = b.1 - b.0;
781 a_len.cmp(&b_len).then(a.0.cmp(&b.0))
782 });
783
784 // Remove overlapping ranges, keeping the more specific (shorter) match
785 // Since we sorted by length first, shorter matches are processed first
786 let mut filtered: Vec<(usize, usize, String)> = Vec::new();
787 for replacement in replacements {
788 // Check if this overlaps with any existing replacement
789 let overlaps = filtered.iter().any(|(start, end, _)| {
790 // Two ranges overlap if one starts before the other ends and vice versa
791 replacement.0 < *end && *start < replacement.1
792 });
793
794 if !overlaps {
795 filtered.push(replacement);
796 }
797 // If overlaps, skip this one (we already have the shorter/more specific match)
798 }
799
800 // Apply replacements in reverse order so positions don't shift
801 let mut result = content.to_owned();
802 for (start, end, redacted) in filtered.into_iter().rev() {
803 if end <= result.len() {
804 result.replace_range(start..end, &redacted);
805 }
806 }
807
808 result
809 }
810
811 /// Scan and redact all secrets from content.
812 ///
813 /// Returns a tuple of (redacted_content, findings) where:
814 /// - `redacted_content` has all detected secrets replaced with `[REDACTED]`
815 /// - `findings` is a list of all detected secrets with metadata
816 ///
817 /// # Important
818 ///
819 /// Always check the findings list to understand what was redacted and whether
820 /// the file should be excluded from context entirely.
821 #[must_use = "security findings should be reviewed"]
822 pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
823 let findings = self.scan(content, file_path);
824 let redacted = self.redact_content(content, file_path);
825 (redacted, findings)
826 }
827}
828
/// Mask a matched secret for display.
///
/// Strings of up to 8 characters are replaced entirely by asterisks. Longer strings
/// keep `min(4, len / 4)` characters at each end and have everything in between
/// replaced by asterisks, so the result has the same number of characters as the input.
///
/// All arithmetic is done on `char`s rather than bytes, so multi-byte input cannot
/// cause a slicing panic.
fn redact(s: &str) -> String {
    let chars: Vec<char> = s.chars().collect();
    let total = chars.len();

    if total <= 8 {
        return "*".repeat(total);
    }

    // Same number of characters is revealed at the start and at the end.
    let keep = (total / 4).min(4);
    let hidden = total - 2 * keep;

    let mut masked = String::with_capacity(s.len());
    masked.extend(&chars[..keep]);
    masked.extend(std::iter::repeat('*').take(hidden));
    masked.extend(&chars[total - keep..]);
    masked
}
853
#[cfg(test)]
mod tests {
    use super::*;

    /// An AWS access key ID embedded in an assignment is reported as an AWS credential.
    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
    }

    /// A classic `ghp_` personal access token is reported as a GitHub token.
    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
    }

    /// A PEM private key header is reported as a private key.
    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::PrivateKey));
    }

    /// Matches containing an allowlisted substring are suppressed.
    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    /// Masking works on characters, not bytes, so multi-byte input must neither panic
    /// nor split a character.
    #[test]
    fn test_redact_unicode_safety() {
        // Chinese characters (3 bytes each): 20 chars -> 4 kept on each side
        let chinese_secret = "密钥ABCDEFGHIJKLMNOP密钥";
        assert_eq!(redact(chinese_secret), "密钥AB************OP密钥");

        // Emoji (4 bytes each): 18 chars -> 4 kept on each side
        let emoji_secret = "🔑ABCDEFGHIJKLMNOP🔒";
        assert_eq!(redact(emoji_secret), "🔑ABC**********NOP🔒");

        // Cyrillic (2 bytes each): 16 chars -> 4 kept on each side
        let mixed_secret = "абвгдежзийклмноп";
        assert_eq!(redact(mixed_secret), "абвг********мноп");

        // Short Unicode strings are masked completely
        let short_chinese = "密钥";
        assert_eq!(redact(short_chinese), "**"); // 2 characters
    }

    #[test]
    fn test_redact_edge_cases() {
        // Empty string
        assert_eq!(redact(""), "");

        // Single character
        assert_eq!(redact("x"), "*");

        // Exactly 8 characters (boundary): still fully masked
        assert_eq!(redact("12345678"), "********");

        // 9 characters: first length that reveals a prefix/suffix (9 / 4 = 2 each side)
        assert_eq!(redact("123456789"), "12*****89");
    }

    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner
            .add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}")
            .unwrap();

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(findings.iter().any(|f| f.kind == SecretKind::Generic));
    }

    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner
            .add_custom_patterns(&[
                r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
                r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
            ])
            .unwrap();

        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    #[test]
    fn test_invalid_custom_pattern_returns_error() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        let result = scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // Should return an error with details
        assert!(result.is_err(), "Invalid regex should return error");
        let err = result.unwrap_err();
        match err {
            SecurityError::InvalidPattern { pattern, message } => {
                assert_eq!(pattern, r"INVALID_[PATTERN");
                assert!(!message.is_empty(), "Error message should not be empty");
            },
        }
    }

    #[test]
    fn test_invalid_custom_pattern_unchecked() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket (silently ignored with _unchecked)
        scanner.add_custom_pattern_unchecked(r"INVALID_[PATTERN");

        // The invalid pattern must not have been registered
        assert!(
            scanner.custom_patterns.is_empty(),
            "Invalid pattern should not be registered"
        );

        // Scanning still works and finds nothing in this harmless text
        let content = "INVALID_[PATTERN here";
        let findings = scanner.scan(content, "test.py");
        assert!(findings.is_empty(), "No pattern should match this content");
    }

    #[test]
    fn test_multiple_secrets_same_line() {
        let scanner = SecurityScanner::new();

        // Two GitHub tokens on the same line
        let content = r#"TOKEN1="ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" TOKEN2="ghp_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb""#;

        let findings = scanner.scan(content, "test.env");
        assert_eq!(findings.len(), 2, "Should detect both tokens on the same line");

        // Test redaction of multiple secrets on same line
        let (redacted, _) = scanner.scan_and_redact(content, "test.env");

        // Both tokens should be redacted
        assert!(
            !redacted.contains("ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
            "First token should be redacted"
        );
        assert!(
            !redacted.contains("ghp_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
            "Second token should be redacted"
        );
        assert!(redacted.contains('*'), "Redacted content should contain asterisks");
    }

    #[test]
    fn test_redaction_preserves_structure() {
        let scanner = SecurityScanner::new();
        let content = "line1\napi_key = 'secret_key_12345678901234567890'\nline3";

        let (redacted, _) = scanner.scan_and_redact(content, "test.py");

        // Should preserve newlines and structure
        let lines: Vec<&str> = redacted.lines().collect();
        assert_eq!(lines.len(), 3, "Should preserve line count");
        assert_eq!(lines[0], "line1");
        assert_eq!(lines[2], "line3");
    }
}
1075}