infiniloom_engine/security.rs
1//! Security scanning for secrets and sensitive data
2//!
3//! This module provides automatic detection and redaction of secrets, API keys,
4//! tokens, and other sensitive data before sharing code with LLMs or external services.
5//!
6//! # Quick Start
7//!
8//! ```rust
9//! use infiniloom_engine::security::SecurityScanner;
10//!
11//! let scanner = SecurityScanner::new();
12//! let code = r#"
13//! const AWS_KEY = "AKIAIOSFODNN7EXAMPLE";
14//! const API_TOKEN = "sk-proj-abc123xyz789";
15//! "#;
16//!
17//! // Scan for secrets
18//! let findings = scanner.scan(code, "config.rs");
19//!
20//! if !findings.is_empty() {
21//! println!("⚠️ Found {} secrets!", findings.len());
22//! for finding in &findings {
23//! println!(" {} on line {}: {}",
24//! finding.kind.name(),
25//! finding.line,
26//! finding.pattern); // Already redacted: "AKIA************MPLE"
27//! }
28//! }
29//! ```
30//!
31//! # Scanning with Detailed Results
32//!
33//! The scanner returns structured findings with metadata:
34//!
35//! ```rust
36//! use infiniloom_engine::security::{SecurityScanner, Severity};
37//!
38//! let scanner = SecurityScanner::new();
39//! let findings = scanner.scan(r#"
40//! DB_URL = "postgresql://user:pass@localhost/db"
41//! STRIPE_KEY = "sk_live_abc123xyz789"
42//! "#, ".env");
43//!
44//! for finding in findings {
45//! match finding.severity {
46//! Severity::Critical => println!("🔴 CRITICAL: {}", finding.pattern),
47//! Severity::High => println!("🟠 HIGH: {}", finding.pattern),
48//! Severity::Medium => println!("🟡 MEDIUM: {}", finding.pattern),
49//! Severity::Low => println!("🟢 LOW: {}", finding.pattern),
50//! }
51//! }
52//! ```
53//!
54//! # Automatic Redaction
55//!
56//! Mask detected secrets in place (the middle of each match is replaced with `*`):
57//!
58//! ```rust
59//! use infiniloom_engine::security::SecurityScanner;
60//!
61//! let scanner = SecurityScanner::new();
62//! let code = r#"
63//! const apiKey = "sk-proj-secret123";
64//! const githubToken = "ghp_abcdefghijklmnopqrstuvwxyz1234567890";
65//! "#;
66//!
67//! // Scan and redact in one operation
68//! let (redacted, findings) = scanner.scan_and_redact(code, "api.ts");
69//!
70//! println!("Original had {} secrets", findings.len());
71//! println!("Redacted version:\n{}", redacted);
72//! // Matches are masked in place, keeping up to four leading and trailing characters:
73//! // const githubToken = "ghp_********************************7890";
74//! ```
75//!
76//! # Custom Patterns
77//!
78//! Add organization-specific secret patterns:
79//!
80//! ```rust
81//! use infiniloom_engine::security::SecurityScanner;
82//!
83//! let mut scanner = SecurityScanner::new();
84//!
85//! // Add custom patterns for internal systems
86//! scanner.add_custom_pattern(r"MYCOMPANY_API_[A-Z0-9]{32}");
87//! scanner.add_custom_pattern(r"INTERNAL_TOKEN_[a-f0-9]{64}");
88//!
89//! // Or add multiple at once
90//! scanner.add_custom_patterns(&[
91//! "ORG_SECRET_[A-Z0-9]{16}".to_string(),
92//! "DEPLOY_KEY_[a-z0-9]{40}".to_string(),
93//! ]);
94//!
95//! // Now scan with both built-in and custom patterns
96//! let findings = scanner.scan(r#"
97//! MYCOMPANY_API_ABCD1234EFGH5678IJKL9012MNOP
98//! "#, "internal.rs");
99//!
100//! assert!(!findings.is_empty());
101//! ```
102//!
103//! # Allowlist for Test Data
104//!
105//! Mark known test/example secrets as safe:
106//!
107//! ```rust
108//! use infiniloom_engine::security::SecurityScanner;
109//!
110//! let mut scanner = SecurityScanner::new();
111//!
112//! // Allowlist test keys that are intentionally public
113//! scanner.allowlist("EXAMPLE");
114//! scanner.allowlist("test_key");
115//! scanner.allowlist("mock_secret");
116//!
117//! // This won't trigger detection (contains "EXAMPLE")
118//! let test_code = r#"
119//! AWS_KEY = "AKIAIOSFODNN7EXAMPLE" // Official AWS test key
120//! "#;
121//!
122//! let findings = scanner.scan(test_code, "test.rs");
123//! assert!(findings.is_empty(), "Test keys should be allowed");
124//!
125//! // But this WILL trigger (real key format)
126//! let prod_code = r#"
127//! AWS_KEY = "AKIAIOSFODNN7PRODKEY"
128//! "#;
129//!
130//! let findings = scanner.scan(prod_code, "prod.rs");
131//! assert!(!findings.is_empty(), "Real keys should be detected");
132//! ```
133//!
134//! # Repository Integration
135//!
136//! Scan all files in a repository:
137//!
138//! ```rust,ignore
139//! use infiniloom_engine::security::SecurityScanner;
140//!
141//! let scanner = SecurityScanner::new();
142//! let mut all_findings = Vec::new();
143//!
144//! for file in repository.files {
145//! let findings = scanner.scan(&file.content, &file.relative_path);
146//! all_findings.extend(findings);
147//! }
148//!
149//! if !all_findings.is_empty() {
150//! eprintln!("⚠️ Security scan found {} secrets across {} files",
151//! all_findings.len(),
152//! all_findings.iter()
153//! .map(|f| &f.file)
154//! .collect::<std::collections::HashSet<_>>()
155//! .len()
156//! );
157//!
158//! // Exit with error in CI/CD
159//! std::process::exit(1);
160//! }
161//! ```
162//!
163//! # Severity-Based Filtering
164//!
165//! Work with different severity levels:
166//!
167//! ```rust
168//! use infiniloom_engine::security::{SecurityScanner, Severity};
169//!
170//! let scanner = SecurityScanner::new();
171//! let findings = scanner.scan(r#"
172//! AWS_KEY = "AKIAIOSFODNN7PRODKEY" # Critical
173//! password = "weak123" # High
174//! "#, ".env");
175//!
176//! // Count by severity
177//! let critical_count = findings.iter()
178//! .filter(|f| f.severity == Severity::Critical)
179//! .count();
180//!
181//! let high_count = findings.iter()
182//! .filter(|f| f.severity == Severity::High)
183//! .count();
184//!
185//! println!("Critical: {}, High: {}", critical_count, high_count);
186//!
187//! // Check if safe to proceed (only low/medium severity)
188//! let is_safe = findings.iter()
189//! .all(|f| f.severity < Severity::High);
190//!
191//! if !is_safe {
192//! eprintln!("⛔ Cannot proceed - high/critical secrets detected");
193//! }
194//! ```
195//!
196//! # Supported Secret Types
197//!
198//! ## Cloud Credentials (Critical Severity)
199//! - **AWS**: Access keys (AKIA...), Secret access keys
200//! - **GitHub**: Personal access tokens (ghp_..., github_pat_...), OAuth tokens
201//! - **Private Keys**: RSA, EC, DSA, OpenSSH private keys
202//!
203//! ## API Keys (Critical Severity)
204//! - **OpenAI**: sk-... API keys
205//! - **Anthropic**: sk-ant-... API keys
206//! - **Stripe**: sk_live_..., pk_test_... keys
207//!
208//! ## Service Tokens (High Severity)
209//! - **Slack**: xoxb-..., xoxa-... tokens
210//! - **JWT**: Encoded JSON Web Tokens
211//! - **Database**: Connection strings (PostgreSQL, MongoDB, MySQL, Redis, etc.)
212//!
213//! ## Generic Secrets (High Severity)
214//! - Generic API keys (api_key=...)
215//! - Access tokens (token=..., secret=...)
216//! - Passwords (password=...)
217//!
218//! # Why Pre-compiled Patterns?
219//!
220//! The module uses `once_cell::sync::Lazy` for regex patterns:
221//!
222//! ```rust,ignore
223//! static RE_AWS_KEY: Lazy<Regex> =
224//! Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").unwrap());
225//! ```
226//!
227//! **Benefits**:
228//! - Compiled once at first use
229//! - Reused across all scanner instances
230//! - Thread-safe sharing
231//! - Zero runtime compilation overhead
232//!
233//! **Pattern Order**: More specific patterns (Stripe, Slack, JWT) come BEFORE
234//! generic patterns (api_key, secret) to ensure accurate detection and avoid
235//! masking by broader patterns.
236//!
237//! # False Positive Reduction
238//!
239//! The scanner automatically skips:
240//! - **Comments**: Lines starting with //, #, /*, *
241//! - **Documentation**: Lines containing "example" as a word
242//! - **Placeholders**: Lines with "xxxxx" or "placeholder"
243//! - **Allowlisted patterns**: User-configured safe patterns
244//!
245//! This reduces false positives in documentation, test files, and examples
246//! while catching real secrets in code.
247
248use once_cell::sync::Lazy;
249use regex::Regex;
250use std::collections::HashSet;
251
252// Helper regex for word-boundary "example" detection (to skip documentation lines)
253static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
254 // Match "example" as a standalone word to skip documentation/tutorial content.
255 // This helps reduce false positives in example code and documentation.
256 //
257 // Note: This does NOT prevent detection of AWS keys containing "EXAMPLE" like
258 // AKIAIOSFODNN7EXAMPLE - those are detected by the AWS key pattern (RE_AWS_KEY)
259 // which runs separately. This regex is only used to skip entire lines that
260 // appear to be documentation examples (e.g., "# Example:" or "// example usage").
261 //
262 // The regex allows dots in word boundaries to handle domain examples like
263 // db.example.com without matching.
264 Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)")
265 .expect("RE_EXAMPLE_WORD: invalid regex pattern")
266});
267
268// Pre-compiled regex patterns (compiled once, reused across all scanner instances)
269static RE_AWS_KEY: Lazy<Regex> =
270 Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").expect("RE_AWS_KEY: invalid regex pattern"));
271static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
272 Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
273 .expect("RE_AWS_SECRET: invalid regex pattern")
274});
275// GitHub Personal Access Token (classic) - 36 alphanumeric chars after prefix
276static RE_GITHUB_PAT: Lazy<Regex> =
277 Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").expect("RE_GITHUB_PAT: invalid regex pattern"));
278// GitHub fine-grained PAT
279static RE_GITHUB_FINE_PAT: Lazy<Regex> = Lazy::new(|| {
280 Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}")
281 .expect("RE_GITHUB_FINE_PAT: invalid regex pattern")
282});
283// GitHub OAuth, user-to-server, server-to-server, and refresh tokens
284static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> = Lazy::new(|| {
285 Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").expect("RE_GITHUB_OTHER_TOKENS: invalid regex pattern")
286});
287static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
288 Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
289 .expect("RE_PRIVATE_KEY: invalid regex pattern")
290});
291static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
292 Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
293 .expect("RE_API_KEY: invalid regex pattern")
294});
295static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
296 Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
297 .expect("RE_SECRET_TOKEN: invalid regex pattern")
298});
299static RE_PASSWORD: Lazy<Regex> = Lazy::new(|| {
300 Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#)
301 .expect("RE_PASSWORD: invalid regex pattern")
302});
303static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
304 // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
305 Regex::new(
306 r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#,
307 )
308 .expect("RE_CONN_STRING: invalid regex pattern")
309});
310static RE_JWT: Lazy<Regex> = Lazy::new(|| {
311 Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*")
312 .expect("RE_JWT: invalid regex pattern")
313});
314static RE_SLACK: Lazy<Regex> = Lazy::new(|| {
315 Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}")
316 .expect("RE_SLACK: invalid regex pattern")
317});
318static RE_STRIPE: Lazy<Regex> = Lazy::new(|| {
319 Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}")
320 .expect("RE_STRIPE: invalid regex pattern")
321});
322// OpenAI API keys (sk-... followed by alphanumeric characters)
323static RE_OPENAI: Lazy<Regex> =
324 Lazy::new(|| Regex::new(r"sk-[A-Za-z0-9]{32,}").expect("RE_OPENAI: invalid regex pattern"));
325// Anthropic API keys (sk-ant-...)
326static RE_ANTHROPIC: Lazy<Regex> = Lazy::new(|| {
327 Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").expect("RE_ANTHROPIC: invalid regex pattern")
328});
329
330/// A detected secret or sensitive data
331#[derive(Debug, Clone)]
332pub struct SecretFinding {
333 /// Type of secret
334 pub kind: SecretKind,
335 /// File path
336 pub file: String,
337 /// Line number
338 pub line: u32,
339 /// Matched pattern (redacted)
340 pub pattern: String,
341 /// Severity level
342 pub severity: Severity,
343 /// Whether the secret was found in a comment (may be example/documentation)
344 pub in_comment: bool,
345}
346
/// Category assigned to a detected secret.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// An API key (service-specific or a generic `api_key = ...` assignment).
    ApiKey,
    /// An access token such as a Slack token or a JWT.
    AccessToken,
    /// A PEM-encoded private key header.
    PrivateKey,
    /// A `password = ...` style assignment.
    Password,
    /// A database connection URL.
    ConnectionString,
    /// An AWS access key ID or secret access key.
    AwsCredential,
    /// Any GitHub token flavour.
    GitHubToken,
    /// A generic secret; also used for every custom pattern.
    Generic,
}
367
368impl SecretKind {
369 /// Get human-readable name
370 pub fn name(&self) -> &'static str {
371 match self {
372 Self::ApiKey => "API Key",
373 Self::AccessToken => "Access Token",
374 Self::PrivateKey => "Private Key",
375 Self::Password => "Password",
376 Self::ConnectionString => "Connection String",
377 Self::AwsCredential => "AWS Credential",
378 Self::GitHubToken => "GitHub Token",
379 Self::Generic => "Generic Secret",
380 }
381 }
382}
383
/// Severity of a finding.
///
/// The derived ordering follows declaration order, so
/// `Low < Medium < High < Critical`. Do not reorder the variants: comparisons
/// such as `severity < Severity::High` depend on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Lowest level.
    Low,
    /// Above `Low`, below `High`.
    Medium,
    /// Treated as unsafe by `SecurityScanner::is_safe` and masked by redaction.
    High,
    /// Highest level.
    Critical,
}
392
393/// Security scanner
394pub struct SecurityScanner {
395 patterns: Vec<SecretPattern>,
396 custom_patterns: Vec<CustomSecretPattern>,
397 allowlist: HashSet<String>,
398}
399
400struct SecretPattern {
401 kind: SecretKind,
402 regex: &'static Lazy<Regex>,
403 severity: Severity,
404}
405
406/// Custom user-defined secret pattern
407struct CustomSecretPattern {
408 regex: Regex,
409 severity: Severity,
410}
411
412impl Default for SecurityScanner {
413 fn default() -> Self {
414 Self::new()
415 }
416}
417
418impl SecurityScanner {
419 /// Create a new security scanner with default patterns
420 /// Uses pre-compiled static regex patterns for optimal performance
421 ///
422 /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
423 /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
424 /// and redaction.
425 pub fn new() -> Self {
426 let patterns = vec![
427 // === Critical: Specific cloud credentials (most specific patterns first) ===
428 // AWS
429 SecretPattern {
430 kind: SecretKind::AwsCredential,
431 regex: &RE_AWS_KEY,
432 severity: Severity::Critical,
433 },
434 SecretPattern {
435 kind: SecretKind::AwsCredential,
436 regex: &RE_AWS_SECRET,
437 severity: Severity::Critical,
438 },
439 // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
440 SecretPattern {
441 kind: SecretKind::GitHubToken,
442 regex: &RE_GITHUB_PAT,
443 severity: Severity::Critical,
444 },
445 SecretPattern {
446 kind: SecretKind::GitHubToken,
447 regex: &RE_GITHUB_FINE_PAT,
448 severity: Severity::Critical,
449 },
450 SecretPattern {
451 kind: SecretKind::GitHubToken,
452 regex: &RE_GITHUB_OTHER_TOKENS,
453 severity: Severity::Critical,
454 },
455 // Private keys
456 SecretPattern {
457 kind: SecretKind::PrivateKey,
458 regex: &RE_PRIVATE_KEY,
459 severity: Severity::Critical,
460 },
461 // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
462 SecretPattern {
463 kind: SecretKind::ApiKey,
464 regex: &RE_ANTHROPIC,
465 severity: Severity::Critical,
466 },
467 // OpenAI API keys (must come before Stripe since sk- is more general)
468 SecretPattern {
469 kind: SecretKind::ApiKey,
470 regex: &RE_OPENAI,
471 severity: Severity::Critical,
472 },
473 // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
474 SecretPattern {
475 kind: SecretKind::ApiKey,
476 regex: &RE_STRIPE,
477 severity: Severity::Critical,
478 },
479 // === High: Specific service tokens (must come before generic patterns) ===
480 // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
481 SecretPattern {
482 kind: SecretKind::AccessToken,
483 regex: &RE_SLACK,
484 severity: Severity::High,
485 },
486 // JWT tokens (specific pattern: eyJ...eyJ...signature)
487 SecretPattern {
488 kind: SecretKind::AccessToken,
489 regex: &RE_JWT,
490 severity: Severity::High,
491 },
492 // Connection strings (specific pattern: mongodb://, postgres://, etc.)
493 SecretPattern {
494 kind: SecretKind::ConnectionString,
495 regex: &RE_CONN_STRING,
496 severity: Severity::High,
497 },
498 // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
499 // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
500 SecretPattern {
501 kind: SecretKind::ApiKey,
502 regex: &RE_API_KEY,
503 severity: Severity::High,
504 },
505 // Generic secrets (matches secret=xxx, token=xxx, etc.)
506 SecretPattern {
507 kind: SecretKind::Generic,
508 regex: &RE_SECRET_TOKEN,
509 severity: Severity::High,
510 },
511 // Passwords
512 SecretPattern {
513 kind: SecretKind::Password,
514 regex: &RE_PASSWORD,
515 severity: Severity::High,
516 },
517 ];
518
519 Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
520 }
521
522 /// Add a pattern to allowlist
523 pub fn allowlist(&mut self, pattern: &str) {
524 self.allowlist.insert(pattern.to_owned());
525 }
526
527 /// Add a custom regex pattern for secret detection
528 ///
529 /// Custom patterns are matched as generic secrets with High severity.
530 /// Invalid regex patterns are silently ignored.
531 ///
532 /// # Example
533 /// ```
534 /// use infiniloom_engine::security::SecurityScanner;
535 ///
536 /// let mut scanner = SecurityScanner::new();
537 /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}");
538 /// ```
539 pub fn add_custom_pattern(&mut self, pattern: &str) {
540 if let Ok(regex) = Regex::new(pattern) {
541 self.custom_patterns
542 .push(CustomSecretPattern { regex, severity: Severity::High });
543 }
544 }
545
546 /// Add multiple custom patterns at once
547 pub fn add_custom_patterns(&mut self, patterns: &[String]) {
548 for pattern in patterns {
549 self.add_custom_pattern(pattern);
550 }
551 }
552
553 /// Scan content for secrets
554 pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
555 let mut findings = Vec::new();
556
557 for (line_num, line) in content.lines().enumerate() {
558 let trimmed = line.trim();
559
560 // Detect if line is likely a comment - skip entirely to reduce false positives
561 // Real secrets shouldn't be in comments anyway
562 let is_jsdoc_continuation =
563 trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
564 let is_comment = trimmed.starts_with("//")
565 || trimmed.starts_with('#')
566 || trimmed.starts_with("/*")
567 || trimmed.starts_with("*")
568 || is_jsdoc_continuation;
569
570 // Skip obvious false positives (example docs, placeholders, comments)
571 let is_obvious_false_positive = is_comment
572 || RE_EXAMPLE_WORD.is_match(trimmed)
573 || trimmed.to_lowercase().contains("placeholder")
574 || trimmed.contains("xxxxx");
575
576 if is_obvious_false_positive {
577 continue;
578 }
579
580 for pattern in &self.patterns {
581 // Use find_iter to catch ALL matches on a line, not just the first
582 for m in pattern.regex.find_iter(line) {
583 let matched = m.as_str();
584
585 // Check allowlist
586 if self.allowlist.iter().any(|a| matched.contains(a)) {
587 continue;
588 }
589
590 findings.push(SecretFinding {
591 kind: pattern.kind,
592 file: file_path.to_owned(),
593 line: (line_num + 1) as u32,
594 pattern: redact(matched),
595 severity: pattern.severity,
596 in_comment: false, // Non-comment lines only now
597 });
598 }
599 }
600
601 // Check custom patterns
602 for custom in &self.custom_patterns {
603 for m in custom.regex.find_iter(line) {
604 let matched = m.as_str();
605
606 // Check allowlist
607 if self.allowlist.iter().any(|a| matched.contains(a)) {
608 continue;
609 }
610
611 findings.push(SecretFinding {
612 kind: SecretKind::Generic,
613 file: file_path.to_owned(),
614 line: (line_num + 1) as u32,
615 pattern: redact(matched),
616 severity: custom.severity,
617 in_comment: false,
618 });
619 }
620 }
621 }
622
623 findings
624 }
625
626 /// Scan a file and return whether it's safe to include
627 pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
628 let findings = self.scan(content, file_path);
629 findings.iter().all(|f| f.severity < Severity::High)
630 }
631
632 /// Get summary of findings
633 pub fn summarize(findings: &[SecretFinding]) -> String {
634 if findings.is_empty() {
635 return "No secrets detected".to_owned();
636 }
637
638 let critical = findings
639 .iter()
640 .filter(|f| f.severity == Severity::Critical)
641 .count();
642 let high = findings
643 .iter()
644 .filter(|f| f.severity == Severity::High)
645 .count();
646
647 format!(
648 "Found {} potential secrets ({} critical, {} high severity)",
649 findings.len(),
650 critical,
651 high
652 )
653 }
654
655 /// Redact secrets from content, returning the redacted content
656 /// This replaces detected secrets with redacted versions in the actual content
657 pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
658 let mut result = content.to_owned();
659
660 for (line_num, line) in content.lines().enumerate() {
661 let trimmed = line.trim();
662
663 // Skip obvious false positives (example docs, placeholders)
664 let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
665 || trimmed.to_lowercase().contains("placeholder")
666 || trimmed.contains("xxxxx");
667
668 if is_obvious_false_positive {
669 continue;
670 }
671
672 for pattern in &self.patterns {
673 // Use find_iter to catch ALL matches on a line, not just the first
674 for m in pattern.regex.find_iter(line) {
675 let matched = m.as_str();
676
677 // Check allowlist
678 if self.allowlist.iter().any(|a| matched.contains(a)) {
679 continue;
680 }
681
682 // Only redact high severity and above
683 if pattern.severity >= Severity::High {
684 let redacted = redact(matched);
685 // Replace in result - use line number to find the right occurrence
686 let line_start = result
687 .lines()
688 .take(line_num)
689 .map(|l| l.len() + 1)
690 .sum::<usize>();
691 if let Some(pos) = result[line_start..].find(matched) {
692 let abs_pos = line_start + pos;
693 result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
694 }
695 }
696 }
697 }
698
699 // Check custom patterns for redaction
700 for custom in &self.custom_patterns {
701 for m in custom.regex.find_iter(line) {
702 let matched = m.as_str();
703
704 // Check allowlist
705 if self.allowlist.iter().any(|a| matched.contains(a)) {
706 continue;
707 }
708
709 // Only redact high severity and above
710 if custom.severity >= Severity::High {
711 let redacted = redact(matched);
712 let line_start = result
713 .lines()
714 .take(line_num)
715 .map(|l| l.len() + 1)
716 .sum::<usize>();
717 if let Some(pos) = result[line_start..].find(matched) {
718 let abs_pos = line_start + pos;
719 result.replace_range(abs_pos..abs_pos + matched.len(), &redacted);
720 }
721 }
722 }
723 }
724 }
725
726 result
727 }
728
729 /// Scan and redact all secrets from content.
730 ///
731 /// Returns a tuple of (redacted_content, findings) where:
732 /// - `redacted_content` has all detected secrets replaced with `[REDACTED]`
733 /// - `findings` is a list of all detected secrets with metadata
734 ///
735 /// # Important
736 ///
737 /// Always check the findings list to understand what was redacted and whether
738 /// the file should be excluded from context entirely.
739 #[must_use = "security findings should be reviewed"]
740 pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
741 let findings = self.scan(content, file_path);
742 let redacted = self.redact_content(content, file_path);
743 (redacted, findings)
744 }
745}
746
/// Mask a matched secret so it can be displayed safely.
///
/// Inputs of 8 characters or fewer become all asterisks. Longer inputs keep
/// `min(4, len / 4)` characters at each end and replace everything in between
/// with `*`, so the character count is preserved.
///
/// All positions are counted in characters, never bytes, so multi-byte UTF-8
/// input cannot cause a slicing panic.
fn redact(s: &str) -> String {
    let chars: Vec<char> = s.chars().collect();
    let total = chars.len();

    if total <= 8 {
        return "*".repeat(total);
    }

    // Same number of characters is kept at the front and at the back.
    let keep = (total / 4).min(4);

    let mut masked = String::with_capacity(s.len());
    masked.extend(&chars[..keep]);
    masked.extend(std::iter::repeat('*').take(total - 2 * keep));
    masked.extend(&chars[total - keep..]);
    masked
}
771
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the structural guarantees of `redact` for inputs longer than
    /// 8 characters: same character count, masked middle, edges preserved.
    fn assert_masked_shape(secret: &str) {
        let masked = redact(secret);
        assert_eq!(
            masked.chars().count(),
            secret.chars().count(),
            "masking must preserve the character count"
        );
        assert!(masked.contains('*'), "masked text should contain asterisks");
        assert_eq!(masked.chars().next(), secret.chars().next());
        assert_eq!(masked.chars().last(), secret.chars().last());
    }

    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
        // The finding carries the masked key, never the raw one.
        assert!(findings.iter().any(|f| f.pattern == "AKIA************MPLE"));
    }

    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
        assert!(
            findings
                .iter()
                .all(|f| !f.pattern.contains("abcdefghijklmnopqrstuvwxyz")),
            "findings must not expose the raw token"
        );
    }

    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings
            .iter()
            .any(|f| f.kind == SecretKind::PrivateKey && f.line == 1));
    }

    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    #[test]
    fn test_redact_unicode_safety() {
        // Chinese characters (3 bytes each) - must not panic when masking
        assert_masked_shape("密钥ABCDEFGHIJKLMNOP密钥");

        // Emoji (4 bytes each)
        assert_masked_shape("🔑ABCDEFGHIJKLMNOP🔒");

        // Cyrillic (2 bytes each)
        assert_masked_shape("абвгдежзийклмноп");

        // Short Unicode strings are fully masked, one asterisk per character
        assert_eq!(redact("密钥"), "**");
    }

    #[test]
    fn test_redact_edge_cases() {
        // Empty string
        assert_eq!(redact(""), "");

        // Single character
        assert_eq!(redact("x"), "*");

        // Exactly 8 characters (boundary)
        assert_eq!(redact("12345678"), "********");

        // 9 characters: first length that keeps a prefix/suffix (9 / 4 = 2 each side)
        assert_eq!(redact("123456789"), "12*****89");
    }

    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}");

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        // The built-in `secret = ...` rule also yields a Generic finding for this
        // line, so identify the custom match by its own masked text.
        let expected = redact("CUSTOM_SECRET_ABCD1234EFGH5678");
        assert!(
            findings
                .iter()
                .any(|f| f.kind == SecretKind::Generic && f.pattern == expected),
            "Custom pattern should be detected"
        );
    }

    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner.add_custom_patterns(&[
            r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
            r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
        ]);

        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
        assert!(findings
            .iter()
            .any(|f| f.kind == SecretKind::Generic && f.severity == Severity::High));
    }

    #[test]
    fn test_invalid_custom_pattern_ignored() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // The invalid pattern must not be registered...
        assert!(scanner.custom_patterns.is_empty());

        // ...and scanning must neither panic nor report anything for this input
        let content = "INVALID_[PATTERN here";
        let findings = scanner.scan(content, "test.py");
        assert!(findings.is_empty());
    }

    #[test]
    fn test_redact_content_preserves_line_endings() {
        let scanner = SecurityScanner::new();
        let content = "first\r\napi_key = 'some_secret_key_12345678901234567890'\r\nlast\r\n";

        let redacted = scanner.redact_content(content, "test.py");

        assert!(redacted.starts_with("first\r\napi_"));
        assert!(redacted.ends_with("7890'\r\nlast\r\n"));
        assert!(!redacted.contains("some_secret_key"));
        assert_eq!(redacted.len(), content.len());
    }
}