infiniloom_engine/security.rs
1//! Security scanning for secrets and sensitive data
2//!
3//! This module provides automatic detection and redaction of secrets, API keys,
4//! tokens, and other sensitive data before sharing code with LLMs or external services.
5//!
6//! # Quick Start
7//!
8//! ```rust
9//! use infiniloom_engine::security::SecurityScanner;
10//!
11//! let scanner = SecurityScanner::new();
12//! let code = r#"
13//! const AWS_KEY = "AKIAIOSFODNN7EXAMPLE";
14//! const API_TOKEN = "sk-proj-abc123xyz789";
15//! "#;
16//!
17//! // Scan for secrets
18//! let findings = scanner.scan(code, "config.rs");
19//!
20//! if !findings.is_empty() {
21//! println!("⚠️ Found {} secrets!", findings.len());
22//! for finding in &findings {
23//! println!(" {} on line {}: {}",
24//! finding.kind.name(),
25//! finding.line,
26//! finding.pattern); // Already redacted: "AKIA************MPLE"
27//! }
28//! }
29//! ```
30//!
31//! # Scanning with Detailed Results
32//!
33//! The scanner returns structured findings with metadata:
34//!
35//! ```rust
36//! use infiniloom_engine::security::{SecurityScanner, Severity};
37//!
38//! let scanner = SecurityScanner::new();
39//! let findings = scanner.scan(r#"
40//! DB_URL = "postgresql://user:pass@localhost/db"
//! STRIPE_KEY = "sk_live_abcdefghijklmnopqrstuvwx"
42//! "#, ".env");
43//!
44//! for finding in findings {
45//! match finding.severity {
46//! Severity::Critical => println!("🔴 CRITICAL: {}", finding.pattern),
47//! Severity::High => println!("🟠 HIGH: {}", finding.pattern),
48//! Severity::Medium => println!("🟡 MEDIUM: {}", finding.pattern),
49//! Severity::Low => println!("🟢 LOW: {}", finding.pattern),
50//! }
51//! }
52//! ```
53//!
54//! # Automatic Redaction
55//!
//! Replace detected secrets with partially-masked versions (a short prefix and
//! suffix are kept, the middle is replaced with asterisks):
57//!
58//! ```rust
59//! use infiniloom_engine::security::SecurityScanner;
60//!
61//! let scanner = SecurityScanner::new();
62//! let code = r#"
//! const apiKey = "sk-proj-abcdefghijklmnopqrstuvwxyz123456";
64//! const githubToken = "ghp_abcdefghijklmnopqrstuvwxyz1234567890";
65//! "#;
66//!
67//! // Scan and redact in one operation
68//! let (redacted, findings) = scanner.scan_and_redact(code, "api.ts");
69//!
70//! println!("Original had {} secrets", findings.len());
71//! println!("Redacted version:\n{}", redacted);
//! // Output shows partially masked values, e.g.:
//! //   const githubToken = "ghp_********************************7890";
74//! ```
75//!
76//! # Custom Patterns
77//!
78//! Add organization-specific secret patterns:
79//!
80//! ```rust,no_run
81//! use infiniloom_engine::security::SecurityScanner;
82//!
83//! let mut scanner = SecurityScanner::new();
84//!
//! // Add custom patterns for internal systems
//! // (add_custom_pattern returns a Result because the regex may be invalid)
//! scanner.add_custom_pattern(r"MYCOMPANY_API_[A-Z0-9]{32}").unwrap();
//! scanner.add_custom_pattern(r"INTERNAL_TOKEN_[a-f0-9]{64}").unwrap();
//!
//! // Or add multiple at once
//! scanner.add_custom_patterns(&[
//!     "ORG_SECRET_[A-Z0-9]{16}".to_string(),
//!     "DEPLOY_KEY_[a-z0-9]{40}".to_string(),
//! ]).unwrap();
94//!
95//! // Now scan with both built-in and custom patterns
96//! let findings = scanner.scan(r#"
//! MYCOMPANY_API_ABCD1234EFGH5678IJKL9012MNOP3456
98//! "#, "internal.rs");
99//!
100//! assert!(!findings.is_empty());
101//! ```
102//!
103//! # Allowlist for Test Data
104//!
105//! Mark known test/example secrets as safe:
106//!
107//! ```rust
108//! use infiniloom_engine::security::SecurityScanner;
109//!
110//! let mut scanner = SecurityScanner::new();
111//!
112//! // Allowlist test keys that are intentionally public
113//! scanner.allowlist("EXAMPLE");
114//! scanner.allowlist("test_key");
115//! scanner.allowlist("mock_secret");
116//!
117//! // This won't trigger detection (contains "EXAMPLE")
118//! let test_code = r#"
119//! AWS_KEY = "AKIAIOSFODNN7EXAMPLE" // Official AWS test key
120//! "#;
121//!
122//! let findings = scanner.scan(test_code, "test.rs");
123//! assert!(findings.is_empty(), "Test keys should be allowed");
124//!
125//! // But this WILL trigger (real key format)
126//! let prod_code = r#"
127//! AWS_KEY = "AKIAIOSFODNN7PRODKEY"
128//! "#;
129//!
130//! let findings = scanner.scan(prod_code, "prod.rs");
131//! assert!(!findings.is_empty(), "Real keys should be detected");
132//! ```
133//!
134//! # Repository Integration
135//!
136//! Scan all files in a repository:
137//!
138//! ```rust,ignore
139//! use infiniloom_engine::security::SecurityScanner;
140//!
141//! let scanner = SecurityScanner::new();
142//! let mut all_findings = Vec::new();
143//!
144//! for file in repository.files {
145//! let findings = scanner.scan(&file.content, &file.relative_path);
146//! all_findings.extend(findings);
147//! }
148//!
149//! if !all_findings.is_empty() {
150//! eprintln!("⚠️ Security scan found {} secrets across {} files",
151//! all_findings.len(),
152//! all_findings.iter()
153//! .map(|f| &f.file)
154//! .collect::<std::collections::HashSet<_>>()
155//! .len()
156//! );
157//!
158//! // Exit with error in CI/CD
159//! std::process::exit(1);
160//! }
161//! ```
162//!
163//! # Severity-Based Filtering
164//!
165//! Work with different severity levels:
166//!
167//! ```rust
168//! use infiniloom_engine::security::{SecurityScanner, Severity};
169//!
170//! let scanner = SecurityScanner::new();
171//! let findings = scanner.scan(r#"
172//! AWS_KEY = "AKIAIOSFODNN7PRODKEY" # Critical
//! password = "weakhunter123"          # High
174//! "#, ".env");
175//!
176//! // Count by severity
177//! let critical_count = findings.iter()
178//! .filter(|f| f.severity == Severity::Critical)
179//! .count();
180//!
181//! let high_count = findings.iter()
182//! .filter(|f| f.severity == Severity::High)
183//! .count();
184//!
185//! println!("Critical: {}, High: {}", critical_count, high_count);
186//!
187//! // Check if safe to proceed (only low/medium severity)
188//! let is_safe = findings.iter()
189//! .all(|f| f.severity < Severity::High);
190//!
191//! if !is_safe {
192//! eprintln!("⛔ Cannot proceed - high/critical secrets detected");
193//! }
194//! ```
195//!
196//! # Supported Secret Types
197//!
198//! ## Cloud Credentials (Critical Severity)
199//! - **AWS**: Access keys (AKIA...), Secret access keys
200//! - **GitHub**: Personal access tokens (ghp_..., github_pat_...), OAuth tokens
201//! - **Private Keys**: RSA, EC, DSA, OpenSSH private keys
202//!
203//! ## API Keys (Critical Severity)
204//! - **OpenAI**: sk-... API keys
205//! - **Anthropic**: sk-ant-... API keys
206//! - **Stripe**: sk_live_..., pk_test_... keys
207//!
208//! ## Service Tokens (High Severity)
209//! - **Slack**: xoxb-..., xoxa-... tokens
210//! - **JWT**: Encoded JSON Web Tokens
211//! - **Database**: Connection strings (PostgreSQL, MongoDB, MySQL, Redis, etc.)
212//!
213//! ## Generic Secrets (High Severity)
214//! - Generic API keys (api_key=...)
215//! - Access tokens (token=..., secret=...)
216//! - Passwords (password=...)
217//!
218//! # Why Pre-compiled Patterns?
219//!
220//! The module uses `once_cell::sync::Lazy` for regex patterns:
221//!
222//! ```rust,ignore
223//! static RE_AWS_KEY: Lazy<Regex> =
224//! Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").unwrap());
225//! ```
226//!
227//! **Benefits**:
228//! - Compiled once at first use
229//! - Reused across all scanner instances
230//! - Thread-safe sharing
231//! - Zero runtime compilation overhead
232//!
233//! **Pattern Order**: More specific patterns (Stripe, Slack, JWT) come BEFORE
234//! generic patterns (api_key, secret) to ensure accurate detection and avoid
235//! masking by broader patterns.
236//!
237//! # False Positive Reduction
238//!
239//! The scanner automatically skips:
240//! - **Comments**: Lines starting with //, #, /*, *
241//! - **Documentation**: Lines containing "example" as a word
242//! - **Placeholders**: Lines with "xxxxx" or "placeholder"
243//! - **Allowlisted patterns**: User-configured safe patterns
244//!
245//! This reduces false positives in documentation, test files, and examples
246//! while catching real secrets in code.
247
248use once_cell::sync::Lazy;
249use regex::Regex;
250use std::collections::HashSet;
251
// Helper regex for word-boundary "example" detection (to skip documentation lines)
static RE_EXAMPLE_WORD: Lazy<Regex> = Lazy::new(|| {
    // Match "example" as a standalone word to skip documentation/tutorial content.
    // This helps reduce false positives in example code and documentation.
    //
    // Note: This does NOT prevent detection of AWS keys containing "EXAMPLE" like
    // AKIAIOSFODNN7EXAMPLE - those are detected by the AWS key pattern (RE_AWS_KEY)
    // which runs separately. This regex is only used to skip entire lines that
    // appear to be documentation examples (e.g., "# Example:" or "// example usage").
    //
    // The boundary classes deliberately EXCLUDE '.', so "example" inside a
    // hostname such as db.example.com is NOT treated as a standalone word and
    // the line is still scanned (connection strings with example domains are
    // not skipped).
    Regex::new(r"(?i)(?:^|[^a-zA-Z0-9.])example(?:[^a-zA-Z0-9.]|$)")
        .expect("RE_EXAMPLE_WORD: invalid regex pattern")
});
267
// Pre-compiled regex patterns (compiled once, reused across all scanner instances)
static RE_AWS_KEY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"AKIA[0-9A-Z]{16}").expect("RE_AWS_KEY: invalid regex pattern"));
static RE_AWS_SECRET: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)aws[_-]?secret[_-]?access[_-]?key['"]?\s*[:=]\s*['"]?([A-Za-z0-9/+=]{40})"#)
        .expect("RE_AWS_SECRET: invalid regex pattern")
});
// GitHub Personal Access Token (classic) - 36 alphanumeric chars after prefix
static RE_GITHUB_PAT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"ghp_[A-Za-z0-9]{36}").expect("RE_GITHUB_PAT: invalid regex pattern"));
// GitHub fine-grained PAT
static RE_GITHUB_FINE_PAT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59}")
        .expect("RE_GITHUB_FINE_PAT: invalid regex pattern")
});
// GitHub OAuth, user-to-server, server-to-server, and refresh tokens
static RE_GITHUB_OTHER_TOKENS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"gh[ours]_[A-Za-z0-9]{36,}").expect("RE_GITHUB_OTHER_TOKENS: invalid regex pattern")
});
static RE_PRIVATE_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")
        .expect("RE_PRIVATE_KEY: invalid regex pattern")
});
static RE_API_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:api[_-]?key|apikey)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_API_KEY: invalid regex pattern")
});
static RE_SECRET_TOKEN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)(?:secret|token)['"]?\s*[:=]\s*['"]?([A-Za-z0-9_-]{20,})"#)
        .expect("RE_SECRET_TOKEN: invalid regex pattern")
});
static RE_PASSWORD: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?i)password['"]?\s*[:=]\s*['"]?([^'"\s]{8,})"#)
        .expect("RE_PASSWORD: invalid regex pattern")
});
static RE_CONN_STRING: Lazy<Regex> = Lazy::new(|| {
    // Note: postgres and postgresql are both valid (postgresql:// is more common in practice)
    Regex::new(
        r#"(?i)(?:mongodb|postgres(?:ql)?|mysql|redis|mariadb|cockroachdb|mssql)://[^\s'"]+"#,
    )
    .expect("RE_CONN_STRING: invalid regex pattern")
});
static RE_JWT: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"eyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*")
        .expect("RE_JWT: invalid regex pattern")
});
static RE_SLACK: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"xox[baprs]-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}")
        .expect("RE_SLACK: invalid regex pattern")
});
static RE_STRIPE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:sk|pk)_(?:test|live)_[A-Za-z0-9]{24,}")
        .expect("RE_STRIPE: invalid regex pattern")
});
// OpenAI API keys (sk-... followed by alphanumeric characters)
// NOTE(review): `scan` applies every pattern independently (there is no
// first-match-per-line behavior in the scan loop), so an Anthropic key
// (sk-ant-...) also matches this broader pattern and may be reported twice.
// `redact_content` de-duplicates overlapping matches, so redaction is still
// applied exactly once.
// Pattern allows letters, numbers, underscores, and hyphens after 'sk-'
static RE_OPENAI: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"sk-[A-Za-z0-9][A-Za-z0-9_-]{31,}").expect("RE_OPENAI: invalid regex pattern")
});
// Anthropic API keys (sk-ant-...)
static RE_ANTHROPIC: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"sk-ant-[A-Za-z0-9-]{40,}").expect("RE_ANTHROPIC: invalid regex pattern")
});
// Google Cloud API keys
static RE_GCP_API_KEY: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"AIza[0-9A-Za-z_-]{35}").expect("RE_GCP_API_KEY: invalid regex pattern")
});
// Hugging Face tokens
static RE_HUGGINGFACE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"hf_[A-Za-z0-9]{34,}").expect("RE_HUGGINGFACE: invalid regex pattern")
});
// Azure connection strings
static RE_AZURE_CONN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r"(?i)DefaultEndpointsProtocol=https;AccountName=[^;]+;AccountKey=[A-Za-z0-9+/=]{44,}",
    )
    .expect("RE_AZURE_CONN: invalid regex pattern")
});
// DigitalOcean tokens
static RE_DIGITALOCEAN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"dop_v1_[a-f0-9]{64}").expect("RE_DIGITALOCEAN: invalid regex pattern")
});
// SendGrid API keys
static RE_SENDGRID: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"SG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}")
        .expect("RE_SENDGRID: invalid regex pattern")
});
// Twilio API keys (word boundaries avoid matching inside longer tokens)
static RE_TWILIO: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\bSK[a-f0-9]{32}\b").expect("RE_TWILIO: invalid regex pattern"));
360
/// Error type for security scanning operations
#[derive(Debug, Clone)]
pub enum SecurityError {
    /// Invalid regex pattern for custom secret detection
    /// (returned by `SecurityScanner::add_custom_pattern` and friends)
    InvalidPattern {
        /// The invalid pattern
        pattern: String,
        /// The error message from regex compilation
        message: String,
    },
}
372
373impl std::fmt::Display for SecurityError {
374 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
375 match self {
376 Self::InvalidPattern { pattern, message } => {
377 write!(f, "Invalid regex pattern '{}': {}", pattern, message)
378 },
379 }
380 }
381}
382
383impl std::error::Error for SecurityError {}
384
/// A detected secret or sensitive data
#[derive(Debug, Clone)]
pub struct SecretFinding {
    /// Type of secret
    pub kind: SecretKind,
    /// File path (as passed to `SecurityScanner::scan`)
    pub file: String,
    /// Line number (1-based)
    pub line: u32,
    /// Matched pattern (redacted — never contains the full secret text)
    pub pattern: String,
    /// Severity level
    pub severity: Severity,
    /// Whether the secret was found in a comment (may be example/documentation).
    /// Currently always `false`: `scan` skips comment lines entirely, so no
    /// finding is ever produced for a comment line.
    pub in_comment: bool,
}
401
/// Kind of secret detected
///
/// Use [`SecretKind::name`] for a human-readable label.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SecretKind {
    /// API key
    ApiKey,
    /// Access token
    AccessToken,
    /// Private key
    PrivateKey,
    /// Password
    Password,
    /// Database connection string
    ConnectionString,
    /// AWS credentials
    AwsCredential,
    /// GitHub token
    GitHubToken,
    /// Generic secret (also used for all custom-pattern matches)
    Generic,
}
422
423impl SecretKind {
424 /// Get human-readable name
425 pub fn name(&self) -> &'static str {
426 match self {
427 Self::ApiKey => "API Key",
428 Self::AccessToken => "Access Token",
429 Self::PrivateKey => "Private Key",
430 Self::Password => "Password",
431 Self::ConnectionString => "Connection String",
432 Self::AwsCredential => "AWS Credential",
433 Self::GitHubToken => "GitHub Token",
434 Self::Generic => "Generic Secret",
435 }
436 }
437}
438
/// Severity level
///
/// Variants are declared in ascending order, so the derived `Ord` yields
/// `Low < Medium < High < Critical`. Comparisons elsewhere (e.g.
/// `f.severity < Severity::High` in `is_safe`) depend on this declaration
/// order — do not reorder the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Severity {
    /// Lowest severity
    Low,
    /// Moderate severity
    Medium,
    /// High severity (generic keys, passwords, service tokens)
    High,
    /// Highest severity (known provider credential formats, private keys)
    Critical,
}
447
/// Security scanner
///
/// Holds the built-in pattern table, user-supplied custom patterns, and an
/// allowlist of substrings that suppress matches. Construct with
/// [`SecurityScanner::new`] (or via `Default`).
pub struct SecurityScanner {
    // Built-in detection rules, ordered most-specific first (see `new`)
    patterns: Vec<SecretPattern>,
    // User-defined rules registered via `add_custom_pattern*`
    custom_patterns: Vec<CustomSecretPattern>,
    // A match containing any of these substrings is treated as safe
    allowlist: HashSet<String>,
}
454
// A built-in detection rule: which pre-compiled regex to run and how to
// classify/score a match.
struct SecretPattern {
    kind: SecretKind,
    // Reference to one of the static RE_* patterns above (compiled once)
    regex: &'static Lazy<Regex>,
    severity: Severity,
}

/// Custom user-defined secret pattern
///
/// Compiled at registration time; matches are reported as
/// [`SecretKind::Generic`] findings.
struct CustomSecretPattern {
    regex: Regex,
    severity: Severity,
}
466
467impl Default for SecurityScanner {
468 fn default() -> Self {
469 Self::new()
470 }
471}
472
impl SecurityScanner {
    /// Create a new security scanner with default patterns
    /// Uses pre-compiled static regex patterns for optimal performance
    ///
    /// Pattern order matters: more specific patterns (Stripe, Slack, JWT) must come
    /// BEFORE generic patterns (API_KEY, SECRET_TOKEN) to ensure proper detection
    /// and redaction.
    pub fn new() -> Self {
        let patterns = vec![
            // === Critical: Specific cloud credentials (most specific patterns first) ===
            // AWS
            SecretPattern {
                kind: SecretKind::AwsCredential,
                regex: &RE_AWS_KEY,
                severity: Severity::Critical,
            },
            SecretPattern {
                kind: SecretKind::AwsCredential,
                regex: &RE_AWS_SECRET,
                severity: Severity::Critical,
            },
            // GitHub tokens (all types: ghp_, gho_, ghu_, ghs_, ghr_, github_pat_)
            SecretPattern {
                kind: SecretKind::GitHubToken,
                regex: &RE_GITHUB_PAT,
                severity: Severity::Critical,
            },
            SecretPattern {
                kind: SecretKind::GitHubToken,
                regex: &RE_GITHUB_FINE_PAT,
                severity: Severity::Critical,
            },
            SecretPattern {
                kind: SecretKind::GitHubToken,
                regex: &RE_GITHUB_OTHER_TOKENS,
                severity: Severity::Critical,
            },
            // Private keys
            SecretPattern {
                kind: SecretKind::PrivateKey,
                regex: &RE_PRIVATE_KEY,
                severity: Severity::Critical,
            },
            // Anthropic API keys (must come before OpenAI since sk-ant- is more specific)
            SecretPattern {
                kind: SecretKind::ApiKey,
                regex: &RE_ANTHROPIC,
                severity: Severity::Critical,
            },
            // OpenAI API keys (must come before Stripe since sk- is more general)
            SecretPattern {
                kind: SecretKind::ApiKey,
                regex: &RE_OPENAI,
                severity: Severity::Critical,
            },
            // Stripe keys (specific pattern: sk_live_, pk_test_, etc.)
            SecretPattern {
                kind: SecretKind::ApiKey,
                regex: &RE_STRIPE,
                severity: Severity::Critical,
            },
            // === Critical: Additional cloud credentials ===
            // Google Cloud API keys
            SecretPattern {
                kind: SecretKind::ApiKey,
                regex: &RE_GCP_API_KEY,
                severity: Severity::Critical,
            },
            // Azure connection strings
            SecretPattern {
                kind: SecretKind::ConnectionString,
                regex: &RE_AZURE_CONN,
                severity: Severity::Critical,
            },
            // === High: Specific service tokens (must come before generic patterns) ===
            // Slack tokens (specific pattern: xoxb-, xoxa-, etc.)
            SecretPattern {
                kind: SecretKind::AccessToken,
                regex: &RE_SLACK,
                severity: Severity::High,
            },
            // JWT tokens (specific pattern: eyJ...eyJ...signature)
            SecretPattern {
                kind: SecretKind::AccessToken,
                regex: &RE_JWT,
                severity: Severity::High,
            },
            // Connection strings (specific pattern: mongodb://, postgres://, etc.)
            SecretPattern {
                kind: SecretKind::ConnectionString,
                regex: &RE_CONN_STRING,
                severity: Severity::High,
            },
            // Hugging Face tokens
            SecretPattern {
                kind: SecretKind::AccessToken,
                regex: &RE_HUGGINGFACE,
                severity: Severity::High,
            },
            // DigitalOcean tokens
            SecretPattern {
                kind: SecretKind::AccessToken,
                regex: &RE_DIGITALOCEAN,
                severity: Severity::High,
            },
            // SendGrid API keys
            SecretPattern {
                kind: SecretKind::ApiKey,
                regex: &RE_SENDGRID,
                severity: Severity::High,
            },
            // Twilio API keys
            SecretPattern { kind: SecretKind::ApiKey, regex: &RE_TWILIO, severity: Severity::High },
            // === High: Generic patterns (must come LAST to avoid masking specific patterns) ===
            // Generic API keys (matches api_key=xxx, apikey:xxx, etc.)
            SecretPattern {
                kind: SecretKind::ApiKey,
                regex: &RE_API_KEY,
                severity: Severity::High,
            },
            // Generic secrets (matches secret=xxx, token=xxx, etc.)
            SecretPattern {
                kind: SecretKind::Generic,
                regex: &RE_SECRET_TOKEN,
                severity: Severity::High,
            },
            // Passwords
            SecretPattern {
                kind: SecretKind::Password,
                regex: &RE_PASSWORD,
                severity: Severity::High,
            },
        ];

        Self { patterns, custom_patterns: Vec::new(), allowlist: HashSet::new() }
    }

    /// Add a pattern to allowlist
    ///
    /// Any detected secret whose matched text *contains* `pattern` as a
    /// substring is suppressed from both scanning and redaction. Useful for
    /// well-known test/example keys.
    pub fn allowlist(&mut self, pattern: &str) {
        self.allowlist.insert(pattern.to_owned());
    }

    /// Add a custom regex pattern for secret detection
    ///
    /// Custom patterns are matched as generic secrets with High severity.
    /// Returns an error if the regex pattern is invalid.
    ///
    /// # Example
    /// ```
    /// use infiniloom_engine::security::SecurityScanner;
    ///
    /// let mut scanner = SecurityScanner::new();
    /// scanner.add_custom_pattern(r"MY_SECRET_[A-Z0-9]{32}").unwrap();
    /// ```
    ///
    /// # Errors
    /// Returns `SecurityError::InvalidPattern` if the regex pattern is invalid.
    pub fn add_custom_pattern(&mut self, pattern: &str) -> Result<(), SecurityError> {
        let regex = Regex::new(pattern).map_err(|e| SecurityError::InvalidPattern {
            pattern: pattern.to_owned(),
            message: e.to_string(),
        })?;
        self.custom_patterns
            .push(CustomSecretPattern { regex, severity: Severity::High });
        Ok(())
    }

    /// Add a custom regex pattern, ignoring invalid patterns
    ///
    /// This is a convenience method that silently ignores invalid patterns.
    /// Use [`Self::add_custom_pattern`] if you need to handle errors.
    pub fn add_custom_pattern_unchecked(&mut self, pattern: &str) {
        let _ = self.add_custom_pattern(pattern);
    }

    /// Add multiple custom patterns at once
    ///
    /// Returns the first error encountered, if any. Patterns before the error
    /// will have been added successfully.
    ///
    /// # Errors
    /// Returns `SecurityError::InvalidPattern` if any regex pattern is invalid.
    pub fn add_custom_patterns(&mut self, patterns: &[String]) -> Result<(), SecurityError> {
        for pattern in patterns {
            self.add_custom_pattern(pattern)?;
        }
        Ok(())
    }

    /// Add multiple custom patterns, ignoring invalid patterns
    ///
    /// This is a convenience method that silently ignores invalid patterns.
    /// Use [`Self::add_custom_patterns`] if you need to handle errors.
    pub fn add_custom_patterns_unchecked(&mut self, patterns: &[String]) {
        for pattern in patterns {
            self.add_custom_pattern_unchecked(pattern);
        }
    }

    /// Scan content for secrets
    ///
    /// Runs every built-in and custom pattern against each line and returns one
    /// finding per regex match (line numbers are 1-based, matched text is
    /// redacted). Comment lines and obvious documentation/placeholder lines are
    /// skipped entirely to reduce false positives.
    ///
    /// NOTE(review): patterns are applied independently — a token that matches
    /// several patterns (e.g. an Anthropic `sk-ant-...` key, which also matches
    /// the broader OpenAI pattern) produces one finding per matching pattern,
    /// so counts may include such duplicates.
    pub fn scan(&self, content: &str, file_path: &str) -> Vec<SecretFinding> {
        let mut findings = Vec::new();

        for (line_num, line) in content.lines().enumerate() {
            let trimmed = line.trim();

            // Detect if line is likely a comment - skip entirely to reduce false positives
            // Real secrets shouldn't be in comments anyway
            let is_jsdoc_continuation =
                trimmed.starts_with("* ") && !trimmed.contains('=') && !trimmed.contains(':');
            let is_comment = trimmed.starts_with("//")
                || trimmed.starts_with('#')
                || trimmed.starts_with("/*")
                || trimmed.starts_with('*')
                || is_jsdoc_continuation;

            // Skip obvious false positives (example docs, placeholders, comments)
            let is_obvious_false_positive = is_comment
                || RE_EXAMPLE_WORD.is_match(trimmed)
                || trimmed.to_lowercase().contains("placeholder")
                || trimmed.contains("xxxxx");

            if is_obvious_false_positive {
                continue;
            }

            for pattern in &self.patterns {
                // Use find_iter to catch ALL matches on a line, not just the first
                for m in pattern.regex.find_iter(line) {
                    let matched = m.as_str();

                    // Check allowlist (substring match against the raw secret)
                    if self.allowlist.iter().any(|a| matched.contains(a)) {
                        continue;
                    }

                    findings.push(SecretFinding {
                        kind: pattern.kind,
                        file: file_path.to_owned(),
                        line: (line_num + 1) as u32,
                        pattern: redact(matched),
                        severity: pattern.severity,
                        in_comment: false, // Non-comment lines only now
                    });
                }
            }

            // Check custom patterns (always reported as Generic findings)
            for custom in &self.custom_patterns {
                for m in custom.regex.find_iter(line) {
                    let matched = m.as_str();

                    // Check allowlist
                    if self.allowlist.iter().any(|a| matched.contains(a)) {
                        continue;
                    }

                    findings.push(SecretFinding {
                        kind: SecretKind::Generic,
                        file: file_path.to_owned(),
                        line: (line_num + 1) as u32,
                        pattern: redact(matched),
                        severity: custom.severity,
                        in_comment: false,
                    });
                }
            }
        }

        findings
    }

    /// Scan a file and return whether it's safe to include
    ///
    /// Returns `true` when there are no High- or Critical-severity findings;
    /// Low/Medium findings are tolerated.
    pub fn is_safe(&self, content: &str, file_path: &str) -> bool {
        let findings = self.scan(content, file_path);
        findings.iter().all(|f| f.severity < Severity::High)
    }

    /// Get summary of findings
    ///
    /// Associated function (takes no `self`): produces a one-line summary with
    /// the total, critical, and high-severity counts.
    pub fn summarize(findings: &[SecretFinding]) -> String {
        if findings.is_empty() {
            return "No secrets detected".to_owned();
        }

        let critical = findings
            .iter()
            .filter(|f| f.severity == Severity::Critical)
            .count();
        let high = findings
            .iter()
            .filter(|f| f.severity == Severity::High)
            .count();

        format!(
            "Found {} potential secrets ({} critical, {} high severity)",
            findings.len(),
            critical,
            high
        )
    }

    /// Redact secrets from content, returning the redacted content
    /// This replaces detected secrets with redacted versions in the actual content
    ///
    /// Only matches from patterns of High or Critical severity are redacted.
    /// Unlike [`Self::scan`], comment lines are NOT skipped here, so a secret
    /// inside a comment is still masked.
    ///
    /// # Implementation Note
    /// Uses a two-pass approach to handle multiple secrets on the same line correctly:
    /// 1. First pass: collect all matches with their byte positions
    /// 2. Second pass: drop overlapping matches (keeping the shorter, more
    ///    specific one), then apply the surviving replacements
    pub fn redact_content(&self, content: &str, _file_path: &str) -> String {
        // Collect all matches that need redaction: (start_byte, end_byte, redacted_text)
        let mut replacements: Vec<(usize, usize, String)> = Vec::new();

        let mut current_byte_offset = 0usize;
        for line in content.lines() {
            let trimmed = line.trim();

            // Skip obvious false positives (example docs, placeholders)
            let is_obvious_false_positive = RE_EXAMPLE_WORD.is_match(trimmed)
                || trimmed.to_lowercase().contains("placeholder")
                || trimmed.contains("xxxxx");

            if !is_obvious_false_positive {
                // Check built-in patterns
                for pattern in &self.patterns {
                    if pattern.severity >= Severity::High {
                        for m in pattern.regex.find_iter(line) {
                            let matched = m.as_str();

                            // Check allowlist
                            if self.allowlist.iter().any(|a| matched.contains(a)) {
                                continue;
                            }

                            // Translate line-relative match offsets to whole-content offsets
                            let start = current_byte_offset + m.start();
                            let end = current_byte_offset + m.end();
                            replacements.push((start, end, redact(matched)));
                        }
                    }
                }

                // Check custom patterns
                for custom in &self.custom_patterns {
                    if custom.severity >= Severity::High {
                        for m in custom.regex.find_iter(line) {
                            let matched = m.as_str();

                            // Check allowlist
                            if self.allowlist.iter().any(|a| matched.contains(a)) {
                                continue;
                            }

                            let start = current_byte_offset + m.start();
                            let end = current_byte_offset + m.end();
                            replacements.push((start, end, redact(matched)));
                        }
                    }
                }
            }

            // Move past the line content and its line ending.
            // str::lines() strips both '\n' and '\r\n', so we must account for
            // whichever terminator is actually present in the source content.
            current_byte_offset += line.len();
            if current_byte_offset < content.len() {
                if content.as_bytes()[current_byte_offset] == b'\r' {
                    current_byte_offset += 1; // skip '\r' in '\r\n'
                }
                if current_byte_offset < content.len()
                    && content.as_bytes()[current_byte_offset] == b'\n'
                {
                    current_byte_offset += 1; // skip '\n'
                }
            }
        }

        // Sort replacements by length first (shorter = more specific), then by position
        // This ensures more specific patterns (Stripe key) are preferred over
        // generic patterns (api_key=xxx) that might include the key name
        replacements.sort_by(|a, b| {
            let a_len = a.1 - a.0;
            let b_len = b.1 - b.0;
            a_len.cmp(&b_len).then(a.0.cmp(&b.0))
        });

        // Remove overlapping ranges, keeping the more specific (shorter) match
        // Since we sorted by length first, shorter matches are processed first
        let mut filtered: Vec<(usize, usize, String)> = Vec::new();
        for replacement in replacements {
            // Check if this overlaps with any existing replacement
            let overlaps = filtered.iter().any(|(start, end, _)| {
                // Two ranges overlap if one starts before the other ends and vice versa
                replacement.0 < *end && *start < replacement.1
            });

            if !overlaps {
                filtered.push(replacement);
            }
            // If overlaps, skip this one (we already have the shorter/more specific match)
        }

        // Apply replacements in reverse order so positions don't shift
        // NOTE(review): `filtered` is sorted by (length, start), so this reverse
        // walk is not strictly right-to-left by byte position. It is still safe
        // for the built-in patterns because `redact` preserves character count
        // and every built-in pattern matches ASCII only (byte length unchanged).
        // A custom pattern matching multi-byte text could shrink the string and
        // shift later offsets — confirm if custom Unicode patterns are expected.
        let mut result = content.to_owned();
        for (start, end, redacted) in filtered.into_iter().rev() {
            if end <= result.len() {
                result.replace_range(start..end, &redacted);
            }
        }

        result
    }

    /// Scan and redact all secrets from content.
    ///
    /// Returns a tuple of (redacted_content, findings) where:
    /// - `redacted_content` has all detected secrets replaced with partially
    ///   masked versions (a short prefix/suffix is kept, the middle is starred)
    /// - `findings` is a list of all detected secrets with metadata
    ///
    /// # Important
    ///
    /// Always check the findings list to understand what was redacted and whether
    /// the file should be excluded from context entirely.
    #[must_use = "security findings should be reviewed"]
    pub fn scan_and_redact(&self, content: &str, file_path: &str) -> (String, Vec<SecretFinding>) {
        let findings = self.scan(content, file_path);
        let redacted = self.redact_content(content, file_path);
        (redacted, findings)
    }
}
901
/// Redact a matched secret for display
///
/// Keeps a short prefix and suffix (at most 4 characters each, and never more
/// than a quarter of the secret per side) and stars out everything in between.
/// Secrets of 8 characters or fewer are fully masked.
///
/// This function is UTF-8 safe - it uses character counts rather than byte
/// positions to avoid panics when secrets contain multi-byte characters.
fn redact(s: &str) -> String {
    let total = s.chars().count();

    // Short secrets: mask everything so nothing recognizable survives.
    if total <= 8 {
        return "*".repeat(total);
    }

    // Characters to keep at each end (same amount on both sides).
    let keep = 4.min(total / 4);

    let head: String = s.chars().take(keep).collect();
    let tail: String = s.chars().skip(total - keep).collect();
    // total > 8 guarantees keep <= total / 4, so this cannot underflow.
    let masked = total - 2 * keep;

    format!("{}{}{}", head, "*".repeat(masked), tail)
}
926
#[cfg(test)]
mod tests {
    use super::*;

    // --- Detection of well-known provider credentials ---

    #[test]
    fn test_aws_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE""#;

        let findings = scanner.scan(content, "config.py");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::AwsCredential));
    }

    #[test]
    fn test_github_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GITHUB_TOKEN = "ghp_abcdefghijklmnopqrstuvwxyz1234567890""#;

        let findings = scanner.scan(content, ".env");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::GitHubToken));
    }

    #[test]
    fn test_private_key_detection() {
        let scanner = SecurityScanner::new();
        let content = "-----BEGIN RSA PRIVATE KEY-----\nMIIEpA...";

        let findings = scanner.scan(content, "key.pem");

        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.kind == SecretKind::PrivateKey));
    }

    // --- Allowlisting ---

    #[test]
    fn test_allowlist() {
        let mut scanner = SecurityScanner::new();
        scanner.allowlist("EXAMPLE");

        let content = r#"api_key = "AKIAIOSFODNN7EXAMPLE""#;
        let findings = scanner.scan(content, "test.py");

        assert!(findings.is_empty());
    }

    // --- Redaction helper ---

    #[test]
    fn test_redact() {
        assert_eq!(redact("AKIAIOSFODNN7EXAMPLE"), "AKIA************MPLE");
        assert_eq!(redact("short"), "*****");
    }

    #[test]
    fn test_redact_unicode_safety() {
        // Expected values follow redact()'s rule: keep min(4, len/4) chars
        // on each side, mask the rest. Character counts, not bytes, so
        // multi-byte input must neither panic nor split a code point.

        // Chinese characters (3 bytes each in UTF-8): 20 chars total,
        // so 4 chars are kept on each side and 12 are masked.
        let chinese_secret = "密钥ABCDEFGHIJKLMNOP密钥";
        assert_eq!(redact(chinese_secret), "密钥AB************OP密钥");

        // Emoji (4 bytes each): 18 chars total -> 4 kept each side, 10 masked.
        let emoji_secret = "🔑ABCDEFGHIJKLMNOP🔒";
        assert_eq!(redact(emoji_secret), "🔑ABC**********NOP🔒");

        // Cyrillic (2 bytes each): 16 chars total -> 4 kept each side, 8 masked.
        let cyrillic_secret = "абвгдежзийклмноп";
        assert_eq!(redact(cyrillic_secret), "абвг********мноп");

        // Short Unicode strings (<= 8 chars) are fully masked.
        let short_chinese = "密钥";
        assert_eq!(redact(short_chinese), "**"); // 2 characters
    }

    #[test]
    fn test_redact_edge_cases() {
        // Empty string
        assert_eq!(redact(""), "");

        // Single character
        assert_eq!(redact("x"), "*");

        // Exactly 8 characters (boundary: fully masked)
        assert_eq!(redact("12345678"), "********");

        // 9 characters is the first length that shows a prefix/suffix:
        // 9 / 4 = 2, so 2 chars are kept on each side and 5 are masked.
        assert_eq!(redact("123456789"), "12*****89");
    }

    // --- Comment handling ---

    #[test]
    fn test_comments_are_skipped() {
        let scanner = SecurityScanner::new();
        let content = "# api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        // Comments are skipped entirely to reduce false positives
        assert!(findings.is_empty(), "Secrets in comments should be skipped");
    }

    #[test]
    fn test_non_comment_detected() {
        let scanner = SecurityScanner::new();
        let content = "api_key = 'some_secret_key_12345678901234567890'";

        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Secrets in non-comments should be detected");
        assert!(
            findings.iter().all(|f| !f.in_comment),
            "in_comment should be false for non-comment lines"
        );
    }

    // --- Custom patterns ---

    #[test]
    fn test_custom_pattern() {
        let mut scanner = SecurityScanner::new();
        scanner
            .add_custom_pattern(r"CUSTOM_SECRET_[A-Z0-9]{16}")
            .unwrap();

        let content = "my_secret = CUSTOM_SECRET_ABCD1234EFGH5678";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom pattern should be detected");
        assert!(findings.iter().any(|f| f.kind == SecretKind::Generic));
    }

    #[test]
    fn test_custom_patterns_multiple() {
        let mut scanner = SecurityScanner::new();
        scanner
            .add_custom_patterns(&[
                r"MYAPP_KEY_[a-f0-9]{32}".to_owned(),
                r"MYAPP_TOKEN_[A-Z]{20}".to_owned(),
            ])
            .unwrap();

        let content = "key = MYAPP_KEY_0123456789abcdef0123456789abcdef";
        let findings = scanner.scan(content, "test.py");

        assert!(!findings.is_empty(), "Custom patterns should be detected");
    }

    #[test]
    fn test_invalid_custom_pattern_returns_error() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket
        let result = scanner.add_custom_pattern(r"INVALID_[PATTERN");

        // Should return an error with details
        assert!(result.is_err(), "Invalid regex should return error");
        let err = result.unwrap_err();
        match err {
            SecurityError::InvalidPattern { pattern, message } => {
                assert_eq!(pattern, r"INVALID_[PATTERN");
                assert!(!message.is_empty(), "Error message should not be empty");
            },
        }
    }

    #[test]
    fn test_invalid_custom_pattern_unchecked() {
        let mut scanner = SecurityScanner::new();
        // Invalid regex - unclosed bracket (silently ignored with _unchecked)
        scanner.add_custom_pattern_unchecked(r"INVALID_[PATTERN");

        // Should not panic, invalid patterns are ignored
        let content = "INVALID_[PATTERN here";
        let _findings = scanner.scan(content, "test.py");
    }

    // --- Multiple findings and redaction structure ---

    #[test]
    fn test_multiple_secrets_same_line() {
        let scanner = SecurityScanner::new();

        // Two GitHub tokens on the same line
        let content = r#"TOKEN1="ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" TOKEN2="ghp_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb""#;

        let findings = scanner.scan(content, "test.env");
        assert_eq!(findings.len(), 2, "Should detect both tokens on the same line");

        // Test redaction of multiple secrets on same line
        let (redacted, _) = scanner.scan_and_redact(content, "test.env");

        // Both tokens should be redacted
        assert!(
            !redacted.contains("ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
            "First token should be redacted"
        );
        assert!(
            !redacted.contains("ghp_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"),
            "Second token should be redacted"
        );
        assert!(redacted.contains('*'), "Redacted content should contain asterisks");
    }

    #[test]
    fn test_redaction_preserves_structure() {
        let scanner = SecurityScanner::new();
        let content = "line1\napi_key = 'secret_key_12345678901234567890'\nline3";

        let (redacted, _) = scanner.scan_and_redact(content, "test.py");

        // Should preserve newlines and structure
        let lines: Vec<&str> = redacted.lines().collect();
        assert_eq!(lines.len(), 3, "Should preserve line count");
        assert_eq!(lines[0], "line1");
        assert_eq!(lines[2], "line3");
    }

    // --- Additional provider patterns ---

    #[test]
    fn test_gcp_api_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"GCP_API_KEY = "AIzaSyA1234567890abcdefghijklmnopqrstuv""#;
        let findings = scanner.scan(content, "config.py");
        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.severity == Severity::Critical));
    }

    #[test]
    fn test_huggingface_token_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"HF_TOKEN = "hf_abcdefghijklmnopqrstuvwxyz12345678""#;
        let findings = scanner.scan(content, ".env");
        assert!(!findings.is_empty());
    }

    #[test]
    fn test_azure_connection_string_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"AZURE_STORAGE = "DefaultEndpointsProtocol=https;AccountName=myaccount;AccountKey=abc123def456ghi789jkl012mno345pqr678stu901vw==""#;
        let findings = scanner.scan(content, ".env");
        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.severity == Severity::Critical));
    }

    #[test]
    fn test_digitalocean_token_detection() {
        let scanner = SecurityScanner::new();
        // Build the token dynamically to avoid GitHub push protection
        let token = format!("dop_v1_{}", "a1b2c3d4".repeat(8));
        let content = format!(r#"DO_TOKEN = "{}""#, token);
        let findings = scanner.scan(&content, ".env");
        assert!(!findings.is_empty());
    }

    #[test]
    fn test_sendgrid_api_key_detection() {
        let scanner = SecurityScanner::new();
        let content = r#"SENDGRID_KEY = "SG.abcdefghijklmnopqrstuv.abcdefghijklmnopqrstuvwxyz01234567890123456""#;
        let findings = scanner.scan(content, ".env");
        assert!(!findings.is_empty());
    }

    #[test]
    fn test_twilio_api_key_detection() {
        let scanner = SecurityScanner::new();
        // Build the key dynamically to avoid GitHub push protection
        let key = format!("SK{}", "ab12cd34".repeat(4));
        let content = format!(r#"TWILIO_KEY = "{}""#, key);
        let findings = scanner.scan(&content, ".env");
        assert!(!findings.is_empty());
    }
}