1use std::fs::File;
6use std::io::{self, BufReader, Read};
7use std::path::PathBuf;
8use std::sync::Arc;
9
10use crossbeam::channel::{Receiver, Sender};
11use log::{debug, trace, warn};
12use memmap2::MmapOptions;
13use parking_lot::{Mutex, RwLock};
14
15use super::cache::SecurityCache;
16use super::file_discovery::FileMetadata;
17use super::pattern_engine::{PatternEngine, PatternMatch};
18use crate::analyzer::security::{SecurityCategory, SecurityFinding, SecuritySeverity};
19
20#[derive(Debug)]
22pub struct ScanTask {
23 pub id: usize,
24 pub file: FileMetadata,
25 pub quick_reject: bool,
26}
27
28#[derive(Debug)]
30pub enum ScanResult {
31 Findings(Vec<SecurityFinding>),
32 Skipped,
33 Error(String),
34}
35
36pub struct FileScanner {
38 thread_id: usize,
39 pattern_engine: Arc<PatternEngine>,
40 cache: Arc<SecurityCache>,
41 use_mmap: bool,
42}
43
44impl FileScanner {
45 pub fn new(
46 thread_id: usize,
47 pattern_engine: Arc<PatternEngine>,
48 cache: Arc<SecurityCache>,
49 use_mmap: bool,
50 ) -> Self {
51 Self {
52 thread_id,
53 pattern_engine,
54 cache,
55 use_mmap,
56 }
57 }
58
59 pub fn run(
61 &self,
62 task_receiver: Receiver<ScanTask>,
63 result_sender: Sender<ScanResult>,
64 critical_count: Arc<Mutex<usize>>,
65 should_terminate: Arc<RwLock<bool>>,
66 max_critical: Option<usize>,
67 ) {
68 debug!("Scanner thread {} started", self.thread_id);
69
70 while let Ok(task) = task_receiver.recv() {
71 if *should_terminate.read() {
73 debug!("Scanner thread {} terminating early", self.thread_id);
74 break;
75 }
76
77 let result = self.scan_file(task);
79
80 if let ScanResult::Findings(ref findings) = result {
82 let critical_findings = findings
83 .iter()
84 .filter(|f| f.severity == SecuritySeverity::Critical)
85 .count();
86
87 if critical_findings > 0 {
88 let mut count = critical_count.lock();
89 *count += critical_findings;
90
91 if let Some(max) = max_critical
92 && *count >= max
93 {
94 *should_terminate.write() = true;
95 debug!("Critical findings limit reached, triggering early termination");
96 }
97 }
98 }
99
100 if result_sender.send(result).is_err() {
102 break; }
104 }
105
106 debug!("Scanner thread {} finished", self.thread_id);
107 }
108
109 fn scan_file(&self, task: ScanTask) -> ScanResult {
111 trace!(
112 "Thread {} scanning: {}",
113 self.thread_id,
114 task.file.path.display()
115 );
116
117 if let Some(cached_result) = self.cache.get(&task.file.path) {
119 trace!("Cache hit for: {}", task.file.path.display());
120 return ScanResult::Findings(cached_result);
121 }
122
123 let content = match self.read_file_content(&task.file) {
125 Ok(content) => content,
126 Err(e) => {
127 warn!("Failed to read file {}: {}", task.file.path.display(), e);
128 return ScanResult::Error(e.to_string());
129 }
130 };
131
132 if content.is_empty() {
134 return ScanResult::Skipped;
135 }
136
137 let matches = self
139 .pattern_engine
140 .scan_content(&content, task.quick_reject, &task.file);
141
142 let findings = self.convert_matches_to_findings(matches, &task.file);
144
145 self.cache.insert(task.file.path.clone(), findings.clone());
147
148 ScanResult::Findings(findings)
149 }
150
151 fn read_file_content(&self, file_meta: &FileMetadata) -> io::Result<String> {
153 let content = if self.use_mmap && file_meta.size > 4096 {
154 self.read_file_mmap(&file_meta.path)?
155 } else {
156 self.read_file_buffered(&file_meta.path)?
157 };
158
159 if self.should_skip_content(&content, file_meta) {
161 return Ok(String::new()); }
163
164 Ok(content)
165 }
166
167 fn should_skip_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
169 if content.trim().is_empty() || content.len() < 10 {
171 return true;
172 }
173
174 if self.is_binary_content(content) {
176 return true;
177 }
178
179 if self.is_generated_content(content, file_meta) {
181 return true;
182 }
183
184 if self.has_high_base64_content(content) {
186 return true;
187 }
188
189 false
190 }
191
192 fn is_binary_content(&self, content: &str) -> bool {
194 let non_printable_count = content
196 .chars()
197 .filter(|c| !c.is_ascii() || (c.is_control() && !c.is_whitespace()))
198 .count();
199
200 let non_printable_ratio = non_printable_count as f32 / content.len() as f32;
201
202 non_printable_ratio > 0.05
204 }
205
206 fn is_generated_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
208 let content_lower = content.to_lowercase();
209
210 let generated_markers = [
212 "// this file is generated",
213 "/* this file is generated",
214 "# this file is generated",
215 "automatically generated",
216 "auto-generated",
217 "autogenerated",
218 "do not edit",
219 "do not modify",
220 "generated by webpack",
221 "generated by babel",
222 "compiled by typescript",
223 "@generated",
224 "sourcemappingurl=",
225 ];
226
227 if generated_markers
228 .iter()
229 .any(|&marker| content_lower.contains(marker))
230 {
231 return true;
232 }
233
234 if content.contains("//# sourceMappingURL=") || content.contains("/*# sourceMappingURL=") {
236 return true;
237 }
238
239 if self.is_code_generation_file(content, file_meta) {
241 return true;
242 }
243
244 if self.is_minified_js_css(content, file_meta) {
246 return true;
247 }
248
249 false
250 }
251
252 fn is_code_generation_file(&self, content: &str, file_meta: &FileMetadata) -> bool {
254 let content_lower = content.to_lowercase();
255
256 if let Some(filename) = file_meta.path.file_name().and_then(|n| n.to_str()) {
258 let filename_lower = filename.to_lowercase();
259 let code_gen_filenames = [
260 "apicodedialog",
261 "codedialog",
262 "codeexample",
263 "apiexample",
264 "codesnippet",
265 "snippets",
266 "examples",
267 "templates",
268 "codegenerator",
269 "apitool",
270 ];
271
272 if code_gen_filenames
273 .iter()
274 .any(|&pattern| filename_lower.contains(pattern))
275 {
276 return true;
277 }
278 }
279
280 let code_gen_content_patterns = [
282 "getcode(",
284 "generatecode",
285 "getcodewithauthorization",
286 "getconfigcode",
287 "getmulticonfigcode",
288 "api_url =",
290 "def query(",
291 "async function query",
292 "import requests",
293 "const response = await fetch",
294 "curl ",
295 "bearer ${",
296 "authorization: \"bearer",
297 "copyblock",
299 "codeblock",
300 "react-code-blocks",
301 ];
303
304 let pattern_matches = code_gen_content_patterns
305 .iter()
306 .filter(|&pattern| content_lower.contains(pattern))
307 .count();
308
309 if pattern_matches >= 3 {
311 return true;
312 }
313
314 let template_literal_count = content.matches("${").count();
316 let api_pattern_count = content_lower.matches("api").count()
317 + content_lower.matches("bearer").count()
318 + content_lower.matches("authorization").count();
319
320 if template_literal_count > 5 && api_pattern_count > 3 {
322 return true;
323 }
324
325 false
326 }
327
328 fn is_minified_js_css(&self, content: &str, file_meta: &FileMetadata) -> bool {
330 let has_js_css_ext = file_meta
331 .extension
332 .as_deref()
333 .map(|ext| matches!(ext, "js" | "css" | "mjs" | "cjs"))
334 .unwrap_or(false);
335
336 if !has_js_css_ext {
337 return false;
338 }
339
340 let lines: Vec<&str> = content.lines().collect();
341
342 if lines.len() < 10 {
344 let avg_line_length = content.len() / lines.len().max(1);
345 if avg_line_length > 500 {
346 return true;
347 }
348 }
349
350 if content.contains(";var ")
352 || content.contains(",function(")
353 || content.contains("!function(")
354 || content.contains(";!function")
355 {
356 return true;
357 }
358
359 false
360 }
361
362 fn has_high_base64_content(&self, content: &str) -> bool {
364 if content.len() < 100 {
366 return false;
367 }
368
369 let base64_chars = content
370 .chars()
371 .filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=')
372 .count();
373
374 let base64_ratio = base64_chars as f32 / content.len() as f32;
375
376 if base64_ratio > 0.7 && content.len() > 1000 && !content.contains("eyJ") {
379 return true;
380 }
381
382 if content.contains("data:image/")
384 || content.contains("data:font/")
385 || content.contains("data:application/")
386 {
387 return true;
388 }
389
390 false
391 }
392
393 fn read_file_mmap(&self, path: &PathBuf) -> io::Result<String> {
395 let file = File::open(path)?;
396 let mmap = unsafe { MmapOptions::new().map(&file)? };
397
398 match simdutf8::basic::from_utf8(&mmap) {
400 Ok(content) => Ok(content.to_string()),
401 Err(_) => {
402 Ok(String::from_utf8_lossy(&mmap).to_string())
404 }
405 }
406 }
407
408 fn read_file_buffered(&self, path: &PathBuf) -> io::Result<String> {
410 let file = File::open(path)?;
411 let mut reader = BufReader::with_capacity(8192, file);
412 let mut content = String::new();
413 reader.read_to_string(&mut content)?;
414 Ok(content)
415 }
416
417 fn convert_matches_to_findings(
419 &self,
420 matches: Vec<PatternMatch>,
421 file_meta: &FileMetadata,
422 ) -> Vec<SecurityFinding> {
423 matches
424 .into_iter()
425 .map(|match_| SecurityFinding {
426 id: format!(
427 "{}-{}-{}",
428 match_.pattern.id,
429 file_meta.path.display(),
430 match_.line_number
431 ),
432 title: match_.pattern.name.clone(),
433 description: self.enhance_description(&match_.pattern.description, file_meta),
434 severity: self.adjust_severity(
435 &match_.pattern.severity,
436 file_meta,
437 match_.confidence,
438 ),
439 category: match_.pattern.category.clone(),
440 file_path: Some(file_meta.path.clone()),
441 line_number: Some(match_.line_number),
442 column_number: Some(match_.column_number),
443 evidence: Some(match_.evidence),
444 remediation: match_.pattern.remediation.clone(),
445 references: match_.pattern.references.clone(),
446 cwe_id: match_.pattern.cwe_id.clone(),
447 compliance_frameworks: self.get_compliance_frameworks(&match_.pattern.category),
448 })
449 .collect()
450 }
451
452 fn enhance_description(&self, base_description: &str, file_meta: &FileMetadata) -> String {
454 let mut description = base_description.to_string();
455
456 if file_meta.is_gitignored {
458 if file_meta.priority_hints.is_env_file
460 || file_meta.priority_hints.is_config_file
461 || base_description.to_lowercase().contains("secret")
462 || base_description.to_lowercase().contains("key")
463 || base_description.to_lowercase().contains("token")
464 {
465 description.push_str(" (File is protected by .gitignore)");
466 } else {
467 description.push_str(" (File appears safe for version control)");
468 }
469 } else {
470 if self.file_contains_secrets(file_meta) {
472 if self.is_file_tracked_by_git(&file_meta.path) {
474 description.push_str(" (File is tracked by git and may expose secrets in version history - CRITICAL RISK)");
475 } else {
476 description
477 .push_str(" (File is NOT in .gitignore but contains secrets - HIGH RISK)");
478 }
479 } else {
480 description.push_str(" (File appears safe for version control)");
481 }
482 }
483
484 if file_meta.priority_hints.is_env_file {
486 description.push_str(" [Environment file]");
487 } else if file_meta.priority_hints.is_config_file {
488 description.push_str(" [Configuration file]");
489 }
490
491 description
492 }
493
494 fn file_contains_secrets(&self, file_meta: &FileMetadata) -> bool {
496 if let Some(file_name) = file_meta.path.file_name().and_then(|n| n.to_str()) {
498 let file_name_lower = file_name.to_lowercase();
499 let secret_file_patterns = [
500 ".env",
501 ".key",
502 ".pem",
503 ".p12",
504 ".pfx",
505 "id_rsa",
506 "id_dsa",
507 "id_ecdsa",
508 "id_ed25519",
509 "credentials",
510 "secrets",
511 "private",
512 "secret.json",
513 "service-account",
514 "auth.json",
515 "config.json",
516 ];
517
518 if secret_file_patterns
519 .iter()
520 .any(|pattern| file_name_lower.contains(pattern))
521 {
522 return true;
523 }
524 }
525
526 file_meta.priority_hints.is_env_file
528 || file_meta.priority_hints.is_config_file
529 || file_meta.is_critical()
530 }
531
532 fn is_file_tracked_by_git(&self, file_path: &std::path::PathBuf) -> bool {
534 use std::process::Command;
535
536 Command::new("git")
537 .args(["ls-files", "--error-unmatch"])
538 .arg(file_path)
539 .output()
540 .map(|output| output.status.success())
541 .unwrap_or(false)
542 }
543
544 fn adjust_severity(
546 &self,
547 base_severity: &SecuritySeverity,
548 file_meta: &FileMetadata,
549 confidence: f32,
550 ) -> SecuritySeverity {
551 let mut severity = base_severity.clone();
552 let filename = file_meta
553 .path
554 .file_name()
555 .and_then(|s| s.to_str())
556 .unwrap_or("");
557
558 if (filename == "GoogleService-Info.plist" || filename.ends_with(".plist"))
560 && matches!(
561 severity,
562 SecuritySeverity::Critical | SecuritySeverity::High
563 )
564 {
565 return SecuritySeverity::Medium; }
567
568 if !file_meta.is_gitignored
570 && matches!(severity, SecuritySeverity::Medium | SecuritySeverity::High)
571 {
572 severity = match severity {
573 SecuritySeverity::Medium => SecuritySeverity::High,
574 SecuritySeverity::High => SecuritySeverity::Critical,
575 _ => severity,
576 };
577 }
578
579 if confidence < 0.5
581 && matches!(
582 severity,
583 SecuritySeverity::High | SecuritySeverity::Critical
584 )
585 {
586 severity = match severity {
587 SecuritySeverity::Critical => SecuritySeverity::High,
588 SecuritySeverity::High => SecuritySeverity::Medium,
589 _ => severity,
590 };
591 }
592
593 severity
594 }
595
596 fn get_compliance_frameworks(&self, category: &SecurityCategory) -> Vec<String> {
598 match category {
599 SecurityCategory::SecretsExposure => vec![
600 "SOC2".to_string(),
601 "GDPR".to_string(),
602 "PCI-DSS".to_string(),
603 ],
604 SecurityCategory::InsecureConfiguration => {
605 vec!["SOC2".to_string(), "OWASP".to_string()]
606 }
607 SecurityCategory::AuthenticationSecurity => {
608 vec!["SOC2".to_string(), "OWASP".to_string()]
609 }
610 SecurityCategory::DataProtection => vec!["GDPR".to_string(), "CCPA".to_string()],
611 _ => vec!["SOC2".to_string()],
612 }
613 }
614}
615
616pub struct EnvFileScanner;
618
619impl EnvFileScanner {
620 pub fn scan_env_file(path: &PathBuf) -> Result<Vec<SecurityFinding>, io::Error> {
622 let content = std::fs::read_to_string(path)?;
623 let mut findings = Vec::new();
624
625 for (line_num, line) in content.lines().enumerate() {
626 let line = line.trim();
627
628 if line.is_empty() || line.starts_with('#') {
630 continue;
631 }
632
633 if let Some(eq_pos) = line.find('=') {
635 let key = &line[..eq_pos].trim();
636 let value = &line[eq_pos + 1..].trim_matches('"').trim_matches('\'');
637
638 if is_sensitive_env_key(key) && !value.is_empty() && !is_placeholder_value(value) {
640 findings.push(SecurityFinding {
641 id: format!("env-secret-{}-{}", path.display(), line_num),
642 title: format!("Sensitive Environment Variable: {}", key),
643 description: format!(
644 "Environment variable '{}' contains a potentially sensitive value",
645 key
646 ),
647 severity: determine_env_severity(key, value),
648 category: SecurityCategory::SecretsExposure,
649 file_path: Some(path.clone()),
650 line_number: Some(line_num + 1),
651 column_number: Some(eq_pos + 1),
652 evidence: Some(format!("{}=***", key)),
653 remediation: vec![
654 "Ensure .env files are in .gitignore".to_string(),
655 "Use .env.example for documentation".to_string(),
656 "Consider using a secure secret management service".to_string(),
657 ],
658 references: vec!["https://12factor.net/config".to_string()],
659 cwe_id: Some("CWE-798".to_string()),
660 compliance_frameworks: vec!["SOC2".to_string(), "GDPR".to_string()],
661 });
662 }
663 }
664 }
665
666 Ok(findings)
667 }
668}
669
/// Returns `true` when an environment variable name contains, ignoring
/// case, any fragment commonly found in names of secret-bearing variables.
fn is_sensitive_env_key(key: &str) -> bool {
    const SENSITIVE_FRAGMENTS: &[&str] = &[
        "PASSWORD",
        "SECRET",
        "KEY",
        "TOKEN",
        "API",
        "AUTH",
        "PRIVATE",
        "CREDENTIAL",
        "ACCESS",
        "CLIENT",
        "STRIPE",
        "AWS",
        "GOOGLE",
        "AZURE",
        "DATABASE",
        "DB_",
        "JWT",
    ];

    let upper = key.to_uppercase();

    for fragment in SENSITIVE_FRAGMENTS {
        if upper.contains(*fragment) {
            return true;
        }
    }
    false
}
697
/// Returns `true` when a value contains, ignoring case, any marker that
/// typically denotes a placeholder, template or sample value rather than
/// a real secret.
fn is_placeholder_value(value: &str) -> bool {
    const PLACEHOLDER_MARKERS: &[&str] = &[
        "your_",
        "change_me",
        "xxx",
        "placeholder",
        "example",
        "test",
        "demo",
        "fake",
        "dummy",
        "<",
        ">",
        "${",
        "}",
    ];

    let lowered = value.to_lowercase();

    for marker in PLACEHOLDER_MARKERS {
        if lowered.contains(*marker) {
            return true;
        }
    }
    false
}
719
720fn determine_env_severity(key: &str, _value: &str) -> SecuritySeverity {
722 let key_upper = key.to_uppercase();
723
724 if key_upper.contains("DATABASE")
726 || key_upper.contains("DB_PASS")
727 || key_upper.contains("AWS_SECRET")
728 || key_upper.contains("STRIPE_SECRET")
729 {
730 return SecuritySeverity::Critical;
731 }
732
733 if key_upper.contains("API")
735 || key_upper.contains("SECRET")
736 || key_upper.contains("PRIVATE")
737 || key_upper.contains("TOKEN")
738 {
739 return SecuritySeverity::High;
740 }
741
742 if key_upper.contains("PASSWORD") || key_upper.contains("AUTH") {
744 return SecuritySeverity::Medium;
745 }
746
747 SecuritySeverity::Low
748}
749
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Only variables with a sensitive-looking key and a non-placeholder
    /// value are reported.
    #[test]
    fn test_env_file_scanner() {
        let workspace = TempDir::new().unwrap();
        let env_path = workspace.path().join(".env");

        let fixture = r#"
# Database config
DATABASE_URL=postgres://user:password@localhost/db
API_KEY=sk-1234567890abcdef
PUBLIC_URL=https://example.com
TEST_VAR=placeholder_value
"#;
        fs::write(&env_path, fixture).unwrap();

        let findings = EnvFileScanner::scan_env_file(&env_path).unwrap();
        let titles: Vec<&str> = findings.iter().map(|f| f.title.as_str()).collect();

        assert_eq!(titles.len(), 2);
        for expected_key in ["DATABASE_URL", "API_KEY"] {
            assert!(titles.iter().any(|title| title.contains(expected_key)));
        }
    }

    /// Template-style values are placeholders; a realistic key is not.
    #[test]
    fn test_placeholder_detection() {
        for placeholder in ["your_api_key_here", "<YOUR_TOKEN>", "xxx"] {
            assert!(is_placeholder_value(placeholder));
        }
        assert!(!is_placeholder_value("sk-1234567890"));
    }
}