1use std::path::PathBuf;
6use std::sync::Arc;
7use std::fs::File;
8use std::io::{self, Read, BufReader};
9
10use memmap2::MmapOptions;
11use crossbeam::channel::{Receiver, Sender};
12use parking_lot::{Mutex, RwLock};
13use log::{debug, trace, warn};
14
15use super::file_discovery::FileMetadata;
16use super::pattern_engine::{PatternEngine, PatternMatch};
17use super::cache::SecurityCache;
18use crate::analyzer::security::{SecurityFinding, SecuritySeverity, SecurityCategory};
19
20#[derive(Debug)]
22pub struct ScanTask {
23 pub id: usize,
24 pub file: FileMetadata,
25 pub quick_reject: bool,
26}
27
28#[derive(Debug)]
30pub enum ScanResult {
31 Findings(Vec<SecurityFinding>),
32 Skipped,
33 Error(String),
34}
35
36pub struct FileScanner {
38 thread_id: usize,
39 pattern_engine: Arc<PatternEngine>,
40 cache: Arc<SecurityCache>,
41 use_mmap: bool,
42}
43
44impl FileScanner {
45 pub fn new(
46 thread_id: usize,
47 pattern_engine: Arc<PatternEngine>,
48 cache: Arc<SecurityCache>,
49 use_mmap: bool,
50 ) -> Self {
51 Self {
52 thread_id,
53 pattern_engine,
54 cache,
55 use_mmap,
56 }
57 }
58
59 pub fn run(
61 &self,
62 task_receiver: Receiver<ScanTask>,
63 result_sender: Sender<ScanResult>,
64 critical_count: Arc<Mutex<usize>>,
65 should_terminate: Arc<RwLock<bool>>,
66 max_critical: Option<usize>,
67 ) {
68 debug!("Scanner thread {} started", self.thread_id);
69
70 while let Ok(task) = task_receiver.recv() {
71 if *should_terminate.read() {
73 debug!("Scanner thread {} terminating early", self.thread_id);
74 break;
75 }
76
77 let result = self.scan_file(task);
79
80 if let ScanResult::Findings(ref findings) = result {
82 let critical_findings = findings.iter()
83 .filter(|f| f.severity == SecuritySeverity::Critical)
84 .count();
85
86 if critical_findings > 0 {
87 let mut count = critical_count.lock();
88 *count += critical_findings;
89
90 if let Some(max) = max_critical {
91 if *count >= max {
92 *should_terminate.write() = true;
93 debug!("Critical findings limit reached, triggering early termination");
94 }
95 }
96 }
97 }
98
99 if result_sender.send(result).is_err() {
101 break; }
103 }
104
105 debug!("Scanner thread {} finished", self.thread_id);
106 }
107
108 fn scan_file(&self, task: ScanTask) -> ScanResult {
110 trace!("Thread {} scanning: {}", self.thread_id, task.file.path.display());
111
112 if let Some(cached_result) = self.cache.get(&task.file.path) {
114 trace!("Cache hit for: {}", task.file.path.display());
115 return ScanResult::Findings(cached_result);
116 }
117
118 let content = match self.read_file_content(&task.file) {
120 Ok(content) => content,
121 Err(e) => {
122 warn!("Failed to read file {}: {}", task.file.path.display(), e);
123 return ScanResult::Error(e.to_string());
124 }
125 };
126
127 if content.is_empty() {
129 return ScanResult::Skipped;
130 }
131
132 let matches = self.pattern_engine.scan_content(&content, task.quick_reject, &task.file);
134
135 let findings = self.convert_matches_to_findings(matches, &task.file);
137
138 self.cache.insert(task.file.path.clone(), findings.clone());
140
141 ScanResult::Findings(findings)
142 }
143
144 fn read_file_content(&self, file_meta: &FileMetadata) -> io::Result<String> {
146 let content = if self.use_mmap && file_meta.size > 4096 {
147 self.read_file_mmap(&file_meta.path)?
148 } else {
149 self.read_file_buffered(&file_meta.path)?
150 };
151
152 if self.should_skip_content(&content, file_meta) {
154 return Ok(String::new()); }
156
157 Ok(content)
158 }
159
160 fn should_skip_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
162 if content.trim().is_empty() || content.len() < 10 {
164 return true;
165 }
166
167 if self.is_binary_content(content) {
169 return true;
170 }
171
172 if self.is_generated_content(content, file_meta) {
174 return true;
175 }
176
177 if self.has_high_base64_content(content) {
179 return true;
180 }
181
182 false
183 }
184
185 fn is_binary_content(&self, content: &str) -> bool {
187 let non_printable_count = content.chars()
189 .filter(|c| !c.is_ascii() || (c.is_control() && !c.is_whitespace()))
190 .count();
191
192 let non_printable_ratio = non_printable_count as f32 / content.len() as f32;
193
194 non_printable_ratio > 0.05
196 }
197
198 fn is_generated_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
200 let content_lower = content.to_lowercase();
201
202 let generated_markers = [
204 "// this file is generated",
205 "/* this file is generated",
206 "# this file is generated",
207 "automatically generated",
208 "auto-generated",
209 "autogenerated",
210 "do not edit",
211 "do not modify",
212 "generated by webpack",
213 "generated by babel",
214 "compiled by typescript",
215 "@generated",
216 "sourcemappingurl=",
217 ];
218
219 if generated_markers.iter().any(|&marker| content_lower.contains(marker)) {
220 return true;
221 }
222
223 if content.contains("//# sourceMappingURL=") || content.contains("/*# sourceMappingURL=") {
225 return true;
226 }
227
228 if self.is_code_generation_file(content, file_meta) {
230 return true;
231 }
232
233 if self.is_minified_js_css(content, file_meta) {
235 return true;
236 }
237
238 false
239 }
240
241 fn is_code_generation_file(&self, content: &str, file_meta: &FileMetadata) -> bool {
243 let content_lower = content.to_lowercase();
244
245 if let Some(filename) = file_meta.path.file_name().and_then(|n| n.to_str()) {
247 let filename_lower = filename.to_lowercase();
248 let code_gen_filenames = [
249 "apicodedialog", "codedialog", "codeexample", "apiexample",
250 "codesnippet", "snippets", "examples", "templates",
251 "codegenerator", "apitool"
252 ];
253
254 if code_gen_filenames.iter().any(|&pattern| filename_lower.contains(pattern)) {
255 return true;
256 }
257 }
258
259 let code_gen_content_patterns = [
261 "getcode(", "generatecode", "getcodewithauthorization",
263 "getconfigcode", "getmulticonfigcode",
264 "api_url =", "def query(", "async function query",
266 "import requests", "const response = await fetch",
267 "curl ", "bearer ${", "authorization: \"bearer",
268 "copyblock", "codeblock", "react-code-blocks",
270 ];
272
273 let pattern_matches = code_gen_content_patterns.iter()
274 .filter(|&pattern| content_lower.contains(pattern))
275 .count();
276
277 if pattern_matches >= 3 {
279 return true;
280 }
281
282 let template_literal_count = content.matches("${").count();
284 let api_pattern_count = content_lower.matches("api").count() +
285 content_lower.matches("bearer").count() +
286 content_lower.matches("authorization").count();
287
288 if template_literal_count > 5 && api_pattern_count > 3 {
290 return true;
291 }
292
293 false
294 }
295
296 fn is_minified_js_css(&self, content: &str, file_meta: &FileMetadata) -> bool {
298 let has_js_css_ext = file_meta.extension.as_deref()
299 .map(|ext| matches!(ext, "js" | "css" | "mjs" | "cjs"))
300 .unwrap_or(false);
301
302 if !has_js_css_ext {
303 return false;
304 }
305
306 let lines: Vec<&str> = content.lines().collect();
307
308 if lines.len() < 10 {
310 let avg_line_length = content.len() / lines.len().max(1);
311 if avg_line_length > 500 {
312 return true;
313 }
314 }
315
316 if content.contains(";var ") || content.contains(",function(") ||
318 content.contains("!function(") || content.contains(";!function") {
319 return true;
320 }
321
322 false
323 }
324
325 fn has_high_base64_content(&self, content: &str) -> bool {
327 if content.len() < 100 {
329 return false;
330 }
331
332 let base64_chars = content.chars()
333 .filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=')
334 .count();
335
336 let base64_ratio = base64_chars as f32 / content.len() as f32;
337
338 if base64_ratio > 0.7 && content.len() > 1000 && !content.contains("eyJ") {
341 return true;
342 }
343
344 if content.contains("data:image/") || content.contains("data:font/") ||
346 content.contains("data:application/") {
347 return true;
348 }
349
350 false
351 }
352
353 fn read_file_mmap(&self, path: &PathBuf) -> io::Result<String> {
355 let file = File::open(path)?;
356 let mmap = unsafe { MmapOptions::new().map(&file)? };
357
358 match simdutf8::basic::from_utf8(&mmap) {
360 Ok(content) => Ok(content.to_string()),
361 Err(_) => {
362 Ok(String::from_utf8_lossy(&mmap).to_string())
364 }
365 }
366 }
367
368 fn read_file_buffered(&self, path: &PathBuf) -> io::Result<String> {
370 let file = File::open(path)?;
371 let mut reader = BufReader::with_capacity(8192, file);
372 let mut content = String::new();
373 reader.read_to_string(&mut content)?;
374 Ok(content)
375 }
376
377 fn convert_matches_to_findings(&self, matches: Vec<PatternMatch>, file_meta: &FileMetadata) -> Vec<SecurityFinding> {
379 matches.into_iter()
380 .map(|match_| {
381 SecurityFinding {
382 id: format!("{}-{}-{}", match_.pattern.id, file_meta.path.display(), match_.line_number),
383 title: match_.pattern.name.clone(),
384 description: self.enhance_description(&match_.pattern.description, file_meta),
385 severity: self.adjust_severity(&match_.pattern.severity, file_meta, match_.confidence),
386 category: match_.pattern.category.clone(),
387 file_path: Some(file_meta.path.clone()),
388 line_number: Some(match_.line_number),
389 column_number: Some(match_.column_number),
390 evidence: Some(match_.evidence),
391 remediation: match_.pattern.remediation.clone(),
392 references: match_.pattern.references.clone(),
393 cwe_id: match_.pattern.cwe_id.clone(),
394 compliance_frameworks: self.get_compliance_frameworks(&match_.pattern.category),
395 }
396 })
397 .collect()
398 }
399
400 fn enhance_description(&self, base_description: &str, file_meta: &FileMetadata) -> String {
402 let mut description = base_description.to_string();
403
404 if file_meta.is_gitignored {
406 if file_meta.priority_hints.is_env_file ||
408 file_meta.priority_hints.is_config_file ||
409 base_description.to_lowercase().contains("secret") ||
410 base_description.to_lowercase().contains("key") ||
411 base_description.to_lowercase().contains("token") {
412 description.push_str(" (File is protected by .gitignore)");
413 } else {
414 description.push_str(" (File appears safe for version control)");
415 }
416 } else {
417 if self.file_contains_secrets(file_meta) {
419 if self.is_file_tracked_by_git(&file_meta.path) {
421 description.push_str(" (File is tracked by git and may expose secrets in version history - CRITICAL RISK)");
422 } else {
423 description.push_str(" (File is NOT in .gitignore but contains secrets - HIGH RISK)");
424 }
425 } else {
426 description.push_str(" (File appears safe for version control)");
427 }
428 }
429
430 if file_meta.priority_hints.is_env_file {
432 description.push_str(" [Environment file]");
433 } else if file_meta.priority_hints.is_config_file {
434 description.push_str(" [Configuration file]");
435 }
436
437 description
438 }
439
440 fn file_contains_secrets(&self, file_meta: &FileMetadata) -> bool {
442 if let Some(file_name) = file_meta.path.file_name().and_then(|n| n.to_str()) {
444 let file_name_lower = file_name.to_lowercase();
445 let secret_file_patterns = [
446 ".env", ".key", ".pem", ".p12", ".pfx",
447 "id_rsa", "id_dsa", "id_ecdsa", "id_ed25519",
448 "credentials", "secrets", "private", "secret.json",
449 "service-account", "auth.json", "config.json"
450 ];
451
452 if secret_file_patterns.iter().any(|pattern| file_name_lower.contains(pattern)) {
453 return true;
454 }
455 }
456
457 file_meta.priority_hints.is_env_file ||
459 file_meta.priority_hints.is_config_file ||
460 file_meta.is_critical()
461 }
462
463 fn is_file_tracked_by_git(&self, file_path: &std::path::PathBuf) -> bool {
465 use std::process::Command;
466
467 Command::new("git")
468 .args(&["ls-files", "--error-unmatch"])
469 .arg(file_path)
470 .output()
471 .map(|output| output.status.success())
472 .unwrap_or(false)
473 }
474
475 fn adjust_severity(&self, base_severity: &SecuritySeverity, file_meta: &FileMetadata, confidence: f32) -> SecuritySeverity {
477 let mut severity = base_severity.clone();
478 let filename = file_meta.path.file_name().and_then(|s| s.to_str()).unwrap_or("");
479
480 if filename == "GoogleService-Info.plist" || filename.ends_with(".plist") {
482 if matches!(severity, SecuritySeverity::Critical | SecuritySeverity::High) {
483 return SecuritySeverity::Medium; }
485 }
486
487 if !file_meta.is_gitignored && matches!(severity, SecuritySeverity::Medium | SecuritySeverity::High) {
489 severity = match severity {
490 SecuritySeverity::Medium => SecuritySeverity::High,
491 SecuritySeverity::High => SecuritySeverity::Critical,
492 _ => severity,
493 };
494 }
495
496 if confidence < 0.5 && matches!(severity, SecuritySeverity::High | SecuritySeverity::Critical) {
498 severity = match severity {
499 SecuritySeverity::Critical => SecuritySeverity::High,
500 SecuritySeverity::High => SecuritySeverity::Medium,
501 _ => severity,
502 };
503 }
504
505 severity
506 }
507
508 fn get_compliance_frameworks(&self, category: &SecurityCategory) -> Vec<String> {
510 match category {
511 SecurityCategory::SecretsExposure => vec!["SOC2".to_string(), "GDPR".to_string(), "PCI-DSS".to_string()],
512 SecurityCategory::InsecureConfiguration => vec!["SOC2".to_string(), "OWASP".to_string()],
513 SecurityCategory::AuthenticationSecurity => vec!["SOC2".to_string(), "OWASP".to_string()],
514 SecurityCategory::DataProtection => vec!["GDPR".to_string(), "CCPA".to_string()],
515 _ => vec!["SOC2".to_string()],
516 }
517 }
518}
519
/// Stateless namespace type for scanning `.env`-style files; see
/// `EnvFileScanner::scan_env_file`.
pub struct EnvFileScanner;
522
523impl EnvFileScanner {
524 pub fn scan_env_file(path: &PathBuf) -> Result<Vec<SecurityFinding>, io::Error> {
526 let content = std::fs::read_to_string(path)?;
527 let mut findings = Vec::new();
528
529 for (line_num, line) in content.lines().enumerate() {
530 let line = line.trim();
531
532 if line.is_empty() || line.starts_with('#') {
534 continue;
535 }
536
537 if let Some(eq_pos) = line.find('=') {
539 let key = &line[..eq_pos].trim();
540 let value = &line[eq_pos + 1..].trim_matches('"').trim_matches('\'');
541
542 if is_sensitive_env_key(key) && !value.is_empty() && !is_placeholder_value(value) {
544 findings.push(SecurityFinding {
545 id: format!("env-secret-{}-{}", path.display(), line_num),
546 title: format!("Sensitive Environment Variable: {}", key),
547 description: format!("Environment variable '{}' contains a potentially sensitive value", key),
548 severity: determine_env_severity(key, value),
549 category: SecurityCategory::SecretsExposure,
550 file_path: Some(path.clone()),
551 line_number: Some(line_num + 1),
552 column_number: Some(eq_pos + 1),
553 evidence: Some(format!("{}=***", key)),
554 remediation: vec![
555 "Ensure .env files are in .gitignore".to_string(),
556 "Use .env.example for documentation".to_string(),
557 "Consider using a secure secret management service".to_string(),
558 ],
559 references: vec![
560 "https://12factor.net/config".to_string(),
561 ],
562 cwe_id: Some("CWE-798".to_string()),
563 compliance_frameworks: vec!["SOC2".to_string(), "GDPR".to_string()],
564 });
565 }
566 }
567 }
568
569 Ok(findings)
570 }
571}
572
/// Returns `true` when the upper-cased key contains any of the fragments
/// associated with secrets or credentials (substring match).
fn is_sensitive_env_key(key: &str) -> bool {
    const SENSITIVE_FRAGMENTS: &[&str] = &[
        "PASSWORD", "SECRET", "KEY", "TOKEN", "API", "AUTH",
        "PRIVATE", "CREDENTIAL", "ACCESS", "CLIENT", "STRIPE",
        "AWS", "GOOGLE", "AZURE", "DATABASE", "DB_", "JWT",
    ];

    let normalized = key.to_uppercase();
    for fragment in SENSITIVE_FRAGMENTS {
        if normalized.contains(*fragment) {
            return true;
        }
    }
    false
}
584
/// Returns `true` when the lower-cased value contains any fragment that marks
/// it as a placeholder or template rather than a real secret (substring match).
fn is_placeholder_value(value: &str) -> bool {
    const PLACEHOLDER_FRAGMENTS: &[&str] = &[
        "your_", "change_me", "xxx", "placeholder", "example",
        "test", "demo", "fake", "dummy", "<", ">", "${", "}",
    ];

    let normalized = value.to_lowercase();
    for fragment in PLACEHOLDER_FRAGMENTS {
        if normalized.contains(*fragment) {
            return true;
        }
    }
    false
}
595
596fn determine_env_severity(key: &str, _value: &str) -> SecuritySeverity {
598 let key_upper = key.to_uppercase();
599
600 if key_upper.contains("DATABASE") || key_upper.contains("DB_PASS") ||
602 key_upper.contains("AWS_SECRET") || key_upper.contains("STRIPE_SECRET") {
603 return SecuritySeverity::Critical;
604 }
605
606 if key_upper.contains("API") || key_upper.contains("SECRET") ||
608 key_upper.contains("PRIVATE") || key_upper.contains("TOKEN") {
609 return SecuritySeverity::High;
610 }
611
612 if key_upper.contains("PASSWORD") || key_upper.contains("AUTH") {
614 return SecuritySeverity::Medium;
615 }
616
617 SecuritySeverity::Low
618}
619
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use std::fs;

    /// Only the sensitive keys with real-looking values are reported: the
    /// public URL key is not sensitive and `TEST_VAR` is neither sensitive nor
    /// a real value.
    #[test]
    fn test_env_file_scanner() {
        let workspace = TempDir::new().unwrap();
        let env_path = workspace.path().join(".env");

        let fixture = r#"
# Database config
DATABASE_URL=postgres://user:password@localhost/db
API_KEY=sk-1234567890abcdef
PUBLIC_URL=https://example.com
TEST_VAR=placeholder_value
"#;
        fs::write(&env_path, fixture).unwrap();

        let findings = EnvFileScanner::scan_env_file(&env_path).unwrap();

        assert_eq!(findings.len(), 2);
        for expected_key in &["DATABASE_URL", "API_KEY"] {
            assert!(findings.iter().any(|f| f.title.contains(*expected_key)));
        }
    }

    /// Placeholder fragments are recognised; a realistic key is not.
    #[test]
    fn test_placeholder_detection() {
        for placeholder in &["your_api_key_here", "<YOUR_TOKEN>", "xxx"] {
            assert!(is_placeholder_value(placeholder));
        }
        assert!(!is_placeholder_value("sk-1234567890"));
    }
}