syncable_cli/analyzer/security/turbo/
scanner.rs

1//! # Scanner Module
2//! 
3//! High-performance file scanning with memory-mapped I/O and parallel processing.
4
5use std::path::PathBuf;
6use std::sync::Arc;
7use std::fs::File;
8use std::io::{self, Read, BufReader};
9
10use memmap2::MmapOptions;
11use crossbeam::channel::{Receiver, Sender};
12use parking_lot::{Mutex, RwLock};
13use log::{debug, trace, warn};
14
15use super::file_discovery::FileMetadata;
16use super::pattern_engine::{PatternEngine, PatternMatch};
17use super::cache::SecurityCache;
18use crate::analyzer::security::{SecurityFinding, SecuritySeverity, SecurityCategory};
19
20/// Scan task for a worker thread
21#[derive(Debug)]
22pub struct ScanTask {
23    pub id: usize,
24    pub file: FileMetadata,
25    pub quick_reject: bool,
26}
27
28/// Scan result from a worker thread
29#[derive(Debug)]
30pub enum ScanResult {
31    Findings(Vec<SecurityFinding>),
32    Skipped,
33    Error(String),
34}
35
36/// File scanner worker
37pub struct FileScanner {
38    thread_id: usize,
39    pattern_engine: Arc<PatternEngine>,
40    cache: Arc<SecurityCache>,
41    use_mmap: bool,
42}
43
44impl FileScanner {
45    pub fn new(
46        thread_id: usize,
47        pattern_engine: Arc<PatternEngine>,
48        cache: Arc<SecurityCache>,
49        use_mmap: bool,
50    ) -> Self {
51        Self {
52            thread_id,
53            pattern_engine,
54            cache,
55            use_mmap,
56        }
57    }
58    
59    /// Run the scanner worker
60    pub fn run(
61        &self,
62        task_receiver: Receiver<ScanTask>,
63        result_sender: Sender<ScanResult>,
64        critical_count: Arc<Mutex<usize>>,
65        should_terminate: Arc<RwLock<bool>>,
66        max_critical: Option<usize>,
67    ) {
68        debug!("Scanner thread {} started", self.thread_id);
69        
70        while let Ok(task) = task_receiver.recv() {
71            // Check for early termination
72            if *should_terminate.read() {
73                debug!("Scanner thread {} terminating early", self.thread_id);
74                break;
75            }
76            
77            // Process the scan task
78            let result = self.scan_file(task);
79            
80            // Check for critical findings
81            if let ScanResult::Findings(ref findings) = result {
82                let critical_findings = findings.iter()
83                    .filter(|f| f.severity == SecuritySeverity::Critical)
84                    .count();
85                
86                if critical_findings > 0 {
87                    let mut count = critical_count.lock();
88                    *count += critical_findings;
89                    
90                    if let Some(max) = max_critical {
91                        if *count >= max {
92                            *should_terminate.write() = true;
93                            debug!("Critical findings limit reached, triggering early termination");
94                        }
95                    }
96                }
97            }
98            
99            // Send result
100            if result_sender.send(result).is_err() {
101                break; // Channel closed
102            }
103        }
104        
105        debug!("Scanner thread {} finished", self.thread_id);
106    }
107    
108    /// Scan a single file
109    fn scan_file(&self, task: ScanTask) -> ScanResult {
110        trace!("Thread {} scanning: {}", self.thread_id, task.file.path.display());
111        
112        // Check cache first
113        if let Some(cached_result) = self.cache.get(&task.file.path) {
114            trace!("Cache hit for: {}", task.file.path.display());
115            return ScanResult::Findings(cached_result);
116        }
117        
118        // Read file content
119        let content = match self.read_file_content(&task.file) {
120            Ok(content) => content,
121            Err(e) => {
122                warn!("Failed to read file {}: {}", task.file.path.display(), e);
123                return ScanResult::Error(e.to_string());
124            }
125        };
126        
127        // Skip if content is empty
128        if content.is_empty() {
129            return ScanResult::Skipped;
130        }
131        
132        // Scan content for patterns
133        let matches = self.pattern_engine.scan_content(&content, task.quick_reject, &task.file);
134        
135        // Convert matches to findings
136        let findings = self.convert_matches_to_findings(matches, &task.file);
137        
138        // Cache the result
139        self.cache.insert(task.file.path.clone(), findings.clone());
140        
141        ScanResult::Findings(findings)
142    }
143    
144    /// Read file content with optimal method and content validation
145    fn read_file_content(&self, file_meta: &FileMetadata) -> io::Result<String> {
146        let content = if self.use_mmap && file_meta.size > 4096 {
147            self.read_file_mmap(&file_meta.path)?
148        } else {
149            self.read_file_buffered(&file_meta.path)?
150        };
151        
152        // Additional content validation to skip files that are unlikely to contain secrets
153        if self.should_skip_content(&content, file_meta) {
154            return Ok(String::new()); // Return empty string to indicate skip
155        }
156        
157        Ok(content)
158    }
159    
160    /// Check if file content should be skipped based on content analysis
161    fn should_skip_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
162        // Skip empty or very small files
163        if content.trim().is_empty() || content.len() < 10 {
164            return true;
165        }
166        
167        // Skip binary-like content that passed initial filtering
168        if self.is_binary_content(content) {
169            return true;
170        }
171        
172        // Skip generated or compiled files
173        if self.is_generated_content(content, file_meta) {
174            return true;
175        }
176        
177        // Skip files with very high base64 content (likely assets)
178        if self.has_high_base64_content(content) {
179            return true;
180        }
181        
182        false
183    }
184    
185    /// Check if content appears to be binary
186    fn is_binary_content(&self, content: &str) -> bool {
187        // Check for null bytes or high percentage of non-printable characters
188        let non_printable_count = content.chars()
189            .filter(|c| !c.is_ascii() || (c.is_control() && !c.is_whitespace()))
190            .count();
191        
192        let non_printable_ratio = non_printable_count as f32 / content.len() as f32;
193        
194        // If more than 5% non-printable characters, likely binary
195        non_printable_ratio > 0.05
196    }
197    
198    /// Check if content appears to be generated or compiled
199    fn is_generated_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
200        let content_lower = content.to_lowercase();
201        
202        // Check for generation markers
203        let generated_markers = [
204            "// this file is generated",
205            "/* this file is generated",
206            "# this file is generated",
207            "automatically generated",
208            "auto-generated",
209            "autogenerated",
210            "do not edit",
211            "do not modify",
212            "generated by webpack",
213            "generated by babel",
214            "compiled by typescript",
215            "@generated",
216            "sourcemappingurl=",
217        ];
218        
219        if generated_markers.iter().any(|&marker| content_lower.contains(marker)) {
220            return true;
221        }
222        
223        // Check for source maps
224        if content.contains("//# sourceMappingURL=") || content.contains("/*# sourceMappingURL=") {
225            return true;
226        }
227        
228        // Check for code generation files (files that generate example code)
229        if self.is_code_generation_file(content, file_meta) {
230            return true;
231        }
232        
233        // Check for minified JavaScript/CSS
234        if self.is_minified_js_css(content, file_meta) {
235            return true;
236        }
237        
238        false
239    }
240    
241    /// Check if file is primarily for code generation/examples
242    fn is_code_generation_file(&self, content: &str, file_meta: &FileMetadata) -> bool {
243        let content_lower = content.to_lowercase();
244        
245        // Check filename patterns
246        if let Some(filename) = file_meta.path.file_name().and_then(|n| n.to_str()) {
247            let filename_lower = filename.to_lowercase();
248            let code_gen_filenames = [
249                "apicodedialog", "codedialog", "codeexample", "apiexample",
250                "codesnippet", "snippets", "examples", "templates",
251                "codegenerator", "apitool"
252            ];
253            
254            if code_gen_filenames.iter().any(|&pattern| filename_lower.contains(pattern)) {
255                return true;
256            }
257        }
258        
259        // Check content patterns - files that primarily generate code examples
260        let code_gen_content_patterns = [
261            // Function names
262            "getcode(", "generatecode", "getcodewithauthorization",
263            "getconfigcode", "getmulticonfigcode",
264            // Template/example generation
265            "api_url =", "def query(", "async function query",
266            "import requests", "const response = await fetch",
267            "curl ", "bearer ${", "authorization: \"bearer",
268            // React component patterns for code display
269            "copyblock", "codeblock", "react-code-blocks",
270            // High density of template literals
271        ];
272        
273        let pattern_matches = code_gen_content_patterns.iter()
274            .filter(|&pattern| content_lower.contains(pattern))
275            .count();
276        
277        // If we have multiple code generation patterns, likely a code gen file
278        if pattern_matches >= 3 {
279            return true;
280        }
281        
282        // Check for high density of template literals with API patterns
283        let template_literal_count = content.matches("${").count();
284        let api_pattern_count = content_lower.matches("api").count() + 
285                               content_lower.matches("bearer").count() +
286                               content_lower.matches("authorization").count();
287        
288        // High template literal density + API patterns = likely code generation
289        if template_literal_count > 5 && api_pattern_count > 3 {
290            return true;
291        }
292        
293        false
294    }
295    
296    /// Check if content is minified JavaScript or CSS
297    fn is_minified_js_css(&self, content: &str, file_meta: &FileMetadata) -> bool {
298        let has_js_css_ext = file_meta.extension.as_deref()
299            .map(|ext| matches!(ext, "js" | "css" | "mjs" | "cjs"))
300            .unwrap_or(false);
301        
302        if !has_js_css_ext {
303            return false;
304        }
305        
306        let lines: Vec<&str> = content.lines().collect();
307        
308        // Minified files typically have very few lines but very long lines
309        if lines.len() < 10 {
310            let avg_line_length = content.len() / lines.len().max(1);
311            if avg_line_length > 500 {
312                return true;
313            }
314        }
315        
316        // Check for typical minification patterns
317        if content.contains(";var ") || content.contains(",function(") || 
318           content.contains("!function(") || content.contains(";!function") {
319            return true;
320        }
321        
322        false
323    }
324    
325    /// Check if content has high percentage of base64-like data
326    fn has_high_base64_content(&self, content: &str) -> bool {
327        // Skip if content is too small
328        if content.len() < 100 {
329            return false;
330        }
331        
332        let base64_chars = content.chars()
333            .filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=')
334            .count();
335        
336        let base64_ratio = base64_chars as f32 / content.len() as f32;
337        
338        // High base64 ratio suggests encoded content (images, fonts, etc.)
339        // But allow JWT tokens which are smaller
340        if base64_ratio > 0.7 && content.len() > 1000 && !content.contains("eyJ") {
341            return true;
342        }
343        
344        // Check for data URLs
345        if content.contains("data:image/") || content.contains("data:font/") ||
346           content.contains("data:application/") {
347            return true;
348        }
349        
350        false
351    }
352    
353    /// Read file using memory mapping
354    fn read_file_mmap(&self, path: &PathBuf) -> io::Result<String> {
355        let file = File::open(path)?;
356        let mmap = unsafe { MmapOptions::new().map(&file)? };
357        
358        // Validate UTF-8 using SIMD if available
359        match simdutf8::basic::from_utf8(&mmap) {
360            Ok(content) => Ok(content.to_string()),
361            Err(_) => {
362                // Fallback to lossy conversion for non-UTF8 files
363                Ok(String::from_utf8_lossy(&mmap).to_string())
364            }
365        }
366    }
367    
368    /// Read file using buffered I/O
369    fn read_file_buffered(&self, path: &PathBuf) -> io::Result<String> {
370        let file = File::open(path)?;
371        let mut reader = BufReader::with_capacity(8192, file);
372        let mut content = String::new();
373        reader.read_to_string(&mut content)?;
374        Ok(content)
375    }
376    
377    /// Convert pattern matches to security findings
378    fn convert_matches_to_findings(&self, matches: Vec<PatternMatch>, file_meta: &FileMetadata) -> Vec<SecurityFinding> {
379        matches.into_iter()
380            .map(|match_| {
381                SecurityFinding {
382                    id: format!("{}-{}-{}", match_.pattern.id, file_meta.path.display(), match_.line_number),
383                    title: match_.pattern.name.clone(),
384                    description: self.enhance_description(&match_.pattern.description, file_meta),
385                    severity: self.adjust_severity(&match_.pattern.severity, file_meta, match_.confidence),
386                    category: match_.pattern.category.clone(),
387                    file_path: Some(file_meta.path.clone()),
388                    line_number: Some(match_.line_number),
389                    column_number: Some(match_.column_number),
390                    evidence: Some(match_.evidence),
391                    remediation: match_.pattern.remediation.clone(),
392                    references: match_.pattern.references.clone(),
393                    cwe_id: match_.pattern.cwe_id.clone(),
394                    compliance_frameworks: self.get_compliance_frameworks(&match_.pattern.category),
395                }
396            })
397            .collect()
398    }
399    
400    /// Enhance description with file context and proper gitignore status
401    fn enhance_description(&self, base_description: &str, file_meta: &FileMetadata) -> String {
402        let mut description = base_description.to_string();
403        
404        // Add comprehensive gitignore context for status determination
405        if file_meta.is_gitignored {
406            // File is properly protected
407            if file_meta.priority_hints.is_env_file || 
408               file_meta.priority_hints.is_config_file ||
409               base_description.to_lowercase().contains("secret") ||
410               base_description.to_lowercase().contains("key") ||
411               base_description.to_lowercase().contains("token") {
412                description.push_str(" (File is protected by .gitignore)");
413            } else {
414                description.push_str(" (File appears safe for version control)");
415            }
416        } else {
417            // File is NOT gitignored - determine risk level
418            if self.file_contains_secrets(file_meta) {
419                // Check if tracked by git using git command
420                if self.is_file_tracked_by_git(&file_meta.path) {
421                    description.push_str(" (File is tracked by git and may expose secrets in version history - CRITICAL RISK)");
422                } else {
423                    description.push_str(" (File is NOT in .gitignore but contains secrets - HIGH RISK)");
424                }
425            } else {
426                description.push_str(" (File appears safe for version control)");
427            }
428        }
429        
430        // Add file type context
431        if file_meta.priority_hints.is_env_file {
432            description.push_str(" [Environment file]");
433        } else if file_meta.priority_hints.is_config_file {
434            description.push_str(" [Configuration file]");
435        }
436        
437        description
438    }
439    
440    /// Check if file likely contains secrets based on patterns
441    fn file_contains_secrets(&self, file_meta: &FileMetadata) -> bool {
442        // Check file name patterns
443        if let Some(file_name) = file_meta.path.file_name().and_then(|n| n.to_str()) {
444            let file_name_lower = file_name.to_lowercase();
445            let secret_file_patterns = [
446                ".env", ".key", ".pem", ".p12", ".pfx", 
447                "id_rsa", "id_dsa", "id_ecdsa", "id_ed25519",
448                "credentials", "secrets", "private", "secret.json",
449                "service-account", "auth.json", "config.json"
450            ];
451            
452            if secret_file_patterns.iter().any(|pattern| file_name_lower.contains(pattern)) {
453                return true;
454            }
455        }
456        
457        // Check if it's a priority file (likely to contain secrets)
458        file_meta.priority_hints.is_env_file || 
459        file_meta.priority_hints.is_config_file ||
460        file_meta.is_critical()
461    }
462    
463    /// Check if file is tracked by git
464    fn is_file_tracked_by_git(&self, file_path: &std::path::PathBuf) -> bool {
465        use std::process::Command;
466        
467        Command::new("git")
468            .args(&["ls-files", "--error-unmatch"])
469            .arg(file_path)
470            .output()
471            .map(|output| output.status.success())
472            .unwrap_or(false)
473    }
474    
475    /// Adjust severity based on context
476    fn adjust_severity(&self, base_severity: &SecuritySeverity, file_meta: &FileMetadata, confidence: f32) -> SecuritySeverity {
477        let mut severity = base_severity.clone();
478        let filename = file_meta.path.file_name().and_then(|s| s.to_str()).unwrap_or("");
479
480        // Downgrade severity for known public/client-side keys in specific files.
481        if filename == "GoogleService-Info.plist" || filename.ends_with(".plist") {
482            if matches!(severity, SecuritySeverity::Critical | SecuritySeverity::High) {
483                return SecuritySeverity::Medium; // It's a client-side key, less critical.
484            }
485        }
486        
487        // Upgrade severity for unprotected files
488        if !file_meta.is_gitignored && matches!(severity, SecuritySeverity::Medium | SecuritySeverity::High) {
489            severity = match severity {
490                SecuritySeverity::Medium => SecuritySeverity::High,
491                SecuritySeverity::High => SecuritySeverity::Critical,
492                _ => severity,
493            };
494        }
495        
496        // Downgrade for low confidence
497        if confidence < 0.5 && matches!(severity, SecuritySeverity::High | SecuritySeverity::Critical) {
498            severity = match severity {
499                SecuritySeverity::Critical => SecuritySeverity::High,
500                SecuritySeverity::High => SecuritySeverity::Medium,
501                _ => severity,
502            };
503        }
504        
505        severity
506    }
507    
508    /// Get compliance frameworks based on category
509    fn get_compliance_frameworks(&self, category: &SecurityCategory) -> Vec<String> {
510        match category {
511            SecurityCategory::SecretsExposure => vec!["SOC2".to_string(), "GDPR".to_string(), "PCI-DSS".to_string()],
512            SecurityCategory::InsecureConfiguration => vec!["SOC2".to_string(), "OWASP".to_string()],
513            SecurityCategory::AuthenticationSecurity => vec!["SOC2".to_string(), "OWASP".to_string()],
514            SecurityCategory::DataProtection => vec!["GDPR".to_string(), "CCPA".to_string()],
515            _ => vec!["SOC2".to_string()],
516        }
517    }
518}
519
520/// Specialized scanner for .env files
521pub struct EnvFileScanner;
522
523impl EnvFileScanner {
524    /// Fast scan of .env files without regex
525    pub fn scan_env_file(path: &PathBuf) -> Result<Vec<SecurityFinding>, io::Error> {
526        let content = std::fs::read_to_string(path)?;
527        let mut findings = Vec::new();
528        
529        for (line_num, line) in content.lines().enumerate() {
530            let line = line.trim();
531            
532            // Skip comments and empty lines
533            if line.is_empty() || line.starts_with('#') {
534                continue;
535            }
536            
537            // Parse key=value pairs
538            if let Some(eq_pos) = line.find('=') {
539                let key = &line[..eq_pos].trim();
540                let value = &line[eq_pos + 1..].trim_matches('"').trim_matches('\'');
541                
542                // Check for sensitive keys with actual values
543                if is_sensitive_env_key(key) && !value.is_empty() && !is_placeholder_value(value) {
544                    findings.push(SecurityFinding {
545                        id: format!("env-secret-{}-{}", path.display(), line_num),
546                        title: format!("Sensitive Environment Variable: {}", key),
547                        description: format!("Environment variable '{}' contains a potentially sensitive value", key),
548                        severity: determine_env_severity(key, value),
549                        category: SecurityCategory::SecretsExposure,
550                        file_path: Some(path.clone()),
551                        line_number: Some(line_num + 1),
552                        column_number: Some(eq_pos + 1),
553                        evidence: Some(format!("{}=***", key)),
554                        remediation: vec![
555                            "Ensure .env files are in .gitignore".to_string(),
556                            "Use .env.example for documentation".to_string(),
557                            "Consider using a secure secret management service".to_string(),
558                        ],
559                        references: vec![
560                            "https://12factor.net/config".to_string(),
561                        ],
562                        cwe_id: Some("CWE-798".to_string()),
563                        compliance_frameworks: vec!["SOC2".to_string(), "GDPR".to_string()],
564                    });
565                }
566            }
567        }
568        
569        Ok(findings)
570    }
571}
572
/// Returns true when an environment variable name contains, ignoring case,
/// any fragment that suggests it holds a credential or other sensitive setting.
fn is_sensitive_env_key(key: &str) -> bool {
    const SENSITIVE_FRAGMENTS: &[&str] = &[
        "PASSWORD", "SECRET", "KEY", "TOKEN", "API", "AUTH",
        "PRIVATE", "CREDENTIAL", "ACCESS", "CLIENT", "STRIPE",
        "AWS", "GOOGLE", "AZURE", "DATABASE", "DB_", "JWT",
    ];

    let normalized = key.to_uppercase();
    for fragment in SENSITIVE_FRAGMENTS {
        if normalized.contains(*fragment) {
            return true;
        }
    }
    false
}
584
/// Returns true when a value looks like a placeholder rather than a real secret.
///
/// A value is a placeholder when, ignoring case, it contains one of the
/// well-known filler fragments (`your_`, `change_me`, `xxx`, `placeholder`,
/// `example`, `test`, `demo`, `fake`, `dummy`, `${`), or when it contains a
/// bracketed span: `<` followed later by `>` (for example `<YOUR_TOKEN>`), or
/// `{` followed later by `}` (for example `${VAR}` or `{{ var }}`).
///
/// A lone `<`, `>` or `}` does not count. Real passwords and tokens often
/// contain such characters, and treating any single one as a placeholder
/// would silently hide them from the scan.
fn is_placeholder_value(value: &str) -> bool {
    const PLACEHOLDER_FRAGMENTS: &[&str] = &[
        "your_", "change_me", "xxx", "placeholder", "example",
        "test", "demo", "fake", "dummy", "${",
    ];

    /// True when `open` occurs in `text` and `close` occurs somewhere after it.
    fn has_bracketed_span(text: &str, open: char, close: char) -> bool {
        text.find(open)
            .map_or(false, |start| text[start + open.len_utf8()..].contains(close))
    }

    let value_lower = value.to_lowercase();

    PLACEHOLDER_FRAGMENTS
        .iter()
        .any(|fragment| value_lower.contains(*fragment))
        || has_bracketed_span(value, '<', '>')
        || has_bracketed_span(value, '{', '}')
}
595
596/// Determine severity based on the type of secret
597fn determine_env_severity(key: &str, _value: &str) -> SecuritySeverity {
598    let key_upper = key.to_uppercase();
599    
600    // Critical: API keys, database credentials
601    if key_upper.contains("DATABASE") || key_upper.contains("DB_PASS") ||
602       key_upper.contains("AWS_SECRET") || key_upper.contains("STRIPE_SECRET") {
603        return SecuritySeverity::Critical;
604    }
605    
606    // High: Most API keys and secrets
607    if key_upper.contains("API") || key_upper.contains("SECRET") ||
608       key_upper.contains("PRIVATE") || key_upper.contains("TOKEN") {
609        return SecuritySeverity::High;
610    }
611    
612    // Medium: General passwords and auth
613    if key_upper.contains("PASSWORD") || key_upper.contains("AUTH") {
614        return SecuritySeverity::Medium;
615    }
616    
617    SecuritySeverity::Low
618}
619
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Only keys that look sensitive and carry a real value are reported.
    #[test]
    fn test_env_file_scanner() {
        let temp_dir = TempDir::new().unwrap();
        let env_file = temp_dir.path().join(".env");

        let fixture = r#"
# Database config
DATABASE_URL=postgres://user:password@localhost/db
API_KEY=sk-1234567890abcdef
PUBLIC_URL=https://example.com
TEST_VAR=placeholder_value
"#;
        fs::write(&env_file, fixture).unwrap();

        let findings = EnvFileScanner::scan_env_file(&env_file).unwrap();
        let reported = |name: &str| findings.iter().any(|finding| finding.title.contains(name));

        // DATABASE_URL and API_KEY are reported; PUBLIC_URL and TEST_VAR are not.
        assert_eq!(findings.len(), 2);
        assert!(reported("DATABASE_URL"));
        assert!(reported("API_KEY"));
    }

    /// Filler values are recognised; a realistic key is not.
    #[test]
    fn test_placeholder_detection() {
        for placeholder in ["your_api_key_here", "<YOUR_TOKEN>", "xxx"].iter() {
            assert!(is_placeholder_value(placeholder));
        }
        assert!(!is_placeholder_value("sk-1234567890"));
    }
}
654}