// syncable_cli/analyzer/security/turbo/scanner.rs

//! # Scanner Module
//!
//! High-performance file scanning with memory-mapped I/O and parallel processing.
4
5use std::path::PathBuf;
6use std::sync::Arc;
7use std::fs::File;
8use std::io::{self, Read, BufReader};
9
10use memmap2::MmapOptions;
11use crossbeam::channel::{Receiver, Sender};
12use parking_lot::{Mutex, RwLock};
13use log::{debug, trace, warn};
14
15use super::file_discovery::FileMetadata;
16use super::pattern_engine::{PatternEngine, PatternMatch};
17use super::cache::SecurityCache;
18use crate::analyzer::security::{SecurityFinding, SecuritySeverity, SecurityCategory};
19
20/// Scan task for a worker thread
21#[derive(Debug)]
22pub struct ScanTask {
23    pub id: usize,
24    pub file: FileMetadata,
25    pub quick_reject: bool,
26}
27
28/// Scan result from a worker thread
29#[derive(Debug)]
30pub enum ScanResult {
31    Findings(Vec<SecurityFinding>),
32    Skipped,
33    Error(String),
34}
35
36/// File scanner worker
37pub struct FileScanner {
38    thread_id: usize,
39    pattern_engine: Arc<PatternEngine>,
40    cache: Arc<SecurityCache>,
41    use_mmap: bool,
42}
43
44impl FileScanner {
45    pub fn new(
46        thread_id: usize,
47        pattern_engine: Arc<PatternEngine>,
48        cache: Arc<SecurityCache>,
49        use_mmap: bool,
50    ) -> Self {
51        Self {
52            thread_id,
53            pattern_engine,
54            cache,
55            use_mmap,
56        }
57    }
58    
59    /// Run the scanner worker
60    pub fn run(
61        &self,
62        task_receiver: Receiver<ScanTask>,
63        result_sender: Sender<ScanResult>,
64        critical_count: Arc<Mutex<usize>>,
65        should_terminate: Arc<RwLock<bool>>,
66        max_critical: Option<usize>,
67    ) {
68        debug!("Scanner thread {} started", self.thread_id);
69        
70        while let Ok(task) = task_receiver.recv() {
71            // Check for early termination
72            if *should_terminate.read() {
73                debug!("Scanner thread {} terminating early", self.thread_id);
74                break;
75            }
76            
77            // Process the scan task
78            let result = self.scan_file(task);
79            
80            // Check for critical findings
81            if let ScanResult::Findings(ref findings) = result {
82                let critical_findings = findings.iter()
83                    .filter(|f| f.severity == SecuritySeverity::Critical)
84                    .count();
85                
86                if critical_findings > 0 {
87                    let mut count = critical_count.lock();
88                    *count += critical_findings;
89                    
90                    if let Some(max) = max_critical {
91                        if *count >= max {
92                            *should_terminate.write() = true;
93                            debug!("Critical findings limit reached, triggering early termination");
94                        }
95                    }
96                }
97            }
98            
99            // Send result
100            if result_sender.send(result).is_err() {
101                break; // Channel closed
102            }
103        }
104        
105        debug!("Scanner thread {} finished", self.thread_id);
106    }
107    
108    /// Scan a single file
109    fn scan_file(&self, task: ScanTask) -> ScanResult {
110        trace!("Thread {} scanning: {}", self.thread_id, task.file.path.display());
111        
112        // Check cache first
113        if let Some(cached_result) = self.cache.get(&task.file.path) {
114            trace!("Cache hit for: {}", task.file.path.display());
115            return ScanResult::Findings(cached_result);
116        }
117        
118        // Read file content
119        let content = match self.read_file_content(&task.file) {
120            Ok(content) => content,
121            Err(e) => {
122                warn!("Failed to read file {}: {}", task.file.path.display(), e);
123                return ScanResult::Error(e.to_string());
124            }
125        };
126        
127        // Skip if content is empty
128        if content.is_empty() {
129            return ScanResult::Skipped;
130        }
131        
132        // Scan content for patterns
133        let matches = self.pattern_engine.scan_content(&content, task.quick_reject);
134        
135        // Convert matches to findings
136        let findings = self.convert_matches_to_findings(matches, &task.file);
137        
138        // Cache the result
139        self.cache.insert(task.file.path.clone(), findings.clone());
140        
141        ScanResult::Findings(findings)
142    }
143    
144    /// Read file content with optimal method
145    fn read_file_content(&self, file_meta: &FileMetadata) -> io::Result<String> {
146        // Use memory mapping for larger files if enabled
147        if self.use_mmap && file_meta.size > 4096 {
148            self.read_file_mmap(&file_meta.path)
149        } else {
150            self.read_file_buffered(&file_meta.path)
151        }
152    }
153    
154    /// Read file using memory mapping
155    fn read_file_mmap(&self, path: &PathBuf) -> io::Result<String> {
156        let file = File::open(path)?;
157        let mmap = unsafe { MmapOptions::new().map(&file)? };
158        
159        // Validate UTF-8 using SIMD if available
160        match simdutf8::basic::from_utf8(&mmap) {
161            Ok(content) => Ok(content.to_string()),
162            Err(_) => {
163                // Fallback to lossy conversion for non-UTF8 files
164                Ok(String::from_utf8_lossy(&mmap).to_string())
165            }
166        }
167    }
168    
169    /// Read file using buffered I/O
170    fn read_file_buffered(&self, path: &PathBuf) -> io::Result<String> {
171        let file = File::open(path)?;
172        let mut reader = BufReader::with_capacity(8192, file);
173        let mut content = String::new();
174        reader.read_to_string(&mut content)?;
175        Ok(content)
176    }
177    
178    /// Convert pattern matches to security findings
179    fn convert_matches_to_findings(&self, matches: Vec<PatternMatch>, file_meta: &FileMetadata) -> Vec<SecurityFinding> {
180        matches.into_iter()
181            .map(|match_| {
182                SecurityFinding {
183                    id: format!("{}-{}-{}", match_.pattern.id, file_meta.path.display(), match_.line_number),
184                    title: match_.pattern.name.clone(),
185                    description: self.enhance_description(&match_.pattern.description, file_meta),
186                    severity: self.adjust_severity(&match_.pattern.severity, file_meta, match_.confidence),
187                    category: match_.pattern.category.clone(),
188                    file_path: Some(file_meta.path.clone()),
189                    line_number: Some(match_.line_number),
190                    column_number: Some(match_.column_number),
191                    evidence: Some(match_.evidence),
192                    remediation: match_.pattern.remediation.clone(),
193                    references: match_.pattern.references.clone(),
194                    cwe_id: match_.pattern.cwe_id.clone(),
195                    compliance_frameworks: self.get_compliance_frameworks(&match_.pattern.category),
196                }
197            })
198            .collect()
199    }
200    
201    /// Enhance description with file context and proper gitignore status
202    fn enhance_description(&self, base_description: &str, file_meta: &FileMetadata) -> String {
203        let mut description = base_description.to_string();
204        
205        // Add comprehensive gitignore context for status determination
206        if file_meta.is_gitignored {
207            // File is properly protected
208            if file_meta.priority_hints.is_env_file || 
209               file_meta.priority_hints.is_config_file ||
210               base_description.to_lowercase().contains("secret") ||
211               base_description.to_lowercase().contains("key") ||
212               base_description.to_lowercase().contains("token") {
213                description.push_str(" (File is protected by .gitignore)");
214            } else {
215                description.push_str(" (File appears safe for version control)");
216            }
217        } else {
218            // File is NOT gitignored - determine risk level
219            if self.file_contains_secrets(file_meta) {
220                // Check if tracked by git using git command
221                if self.is_file_tracked_by_git(&file_meta.path) {
222                    description.push_str(" (File is tracked by git and may expose secrets in version history - CRITICAL RISK)");
223                } else {
224                    description.push_str(" (File is NOT in .gitignore but contains secrets - HIGH RISK)");
225                }
226            } else {
227                description.push_str(" (File appears safe for version control)");
228            }
229        }
230        
231        // Add file type context
232        if file_meta.priority_hints.is_env_file {
233            description.push_str(" [Environment file]");
234        } else if file_meta.priority_hints.is_config_file {
235            description.push_str(" [Configuration file]");
236        }
237        
238        description
239    }
240    
241    /// Check if file likely contains secrets based on patterns
242    fn file_contains_secrets(&self, file_meta: &FileMetadata) -> bool {
243        // Check file name patterns
244        if let Some(file_name) = file_meta.path.file_name().and_then(|n| n.to_str()) {
245            let file_name_lower = file_name.to_lowercase();
246            let secret_file_patterns = [
247                ".env", ".key", ".pem", ".p12", ".pfx", 
248                "id_rsa", "id_dsa", "id_ecdsa", "id_ed25519",
249                "credentials", "secrets", "private", "secret.json",
250                "service-account", "auth.json", "config.json"
251            ];
252            
253            if secret_file_patterns.iter().any(|pattern| file_name_lower.contains(pattern)) {
254                return true;
255            }
256        }
257        
258        // Check if it's a priority file (likely to contain secrets)
259        file_meta.priority_hints.is_env_file || 
260        file_meta.priority_hints.is_config_file ||
261        file_meta.is_critical()
262    }
263    
264    /// Check if file is tracked by git
265    fn is_file_tracked_by_git(&self, file_path: &std::path::PathBuf) -> bool {
266        use std::process::Command;
267        
268        Command::new("git")
269            .args(&["ls-files", "--error-unmatch"])
270            .arg(file_path)
271            .output()
272            .map(|output| output.status.success())
273            .unwrap_or(false)
274    }
275    
276    /// Adjust severity based on context
277    fn adjust_severity(&self, base_severity: &SecuritySeverity, file_meta: &FileMetadata, confidence: f32) -> SecuritySeverity {
278        let mut severity = base_severity.clone();
279        
280        // Upgrade severity for unprotected files
281        if !file_meta.is_gitignored && matches!(severity, SecuritySeverity::Medium | SecuritySeverity::High) {
282            severity = match severity {
283                SecuritySeverity::Medium => SecuritySeverity::High,
284                SecuritySeverity::High => SecuritySeverity::Critical,
285                _ => severity,
286            };
287        }
288        
289        // Downgrade for low confidence
290        if confidence < 0.5 && matches!(severity, SecuritySeverity::High | SecuritySeverity::Critical) {
291            severity = match severity {
292                SecuritySeverity::Critical => SecuritySeverity::High,
293                SecuritySeverity::High => SecuritySeverity::Medium,
294                _ => severity,
295            };
296        }
297        
298        severity
299    }
300    
301    /// Get compliance frameworks based on category
302    fn get_compliance_frameworks(&self, category: &SecurityCategory) -> Vec<String> {
303        match category {
304            SecurityCategory::SecretsExposure => vec!["SOC2".to_string(), "GDPR".to_string(), "PCI-DSS".to_string()],
305            SecurityCategory::InsecureConfiguration => vec!["SOC2".to_string(), "OWASP".to_string()],
306            SecurityCategory::AuthenticationSecurity => vec!["SOC2".to_string(), "OWASP".to_string()],
307            SecurityCategory::DataProtection => vec!["GDPR".to_string(), "CCPA".to_string()],
308            _ => vec!["SOC2".to_string()],
309        }
310    }
311}
312
313/// Specialized scanner for .env files
314pub struct EnvFileScanner;
315
316impl EnvFileScanner {
317    /// Fast scan of .env files without regex
318    pub fn scan_env_file(path: &PathBuf) -> Result<Vec<SecurityFinding>, io::Error> {
319        let content = std::fs::read_to_string(path)?;
320        let mut findings = Vec::new();
321        
322        for (line_num, line) in content.lines().enumerate() {
323            let line = line.trim();
324            
325            // Skip comments and empty lines
326            if line.is_empty() || line.starts_with('#') {
327                continue;
328            }
329            
330            // Parse key=value pairs
331            if let Some(eq_pos) = line.find('=') {
332                let key = &line[..eq_pos].trim();
333                let value = &line[eq_pos + 1..].trim_matches('"').trim_matches('\'');
334                
335                // Check for sensitive keys with actual values
336                if is_sensitive_env_key(key) && !value.is_empty() && !is_placeholder_value(value) {
337                    findings.push(SecurityFinding {
338                        id: format!("env-secret-{}-{}", path.display(), line_num),
339                        title: format!("Sensitive Environment Variable: {}", key),
340                        description: format!("Environment variable '{}' contains a potentially sensitive value", key),
341                        severity: determine_env_severity(key, value),
342                        category: SecurityCategory::SecretsExposure,
343                        file_path: Some(path.clone()),
344                        line_number: Some(line_num + 1),
345                        column_number: Some(eq_pos + 1),
346                        evidence: Some(format!("{}=***", key)),
347                        remediation: vec![
348                            "Ensure .env files are in .gitignore".to_string(),
349                            "Use .env.example for documentation".to_string(),
350                            "Consider using a secure secret management service".to_string(),
351                        ],
352                        references: vec![
353                            "https://12factor.net/config".to_string(),
354                        ],
355                        cwe_id: Some("CWE-798".to_string()),
356                        compliance_frameworks: vec!["SOC2".to_string(), "GDPR".to_string()],
357                    });
358                }
359            }
360        }
361        
362        Ok(findings)
363    }
364}
365
/// Reports whether an environment variable name looks like it holds a secret.
///
/// The name is uppercased and checked for any of a fixed set of substrings
/// (for example `PASSWORD`, `TOKEN`, `DB_`). Matching is by substring, so a
/// marker anywhere inside the name counts.
fn is_sensitive_env_key(key: &str) -> bool {
    const SENSITIVE_MARKERS: &[&str] = &[
        "PASSWORD", "SECRET", "KEY", "TOKEN", "API", "AUTH",
        "PRIVATE", "CREDENTIAL", "ACCESS", "CLIENT", "STRIPE",
        "AWS", "GOOGLE", "AZURE", "DATABASE", "DB_", "JWT",
    ];

    let normalized = key.to_uppercase();
    for marker in SENSITIVE_MARKERS {
        if normalized.contains(*marker) {
            return true;
        }
    }
    false
}
377
/// Reports whether a value looks like a placeholder rather than a real secret.
///
/// The value is lowercased and checked for any of a fixed set of substrings,
/// including template punctuation such as `<`, `>`, `${` and `}`.
fn is_placeholder_value(value: &str) -> bool {
    const PLACEHOLDER_MARKERS: &[&str] = &[
        "your_", "change_me", "xxx", "placeholder", "example",
        "test", "demo", "fake", "dummy", "<", ">", "${", "}",
    ];

    let normalized = value.to_lowercase();
    for marker in PLACEHOLDER_MARKERS {
        if normalized.contains(*marker) {
            return true;
        }
    }
    false
}
388
389/// Determine severity based on the type of secret
390fn determine_env_severity(key: &str, _value: &str) -> SecuritySeverity {
391    let key_upper = key.to_uppercase();
392    
393    // Critical: API keys, database credentials
394    if key_upper.contains("DATABASE") || key_upper.contains("DB_PASS") ||
395       key_upper.contains("AWS_SECRET") || key_upper.contains("STRIPE_SECRET") {
396        return SecuritySeverity::Critical;
397    }
398    
399    // High: Most API keys and secrets
400    if key_upper.contains("API") || key_upper.contains("SECRET") ||
401       key_upper.contains("PRIVATE") || key_upper.contains("TOKEN") {
402        return SecuritySeverity::High;
403    }
404    
405    // Medium: General passwords and auth
406    if key_upper.contains("PASSWORD") || key_upper.contains("AUTH") {
407        return SecuritySeverity::Medium;
408    }
409    
410    SecuritySeverity::Low
411}
412
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use std::fs;

    /// End-to-end check of `.env` scanning: sensitive keys with real-looking
    /// values are reported, non-sensitive keys are not.
    #[test]
    fn test_env_file_scanner() {
        let temp_dir = TempDir::new().unwrap();
        let env_file = temp_dir.path().join(".env");

        fs::write(&env_file, r#"
# Database config
DATABASE_URL=postgres://user:password@localhost/db
API_KEY=sk-1234567890abcdef
PUBLIC_URL=https://example.com
TEST_VAR=placeholder_value
"#).unwrap();

        let findings = EnvFileScanner::scan_env_file(&env_file).unwrap();

        // Should find DATABASE_URL and API_KEY but not PUBLIC_URL or TEST_VAR
        assert_eq!(findings.len(), 2);
        assert!(findings.iter().any(|f| f.title.contains("DATABASE_URL")));
        assert!(findings.iter().any(|f| f.title.contains("API_KEY")));
    }

    /// Placeholder markers are matched case-insensitively as substrings.
    #[test]
    fn test_placeholder_detection() {
        assert!(is_placeholder_value("your_api_key_here"));
        assert!(is_placeholder_value("<YOUR_TOKEN>"));
        assert!(is_placeholder_value("xxx"));
        assert!(!is_placeholder_value("sk-1234567890"));
    }

    /// Sensitive-key markers are matched case-insensitively as substrings.
    #[test]
    fn test_sensitive_key_detection() {
        assert!(is_sensitive_env_key("API_KEY"));
        assert!(is_sensitive_env_key("db_password"));
        assert!(is_sensitive_env_key("Jwt_Signing"));
        assert!(!is_sensitive_env_key("PUBLIC_URL"));
        assert!(!is_sensitive_env_key("PORT"));
    }

    /// The first matching severity tier wins, from critical down to low.
    #[test]
    fn test_env_severity_tiers() {
        assert!(determine_env_severity("DATABASE_URL", "x") == SecuritySeverity::Critical);
        assert!(determine_env_severity("API_KEY", "x") == SecuritySeverity::High);
        assert!(determine_env_severity("ADMIN_PASSWORD", "x") == SecuritySeverity::Medium);
        assert!(determine_env_severity("CLIENT_ID", "x") == SecuritySeverity::Low);
    }
}