syncable_cli/analyzer/security/turbo/
scanner.rs

1//! # Scanner Module
2//!
3//! High-performance file scanning with memory-mapped I/O and parallel processing.
4
5use std::fs::File;
6use std::io::{self, BufReader, Read};
7use std::path::PathBuf;
8use std::sync::Arc;
9
10use crossbeam::channel::{Receiver, Sender};
11use log::{debug, trace, warn};
12use memmap2::MmapOptions;
13use parking_lot::{Mutex, RwLock};
14
15use super::cache::SecurityCache;
16use super::file_discovery::FileMetadata;
17use super::pattern_engine::{PatternEngine, PatternMatch};
18use crate::analyzer::security::{SecurityCategory, SecurityFinding, SecuritySeverity};
19
20/// Scan task for a worker thread
21#[derive(Debug)]
22pub struct ScanTask {
23    pub id: usize,
24    pub file: FileMetadata,
25    pub quick_reject: bool,
26}
27
28/// Scan result from a worker thread
29#[derive(Debug)]
30pub enum ScanResult {
31    Findings(Vec<SecurityFinding>),
32    Skipped,
33    Error(String),
34}
35
36/// File scanner worker
37pub struct FileScanner {
38    thread_id: usize,
39    pattern_engine: Arc<PatternEngine>,
40    cache: Arc<SecurityCache>,
41    use_mmap: bool,
42}
43
44impl FileScanner {
45    pub fn new(
46        thread_id: usize,
47        pattern_engine: Arc<PatternEngine>,
48        cache: Arc<SecurityCache>,
49        use_mmap: bool,
50    ) -> Self {
51        Self {
52            thread_id,
53            pattern_engine,
54            cache,
55            use_mmap,
56        }
57    }
58
59    /// Run the scanner worker
60    pub fn run(
61        &self,
62        task_receiver: Receiver<ScanTask>,
63        result_sender: Sender<ScanResult>,
64        critical_count: Arc<Mutex<usize>>,
65        should_terminate: Arc<RwLock<bool>>,
66        max_critical: Option<usize>,
67    ) {
68        debug!("Scanner thread {} started", self.thread_id);
69
70        while let Ok(task) = task_receiver.recv() {
71            // Check for early termination
72            if *should_terminate.read() {
73                debug!("Scanner thread {} terminating early", self.thread_id);
74                break;
75            }
76
77            // Process the scan task
78            let result = self.scan_file(task);
79
80            // Check for critical findings
81            if let ScanResult::Findings(ref findings) = result {
82                let critical_findings = findings
83                    .iter()
84                    .filter(|f| f.severity == SecuritySeverity::Critical)
85                    .count();
86
87                if critical_findings > 0 {
88                    let mut count = critical_count.lock();
89                    *count += critical_findings;
90
91                    if let Some(max) = max_critical
92                        && *count >= max
93                    {
94                        *should_terminate.write() = true;
95                        debug!("Critical findings limit reached, triggering early termination");
96                    }
97                }
98            }
99
100            // Send result
101            if result_sender.send(result).is_err() {
102                break; // Channel closed
103            }
104        }
105
106        debug!("Scanner thread {} finished", self.thread_id);
107    }
108
109    /// Scan a single file
110    fn scan_file(&self, task: ScanTask) -> ScanResult {
111        trace!(
112            "Thread {} scanning: {}",
113            self.thread_id,
114            task.file.path.display()
115        );
116
117        // Check cache first
118        if let Some(cached_result) = self.cache.get(&task.file.path) {
119            trace!("Cache hit for: {}", task.file.path.display());
120            return ScanResult::Findings(cached_result);
121        }
122
123        // Read file content
124        let content = match self.read_file_content(&task.file) {
125            Ok(content) => content,
126            Err(e) => {
127                warn!("Failed to read file {}: {}", task.file.path.display(), e);
128                return ScanResult::Error(e.to_string());
129            }
130        };
131
132        // Skip if content is empty
133        if content.is_empty() {
134            return ScanResult::Skipped;
135        }
136
137        // Scan content for patterns
138        let matches = self
139            .pattern_engine
140            .scan_content(&content, task.quick_reject, &task.file);
141
142        // Convert matches to findings
143        let findings = self.convert_matches_to_findings(matches, &task.file);
144
145        // Cache the result
146        self.cache.insert(task.file.path.clone(), findings.clone());
147
148        ScanResult::Findings(findings)
149    }
150
151    /// Read file content with optimal method and content validation
152    fn read_file_content(&self, file_meta: &FileMetadata) -> io::Result<String> {
153        let content = if self.use_mmap && file_meta.size > 4096 {
154            self.read_file_mmap(&file_meta.path)?
155        } else {
156            self.read_file_buffered(&file_meta.path)?
157        };
158
159        // Additional content validation to skip files that are unlikely to contain secrets
160        if self.should_skip_content(&content, file_meta) {
161            return Ok(String::new()); // Return empty string to indicate skip
162        }
163
164        Ok(content)
165    }
166
167    /// Check if file content should be skipped based on content analysis
168    fn should_skip_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
169        // Skip empty or very small files
170        if content.trim().is_empty() || content.len() < 10 {
171            return true;
172        }
173
174        // Skip binary-like content that passed initial filtering
175        if self.is_binary_content(content) {
176            return true;
177        }
178
179        // Skip generated or compiled files
180        if self.is_generated_content(content, file_meta) {
181            return true;
182        }
183
184        // Skip files with very high base64 content (likely assets)
185        if self.has_high_base64_content(content) {
186            return true;
187        }
188
189        false
190    }
191
192    /// Check if content appears to be binary
193    fn is_binary_content(&self, content: &str) -> bool {
194        // Check for null bytes or high percentage of non-printable characters
195        let non_printable_count = content
196            .chars()
197            .filter(|c| !c.is_ascii() || (c.is_control() && !c.is_whitespace()))
198            .count();
199
200        let non_printable_ratio = non_printable_count as f32 / content.len() as f32;
201
202        // If more than 5% non-printable characters, likely binary
203        non_printable_ratio > 0.05
204    }
205
206    /// Check if content appears to be generated or compiled
207    fn is_generated_content(&self, content: &str, file_meta: &FileMetadata) -> bool {
208        let content_lower = content.to_lowercase();
209
210        // Check for generation markers
211        let generated_markers = [
212            "// this file is generated",
213            "/* this file is generated",
214            "# this file is generated",
215            "automatically generated",
216            "auto-generated",
217            "autogenerated",
218            "do not edit",
219            "do not modify",
220            "generated by webpack",
221            "generated by babel",
222            "compiled by typescript",
223            "@generated",
224            "sourcemappingurl=",
225        ];
226
227        if generated_markers
228            .iter()
229            .any(|&marker| content_lower.contains(marker))
230        {
231            return true;
232        }
233
234        // Check for source maps
235        if content.contains("//# sourceMappingURL=") || content.contains("/*# sourceMappingURL=") {
236            return true;
237        }
238
239        // Check for code generation files (files that generate example code)
240        if self.is_code_generation_file(content, file_meta) {
241            return true;
242        }
243
244        // Check for minified JavaScript/CSS
245        if self.is_minified_js_css(content, file_meta) {
246            return true;
247        }
248
249        false
250    }
251
252    /// Check if file is primarily for code generation/examples
253    fn is_code_generation_file(&self, content: &str, file_meta: &FileMetadata) -> bool {
254        let content_lower = content.to_lowercase();
255
256        // Check filename patterns
257        if let Some(filename) = file_meta.path.file_name().and_then(|n| n.to_str()) {
258            let filename_lower = filename.to_lowercase();
259            let code_gen_filenames = [
260                "apicodedialog",
261                "codedialog",
262                "codeexample",
263                "apiexample",
264                "codesnippet",
265                "snippets",
266                "examples",
267                "templates",
268                "codegenerator",
269                "apitool",
270            ];
271
272            if code_gen_filenames
273                .iter()
274                .any(|&pattern| filename_lower.contains(pattern))
275            {
276                return true;
277            }
278        }
279
280        // Check content patterns - files that primarily generate code examples
281        let code_gen_content_patterns = [
282            // Function names
283            "getcode(",
284            "generatecode",
285            "getcodewithauthorization",
286            "getconfigcode",
287            "getmulticonfigcode",
288            // Template/example generation
289            "api_url =",
290            "def query(",
291            "async function query",
292            "import requests",
293            "const response = await fetch",
294            "curl ",
295            "bearer ${",
296            "authorization: \"bearer",
297            // React component patterns for code display
298            "copyblock",
299            "codeblock",
300            "react-code-blocks",
301            // High density of template literals
302        ];
303
304        let pattern_matches = code_gen_content_patterns
305            .iter()
306            .filter(|&pattern| content_lower.contains(pattern))
307            .count();
308
309        // If we have multiple code generation patterns, likely a code gen file
310        if pattern_matches >= 3 {
311            return true;
312        }
313
314        // Check for high density of template literals with API patterns
315        let template_literal_count = content.matches("${").count();
316        let api_pattern_count = content_lower.matches("api").count()
317            + content_lower.matches("bearer").count()
318            + content_lower.matches("authorization").count();
319
320        // High template literal density + API patterns = likely code generation
321        if template_literal_count > 5 && api_pattern_count > 3 {
322            return true;
323        }
324
325        false
326    }
327
328    /// Check if content is minified JavaScript or CSS
329    fn is_minified_js_css(&self, content: &str, file_meta: &FileMetadata) -> bool {
330        let has_js_css_ext = file_meta
331            .extension
332            .as_deref()
333            .map(|ext| matches!(ext, "js" | "css" | "mjs" | "cjs"))
334            .unwrap_or(false);
335
336        if !has_js_css_ext {
337            return false;
338        }
339
340        let lines: Vec<&str> = content.lines().collect();
341
342        // Minified files typically have very few lines but very long lines
343        if lines.len() < 10 {
344            let avg_line_length = content.len() / lines.len().max(1);
345            if avg_line_length > 500 {
346                return true;
347            }
348        }
349
350        // Check for typical minification patterns
351        if content.contains(";var ")
352            || content.contains(",function(")
353            || content.contains("!function(")
354            || content.contains(";!function")
355        {
356            return true;
357        }
358
359        false
360    }
361
362    /// Check if content has high percentage of base64-like data
363    fn has_high_base64_content(&self, content: &str) -> bool {
364        // Skip if content is too small
365        if content.len() < 100 {
366            return false;
367        }
368
369        let base64_chars = content
370            .chars()
371            .filter(|c| c.is_alphanumeric() || *c == '+' || *c == '/' || *c == '=')
372            .count();
373
374        let base64_ratio = base64_chars as f32 / content.len() as f32;
375
376        // High base64 ratio suggests encoded content (images, fonts, etc.)
377        // But allow JWT tokens which are smaller
378        if base64_ratio > 0.7 && content.len() > 1000 && !content.contains("eyJ") {
379            return true;
380        }
381
382        // Check for data URLs
383        if content.contains("data:image/")
384            || content.contains("data:font/")
385            || content.contains("data:application/")
386        {
387            return true;
388        }
389
390        false
391    }
392
393    /// Read file using memory mapping
394    fn read_file_mmap(&self, path: &PathBuf) -> io::Result<String> {
395        let file = File::open(path)?;
396        let mmap = unsafe { MmapOptions::new().map(&file)? };
397
398        // Validate UTF-8 using SIMD if available
399        match simdutf8::basic::from_utf8(&mmap) {
400            Ok(content) => Ok(content.to_string()),
401            Err(_) => {
402                // Fallback to lossy conversion for non-UTF8 files
403                Ok(String::from_utf8_lossy(&mmap).to_string())
404            }
405        }
406    }
407
408    /// Read file using buffered I/O
409    fn read_file_buffered(&self, path: &PathBuf) -> io::Result<String> {
410        let file = File::open(path)?;
411        let mut reader = BufReader::with_capacity(8192, file);
412        let mut content = String::new();
413        reader.read_to_string(&mut content)?;
414        Ok(content)
415    }
416
417    /// Convert pattern matches to security findings
418    fn convert_matches_to_findings(
419        &self,
420        matches: Vec<PatternMatch>,
421        file_meta: &FileMetadata,
422    ) -> Vec<SecurityFinding> {
423        matches
424            .into_iter()
425            .map(|match_| SecurityFinding {
426                id: format!(
427                    "{}-{}-{}",
428                    match_.pattern.id,
429                    file_meta.path.display(),
430                    match_.line_number
431                ),
432                title: match_.pattern.name.clone(),
433                description: self.enhance_description(&match_.pattern.description, file_meta),
434                severity: self.adjust_severity(
435                    &match_.pattern.severity,
436                    file_meta,
437                    match_.confidence,
438                ),
439                category: match_.pattern.category.clone(),
440                file_path: Some(file_meta.path.clone()),
441                line_number: Some(match_.line_number),
442                column_number: Some(match_.column_number),
443                evidence: Some(match_.evidence),
444                remediation: match_.pattern.remediation.clone(),
445                references: match_.pattern.references.clone(),
446                cwe_id: match_.pattern.cwe_id.clone(),
447                compliance_frameworks: self.get_compliance_frameworks(&match_.pattern.category),
448            })
449            .collect()
450    }
451
452    /// Enhance description with file context and proper gitignore status
453    fn enhance_description(&self, base_description: &str, file_meta: &FileMetadata) -> String {
454        let mut description = base_description.to_string();
455
456        // Add comprehensive gitignore context for status determination
457        if file_meta.is_gitignored {
458            // File is properly protected
459            if file_meta.priority_hints.is_env_file
460                || file_meta.priority_hints.is_config_file
461                || base_description.to_lowercase().contains("secret")
462                || base_description.to_lowercase().contains("key")
463                || base_description.to_lowercase().contains("token")
464            {
465                description.push_str(" (File is protected by .gitignore)");
466            } else {
467                description.push_str(" (File appears safe for version control)");
468            }
469        } else {
470            // File is NOT gitignored - determine risk level
471            if self.file_contains_secrets(file_meta) {
472                // Check if tracked by git using git command
473                if self.is_file_tracked_by_git(&file_meta.path) {
474                    description.push_str(" (File is tracked by git and may expose secrets in version history - CRITICAL RISK)");
475                } else {
476                    description
477                        .push_str(" (File is NOT in .gitignore but contains secrets - HIGH RISK)");
478                }
479            } else {
480                description.push_str(" (File appears safe for version control)");
481            }
482        }
483
484        // Add file type context
485        if file_meta.priority_hints.is_env_file {
486            description.push_str(" [Environment file]");
487        } else if file_meta.priority_hints.is_config_file {
488            description.push_str(" [Configuration file]");
489        }
490
491        description
492    }
493
494    /// Check if file likely contains secrets based on patterns
495    fn file_contains_secrets(&self, file_meta: &FileMetadata) -> bool {
496        // Check file name patterns
497        if let Some(file_name) = file_meta.path.file_name().and_then(|n| n.to_str()) {
498            let file_name_lower = file_name.to_lowercase();
499            let secret_file_patterns = [
500                ".env",
501                ".key",
502                ".pem",
503                ".p12",
504                ".pfx",
505                "id_rsa",
506                "id_dsa",
507                "id_ecdsa",
508                "id_ed25519",
509                "credentials",
510                "secrets",
511                "private",
512                "secret.json",
513                "service-account",
514                "auth.json",
515                "config.json",
516            ];
517
518            if secret_file_patterns
519                .iter()
520                .any(|pattern| file_name_lower.contains(pattern))
521            {
522                return true;
523            }
524        }
525
526        // Check if it's a priority file (likely to contain secrets)
527        file_meta.priority_hints.is_env_file
528            || file_meta.priority_hints.is_config_file
529            || file_meta.is_critical()
530    }
531
532    /// Check if file is tracked by git
533    fn is_file_tracked_by_git(&self, file_path: &std::path::PathBuf) -> bool {
534        use std::process::Command;
535
536        Command::new("git")
537            .args(["ls-files", "--error-unmatch"])
538            .arg(file_path)
539            .output()
540            .map(|output| output.status.success())
541            .unwrap_or(false)
542    }
543
544    /// Adjust severity based on context
545    fn adjust_severity(
546        &self,
547        base_severity: &SecuritySeverity,
548        file_meta: &FileMetadata,
549        confidence: f32,
550    ) -> SecuritySeverity {
551        let mut severity = base_severity.clone();
552        let filename = file_meta
553            .path
554            .file_name()
555            .and_then(|s| s.to_str())
556            .unwrap_or("");
557
558        // Downgrade severity for known public/client-side keys in specific files.
559        if (filename == "GoogleService-Info.plist" || filename.ends_with(".plist"))
560            && matches!(
561                severity,
562                SecuritySeverity::Critical | SecuritySeverity::High
563            )
564        {
565            return SecuritySeverity::Medium; // It's a client-side key, less critical.
566        }
567
568        // Upgrade severity for unprotected files
569        if !file_meta.is_gitignored
570            && matches!(severity, SecuritySeverity::Medium | SecuritySeverity::High)
571        {
572            severity = match severity {
573                SecuritySeverity::Medium => SecuritySeverity::High,
574                SecuritySeverity::High => SecuritySeverity::Critical,
575                _ => severity,
576            };
577        }
578
579        // Downgrade for low confidence
580        if confidence < 0.5
581            && matches!(
582                severity,
583                SecuritySeverity::High | SecuritySeverity::Critical
584            )
585        {
586            severity = match severity {
587                SecuritySeverity::Critical => SecuritySeverity::High,
588                SecuritySeverity::High => SecuritySeverity::Medium,
589                _ => severity,
590            };
591        }
592
593        severity
594    }
595
596    /// Get compliance frameworks based on category
597    fn get_compliance_frameworks(&self, category: &SecurityCategory) -> Vec<String> {
598        match category {
599            SecurityCategory::SecretsExposure => vec![
600                "SOC2".to_string(),
601                "GDPR".to_string(),
602                "PCI-DSS".to_string(),
603            ],
604            SecurityCategory::InsecureConfiguration => {
605                vec!["SOC2".to_string(), "OWASP".to_string()]
606            }
607            SecurityCategory::AuthenticationSecurity => {
608                vec!["SOC2".to_string(), "OWASP".to_string()]
609            }
610            SecurityCategory::DataProtection => vec!["GDPR".to_string(), "CCPA".to_string()],
611            _ => vec!["SOC2".to_string()],
612        }
613    }
614}
615
616/// Specialized scanner for .env files
617pub struct EnvFileScanner;
618
619impl EnvFileScanner {
620    /// Fast scan of .env files without regex
621    pub fn scan_env_file(path: &PathBuf) -> Result<Vec<SecurityFinding>, io::Error> {
622        let content = std::fs::read_to_string(path)?;
623        let mut findings = Vec::new();
624
625        for (line_num, line) in content.lines().enumerate() {
626            let line = line.trim();
627
628            // Skip comments and empty lines
629            if line.is_empty() || line.starts_with('#') {
630                continue;
631            }
632
633            // Parse key=value pairs
634            if let Some(eq_pos) = line.find('=') {
635                let key = &line[..eq_pos].trim();
636                let value = &line[eq_pos + 1..].trim_matches('"').trim_matches('\'');
637
638                // Check for sensitive keys with actual values
639                if is_sensitive_env_key(key) && !value.is_empty() && !is_placeholder_value(value) {
640                    findings.push(SecurityFinding {
641                        id: format!("env-secret-{}-{}", path.display(), line_num),
642                        title: format!("Sensitive Environment Variable: {}", key),
643                        description: format!(
644                            "Environment variable '{}' contains a potentially sensitive value",
645                            key
646                        ),
647                        severity: determine_env_severity(key, value),
648                        category: SecurityCategory::SecretsExposure,
649                        file_path: Some(path.clone()),
650                        line_number: Some(line_num + 1),
651                        column_number: Some(eq_pos + 1),
652                        evidence: Some(format!("{}=***", key)),
653                        remediation: vec![
654                            "Ensure .env files are in .gitignore".to_string(),
655                            "Use .env.example for documentation".to_string(),
656                            "Consider using a secure secret management service".to_string(),
657                        ],
658                        references: vec!["https://12factor.net/config".to_string()],
659                        cwe_id: Some("CWE-798".to_string()),
660                        compliance_frameworks: vec!["SOC2".to_string(), "GDPR".to_string()],
661                    });
662                }
663            }
664        }
665
666        Ok(findings)
667    }
668}
669
/// Returns `true` when an environment variable name looks like it may hold a
/// secret.
///
/// The name is uppercased and tested for any of a fixed set of fragments
/// (`PASSWORD`, `KEY`, `TOKEN`, `DB_`, ...). Matching is by substring, so
/// names such as `MONKEY` also match.
fn is_sensitive_env_key(key: &str) -> bool {
    const SENSITIVE_FRAGMENTS: &[&str] = &[
        "PASSWORD",
        "SECRET",
        "KEY",
        "TOKEN",
        "API",
        "AUTH",
        "PRIVATE",
        "CREDENTIAL",
        "ACCESS",
        "CLIENT",
        "STRIPE",
        "AWS",
        "GOOGLE",
        "AZURE",
        "DATABASE",
        "DB_",
        "JWT",
    ];

    let normalized = key.to_uppercase();
    for fragment in SENSITIVE_FRAGMENTS {
        if normalized.contains(fragment) {
            return true;
        }
    }
    false
}
697
/// Returns `true` when a value looks like a placeholder rather than a real
/// secret.
///
/// The value is lowercased and tested for any of a fixed set of markers
/// (`your_`, `change_me`, `example`, `<`, `${`, ...). Matching is by
/// substring, so any value containing e.g. `test` or `}` is treated as a
/// placeholder.
fn is_placeholder_value(value: &str) -> bool {
    const PLACEHOLDER_MARKERS: &[&str] = &[
        "your_",
        "change_me",
        "xxx",
        "placeholder",
        "example",
        "test",
        "demo",
        "fake",
        "dummy",
        "<",
        ">",
        "${",
        "}",
    ];

    let lowered = value.to_lowercase();
    let mentions = |marker: &str| lowered.contains(marker);

    PLACEHOLDER_MARKERS.iter().copied().any(mentions)
}
719
720/// Determine severity based on the type of secret
721fn determine_env_severity(key: &str, _value: &str) -> SecuritySeverity {
722    let key_upper = key.to_uppercase();
723
724    // Critical: API keys, database credentials
725    if key_upper.contains("DATABASE")
726        || key_upper.contains("DB_PASS")
727        || key_upper.contains("AWS_SECRET")
728        || key_upper.contains("STRIPE_SECRET")
729    {
730        return SecuritySeverity::Critical;
731    }
732
733    // High: Most API keys and secrets
734    if key_upper.contains("API")
735        || key_upper.contains("SECRET")
736        || key_upper.contains("PRIVATE")
737        || key_upper.contains("TOKEN")
738    {
739        return SecuritySeverity::High;
740    }
741
742    // Medium: General passwords and auth
743    if key_upper.contains("PASSWORD") || key_upper.contains("AUTH") {
744        return SecuritySeverity::Medium;
745    }
746
747    SecuritySeverity::Low
748}
749
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Only keys that look sensitive *and* carry a non-placeholder value are
    /// reported.
    #[test]
    fn test_env_file_scanner() {
        let fixture = r#"
# Database config
DATABASE_URL=postgres://user:password@localhost/db
API_KEY=sk-1234567890abcdef
PUBLIC_URL=https://example.com
TEST_VAR=placeholder_value
"#;

        let temp_dir = TempDir::new().unwrap();
        let env_file = temp_dir.path().join(".env");
        fs::write(&env_file, fixture).unwrap();

        let findings = EnvFileScanner::scan_env_file(&env_file).unwrap();
        let titles: Vec<&str> = findings.iter().map(|f| f.title.as_str()).collect();

        // Should find DATABASE_URL and API_KEY but not PUBLIC_URL or TEST_VAR
        assert_eq!(titles.len(), 2);
        for expected_key in ["DATABASE_URL", "API_KEY"] {
            assert!(titles.iter().any(|title| title.contains(expected_key)));
        }
    }

    /// Known placeholder spellings are recognised; a realistic key is not.
    #[test]
    fn test_placeholder_detection() {
        for placeholder in ["your_api_key_here", "<YOUR_TOKEN>", "xxx"] {
            assert!(is_placeholder_value(placeholder));
        }
        assert!(!is_placeholder_value("sk-1234567890"));
    }
}