// syncable_cli/analyzer/security/turbo/file_discovery.rs

//! # File Discovery Module
//!
//! Ultra-fast file discovery with git-aware filtering and smart prioritization.

use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::SystemTime;

use ahash::AHashSet;
use log::{debug, trace};
use rayon::prelude::*;
use walkdir::WalkDir;

use super::{ScanMode, SecurityError};

17/// File metadata for efficient filtering
18#[derive(Debug, Clone)]
19pub struct FileMetadata {
20    pub path: PathBuf,
21    pub size: usize,
22    pub extension: Option<String>,
23    pub is_gitignored: bool,
24    pub modified: SystemTime,
25    pub priority_hints: PriorityHints,
26}
27
/// Priority hints for file scoring
///
/// All flags are derived from the file's name, extension and path; file
/// contents are never read to compute them. `Default` yields all-`false`.
#[derive(Debug, Clone, Default)]
pub struct PriorityHints {
    /// File name starts or ends with `.env`.
    pub is_env_file: bool,
    /// Extension or file name looks like a configuration file.
    pub is_config_file: bool,
    /// File name or path contains a secret-material pattern (keys, certs, ...).
    pub is_secret_file: bool,
    /// Extension belongs to a known programming language.
    pub is_source_file: bool,
    /// File name contains one of the secret-related keywords.
    pub has_secret_keywords: bool,
}

38/// Configuration for file discovery
39#[derive(Debug, Clone)]
40pub struct DiscoveryConfig {
41    pub use_git: bool,
42    pub max_file_size: usize,
43    pub priority_extensions: Vec<String>,
44    pub scan_mode: ScanMode,
45}
46
47/// High-performance file discovery
48pub struct FileDiscovery {
49    config: DiscoveryConfig,
50    ignored_dirs: AHashSet<String>,
51    secret_keywords: Vec<&'static str>,
52}
53
54impl FileDiscovery {
55    pub fn new(config: DiscoveryConfig) -> Self {
56        let ignored_dirs = Self::get_ignored_dirs(&config.scan_mode);
57        let secret_keywords = Self::get_secret_keywords();
58        
59        Self {
60            config,
61            ignored_dirs,
62            secret_keywords,
63        }
64    }
65    
66    /// Discover files with ultra-fast git-aware filtering
67    pub fn discover_files(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
68        let is_git_repo = project_root.join(".git").exists();
69        
70        if is_git_repo && self.config.use_git {
71            self.git_aware_discovery(project_root)
72        } else {
73            self.filesystem_discovery(project_root)
74        }
75    }
76    
77    /// Git-aware file discovery (fastest method)
78    fn git_aware_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
79        debug!("Using git-aware file discovery");
80        
81        // Get all tracked files using git ls-files
82        let tracked_files = self.get_git_tracked_files(project_root)?;
83        
84        // Get untracked files that might contain secrets
85        let untracked_files = self.get_untracked_secret_files(project_root)?;
86        
87        // Combine and process in parallel
88        let all_paths: Vec<PathBuf> = tracked_files.into_iter()
89            .chain(untracked_files)
90            .collect();
91        
92        // Process files in parallel to build metadata
93        let files: Vec<FileMetadata> = all_paths
94            .par_iter()
95            .filter_map(|path| self.build_file_metadata(path, project_root).ok())
96            .filter(|meta| self.should_include_file(meta))
97            .collect();
98        
99        Ok(files)
100    }
101    
102    /// Get tracked files from git
103    fn get_git_tracked_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
104        let output = Command::new("git")
105            .args(&["ls-files", "-z"]) // -z for null-terminated output
106            .current_dir(project_root)
107            .output()
108            .map_err(|e| SecurityError::FileDiscovery(format!("Git ls-files failed: {}", e)))?;
109        
110        if !output.status.success() {
111            return Err(SecurityError::FileDiscovery("Git ls-files failed".to_string()));
112        }
113        
114        // Parse null-terminated paths
115        let paths: Vec<PathBuf> = output.stdout
116            .split(|&b| b == 0)
117            .filter(|path| !path.is_empty())
118            .filter_map(|path| std::str::from_utf8(path).ok())
119            .map(|path| project_root.join(path))
120            .collect();
121        
122        Ok(paths)
123    }
124    
125    /// Get untracked files that might contain secrets
126    fn get_untracked_secret_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
127        // Common secret file patterns that might not be tracked
128        let secret_patterns = vec![
129            ".env*",
130            "*.key",
131            "*.pem",
132            "*.p12",
133            "*credentials*",
134            "*secret*",
135            "config/*.json",
136            "config/*.yml",
137        ];
138        
139        let mut untracked_files = Vec::new();
140        
141        for pattern in secret_patterns {
142            let output = Command::new("git")
143                .args(&["ls-files", "--others", "--exclude-standard", pattern])
144                .current_dir(project_root)
145                .output();
146            
147            if let Ok(output) = output {
148                if output.status.success() {
149                    let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
150                        .lines()
151                        .map(|line| project_root.join(line))
152                        .collect();
153                    untracked_files.extend(paths);
154                }
155            }
156        }
157        
158        Ok(untracked_files)
159    }
160    
161    /// Fallback filesystem discovery
162    fn filesystem_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
163        debug!("Using filesystem discovery");
164        
165        let walker = WalkDir::new(project_root)
166            .follow_links(false)
167            .max_depth(20)
168            .into_iter()
169            .filter_entry(|entry| {
170                // Skip ignored directories
171                if entry.file_type().is_dir() {
172                    let dir_name = entry.file_name().to_string_lossy();
173                    return !self.ignored_dirs.contains(dir_name.as_ref());
174                }
175                true
176            });
177        
178        let files: Vec<FileMetadata> = walker
179            .par_bridge()
180            .filter_map(|entry| entry.ok())
181            .filter(|entry| entry.file_type().is_file())
182            .filter_map(|entry| self.build_file_metadata(entry.path(), project_root).ok())
183            .filter(|meta| self.should_include_file(meta))
184            .collect();
185        
186        Ok(files)
187    }
188    
189    /// Build file metadata with priority hints
190    fn build_file_metadata(&self, path: &Path, project_root: &Path) -> Result<FileMetadata, std::io::Error> {
191        let metadata = fs::metadata(path)?;
192        let size = metadata.len() as usize;
193        let modified = metadata.modified()?;
194        
195        let extension = path.extension()
196            .and_then(|ext| ext.to_str())
197            .map(|s| s.to_lowercase());
198        
199        let file_name = path.file_name()
200            .and_then(|n| n.to_str())
201            .unwrap_or("");
202        
203        let file_name_lower = file_name.to_lowercase();
204        
205        // Check gitignore status efficiently
206        let is_gitignored = if project_root.join(".git").exists() {
207            self.check_gitignore_batch(path, project_root)
208        } else {
209            false
210        };
211        
212        // Build priority hints
213        let priority_hints = PriorityHints {
214            is_env_file: file_name_lower.starts_with(".env") || file_name_lower.ends_with(".env"),
215            is_config_file: self.is_config_file(&file_name_lower, &extension),
216            is_secret_file: self.is_secret_file(&file_name_lower, path),
217            is_source_file: self.is_source_file(&extension),
218            has_secret_keywords: self.has_secret_keywords(&file_name_lower),
219        };
220        
221        Ok(FileMetadata {
222            path: path.to_path_buf(),
223            size,
224            extension,
225            is_gitignored,
226            modified,
227            priority_hints,
228        })
229    }
230    
231    /// Batch check gitignore status
232    fn check_gitignore_batch(&self, path: &Path, project_root: &Path) -> bool {
233        // Quick check using git check-ignore
234        let output = Command::new("git")
235            .args(&["check-ignore", path.to_str().unwrap_or("")])
236            .current_dir(project_root)
237            .output();
238        
239        match output {
240            Ok(output) => output.status.success(),
241            Err(_) => false,
242        }
243    }
244    
245    /// Check if file should be included based on filters
246    fn should_include_file(&self, meta: &FileMetadata) -> bool {
247        // Size filter
248        if meta.size > self.config.max_file_size {
249            trace!("Skipping large file: {} ({} bytes)", meta.path.display(), meta.size);
250            return false;
251        }
252        
253        // Binary file detection (simple heuristic)
254        if let Some(ext) = &meta.extension {
255            let binary_extensions = ["exe", "dll", "so", "dylib", "jpg", "png", "gif", "mp4", "zip", "tar", "gz"];
256            if binary_extensions.contains(&ext.as_str()) {
257                return false;
258            }
259        }
260        
261        // Exclude files that are unlikely to contain real secrets
262        if self.should_exclude_from_security_scan(meta) {
263            trace!("Excluding from security scan: {}", meta.path.display());
264            return false;
265        }
266        
267        // Critical files always included
268        if meta.is_critical() {
269            return true;
270        }
271        
272        // Scan mode specific filtering
273        match self.config.scan_mode {
274            ScanMode::Lightning => {
275                // Only critical files (already handled above)
276                false
277            }
278            ScanMode::Fast => {
279                // Priority files or small source files
280                meta.is_priority() || (meta.priority_hints.is_source_file && meta.size < 50_000)
281            }
282            _ => true, // Include all for other modes
283        }
284    }
285    
286    /// Check if file should be excluded from security scanning
287    fn should_exclude_from_security_scan(&self, meta: &FileMetadata) -> bool {
288        let path_str = meta.path.to_string_lossy().to_lowercase();
289        
290        // DEPENDENCY LOCK FILES - These contain package hashes/metadata, not secrets
291        if self.is_dependency_lock_file(meta) {
292            return true;
293        }
294        
295        // Documentation and non-code files that rarely contain real secrets
296        let exclude_patterns = [
297            ".md", ".txt", ".rst", ".adoc", ".asciidoc",
298            "readme", "changelog", "license", "todo",
299            "roadmap", "contributing", "authors",
300            // Test files (often contain fake/example data)
301            "/test/", "/tests/", "/spec/", "/specs/",
302            "__test__", "__spec__", ".test.", ".spec.",
303            "_test.", "_spec.", "fixtures", "mocks", "examples",
304            // Documentation directories
305            "/docs/", "/doc/", "/documentation/",
306            // Framework/library detection files (they contain patterns but not secrets)
307            "frameworks/", "detector", "rules", "patterns",
308            // Build artifacts
309            "target/", "build/", "dist/", ".next/", "coverage/",
310        ];
311        
312        // Check patterns
313        if exclude_patterns.iter().any(|&pattern| path_str.contains(pattern)) {
314            return true;
315        }
316        
317        // Documentation file extensions
318        if let Some(ext) = &meta.extension {
319            let doc_extensions = ["md", "txt", "rst", "adoc", "asciidoc"];
320            if doc_extensions.contains(&ext.as_str()) {
321                return true;
322            }
323        }
324        
325        // Check if filename suggests it's documentation or examples
326        let filename = meta.path.file_name()
327            .and_then(|n| n.to_str())
328            .unwrap_or("")
329            .to_lowercase();
330        
331        let doc_filenames = [
332            "readme", "changelog", "license", "authors", "contributing",
333            "roadmap", "todo", "examples", "demo", "sample",
334        ];
335        
336        if doc_filenames.iter().any(|&name| filename.contains(name)) {
337            return true;
338        }
339        
340        false
341    }
342    
343    /// Get ignored directories based on scan mode
344    fn get_ignored_dirs(scan_mode: &ScanMode) -> AHashSet<String> {
345        let mut dirs = AHashSet::new();
346        
347        // Always ignore these
348        let always_ignore = vec![
349            ".git", "node_modules", "target", "build", "dist", ".next",
350            "coverage", "__pycache__", ".pytest_cache", ".mypy_cache",
351            "vendor", "packages", ".bundle", "bower_components",
352        ];
353        
354        for dir in always_ignore {
355            dirs.insert(dir.to_string());
356        }
357        
358        // Additional ignores for faster modes
359        if matches!(scan_mode, ScanMode::Lightning | ScanMode::Fast) {
360            let fast_ignore = vec!["test", "tests", "spec", "specs", "docs", "documentation"];
361            for dir in fast_ignore {
362                dirs.insert(dir.to_string());
363            }
364        }
365        
366        dirs
367    }
368    
369    /// Get secret keywords for detection
370    fn get_secret_keywords() -> Vec<&'static str> {
371        vec![
372            "secret", "key", "token", "password", "credential",
373            "auth", "api", "private", "access", "bearer",
374        ]
375    }
376    
377    fn is_config_file(&self, name: &str, extension: &Option<String>) -> bool {
378        let config_extensions = ["json", "yml", "yaml", "toml", "ini", "conf", "config", "xml"];
379        let config_names = ["config", "settings", "configuration", ".env"];
380        
381        if let Some(ext) = extension {
382            if config_extensions.contains(&ext.as_str()) {
383                return true;
384            }
385        }
386        
387        config_names.iter().any(|&n| name.contains(n))
388    }
389    
390    fn is_secret_file(&self, name: &str, path: &Path) -> bool {
391        let secret_patterns = [
392            ".env", ".key", ".pem", ".p12", ".pfx",
393            "credentials", "secret", "private", "cert",
394        ];
395        
396        // Check filename
397        if secret_patterns.iter().any(|&p| name.contains(p)) {
398            return true;
399        }
400        
401        // Check path components
402        let path_str = path.to_string_lossy().to_lowercase();
403        secret_patterns.iter().any(|&p| path_str.contains(p))
404    }
405    
406    fn is_source_file(&self, extension: &Option<String>) -> bool {
407        if let Some(ext) = extension {
408            let source_extensions = [
409                "js", "jsx", "ts", "tsx", "py", "java", "kt", "go",
410                "rs", "rb", "php", "cs", "cpp", "c", "h", "swift",
411                "scala", "clj", "ex", "exs",
412            ];
413            source_extensions.contains(&ext.as_str())
414        } else {
415            false
416        }
417    }
418    
419    fn has_secret_keywords(&self, name: &str) -> bool {
420        self.secret_keywords.iter().any(|&keyword| name.contains(keyword))
421    }
422    
423    /// Check if file is a dependency lock file (contains hashes/metadata, not secrets)
424    fn is_dependency_lock_file(&self, meta: &FileMetadata) -> bool {
425        let filename = meta.path.file_name()
426            .and_then(|n| n.to_str())
427            .unwrap_or("")
428            .to_lowercase();
429        
430        // Common dependency lock files that contain package hashes and metadata
431        let lock_files = [
432            // JavaScript/Node.js
433            "package-lock.json",
434            "yarn.lock", 
435            "pnpm-lock.yaml",  // <-- This was missing!
436            "shrinkwrap.yaml",
437            "npm-shrinkwrap.json",
438            // Python
439            "poetry.lock",
440            "pipfile.lock",
441            "pip-lock.txt",
442            // Rust
443            "cargo.lock",
444            // Go
445            "go.sum",
446            "go.mod",
447            // Java
448            "gradle.lockfile",
449            "maven-dependency-plugin.log",
450            // Ruby
451            "gemfile.lock",
452            // PHP
453            "composer.lock",
454            // .NET
455            "packages.lock.json",
456            "paket.lock",
457            // Others
458            "mix.lock",  // Elixir
459            "pubspec.lock",  // Dart
460        ];
461        
462        // Check if filename matches any lock file pattern
463        lock_files.iter().any(|&pattern| filename == pattern) ||
464        // Also check for common lock file patterns
465        filename.ends_with(".lock") ||
466        filename.ends_with("-lock.json") ||
467        filename.ends_with("-lock.yaml") ||
468        filename.ends_with("-lock.yml") ||
469        filename.contains("shrinkwrap") ||
470        filename.contains("lockfile")
471    }
472}
473
474impl FileMetadata {
475    /// Check if file is critical (must scan)
476    pub fn is_critical(&self) -> bool {
477        self.priority_hints.is_env_file || 
478        self.priority_hints.is_secret_file ||
479        self.extension.as_deref() == Some("pem") ||
480        self.extension.as_deref() == Some("key")
481    }
482    
483    /// Check if file is high priority
484    pub fn is_priority(&self) -> bool {
485        self.is_critical() ||
486        self.priority_hints.is_config_file ||
487        self.priority_hints.has_secret_keywords
488    }
489    
490    /// Calculate priority score (higher = more important)
491    pub fn priority_score(&self) -> u32 {
492        let mut score: u32 = 0;
493        
494        if self.priority_hints.is_env_file { score += 1000; }
495        if self.priority_hints.is_secret_file { score += 900; }
496        if self.priority_hints.is_config_file { score += 500; }
497        if self.priority_hints.has_secret_keywords { score += 300; }
498        if !self.is_gitignored { score += 200; }
499        if self.priority_hints.is_source_file { score += 100; }
500        
501        // Penalize large files
502        if self.size > 1_000_000 { score = score.saturating_sub(100); }
503        
504        score
505    }
506}
507
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A file carrying every hint except `is_source_file` is critical, high
    /// priority, and scores well above 2000 (1000 + 900 + 500 + 300 + 200).
    #[test]
    fn test_file_priority_scoring() {
        let meta = FileMetadata {
            path: PathBuf::from(".env"),
            size: 100,
            extension: Some("env".to_string()),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints: PriorityHints {
                is_env_file: true,
                is_config_file: true,
                is_secret_file: true,
                is_source_file: false,
                has_secret_keywords: true,
            },
        };

        assert!(meta.is_critical());
        assert!(meta.is_priority());
        assert!(meta.priority_score() > 2000);
    }

    /// Filesystem discovery (git disabled) in Fast mode keeps `.env` and
    /// `config.json` and prunes everything under `node_modules`.
    #[test]
    fn test_file_discovery() {
        let temp_dir = TempDir::new().unwrap();
        fs::write(temp_dir.path().join(".env"), "SECRET=123").unwrap();
        fs::write(temp_dir.path().join("config.json"), "{}").unwrap();
        fs::create_dir(temp_dir.path().join("node_modules")).unwrap();
        fs::write(temp_dir.path().join("node_modules/test.js"), "code").unwrap();

        let config = DiscoveryConfig {
            use_git: false,
            max_file_size: 1024 * 1024,
            priority_extensions: vec!["env".to_string()],
            scan_mode: ScanMode::Fast,
        };

        let discovery = FileDiscovery::new(config);
        let files = discovery.discover_files(temp_dir.path()).unwrap();

        // Should find .env and config.json but not node_modules/test.js
        assert_eq!(files.len(), 2);
        assert!(files.iter().any(|f| f.path.ends_with(".env")));
        assert!(files.iter().any(|f| f.path.ends_with("config.json")));
    }
}
558}