syncable_cli/analyzer/security/turbo/
file_discovery.rs

1//! # File Discovery Module
2//! 
3//! Ultra-fast file discovery with git-aware filtering and smart prioritization.
4
5use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::fs;
8use std::time::SystemTime;
9
10use ahash::AHashSet;
11use rayon::prelude::*;
12use walkdir::WalkDir;
13use log::{debug, trace};
14
15use super::{ScanMode, SecurityError};
16
/// File metadata for efficient filtering.
///
/// One record is built per candidate file by `FileDiscovery::build_file_metadata`.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// Path of the file exactly as it was handed to the metadata builder.
    pub path: PathBuf,
    /// File size in bytes as reported by the filesystem.
    pub size: usize,
    /// Lower-cased extension (no leading dot); `None` when absent or not valid UTF-8.
    pub extension: Option<String>,
    /// Whether `git check-ignore` reported the path as ignored; `false` when the
    /// project root has no `.git` entry.
    pub is_gitignored: bool,
    /// Last-modification time reported by the filesystem.
    pub modified: SystemTime,
    /// Name- and path-derived hints used for filtering and scoring.
    pub priority_hints: PriorityHints,
}
27
/// Priority hints for file scoring.
///
/// All flags default to `false`; they are derived from the lower-cased file name,
/// the extension and the path, never from file contents.
#[derive(Debug, Clone, Default)]
pub struct PriorityHints {
    /// File name starts or ends with `.env`.
    pub is_env_file: bool,
    /// Extension or file name looks like configuration (json/yaml/toml/..., "config", "settings", ...).
    pub is_config_file: bool,
    /// File name or path contains a secret-like fragment (`.pem`, `credentials`, `secret`, ...).
    pub is_secret_file: bool,
    /// Extension belongs to a known programming language.
    pub is_source_file: bool,
    /// File name contains one of the secret keywords ("token", "password", ...).
    pub has_secret_keywords: bool,
}
37
/// Configuration for file discovery.
#[derive(Debug, Clone)]
pub struct DiscoveryConfig {
    /// Prefer git-based discovery when the project root contains a `.git` entry.
    pub use_git: bool,
    /// Files larger than this many bytes are skipped.
    pub max_file_size: usize,
    /// Extensions to prioritise.
    /// NOTE(review): not read by any discovery code visible in this file; presumably
    /// consumed by a caller. TODO confirm.
    pub priority_extensions: Vec<String>,
    /// Scan mode; selects the ignored-directory set and how aggressively
    /// non-critical files are filtered out.
    pub scan_mode: ScanMode,
}
46
/// High-performance file discovery.
///
/// Holds the configuration plus lookup tables that are computed once in `new`.
pub struct FileDiscovery {
    /// Caller-supplied discovery settings.
    config: DiscoveryConfig,
    /// Directory names that the filesystem walk does not descend into.
    ignored_dirs: AHashSet<String>,
    /// Substrings that mark a file name as secret-related.
    secret_keywords: Vec<&'static str>,
    /// Lower-case extensions treated as binary content.
    binary_extensions: AHashSet<&'static str>,
    /// Lower-case file names that are skipped together with binary files.
    excluded_filenames: AHashSet<&'static str>,
    /// Lower-case extensions treated as assets (images, fonts, media).
    asset_extensions: AHashSet<&'static str>,
}
56
57impl FileDiscovery {
58    pub fn new(config: DiscoveryConfig) -> Self {
59        let ignored_dirs = Self::get_ignored_dirs(&config.scan_mode);
60        let secret_keywords = Self::get_secret_keywords();
61        let binary_extensions = Self::get_binary_extensions();
62        let excluded_filenames = Self::get_excluded_filenames();
63        let asset_extensions = Self::get_asset_extensions();
64        
65        Self {
66            config,
67            ignored_dirs,
68            secret_keywords,
69            binary_extensions,
70            excluded_filenames,
71            asset_extensions,
72        }
73    }
74    
75    /// Discover files with ultra-fast git-aware filtering
76    pub fn discover_files(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
77        let is_git_repo = project_root.join(".git").exists();
78        
79        if is_git_repo && self.config.use_git {
80            self.git_aware_discovery(project_root)
81        } else {
82            self.filesystem_discovery(project_root)
83        }
84    }
85    
86    /// Git-aware file discovery (fastest method)
87    fn git_aware_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
88        debug!("Using git-aware file discovery");
89        
90        // Get all tracked files using git ls-files
91        let tracked_files = self.get_git_tracked_files(project_root)?;
92        
93        // Get untracked files that might contain secrets
94        let untracked_files = self.get_untracked_secret_files(project_root)?;
95        
96        // Combine and process in parallel
97        let all_paths: Vec<PathBuf> = tracked_files.into_iter()
98            .chain(untracked_files)
99            .collect();
100        
101        // Process files in parallel to build metadata
102        let files: Vec<FileMetadata> = all_paths
103            .par_iter()
104            .filter_map(|path| self.build_file_metadata(path, project_root).ok())
105            .filter(|meta| self.should_include_file(meta))
106            .collect();
107        
108        Ok(files)
109    }
110    
111    /// Get tracked files from git
112    fn get_git_tracked_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
113        let output = Command::new("git")
114            .args(&["ls-files", "-z"]) // -z for null-terminated output
115            .current_dir(project_root)
116            .output()
117            .map_err(|e| SecurityError::FileDiscovery(format!("Git ls-files failed: {}", e)))?;
118        
119        if !output.status.success() {
120            return Err(SecurityError::FileDiscovery("Git ls-files failed".to_string()));
121        }
122        
123        // Parse null-terminated paths
124        let paths: Vec<PathBuf> = output.stdout
125            .split(|&b| b == 0)
126            .filter(|path| !path.is_empty())
127            .filter_map(|path| std::str::from_utf8(path).ok())
128            .map(|path| project_root.join(path))
129            .collect();
130        
131        Ok(paths)
132    }
133    
134    /// Get untracked files that might contain secrets (including gitignored files)
135    fn get_untracked_secret_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
136        // Common secret file patterns that might not be tracked
137        let secret_patterns = vec![
138            ".env*",
139            "*.key",
140            "*.pem",
141            "*.p12",
142            "*credentials*",
143            "*secret*",
144            "config/*.json",
145            "config/*.yml",
146        ];
147
148        let mut untracked_files = Vec::new();
149
150        for pattern in secret_patterns {
151            // First, get untracked files that are NOT gitignored (potential accidental exposure)
152            let output = Command::new("git")
153                .args(&["ls-files", "--others", "--exclude-standard", pattern])
154                .current_dir(project_root)
155                .output();
156
157            if let Ok(output) = output {
158                if output.status.success() {
159                    let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
160                        .lines()
161                        .filter(|line| !line.is_empty())
162                        .map(|line| project_root.join(line))
163                        .collect();
164                    untracked_files.extend(paths);
165                }
166            }
167
168            // Also get gitignored files - these should be scanned to verify they exist
169            // and contain real secrets (important for security audit completeness)
170            let output = Command::new("git")
171                .args(&["ls-files", "--others", "--ignored", "--exclude-standard", pattern])
172                .current_dir(project_root)
173                .output();
174
175            if let Ok(output) = output {
176                if output.status.success() {
177                    let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
178                        .lines()
179                        .filter(|line| !line.is_empty())
180                        .map(|line| project_root.join(line))
181                        .collect();
182                    untracked_files.extend(paths);
183                }
184            }
185        }
186
187        Ok(untracked_files)
188    }
189    
190    /// Fallback filesystem discovery
191    fn filesystem_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
192        debug!("Using filesystem discovery");
193        
194        let walker = WalkDir::new(project_root)
195            .follow_links(false)
196            .max_depth(20)
197            .into_iter()
198            .filter_entry(|entry| {
199                // Skip ignored directories
200                if entry.file_type().is_dir() {
201                    let dir_name = entry.file_name().to_string_lossy();
202                    return !self.ignored_dirs.contains(dir_name.as_ref());
203                }
204                true
205            });
206        
207        let files: Vec<FileMetadata> = walker
208            .par_bridge()
209            .filter_map(|entry| entry.ok())
210            .filter(|entry| entry.file_type().is_file())
211            .filter_map(|entry| self.build_file_metadata(entry.path(), project_root).ok())
212            .filter(|meta| self.should_include_file(meta))
213            .collect();
214        
215        Ok(files)
216    }
217    
218    /// Build file metadata with priority hints
219    fn build_file_metadata(&self, path: &Path, project_root: &Path) -> Result<FileMetadata, std::io::Error> {
220        let metadata = fs::metadata(path)?;
221        let size = metadata.len() as usize;
222        let modified = metadata.modified()?;
223        
224        let extension = path.extension()
225            .and_then(|ext| ext.to_str())
226            .map(|s| s.to_lowercase());
227        
228        let file_name = path.file_name()
229            .and_then(|n| n.to_str())
230            .unwrap_or("");
231        
232        let file_name_lower = file_name.to_lowercase();
233        
234        // Check gitignore status efficiently
235        let is_gitignored = if project_root.join(".git").exists() {
236            self.check_gitignore_batch(path, project_root)
237        } else {
238            false
239        };
240        
241        // Build priority hints
242        let priority_hints = PriorityHints {
243            is_env_file: file_name_lower.starts_with(".env") || file_name_lower.ends_with(".env"),
244            is_config_file: self.is_config_file(&file_name_lower, &extension),
245            is_secret_file: self.is_secret_file(&file_name_lower, path),
246            is_source_file: self.is_source_file(&extension),
247            has_secret_keywords: self.has_secret_keywords(&file_name_lower),
248        };
249        
250        Ok(FileMetadata {
251            path: path.to_path_buf(),
252            size,
253            extension,
254            is_gitignored,
255            modified,
256            priority_hints,
257        })
258    }
259    
260    /// Batch check gitignore status
261    fn check_gitignore_batch(&self, path: &Path, project_root: &Path) -> bool {
262        // Quick check using git check-ignore
263        let output = Command::new("git")
264            .args(&["check-ignore", path.to_str().unwrap_or("")])
265            .current_dir(project_root)
266            .output();
267        
268        match output {
269            Ok(output) => output.status.success(),
270            Err(_) => false,
271        }
272    }
273    
274    /// Check if file should be included based on filters
275    fn should_include_file(&self, meta: &FileMetadata) -> bool {
276        // Size filter
277        if meta.size > self.config.max_file_size {
278            trace!("Skipping large file: {} ({} bytes)", meta.path.display(), meta.size);
279            return false;
280        }
281        
282        // Enhanced binary file detection
283        if self.is_binary_file(meta) {
284            trace!("Skipping binary file: {}", meta.path.display());
285            return false;
286        }
287        
288        // Asset file detection (images, fonts, media)
289        if self.is_asset_file(meta) {
290            trace!("Skipping asset file: {}", meta.path.display());
291            return false;
292        }
293        
294        // Exclude files that are unlikely to contain real secrets
295        if self.should_exclude_from_security_scan(meta) {
296            trace!("Excluding from security scan: {}", meta.path.display());
297            return false;
298        }
299        
300        // Critical files always included
301        if meta.is_critical() {
302            return true;
303        }
304        
305        // Scan mode specific filtering
306        match self.config.scan_mode {
307            ScanMode::Lightning => {
308                // Only critical files (already handled above)
309                false
310            }
311            ScanMode::Fast => {
312                // Priority files or small source files
313                meta.is_priority() || (meta.priority_hints.is_source_file && meta.size < 50_000)
314            }
315            _ => true, // Include all for other modes
316        }
317    }
318    
319    /// Enhanced binary file detection
320    fn is_binary_file(&self, meta: &FileMetadata) -> bool {
321        if let Some(ext) = &meta.extension {
322            if self.binary_extensions.contains(ext.as_str()) {
323                return true;
324            }
325        }
326        
327        // Check filename patterns
328        let filename = meta.path.file_name()
329            .and_then(|n| n.to_str())
330            .unwrap_or("")
331            .to_lowercase();
332        
333        if self.excluded_filenames.contains(filename.as_str()) {
334            return true;
335        }
336        
337        false
338    }
339    
340    /// Check if file is an asset (images, fonts, media)
341    fn is_asset_file(&self, meta: &FileMetadata) -> bool {
342        if let Some(ext) = &meta.extension {
343            if self.asset_extensions.contains(ext.as_str()) {
344                return true;
345            }
346        }
347        
348        // Check for asset directories
349        let path_str = meta.path.to_string_lossy().to_lowercase();
350        let asset_dirs = [
351            "/assets/", "/static/", "/public/", "/images/", "/img/", 
352            "/media/", "/fonts/", "/icons/", "/graphics/", "/pictures/"
353        ];
354        
355        asset_dirs.iter().any(|&dir| path_str.contains(dir))
356    }
357    
358    /// Check if file should be excluded from security scanning
359    fn should_exclude_from_security_scan(&self, meta: &FileMetadata) -> bool {
360        let path_str = meta.path.to_string_lossy().to_lowercase();
361        
362        // DEPENDENCY LOCK FILES - These contain package hashes/metadata, not secrets
363        if self.is_dependency_lock_file(meta) {
364            return true;
365        }
366        
367        // SVG files often contain base64 encoded graphics that trigger false positives
368        if meta.extension.as_deref() == Some("svg") {
369            return true;
370        }
371        
372        // Minified and bundled files
373        if self.is_minified_or_bundled_file(meta) {
374            return true;
375        }
376        
377        // Documentation and non-code files that rarely contain real secrets
378        let exclude_patterns = [
379            ".md", ".txt", ".rst", ".adoc", ".asciidoc",
380            "readme", "changelog", "license", "todo",
381            "roadmap", "contributing", "authors",
382            // Test files (often contain fake/example data)
383            "/test/", "/tests/", "/spec/", "/specs/",
384            "__test__", "__spec__", ".test.", ".spec.",
385            "_test.", "_spec.", "fixtures", "mocks", "examples",
386            // Documentation directories
387            "/docs/", "/doc/", "/documentation/",
388            // Framework/library detection files (they contain patterns but not secrets)
389            "frameworks/", "detector", "rules", "patterns",
390            // Build artifacts and generated files
391            "target/", "build/", "dist/", ".next/", "coverage/",
392            ".nuxt/", ".output/", ".vercel/", ".netlify/",
393            // IDE and editor files
394            ".vscode/", ".idea/", ".vs/", "*.swp", "*.swo",
395            // OS files
396            ".ds_store", "thumbs.db", "desktop.ini",
397        ];
398        
399        // Check patterns
400        if exclude_patterns.iter().any(|&pattern| path_str.contains(pattern)) {
401            return true;
402        }
403        
404        // Documentation file extensions
405        if let Some(ext) = &meta.extension {
406            let doc_extensions = ["md", "txt", "rst", "adoc", "asciidoc", "rtf"];
407            if doc_extensions.contains(&ext.as_str()) {
408                return true;
409            }
410        }
411        
412        // Check if filename suggests it's documentation, examples, or code generation
413        let filename = meta.path.file_name()
414            .and_then(|n| n.to_str())
415            .unwrap_or("")
416            .to_lowercase();
417        
418        let doc_filenames = [
419            "readme", "changelog", "license", "authors", "contributing",
420            "roadmap", "todo", "examples", "demo", "sample", "fixture",
421            // Code generation and API example files
422            "apicodedialog", "codedialog", "codeexample", "apiexample",
423            "codesnippet", "snippets", "templates", "codegenerator",
424            "apitool", "playground", "sandbox",
425        ];
426        
427        if doc_filenames.iter().any(|&name| filename.contains(name)) {
428            return true;
429        }
430        
431        false
432    }
433    
434    /// Check if file is minified or bundled
435    fn is_minified_or_bundled_file(&self, meta: &FileMetadata) -> bool {
436        let filename = meta.path.file_name()
437            .and_then(|n| n.to_str())
438            .unwrap_or("")
439            .to_lowercase();
440        
441        // Minified file patterns
442        let minified_patterns = [
443            ".min.", ".bundle.", ".chunk.", ".vendor.",
444            "-min.", "-bundle.", "-chunk.", "-vendor.",
445            "_min.", "_bundle.", "_chunk.", "_vendor.",
446        ];
447        
448        minified_patterns.iter().any(|&pattern| filename.contains(pattern))
449    }
450    
451    /// Get ignored directories based on scan mode
452    fn get_ignored_dirs(scan_mode: &ScanMode) -> AHashSet<String> {
453        let mut dirs = AHashSet::new();
454        
455        // Always ignore these
456        let always_ignore = vec![
457            ".git", "node_modules", "target", "build", "dist", ".next",
458            "coverage", "__pycache__", ".pytest_cache", ".mypy_cache",
459            "vendor", "packages", ".bundle", "bower_components",
460            ".nuxt", ".output", ".vercel", ".netlify", ".vscode", ".idea",
461            ".venv", "venv", // Python virtual environments
462        ];
463        
464        for dir in always_ignore {
465            dirs.insert(dir.to_string());
466        }
467        
468        // Additional ignores for faster modes
469        if matches!(scan_mode, ScanMode::Lightning | ScanMode::Fast) {
470            let fast_ignore = vec!["test", "tests", "spec", "specs", "docs", "documentation"];
471            for dir in fast_ignore {
472                dirs.insert(dir.to_string());
473            }
474        }
475        
476        dirs
477    }
478    
479    /// Get comprehensive binary file extensions
480    fn get_binary_extensions() -> AHashSet<&'static str> {
481        let mut extensions = AHashSet::new();
482        
483        // Executables and libraries
484        let binary_exts = [
485            "exe", "dll", "so", "dylib", "lib", "a", "o", "obj",
486            "bin", "com", "scr", "msi", "deb", "rpm", "pkg",
487            // Archives
488            "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "ace",
489            "cab", "dmg", "iso", "img",
490            // Media files
491            "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
492            "wav", "flac", "ogg", "aac", "m4a", "wma",
493            // Images (will be handled separately as assets)
494            "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
495            "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef",
496            // Fonts
497            "ttf", "otf", "woff", "woff2", "eot",
498            // Documents
499            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
500            "odt", "ods", "odp", "rtf",
501            // Databases
502            "db", "sqlite", "sqlite3", "mdb", "accdb", "wt",
503            // Other binary formats
504            "pyc", "pyo", "class", "jar", "war", "ear", "cer", "jks",
505        ];
506        
507        for ext in binary_exts {
508            extensions.insert(ext);
509        }
510        
511        extensions
512    }
513    
514    /// Get asset file extensions (images, media, fonts)
515    fn get_asset_extensions() -> AHashSet<&'static str> {
516        let mut extensions = AHashSet::new();
517        
518        let asset_exts = [
519            // Images
520            "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
521            "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef", "svg",
522            // Fonts
523            "ttf", "otf", "woff", "woff2", "eot",
524            // Media
525            "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
526            "wav", "flac", "ogg", "aac", "m4a", "wma",
527        ];
528        
529        for ext in asset_exts {
530            extensions.insert(ext);
531        }
532        
533        extensions
534    }
535    
536    /// Get filenames that should be excluded
537    fn get_excluded_filenames() -> AHashSet<&'static str> {
538        let mut filenames = AHashSet::new();
539        
540        let excluded = [
541            // OS files
542            ".ds_store", "thumbs.db", "desktop.ini", "folder.ico",
543            // Editor files
544            ".gitkeep", ".keep", ".placeholder",
545            // Temporary files
546            ".tmp", ".temp", ".swp", ".swo", ".bak", ".backup",
547        ];
548        
549        for filename in excluded {
550            filenames.insert(filename);
551        }
552        
553        filenames
554    }
555    
556    /// Get secret keywords for detection
557    fn get_secret_keywords() -> Vec<&'static str> {
558        vec![
559            "secret", "key", "token", "password", "credential",
560            "auth", "api", "private", "access", "bearer",
561        ]
562    }
563    
564    fn is_config_file(&self, name: &str, extension: &Option<String>) -> bool {
565        let config_extensions = ["json", "yml", "yaml", "toml", "ini", "conf", "config", "xml"];
566        let config_names = ["config", "settings", "configuration", ".env"];
567        
568        if let Some(ext) = extension {
569            if config_extensions.contains(&ext.as_str()) {
570                return true;
571            }
572        }
573        
574        config_names.iter().any(|&n| name.contains(n))
575    }
576    
577    fn is_secret_file(&self, name: &str, path: &Path) -> bool {
578        let secret_patterns = [
579            ".env", ".key", ".pem", ".p12", ".pfx",
580            "credentials", "secret", "private", "cert",
581        ];
582        
583        // Check filename
584        if secret_patterns.iter().any(|&p| name.contains(p)) {
585            return true;
586        }
587        
588        // Check path components
589        let path_str = path.to_string_lossy().to_lowercase();
590        secret_patterns.iter().any(|&p| path_str.contains(p))
591    }
592    
593    fn is_source_file(&self, extension: &Option<String>) -> bool {
594        if let Some(ext) = extension {
595            let source_extensions = [
596                "js", "jsx", "ts", "tsx", "py", "java", "kt", "go",
597                "rs", "rb", "php", "cs", "cpp", "c", "h", "swift",
598                "scala", "clj", "ex", "exs",
599            ];
600            source_extensions.contains(&ext.as_str())
601        } else {
602            false
603        }
604    }
605    
606    fn has_secret_keywords(&self, name: &str) -> bool {
607        self.secret_keywords.iter().any(|&keyword| name.contains(keyword))
608    }
609    
610    /// Enhanced dependency lock file detection
611    fn is_dependency_lock_file(&self, meta: &FileMetadata) -> bool {
612        let filename = meta.path.file_name()
613            .and_then(|n| n.to_str())
614            .unwrap_or("")
615            .to_lowercase();
616        
617        // Common dependency lock files that contain package hashes and metadata
618        let lock_files = [
619            // JavaScript/Node.js
620            "package-lock.json",
621            "yarn.lock", 
622            "pnpm-lock.yaml",
623            "bun.lockb",  // Bun lock file (binary format)
624            // Python
625            "poetry.lock",
626            "pipfile.lock",
627            "pip-lock.txt",
628            "pdm.lock",
629            // Rust
630            "cargo.lock",
631            // Go
632            "go.sum",
633            "go.mod",
634            // Java
635            "gradle.lockfile",
636            "maven-dependency-plugin.log",
637            // Ruby
638            "gemfile.lock",
639            // PHP
640            "composer.lock",
641            // .NET
642            "packages.lock.json",
643            "paket.lock",
644            // Others
645            "mix.lock",  // Elixir
646            "pubspec.lock",  // Dart
647            "swift.resolved", // Swift
648            "flake.lock", // Nix
649        ];
650        
651        // Check if filename matches any lock file pattern
652        lock_files.iter().any(|&pattern| filename == pattern) ||
653        // Also check for common lock file patterns
654        filename.ends_with(".lock") ||
655        filename.ends_with("-lock.json") ||
656        filename.ends_with("-lock.yaml") ||
657        filename.ends_with("-lock.yml") ||
658        filename.ends_with(".lockb") ||  // Binary lock files
659        filename.contains("shrinkwrap") ||
660        filename.contains("lockfile")
661    }
662}
663
664impl FileMetadata {
665    /// Check if file is critical (must scan)
666    pub fn is_critical(&self) -> bool {
667        self.priority_hints.is_env_file || 
668        self.priority_hints.is_secret_file ||
669        self.extension.as_deref() == Some("pem") ||
670        self.extension.as_deref() == Some("key")
671    }
672    
673    /// Check if file is high priority
674    pub fn is_priority(&self) -> bool {
675        self.is_critical() ||
676        self.priority_hints.is_config_file ||
677        self.priority_hints.has_secret_keywords
678    }
679    
680    /// Calculate priority score (higher = more important)
681    pub fn priority_score(&self) -> u32 {
682        let mut score: u32 = 0;
683        
684        if self.priority_hints.is_env_file { score += 1000; }
685        if self.priority_hints.is_secret_file { score += 900; }
686        if self.priority_hints.is_config_file { score += 500; }
687        if self.priority_hints.has_secret_keywords { score += 300; }
688        if !self.is_gitignored { score += 200; }
689        if self.priority_hints.is_source_file { score += 100; }
690        
691        // Penalize large files
692        if self.size > 1_000_000 { score = score.saturating_sub(100); }
693        
694        score
695    }
696}
697
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Fast-mode configuration without git and with a 1 MiB size limit.
    fn fast_config(priority_extensions: Vec<String>) -> DiscoveryConfig {
        DiscoveryConfig {
            use_git: false,
            max_file_size: 1024 * 1024,
            priority_extensions,
            scan_mode: ScanMode::Fast,
        }
    }

    /// Metadata for a 100-byte, non-ignored file that is never touched on disk.
    fn small_file(name: &str, extension: Option<&str>, priority_hints: PriorityHints) -> FileMetadata {
        FileMetadata {
            path: PathBuf::from(name),
            size: 100,
            extension: extension.map(|ext| ext.to_string()),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints,
        }
    }

    #[test]
    fn test_file_priority_scoring() {
        let hints = PriorityHints {
            is_env_file: true,
            is_config_file: true,
            is_secret_file: true,
            is_source_file: false,
            has_secret_keywords: true,
        };
        let meta = small_file(".env", Some("env"), hints);

        assert!(meta.is_critical());
        assert!(meta.is_priority());
        assert!(meta.priority_score() > 2000);
    }

    #[test]
    fn test_file_discovery() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();
        fs::write(root.join(".env"), "SECRET=123").unwrap();
        fs::write(root.join("config.json"), "{}").unwrap();
        fs::create_dir(root.join("node_modules")).unwrap();
        fs::write(root.join("node_modules/test.js"), "code").unwrap();

        let discovery = FileDiscovery::new(fast_config(vec!["env".to_string()]));
        let files = discovery.discover_files(root).unwrap();

        // Should find .env and config.json but not node_modules/test.js
        assert_eq!(files.len(), 2);
        for expected in [".env", "config.json"] {
            assert!(files.iter().any(|f| f.path.ends_with(expected)));
        }
    }

    #[test]
    fn test_binary_file_detection() {
        let discovery = FileDiscovery::new(fast_config(vec![]));
        let binary_meta = small_file("test.jpg", Some("jpg"), PriorityHints::default());

        assert!(discovery.is_binary_file(&binary_meta));
    }

    #[test]
    fn test_lock_file_detection() {
        let discovery = FileDiscovery::new(fast_config(vec![]));

        let lock_files = [
            "package-lock.json",
            "yarn.lock",
            "pnpm-lock.yaml",
            "bun.lockb",
            "cargo.lock",
            "go.sum",
        ];

        for lock_file in lock_files {
            let meta = small_file(lock_file, None, PriorityHints::default());

            assert!(discovery.is_dependency_lock_file(&meta), "Failed to detect {}", lock_file);
        }
    }
}