1use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::fs;
8use std::time::SystemTime;
9
10use ahash::AHashSet;
11use rayon::prelude::*;
12use walkdir::WalkDir;
13use log::{debug, trace};
14
15use super::{ScanMode, SecurityError};
16
17#[derive(Debug, Clone)]
19pub struct FileMetadata {
20 pub path: PathBuf,
21 pub size: usize,
22 pub extension: Option<String>,
23 pub is_gitignored: bool,
24 pub modified: SystemTime,
25 pub priority_hints: PriorityHints,
26}
27
/// Cheap, name-derived signals about how interesting a file is to scan.
///
/// All flags default to `false`.
#[derive(Debug, Clone, Default)]
pub struct PriorityHints {
    /// The file name starts or ends with `.env`.
    pub is_env_file: bool,
    /// The extension or name looks like a configuration file.
    pub is_config_file: bool,
    /// The name or path contains a marker typical for key/credential files.
    pub is_secret_file: bool,
    /// The extension belongs to a recognised programming language.
    pub is_source_file: bool,
    /// The file name contains one of the configured secret keywords.
    pub has_secret_keywords: bool,
}
37
38#[derive(Debug, Clone)]
40pub struct DiscoveryConfig {
41 pub use_git: bool,
42 pub max_file_size: usize,
43 pub priority_extensions: Vec<String>,
44 pub scan_mode: ScanMode,
45}
46
47pub struct FileDiscovery {
49 config: DiscoveryConfig,
50 ignored_dirs: AHashSet<String>,
51 secret_keywords: Vec<&'static str>,
52 binary_extensions: AHashSet<&'static str>,
53 excluded_filenames: AHashSet<&'static str>,
54 asset_extensions: AHashSet<&'static str>,
55}
56
57impl FileDiscovery {
58 pub fn new(config: DiscoveryConfig) -> Self {
59 let ignored_dirs = Self::get_ignored_dirs(&config.scan_mode);
60 let secret_keywords = Self::get_secret_keywords();
61 let binary_extensions = Self::get_binary_extensions();
62 let excluded_filenames = Self::get_excluded_filenames();
63 let asset_extensions = Self::get_asset_extensions();
64
65 Self {
66 config,
67 ignored_dirs,
68 secret_keywords,
69 binary_extensions,
70 excluded_filenames,
71 asset_extensions,
72 }
73 }
74
75 pub fn discover_files(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
77 let is_git_repo = project_root.join(".git").exists();
78
79 if is_git_repo && self.config.use_git {
80 self.git_aware_discovery(project_root)
81 } else {
82 self.filesystem_discovery(project_root)
83 }
84 }
85
86 fn git_aware_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
88 debug!("Using git-aware file discovery");
89
90 let tracked_files = self.get_git_tracked_files(project_root)?;
92
93 let untracked_files = self.get_untracked_secret_files(project_root)?;
95
96 let all_paths: Vec<PathBuf> = tracked_files.into_iter()
98 .chain(untracked_files)
99 .collect();
100
101 let files: Vec<FileMetadata> = all_paths
103 .par_iter()
104 .filter_map(|path| self.build_file_metadata(path, project_root).ok())
105 .filter(|meta| self.should_include_file(meta))
106 .collect();
107
108 Ok(files)
109 }
110
111 fn get_git_tracked_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
113 let output = Command::new("git")
114 .args(&["ls-files", "-z"]) .current_dir(project_root)
116 .output()
117 .map_err(|e| SecurityError::FileDiscovery(format!("Git ls-files failed: {}", e)))?;
118
119 if !output.status.success() {
120 return Err(SecurityError::FileDiscovery("Git ls-files failed".to_string()));
121 }
122
123 let paths: Vec<PathBuf> = output.stdout
125 .split(|&b| b == 0)
126 .filter(|path| !path.is_empty())
127 .filter_map(|path| std::str::from_utf8(path).ok())
128 .map(|path| project_root.join(path))
129 .collect();
130
131 Ok(paths)
132 }
133
134 fn get_untracked_secret_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
136 let secret_patterns = vec![
138 ".env*",
139 "*.key",
140 "*.pem",
141 "*.p12",
142 "*credentials*",
143 "*secret*",
144 "config/*.json",
145 "config/*.yml",
146 ];
147
148 let mut untracked_files = Vec::new();
149
150 for pattern in secret_patterns {
151 let output = Command::new("git")
153 .args(&["ls-files", "--others", "--exclude-standard", pattern])
154 .current_dir(project_root)
155 .output();
156
157 if let Ok(output) = output {
158 if output.status.success() {
159 let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
160 .lines()
161 .filter(|line| !line.is_empty())
162 .map(|line| project_root.join(line))
163 .collect();
164 untracked_files.extend(paths);
165 }
166 }
167
168 let output = Command::new("git")
171 .args(&["ls-files", "--others", "--ignored", "--exclude-standard", pattern])
172 .current_dir(project_root)
173 .output();
174
175 if let Ok(output) = output {
176 if output.status.success() {
177 let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
178 .lines()
179 .filter(|line| !line.is_empty())
180 .map(|line| project_root.join(line))
181 .collect();
182 untracked_files.extend(paths);
183 }
184 }
185 }
186
187 Ok(untracked_files)
188 }
189
190 fn filesystem_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
192 debug!("Using filesystem discovery");
193
194 let walker = WalkDir::new(project_root)
195 .follow_links(false)
196 .max_depth(20)
197 .into_iter()
198 .filter_entry(|entry| {
199 if entry.file_type().is_dir() {
201 let dir_name = entry.file_name().to_string_lossy();
202 return !self.ignored_dirs.contains(dir_name.as_ref());
203 }
204 true
205 });
206
207 let files: Vec<FileMetadata> = walker
208 .par_bridge()
209 .filter_map(|entry| entry.ok())
210 .filter(|entry| entry.file_type().is_file())
211 .filter_map(|entry| self.build_file_metadata(entry.path(), project_root).ok())
212 .filter(|meta| self.should_include_file(meta))
213 .collect();
214
215 Ok(files)
216 }
217
218 fn build_file_metadata(&self, path: &Path, project_root: &Path) -> Result<FileMetadata, std::io::Error> {
220 let metadata = fs::metadata(path)?;
221 let size = metadata.len() as usize;
222 let modified = metadata.modified()?;
223
224 let extension = path.extension()
225 .and_then(|ext| ext.to_str())
226 .map(|s| s.to_lowercase());
227
228 let file_name = path.file_name()
229 .and_then(|n| n.to_str())
230 .unwrap_or("");
231
232 let file_name_lower = file_name.to_lowercase();
233
234 let is_gitignored = if project_root.join(".git").exists() {
236 self.check_gitignore_batch(path, project_root)
237 } else {
238 false
239 };
240
241 let priority_hints = PriorityHints {
243 is_env_file: file_name_lower.starts_with(".env") || file_name_lower.ends_with(".env"),
244 is_config_file: self.is_config_file(&file_name_lower, &extension),
245 is_secret_file: self.is_secret_file(&file_name_lower, path),
246 is_source_file: self.is_source_file(&extension),
247 has_secret_keywords: self.has_secret_keywords(&file_name_lower),
248 };
249
250 Ok(FileMetadata {
251 path: path.to_path_buf(),
252 size,
253 extension,
254 is_gitignored,
255 modified,
256 priority_hints,
257 })
258 }
259
260 fn check_gitignore_batch(&self, path: &Path, project_root: &Path) -> bool {
262 let output = Command::new("git")
264 .args(&["check-ignore", path.to_str().unwrap_or("")])
265 .current_dir(project_root)
266 .output();
267
268 match output {
269 Ok(output) => output.status.success(),
270 Err(_) => false,
271 }
272 }
273
274 fn should_include_file(&self, meta: &FileMetadata) -> bool {
276 if meta.size > self.config.max_file_size {
278 trace!("Skipping large file: {} ({} bytes)", meta.path.display(), meta.size);
279 return false;
280 }
281
282 if self.is_binary_file(meta) {
284 trace!("Skipping binary file: {}", meta.path.display());
285 return false;
286 }
287
288 if self.is_asset_file(meta) {
290 trace!("Skipping asset file: {}", meta.path.display());
291 return false;
292 }
293
294 if self.should_exclude_from_security_scan(meta) {
296 trace!("Excluding from security scan: {}", meta.path.display());
297 return false;
298 }
299
300 if meta.is_critical() {
302 return true;
303 }
304
305 match self.config.scan_mode {
307 ScanMode::Lightning => {
308 false
310 }
311 ScanMode::Fast => {
312 meta.is_priority() || (meta.priority_hints.is_source_file && meta.size < 50_000)
314 }
315 _ => true, }
317 }
318
319 fn is_binary_file(&self, meta: &FileMetadata) -> bool {
321 if let Some(ext) = &meta.extension {
322 if self.binary_extensions.contains(ext.as_str()) {
323 return true;
324 }
325 }
326
327 let filename = meta.path.file_name()
329 .and_then(|n| n.to_str())
330 .unwrap_or("")
331 .to_lowercase();
332
333 if self.excluded_filenames.contains(filename.as_str()) {
334 return true;
335 }
336
337 false
338 }
339
340 fn is_asset_file(&self, meta: &FileMetadata) -> bool {
342 if let Some(ext) = &meta.extension {
343 if self.asset_extensions.contains(ext.as_str()) {
344 return true;
345 }
346 }
347
348 let path_str = meta.path.to_string_lossy().to_lowercase();
350 let asset_dirs = [
351 "/assets/", "/static/", "/public/", "/images/", "/img/",
352 "/media/", "/fonts/", "/icons/", "/graphics/", "/pictures/"
353 ];
354
355 asset_dirs.iter().any(|&dir| path_str.contains(dir))
356 }
357
358 fn should_exclude_from_security_scan(&self, meta: &FileMetadata) -> bool {
360 let path_str = meta.path.to_string_lossy().to_lowercase();
361
362 if self.is_dependency_lock_file(meta) {
364 return true;
365 }
366
367 if meta.extension.as_deref() == Some("svg") {
369 return true;
370 }
371
372 if self.is_minified_or_bundled_file(meta) {
374 return true;
375 }
376
377 let exclude_patterns = [
379 ".md", ".txt", ".rst", ".adoc", ".asciidoc",
380 "readme", "changelog", "license", "todo",
381 "roadmap", "contributing", "authors",
382 "/test/", "/tests/", "/spec/", "/specs/",
384 "__test__", "__spec__", ".test.", ".spec.",
385 "_test.", "_spec.", "fixtures", "mocks", "examples",
386 "/docs/", "/doc/", "/documentation/",
388 "frameworks/", "detector", "rules", "patterns",
390 "target/", "build/", "dist/", ".next/", "coverage/",
392 ".nuxt/", ".output/", ".vercel/", ".netlify/",
393 ".vscode/", ".idea/", ".vs/", "*.swp", "*.swo",
395 ".ds_store", "thumbs.db", "desktop.ini",
397 ];
398
399 if exclude_patterns.iter().any(|&pattern| path_str.contains(pattern)) {
401 return true;
402 }
403
404 if let Some(ext) = &meta.extension {
406 let doc_extensions = ["md", "txt", "rst", "adoc", "asciidoc", "rtf"];
407 if doc_extensions.contains(&ext.as_str()) {
408 return true;
409 }
410 }
411
412 let filename = meta.path.file_name()
414 .and_then(|n| n.to_str())
415 .unwrap_or("")
416 .to_lowercase();
417
418 let doc_filenames = [
419 "readme", "changelog", "license", "authors", "contributing",
420 "roadmap", "todo", "examples", "demo", "sample", "fixture",
421 "apicodedialog", "codedialog", "codeexample", "apiexample",
423 "codesnippet", "snippets", "templates", "codegenerator",
424 "apitool", "playground", "sandbox",
425 ];
426
427 if doc_filenames.iter().any(|&name| filename.contains(name)) {
428 return true;
429 }
430
431 false
432 }
433
434 fn is_minified_or_bundled_file(&self, meta: &FileMetadata) -> bool {
436 let filename = meta.path.file_name()
437 .and_then(|n| n.to_str())
438 .unwrap_or("")
439 .to_lowercase();
440
441 let minified_patterns = [
443 ".min.", ".bundle.", ".chunk.", ".vendor.",
444 "-min.", "-bundle.", "-chunk.", "-vendor.",
445 "_min.", "_bundle.", "_chunk.", "_vendor.",
446 ];
447
448 minified_patterns.iter().any(|&pattern| filename.contains(pattern))
449 }
450
451 fn get_ignored_dirs(scan_mode: &ScanMode) -> AHashSet<String> {
453 let mut dirs = AHashSet::new();
454
455 let always_ignore = vec![
457 ".git", "node_modules", "target", "build", "dist", ".next",
458 "coverage", "__pycache__", ".pytest_cache", ".mypy_cache",
459 "vendor", "packages", ".bundle", "bower_components",
460 ".nuxt", ".output", ".vercel", ".netlify", ".vscode", ".idea",
461 ".venv", "venv", ];
463
464 for dir in always_ignore {
465 dirs.insert(dir.to_string());
466 }
467
468 if matches!(scan_mode, ScanMode::Lightning | ScanMode::Fast) {
470 let fast_ignore = vec!["test", "tests", "spec", "specs", "docs", "documentation"];
471 for dir in fast_ignore {
472 dirs.insert(dir.to_string());
473 }
474 }
475
476 dirs
477 }
478
479 fn get_binary_extensions() -> AHashSet<&'static str> {
481 let mut extensions = AHashSet::new();
482
483 let binary_exts = [
485 "exe", "dll", "so", "dylib", "lib", "a", "o", "obj",
486 "bin", "com", "scr", "msi", "deb", "rpm", "pkg",
487 "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "ace",
489 "cab", "dmg", "iso", "img",
490 "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
492 "wav", "flac", "ogg", "aac", "m4a", "wma",
493 "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
495 "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef",
496 "ttf", "otf", "woff", "woff2", "eot",
498 "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
500 "odt", "ods", "odp", "rtf",
501 "db", "sqlite", "sqlite3", "mdb", "accdb", "wt",
503 "pyc", "pyo", "class", "jar", "war", "ear", "cer", "jks",
505 ];
506
507 for ext in binary_exts {
508 extensions.insert(ext);
509 }
510
511 extensions
512 }
513
514 fn get_asset_extensions() -> AHashSet<&'static str> {
516 let mut extensions = AHashSet::new();
517
518 let asset_exts = [
519 "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
521 "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef", "svg",
522 "ttf", "otf", "woff", "woff2", "eot",
524 "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
526 "wav", "flac", "ogg", "aac", "m4a", "wma",
527 ];
528
529 for ext in asset_exts {
530 extensions.insert(ext);
531 }
532
533 extensions
534 }
535
536 fn get_excluded_filenames() -> AHashSet<&'static str> {
538 let mut filenames = AHashSet::new();
539
540 let excluded = [
541 ".ds_store", "thumbs.db", "desktop.ini", "folder.ico",
543 ".gitkeep", ".keep", ".placeholder",
545 ".tmp", ".temp", ".swp", ".swo", ".bak", ".backup",
547 ];
548
549 for filename in excluded {
550 filenames.insert(filename);
551 }
552
553 filenames
554 }
555
/// Returns the substrings that mark a file name as potentially secret-related.
fn get_secret_keywords() -> Vec<&'static str> {
    const KEYWORDS: [&str; 10] = [
        "secret", "key", "token", "password", "credential",
        "auth", "api", "private", "access", "bearer",
    ];

    KEYWORDS.to_vec()
}
563
564 fn is_config_file(&self, name: &str, extension: &Option<String>) -> bool {
565 let config_extensions = ["json", "yml", "yaml", "toml", "ini", "conf", "config", "xml"];
566 let config_names = ["config", "settings", "configuration", ".env"];
567
568 if let Some(ext) = extension {
569 if config_extensions.contains(&ext.as_str()) {
570 return true;
571 }
572 }
573
574 config_names.iter().any(|&n| name.contains(n))
575 }
576
577 fn is_secret_file(&self, name: &str, path: &Path) -> bool {
578 let secret_patterns = [
579 ".env", ".key", ".pem", ".p12", ".pfx",
580 "credentials", "secret", "private", "cert",
581 ];
582
583 if secret_patterns.iter().any(|&p| name.contains(p)) {
585 return true;
586 }
587
588 let path_str = path.to_string_lossy().to_lowercase();
590 secret_patterns.iter().any(|&p| path_str.contains(p))
591 }
592
593 fn is_source_file(&self, extension: &Option<String>) -> bool {
594 if let Some(ext) = extension {
595 let source_extensions = [
596 "js", "jsx", "ts", "tsx", "py", "java", "kt", "go",
597 "rs", "rb", "php", "cs", "cpp", "c", "h", "swift",
598 "scala", "clj", "ex", "exs",
599 ];
600 source_extensions.contains(&ext.as_str())
601 } else {
602 false
603 }
604 }
605
606 fn has_secret_keywords(&self, name: &str) -> bool {
607 self.secret_keywords.iter().any(|&keyword| name.contains(keyword))
608 }
609
610 fn is_dependency_lock_file(&self, meta: &FileMetadata) -> bool {
612 let filename = meta.path.file_name()
613 .and_then(|n| n.to_str())
614 .unwrap_or("")
615 .to_lowercase();
616
617 let lock_files = [
619 "package-lock.json",
621 "yarn.lock",
622 "pnpm-lock.yaml",
623 "bun.lockb", "poetry.lock",
626 "pipfile.lock",
627 "pip-lock.txt",
628 "pdm.lock",
629 "cargo.lock",
631 "go.sum",
633 "go.mod",
634 "gradle.lockfile",
636 "maven-dependency-plugin.log",
637 "gemfile.lock",
639 "composer.lock",
641 "packages.lock.json",
643 "paket.lock",
644 "mix.lock", "pubspec.lock", "swift.resolved", "flake.lock", ];
650
651 lock_files.iter().any(|&pattern| filename == pattern) ||
653 filename.ends_with(".lock") ||
655 filename.ends_with("-lock.json") ||
656 filename.ends_with("-lock.yaml") ||
657 filename.ends_with("-lock.yml") ||
658 filename.ends_with(".lockb") || filename.contains("shrinkwrap") ||
660 filename.contains("lockfile")
661 }
662}
663
664impl FileMetadata {
665 pub fn is_critical(&self) -> bool {
667 self.priority_hints.is_env_file ||
668 self.priority_hints.is_secret_file ||
669 self.extension.as_deref() == Some("pem") ||
670 self.extension.as_deref() == Some("key")
671 }
672
673 pub fn is_priority(&self) -> bool {
675 self.is_critical() ||
676 self.priority_hints.is_config_file ||
677 self.priority_hints.has_secret_keywords
678 }
679
680 pub fn priority_score(&self) -> u32 {
682 let mut score: u32 = 0;
683
684 if self.priority_hints.is_env_file { score += 1000; }
685 if self.priority_hints.is_secret_file { score += 900; }
686 if self.priority_hints.is_config_file { score += 500; }
687 if self.priority_hints.has_secret_keywords { score += 300; }
688 if !self.is_gitignored { score += 200; }
689 if self.priority_hints.is_source_file { score += 100; }
690
691 if self.size > 1_000_000 { score = score.saturating_sub(100); }
693
694 score
695 }
696}
697
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Fast-mode configuration without git and with a 1 MiB size limit.
    fn fast_config(priority_extensions: Vec<String>) -> DiscoveryConfig {
        DiscoveryConfig {
            use_git: false,
            max_file_size: 1024 * 1024,
            priority_extensions,
            scan_mode: ScanMode::Fast,
        }
    }

    /// Small, non-ignored file with default hints.
    fn plain_meta(name: &str, extension: Option<&str>) -> FileMetadata {
        FileMetadata {
            path: PathBuf::from(name),
            size: 100,
            extension: extension.map(String::from),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints: PriorityHints::default(),
        }
    }

    #[test]
    fn test_file_priority_scoring() {
        let mut meta = plain_meta(".env", Some("env"));
        meta.priority_hints = PriorityHints {
            is_env_file: true,
            is_config_file: true,
            is_secret_file: true,
            is_source_file: false,
            has_secret_keywords: true,
        };

        assert!(meta.is_critical());
        assert!(meta.is_priority());
        assert!(meta.priority_score() > 2000);
    }

    #[test]
    fn test_file_discovery() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();
        fs::write(root.join(".env"), "SECRET=123").unwrap();
        fs::write(root.join("config.json"), "{}").unwrap();
        fs::create_dir(root.join("node_modules")).unwrap();
        fs::write(root.join("node_modules/test.js"), "code").unwrap();

        let discovery = FileDiscovery::new(fast_config(vec!["env".to_string()]));
        let files = discovery.discover_files(root).unwrap();

        // The file inside `node_modules` must have been pruned.
        assert_eq!(files.len(), 2);
        assert!(files.iter().any(|f| f.path.ends_with(".env")));
        assert!(files.iter().any(|f| f.path.ends_with("config.json")));
    }

    #[test]
    fn test_binary_file_detection() {
        let discovery = FileDiscovery::new(fast_config(vec![]));
        let binary_meta = plain_meta("test.jpg", Some("jpg"));

        assert!(discovery.is_binary_file(&binary_meta));
    }

    #[test]
    fn test_lock_file_detection() {
        let discovery = FileDiscovery::new(fast_config(vec![]));

        let lock_files = [
            "package-lock.json",
            "yarn.lock",
            "pnpm-lock.yaml",
            "bun.lockb",
            "cargo.lock",
            "go.sum",
        ];

        for lock_file in lock_files {
            let meta = plain_meta(lock_file, None);

            assert!(discovery.is_dependency_lock_file(&meta), "Failed to detect {}", lock_file);
        }
    }
}