1use std::fs;
6use std::path::{Path, PathBuf};
7use std::process::Command;
8use std::time::SystemTime;
9
10use ahash::AHashSet;
11use log::{debug, trace};
12use rayon::prelude::*;
13use walkdir::WalkDir;
14
15use super::{ScanMode, SecurityError};
16
17#[derive(Debug, Clone)]
19pub struct FileMetadata {
20 pub path: PathBuf,
21 pub size: usize,
22 pub extension: Option<String>,
23 pub is_gitignored: bool,
24 pub modified: SystemTime,
25 pub priority_hints: PriorityHints,
26}
27
/// Boolean hints derived from a file's name, extension and path.
///
/// All flags default to `false`.
#[derive(Clone, Debug, Default)]
pub struct PriorityHints {
    /// File name starts or ends with `.env` (case-insensitive).
    pub is_env_file: bool,
    /// Extension or name looks like a configuration file.
    pub is_config_file: bool,
    /// Name or path contains one of the secret-file markers.
    pub is_secret_file: bool,
    /// Extension belongs to a recognised programming language.
    pub is_source_file: bool,
    /// File name contains one of the secret keywords.
    pub has_secret_keywords: bool,
}
37
38#[derive(Debug, Clone)]
40pub struct DiscoveryConfig {
41 pub use_git: bool,
42 pub max_file_size: usize,
43 pub priority_extensions: Vec<String>,
44 pub scan_mode: ScanMode,
45}
46
47pub struct FileDiscovery {
49 config: DiscoveryConfig,
50 ignored_dirs: AHashSet<String>,
51 secret_keywords: Vec<&'static str>,
52 binary_extensions: AHashSet<&'static str>,
53 excluded_filenames: AHashSet<&'static str>,
54 asset_extensions: AHashSet<&'static str>,
55}
56
57impl FileDiscovery {
58 pub fn new(config: DiscoveryConfig) -> Self {
59 let ignored_dirs = Self::get_ignored_dirs(&config.scan_mode);
60 let secret_keywords = Self::get_secret_keywords();
61 let binary_extensions = Self::get_binary_extensions();
62 let excluded_filenames = Self::get_excluded_filenames();
63 let asset_extensions = Self::get_asset_extensions();
64
65 Self {
66 config,
67 ignored_dirs,
68 secret_keywords,
69 binary_extensions,
70 excluded_filenames,
71 asset_extensions,
72 }
73 }
74
75 pub fn discover_files(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
77 let is_git_repo = project_root.join(".git").exists();
78
79 if is_git_repo && self.config.use_git {
80 self.git_aware_discovery(project_root)
81 } else {
82 self.filesystem_discovery(project_root)
83 }
84 }
85
86 fn git_aware_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
88 debug!("Using git-aware file discovery");
89
90 let tracked_files = self.get_git_tracked_files(project_root)?;
92
93 let untracked_files = self.get_untracked_secret_files(project_root)?;
95
96 let all_paths: Vec<PathBuf> = tracked_files.into_iter().chain(untracked_files).collect();
98
99 let files: Vec<FileMetadata> = all_paths
101 .par_iter()
102 .filter_map(|path| self.build_file_metadata(path, project_root).ok())
103 .filter(|meta| self.should_include_file(meta))
104 .collect();
105
106 Ok(files)
107 }
108
109 fn get_git_tracked_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
111 let output = Command::new("git")
112 .args(["ls-files", "-z"]) .current_dir(project_root)
114 .output()
115 .map_err(|e| SecurityError::FileDiscovery(format!("Git ls-files failed: {}", e)))?;
116
117 if !output.status.success() {
118 return Err(SecurityError::FileDiscovery(
119 "Git ls-files failed".to_string(),
120 ));
121 }
122
123 let paths: Vec<PathBuf> = output
125 .stdout
126 .split(|&b| b == 0)
127 .filter(|path| !path.is_empty())
128 .filter_map(|path| std::str::from_utf8(path).ok())
129 .map(|path| project_root.join(path))
130 .collect();
131
132 Ok(paths)
133 }
134
135 fn get_untracked_secret_files(
137 &self,
138 project_root: &Path,
139 ) -> Result<Vec<PathBuf>, SecurityError> {
140 let secret_patterns = vec![
142 ".env*",
143 "*.key",
144 "*.pem",
145 "*.p12",
146 "*credentials*",
147 "*secret*",
148 "config/*.json",
149 "config/*.yml",
150 ];
151
152 let mut untracked_files = Vec::new();
153
154 for pattern in secret_patterns {
155 let output = Command::new("git")
157 .args(["ls-files", "--others", "--exclude-standard", pattern])
158 .current_dir(project_root)
159 .output();
160
161 if let Ok(output) = output
162 && output.status.success()
163 {
164 let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
165 .lines()
166 .filter(|line| !line.is_empty())
167 .map(|line| project_root.join(line))
168 .collect();
169 untracked_files.extend(paths);
170 }
171
172 let output = Command::new("git")
175 .args([
176 "ls-files",
177 "--others",
178 "--ignored",
179 "--exclude-standard",
180 pattern,
181 ])
182 .current_dir(project_root)
183 .output();
184
185 if let Ok(output) = output
186 && output.status.success()
187 {
188 let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
189 .lines()
190 .filter(|line| !line.is_empty())
191 .map(|line| project_root.join(line))
192 .collect();
193 untracked_files.extend(paths);
194 }
195 }
196
197 Ok(untracked_files)
198 }
199
200 fn filesystem_discovery(
202 &self,
203 project_root: &Path,
204 ) -> Result<Vec<FileMetadata>, SecurityError> {
205 debug!("Using filesystem discovery");
206
207 let walker = WalkDir::new(project_root)
208 .follow_links(false)
209 .max_depth(20)
210 .into_iter()
211 .filter_entry(|entry| {
212 if entry.file_type().is_dir() {
214 let dir_name = entry.file_name().to_string_lossy();
215 return !self.ignored_dirs.contains(dir_name.as_ref());
216 }
217 true
218 });
219
220 let files: Vec<FileMetadata> = walker
221 .par_bridge()
222 .filter_map(|entry| entry.ok())
223 .filter(|entry| entry.file_type().is_file())
224 .filter_map(|entry| self.build_file_metadata(entry.path(), project_root).ok())
225 .filter(|meta| self.should_include_file(meta))
226 .collect();
227
228 Ok(files)
229 }
230
231 fn build_file_metadata(
233 &self,
234 path: &Path,
235 project_root: &Path,
236 ) -> Result<FileMetadata, std::io::Error> {
237 let metadata = fs::metadata(path)?;
238 let size = metadata.len() as usize;
239 let modified = metadata.modified()?;
240
241 let extension = path
242 .extension()
243 .and_then(|ext| ext.to_str())
244 .map(|s| s.to_lowercase());
245
246 let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
247
248 let file_name_lower = file_name.to_lowercase();
249
250 let is_gitignored = if project_root.join(".git").exists() {
252 self.check_gitignore_batch(path, project_root)
253 } else {
254 false
255 };
256
257 let priority_hints = PriorityHints {
259 is_env_file: file_name_lower.starts_with(".env") || file_name_lower.ends_with(".env"),
260 is_config_file: self.is_config_file(&file_name_lower, &extension),
261 is_secret_file: self.is_secret_file(&file_name_lower, path),
262 is_source_file: self.is_source_file(&extension),
263 has_secret_keywords: self.has_secret_keywords(&file_name_lower),
264 };
265
266 Ok(FileMetadata {
267 path: path.to_path_buf(),
268 size,
269 extension,
270 is_gitignored,
271 modified,
272 priority_hints,
273 })
274 }
275
276 fn check_gitignore_batch(&self, path: &Path, project_root: &Path) -> bool {
278 let output = Command::new("git")
280 .args(["check-ignore", path.to_str().unwrap_or("")])
281 .current_dir(project_root)
282 .output();
283
284 match output {
285 Ok(output) => output.status.success(),
286 Err(_) => false,
287 }
288 }
289
290 fn should_include_file(&self, meta: &FileMetadata) -> bool {
292 if meta.size > self.config.max_file_size {
294 trace!(
295 "Skipping large file: {} ({} bytes)",
296 meta.path.display(),
297 meta.size
298 );
299 return false;
300 }
301
302 if self.is_binary_file(meta) {
304 trace!("Skipping binary file: {}", meta.path.display());
305 return false;
306 }
307
308 if self.is_asset_file(meta) {
310 trace!("Skipping asset file: {}", meta.path.display());
311 return false;
312 }
313
314 if self.should_exclude_from_security_scan(meta) {
316 trace!("Excluding from security scan: {}", meta.path.display());
317 return false;
318 }
319
320 if meta.is_critical() {
322 return true;
323 }
324
325 match self.config.scan_mode {
327 ScanMode::Lightning => {
328 false
330 }
331 ScanMode::Fast => {
332 meta.is_priority() || (meta.priority_hints.is_source_file && meta.size < 50_000)
334 }
335 _ => true, }
337 }
338
339 fn is_binary_file(&self, meta: &FileMetadata) -> bool {
341 if let Some(ext) = &meta.extension
342 && self.binary_extensions.contains(ext.as_str())
343 {
344 return true;
345 }
346
347 let filename = meta
349 .path
350 .file_name()
351 .and_then(|n| n.to_str())
352 .unwrap_or("")
353 .to_lowercase();
354
355 if self.excluded_filenames.contains(filename.as_str()) {
356 return true;
357 }
358
359 false
360 }
361
362 fn is_asset_file(&self, meta: &FileMetadata) -> bool {
364 if let Some(ext) = &meta.extension
365 && self.asset_extensions.contains(ext.as_str())
366 {
367 return true;
368 }
369
370 let path_str = meta.path.to_string_lossy().to_lowercase();
372 let asset_dirs = [
373 "/assets/",
374 "/static/",
375 "/public/",
376 "/images/",
377 "/img/",
378 "/media/",
379 "/fonts/",
380 "/icons/",
381 "/graphics/",
382 "/pictures/",
383 ];
384
385 asset_dirs.iter().any(|&dir| path_str.contains(dir))
386 }
387
388 fn should_exclude_from_security_scan(&self, meta: &FileMetadata) -> bool {
390 let path_str = meta.path.to_string_lossy().to_lowercase();
391
392 if self.is_dependency_lock_file(meta) {
394 return true;
395 }
396
397 if meta.extension.as_deref() == Some("svg") {
399 return true;
400 }
401
402 if self.is_minified_or_bundled_file(meta) {
404 return true;
405 }
406
407 let exclude_patterns = [
409 ".md",
410 ".txt",
411 ".rst",
412 ".adoc",
413 ".asciidoc",
414 "readme",
415 "changelog",
416 "license",
417 "todo",
418 "roadmap",
419 "contributing",
420 "authors",
421 "/test/",
423 "/tests/",
424 "/spec/",
425 "/specs/",
426 "__test__",
427 "__spec__",
428 ".test.",
429 ".spec.",
430 "_test.",
431 "_spec.",
432 "fixtures",
433 "mocks",
434 "examples",
435 "/docs/",
437 "/doc/",
438 "/documentation/",
439 "frameworks/",
441 "detector",
442 "rules",
443 "patterns",
444 "target/",
446 "build/",
447 "dist/",
448 ".next/",
449 "coverage/",
450 ".nuxt/",
451 ".output/",
452 ".vercel/",
453 ".netlify/",
454 ".vscode/",
456 ".idea/",
457 ".vs/",
458 "*.swp",
459 "*.swo",
460 ".ds_store",
462 "thumbs.db",
463 "desktop.ini",
464 ];
465
466 if exclude_patterns
468 .iter()
469 .any(|&pattern| path_str.contains(pattern))
470 {
471 return true;
472 }
473
474 if let Some(ext) = &meta.extension {
476 let doc_extensions = ["md", "txt", "rst", "adoc", "asciidoc", "rtf"];
477 if doc_extensions.contains(&ext.as_str()) {
478 return true;
479 }
480 }
481
482 let filename = meta
484 .path
485 .file_name()
486 .and_then(|n| n.to_str())
487 .unwrap_or("")
488 .to_lowercase();
489
490 let doc_filenames = [
491 "readme",
492 "changelog",
493 "license",
494 "authors",
495 "contributing",
496 "roadmap",
497 "todo",
498 "examples",
499 "demo",
500 "sample",
501 "fixture",
502 "apicodedialog",
504 "codedialog",
505 "codeexample",
506 "apiexample",
507 "codesnippet",
508 "snippets",
509 "templates",
510 "codegenerator",
511 "apitool",
512 "playground",
513 "sandbox",
514 ];
515
516 if doc_filenames.iter().any(|&name| filename.contains(name)) {
517 return true;
518 }
519
520 false
521 }
522
523 fn is_minified_or_bundled_file(&self, meta: &FileMetadata) -> bool {
525 let filename = meta
526 .path
527 .file_name()
528 .and_then(|n| n.to_str())
529 .unwrap_or("")
530 .to_lowercase();
531
532 let minified_patterns = [
534 ".min.", ".bundle.", ".chunk.", ".vendor.", "-min.", "-bundle.", "-chunk.", "-vendor.",
535 "_min.", "_bundle.", "_chunk.", "_vendor.",
536 ];
537
538 minified_patterns
539 .iter()
540 .any(|&pattern| filename.contains(pattern))
541 }
542
543 fn get_ignored_dirs(scan_mode: &ScanMode) -> AHashSet<String> {
545 let mut dirs = AHashSet::new();
546
547 let always_ignore = vec![
549 ".git",
550 "node_modules",
551 "target",
552 "build",
553 "dist",
554 ".next",
555 "coverage",
556 "__pycache__",
557 ".pytest_cache",
558 ".mypy_cache",
559 "vendor",
560 "packages",
561 ".bundle",
562 "bower_components",
563 ".nuxt",
564 ".output",
565 ".vercel",
566 ".netlify",
567 ".vscode",
568 ".idea",
569 ".venv",
570 "venv", ];
572
573 for dir in always_ignore {
574 dirs.insert(dir.to_string());
575 }
576
577 if matches!(scan_mode, ScanMode::Lightning | ScanMode::Fast) {
579 let fast_ignore = vec!["test", "tests", "spec", "specs", "docs", "documentation"];
580 for dir in fast_ignore {
581 dirs.insert(dir.to_string());
582 }
583 }
584
585 dirs
586 }
587
588 fn get_binary_extensions() -> AHashSet<&'static str> {
590 let mut extensions = AHashSet::new();
591
592 let binary_exts = [
594 "exe", "dll", "so", "dylib", "lib", "a", "o", "obj", "bin", "com", "scr", "msi", "deb",
595 "rpm", "pkg", "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "ace", "cab", "dmg", "iso", "img",
597 "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm", "wav", "flac", "ogg", "aac",
599 "m4a", "wma", "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp", "ico", "cur", "psd", "ai",
601 "eps", "raw", "cr2", "nef", "ttf", "otf", "woff", "woff2", "eot", "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "odt", "ods", "odp", "rtf",
604 "db", "sqlite", "sqlite3", "mdb", "accdb", "wt", "pyc", "pyo", "class", "jar", "war", "ear", "cer", "jks",
607 ];
608
609 for ext in binary_exts {
610 extensions.insert(ext);
611 }
612
613 extensions
614 }
615
616 fn get_asset_extensions() -> AHashSet<&'static str> {
618 let mut extensions = AHashSet::new();
619
620 let asset_exts = [
621 "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp", "ico", "cur", "psd", "ai",
623 "eps", "raw", "cr2", "nef", "svg", "ttf", "otf", "woff", "woff2", "eot", "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm", "wav", "flac", "ogg", "aac",
626 "m4a", "wma",
627 ];
628
629 for ext in asset_exts {
630 extensions.insert(ext);
631 }
632
633 extensions
634 }
635
636 fn get_excluded_filenames() -> AHashSet<&'static str> {
638 let mut filenames = AHashSet::new();
639
640 let excluded = [
641 ".ds_store",
643 "thumbs.db",
644 "desktop.ini",
645 "folder.ico",
646 ".gitkeep",
648 ".keep",
649 ".placeholder",
650 ".tmp",
652 ".temp",
653 ".swp",
654 ".swo",
655 ".bak",
656 ".backup",
657 ];
658
659 for filename in excluded {
660 filenames.insert(filename);
661 }
662
663 filenames
664 }
665
/// Returns the substrings that mark a file name as secret-related.
fn get_secret_keywords() -> Vec<&'static str> {
    const KEYWORDS: [&str; 10] = [
        "secret",
        "key",
        "token",
        "password",
        "credential",
        "auth",
        "api",
        "private",
        "access",
        "bearer",
    ];
    KEYWORDS.to_vec()
}
681
682 fn is_config_file(&self, name: &str, extension: &Option<String>) -> bool {
683 let config_extensions = [
684 "json", "yml", "yaml", "toml", "ini", "conf", "config", "xml",
685 ];
686 let config_names = ["config", "settings", "configuration", ".env"];
687
688 if let Some(ext) = extension
689 && config_extensions.contains(&ext.as_str())
690 {
691 return true;
692 }
693
694 config_names.iter().any(|&n| name.contains(n))
695 }
696
697 fn is_secret_file(&self, name: &str, path: &Path) -> bool {
698 let secret_patterns = [
699 ".env",
700 ".key",
701 ".pem",
702 ".p12",
703 ".pfx",
704 "credentials",
705 "secret",
706 "private",
707 "cert",
708 ];
709
710 if secret_patterns.iter().any(|&p| name.contains(p)) {
712 return true;
713 }
714
715 let path_str = path.to_string_lossy().to_lowercase();
717 secret_patterns.iter().any(|&p| path_str.contains(p))
718 }
719
720 fn is_source_file(&self, extension: &Option<String>) -> bool {
721 if let Some(ext) = extension {
722 let source_extensions = [
723 "js", "jsx", "ts", "tsx", "py", "java", "kt", "go", "rs", "rb", "php", "cs", "cpp",
724 "c", "h", "swift", "scala", "clj", "ex", "exs",
725 ];
726 source_extensions.contains(&ext.as_str())
727 } else {
728 false
729 }
730 }
731
732 fn has_secret_keywords(&self, name: &str) -> bool {
733 self.secret_keywords
734 .iter()
735 .any(|&keyword| name.contains(keyword))
736 }
737
738 fn is_dependency_lock_file(&self, meta: &FileMetadata) -> bool {
740 let filename = meta
741 .path
742 .file_name()
743 .and_then(|n| n.to_str())
744 .unwrap_or("")
745 .to_lowercase();
746
747 let lock_files = [
749 "package-lock.json",
751 "yarn.lock",
752 "pnpm-lock.yaml",
753 "bun.lockb", "poetry.lock",
756 "pipfile.lock",
757 "pip-lock.txt",
758 "pdm.lock",
759 "cargo.lock",
761 "go.sum",
763 "go.mod",
764 "gradle.lockfile",
766 "maven-dependency-plugin.log",
767 "gemfile.lock",
769 "composer.lock",
771 "packages.lock.json",
773 "paket.lock",
774 "mix.lock", "pubspec.lock", "swift.resolved", "flake.lock", ];
780
781 lock_files.iter().any(|&pattern| filename == pattern) ||
783 filename.ends_with(".lock") ||
785 filename.ends_with("-lock.json") ||
786 filename.ends_with("-lock.yaml") ||
787 filename.ends_with("-lock.yml") ||
788 filename.ends_with(".lockb") || filename.contains("shrinkwrap") ||
790 filename.contains("lockfile")
791 }
792}
793
794impl FileMetadata {
795 pub fn is_critical(&self) -> bool {
797 self.priority_hints.is_env_file
798 || self.priority_hints.is_secret_file
799 || self.extension.as_deref() == Some("pem")
800 || self.extension.as_deref() == Some("key")
801 }
802
803 pub fn is_priority(&self) -> bool {
805 self.is_critical()
806 || self.priority_hints.is_config_file
807 || self.priority_hints.has_secret_keywords
808 }
809
810 pub fn priority_score(&self) -> u32 {
812 let mut score: u32 = 0;
813
814 if self.priority_hints.is_env_file {
815 score += 1000;
816 }
817 if self.priority_hints.is_secret_file {
818 score += 900;
819 }
820 if self.priority_hints.is_config_file {
821 score += 500;
822 }
823 if self.priority_hints.has_secret_keywords {
824 score += 300;
825 }
826 if !self.is_gitignored {
827 score += 200;
828 }
829 if self.priority_hints.is_source_file {
830 score += 100;
831 }
832
833 if self.size > 1_000_000 {
835 score = score.saturating_sub(100);
836 }
837
838 score
839 }
840}
841
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Git-free, fast-mode discovery engine with a 1 MiB size cap.
    fn fast_discovery(priority_extensions: Vec<String>) -> FileDiscovery {
        FileDiscovery::new(DiscoveryConfig {
            use_git: false,
            max_file_size: 1024 * 1024,
            priority_extensions,
            scan_mode: ScanMode::Fast,
        })
    }

    /// 100-byte, non-ignored metadata record for `name`.
    fn small_file(
        name: &str,
        extension: Option<&str>,
        priority_hints: PriorityHints,
    ) -> FileMetadata {
        FileMetadata {
            path: PathBuf::from(name),
            size: 100,
            extension: extension.map(str::to_string),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints,
        }
    }

    #[test]
    fn test_file_priority_scoring() {
        let hints = PriorityHints {
            is_env_file: true,
            is_config_file: true,
            is_secret_file: true,
            is_source_file: false,
            has_secret_keywords: true,
        };
        let meta = small_file(".env", Some("env"), hints);

        assert!(meta.is_critical());
        assert!(meta.is_priority());
        assert!(meta.priority_score() > 2000);
    }

    #[test]
    fn test_file_discovery() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();
        fs::write(root.join(".env"), "SECRET=123").unwrap();
        fs::write(root.join("config.json"), "{}").unwrap();
        fs::create_dir(root.join("node_modules")).unwrap();
        fs::write(root.join("node_modules/test.js"), "code").unwrap();

        let discovery = fast_discovery(vec!["env".to_string()]);
        let files = discovery.discover_files(root).unwrap();

        // Only the two top-level files survive; `node_modules` is pruned.
        assert_eq!(files.len(), 2);
        assert!(files.iter().any(|f| f.path.ends_with(".env")));
        assert!(files.iter().any(|f| f.path.ends_with("config.json")));
    }

    #[test]
    fn test_binary_file_detection() {
        let discovery = fast_discovery(vec![]);
        let binary_meta = small_file("test.jpg", Some("jpg"), PriorityHints::default());

        assert!(discovery.is_binary_file(&binary_meta));
    }

    #[test]
    fn test_lock_file_detection() {
        let discovery = fast_discovery(vec![]);

        let lock_files = [
            "package-lock.json",
            "yarn.lock",
            "pnpm-lock.yaml",
            "bun.lockb",
            "cargo.lock",
            "go.sum",
        ];

        for lock_file in lock_files {
            let meta = small_file(lock_file, None, PriorityHints::default());

            assert!(
                discovery.is_dependency_lock_file(&meta),
                "Failed to detect {}",
                lock_file
            );
        }
    }
}