1use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::fs;
8use std::time::SystemTime;
9
10use ahash::AHashSet;
11use rayon::prelude::*;
12use walkdir::WalkDir;
13use log::{debug, trace};
14
15use super::{ScanMode, SecurityError};
16
17#[derive(Debug, Clone)]
19pub struct FileMetadata {
20 pub path: PathBuf,
21 pub size: usize,
22 pub extension: Option<String>,
23 pub is_gitignored: bool,
24 pub modified: SystemTime,
25 pub priority_hints: PriorityHints,
26}
27
/// Boolean hints derived from a file's name, extension and path.
///
/// `Default` yields a value with every hint switched off.
#[derive(Debug, Clone, Default)]
pub struct PriorityHints {
    /// The file name starts or ends with `.env`.
    pub is_env_file: bool,
    /// The extension or name looks like a configuration file.
    pub is_config_file: bool,
    /// The name or path contains a marker typical of key/credential material.
    pub is_secret_file: bool,
    /// The extension belongs to a recognised programming language.
    pub is_source_file: bool,
    /// The file name contains one of the secret-related keywords.
    pub has_secret_keywords: bool,
}
37
38#[derive(Debug, Clone)]
40pub struct DiscoveryConfig {
41 pub use_git: bool,
42 pub max_file_size: usize,
43 pub priority_extensions: Vec<String>,
44 pub scan_mode: ScanMode,
45}
46
47pub struct FileDiscovery {
49 config: DiscoveryConfig,
50 ignored_dirs: AHashSet<String>,
51 secret_keywords: Vec<&'static str>,
52 binary_extensions: AHashSet<&'static str>,
53 excluded_filenames: AHashSet<&'static str>,
54 asset_extensions: AHashSet<&'static str>,
55}
56
57impl FileDiscovery {
58 pub fn new(config: DiscoveryConfig) -> Self {
59 let ignored_dirs = Self::get_ignored_dirs(&config.scan_mode);
60 let secret_keywords = Self::get_secret_keywords();
61 let binary_extensions = Self::get_binary_extensions();
62 let excluded_filenames = Self::get_excluded_filenames();
63 let asset_extensions = Self::get_asset_extensions();
64
65 Self {
66 config,
67 ignored_dirs,
68 secret_keywords,
69 binary_extensions,
70 excluded_filenames,
71 asset_extensions,
72 }
73 }
74
75 pub fn discover_files(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
77 let is_git_repo = project_root.join(".git").exists();
78
79 if is_git_repo && self.config.use_git {
80 self.git_aware_discovery(project_root)
81 } else {
82 self.filesystem_discovery(project_root)
83 }
84 }
85
86 fn git_aware_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
88 debug!("Using git-aware file discovery");
89
90 let tracked_files = self.get_git_tracked_files(project_root)?;
92
93 let untracked_files = self.get_untracked_secret_files(project_root)?;
95
96 let all_paths: Vec<PathBuf> = tracked_files.into_iter()
98 .chain(untracked_files)
99 .collect();
100
101 let files: Vec<FileMetadata> = all_paths
103 .par_iter()
104 .filter_map(|path| self.build_file_metadata(path, project_root).ok())
105 .filter(|meta| self.should_include_file(meta))
106 .collect();
107
108 Ok(files)
109 }
110
111 fn get_git_tracked_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
113 let output = Command::new("git")
114 .args(&["ls-files", "-z"]) .current_dir(project_root)
116 .output()
117 .map_err(|e| SecurityError::FileDiscovery(format!("Git ls-files failed: {}", e)))?;
118
119 if !output.status.success() {
120 return Err(SecurityError::FileDiscovery("Git ls-files failed".to_string()));
121 }
122
123 let paths: Vec<PathBuf> = output.stdout
125 .split(|&b| b == 0)
126 .filter(|path| !path.is_empty())
127 .filter_map(|path| std::str::from_utf8(path).ok())
128 .map(|path| project_root.join(path))
129 .collect();
130
131 Ok(paths)
132 }
133
134 fn get_untracked_secret_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
136 let secret_patterns = vec![
138 ".env*",
139 "*.key",
140 "*.pem",
141 "*.p12",
142 "*credentials*",
143 "*secret*",
144 "config/*.json",
145 "config/*.yml",
146 ];
147
148 let mut untracked_files = Vec::new();
149
150 for pattern in secret_patterns {
151 let output = Command::new("git")
152 .args(&["ls-files", "--others", "--exclude-standard", pattern])
153 .current_dir(project_root)
154 .output();
155
156 if let Ok(output) = output {
157 if output.status.success() {
158 let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
159 .lines()
160 .map(|line| project_root.join(line))
161 .collect();
162 untracked_files.extend(paths);
163 }
164 }
165 }
166
167 Ok(untracked_files)
168 }
169
170 fn filesystem_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
172 debug!("Using filesystem discovery");
173
174 let walker = WalkDir::new(project_root)
175 .follow_links(false)
176 .max_depth(20)
177 .into_iter()
178 .filter_entry(|entry| {
179 if entry.file_type().is_dir() {
181 let dir_name = entry.file_name().to_string_lossy();
182 return !self.ignored_dirs.contains(dir_name.as_ref());
183 }
184 true
185 });
186
187 let files: Vec<FileMetadata> = walker
188 .par_bridge()
189 .filter_map(|entry| entry.ok())
190 .filter(|entry| entry.file_type().is_file())
191 .filter_map(|entry| self.build_file_metadata(entry.path(), project_root).ok())
192 .filter(|meta| self.should_include_file(meta))
193 .collect();
194
195 Ok(files)
196 }
197
198 fn build_file_metadata(&self, path: &Path, project_root: &Path) -> Result<FileMetadata, std::io::Error> {
200 let metadata = fs::metadata(path)?;
201 let size = metadata.len() as usize;
202 let modified = metadata.modified()?;
203
204 let extension = path.extension()
205 .and_then(|ext| ext.to_str())
206 .map(|s| s.to_lowercase());
207
208 let file_name = path.file_name()
209 .and_then(|n| n.to_str())
210 .unwrap_or("");
211
212 let file_name_lower = file_name.to_lowercase();
213
214 let is_gitignored = if project_root.join(".git").exists() {
216 self.check_gitignore_batch(path, project_root)
217 } else {
218 false
219 };
220
221 let priority_hints = PriorityHints {
223 is_env_file: file_name_lower.starts_with(".env") || file_name_lower.ends_with(".env"),
224 is_config_file: self.is_config_file(&file_name_lower, &extension),
225 is_secret_file: self.is_secret_file(&file_name_lower, path),
226 is_source_file: self.is_source_file(&extension),
227 has_secret_keywords: self.has_secret_keywords(&file_name_lower),
228 };
229
230 Ok(FileMetadata {
231 path: path.to_path_buf(),
232 size,
233 extension,
234 is_gitignored,
235 modified,
236 priority_hints,
237 })
238 }
239
240 fn check_gitignore_batch(&self, path: &Path, project_root: &Path) -> bool {
242 let output = Command::new("git")
244 .args(&["check-ignore", path.to_str().unwrap_or("")])
245 .current_dir(project_root)
246 .output();
247
248 match output {
249 Ok(output) => output.status.success(),
250 Err(_) => false,
251 }
252 }
253
254 fn should_include_file(&self, meta: &FileMetadata) -> bool {
256 if meta.size > self.config.max_file_size {
258 trace!("Skipping large file: {} ({} bytes)", meta.path.display(), meta.size);
259 return false;
260 }
261
262 if self.is_binary_file(meta) {
264 trace!("Skipping binary file: {}", meta.path.display());
265 return false;
266 }
267
268 if self.is_asset_file(meta) {
270 trace!("Skipping asset file: {}", meta.path.display());
271 return false;
272 }
273
274 if self.should_exclude_from_security_scan(meta) {
276 trace!("Excluding from security scan: {}", meta.path.display());
277 return false;
278 }
279
280 if meta.is_critical() {
282 return true;
283 }
284
285 match self.config.scan_mode {
287 ScanMode::Lightning => {
288 false
290 }
291 ScanMode::Fast => {
292 meta.is_priority() || (meta.priority_hints.is_source_file && meta.size < 50_000)
294 }
295 _ => true, }
297 }
298
299 fn is_binary_file(&self, meta: &FileMetadata) -> bool {
301 if let Some(ext) = &meta.extension {
302 if self.binary_extensions.contains(ext.as_str()) {
303 return true;
304 }
305 }
306
307 let filename = meta.path.file_name()
309 .and_then(|n| n.to_str())
310 .unwrap_or("")
311 .to_lowercase();
312
313 if self.excluded_filenames.contains(filename.as_str()) {
314 return true;
315 }
316
317 false
318 }
319
320 fn is_asset_file(&self, meta: &FileMetadata) -> bool {
322 if let Some(ext) = &meta.extension {
323 if self.asset_extensions.contains(ext.as_str()) {
324 return true;
325 }
326 }
327
328 let path_str = meta.path.to_string_lossy().to_lowercase();
330 let asset_dirs = [
331 "/assets/", "/static/", "/public/", "/images/", "/img/",
332 "/media/", "/fonts/", "/icons/", "/graphics/", "/pictures/"
333 ];
334
335 asset_dirs.iter().any(|&dir| path_str.contains(dir))
336 }
337
338 fn should_exclude_from_security_scan(&self, meta: &FileMetadata) -> bool {
340 let path_str = meta.path.to_string_lossy().to_lowercase();
341
342 if self.is_dependency_lock_file(meta) {
344 return true;
345 }
346
347 if meta.extension.as_deref() == Some("svg") {
349 return true;
350 }
351
352 if self.is_minified_or_bundled_file(meta) {
354 return true;
355 }
356
357 let exclude_patterns = [
359 ".md", ".txt", ".rst", ".adoc", ".asciidoc",
360 "readme", "changelog", "license", "todo",
361 "roadmap", "contributing", "authors",
362 "/test/", "/tests/", "/spec/", "/specs/",
364 "__test__", "__spec__", ".test.", ".spec.",
365 "_test.", "_spec.", "fixtures", "mocks", "examples",
366 "/docs/", "/doc/", "/documentation/",
368 "frameworks/", "detector", "rules", "patterns",
370 "target/", "build/", "dist/", ".next/", "coverage/",
372 ".nuxt/", ".output/", ".vercel/", ".netlify/",
373 ".vscode/", ".idea/", ".vs/", "*.swp", "*.swo",
375 ".ds_store", "thumbs.db", "desktop.ini",
377 ];
378
379 if exclude_patterns.iter().any(|&pattern| path_str.contains(pattern)) {
381 return true;
382 }
383
384 if let Some(ext) = &meta.extension {
386 let doc_extensions = ["md", "txt", "rst", "adoc", "asciidoc", "rtf"];
387 if doc_extensions.contains(&ext.as_str()) {
388 return true;
389 }
390 }
391
392 let filename = meta.path.file_name()
394 .and_then(|n| n.to_str())
395 .unwrap_or("")
396 .to_lowercase();
397
398 let doc_filenames = [
399 "readme", "changelog", "license", "authors", "contributing",
400 "roadmap", "todo", "examples", "demo", "sample", "fixture",
401 "apicodedialog", "codedialog", "codeexample", "apiexample",
403 "codesnippet", "snippets", "templates", "codegenerator",
404 "apitool", "playground", "sandbox",
405 ];
406
407 if doc_filenames.iter().any(|&name| filename.contains(name)) {
408 return true;
409 }
410
411 false
412 }
413
414 fn is_minified_or_bundled_file(&self, meta: &FileMetadata) -> bool {
416 let filename = meta.path.file_name()
417 .and_then(|n| n.to_str())
418 .unwrap_or("")
419 .to_lowercase();
420
421 let minified_patterns = [
423 ".min.", ".bundle.", ".chunk.", ".vendor.",
424 "-min.", "-bundle.", "-chunk.", "-vendor.",
425 "_min.", "_bundle.", "_chunk.", "_vendor.",
426 ];
427
428 minified_patterns.iter().any(|&pattern| filename.contains(pattern))
429 }
430
431 fn get_ignored_dirs(scan_mode: &ScanMode) -> AHashSet<String> {
433 let mut dirs = AHashSet::new();
434
435 let always_ignore = vec![
437 ".git", "node_modules", "target", "build", "dist", ".next",
438 "coverage", "__pycache__", ".pytest_cache", ".mypy_cache",
439 "vendor", "packages", ".bundle", "bower_components",
440 ".nuxt", ".output", ".vercel", ".netlify", ".vscode", ".idea",
441 ];
442
443 for dir in always_ignore {
444 dirs.insert(dir.to_string());
445 }
446
447 if matches!(scan_mode, ScanMode::Lightning | ScanMode::Fast) {
449 let fast_ignore = vec!["test", "tests", "spec", "specs", "docs", "documentation"];
450 for dir in fast_ignore {
451 dirs.insert(dir.to_string());
452 }
453 }
454
455 dirs
456 }
457
458 fn get_binary_extensions() -> AHashSet<&'static str> {
460 let mut extensions = AHashSet::new();
461
462 let binary_exts = [
464 "exe", "dll", "so", "dylib", "lib", "a", "o", "obj",
465 "bin", "com", "scr", "msi", "deb", "rpm", "pkg",
466 "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "ace",
468 "cab", "dmg", "iso", "img",
469 "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
471 "wav", "flac", "ogg", "aac", "m4a", "wma",
472 "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
474 "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef",
475 "ttf", "otf", "woff", "woff2", "eot",
477 "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
479 "odt", "ods", "odp", "rtf",
480 "db", "sqlite", "sqlite3", "mdb", "accdb",
482 "pyc", "pyo", "class", "jar", "war", "ear",
484 ];
485
486 for ext in binary_exts {
487 extensions.insert(ext);
488 }
489
490 extensions
491 }
492
493 fn get_asset_extensions() -> AHashSet<&'static str> {
495 let mut extensions = AHashSet::new();
496
497 let asset_exts = [
498 "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
500 "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef", "svg",
501 "ttf", "otf", "woff", "woff2", "eot",
503 "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
505 "wav", "flac", "ogg", "aac", "m4a", "wma",
506 ];
507
508 for ext in asset_exts {
509 extensions.insert(ext);
510 }
511
512 extensions
513 }
514
515 fn get_excluded_filenames() -> AHashSet<&'static str> {
517 let mut filenames = AHashSet::new();
518
519 let excluded = [
520 ".ds_store", "thumbs.db", "desktop.ini", "folder.ico",
522 ".gitkeep", ".keep", ".placeholder",
524 ".tmp", ".temp", ".swp", ".swo", ".bak", ".backup",
526 ];
527
528 for filename in excluded {
529 filenames.insert(filename);
530 }
531
532 filenames
533 }
534
/// Returns the substrings that make a file name look secret-related.
fn get_secret_keywords() -> Vec<&'static str> {
    const SECRET_KEYWORDS: [&str; 10] = [
        "secret", "key", "token", "password", "credential",
        "auth", "api", "private", "access", "bearer",
    ];

    SECRET_KEYWORDS.to_vec()
}
542
543 fn is_config_file(&self, name: &str, extension: &Option<String>) -> bool {
544 let config_extensions = ["json", "yml", "yaml", "toml", "ini", "conf", "config", "xml"];
545 let config_names = ["config", "settings", "configuration", ".env"];
546
547 if let Some(ext) = extension {
548 if config_extensions.contains(&ext.as_str()) {
549 return true;
550 }
551 }
552
553 config_names.iter().any(|&n| name.contains(n))
554 }
555
556 fn is_secret_file(&self, name: &str, path: &Path) -> bool {
557 let secret_patterns = [
558 ".env", ".key", ".pem", ".p12", ".pfx",
559 "credentials", "secret", "private", "cert",
560 ];
561
562 if secret_patterns.iter().any(|&p| name.contains(p)) {
564 return true;
565 }
566
567 let path_str = path.to_string_lossy().to_lowercase();
569 secret_patterns.iter().any(|&p| path_str.contains(p))
570 }
571
572 fn is_source_file(&self, extension: &Option<String>) -> bool {
573 if let Some(ext) = extension {
574 let source_extensions = [
575 "js", "jsx", "ts", "tsx", "py", "java", "kt", "go",
576 "rs", "rb", "php", "cs", "cpp", "c", "h", "swift",
577 "scala", "clj", "ex", "exs",
578 ];
579 source_extensions.contains(&ext.as_str())
580 } else {
581 false
582 }
583 }
584
585 fn has_secret_keywords(&self, name: &str) -> bool {
586 self.secret_keywords.iter().any(|&keyword| name.contains(keyword))
587 }
588
589 fn is_dependency_lock_file(&self, meta: &FileMetadata) -> bool {
591 let filename = meta.path.file_name()
592 .and_then(|n| n.to_str())
593 .unwrap_or("")
594 .to_lowercase();
595
596 let lock_files = [
598 "package-lock.json",
600 "yarn.lock",
601 "pnpm-lock.yaml",
602 "bun.lockb", "poetry.lock",
605 "pipfile.lock",
606 "pip-lock.txt",
607 "pdm.lock",
608 "cargo.lock",
610 "go.sum",
612 "go.mod",
613 "gradle.lockfile",
615 "maven-dependency-plugin.log",
616 "gemfile.lock",
618 "composer.lock",
620 "packages.lock.json",
622 "paket.lock",
623 "mix.lock", "pubspec.lock", "swift.resolved", "flake.lock", ];
629
630 lock_files.iter().any(|&pattern| filename == pattern) ||
632 filename.ends_with(".lock") ||
634 filename.ends_with("-lock.json") ||
635 filename.ends_with("-lock.yaml") ||
636 filename.ends_with("-lock.yml") ||
637 filename.ends_with(".lockb") || filename.contains("shrinkwrap") ||
639 filename.contains("lockfile")
640 }
641}
642
643impl FileMetadata {
644 pub fn is_critical(&self) -> bool {
646 self.priority_hints.is_env_file ||
647 self.priority_hints.is_secret_file ||
648 self.extension.as_deref() == Some("pem") ||
649 self.extension.as_deref() == Some("key")
650 }
651
652 pub fn is_priority(&self) -> bool {
654 self.is_critical() ||
655 self.priority_hints.is_config_file ||
656 self.priority_hints.has_secret_keywords
657 }
658
659 pub fn priority_score(&self) -> u32 {
661 let mut score: u32 = 0;
662
663 if self.priority_hints.is_env_file { score += 1000; }
664 if self.priority_hints.is_secret_file { score += 900; }
665 if self.priority_hints.is_config_file { score += 500; }
666 if self.priority_hints.has_secret_keywords { score += 300; }
667 if !self.is_gitignored { score += 200; }
668 if self.priority_hints.is_source_file { score += 100; }
669
670 if self.size > 1_000_000 { score = score.saturating_sub(100); }
672
673 score
674 }
675}
676
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Discovery engine shared by the tests: git disabled, 1 MiB size cap, `Fast` mode.
    fn fast_discovery(priority_extensions: Vec<String>) -> FileDiscovery {
        FileDiscovery::new(DiscoveryConfig {
            use_git: false,
            max_file_size: 1024 * 1024,
            priority_extensions,
            scan_mode: ScanMode::Fast,
        })
    }

    /// A 100-byte, non-ignored file record with the given path, extension and hints.
    fn small_file(path: &str, extension: Option<&str>, priority_hints: PriorityHints) -> FileMetadata {
        FileMetadata {
            path: PathBuf::from(path),
            size: 100,
            extension: extension.map(String::from),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints,
        }
    }

    #[test]
    fn test_file_priority_scoring() {
        let hints = PriorityHints {
            is_env_file: true,
            is_config_file: true,
            is_secret_file: true,
            is_source_file: false,
            has_secret_keywords: true,
        };
        let meta = small_file(".env", Some("env"), hints);

        assert!(meta.is_critical());
        assert!(meta.is_priority());
        assert!(meta.priority_score() > 2000);
    }

    #[test]
    fn test_file_discovery() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();
        fs::write(root.join(".env"), "SECRET=123").unwrap();
        fs::write(root.join("config.json"), "{}").unwrap();
        fs::create_dir(root.join("node_modules")).unwrap();
        fs::write(root.join("node_modules/test.js"), "code").unwrap();

        let discovery = fast_discovery(vec!["env".to_string()]);
        let files = discovery.discover_files(root).unwrap();

        // Only the two top-level files survive; `node_modules` is pruned.
        assert_eq!(files.len(), 2);
        assert!(files.iter().any(|f| f.path.ends_with(".env")));
        assert!(files.iter().any(|f| f.path.ends_with("config.json")));
    }

    #[test]
    fn test_binary_file_detection() {
        let discovery = fast_discovery(vec![]);
        let binary_meta = small_file("test.jpg", Some("jpg"), PriorityHints::default());

        assert!(discovery.is_binary_file(&binary_meta));
    }

    #[test]
    fn test_lock_file_detection() {
        let discovery = fast_discovery(vec![]);

        let lock_files = [
            "package-lock.json",
            "yarn.lock",
            "pnpm-lock.yaml",
            "bun.lockb",
            "cargo.lock",
            "go.sum",
        ];

        for lock_file in lock_files {
            // Detection relies on the file name only, so no extension is supplied.
            let meta = small_file(lock_file, None, PriorityHints::default());

            assert!(discovery.is_dependency_lock_file(&meta), "Failed to detect {}", lock_file);
        }
    }
}