1use std::path::{Path, PathBuf};
6use std::process::Command;
7use std::fs;
8use std::time::SystemTime;
9
10use ahash::AHashSet;
11use rayon::prelude::*;
12use walkdir::WalkDir;
13use log::{debug, trace};
14
15use super::{ScanMode, SecurityError};
16
/// Everything the discovery phase records about a single candidate file.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// Path of the file exactly as it was handed to the metadata builder.
    pub path: PathBuf,
    /// File length in bytes, taken from the filesystem metadata.
    pub size: usize,
    /// Lowercased extension (without the dot); `None` when the path has no
    /// extension or it is not valid UTF-8.
    pub extension: Option<String>,
    /// Outcome of the `git check-ignore` probe; always `false` when the project
    /// root has no `.git` entry.
    pub is_gitignored: bool,
    /// Last-modification timestamp reported by the filesystem.
    pub modified: SystemTime,
    /// Name- and extension-based classification used for filtering and ranking.
    pub priority_hints: PriorityHints,
}
27
/// Cheap, name-based classification flags computed for each file.
///
/// All flags default to `false`.
#[derive(Debug, Clone, Default)]
pub struct PriorityHints {
    /// The lowercased file name starts or ends with `.env`.
    pub is_env_file: bool,
    /// The extension or file name looks like configuration (json, yaml, "settings", ...).
    pub is_config_file: bool,
    /// The file name or the full path contains a secret-like fragment (`.pem`, "credentials", ...).
    pub is_secret_file: bool,
    /// The extension belongs to a recognised programming language.
    pub is_source_file: bool,
    /// The lowercased file name contains one of the secret keywords ("token", "password", ...).
    pub has_secret_keywords: bool,
}
37
/// User-supplied settings that steer file discovery.
#[derive(Debug, Clone)]
pub struct DiscoveryConfig {
    /// When `true` and the project root contains `.git`, files are enumerated through git
    /// instead of by walking the filesystem.
    pub use_git: bool,
    /// Files strictly larger than this many bytes are skipped.
    pub max_file_size: usize,
    /// NOTE(review): not read anywhere in the visible part of this file — confirm whether
    /// another module consumes it.
    pub priority_extensions: Vec<String>,
    /// Selects which directories are ignored and how aggressively files are filtered.
    pub scan_mode: ScanMode,
}
46
/// Finds the files worth scanning under a project root.
///
/// The lookup tables are built once in `FileDiscovery::new` and reused for every file.
pub struct FileDiscovery {
    /// Settings supplied by the caller.
    config: DiscoveryConfig,
    /// Directory names that the filesystem walk never descends into.
    ignored_dirs: AHashSet<String>,
    /// Substrings that mark a file name as secret-related.
    secret_keywords: Vec<&'static str>,
    /// Lowercase extensions treated as binary content.
    binary_extensions: AHashSet<&'static str>,
    /// Lowercase file names that are always skipped (compared against the whole name).
    excluded_filenames: AHashSet<&'static str>,
    /// Lowercase extensions treated as static assets.
    asset_extensions: AHashSet<&'static str>,
}
56
57impl FileDiscovery {
58 pub fn new(config: DiscoveryConfig) -> Self {
59 let ignored_dirs = Self::get_ignored_dirs(&config.scan_mode);
60 let secret_keywords = Self::get_secret_keywords();
61 let binary_extensions = Self::get_binary_extensions();
62 let excluded_filenames = Self::get_excluded_filenames();
63 let asset_extensions = Self::get_asset_extensions();
64
65 Self {
66 config,
67 ignored_dirs,
68 secret_keywords,
69 binary_extensions,
70 excluded_filenames,
71 asset_extensions,
72 }
73 }
74
75 pub fn discover_files(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
77 let is_git_repo = project_root.join(".git").exists();
78
79 if is_git_repo && self.config.use_git {
80 self.git_aware_discovery(project_root)
81 } else {
82 self.filesystem_discovery(project_root)
83 }
84 }
85
86 fn git_aware_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
88 debug!("Using git-aware file discovery");
89
90 let tracked_files = self.get_git_tracked_files(project_root)?;
92
93 let untracked_files = self.get_untracked_secret_files(project_root)?;
95
96 let all_paths: Vec<PathBuf> = tracked_files.into_iter()
98 .chain(untracked_files)
99 .collect();
100
101 let files: Vec<FileMetadata> = all_paths
103 .par_iter()
104 .filter_map(|path| self.build_file_metadata(path, project_root).ok())
105 .filter(|meta| self.should_include_file(meta))
106 .collect();
107
108 Ok(files)
109 }
110
111 fn get_git_tracked_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
113 let output = Command::new("git")
114 .args(&["ls-files", "-z"]) .current_dir(project_root)
116 .output()
117 .map_err(|e| SecurityError::FileDiscovery(format!("Git ls-files failed: {}", e)))?;
118
119 if !output.status.success() {
120 return Err(SecurityError::FileDiscovery("Git ls-files failed".to_string()));
121 }
122
123 let paths: Vec<PathBuf> = output.stdout
125 .split(|&b| b == 0)
126 .filter(|path| !path.is_empty())
127 .filter_map(|path| std::str::from_utf8(path).ok())
128 .map(|path| project_root.join(path))
129 .collect();
130
131 Ok(paths)
132 }
133
134 fn get_untracked_secret_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
136 let secret_patterns = vec![
138 ".env*",
139 "*.key",
140 "*.pem",
141 "*.p12",
142 "*credentials*",
143 "*secret*",
144 "config/*.json",
145 "config/*.yml",
146 ];
147
148 let mut untracked_files = Vec::new();
149
150 for pattern in secret_patterns {
151 let output = Command::new("git")
152 .args(&["ls-files", "--others", "--exclude-standard", pattern])
153 .current_dir(project_root)
154 .output();
155
156 if let Ok(output) = output {
157 if output.status.success() {
158 let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
159 .lines()
160 .map(|line| project_root.join(line))
161 .collect();
162 untracked_files.extend(paths);
163 }
164 }
165 }
166
167 Ok(untracked_files)
168 }
169
170 fn filesystem_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
172 debug!("Using filesystem discovery");
173
174 let walker = WalkDir::new(project_root)
175 .follow_links(false)
176 .max_depth(20)
177 .into_iter()
178 .filter_entry(|entry| {
179 if entry.file_type().is_dir() {
181 let dir_name = entry.file_name().to_string_lossy();
182 return !self.ignored_dirs.contains(dir_name.as_ref());
183 }
184 true
185 });
186
187 let files: Vec<FileMetadata> = walker
188 .par_bridge()
189 .filter_map(|entry| entry.ok())
190 .filter(|entry| entry.file_type().is_file())
191 .filter_map(|entry| self.build_file_metadata(entry.path(), project_root).ok())
192 .filter(|meta| self.should_include_file(meta))
193 .collect();
194
195 Ok(files)
196 }
197
198 fn build_file_metadata(&self, path: &Path, project_root: &Path) -> Result<FileMetadata, std::io::Error> {
200 let metadata = fs::metadata(path)?;
201 let size = metadata.len() as usize;
202 let modified = metadata.modified()?;
203
204 let extension = path.extension()
205 .and_then(|ext| ext.to_str())
206 .map(|s| s.to_lowercase());
207
208 let file_name = path.file_name()
209 .and_then(|n| n.to_str())
210 .unwrap_or("");
211
212 let file_name_lower = file_name.to_lowercase();
213
214 let is_gitignored = if project_root.join(".git").exists() {
216 self.check_gitignore_batch(path, project_root)
217 } else {
218 false
219 };
220
221 let priority_hints = PriorityHints {
223 is_env_file: file_name_lower.starts_with(".env") || file_name_lower.ends_with(".env"),
224 is_config_file: self.is_config_file(&file_name_lower, &extension),
225 is_secret_file: self.is_secret_file(&file_name_lower, path),
226 is_source_file: self.is_source_file(&extension),
227 has_secret_keywords: self.has_secret_keywords(&file_name_lower),
228 };
229
230 Ok(FileMetadata {
231 path: path.to_path_buf(),
232 size,
233 extension,
234 is_gitignored,
235 modified,
236 priority_hints,
237 })
238 }
239
240 fn check_gitignore_batch(&self, path: &Path, project_root: &Path) -> bool {
242 let output = Command::new("git")
244 .args(&["check-ignore", path.to_str().unwrap_or("")])
245 .current_dir(project_root)
246 .output();
247
248 match output {
249 Ok(output) => output.status.success(),
250 Err(_) => false,
251 }
252 }
253
254 fn should_include_file(&self, meta: &FileMetadata) -> bool {
256 if meta.size > self.config.max_file_size {
258 trace!("Skipping large file: {} ({} bytes)", meta.path.display(), meta.size);
259 return false;
260 }
261
262 if self.is_binary_file(meta) {
264 trace!("Skipping binary file: {}", meta.path.display());
265 return false;
266 }
267
268 if self.is_asset_file(meta) {
270 trace!("Skipping asset file: {}", meta.path.display());
271 return false;
272 }
273
274 if self.should_exclude_from_security_scan(meta) {
276 trace!("Excluding from security scan: {}", meta.path.display());
277 return false;
278 }
279
280 if meta.is_critical() {
282 return true;
283 }
284
285 match self.config.scan_mode {
287 ScanMode::Lightning => {
288 false
290 }
291 ScanMode::Fast => {
292 meta.is_priority() || (meta.priority_hints.is_source_file && meta.size < 50_000)
294 }
295 _ => true, }
297 }
298
299 fn is_binary_file(&self, meta: &FileMetadata) -> bool {
301 if let Some(ext) = &meta.extension {
302 if self.binary_extensions.contains(ext.as_str()) {
303 return true;
304 }
305 }
306
307 let filename = meta.path.file_name()
309 .and_then(|n| n.to_str())
310 .unwrap_or("")
311 .to_lowercase();
312
313 if self.excluded_filenames.contains(filename.as_str()) {
314 return true;
315 }
316
317 false
318 }
319
320 fn is_asset_file(&self, meta: &FileMetadata) -> bool {
322 if let Some(ext) = &meta.extension {
323 if self.asset_extensions.contains(ext.as_str()) {
324 return true;
325 }
326 }
327
328 let path_str = meta.path.to_string_lossy().to_lowercase();
330 let asset_dirs = [
331 "/assets/", "/static/", "/public/", "/images/", "/img/",
332 "/media/", "/fonts/", "/icons/", "/graphics/", "/pictures/"
333 ];
334
335 asset_dirs.iter().any(|&dir| path_str.contains(dir))
336 }
337
338 fn should_exclude_from_security_scan(&self, meta: &FileMetadata) -> bool {
340 let path_str = meta.path.to_string_lossy().to_lowercase();
341
342 if self.is_dependency_lock_file(meta) {
344 return true;
345 }
346
347 if meta.extension.as_deref() == Some("svg") {
349 return true;
350 }
351
352 if self.is_minified_or_bundled_file(meta) {
354 return true;
355 }
356
357 let exclude_patterns = [
359 ".md", ".txt", ".rst", ".adoc", ".asciidoc",
360 "readme", "changelog", "license", "todo",
361 "roadmap", "contributing", "authors",
362 "/test/", "/tests/", "/spec/", "/specs/",
364 "__test__", "__spec__", ".test.", ".spec.",
365 "_test.", "_spec.", "fixtures", "mocks", "examples",
366 "/docs/", "/doc/", "/documentation/",
368 "frameworks/", "detector", "rules", "patterns",
370 "target/", "build/", "dist/", ".next/", "coverage/",
372 ".nuxt/", ".output/", ".vercel/", ".netlify/",
373 ".vscode/", ".idea/", ".vs/", "*.swp", "*.swo",
375 ".ds_store", "thumbs.db", "desktop.ini",
377 ];
378
379 if exclude_patterns.iter().any(|&pattern| path_str.contains(pattern)) {
381 return true;
382 }
383
384 if let Some(ext) = &meta.extension {
386 let doc_extensions = ["md", "txt", "rst", "adoc", "asciidoc", "rtf"];
387 if doc_extensions.contains(&ext.as_str()) {
388 return true;
389 }
390 }
391
392 let filename = meta.path.file_name()
394 .and_then(|n| n.to_str())
395 .unwrap_or("")
396 .to_lowercase();
397
398 let doc_filenames = [
399 "readme", "changelog", "license", "authors", "contributing",
400 "roadmap", "todo", "examples", "demo", "sample", "fixture",
401 "apicodedialog", "codedialog", "codeexample", "apiexample",
403 "codesnippet", "snippets", "templates", "codegenerator",
404 "apitool", "playground", "sandbox",
405 ];
406
407 if doc_filenames.iter().any(|&name| filename.contains(name)) {
408 return true;
409 }
410
411 false
412 }
413
414 fn is_minified_or_bundled_file(&self, meta: &FileMetadata) -> bool {
416 let filename = meta.path.file_name()
417 .and_then(|n| n.to_str())
418 .unwrap_or("")
419 .to_lowercase();
420
421 let minified_patterns = [
423 ".min.", ".bundle.", ".chunk.", ".vendor.",
424 "-min.", "-bundle.", "-chunk.", "-vendor.",
425 "_min.", "_bundle.", "_chunk.", "_vendor.",
426 ];
427
428 minified_patterns.iter().any(|&pattern| filename.contains(pattern))
429 }
430
431 fn get_ignored_dirs(scan_mode: &ScanMode) -> AHashSet<String> {
433 let mut dirs = AHashSet::new();
434
435 let always_ignore = vec![
437 ".git", "node_modules", "target", "build", "dist", ".next",
438 "coverage", "__pycache__", ".pytest_cache", ".mypy_cache",
439 "vendor", "packages", ".bundle", "bower_components",
440 ".nuxt", ".output", ".vercel", ".netlify", ".vscode", ".idea",
441 ".venv", "venv", ];
443
444 for dir in always_ignore {
445 dirs.insert(dir.to_string());
446 }
447
448 if matches!(scan_mode, ScanMode::Lightning | ScanMode::Fast) {
450 let fast_ignore = vec!["test", "tests", "spec", "specs", "docs", "documentation"];
451 for dir in fast_ignore {
452 dirs.insert(dir.to_string());
453 }
454 }
455
456 dirs
457 }
458
459 fn get_binary_extensions() -> AHashSet<&'static str> {
461 let mut extensions = AHashSet::new();
462
463 let binary_exts = [
465 "exe", "dll", "so", "dylib", "lib", "a", "o", "obj",
466 "bin", "com", "scr", "msi", "deb", "rpm", "pkg",
467 "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "ace",
469 "cab", "dmg", "iso", "img",
470 "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
472 "wav", "flac", "ogg", "aac", "m4a", "wma",
473 "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
475 "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef",
476 "ttf", "otf", "woff", "woff2", "eot",
478 "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
480 "odt", "ods", "odp", "rtf",
481 "db", "sqlite", "sqlite3", "mdb", "accdb", "wt",
483 "pyc", "pyo", "class", "jar", "war", "ear", "cer", "jks",
485 ];
486
487 for ext in binary_exts {
488 extensions.insert(ext);
489 }
490
491 extensions
492 }
493
494 fn get_asset_extensions() -> AHashSet<&'static str> {
496 let mut extensions = AHashSet::new();
497
498 let asset_exts = [
499 "jpg", "jpeg", "png", "gif", "bmp", "tiff", "tga", "webp",
501 "ico", "cur", "psd", "ai", "eps", "raw", "cr2", "nef", "svg",
502 "ttf", "otf", "woff", "woff2", "eot",
504 "mp3", "mp4", "avi", "mov", "wmv", "flv", "mkv", "webm",
506 "wav", "flac", "ogg", "aac", "m4a", "wma",
507 ];
508
509 for ext in asset_exts {
510 extensions.insert(ext);
511 }
512
513 extensions
514 }
515
516 fn get_excluded_filenames() -> AHashSet<&'static str> {
518 let mut filenames = AHashSet::new();
519
520 let excluded = [
521 ".ds_store", "thumbs.db", "desktop.ini", "folder.ico",
523 ".gitkeep", ".keep", ".placeholder",
525 ".tmp", ".temp", ".swp", ".swo", ".bak", ".backup",
527 ];
528
529 for filename in excluded {
530 filenames.insert(filename);
531 }
532
533 filenames
534 }
535
536 fn get_secret_keywords() -> Vec<&'static str> {
538 vec![
539 "secret", "key", "token", "password", "credential",
540 "auth", "api", "private", "access", "bearer",
541 ]
542 }
543
544 fn is_config_file(&self, name: &str, extension: &Option<String>) -> bool {
545 let config_extensions = ["json", "yml", "yaml", "toml", "ini", "conf", "config", "xml"];
546 let config_names = ["config", "settings", "configuration", ".env"];
547
548 if let Some(ext) = extension {
549 if config_extensions.contains(&ext.as_str()) {
550 return true;
551 }
552 }
553
554 config_names.iter().any(|&n| name.contains(n))
555 }
556
557 fn is_secret_file(&self, name: &str, path: &Path) -> bool {
558 let secret_patterns = [
559 ".env", ".key", ".pem", ".p12", ".pfx",
560 "credentials", "secret", "private", "cert",
561 ];
562
563 if secret_patterns.iter().any(|&p| name.contains(p)) {
565 return true;
566 }
567
568 let path_str = path.to_string_lossy().to_lowercase();
570 secret_patterns.iter().any(|&p| path_str.contains(p))
571 }
572
573 fn is_source_file(&self, extension: &Option<String>) -> bool {
574 if let Some(ext) = extension {
575 let source_extensions = [
576 "js", "jsx", "ts", "tsx", "py", "java", "kt", "go",
577 "rs", "rb", "php", "cs", "cpp", "c", "h", "swift",
578 "scala", "clj", "ex", "exs",
579 ];
580 source_extensions.contains(&ext.as_str())
581 } else {
582 false
583 }
584 }
585
586 fn has_secret_keywords(&self, name: &str) -> bool {
587 self.secret_keywords.iter().any(|&keyword| name.contains(keyword))
588 }
589
590 fn is_dependency_lock_file(&self, meta: &FileMetadata) -> bool {
592 let filename = meta.path.file_name()
593 .and_then(|n| n.to_str())
594 .unwrap_or("")
595 .to_lowercase();
596
597 let lock_files = [
599 "package-lock.json",
601 "yarn.lock",
602 "pnpm-lock.yaml",
603 "bun.lockb", "poetry.lock",
606 "pipfile.lock",
607 "pip-lock.txt",
608 "pdm.lock",
609 "cargo.lock",
611 "go.sum",
613 "go.mod",
614 "gradle.lockfile",
616 "maven-dependency-plugin.log",
617 "gemfile.lock",
619 "composer.lock",
621 "packages.lock.json",
623 "paket.lock",
624 "mix.lock", "pubspec.lock", "swift.resolved", "flake.lock", ];
630
631 lock_files.iter().any(|&pattern| filename == pattern) ||
633 filename.ends_with(".lock") ||
635 filename.ends_with("-lock.json") ||
636 filename.ends_with("-lock.yaml") ||
637 filename.ends_with("-lock.yml") ||
638 filename.ends_with(".lockb") || filename.contains("shrinkwrap") ||
640 filename.contains("lockfile")
641 }
642}
643
644impl FileMetadata {
645 pub fn is_critical(&self) -> bool {
647 self.priority_hints.is_env_file ||
648 self.priority_hints.is_secret_file ||
649 self.extension.as_deref() == Some("pem") ||
650 self.extension.as_deref() == Some("key")
651 }
652
653 pub fn is_priority(&self) -> bool {
655 self.is_critical() ||
656 self.priority_hints.is_config_file ||
657 self.priority_hints.has_secret_keywords
658 }
659
660 pub fn priority_score(&self) -> u32 {
662 let mut score: u32 = 0;
663
664 if self.priority_hints.is_env_file { score += 1000; }
665 if self.priority_hints.is_secret_file { score += 900; }
666 if self.priority_hints.is_config_file { score += 500; }
667 if self.priority_hints.has_secret_keywords { score += 300; }
668 if !self.is_gitignored { score += 200; }
669 if self.priority_hints.is_source_file { score += 100; }
670
671 if self.size > 1_000_000 { score = score.saturating_sub(100); }
673
674 score
675 }
676}
677
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds a `Fast`-mode, filesystem-only discovery engine with a 1 MiB size cap.
    fn fast_discovery(priority_extensions: Vec<String>) -> FileDiscovery {
        FileDiscovery::new(DiscoveryConfig {
            use_git: false,
            max_file_size: 1024 * 1024,
            priority_extensions,
            scan_mode: ScanMode::Fast,
        })
    }

    /// Builds metadata for a 100-byte, non-ignored file with default hints.
    fn small_file(path: &str, extension: Option<&str>) -> FileMetadata {
        FileMetadata {
            path: PathBuf::from(path),
            size: 100,
            extension: extension.map(String::from),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints: PriorityHints::default(),
        }
    }

    #[test]
    fn test_file_priority_scoring() {
        let meta = FileMetadata {
            priority_hints: PriorityHints {
                is_env_file: true,
                is_config_file: true,
                is_secret_file: true,
                is_source_file: false,
                has_secret_keywords: true,
            },
            ..small_file(".env", Some("env"))
        };

        assert!(meta.is_critical());
        assert!(meta.is_priority());
        assert!(meta.priority_score() > 2000);
    }

    #[test]
    fn test_file_discovery() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();
        fs::write(root.join(".env"), "SECRET=123").unwrap();
        fs::write(root.join("config.json"), "{}").unwrap();
        fs::create_dir(root.join("node_modules")).unwrap();
        fs::write(root.join("node_modules/test.js"), "code").unwrap();

        let discovery = fast_discovery(vec!["env".to_string()]);
        let files = discovery.discover_files(root).unwrap();

        // Only the two top-level files survive; `node_modules` is pruned.
        assert_eq!(files.len(), 2);
        assert!(files.iter().any(|f| f.path.ends_with(".env")));
        assert!(files.iter().any(|f| f.path.ends_with("config.json")));
    }

    #[test]
    fn test_binary_file_detection() {
        let discovery = fast_discovery(vec![]);
        let binary_meta = small_file("test.jpg", Some("jpg"));

        assert!(discovery.is_binary_file(&binary_meta));
    }

    #[test]
    fn test_lock_file_detection() {
        let discovery = fast_discovery(vec![]);

        let lock_files = [
            "package-lock.json",
            "yarn.lock",
            "pnpm-lock.yaml",
            "bun.lockb",
            "cargo.lock",
            "go.sum",
        ];

        for lock_file in lock_files {
            let meta = small_file(lock_file, None);

            assert!(discovery.is_dependency_lock_file(&meta), "Failed to detect {}", lock_file);
        }
    }
}