use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::SystemTime;

use ahash::AHashSet;
use log::{debug, trace};
use rayon::prelude::*;
use walkdir::WalkDir;

use super::{ScanMode, SecurityError};
16
17#[derive(Debug, Clone)]
19pub struct FileMetadata {
20 pub path: PathBuf,
21 pub size: usize,
22 pub extension: Option<String>,
23 pub is_gitignored: bool,
24 pub modified: SystemTime,
25 pub priority_hints: PriorityHints,
26}
27
/// Cheap, name-based signals about how likely a file is to contain secrets.
///
/// All flags default to `false`.
#[derive(Debug, Clone, Default)]
pub struct PriorityHints {
    /// The file name starts or ends with `.env`.
    pub is_env_file: bool,
    /// The extension or name looks like a configuration file.
    pub is_config_file: bool,
    /// The name or path looks like key material or a credentials store.
    pub is_secret_file: bool,
    /// The extension belongs to a recognised programming language.
    pub is_source_file: bool,
    /// The file name contains one of the secret-related keywords.
    pub has_secret_keywords: bool,
}
37
38#[derive(Debug, Clone)]
40pub struct DiscoveryConfig {
41 pub use_git: bool,
42 pub max_file_size: usize,
43 pub priority_extensions: Vec<String>,
44 pub scan_mode: ScanMode,
45}
46
47pub struct FileDiscovery {
49 config: DiscoveryConfig,
50 ignored_dirs: AHashSet<String>,
51 secret_keywords: Vec<&'static str>,
52}
53
54impl FileDiscovery {
55 pub fn new(config: DiscoveryConfig) -> Self {
56 let ignored_dirs = Self::get_ignored_dirs(&config.scan_mode);
57 let secret_keywords = Self::get_secret_keywords();
58
59 Self {
60 config,
61 ignored_dirs,
62 secret_keywords,
63 }
64 }
65
66 pub fn discover_files(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
68 let is_git_repo = project_root.join(".git").exists();
69
70 if is_git_repo && self.config.use_git {
71 self.git_aware_discovery(project_root)
72 } else {
73 self.filesystem_discovery(project_root)
74 }
75 }
76
77 fn git_aware_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
79 debug!("Using git-aware file discovery");
80
81 let tracked_files = self.get_git_tracked_files(project_root)?;
83
84 let untracked_files = self.get_untracked_secret_files(project_root)?;
86
87 let all_paths: Vec<PathBuf> = tracked_files.into_iter()
89 .chain(untracked_files)
90 .collect();
91
92 let files: Vec<FileMetadata> = all_paths
94 .par_iter()
95 .filter_map(|path| self.build_file_metadata(path, project_root).ok())
96 .filter(|meta| self.should_include_file(meta))
97 .collect();
98
99 Ok(files)
100 }
101
102 fn get_git_tracked_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
104 let output = Command::new("git")
105 .args(&["ls-files", "-z"]) .current_dir(project_root)
107 .output()
108 .map_err(|e| SecurityError::FileDiscovery(format!("Git ls-files failed: {}", e)))?;
109
110 if !output.status.success() {
111 return Err(SecurityError::FileDiscovery("Git ls-files failed".to_string()));
112 }
113
114 let paths: Vec<PathBuf> = output.stdout
116 .split(|&b| b == 0)
117 .filter(|path| !path.is_empty())
118 .filter_map(|path| std::str::from_utf8(path).ok())
119 .map(|path| project_root.join(path))
120 .collect();
121
122 Ok(paths)
123 }
124
125 fn get_untracked_secret_files(&self, project_root: &Path) -> Result<Vec<PathBuf>, SecurityError> {
127 let secret_patterns = vec![
129 ".env*",
130 "*.key",
131 "*.pem",
132 "*.p12",
133 "*credentials*",
134 "*secret*",
135 "config/*.json",
136 "config/*.yml",
137 ];
138
139 let mut untracked_files = Vec::new();
140
141 for pattern in secret_patterns {
142 let output = Command::new("git")
143 .args(&["ls-files", "--others", "--exclude-standard", pattern])
144 .current_dir(project_root)
145 .output();
146
147 if let Ok(output) = output {
148 if output.status.success() {
149 let paths: Vec<PathBuf> = String::from_utf8_lossy(&output.stdout)
150 .lines()
151 .map(|line| project_root.join(line))
152 .collect();
153 untracked_files.extend(paths);
154 }
155 }
156 }
157
158 Ok(untracked_files)
159 }
160
161 fn filesystem_discovery(&self, project_root: &Path) -> Result<Vec<FileMetadata>, SecurityError> {
163 debug!("Using filesystem discovery");
164
165 let walker = WalkDir::new(project_root)
166 .follow_links(false)
167 .max_depth(20)
168 .into_iter()
169 .filter_entry(|entry| {
170 if entry.file_type().is_dir() {
172 let dir_name = entry.file_name().to_string_lossy();
173 return !self.ignored_dirs.contains(dir_name.as_ref());
174 }
175 true
176 });
177
178 let files: Vec<FileMetadata> = walker
179 .par_bridge()
180 .filter_map(|entry| entry.ok())
181 .filter(|entry| entry.file_type().is_file())
182 .filter_map(|entry| self.build_file_metadata(entry.path(), project_root).ok())
183 .filter(|meta| self.should_include_file(meta))
184 .collect();
185
186 Ok(files)
187 }
188
189 fn build_file_metadata(&self, path: &Path, project_root: &Path) -> Result<FileMetadata, std::io::Error> {
191 let metadata = fs::metadata(path)?;
192 let size = metadata.len() as usize;
193 let modified = metadata.modified()?;
194
195 let extension = path.extension()
196 .and_then(|ext| ext.to_str())
197 .map(|s| s.to_lowercase());
198
199 let file_name = path.file_name()
200 .and_then(|n| n.to_str())
201 .unwrap_or("");
202
203 let file_name_lower = file_name.to_lowercase();
204
205 let is_gitignored = if project_root.join(".git").exists() {
207 self.check_gitignore_batch(path, project_root)
208 } else {
209 false
210 };
211
212 let priority_hints = PriorityHints {
214 is_env_file: file_name_lower.starts_with(".env") || file_name_lower.ends_with(".env"),
215 is_config_file: self.is_config_file(&file_name_lower, &extension),
216 is_secret_file: self.is_secret_file(&file_name_lower, path),
217 is_source_file: self.is_source_file(&extension),
218 has_secret_keywords: self.has_secret_keywords(&file_name_lower),
219 };
220
221 Ok(FileMetadata {
222 path: path.to_path_buf(),
223 size,
224 extension,
225 is_gitignored,
226 modified,
227 priority_hints,
228 })
229 }
230
231 fn check_gitignore_batch(&self, path: &Path, project_root: &Path) -> bool {
233 let output = Command::new("git")
235 .args(&["check-ignore", path.to_str().unwrap_or("")])
236 .current_dir(project_root)
237 .output();
238
239 match output {
240 Ok(output) => output.status.success(),
241 Err(_) => false,
242 }
243 }
244
245 fn should_include_file(&self, meta: &FileMetadata) -> bool {
247 if meta.size > self.config.max_file_size {
249 trace!("Skipping large file: {} ({} bytes)", meta.path.display(), meta.size);
250 return false;
251 }
252
253 if let Some(ext) = &meta.extension {
255 let binary_extensions = ["exe", "dll", "so", "dylib", "jpg", "png", "gif", "mp4", "zip", "tar", "gz"];
256 if binary_extensions.contains(&ext.as_str()) {
257 return false;
258 }
259 }
260
261 if self.should_exclude_from_security_scan(meta) {
263 trace!("Excluding from security scan: {}", meta.path.display());
264 return false;
265 }
266
267 if meta.is_critical() {
269 return true;
270 }
271
272 match self.config.scan_mode {
274 ScanMode::Lightning => {
275 false
277 }
278 ScanMode::Fast => {
279 meta.is_priority() || (meta.priority_hints.is_source_file && meta.size < 50_000)
281 }
282 _ => true, }
284 }
285
286 fn should_exclude_from_security_scan(&self, meta: &FileMetadata) -> bool {
288 let path_str = meta.path.to_string_lossy().to_lowercase();
289
290 if self.is_dependency_lock_file(meta) {
292 return true;
293 }
294
295 let exclude_patterns = [
297 ".md", ".txt", ".rst", ".adoc", ".asciidoc",
298 "readme", "changelog", "license", "todo",
299 "roadmap", "contributing", "authors",
300 "/test/", "/tests/", "/spec/", "/specs/",
302 "__test__", "__spec__", ".test.", ".spec.",
303 "_test.", "_spec.", "fixtures", "mocks", "examples",
304 "/docs/", "/doc/", "/documentation/",
306 "frameworks/", "detector", "rules", "patterns",
308 "target/", "build/", "dist/", ".next/", "coverage/",
310 ];
311
312 if exclude_patterns.iter().any(|&pattern| path_str.contains(pattern)) {
314 return true;
315 }
316
317 if let Some(ext) = &meta.extension {
319 let doc_extensions = ["md", "txt", "rst", "adoc", "asciidoc"];
320 if doc_extensions.contains(&ext.as_str()) {
321 return true;
322 }
323 }
324
325 let filename = meta.path.file_name()
327 .and_then(|n| n.to_str())
328 .unwrap_or("")
329 .to_lowercase();
330
331 let doc_filenames = [
332 "readme", "changelog", "license", "authors", "contributing",
333 "roadmap", "todo", "examples", "demo", "sample",
334 ];
335
336 if doc_filenames.iter().any(|&name| filename.contains(name)) {
337 return true;
338 }
339
340 false
341 }
342
343 fn get_ignored_dirs(scan_mode: &ScanMode) -> AHashSet<String> {
345 let mut dirs = AHashSet::new();
346
347 let always_ignore = vec![
349 ".git", "node_modules", "target", "build", "dist", ".next",
350 "coverage", "__pycache__", ".pytest_cache", ".mypy_cache",
351 "vendor", "packages", ".bundle", "bower_components",
352 ];
353
354 for dir in always_ignore {
355 dirs.insert(dir.to_string());
356 }
357
358 if matches!(scan_mode, ScanMode::Lightning | ScanMode::Fast) {
360 let fast_ignore = vec!["test", "tests", "spec", "specs", "docs", "documentation"];
361 for dir in fast_ignore {
362 dirs.insert(dir.to_string());
363 }
364 }
365
366 dirs
367 }
368
369 fn get_secret_keywords() -> Vec<&'static str> {
371 vec![
372 "secret", "key", "token", "password", "credential",
373 "auth", "api", "private", "access", "bearer",
374 ]
375 }
376
377 fn is_config_file(&self, name: &str, extension: &Option<String>) -> bool {
378 let config_extensions = ["json", "yml", "yaml", "toml", "ini", "conf", "config", "xml"];
379 let config_names = ["config", "settings", "configuration", ".env"];
380
381 if let Some(ext) = extension {
382 if config_extensions.contains(&ext.as_str()) {
383 return true;
384 }
385 }
386
387 config_names.iter().any(|&n| name.contains(n))
388 }
389
390 fn is_secret_file(&self, name: &str, path: &Path) -> bool {
391 let secret_patterns = [
392 ".env", ".key", ".pem", ".p12", ".pfx",
393 "credentials", "secret", "private", "cert",
394 ];
395
396 if secret_patterns.iter().any(|&p| name.contains(p)) {
398 return true;
399 }
400
401 let path_str = path.to_string_lossy().to_lowercase();
403 secret_patterns.iter().any(|&p| path_str.contains(p))
404 }
405
406 fn is_source_file(&self, extension: &Option<String>) -> bool {
407 if let Some(ext) = extension {
408 let source_extensions = [
409 "js", "jsx", "ts", "tsx", "py", "java", "kt", "go",
410 "rs", "rb", "php", "cs", "cpp", "c", "h", "swift",
411 "scala", "clj", "ex", "exs",
412 ];
413 source_extensions.contains(&ext.as_str())
414 } else {
415 false
416 }
417 }
418
419 fn has_secret_keywords(&self, name: &str) -> bool {
420 self.secret_keywords.iter().any(|&keyword| name.contains(keyword))
421 }
422
423 fn is_dependency_lock_file(&self, meta: &FileMetadata) -> bool {
425 let filename = meta.path.file_name()
426 .and_then(|n| n.to_str())
427 .unwrap_or("")
428 .to_lowercase();
429
430 let lock_files = [
432 "package-lock.json",
434 "yarn.lock",
435 "pnpm-lock.yaml", "shrinkwrap.yaml",
437 "npm-shrinkwrap.json",
438 "poetry.lock",
440 "pipfile.lock",
441 "pip-lock.txt",
442 "cargo.lock",
444 "go.sum",
446 "go.mod",
447 "gradle.lockfile",
449 "maven-dependency-plugin.log",
450 "gemfile.lock",
452 "composer.lock",
454 "packages.lock.json",
456 "paket.lock",
457 "mix.lock", "pubspec.lock", ];
461
462 lock_files.iter().any(|&pattern| filename == pattern) ||
464 filename.ends_with(".lock") ||
466 filename.ends_with("-lock.json") ||
467 filename.ends_with("-lock.yaml") ||
468 filename.ends_with("-lock.yml") ||
469 filename.contains("shrinkwrap") ||
470 filename.contains("lockfile")
471 }
472}
473
474impl FileMetadata {
475 pub fn is_critical(&self) -> bool {
477 self.priority_hints.is_env_file ||
478 self.priority_hints.is_secret_file ||
479 self.extension.as_deref() == Some("pem") ||
480 self.extension.as_deref() == Some("key")
481 }
482
483 pub fn is_priority(&self) -> bool {
485 self.is_critical() ||
486 self.priority_hints.is_config_file ||
487 self.priority_hints.has_secret_keywords
488 }
489
490 pub fn priority_score(&self) -> u32 {
492 let mut score: u32 = 0;
493
494 if self.priority_hints.is_env_file { score += 1000; }
495 if self.priority_hints.is_secret_file { score += 900; }
496 if self.priority_hints.is_config_file { score += 500; }
497 if self.priority_hints.has_secret_keywords { score += 300; }
498 if !self.is_gitignored { score += 200; }
499 if self.priority_hints.is_source_file { score += 100; }
500
501 if self.size > 1_000_000 { score = score.saturating_sub(100); }
503
504 score
505 }
506}
507
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A fully flagged `.env` file is critical, is priority, and scores high.
    #[test]
    fn test_file_priority_scoring() {
        let meta = FileMetadata {
            path: PathBuf::from(".env"),
            size: 100,
            extension: Some("env".to_string()),
            is_gitignored: false,
            modified: SystemTime::now(),
            priority_hints: PriorityHints {
                is_env_file: true,
                is_config_file: true,
                is_secret_file: true,
                is_source_file: false,
                has_secret_keywords: true,
            },
        };

        assert!(meta.is_critical());
        assert!(meta.is_priority());
        assert!(meta.priority_score() > 2000);
    }

    /// Filesystem discovery finds the env and config files and skips
    /// everything under `node_modules`.
    #[test]
    fn test_file_discovery() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        fs::write(root.join(".env"), "SECRET=123").unwrap();
        fs::write(root.join("config.json"), "{}").unwrap();
        fs::create_dir(root.join("node_modules")).unwrap();
        fs::write(root.join("node_modules/test.js"), "code").unwrap();

        let config = DiscoveryConfig {
            use_git: false,
            max_file_size: 1024 * 1024,
            priority_extensions: vec!["env".to_string()],
            scan_mode: ScanMode::Fast,
        };

        let discovery = FileDiscovery::new(config);
        let files = discovery.discover_files(root).unwrap();

        assert_eq!(files.len(), 2, "unexpected discovery result: {:?}", files);
        assert!(files.iter().any(|f| f.path.ends_with(".env")));
        assert!(files.iter().any(|f| f.path.ends_with("config.json")));
    }
}