1use std::collections::HashSet;
7use std::path::{Path, PathBuf};
8
9use ignore::WalkBuilder;
10use seshat_core::{Language, ScanConfig};
11
12use crate::ScanError;
13
/// A single source file accepted by [`discover_files`].
#[derive(Debug, Clone)]
pub struct DiscoveredFile {
    /// Location of the file. `discover_files` stores it relative to the scan
    /// root, falling back to the path as walked when the root prefix cannot be
    /// stripped.
    pub path: PathBuf,
    /// Language resolved from the file extension via `Language::from_extension`.
    pub language: Language,
    /// Size in bytes taken from the directory entry's metadata; `discover_files`
    /// records `0` when the metadata cannot be read.
    pub size_bytes: u64,
}
29
/// Everything produced by one call to [`discover_files`].
#[derive(Debug, Clone)]
pub struct DiscoveryResult {
    /// Files that passed the extension and size checks during the walk.
    pub files: Vec<DiscoveredFile>,
    /// Submodule paths returned by [`detect_submodule_paths`] for the scan root;
    /// directories at these root-relative paths are pruned from the walk.
    pub excluded_submodules: Vec<String>,
}
40
41pub fn discover_files(root: &Path, config: &ScanConfig) -> Result<DiscoveryResult, ScanError> {
61 let max_size_bytes = config.max_file_size_kb * 1024;
62
63 let excluded_submodules = detect_submodule_paths(root);
67
68 let submodule_dirs: HashSet<std::ffi::OsString> = excluded_submodules
71 .iter()
72 .filter_map(|p| {
73 Path::new(p).file_name().map(|n| n.to_os_string())
77 })
78 .collect();
79
80 let submodule_rel_paths: HashSet<PathBuf> =
82 excluded_submodules.iter().map(PathBuf::from).collect();
83
84 let root_for_closure = root.to_path_buf();
85
86 let mut builder = WalkBuilder::new(root);
87 builder
88 .hidden(true) .git_ignore(true) .git_global(true) .git_exclude(true) .filter_entry(move |entry| {
94 if entry.file_type().is_some_and(|ft| ft.is_dir()) {
96 if entry.file_name() == ".git" {
97 return false;
98 }
99 if !submodule_dirs.is_empty() {
101 if let Ok(rel) = entry.path().strip_prefix(&root_for_closure) {
103 if submodule_rel_paths.contains(rel) {
104 return false;
105 }
106 }
107 if submodule_dirs.contains(&entry.file_name().to_os_string()) {
109 if let Ok(rel) = entry.path().strip_prefix(&root_for_closure) {
110 if submodule_rel_paths.contains(rel) {
111 return false;
112 }
113 }
114 }
115 }
116 }
117 true
118 });
119
120 if !config.exclude_paths.is_empty() {
123 let mut overrides = ignore::overrides::OverrideBuilder::new(root);
124 for pattern in &config.exclude_paths {
125 let negated = format!("!{pattern}");
127 overrides
128 .add(&negated)
129 .map_err(|e| ScanError::DiscoveryError {
130 path: root.to_path_buf(),
131 reason: format!("Invalid exclude pattern '{pattern}': {e}"),
132 })?;
133 }
134 let built = overrides.build().map_err(|e| ScanError::DiscoveryError {
135 path: root.to_path_buf(),
136 reason: format!("Failed to build override globs: {e}"),
137 })?;
138 builder.overrides(built);
139 }
140
141 let mut discovered = Vec::new();
142
143 for entry_result in builder.build() {
144 let entry = match entry_result {
145 Ok(e) => e,
146 Err(err) => {
147 tracing::warn!("File walk error: {err}");
148 continue;
149 }
150 };
151
152 let Some(file_type) = entry.file_type() else {
154 continue;
155 };
156 if !file_type.is_file() {
157 continue;
158 }
159
160 let path = entry.path();
161
162 let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
164 continue;
165 };
166 let Some(language) = Language::from_extension(ext) else {
167 continue;
168 };
169
170 let size_bytes = entry.metadata().map(|m| m.len()).unwrap_or(0);
172 if size_bytes > max_size_bytes {
173 tracing::warn!(
174 path = %path.display(),
175 size_kb = size_bytes / 1024,
176 limit_kb = config.max_file_size_kb,
177 "Skipping file exceeding size limit"
178 );
179 continue;
180 }
181
182 let relative = path.strip_prefix(root).unwrap_or(path).to_path_buf();
190
191 discovered.push(DiscoveredFile {
192 path: relative,
193 language,
194 size_bytes,
195 });
196 }
197
198 Ok(DiscoveryResult {
199 files: discovered,
200 excluded_submodules,
201 })
202}
203
/// Reads `root/.gitmodules` and returns the value of every `path` entry, in
/// file order.
///
/// A line contributes a path when the text before its first `=` is exactly the
/// key `path` (surrounding whitespace ignored, compared case-insensitively as
/// git does for config keys) and the trimmed value after the `=` is non-empty.
/// Keys that merely begin with `path` are not treated as submodule paths.
///
/// Returns an empty vector when the file does not exist or cannot be read as
/// UTF-8 text.
pub fn detect_submodule_paths(root: &Path) -> Vec<String> {
    let Ok(content) = std::fs::read_to_string(root.join(".gitmodules")) else {
        return Vec::new();
    };

    content
        .lines()
        .filter_map(|line| line.split_once('='))
        .filter(|(key, _)| key.trim().eq_ignore_ascii_case("path"))
        .map(|(_, value)| value.trim())
        .filter(|value| !value.is_empty())
        .map(String::from)
        .collect()
}
229
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// `.gitmodules` content declaring a single submodule at `frontend`.
    const FRONTEND_GITMODULES: &str =
        "[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/fe.git\n";

    /// Creates a temporary project tree containing `files`, each holding a
    /// one-line placeholder, and returns the owning temp-dir guard.
    fn setup_temp_project(files: &[&str]) -> tempfile::TempDir {
        let project = tempfile::tempdir().expect("create temp dir");
        for relative in files {
            let target = project.path().join(relative);
            if let Some(parent) = target.parent() {
                fs::create_dir_all(parent).expect("create parent dirs");
            }
            fs::write(&target, "// placeholder").expect("write file");
        }
        project
    }

    /// Runs discovery on `root` with the default configuration.
    fn scan_with_defaults(root: &Path) -> DiscoveryResult {
        discover_files(root, &ScanConfig::default()).unwrap()
    }

    /// File names (last path component) of all discovered files, in walk order.
    fn file_names(result: &DiscoveryResult) -> Vec<String> {
        result
            .files
            .iter()
            .map(|file| file.path.file_name().unwrap().to_string_lossy().into_owned())
            .collect()
    }

    /// Same as [`file_names`], sorted for order-independent comparison.
    fn sorted_file_names(result: &DiscoveryResult) -> Vec<String> {
        let mut names = file_names(result);
        names.sort();
        names
    }

    /// Asserts that exactly one file was discovered and its path ends with `suffix`.
    fn assert_single_file(result: &DiscoveryResult, suffix: &str) {
        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].path.ends_with(suffix));
    }

    #[test]
    fn discovers_recognised_extensions() {
        let project = setup_temp_project(&[
            "src/main.rs",
            "src/lib.ts",
            "app/index.js",
            "scripts/run.py",
            "README.md",
            "data/config.yaml",
        ]);

        let result = scan_with_defaults(project.path());

        assert_eq!(
            sorted_file_names(&result),
            vec!["index.js", "lib.ts", "main.rs", "run.py"]
        );
    }

    #[test]
    fn skips_hidden_files_and_directories() {
        let project =
            setup_temp_project(&["src/main.rs", ".hidden/secret.rs", "src/.hidden_file.py"]);

        let result = scan_with_defaults(project.path());

        assert_single_file(&result, "src/main.rs");
    }

    #[test]
    fn respects_gitignore() {
        let project = setup_temp_project(&[
            "src/main.rs",
            "target/debug/build.rs",
            "node_modules/pkg/index.js",
        ]);
        let root = project.path();

        fs::write(root.join(".gitignore"), "target/\nnode_modules/\n").unwrap();
        fs::create_dir(root.join(".git")).unwrap();

        let result = scan_with_defaults(root);

        assert_single_file(&result, "src/main.rs");
    }

    #[test]
    fn respects_custom_exclude_paths() {
        let project =
            setup_temp_project(&["src/main.rs", "src/generated.rs", "tests/test_main.rs"]);

        let config = ScanConfig {
            exclude_paths: vec!["tests/**".to_string()],
            ..ScanConfig::default()
        };
        let result = discover_files(project.path(), &config).unwrap();

        assert_eq!(sorted_file_names(&result), vec!["generated.rs", "main.rs"]);
    }

    #[test]
    fn skips_files_exceeding_size_limit() {
        let project = setup_temp_project(&["src/small.rs"]);

        // 2 KiB of content against a 1 KiB limit.
        fs::write(project.path().join("src/big.rs"), "x".repeat(2048)).unwrap();

        let config = ScanConfig {
            max_file_size_kb: 1,
            ..ScanConfig::default()
        };
        let result = discover_files(project.path(), &config).unwrap();

        assert_single_file(&result, "src/small.rs");
    }

    #[test]
    fn skips_unrecognised_extensions() {
        let project = setup_temp_project(&[
            "src/main.rs",
            "src/style.css",
            "src/page.html",
            "src/data.json",
        ]);

        let result = scan_with_defaults(project.path());

        assert_single_file(&result, "src/main.rs");
    }

    #[test]
    fn detected_language_matches_extension() {
        let project = setup_temp_project(&[
            "a.rs", "b.ts", "c.tsx", "d.js", "e.jsx", "f.mjs", "g.cjs", "h.py",
        ]);

        let result = scan_with_defaults(project.path());

        for file in &result.files {
            let ext = file.path.extension().unwrap().to_str().unwrap();
            assert_eq!(
                file.language,
                Language::from_extension(ext).unwrap(),
                "Mismatch for extension {ext}"
            );
        }
        assert_eq!(result.files.len(), 8);
    }

    #[test]
    fn discovered_file_has_size() {
        let project = setup_temp_project(&["src/main.rs"]);

        let result = scan_with_defaults(project.path());

        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].size_bytes > 0);
    }

    #[test]
    fn empty_directory_returns_empty_vec() {
        let project = tempfile::tempdir().expect("create temp dir");

        let result = scan_with_defaults(project.path());

        assert!(result.files.is_empty());
    }

    #[test]
    fn git_directory_always_excluded() {
        let project = setup_temp_project(&["src/main.rs"]);

        let git_dir = project.path().join(".git");
        fs::create_dir_all(&git_dir).unwrap();
        fs::write(git_dir.join("hook.rs"), "// git hook").unwrap();

        let result = scan_with_defaults(project.path());

        assert_single_file(&result, "src/main.rs");
    }

    #[test]
    fn detect_submodule_paths_parses_gitmodules() {
        let project = tempfile::tempdir().expect("create temp dir");
        let gitmodules = project.path().join(".gitmodules");
        fs::write(
            gitmodules,
            "[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/frontend.git\n\
             [submodule \"libs/shared\"]\n\tpath = libs/shared\n\turl = https://example.com/shared.git\n",
        )
        .unwrap();

        let detected = detect_submodule_paths(project.path());

        assert_eq!(detected, vec!["frontend", "libs/shared"]);
    }

    #[test]
    fn detect_submodule_paths_no_gitmodules() {
        let project = tempfile::tempdir().expect("create temp dir");

        assert!(detect_submodule_paths(project.path()).is_empty());
    }

    #[test]
    fn excluded_submodules_reported_when_gitmodules_present() {
        let project = setup_temp_project(&["src/main.rs"]);
        let root = project.path();
        fs::create_dir_all(root.join(".git")).unwrap();
        fs::write(root.join(".gitmodules"), FRONTEND_GITMODULES).unwrap();

        let result = scan_with_defaults(root);

        assert_eq!(result.excluded_submodules, vec!["frontend"]);
    }

    #[test]
    fn submodule_dirs_always_excluded_from_root_walk() {
        let project = setup_temp_project(&["src/main.rs", "frontend/src/app.ts"]);
        let root = project.path();
        fs::create_dir_all(root.join(".git")).unwrap();
        fs::write(root.join(".gitmodules"), FRONTEND_GITMODULES).unwrap();

        let result = scan_with_defaults(root);

        assert_eq!(result.excluded_submodules, vec!["frontend"]);
        let names = file_names(&result);
        assert!(
            !names.contains(&"app.ts".to_string()),
            "submodule files should be excluded from root discovery"
        );
    }
}