Skip to main content

seshat_scanner/
discovery.rs

//! File discovery that respects `.gitignore` rules.
//!
//! Uses the [`ignore`] crate's [`WalkBuilder`] for native `.gitignore`
//! support and configurable exclusion patterns.
5
6use std::collections::HashSet;
7use std::path::{Path, PathBuf};
8
9use ignore::WalkBuilder;
10use seshat_core::{Language, ScanConfig};
11
12use crate::ScanError;
13
14/// A discovered source file ready for parsing.
15#[derive(Debug, Clone)]
16pub struct DiscoveredFile {
17    /// Path relative to the scan root (the project_root passed to
18    /// [`discover_files`]). Stored relative so the IR is identifies the
19    /// same logical file regardless of which worktree directory the scan
20    /// was invoked from — see Bug #3 in the merge-aware-decisions branch
21    /// notes. Callers that need to read the file from disk must
22    /// `root.join(&df.path)`.
23    pub path: PathBuf,
24    /// Detected programming language based on file extension.
25    pub language: Language,
26    /// File size in bytes.
27    pub size_bytes: u64,
28}
29
30/// Result of the file discovery phase.
31#[derive(Debug, Clone)]
32pub struct DiscoveryResult {
33    /// The discovered source files.
34    pub files: Vec<DiscoveredFile>,
35    /// Submodule paths that were excluded from discovery.
36    /// Root discovery always excludes submodule dirs (they get their own DBs).
37    /// Empty when there is no `.gitmodules`.
38    pub excluded_submodules: Vec<String>,
39}
40
41/// Discover all recognised source files under `root`, respecting `.gitignore`,
42/// hidden-file conventions, and the supplied [`ScanConfig`].
43///
44/// # Behaviour
45///
46/// - Uses [`WalkBuilder`] for native `.gitignore` support (including nested
47///   `.gitignore` files).
48/// - `.git/` directory is always excluded.
49/// - Hidden files and directories (starting with `.`) are excluded by default.
50/// - Custom exclude paths from [`ScanConfig::exclude_paths`] are applied
51///   as additional override globs.
52/// - Files exceeding [`ScanConfig::max_file_size_kb`] are skipped with a
53///   [`tracing::warn`].
54/// - Files with unrecognised extensions are silently skipped.
55///
56/// # Errors
57///
58/// Returns [`ScanError::DiscoveryError`] when the walker itself fails to
59/// initialise or encounters a fatal filesystem error.
60pub fn discover_files(root: &Path, config: &ScanConfig) -> Result<DiscoveryResult, ScanError> {
61    let max_size_bytes = config.max_file_size_kb * 1024;
62
63    // Root discovery ALWAYS excludes submodule directories — they get their own
64    // separate DBs. The `exclude_submodules` config flag controls whether those
65    // separate submodule scans happen at all, not whether the root walk includes them.
66    let excluded_submodules = detect_submodule_paths(root);
67
68    // Build a set of submodule directory names for the filter_entry closure.
69    // We need to exclude these directories during the walk, not just report them.
70    let submodule_dirs: HashSet<std::ffi::OsString> = excluded_submodules
71        .iter()
72        .filter_map(|p| {
73            // Use the last component of the submodule path for directory matching.
74            // For nested submodules like "libs/shared", we match on the full
75            // relative path in the walker instead.
76            Path::new(p).file_name().map(|n| n.to_os_string())
77        })
78        .collect();
79
80    // Also keep full relative paths for nested submodules.
81    let submodule_rel_paths: HashSet<PathBuf> =
82        excluded_submodules.iter().map(PathBuf::from).collect();
83
84    let root_for_closure = root.to_path_buf();
85
86    let mut builder = WalkBuilder::new(root);
87    builder
88        // Native .gitignore support is on by default in WalkBuilder.
89        .hidden(true) // skip hidden files/dirs
90        .git_ignore(true) // respect .gitignore
91        .git_global(true) // respect global gitignore
92        .git_exclude(true) // respect .git/info/exclude
93        .filter_entry(move |entry| {
94            // Always skip .git directory itself
95            if entry.file_type().is_some_and(|ft| ft.is_dir()) {
96                if entry.file_name() == ".git" {
97                    return false;
98                }
99                // Skip submodule directories when not included.
100                if !submodule_dirs.is_empty() {
101                    // Check by relative path (handles nested submodules).
102                    if let Ok(rel) = entry.path().strip_prefix(&root_for_closure) {
103                        if submodule_rel_paths.contains(rel) {
104                            return false;
105                        }
106                    }
107                    // Fallback: check by directory name (top-level submodules).
108                    if submodule_dirs.contains(&entry.file_name().to_os_string()) {
109                        if let Ok(rel) = entry.path().strip_prefix(&root_for_closure) {
110                            if submodule_rel_paths.contains(rel) {
111                                return false;
112                            }
113                        }
114                    }
115                }
116            }
117            true
118        });
119
120    // Apply custom exclude paths as WalkBuilder overrides.
121    // The ignore crate's overrides act like a `.gitignore` on top of everything.
122    if !config.exclude_paths.is_empty() {
123        let mut overrides = ignore::overrides::OverrideBuilder::new(root);
124        for pattern in &config.exclude_paths {
125            // Negate the pattern so matching entries are *excluded*.
126            let negated = format!("!{pattern}");
127            overrides
128                .add(&negated)
129                .map_err(|e| ScanError::DiscoveryError {
130                    path: root.to_path_buf(),
131                    reason: format!("Invalid exclude pattern '{pattern}': {e}"),
132                })?;
133        }
134        let built = overrides.build().map_err(|e| ScanError::DiscoveryError {
135            path: root.to_path_buf(),
136            reason: format!("Failed to build override globs: {e}"),
137        })?;
138        builder.overrides(built);
139    }
140
141    let mut discovered = Vec::new();
142
143    for entry_result in builder.build() {
144        let entry = match entry_result {
145            Ok(e) => e,
146            Err(err) => {
147                tracing::warn!("File walk error: {err}");
148                continue;
149            }
150        };
151
152        // Only process regular files.
153        let Some(file_type) = entry.file_type() else {
154            continue;
155        };
156        if !file_type.is_file() {
157            continue;
158        }
159
160        let path = entry.path();
161
162        // Detect language from extension; skip unrecognised files.
163        let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
164            continue;
165        };
166        let Some(language) = Language::from_extension(ext) else {
167            continue;
168        };
169
170        // Check file size.
171        let size_bytes = entry.metadata().map(|m| m.len()).unwrap_or(0);
172        if size_bytes > max_size_bytes {
173            tracing::warn!(
174                path = %path.display(),
175                size_kb = size_bytes / 1024,
176                limit_kb = config.max_file_size_kb,
177                "Skipping file exceeding size limit"
178            );
179            continue;
180        }
181
182        // Strip the scan root so the stored path is identical regardless of
183        // which worktree the scan was invoked from. WalkBuilder yields paths
184        // relative to its starting point most of the time, but
185        // `path.strip_prefix(root)` is safe in either case (relative input
186        // returns the same path; absolute input becomes relative to root).
187        // Fall back to the raw path when the prefix doesn't match (defensive
188        // — should not happen because WalkBuilder is rooted at `root`).
189        let relative = path.strip_prefix(root).unwrap_or(path).to_path_buf();
190
191        discovered.push(DiscoveredFile {
192            path: relative,
193            language,
194            size_bytes,
195        });
196    }
197
198    Ok(DiscoveryResult {
199        files: discovered,
200        excluded_submodules,
201    })
202}
203
/// Parse `.gitmodules` to extract submodule paths.
///
/// Returns the values of all `path = ...` entries, in file order, as
/// relative path strings. The key must be exactly `path` (compared
/// ASCII-case-insensitively, since git config variable names are
/// case-insensitive); keys that merely start with `path` are ignored.
/// Entries with an empty value are skipped.
///
/// If `.gitmodules` doesn't exist or cannot be read, returns an empty vec.
pub fn detect_submodule_paths(root: &Path) -> Vec<String> {
    let Ok(content) = std::fs::read_to_string(root.join(".gitmodules")) else {
        return Vec::new();
    };

    content
        .lines()
        // Only `key = value` lines matter; section headers have no '='.
        .filter_map(|line| line.split_once('='))
        .filter(|(key, _)| key.trim().eq_ignore_ascii_case("path"))
        .map(|(_, value)| value.trim())
        .filter(|value| !value.is_empty())
        .map(str::to_owned)
        .collect()
}
229
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Helper: create a temporary project directory with the given file structure.
    fn setup_temp_project(files: &[&str]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("create temp dir");
        for file in files {
            let path = dir.path().join(file);
            if let Some(parent) = path.parent() {
                fs::create_dir_all(parent).expect("create parent dirs");
            }
            fs::write(&path, "// placeholder").expect("write file");
        }
        dir
    }

    /// Helper: the final path component of every discovered file, sorted so
    /// assertions do not depend on walk order.
    fn sorted_file_names(result: &DiscoveryResult) -> Vec<String> {
        let mut names: Vec<String> = result
            .files
            .iter()
            .map(|f| f.path.file_name().unwrap().to_string_lossy().into_owned())
            .collect();
        names.sort();
        names
    }

    #[test]
    fn discovers_recognised_extensions() {
        let dir = setup_temp_project(&[
            "src/main.rs",
            "src/lib.ts",
            "app/index.js",
            "scripts/run.py",
            "README.md",        // not recognised
            "data/config.yaml", // not recognised
        ]);

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(
            sorted_file_names(&result),
            vec!["index.js", "lib.ts", "main.rs", "run.py"]
        );
    }

    #[test]
    fn skips_hidden_files_and_directories() {
        let dir = setup_temp_project(&["src/main.rs", ".hidden/secret.rs", "src/.hidden_file.py"]);

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].path.ends_with("src/main.rs"));
    }

    #[test]
    fn respects_gitignore() {
        let dir = setup_temp_project(&[
            "src/main.rs",
            "target/debug/build.rs",
            "node_modules/pkg/index.js",
        ]);

        // Create a .gitignore that excludes target/ and node_modules/
        fs::write(dir.path().join(".gitignore"), "target/\nnode_modules/\n").unwrap();

        // WalkBuilder needs a git repo to respect .gitignore
        fs::create_dir(dir.path().join(".git")).unwrap();

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].path.ends_with("src/main.rs"));
    }

    #[test]
    fn respects_custom_exclude_paths() {
        let dir = setup_temp_project(&["src/main.rs", "src/generated.rs", "tests/test_main.rs"]);

        let config = ScanConfig {
            exclude_paths: vec!["tests/**".to_string()],
            ..ScanConfig::default()
        };

        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(sorted_file_names(&result), vec!["generated.rs", "main.rs"]);
    }

    #[test]
    fn skips_files_exceeding_size_limit() {
        let dir = setup_temp_project(&["src/small.rs"]);

        // Create a file that exceeds 1 KB limit
        let big_file = dir.path().join("src/big.rs");
        let big_content = "x".repeat(2048); // 2 KB
        fs::write(&big_file, big_content).unwrap();

        let config = ScanConfig {
            max_file_size_kb: 1,
            ..ScanConfig::default()
        };

        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].path.ends_with("src/small.rs"));
    }

    #[test]
    fn skips_unrecognised_extensions() {
        let dir = setup_temp_project(&[
            "src/main.rs",
            "src/style.css",
            "src/page.html",
            "src/data.json",
        ]);

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].path.ends_with("src/main.rs"));
    }

    #[test]
    fn detected_language_matches_extension() {
        let dir = setup_temp_project(&[
            "a.rs", "b.ts", "c.tsx", "d.js", "e.jsx", "f.mjs", "g.cjs", "h.py",
        ]);

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        for f in &result.files {
            let ext = f.path.extension().unwrap().to_str().unwrap();
            assert_eq!(
                f.language,
                Language::from_extension(ext).unwrap(),
                "Mismatch for extension {ext}"
            );
        }
        assert_eq!(result.files.len(), 8);
    }

    #[test]
    fn discovered_file_has_size() {
        let dir = setup_temp_project(&["src/main.rs"]);

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].size_bytes > 0);
    }

    #[test]
    fn discovered_paths_are_relative_to_root() {
        let dir = setup_temp_project(&["src/main.rs"]);

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.files.len(), 1);
        // The stored path must not embed the (worktree-specific) scan root...
        assert_eq!(result.files[0].path, Path::new("src/main.rs"));
        // ...and joining it back onto the root must locate the real file.
        assert!(dir.path().join(&result.files[0].path).is_file());
    }

    #[test]
    fn empty_directory_returns_empty_vec() {
        let dir = tempfile::tempdir().expect("create temp dir");

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert!(result.files.is_empty());
    }

    #[test]
    fn git_directory_always_excluded() {
        let dir = setup_temp_project(&["src/main.rs"]);

        // Create a .git dir with a Rust file inside (should be ignored)
        let git_dir = dir.path().join(".git");
        fs::create_dir_all(&git_dir).unwrap();
        fs::write(git_dir.join("hook.rs"), "// git hook").unwrap();

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.files.len(), 1);
        assert!(result.files[0].path.ends_with("src/main.rs"));
    }

    // -- Submodule tests ---------------------------------------------------

    #[test]
    fn detect_submodule_paths_parses_gitmodules() {
        let dir = tempfile::tempdir().expect("create temp dir");
        fs::write(
            dir.path().join(".gitmodules"),
            "[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/frontend.git\n\
             [submodule \"libs/shared\"]\n\tpath = libs/shared\n\turl = https://example.com/shared.git\n",
        )
        .unwrap();

        let paths = detect_submodule_paths(dir.path());
        assert_eq!(paths, vec!["frontend", "libs/shared"]);
    }

    #[test]
    fn detect_submodule_paths_no_gitmodules() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let paths = detect_submodule_paths(dir.path());
        assert!(paths.is_empty());
    }

    #[test]
    fn excluded_submodules_reported_when_gitmodules_present() {
        let dir = setup_temp_project(&["src/main.rs"]);
        fs::create_dir_all(dir.path().join(".git")).unwrap();
        fs::write(
            dir.path().join(".gitmodules"),
            "[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/fe.git\n",
        )
        .unwrap();

        let config = ScanConfig::default(); // exclude_submodules = false
        let result = discover_files(dir.path(), &config).unwrap();

        // Root discovery always excludes submodule dirs (they get their own DBs).
        assert_eq!(result.excluded_submodules, vec!["frontend"]);
    }

    #[test]
    fn submodule_dirs_always_excluded_from_root_walk() {
        let dir = setup_temp_project(&["src/main.rs", "frontend/src/app.ts"]);
        fs::create_dir_all(dir.path().join(".git")).unwrap();
        fs::write(
            dir.path().join(".gitmodules"),
            "[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/fe.git\n",
        )
        .unwrap();

        // Even with exclude_submodules = false (default), root discovery
        // excludes submodule dirs. They get their own separate scans.
        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.excluded_submodules, vec!["frontend"]);
        // frontend/src/app.ts must NOT appear, while files outside the
        // submodule must still be discovered.
        assert_eq!(
            sorted_file_names(&result),
            vec!["main.rs"],
            "submodule files should be excluded from root discovery"
        );
    }

    #[test]
    fn nested_submodule_excluded_by_full_relative_path() {
        let dir = setup_temp_project(&[
            "src/main.rs",
            "libs/shared/src/lib.ts", // inside the nested submodule
            "libs/local/util.ts",     // sibling of the submodule
            "tools/shared/helper.py", // same dir name, different path
        ]);
        fs::write(
            dir.path().join(".gitmodules"),
            "[submodule \"libs/shared\"]\n\tpath = libs/shared\n\turl = https://example.com/shared.git\n",
        )
        .unwrap();

        let config = ScanConfig::default();
        let result = discover_files(dir.path(), &config).unwrap();

        assert_eq!(result.excluded_submodules, vec!["libs/shared"]);
        // Only the directory at the exact submodule path is pruned; an
        // unrelated directory that happens to be called `shared` is kept.
        assert_eq!(
            sorted_file_names(&result),
            vec!["helper.py", "main.rs", "util.ts"]
        );
    }
}
485}