use std::collections::HashSet;
use std::path::{Path, PathBuf};
use ignore::WalkBuilder;
use seshat_core::{Language, ScanConfig};
use crate::ScanError;
/// A single source file found by [`discover_files`].
#[derive(Debug, Clone)]
pub struct DiscoveredFile {
/// Location of the file. `discover_files` stores it relative to the scan
/// root whenever the root prefix can be stripped, otherwise as yielded by
/// the walker.
pub path: PathBuf,
/// Language resolved from the file extension via `Language::from_extension`.
pub language: Language,
/// File size in bytes taken from the entry's metadata; `discover_files`
/// records `0` when the metadata could not be read.
pub size_bytes: u64,
}
/// Outcome of a [`discover_files`] run.
#[derive(Debug, Clone)]
pub struct DiscoveryResult {
/// Every file that passed the ignore rules, extension check and size limit.
pub files: Vec<DiscoveredFile>,
/// Submodule paths read from `.gitmodules` (as written there); directories
/// at these paths are pruned from the walk.
pub excluded_submodules: Vec<String>,
}
/// Walks `root` and collects every file whose extension maps to a known [`Language`].
///
/// The walk is configured to skip hidden entries and to honour `.gitignore`,
/// the global gitignore and the repository exclude file. In addition it prunes:
///
/// - any directory named `.git`,
/// - every directory whose path relative to `root` is listed as a submodule
///   `path` in `root/.gitmodules` (see [`detect_submodule_paths`]),
/// - anything matching a glob in `config.exclude_paths`.
///
/// Files larger than `config.max_file_size_kb` KiB are skipped with a warning.
/// Paths in the result are relative to `root` when the prefix can be stripped.
///
/// # Errors
///
/// Returns [`ScanError::DiscoveryError`] when an entry of `config.exclude_paths`
/// is not a valid glob or the override set cannot be built. Errors for
/// individual entries during the walk are logged and the entry is skipped.
pub fn discover_files(root: &Path, config: &ScanConfig) -> Result<DiscoveryResult, ScanError> {
    // Saturate instead of overflowing: an absurdly large KiB limit then simply
    // means "no effective limit" rather than a panic (debug) or a wrapped,
    // tiny limit (release).
    let max_size_bytes = config.max_file_size_kb.saturating_mul(1024);

    let excluded_submodules = detect_submodule_paths(root);
    let submodule_rel_paths: HashSet<PathBuf> =
        excluded_submodules.iter().map(PathBuf::from).collect();
    // The filter closure must own its captures, so it gets its own copy of the root.
    let root_for_closure = root.to_path_buf();

    let mut builder = WalkBuilder::new(root);
    builder
        .hidden(true)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .filter_entry(move |entry| {
            // Only directories are pruned here; files are filtered in the loop below.
            if !entry.file_type().is_some_and(|ft| ft.is_dir()) {
                return true;
            }
            if entry.file_name() == ".git" {
                return false;
            }
            if submodule_rel_paths.is_empty() {
                return true;
            }
            // A directory is a submodule when its root-relative path is one of
            // the paths declared in `.gitmodules`.
            match entry.path().strip_prefix(&root_for_closure) {
                Ok(rel) => !submodule_rel_paths.contains(rel),
                Err(_) => true,
            }
        });

    if !config.exclude_paths.is_empty() {
        let mut overrides = ignore::overrides::OverrideBuilder::new(root);
        for pattern in &config.exclude_paths {
            // In an override set a leading `!` marks the glob as "ignore".
            let negated = format!("!{pattern}");
            overrides
                .add(&negated)
                .map_err(|e| ScanError::DiscoveryError {
                    path: root.to_path_buf(),
                    reason: format!("Invalid exclude pattern '{pattern}': {e}"),
                })?;
        }
        let built = overrides.build().map_err(|e| ScanError::DiscoveryError {
            path: root.to_path_buf(),
            reason: format!("Failed to build override globs: {e}"),
        })?;
        builder.overrides(built);
    }

    let mut discovered = Vec::new();
    for entry_result in builder.build() {
        let entry = match entry_result {
            Ok(e) => e,
            Err(err) => {
                tracing::warn!("File walk error: {err}");
                continue;
            }
        };

        let Some(file_type) = entry.file_type() else {
            continue;
        };
        if !file_type.is_file() {
            continue;
        }

        let path = entry.path();
        let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
            continue;
        };
        let Some(language) = Language::from_extension(ext) else {
            continue;
        };

        // Best effort: a file whose metadata cannot be read is still reported
        // (with size 0), but the failure is no longer silent.
        let size_bytes = match entry.metadata() {
            Ok(meta) => meta.len(),
            Err(err) => {
                tracing::warn!(
                    path = %path.display(),
                    error = %err,
                    "Could not read file metadata; treating size as 0"
                );
                0
            }
        };
        if size_bytes > max_size_bytes {
            tracing::warn!(
                path = %path.display(),
                size_kb = size_bytes / 1024,
                limit_kb = config.max_file_size_kb,
                "Skipping file exceeding size limit"
            );
            continue;
        }

        let relative = path.strip_prefix(root).unwrap_or(path).to_path_buf();
        discovered.push(DiscoveredFile {
            path: relative,
            language,
            size_bytes,
        });
    }

    Ok(DiscoveryResult {
        files: discovered,
        excluded_submodules,
    })
}
/// Reads `root/.gitmodules` and returns the value of every `path = ...` entry,
/// in file order.
///
/// Parsing is deliberately line-based: each line is split at its first `=`,
/// and the entry is kept when the trimmed key is exactly `path` (compared
/// ASCII case-insensitively) and the trimmed value is non-empty. Keys that
/// merely start with `path` are ignored. Section headers are not interpreted
/// and quoted values are returned as written.
///
/// Returns an empty vector when the file is missing or cannot be read as UTF-8.
pub fn detect_submodule_paths(root: &Path) -> Vec<String> {
    let Ok(content) = std::fs::read_to_string(root.join(".gitmodules")) else {
        return Vec::new();
    };

    content
        .lines()
        .filter_map(|line| line.split_once('='))
        // Exact key match: `pathspec = ...` or similar must not count as a path.
        .filter(|(key, _)| key.trim().eq_ignore_ascii_case("path"))
        .map(|(_, value)| value.trim())
        .filter(|value| !value.is_empty())
        .map(str::to_owned)
        .collect()
}
// Unit tests for file discovery and `.gitmodules` parsing. Each test builds a
// throwaway project in a temporary directory and runs the real walker over it.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
// Creates a temp directory containing every path in `files` (parent
// directories included); each file gets the same short placeholder content.
fn setup_temp_project(files: &[&str]) -> tempfile::TempDir {
let dir = tempfile::tempdir().expect("create temp dir");
for file in files {
let path = dir.path().join(file);
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).expect("create parent dirs");
}
fs::write(&path, "// placeholder").expect("write file");
}
dir
}
// Only files with a recognised source extension are reported; the `.md`
// and `.yaml` fixtures are expected to be left out.
#[test]
fn discovers_recognised_extensions() {
let dir = setup_temp_project(&[
"src/main.rs",
"src/lib.ts",
"app/index.js",
"scripts/run.py",
"README.md", "data/config.yaml", ]);
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
let mut names: Vec<String> = result
.files
.iter()
.map(|f| f.path.file_name().unwrap().to_string_lossy().to_string())
.collect();
// Walk order is not asserted, so compare sorted names.
names.sort();
assert_eq!(names, vec!["index.js", "lib.ts", "main.rs", "run.py"]);
}
// Dot-prefixed directories and dot-prefixed files must both be skipped.
#[test]
fn skips_hidden_files_and_directories() {
let dir = setup_temp_project(&["src/main.rs", ".hidden/secret.rs", "src/.hidden_file.py"]);
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.files.len(), 1);
assert!(result.files[0].path.ends_with("src/main.rs"));
}
// Directories listed in `.gitignore` are not walked.
#[test]
fn respects_gitignore() {
let dir = setup_temp_project(&[
"src/main.rs",
"target/debug/build.rs",
"node_modules/pkg/index.js",
]);
fs::write(dir.path().join(".gitignore"), "target/\nnode_modules/\n").unwrap();
// NOTE(review): the empty `.git` directory presumably makes the walker treat
// the temp dir as a git repository so that `.gitignore` is applied — confirm
// against the `ignore` crate's `require_git` default.
fs::create_dir(dir.path().join(".git")).unwrap();
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.files.len(), 1);
assert!(result.files[0].path.ends_with("src/main.rs"));
}
// Globs in `ScanConfig::exclude_paths` remove matching files from the result.
#[test]
fn respects_custom_exclude_paths() {
let dir = setup_temp_project(&["src/main.rs", "src/generated.rs", "tests/test_main.rs"]);
let config = ScanConfig {
exclude_paths: vec!["tests/**".to_string()],
..ScanConfig::default()
};
let result = discover_files(dir.path(), &config).unwrap();
let mut names: Vec<String> = result
.files
.iter()
.map(|f| f.path.file_name().unwrap().to_string_lossy().to_string())
.collect();
names.sort();
assert_eq!(names, vec!["generated.rs", "main.rs"]);
}
// A 2048-byte file is dropped when the limit is 1 KiB; the small placeholder
// file stays.
#[test]
fn skips_files_exceeding_size_limit() {
let dir = setup_temp_project(&["src/small.rs"]);
let big_file = dir.path().join("src/big.rs");
let big_content = "x".repeat(2048); fs::write(&big_file, big_content).unwrap();
let config = ScanConfig {
max_file_size_kb: 1,
..ScanConfig::default()
};
let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.files.len(), 1);
assert!(result.files[0].path.ends_with("src/small.rs"));
}
// `.css`, `.html` and `.json` are expected not to map to any `Language`.
#[test]
fn skips_unrecognised_extensions() {
let dir = setup_temp_project(&[
"src/main.rs",
"src/style.css",
"src/page.html",
"src/data.json",
]);
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.files.len(), 1);
assert!(result.files[0].path.ends_with("src/main.rs"));
}
// The language stored on each discovered file equals what
// `Language::from_extension` returns for that file's extension, and all
// eight fixture extensions are picked up.
#[test]
fn detected_language_matches_extension() {
let dir = setup_temp_project(&[
"a.rs", "b.ts", "c.tsx", "d.js", "e.jsx", "f.mjs", "g.cjs", "h.py",
]);
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
for f in &result.files {
let ext = f.path.extension().unwrap().to_str().unwrap();
assert_eq!(
f.language,
Language::from_extension(ext).unwrap(),
"Mismatch for extension {ext}"
);
}
assert_eq!(result.files.len(), 8);
}
// The placeholder content is non-empty, so the reported size must be > 0.
#[test]
fn discovered_file_has_size() {
let dir = setup_temp_project(&["src/main.rs"]);
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.files.len(), 1);
assert!(result.files[0].size_bytes > 0);
}
// An empty root is not an error; it just yields no files.
#[test]
fn empty_directory_returns_empty_vec() {
let dir = tempfile::tempdir().expect("create temp dir");
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
assert!(result.files.is_empty());
}
// Source-looking files inside `.git` are never reported.
#[test]
fn git_directory_always_excluded() {
let dir = setup_temp_project(&["src/main.rs"]);
let git_dir = dir.path().join(".git");
fs::create_dir_all(&git_dir).unwrap();
fs::write(git_dir.join("hook.rs"), "// git hook").unwrap();
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.files.len(), 1);
assert!(result.files[0].path.ends_with("src/main.rs"));
}
// Both top-level and nested submodule paths are extracted, in file order.
#[test]
fn detect_submodule_paths_parses_gitmodules() {
let dir = tempfile::tempdir().expect("create temp dir");
fs::write(
dir.path().join(".gitmodules"),
"[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/frontend.git\n\
[submodule \"libs/shared\"]\n\tpath = libs/shared\n\turl = https://example.com/shared.git\n",
)
.unwrap();
let paths = detect_submodule_paths(dir.path());
assert_eq!(paths, vec!["frontend", "libs/shared"]);
}
// A missing `.gitmodules` file yields an empty list rather than an error.
#[test]
fn detect_submodule_paths_no_gitmodules() {
let dir = tempfile::tempdir().expect("create temp dir");
let paths = detect_submodule_paths(dir.path());
assert!(paths.is_empty());
}
// Submodule paths are reported in the result even when the submodule
// directory itself does not exist on disk.
#[test]
fn excluded_submodules_reported_when_gitmodules_present() {
let dir = setup_temp_project(&["src/main.rs"]);
fs::create_dir_all(dir.path().join(".git")).unwrap();
fs::write(
dir.path().join(".gitmodules"),
"[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/fe.git\n",
)
.unwrap();
let config = ScanConfig::default(); let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.excluded_submodules, vec!["frontend"]);
}
// Files that live inside a declared submodule directory are not part of the
// root project's discovery result.
#[test]
fn submodule_dirs_always_excluded_from_root_walk() {
let dir = setup_temp_project(&["src/main.rs", "frontend/src/app.ts"]);
fs::create_dir_all(dir.path().join(".git")).unwrap();
fs::write(
dir.path().join(".gitmodules"),
"[submodule \"frontend\"]\n\tpath = frontend\n\turl = https://example.com/fe.git\n",
)
.unwrap();
let config = ScanConfig::default();
let result = discover_files(dir.path(), &config).unwrap();
assert_eq!(result.excluded_submodules, vec!["frontend"]);
let file_names: Vec<String> = result
.files
.iter()
.map(|f| f.path.file_name().unwrap().to_string_lossy().to_string())
.collect();
assert!(
!file_names.contains(&"app.ts".to_string()),
"submodule files should be excluded from root discovery"
);
}
}