use std::path::{Path, PathBuf};
use ignore::{DirEntry, WalkBuilder};
/// Directory names pruned from the walk while default ignoring is enabled.
///
/// Each entry is compared for exact equality against a directory's own name
/// (its final path component), never against a full path. Some of these names
/// are exempted when the project is treated as JavaScript/TypeScript; see
/// `JS_TS_PRESERVED_DIRS`.
pub const DEFAULT_EXCLUDE_DIRS: &[&str] = &[
    // Dependency and build-output directories.
    "node_modules",
    "vendor",
    "target",
    "dist",
    "build",
    "out",
    "bin",
    "obj",
    // Framework build directories.
    ".next",
    ".nuxt",
    // NOTE(review): presumably generated documentation output - confirm.
    "dox",
    // Python tooling caches.
    "__pycache__",
    ".pytest_cache",
    ".tox",
    ".mypy_cache",
    ".ruff_cache",
    // Coverage output.
    "coverage",
    ".coverage",
    // Build-tool and VCS metadata.
    ".gradle",
    ".git",
];
/// File names whose presence directly inside a directory marks that directory
/// as generated output, causing the default filter to skip it.
///
/// NOTE(review): these look like assets emitted by Doxygen's HTML output -
/// confirm before extending the list.
const GENERATED_DIR_SENTINELS: &[&str] = &[
    "doxygen.css",
    "doxygen.svg",
];
/// Directory names that are exempt from `DEFAULT_EXCLUDE_DIRS` when the walk
/// is treated as a JavaScript/TypeScript project (explicit language hint or
/// auto-detection).
const JS_TS_PRESERVED_DIRS: &[&str] = &[
    "build",
    "dist",
    "out",
    "bin",
    "obj",
];
/// Builder-style configuration for a filtered directory walk.
///
/// Create one with `ProjectWalker::new`, adjust it through the chained
/// setters, then consume it with `ProjectWalker::iter`.
pub struct ProjectWalker {
    /// Directory the walk starts from.
    root: PathBuf,

    /// Whether git ignore rules (repository, global, exclude file and parent
    /// directories) are honoured.
    respect_gitignore: bool,

    /// Whether directories named in `DEFAULT_EXCLUDE_DIRS`, or containing a
    /// generated-output sentinel file, are pruned.
    default_ignore: bool,

    /// Optional recursion limit handed to the underlying walker.
    max_depth: Option<usize>,

    /// When set, only files with one of these extensions are yielded.
    extensions: Option<Vec<&'static str>>,

    /// Explicit project language; when absent the walker sniffs the tree to
    /// decide whether it is JavaScript/TypeScript dominated.
    lang_hint: Option<crate::types::Language>,
}
impl ProjectWalker {
pub fn new(root: impl AsRef<Path>) -> Self {
Self {
root: root.as_ref().to_path_buf(),
respect_gitignore: true,
default_ignore: true,
max_depth: None,
extensions: None,
lang_hint: None,
}
}
pub fn lang_hint(mut self, lang: crate::types::Language) -> Self {
self.lang_hint = Some(lang);
self
}
pub fn no_default_ignore(mut self) -> Self {
self.default_ignore = false;
self
}
pub fn respect_gitignore(mut self, yes: bool) -> Self {
self.respect_gitignore = yes;
self
}
pub fn max_depth(mut self, n: usize) -> Self {
self.max_depth = Some(n);
self
}
pub fn extensions(mut self, exts: &[&'static str]) -> Self {
self.extensions = Some(exts.to_vec());
self
}
pub fn iter(self) -> impl Iterator<Item = DirEntry> {
let default_ignore = self.default_ignore;
let extensions = self.extensions.clone();
let auto_js_ts = self.lang_hint.is_none() && root_is_js_ts_dominated(&self.root);
let preserve_js_ts_dirs = auto_js_ts
|| matches!(
self.lang_hint,
Some(crate::types::Language::JavaScript) | Some(crate::types::Language::TypeScript)
);
let mut builder = WalkBuilder::new(&self.root);
builder
.hidden(true) .git_ignore(self.respect_gitignore)
.git_global(self.respect_gitignore)
.git_exclude(self.respect_gitignore)
.parents(self.respect_gitignore)
.follow_links(false);
if let Some(depth) = self.max_depth {
builder.max_depth(Some(depth));
}
if default_ignore {
builder.filter_entry(move |entry| {
let is_dir = entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false);
if !is_dir {
return true;
}
let name_excluded = match entry.file_name().to_str() {
Some(name) => {
if preserve_js_ts_dirs && JS_TS_PRESERVED_DIRS.contains(&name) {
false
} else {
DEFAULT_EXCLUDE_DIRS.contains(&name)
}
}
None => false,
};
if name_excluded {
return false;
}
if dir_has_generated_sentinel(entry.path()) {
return false;
}
true
});
}
builder.build().filter_map(move |res| {
let entry = res.ok()?;
if let Some(ref allowed) = extensions {
let is_file = entry.file_type().map(|ft| ft.is_file()).unwrap_or(false);
if is_file {
let ext = entry.path().extension().and_then(|s| s.to_str());
match ext {
Some(e) if allowed.contains(&e) => Some(entry),
_ => None,
}
} else {
Some(entry)
}
} else {
Some(entry)
}
})
}
}
/// Reports whether `dir` directly contains a file whose name appears in
/// `GENERATED_DIR_SENTINELS`.
///
/// Only the immediate children of `dir` are inspected. A directory that
/// cannot be read yields `false`, and unreadable or non-UTF-8 child entries
/// are ignored.
fn dir_has_generated_sentinel(dir: &Path) -> bool {
    match std::fs::read_dir(dir) {
        Err(_) => false,
        Ok(listing) => listing.filter_map(Result::ok).any(|child| {
            matches!(
                child.file_name().to_str(),
                Some(name) if GENERATED_DIR_SENTINELS.contains(&name)
            )
        }),
    }
}
/// Heuristically decides whether the tree under `dir` is mostly
/// JavaScript/TypeScript.
///
/// Walks `dir` (skipping hidden entries, honouring git ignore rules, never
/// following symlinks) and classifies files by extension as either JS/TS or
/// "another recognised language"; files with any other extension, or none,
/// are not counted. Sampling stops after `CAP` classified files. Returns
/// `true` only when JS/TS files strictly outnumber the others, and `false`
/// when `dir` is not a directory.
fn root_is_js_ts_dominated(dir: &Path) -> bool {
    /// Maximum number of classified source files to sample.
    const CAP: usize = 256;

    if !dir.is_dir() {
        return false;
    }

    let mut sniffer = WalkBuilder::new(dir);
    sniffer
        .hidden(true)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .parents(true)
        .follow_links(false);

    // Each sampled file maps to `true` (JS/TS) or `false` (other language).
    let (js_ts_total, other_total) = sniffer
        .build()
        .flatten()
        .filter(|entry| entry.file_type().map(|ft| ft.is_file()).unwrap_or(false))
        .filter_map(|entry| {
            let ext = entry.path().extension()?.to_str()?;
            match ext {
                "ts" | "tsx" | "js" | "jsx" | "mjs" | "cjs" => Some(true),
                "py" | "rs" | "go" | "java" | "c" | "cc" | "cpp" | "cxx" | "h" | "hpp" | "kt"
                | "swift" | "rb" | "php" | "scala" | "lua" | "luau" | "ex" | "exs" | "ml"
                | "mli" | "cs" => Some(false),
                _ => None,
            }
        })
        .take(CAP)
        .fold((0usize, 0usize), |(js_ts, other), is_js_ts| {
            if is_js_ts {
                (js_ts + 1, other)
            } else {
                (js_ts, other + 1)
            }
        });

    // Strictly greater than an unsigned count already implies non-zero.
    js_ts_total > other_total
}
/// Walks `root` using a `ProjectWalker` in its default configuration.
///
/// Shorthand for `ProjectWalker::new(root).iter()`.
pub fn walk_project(root: impl AsRef<Path>) -> impl Iterator<Item = DirEntry> {
    let walker = ProjectWalker::new(root);
    walker.iter()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Writes `contents` to `path`, creating missing parent directories first.
    fn write_file(path: &Path, contents: &str) {
        if let Some(dir) = path.parent() {
            fs::create_dir_all(dir).unwrap();
        }
        fs::write(path, contents).unwrap();
    }

    /// Collects the regular files yielded by `walker` as sorted,
    /// `/`-separated paths, relative to `root` whenever the prefix strips.
    fn collect_rel_files(root: &Path, walker: impl Iterator<Item = DirEntry>) -> Vec<String> {
        let mut rel_paths = Vec::new();
        for entry in walker {
            let is_file = entry.file_type().map(|ft| ft.is_file()).unwrap_or(false);
            if !is_file {
                continue;
            }
            let full = entry.path();
            let shown = full.strip_prefix(root).unwrap_or(full);
            rel_paths.push(shown.to_string_lossy().replace('\\', "/"));
        }
        rel_paths.sort();
        rel_paths
    }

    /// `node_modules` is pruned by the default filter.
    #[test]
    fn test_skips_node_modules_by_default() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        write_file(&root.join("foo.rs"), "fn main() {}");
        write_file(&root.join("node_modules/bad.py"), "import os");

        let files = collect_rel_files(root, walk_project(root));

        assert_eq!(files, ["foo.rs"]);
    }

    /// Build-output, cache and vendored directories are all pruned.
    #[test]
    fn test_skips_target_dist_build_cache() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        let fixtures = [
            ("src/lib.rs", "fn main() {}"),
            ("target/debug/x.rs", "fn x() {}"),
            ("dist/bundle.js", "// bundled"),
            ("build/out.o", "binary"),
            ("__pycache__/cached.pyc", "binary"),
            (".next/cache.js", "// cached"),
            ("vendor/dep.go", "package v"),
        ];
        for (rel, body) in fixtures.iter() {
            write_file(&root.join(rel), body);
        }

        let files = collect_rel_files(root, walk_project(root));

        assert_eq!(files, ["src/lib.rs"]);
    }

    /// Paths matched by the repository's `.gitignore` are not yielded.
    #[test]
    fn test_respects_gitignore() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        // A `.git` directory is created so the temp dir looks like a repository.
        fs::create_dir_all(root.join(".git")).unwrap();
        write_file(&root.join(".gitignore"), "secret/\n");
        write_file(&root.join("foo.rs"), "fn main() {}");
        write_file(&root.join("secret/x.rs"), "fn x() {}");

        let files = collect_rel_files(root, walk_project(root));

        assert_eq!(files, ["foo.rs"]);
    }

    /// Dot-prefixed directories are skipped.
    #[test]
    fn test_hidden_dirs_skipped() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        write_file(&root.join("visible.rs"), "fn main() {}");
        write_file(&root.join(".hidden/secret.rs"), "fn secret() {}");

        let files = collect_rel_files(root, walk_project(root));

        assert_eq!(files, ["visible.rs"]);
    }

    /// A symlink pointing back at the root must not cause repeated visits.
    #[test]
    fn test_does_not_follow_symlinks_into_loop() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        write_file(&root.join("a.rs"), "fn a() {}");

        #[cfg(unix)]
        {
            use std::os::unix::fs::symlink;
            symlink(root, root.join("loop")).unwrap();
        }
        #[cfg(windows)]
        {
            use std::os::windows::fs::symlink_dir;
            // Creating symlinks may fail on Windows; the result is
            // deliberately ignored so the test still runs.
            let _ = symlink_dir(root, root.join("loop"));
        }

        // `take` bounds the walk so a regression cannot hang the test.
        let count_a = walk_project(root)
            .take(10_000)
            .filter(|e| e.file_name() == "a.rs")
            .count();

        assert_eq!(count_a, 1, "expected exactly one a.rs, got {}", count_a);
    }

    /// With the default filter disabled, `node_modules` is walked.
    #[test]
    fn test_no_default_ignore_walks_node_modules() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        write_file(&root.join("foo.rs"), "fn main() {}");
        write_file(&root.join("node_modules/bad.py"), "import os");

        let walker = ProjectWalker::new(root).no_default_ignore();
        let files = collect_rel_files(root, walker.iter());

        assert!(
            files.iter().any(|f| f == "foo.rs"),
            "missing foo.rs: {files:?}"
        );
        assert!(
            files.iter().any(|f| f == "node_modules/bad.py"),
            "expected node_modules/bad.py to be walked with no_default_ignore: {files:?}"
        );
    }

    /// Only files with a whitelisted extension are yielded.
    #[test]
    fn test_extensions_filter() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        let fixtures = [
            ("a.rs", "fn a() {}"),
            ("b.py", "def b(): pass"),
            ("c.ts", "function c() {}"),
        ];
        for (rel, body) in fixtures.iter() {
            write_file(&root.join(rel), body);
        }

        let walker = ProjectWalker::new(root).extensions(&["rs"]);
        let files = collect_rel_files(root, walker.iter());

        assert_eq!(files, ["a.rs"]);
    }

    /// A depth limit of 1 yields top-level files but not deeply nested ones.
    #[test]
    fn test_max_depth_limits_recursion() {
        let tmp = tempdir().unwrap();
        let root = tmp.path();
        write_file(&root.join("top.rs"), "fn top() {}");
        write_file(&root.join("a/b/deep.rs"), "fn deep() {}");

        let walker = ProjectWalker::new(root).max_depth(1);
        let files = collect_rel_files(root, walker.iter());

        assert!(files.iter().any(|f| f == "top.rs"), "{files:?}");
        assert!(
            !files.iter().any(|f| f == "a/b/deep.rs"),
            "max_depth=1 should have excluded deep file: {files:?}"
        );
    }
}