use std::path::{Path, PathBuf};
use walkdir::WalkDir;
/// Allowlist of file extensions (lowercase, without the leading dot) that are indexed.
///
/// [`walk_source_files`] keeps a file only if its extension equals one of these entries,
/// compared ASCII case-insensitively; files without an extension are never kept.
#[rustfmt::skip]
pub const SOURCE_EXTS: &[&str] = &[
// Programming and scripting languages.
"rs", "py", "ts", "tsx", "js", "jsx", "mjs", "cjs", "go", "java",
"c", "cpp", "h", "hpp", "cs", "rb", "php", "swift", "kt", "kts",
"scala", "groovy", "gradle", "sh",
// Documentation.
"md", "mdx",
// Configuration and structured data.
"yaml", "yml", "toml", "json", "xml",
// Plain text and logs.
"txt", "log",
];
/// Directory names that are excluded from the walk.
///
/// [`walk_source_files`] prunes every directory below the walk root whose name equals one of
/// these entries exactly (at any depth), and [`path_in_skipped_dir`] reports whether a path
/// has a component equal to one of them.
///
/// NOTE(review): matching is by bare directory name, so generic entries such as `"bin"`,
/// `"out"`, `"build"` and `"generated"` also prune same-named source directories (for example
/// a Rust `src/bin`) — confirm this is intended.
pub const SKIP_DIRS: &[&str] = &[
// Version-control metadata.
".git",
// Build output, installed dependencies and package-manager caches.
"target",
"node_modules",
".venv",
"venv",
"__pycache__",
"dist",
"build",
".build",
".next",
".nuxt",
".svelte-kit",
"vendor",
".cargo",
".npm",
".cache",
".pnpm-store",
".yarn",
".rustup",
".tox",
".bundle",
// Coverage reports and tool caches.
"coverage",
".nyc_output",
".pytest_cache",
".mypy_cache",
".ruff_cache",
// Build-tool state and compiled/generated output (presumably JVM-oriented — TODO confirm).
".gradle",
".mvn",
".m2",
"out",
"bin",
"classes",
"generated",
"generated-sources",
"generated-test-sources",
// Deployment and task-runner artifacts.
"cdk.out",
"cdk.out2",
".aws-sam",
".turbo",
// Editor, IDE and tool state directories.
".idea",
".vscode",
".claude",
".claude-mpm",
".open-mpm",
".cursor",
".aider",
".continue",
".obsidian",
// Test fixtures and test data.
"fixtures",
"__fixtures__",
"testdata",
"test-data",
"test_data",
"testresources",
"test_resources",
];
/// Exact file names that [`should_skip_path`] always skips.
///
/// The comparison is against the final path component and is case-sensitive. All current
/// entries are dependency lock / checksum files.
pub const SKIP_FILES: &[&str] = &[
"Cargo.lock",
"package-lock.json",
"yarn.lock",
"pnpm-lock.yaml",
"poetry.lock",
"Pipfile.lock",
"Gemfile.lock",
"composer.lock",
"go.sum",
];
/// Extensions (lowercase, without the leading dot) that `should_skip_path` rejects outright.
///
/// The path's extension is ASCII-lowercased before it is compared against this list, so the
/// match is case-insensitive. Note that `"lock"` is included, which makes every `*.lock` file
/// skipped regardless of whether it is also named in `SKIP_FILES`.
const BINARY_EXTS: &[&str] = &[
"wasm", "so", "dylib", "dll", "exe", "pdf", "png", "jpg", "jpeg", "gif", "ico", "webp", "zip",
"tar", "gz", "bz2", "xz", "7z", "rar", "ttf", "otf", "woff", "woff2", "mp3", "mp4", "mov",
"avi", "mkv", "db", "sqlite", "lock", "pyc", "class", "o", "a",
];
/// Size limit in bytes (1 MiB): [`should_skip_path`] skips a file whose metadata is readable
/// and whose length is strictly greater than this value.
pub const MAX_FILE_BYTES: u64 = 1_048_576;
/// Line length in bytes above which `should_skip_content` treats a line of a short
/// JavaScript file as evidence of minification (a line must be strictly longer than this).
const MAX_LINE_LEN_FOR_MINIFIED: usize = 500;
/// Line count at or above which `should_skip_content` never flags a JavaScript file as
/// minified, regardless of how long its lines are.
const MIN_LINES_FOR_READABLE_JS: usize = 5;
pub fn should_skip_path(path: &Path) -> bool {
let file_name = match path.file_name().and_then(|n| n.to_str()) {
Some(n) => n,
None => return true, };
if SKIP_FILES.contains(&file_name) {
return true;
}
let ext = path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_ascii_lowercase();
if BINARY_EXTS.iter().any(|b| *b == ext) {
return true;
}
if file_name.ends_with(".min.js")
|| file_name.ends_with(".min.css")
|| file_name.ends_with(".bundle.js")
|| file_name.ends_with(".bundle.css")
|| file_name.ends_with(".chunk.js")
{
return true;
}
if ext == "js" || ext == "css" {
if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
if is_hashed_bundle_stem(stem) {
return true;
}
}
}
if let Ok(meta) = std::fs::metadata(path) {
if meta.len() > MAX_FILE_BYTES {
return true;
}
}
false
}
/// Reports whether any component of `path` is named like a skipped directory.
///
/// Every component is examined, including the final one and any leading ancestors, and is
/// compared for exact equality with the entries of [`SKIP_DIRS`]. Components that are not
/// valid UTF-8 never match.
pub fn path_in_skipped_dir(path: &Path) -> bool {
    path.components()
        .filter_map(|part| part.as_os_str().to_str())
        .any(|name| SKIP_DIRS.contains(&name))
}
/// Content-based check for minified JavaScript.
///
/// Only files whose extension is `js`, `mjs` or `cjs` (ASCII case-insensitive) are considered;
/// everything else returns `false`. Such a file is reported as minified when it has fewer than
/// `MIN_LINES_FOR_READABLE_JS` lines and at least one of them is longer than
/// `MAX_LINE_LEN_FOR_MINIFIED` bytes.
pub fn should_skip_content(path: &Path, content: &str) -> bool {
    let is_js = path
        .extension()
        .and_then(|e| e.to_str())
        .is_some_and(|e| ["js", "mjs", "cjs"].iter().any(|js| e.eq_ignore_ascii_case(js)));
    if !is_js {
        return false;
    }

    // Looking at the first MIN_LINES_FOR_READABLE_JS lines is enough: if that many exist the
    // file counts as readable, otherwise `head` holds every line of the file.
    let head: Vec<&str> = content.lines().take(MIN_LINES_FOR_READABLE_JS).collect();
    head.len() < MIN_LINES_FOR_READABLE_JS
        && head.iter().any(|line| line.len() > MAX_LINE_LEN_FOR_MINIFIED)
}
/// Heuristically decides whether a file stem looks like `<name>-<content hash>`.
///
/// The part after the last `-` must be at least 8 bytes long, consist solely of ASCII letters
/// and digits, and additionally either contain a digit or mix upper- and lower-case letters.
/// The last requirement keeps ordinary hyphenated names such as `user-dashboard` or
/// `form-validation` from being mistaken for hashed bundles, while still accepting hex
/// digests (`1a2b3c4d`) and base64-style hashes (`ahKOasfG`).
///
/// Stems without a `-` are never considered hashed.
fn is_hashed_bundle_stem(stem: &str) -> bool {
    let Some((_, suffix)) = stem.rsplit_once('-') else {
        return false;
    };
    if suffix.len() < 8 || !suffix.bytes().all(|b| b.is_ascii_alphanumeric()) {
        return false;
    }

    // A plain single-case word carries no evidence of being a hash.
    let has_digit = suffix.bytes().any(|b| b.is_ascii_digit());
    let has_upper = suffix.bytes().any(|b| b.is_ascii_uppercase());
    let has_lower = suffix.bytes().any(|b| b.is_ascii_lowercase());
    has_digit || (has_upper && has_lower)
}
/// Outcome of a [`walk_source_files`] run.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WalkResult {
    /// Paths of the files that passed every filter, in the order the walk produced them.
    pub files: Vec<PathBuf>,
    /// Number of walk entries that yielded an error and were therefore skipped.
    ///
    /// Despite the name, directories pruned because they are listed in [`SKIP_DIRS`] are not
    /// counted here.
    pub skipped_dirs: usize,
}
/// Recursively collects the indexable files below `root`.
///
/// Symbolic links are not followed. Any directory below the root whose name equals an entry of
/// [`SKIP_DIRS`] is pruned together with its whole subtree; the root itself is never pruned.
/// Of the remaining entries, a regular file is kept when its extension matches
/// [`SOURCE_EXTS`] (ASCII case-insensitively) and [`should_skip_path`] does not reject it.
///
/// Entries the walker fails to read are not returned; each one increments
/// [`WalkResult::skipped_dirs`] instead.
pub fn walk_source_files(root: &Path) -> WalkResult {
    let entries = WalkDir::new(root)
        .follow_links(false)
        .into_iter()
        .filter_entry(|entry| {
            // Depth 0 is the root the caller asked for; it is walked whatever it is called.
            let pruned = entry.depth() > 0 && entry.file_type().is_dir() && {
                let dir_name = entry.file_name().to_string_lossy();
                SKIP_DIRS.iter().any(|skip| *skip == dir_name)
            };
            !pruned
        });

    let mut files = Vec::new();
    let mut skipped_dirs = 0usize;
    for item in entries {
        let Ok(entry) = item else {
            skipped_dirs += 1;
            continue;
        };
        if !entry.file_type().is_file() {
            continue;
        }

        let candidate = entry.path();
        let has_source_ext = candidate
            .extension()
            .and_then(|e| e.to_str())
            .is_some_and(|ext| SOURCE_EXTS.iter().any(|known| known.eq_ignore_ascii_case(ext)));
        if has_source_ext && !should_skip_path(candidate) {
            files.push(candidate.to_path_buf());
        }
    }

    WalkResult {
        files,
        skipped_dirs,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Runs the walker over `root` and returns the bare file names of everything it kept.
    fn walked_names(root: &Path) -> Vec<String> {
        walk_source_files(root)
            .files
            .iter()
            .filter_map(|p| p.file_name())
            .map(|n| n.to_string_lossy().into_owned())
            .collect()
    }

    /// True when `names` contains an entry equal to `wanted`.
    fn has(names: &[String], wanted: &str) -> bool {
        names.iter().any(|n| n == wanted)
    }

    // ---- walker: basic behavior -------------------------------------------------------------

    #[test]
    fn finds_source_files_and_skips_dirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        for dir in ["src", "target/debug", "node_modules/foo"] {
            fs::create_dir_all(root.join(dir)).unwrap();
        }
        fs::write(root.join("src/main.rs"), "fn main() {}").unwrap();
        fs::write(root.join("src/lib.py"), "x = 1").unwrap();
        fs::write(root.join("README.md"), "# hi").unwrap();
        fs::write(root.join("target/debug/build.o"), b"\0\0").unwrap();
        fs::write(root.join("node_modules/foo/index.js"), "// no").unwrap();
        fs::write(root.join("binary.bin"), b"\0\0").unwrap();

        let names = walked_names(root);
        assert!(has(&names, "main.rs"));
        assert!(has(&names, "lib.py"));
        assert!(has(&names, "README.md"));
        assert!(!has(&names, "build.o"));
        assert!(!has(&names, "index.js"));
        assert!(!has(&names, "binary.bin"));
    }

    // ---- should_skip_path: minified / bundled / hashed names --------------------------------

    #[test]
    fn test_skips_min_js() {
        assert!(should_skip_path(Path::new("foo.min.js")));
        assert!(should_skip_path(Path::new("path/to/app.min.js")));
    }

    #[test]
    fn test_skips_min_css() {
        assert!(should_skip_path(Path::new("styles.min.css")));
    }

    #[test]
    fn test_skips_bundle_js() {
        assert!(should_skip_path(Path::new("app.bundle.js")));
        assert!(should_skip_path(Path::new("vendor.bundle.css")));
    }

    #[test]
    fn test_skips_chunk_js() {
        assert!(should_skip_path(Path::new("runtime.chunk.js")));
    }

    #[test]
    fn test_skips_hashed_bundle() {
        assert!(should_skip_path(Path::new("index-ahKOasfG.js")));
        assert!(should_skip_path(Path::new("vendor-1a2b3c4d5e6f7a8b.js")));
        assert!(should_skip_path(Path::new("src/assets/main-AbCdEfGh.js")));
    }

    #[test]
    fn test_hashed_bundle_too_short_not_skipped() {
        // Seven characters after the dash is below the eight-character threshold.
        assert!(!should_skip_path(Path::new("foo-abcdefg.js")));
    }

    #[test]
    fn test_keeps_normal_js() {
        assert!(!should_skip_path(Path::new("utils.js")));
        assert!(!should_skip_path(Path::new("main.js")));
        assert!(!should_skip_path(Path::new("src/components/button.js")));
    }

    #[test]
    fn test_skips_node_modules_dir() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        let package_dir = root.join("node_modules/lodash");
        fs::create_dir_all(&package_dir).unwrap();
        fs::write(package_dir.join("index.js"), "module.exports={}").unwrap();
        fs::write(root.join("real.js"), "export const x = 1;").unwrap();

        let names = walked_names(root);
        assert!(!has(&names, "index.js"), "node_modules must be excluded");
        assert!(has(&names, "real.js"));
    }

    // ---- should_skip_path: size limit -------------------------------------------------------

    #[test]
    fn test_skips_large_file() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let big_path = tmp.path().join("huge.js");
        // One byte over the limit.
        let oversized = vec![b'x'; (MAX_FILE_BYTES + 1) as usize];
        fs::write(&big_path, &oversized).unwrap();
        assert!(should_skip_path(&big_path));
    }

    #[test]
    fn test_keeps_small_file() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let small_path = tmp.path().join("small.js");
        fs::write(&small_path, b"const x = 1;").unwrap();
        assert!(!should_skip_path(&small_path));
    }

    // ---- should_skip_content ----------------------------------------------------------------

    #[test]
    fn test_skip_content_detects_minified_js() {
        let minified = "a".repeat(501);
        assert!(should_skip_content(Path::new("bundle.js"), &minified));
    }

    #[test]
    fn test_skip_content_allows_normal_js() {
        let normal = "const x = 1;\nconst y = 2;\nconst z = 3;\nconst w = 4;\nconst v = 5;\n";
        assert!(!should_skip_content(Path::new("app.js"), normal));
    }

    #[test]
    fn test_skip_content_ignores_non_js() {
        let long_line = "x".repeat(1000);
        assert!(!should_skip_content(Path::new("data.rs"), &long_line));
        assert!(!should_skip_content(Path::new("query.py"), &long_line));
    }

    #[test]
    fn test_skip_content_mjs_cjs() {
        let minified = "a".repeat(501);
        assert!(should_skip_content(Path::new("mod.mjs"), &minified));
        assert!(should_skip_content(Path::new("mod.cjs"), &minified));
    }

    // ---- lock files -------------------------------------------------------------------------

    #[test]
    fn test_skips_lock_files() {
        assert!(should_skip_path(Path::new("Cargo.lock")));
        assert!(should_skip_path(Path::new("project/Cargo.lock")));
        assert!(should_skip_path(Path::new("package-lock.json")));
        assert!(should_skip_path(Path::new("yarn.lock")));
        assert!(should_skip_path(Path::new("pnpm-lock.yaml")));
        assert!(should_skip_path(Path::new("poetry.lock")));
        assert!(should_skip_path(Path::new("Pipfile.lock")));
        assert!(should_skip_path(Path::new("Gemfile.lock")));
        assert!(should_skip_path(Path::new("composer.lock")));
        assert!(should_skip_path(Path::new("go.sum")));
    }

    #[test]
    fn test_does_not_skip_non_lock_named_files() {
        assert!(!should_skip_path(Path::new("main.rs")));
        assert!(!should_skip_path(Path::new("locked_file.rs")));
        assert!(!should_skip_path(Path::new("my-cargo-locker.py")));
    }

    #[test]
    fn test_walker_skips_lock_files_in_tree() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        fs::write(root.join("Cargo.lock"), "# lock").unwrap();
        fs::write(root.join("package-lock.json"), "{}").unwrap();
        fs::write(root.join("yarn.lock"), "# lock").unwrap();
        fs::write(root.join("real.rs"), "fn main() {}").unwrap();

        let names = walked_names(root);
        assert!(has(&names, "real.rs"));
        assert!(!has(&names, "Cargo.lock"));
        assert!(!has(&names, "package-lock.json"));
        assert!(!has(&names, "yarn.lock"));
    }

    // ---- walker: skipped directories --------------------------------------------------------

    #[test]
    fn test_walker_skips_new_skip_dirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        for dir in [".cache", ".npm", ".build", ".pnpm-store", ".yarn", ".tox"] {
            let d = root.join(dir);
            fs::create_dir_all(&d).unwrap();
            fs::write(d.join("trapped.rs"), "fn x() {}").unwrap();
        }
        fs::write(root.join("kept.rs"), "fn k() {}").unwrap();

        let names = walked_names(root);
        assert!(has(&names, "kept.rs"));
        assert!(
            !has(&names, "trapped.rs"),
            "files inside new SKIP_DIRS must be excluded"
        );
    }

    #[test]
    fn test_walker_skips_cdk_and_sam_dirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        for dir in ["cdk.out", "cdk.out2", ".aws-sam", ".turbo", ".mvn"] {
            // Nest the file several levels deep to prove the whole subtree is pruned.
            let d = root.join(dir).join("asset.abc123/python/lib");
            fs::create_dir_all(&d).unwrap();
            fs::write(d.join("vendored.py"), "import boto3").unwrap();
        }
        fs::write(root.join("handler.py"), "def handler(): pass").unwrap();

        let names = walked_names(root);
        assert!(has(&names, "handler.py"), "real source must be kept");
        assert!(
            !has(&names, "vendored.py"),
            "files inside cdk.out / .aws-sam / .turbo / .mvn must be excluded"
        );
    }

    #[test]
    fn test_walker_skips_fixture_and_test_data_dirs() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        for dir in [
            "fixtures",
            "__fixtures__",
            "testdata",
            "test-data",
            "test_data",
            "testresources",
            "test_resources",
        ] {
            let d = root.join(dir);
            fs::create_dir_all(&d).unwrap();
            fs::write(d.join("sample.py"), "x = 1").unwrap();
        }
        // A plain `resources` directory is not in SKIP_DIRS and must survive.
        let kept_resources = root.join("src/test/resources");
        fs::create_dir_all(&kept_resources).unwrap();
        fs::write(kept_resources.join("config.py"), "y = 2").unwrap();
        fs::write(root.join("handler.py"), "def handler(): pass").unwrap();

        let names = walked_names(root);
        assert!(has(&names, "handler.py"), "real source must be kept");
        assert!(
            has(&names, "config.py"),
            "src/test/resources must stay indexed (basename `resources` not skipped)"
        );
        assert!(
            !has(&names, "sample.py"),
            "files inside fixture / test-data dirs must be excluded"
        );
    }

    // ---- constant tables --------------------------------------------------------------------

    #[test]
    fn test_skip_dirs_contains_fixture_entries() {
        let required_entries = [
            "fixtures",
            "__fixtures__",
            "testdata",
            "test-data",
            "test_data",
            "testresources",
            "test_resources",
        ];
        for required in required_entries {
            assert!(
                SKIP_DIRS.contains(&required),
                "SKIP_DIRS missing required fixture entry: {required}"
            );
        }
    }

    #[test]
    fn test_sql_extension_excluded_by_allowlist() {
        assert!(
            !SOURCE_EXTS.iter().any(|e| e.eq_ignore_ascii_case("sql")),
            "`sql` must not be in SOURCE_EXTS — SQL is excluded by the allowlist"
        );

        let tmp = tempfile::tempdir().expect("tempdir");
        let root = tmp.path();
        fs::write(root.join("schema.sql"), "CREATE TABLE t (id INT);").unwrap();
        fs::write(root.join("real.rs"), "fn main() {}").unwrap();

        let names = walked_names(root);
        assert!(has(&names, "real.rs"));
        assert!(
            !has(&names, "schema.sql"),
            ".sql files must be excluded by the SOURCE_EXTS allowlist"
        );
    }

    #[test]
    fn test_skip_dirs_contains_build_artifact_entries() {
        let required_entries = [
            "cdk.out",
            "cdk.out2",
            ".aws-sam",
            ".turbo",
            ".mvn",
            ".gradle",
            "node_modules",
            ".venv",
            "venv",
            "__pycache__",
            ".next",
            "dist",
            "build",
            "target",
            "vendor",
        ];
        for required in required_entries {
            assert!(
                SKIP_DIRS.contains(&required),
                "SKIP_DIRS missing required build-artifact entry: {required}"
            );
        }
    }

    #[test]
    fn test_skip_dirs_contains_required_entries() {
        let required_entries = [
            "node_modules",
            "target",
            "vendor",
            ".git",
            ".cargo",
            ".npm",
            "dist",
            "build",
            ".build",
            "__pycache__",
            ".venv",
            "venv",
            ".next",
            ".nuxt",
            "coverage",
            ".nyc_output",
        ];
        for required in required_entries {
            assert!(
                SKIP_DIRS.contains(&required),
                "SKIP_DIRS missing required entry: {required}"
            );
        }
    }

    // ---- path_in_skipped_dir ----------------------------------------------------------------

    #[test]
    fn test_path_in_skipped_dir() {
        assert!(path_in_skipped_dir(Path::new(
            "project/cdk.out/asset.abc/python/handler.py"
        )));
        assert!(path_in_skipped_dir(Path::new(".aws-sam/build/app.py")));
        assert!(path_in_skipped_dir(Path::new(
            "repo/node_modules/lodash/x.js"
        )));
        assert!(path_in_skipped_dir(Path::new("repo/.turbo/cache/x.js")));
        assert!(!path_in_skipped_dir(Path::new("src/handler.py")));
        assert!(!path_in_skipped_dir(Path::new("project/src/main.rs")));
    }

    #[test]
    fn test_skip_content_multiline_js_not_skipped() {
        // Six lines in total: the long last line alone must not flag the file as minified.
        let content = format!("line1\nline2\nline3\nline4\nline5\n{}\n", "x".repeat(600));
        assert!(!should_skip_content(Path::new("ok.js"), &content));
    }
}