use std::path::{Path, PathBuf};
use std::sync::{mpsc, Arc, Mutex};
use anyhow::Result;
use ignore::{DirEntry, ParallelVisitor, ParallelVisitorBuilder, WalkBuilder, WalkState};
/// Size limit, in bytes, that [`Walker::new`] starts with (1024 * 1024 = 1 MiB).
/// Files strictly larger than the active limit are skipped by the walker.
pub const DEFAULT_MAX_FILE_SIZE: u64 = 1024 * 1024;
/// Number of files a single visitor buffers locally before it pushes the
/// batch down the channel.
const FLUSH_THRESHOLD: usize = 32;
/// Programming language of a walked file, as inferred from its file
/// extension by [`detect_language`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
/// `.rs`
Rust,
/// `.ts`, `.tsx`
TypeScript,
/// `.js`, `.jsx`, `.mjs`, `.cjs`
JavaScript,
/// `.py`, `.pyi`
Python,
/// `.go`
Go,
/// `.java`
Java,
/// `.c` and `.h` (plain `.h` headers are always classified as C)
C,
/// `.cpp`, `.cc`, `.cxx`, `.hpp`, `.hxx`, `.hh`
Cpp,
/// `.rb`
Ruby,
/// `.scala`, `.sc`
Scala,
/// `.ex`, `.exs`
Elixir,
/// `.hs`, `.lhs`
Haskell,
/// Any other extension, a non-UTF-8 extension, or no extension at all.
Unknown,
}
/// One regular file reported by [`Walker`], together with the metadata
/// captured while walking.
#[derive(Debug, Clone)]
pub struct WalkedFile {
/// Path of the file exactly as the directory walker reported it.
/// NOTE(review): despite the name this is presumably only absolute when the
/// walk root itself was absolute — confirm against callers.
pub abs_path: PathBuf,
/// Path relative to the walk root with `/` as separator; falls back to the
/// full path when the file is not located under the root.
pub rel_path: String,
/// Language inferred from the file extension.
pub language: Language,
/// File length in bytes, taken from the file's metadata.
pub size_bytes: u64,
/// Last-modification time in whole seconds since the Unix epoch;
/// `0` when the timestamp is unavailable or lies before the epoch.
pub mtime_secs: u64,
}
/// Configurable directory walker that lists the files below a root
/// directory. Configure it with the builder-style methods, then call
/// `walk` or `walk_channel`.
pub struct Walker {
// Directory the walk starts from.
root: PathBuf,
// Files strictly larger than this many bytes are skipped.
max_file_size: u64,
// Forwarded to the underlying walker's `follow_links` setting.
follow_symlinks: bool,
}
impl Walker {
    /// Creates a walker rooted at `root`, using [`DEFAULT_MAX_FILE_SIZE`] as
    /// the size limit and with symlink following turned off.
    pub fn new(root: impl Into<PathBuf>) -> Self {
        let root = root.into();
        Self {
            root,
            max_file_size: DEFAULT_MAX_FILE_SIZE,
            follow_symlinks: false,
        }
    }

    /// Sets the size limit in bytes; files strictly larger than `bytes` are
    /// skipped.
    pub fn max_file_size(self, bytes: u64) -> Self {
        Self {
            max_file_size: bytes,
            ..self
        }
    }

    /// Chooses whether the walk follows symbolic links.
    pub fn follow_symlinks(self, yes: bool) -> Self {
        Self {
            follow_symlinks: yes,
            ..self
        }
    }

    /// Starts the walk on a background thread and returns the receiving end
    /// of the channel the files are streamed through.
    ///
    /// The walk includes hidden files and honours `.gitignore`, the global
    /// git ignore file and `.git/info/exclude`. Files arrive in whatever
    /// order the parallel visitors produce them. The channel closes once the
    /// walk has finished and every visitor has been dropped.
    ///
    /// # Errors
    ///
    /// Fails if the configured root is not a directory.
    pub fn walk_channel(&self) -> Result<mpsc::Receiver<WalkedFile>> {
        let root = self.root.as_path();
        if !root.is_dir() {
            anyhow::bail!("walk root is not a directory: {}", root.display());
        }

        let (sender, receiver) = mpsc::channel::<WalkedFile>();
        let shared_root = Arc::new(self.root.clone());
        let size_limit = self.max_file_size;
        let follow = self.follow_symlinks;

        // The join handle is dropped on purpose: the thread is detached and
        // the caller observes completion through the channel closing.
        std::thread::spawn(move || {
            let mut walk_builder = WalkBuilder::new(shared_root.as_path());
            walk_builder
                .hidden(false)
                .follow_links(follow)
                .git_ignore(true)
                .git_global(true)
                .git_exclude(true);

            let mut visitors = VisitorBuilder {
                tx: Arc::new(Mutex::new(sender)),
                root: shared_root,
                max_file_size: size_limit,
            };
            walk_builder.build_parallel().visit(&mut visitors);
        });

        Ok(receiver)
    }

    /// Runs the walk to completion and returns every file, sorted by
    /// `rel_path`.
    ///
    /// # Errors
    ///
    /// Fails if the configured root is not a directory.
    pub fn walk(&self) -> Result<Vec<WalkedFile>> {
        let receiver = self.walk_channel()?;
        // `iter` blocks until every sender has been dropped, i.e. until the
        // background walk is done.
        let mut files: Vec<WalkedFile> = receiver.iter().collect();
        files.sort_unstable_by(|lhs, rhs| lhs.rel_path.cmp(&rhs.rel_path));
        Ok(files)
    }
}
/// Factory handed to the parallel walker; it produces one [`FileVisitor`]
/// per call to `build`.
struct VisitorBuilder {
// Sender that is cloned into every visitor.
// NOTE(review): `build` takes `&mut self`, so the `Arc<Mutex<..>>` wrapper
// looks unnecessary — a plain `Sender` could probably be cloned directly;
// confirm before simplifying.
tx: Arc<Mutex<mpsc::Sender<WalkedFile>>>,
// Walk root, shared with every visitor to compute relative paths.
root: Arc<PathBuf>,
// Size limit in bytes copied into every visitor.
max_file_size: u64,
}
impl<'s> ParallelVisitorBuilder<'s> for VisitorBuilder {
    /// Builds a fresh visitor with its own clone of the channel sender, an
    /// empty batch buffer, and the shared root and size limit.
    ///
    /// # Panics
    ///
    /// Panics if the mutex guarding the sender is poisoned.
    fn build(&mut self) -> Box<dyn ParallelVisitor + 's> {
        // Hold the lock only for as long as it takes to clone the sender.
        let sender = {
            let guard = self.tx.lock().expect("VisitorBuilder mutex poisoned");
            guard.clone()
        };

        let visitor = FileVisitor {
            local: Vec::with_capacity(FLUSH_THRESHOLD),
            tx: sender,
            root: Arc::clone(&self.root),
            max_file_size: self.max_file_size,
        };
        Box::new(visitor)
    }
}
/// Per-visitor state of the parallel walk: filters entries and forwards the
/// accepted files in batches.
struct FileVisitor {
// Files accepted but not yet sent; flushed once it holds FLUSH_THRESHOLD
// entries and again when the visitor is dropped.
local: Vec<WalkedFile>,
// This visitor's own clone of the channel sender.
tx: mpsc::Sender<WalkedFile>,
// Walk root used to derive `rel_path`.
root: Arc<PathBuf>,
// Files strictly larger than this many bytes are skipped.
max_file_size: u64,
}
impl FileVisitor {
    /// Sends every buffered file down the channel and leaves the buffer
    /// empty.
    ///
    /// Returns `true` when all files were delivered and `false` as soon as a
    /// send fails, which means the receiver has been dropped. In the failure
    /// case the files that were not yet sent are discarded.
    fn flush(&mut self) -> bool {
        // `drain` empties the buffer but keeps its allocation, so the
        // capacity reserved up front is reused for every batch instead of
        // being thrown away after the first flush. Dropping the drain early
        // (on `return false`) still removes the remaining elements.
        for file in self.local.drain(..) {
            if self.tx.send(file).is_err() {
                return false;
            }
        }
        true
    }
}
impl Drop for FileVisitor {
    /// Delivers whatever is still buffered when the visitor goes away, so a
    /// final batch smaller than the flush threshold is not lost.
    fn drop(&mut self) {
        // A `false` result only means the receiver is already gone; there is
        // nothing left to do about that here.
        let _ = self.flush();
    }
}
impl ParallelVisitor for FileVisitor {
    /// Filters one walk entry and buffers it when it qualifies.
    ///
    /// An entry is skipped (the walk continues) when it
    /// * is an error reported by the walker (logged as a warning),
    /// * is not a regular file,
    /// * lies inside a `.git` directory below the walk root,
    /// * has an extension on the binary/generated list,
    /// * has unreadable metadata (logged as a warning), or
    /// * is larger than the configured size limit.
    ///
    /// Returns [`WalkState::Quit`] only when a batch could not be delivered
    /// because the receiver is gone; otherwise [`WalkState::Continue`].
    fn visit(&mut self, entry: Result<DirEntry, ignore::Error>) -> WalkState {
        let entry = match entry {
            Ok(e) => e,
            Err(e) => {
                tracing::warn!("walker: entry error: {e}");
                return WalkState::Continue;
            }
        };

        // Entries without a file type, directories and anything else that is
        // not a regular file are never reported.
        match entry.file_type() {
            Some(ft) if ft.is_file() => {}
            _ => return WalkState::Continue,
        }

        let path = entry.path();

        // Only look for `.git` below the walk root. Checking the whole path
        // would also match ancestors of the root and silently drop every
        // file when the root itself lives under a `.git` directory.
        let below_root = path.strip_prefix(self.root.as_path()).unwrap_or(path);
        if below_root.components().any(|c| c.as_os_str() == ".git") {
            return WalkState::Continue;
        }

        if is_binary_extension(path) {
            return WalkState::Continue;
        }

        let meta = match entry.metadata() {
            Ok(m) => m,
            Err(e) => {
                tracing::warn!("walker: cannot read metadata for {}: {e}", path.display());
                return WalkState::Continue;
            }
        };

        let size_bytes = meta.len();
        if size_bytes > self.max_file_size {
            tracing::debug!(
                "walker: skipping large file {} ({size_bytes} bytes)",
                path.display()
            );
            return WalkState::Continue;
        }

        // Computed only for files that pass the size check. Falls back to 0
        // when the timestamp is unavailable or predates the Unix epoch.
        let mtime_secs = meta
            .modified()
            .ok()
            .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
            .map(|d| d.as_secs())
            .unwrap_or(0);

        self.local.push(WalkedFile {
            abs_path: path.to_path_buf(),
            rel_path: make_rel_path(&self.root, path),
            language: detect_language(path),
            size_bytes,
            mtime_secs,
        });

        // A failed flush means nobody is listening any more, so stop walking.
        if self.local.len() >= FLUSH_THRESHOLD && !self.flush() {
            return WalkState::Quit;
        }
        WalkState::Continue
    }
}
/// Renders `abs` relative to `root` as a string that uses `/` between
/// components on every platform.
///
/// When `abs` is not located under `root`, the full path is rendered
/// instead and a debug message is logged. Non-UTF-8 parts are converted
/// lossily.
fn make_rel_path(root: &Path, abs: &Path) -> String {
    let shown = match abs.strip_prefix(root) {
        Ok(rel) => rel,
        Err(_) => {
            tracing::debug!(
                "walker: {} is not under root {}; using absolute path",
                abs.display(),
                root.display()
            );
            abs
        }
    };
    // Replace only the platform's own separator. On Unix a backslash is an
    // ordinary file-name character, and rewriting it would make the result
    // name a different file; on Windows this still maps `\` to `/`.
    shown
        .to_string_lossy()
        .replace(std::path::MAIN_SEPARATOR, "/")
}
/// Infers the [`Language`] of `path` from its file extension.
///
/// The comparison is case-sensitive. Paths without an extension, with a
/// non-UTF-8 extension, or with an extension that is not listed map to
/// [`Language::Unknown`].
pub fn detect_language(path: &Path) -> Language {
    let ext = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext,
        None => return Language::Unknown,
    };

    match ext {
        "rs" => Language::Rust,
        "ts" | "tsx" => Language::TypeScript,
        "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
        "py" | "pyi" => Language::Python,
        "go" => Language::Go,
        "java" => Language::Java,
        // Plain `.h` headers are attributed to C.
        "c" | "h" => Language::C,
        "cpp" | "cc" | "cxx" | "hpp" | "hxx" | "hh" => Language::Cpp,
        "rb" => Language::Ruby,
        "scala" | "sc" => Language::Scala,
        "ex" | "exs" => Language::Elixir,
        "hs" | "lhs" => Language::Haskell,
        _ => Language::Unknown,
    }
}
/// Returns `true` when the extension of `path` marks a file the walker
/// should not report: images, object code and libraries, archives, media,
/// fonts, lock and snapshot files, databases and PDFs.
///
/// The extension is compared case-insensitively (ASCII), so `IMAGE.PNG` is
/// treated like `image.png`. Paths without an extension or with a non-UTF-8
/// extension yield `false`.
fn is_binary_extension(path: &Path) -> bool {
    let ext = match path.extension().and_then(|e| e.to_str()) {
        // Normalise case so upper- and mixed-case extensions are caught too.
        Some(ext) => ext.to_ascii_lowercase(),
        None => return false,
    };

    matches!(
        ext.as_str(),
        "png" | "jpg" | "jpeg" | "gif" | "ico" | "webp" | "bmp" | "tiff"
            | "o" | "a" | "so" | "dylib" | "dll" | "exe" | "wasm"
            | "class" | "jar"
            | "zip" | "tar" | "gz" | "bz2" | "xz" | "7z"
            | "mp3" | "mp4" | "wav" | "avi" | "mkv" | "mov"
            | "ttf" | "woff" | "woff2" | "otf" | "eot"
            | "lock" | "snap"
            | "db" | "sqlite" | "sqlite3"
            | "pdf"
    )
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes `content` to `rel` below `dir`, creating missing parent
    /// directories first.
    fn write(dir: &Path, rel: &str, content: &str) {
        let target = dir.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, content).unwrap();
    }

    /// Returns the relative paths of `files` in sorted order.
    fn rel_paths(files: &[WalkedFile]) -> Vec<&str> {
        let mut sorted = Vec::with_capacity(files.len());
        sorted.extend(files.iter().map(|file| file.rel_path.as_str()));
        sorted.sort_unstable();
        sorted
    }

    /// Asserts that every name in `names` is detected as `expected`.
    fn assert_language(expected: Language, names: &[&str]) {
        for &name in names {
            assert_eq!(detect_language(Path::new(name)), expected);
        }
    }

    #[test]
    fn walk_returns_all_source_files() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        write(root, "src/main.rs", "fn main() {}");
        write(root, "src/lib.py", "def foo(): pass");
        write(root, "app/index.ts", "export {}");

        let found = Walker::new(root).walk().unwrap();
        let names = rel_paths(&found);
        for expected in ["app/index.ts", "src/lib.py", "src/main.rs"] {
            assert!(names.contains(&expected));
        }
        assert_eq!(found.len(), 3);
    }

    #[test]
    fn walk_output_is_sorted_by_rel_path() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        for name in ["z.rs", "a.rs", "m.rs"] {
            write(root, name, "");
        }

        let found = Walker::new(root).walk().unwrap();
        let order: Vec<&str> = found.iter().map(|file| file.rel_path.as_str()).collect();
        assert_eq!(order, vec!["a.rs", "m.rs", "z.rs"]);
    }

    #[test]
    fn walk_empty_dir_returns_empty_vec() {
        let tmp = TempDir::new().unwrap();
        let found = Walker::new(tmp.path()).walk().unwrap();
        assert!(found.is_empty());
    }

    #[test]
    fn walk_nested_dirs_have_correct_rel_path() {
        let tmp = TempDir::new().unwrap();
        write(tmp.path(), "a/b/c/deep.rs", "");

        let found = Walker::new(tmp.path()).walk().unwrap();
        assert_eq!(found.len(), 1);
        let only = &found[0];
        assert_eq!(only.rel_path, "a/b/c/deep.rs");
    }

    #[test]
    fn walk_rel_path_does_not_start_with_slash() {
        let tmp = TempDir::new().unwrap();
        write(tmp.path(), "src/foo.rs", "");

        let found = Walker::new(tmp.path()).walk().unwrap();
        assert_eq!(found.len(), 1);
        let only = &found[0];
        assert!(!only.rel_path.starts_with('/'));
    }

    #[test]
    fn walk_respects_gitignore() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        fs::create_dir(root.join(".git")).unwrap();
        write(root, ".gitignore", "ignored.rs\ntarget/\n");
        write(root, "kept.rs", "");
        write(root, "ignored.rs", "");
        write(root, "target/debug/binary", "");

        let found = Walker::new(root).walk().unwrap();
        let names = rel_paths(&found);
        assert!(names.contains(&"kept.rs"));
        assert!(
            !names.contains(&"ignored.rs"),
            "ignored.rs should be excluded by .gitignore"
        );
        assert!(
            !names.iter().any(|name| name.starts_with("target/")),
            "target/ should be excluded by .gitignore"
        );
    }

    #[test]
    fn walk_excludes_files_over_size_limit() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        fs::write(root.join("big.rs"), vec![b'x'; 513]).unwrap();
        write(root, "small.rs", "fn main() {}");

        let found = Walker::new(root).max_file_size(512).walk().unwrap();
        let names = rel_paths(&found);
        assert!(names.contains(&"small.rs"));
        assert!(
            !names.contains(&"big.rs"),
            "big.rs should be excluded by size limit"
        );
    }

    #[test]
    fn walk_includes_file_exactly_at_size_limit() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join("exact.rs"), vec![b'x'; 512]).unwrap();

        let found = Walker::new(tmp.path()).max_file_size(512).walk().unwrap();
        assert_eq!(
            found.len(),
            1,
            "file at exact size limit should be included"
        );
    }

    #[test]
    fn walk_excludes_binary_extensions() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        write(root, "image.png", "not really a png");
        write(root, "archive.zip", "not really a zip");
        write(root, "lib.so", "");
        write(root, "Cargo.lock", "generated");
        write(root, "source.rs", "fn main() {}");

        let found = Walker::new(root).walk().unwrap();
        let names = rel_paths(&found);
        assert!(names.contains(&"source.rs"));
        for excluded in ["image.png", "archive.zip", "lib.so", "Cargo.lock"] {
            assert!(!names.contains(&excluded));
        }
    }

    #[test]
    fn walk_does_not_yield_directories() {
        let tmp = TempDir::new().unwrap();
        fs::create_dir(tmp.path().join("subdir")).unwrap();
        write(tmp.path(), "subdir/file.rs", "");

        let found = Walker::new(tmp.path()).walk().unwrap();
        for file in &found {
            assert!(
                file.abs_path.is_file(),
                "walker yielded a directory: {}",
                file.rel_path
            );
        }
    }

    #[test]
    fn walk_channel_and_walk_return_same_files() {
        let tmp = TempDir::new().unwrap();
        write(tmp.path(), "a.rs", "");
        write(tmp.path(), "b.py", "");
        write(tmp.path(), "c.ts", "");
        let walker = Walker::new(tmp.path());

        let receiver = walker.walk_channel().unwrap();
        let mut streamed: Vec<String> = receiver.into_iter().map(|file| file.rel_path).collect();
        streamed.sort_unstable();

        let batched: Vec<String> = walker
            .walk()
            .unwrap()
            .into_iter()
            .map(|file| file.rel_path)
            .collect();

        assert_eq!(streamed, batched);
    }

    #[test]
    fn walk_errors_on_nonexistent_root() {
        let outcome = Walker::new("/nonexistent/path/that/does/not/exist").walk();
        assert!(outcome.is_err());
    }

    #[test]
    fn walk_size_bytes_is_accurate() {
        let tmp = TempDir::new().unwrap();
        let content = "fn main() { println!(\"hello\"); }";
        write(tmp.path(), "main.rs", content);

        let found = Walker::new(tmp.path()).walk().unwrap();
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].size_bytes, content.len() as u64);
    }

    #[test]
    fn detect_language_rust() {
        assert_language(Language::Rust, &["foo.rs"]);
    }

    #[test]
    fn detect_language_typescript() {
        assert_language(Language::TypeScript, &["app.ts", "comp.tsx"]);
    }

    #[test]
    fn detect_language_javascript() {
        assert_language(Language::JavaScript, &["index.js", "mod.mjs", "cjs.cjs"]);
    }

    #[test]
    fn detect_language_python() {
        assert_language(Language::Python, &["main.py", "types.pyi"]);
    }

    #[test]
    fn detect_language_go() {
        assert_language(Language::Go, &["main.go"]);
    }

    #[test]
    fn detect_language_java() {
        assert_language(Language::Java, &["Main.java"]);
    }

    #[test]
    fn detect_language_c() {
        assert_language(Language::C, &["main.c", "header.h"]);
    }

    #[test]
    fn detect_language_cpp() {
        assert_language(
            Language::Cpp,
            &[
                "main.cpp",
                "util.cc",
                "lib.cxx",
                "header.hpp",
                "tmpl.hxx",
                "types.hh",
            ],
        );
    }

    #[test]
    fn detect_language_ruby() {
        assert_language(Language::Ruby, &["app.rb"]);
    }

    #[test]
    fn detect_language_scala() {
        assert_language(Language::Scala, &["Main.scala", "script.sc"]);
    }

    #[test]
    fn detect_language_elixir() {
        assert_language(Language::Elixir, &["app.ex", "test.exs"]);
    }

    #[test]
    fn detect_language_haskell() {
        assert_language(Language::Haskell, &["Main.hs", "Literate.lhs"]);
    }

    #[test]
    fn detect_language_unknown_for_config_and_text() {
        assert_language(
            Language::Unknown,
            &[
                "Cargo.toml",
                "README.md",
                "script.sh",
                ".env",
                "no_extension",
            ],
        );
    }

    #[test]
    fn binary_extensions_are_excluded() {
        let expected_binary = [
            "image.png",
            "photo.jpg",
            "archive.zip",
            "lib.so",
            "binary.exe",
            "module.wasm",
            "Cargo.lock",
            "yarn.lock",
            "snapshot.snap",
            "data.db",
            "doc.pdf",
        ];
        for name in expected_binary {
            let flagged = is_binary_extension(Path::new(name));
            assert!(flagged, "{name} should be detected as binary");
        }
    }

    #[test]
    fn source_extensions_are_not_binary() {
        let expected_text = [
            "main.rs",
            "app.py",
            "index.ts",
            "main.go",
            "package.json",
            "Cargo.toml",
            "README.md",
            "style.css",
            "image.svg",
        ];
        for name in expected_text {
            let flagged = is_binary_extension(Path::new(name));
            assert!(!flagged, "{name} should not be detected as binary");
        }
    }
}