use crate::utils::Config;
use crate::utils::path::path_stays_within_root;
use crossbeam_channel::{Receiver, Sender, bounded};
use ignore::{WalkBuilder, WalkState, overrides::OverrideBuilder};
use std::thread::JoinHandle;
use std::{
mem,
path::{Path, PathBuf},
thread,
};
/// A batch of file paths; this is the message type carried by the walker channel.
type Paths = Vec<PathBuf>;
/// Buffers paths and forwards them over a channel in batches instead of one
/// message per path.
struct BatchSender {
/// Channel end that completed batches are sent on.
tx: Sender<Paths>,
/// Paths collected since the last flush.
batch: Paths,
/// Number of buffered paths at which `push_path` triggers a flush.
batch_size: usize,
}
impl BatchSender {
    /// Creates a sender that buffers up to `batch_size` paths before handing
    /// them to `tx` as a single message.
    ///
    /// The internal buffer is pre-allocated for one full batch.
    fn new(tx: Sender<Paths>, batch_size: usize) -> Self {
        Self {
            tx,
            batch: Vec::with_capacity(batch_size),
            batch_size,
        }
    }

    /// Appends `path` to the pending batch and sends the batch once it holds
    /// at least `batch_size` paths.
    ///
    /// With a `batch_size` of 0 every path is sent as its own batch.
    fn push_path(&mut self, path: PathBuf) {
        self.batch.push(path);
        if self.batch.len() >= self.batch_size {
            self.flush();
            // `flush` moves the buffer out with `mem::take`, which leaves an
            // empty Vec with no capacity. Reserve room for the next batch up
            // front so it does not regrow push by push. This is done here
            // rather than in `flush` so the final flush on drop does not
            // allocate a buffer that is never used.
            self.batch.reserve(self.batch_size);
        }
    }

    /// Sends the pending batch, if any, and leaves the buffer empty.
    ///
    /// A send only fails when the receiving side has been dropped. The batch
    /// is discarded in that case; this is best-effort by design, so the
    /// failure is logged rather than propagated.
    fn flush(&mut self) {
        if self.batch.is_empty() {
            return;
        }
        tracing::debug!(n_paths = self.batch.len(), "flushing batch");
        if self.tx.send(mem::take(&mut self.batch)).is_err() {
            tracing::debug!("receiver dropped; discarding batch");
        }
    }
}
/// Sends any paths still buffered when the sender is dropped, so a final
/// partial batch is not lost.
impl Drop for BatchSender {
fn drop(&mut self) {
// `flush` does nothing when the batch is empty.
self.flush();
}
}
/// Builds the walker's override set from the scanner's exclusion lists.
///
/// Each configured entry becomes a `!`-prefixed glob rooted at `root`:
///
/// * extensions  -> `!*.{ext}`
/// * directories -> `!**/{dir}/**`
/// * files       -> `!{file}`
///
/// Entries are normalized before the glob is built, so common spellings do
/// not silently produce a pattern that never matches:
///
/// * surrounding whitespace is removed from every entry;
/// * leading dots are removed from extensions (`".txt"` behaves like `"txt"`);
/// * leading/trailing `/` are removed from directories (`"vendor/"` behaves
///   like `"vendor"`);
/// * entries that are empty after normalization are skipped.
///
/// An entry that is rejected by the builder is logged and skipped. If the
/// final set cannot be built, the error is logged and an empty override set
/// is returned, i.e. nothing is excluded.
fn build_overrides(root: &Path, cfg: &Config) -> ignore::overrides::Override {
    let mut ob = OverrideBuilder::new(root);

    for ext in &cfg.scanner.excluded_extensions {
        let bare = ext.trim().trim_start_matches('.');
        if bare.is_empty() {
            continue;
        }
        if let Err(e) = ob.add(&format!("!*.{bare}")) {
            tracing::warn!("invalid exclude‐extension pattern ‘{ext}’: {e}");
        }
    }

    for dir in &cfg.scanner.excluded_directories {
        // A trailing or leading slash would yield `//` inside the glob.
        let name = dir.trim().trim_matches('/');
        if name.is_empty() {
            continue;
        }
        if let Err(e) = ob.add(&format!("!**/{name}/**")) {
            tracing::warn!("invalid exclude‐dir pattern ‘{dir}’: {e}");
        }
    }

    for file in &cfg.scanner.excluded_files {
        let name = file.trim();
        if name.is_empty() {
            // A bare `!` is not a meaningful exclusion; skip it.
            continue;
        }
        if let Err(e) = ob.add(&format!("!{name}")) {
            tracing::warn!("invalid exclude‐file pattern ‘{file}’: {e}");
        }
    }

    ob.build().unwrap_or_else(|e| {
        tracing::error!("failed to build ignore overrides: {e}");
        ignore::overrides::Override::empty()
    })
}
pub fn spawn_file_walker(root: &Path, cfg: &Config) -> (Receiver<Paths>, JoinHandle<()>) {
let _span = tracing::info_span!("spawn_file_walker", root = %root.display()).entered();
let overrides = build_overrides(root, cfg);
let workers = cfg.performance.worker_threads.unwrap_or(num_cpus::get());
let (tx, rx) = bounded::<Paths>(workers * cfg.performance.channel_multiplier);
let root = root.to_path_buf();
let canonical_root = std::fs::canonicalize(&root).ok();
let scan_hidden = cfg.scanner.scan_hidden_files;
let follow = cfg.scanner.follow_symlinks;
let max_bytes = cfg.scanner.max_file_size_mb.unwrap_or(0) * 1_048_576;
let batch_size = cfg.performance.batch_size;
let max_depth = cfg.performance.max_depth;
let same_file_system = cfg.scanner.one_file_system;
let require_git = cfg.scanner.require_git_to_read_vcsignore;
let handle = thread::spawn(move || {
tracing::info!(
root = ?root,
workers = workers,
scan_hidden = scan_hidden,
follow_links = follow,
max_bytes = max_bytes,
batch_size = batch_size,
"starting directory walk"
);
let mut builder = WalkBuilder::new(root);
builder
.hidden(!scan_hidden)
.follow_links(follow)
.threads(workers)
.overrides(overrides)
.same_file_system(same_file_system)
.require_git(require_git);
if let Some(depth) = max_depth {
builder.max_depth(Some(depth));
}
builder
.filter_entry(|e| {
e.file_type()
.map(|ft| ft.is_dir() || ft.is_file())
.unwrap_or(true)
})
.build_parallel()
.run(move || {
let mut bs = BatchSender::new(tx.clone(), batch_size);
let canonical_root = canonical_root.clone();
Box::new(move |entry| {
if let Ok(e) = entry {
let metadata = match e.metadata() {
Ok(metadata) => metadata,
Err(_) => return WalkState::Continue,
};
let is_file = metadata.file_type().is_file();
let under_limit = max_bytes == 0 || metadata.len() <= max_bytes;
let path_allowed = canonical_root.as_ref().is_none_or(|root| {
path_stays_within_root(root, e.path()).unwrap_or(false)
});
if is_file && under_limit && path_allowed {
bs.push_path(e.into_path());
}
}
WalkState::Continue
})
});
tracing::info!("directory walk complete");
});
(rx, handle)
}
/// Files whose extension is listed in `excluded_extensions` must not be reported.
#[test]
fn walker_respects_excluded_extensions() {
    let tmp = tempfile::tempdir().unwrap();
    std::fs::write(tmp.path().join("keep.rs"), "fn main(){}").unwrap();
    std::fs::write(tmp.path().join("skip.txt"), "ignored").unwrap();

    let mut cfg = Config::default();
    cfg.scanner.excluded_extensions = vec!["txt".into()];
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 2;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    // Drain before joining: the channel is bounded, so joining first would
    // deadlock as soon as the walker has more batches than the channel holds.
    let all: Vec<_> = rx.into_iter().flatten().collect();
    // A panicking walker must fail the test instead of only being logged.
    handle.join().expect("walker thread panicked");

    assert!(all.iter().any(|p| p.ends_with("keep.rs")));
    assert!(all.iter().all(|p| !p.ends_with("skip.txt")));
}
/// Files below a directory listed in `excluded_directories` must not be reported.
#[test]
fn walker_respects_excluded_directories() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    std::fs::write(root.join("keep.rs"), "fn main(){}").unwrap();
    let vendor = root.join("vendor");
    std::fs::create_dir(&vendor).unwrap();
    std::fs::write(vendor.join("dep.rs"), "fn dep(){}").unwrap();

    let mut cfg = Config::default();
    cfg.scanner.excluded_directories = vec!["vendor".into()];
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(root, &cfg);
    // Drain before joining: the channel is bounded, so joining first would
    // deadlock as soon as the walker has more batches than the channel holds.
    let all: Vec<_> = rx.into_iter().flatten().collect();
    // Surface a walker panic instead of discarding it.
    handle.join().expect("walker thread panicked");

    assert!(all.iter().any(|p| p.ends_with("keep.rs")));
    assert!(
        all.iter().all(|p| !p.starts_with(&vendor)),
        "vendor dir files should be excluded: {all:?}"
    );
}
/// Files named in `excluded_files` must not be reported.
#[test]
fn walker_respects_excluded_files() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    std::fs::write(root.join("keep.rs"), "fn a(){}").unwrap();
    std::fs::write(root.join("skip.rs"), "fn b(){}").unwrap();

    let mut cfg = Config::default();
    cfg.scanner.excluded_files = vec!["skip.rs".into()];
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(root, &cfg);
    // Drain before joining: the channel is bounded, so joining first would
    // deadlock as soon as the walker has more batches than the channel holds.
    let all: Vec<_> = rx.into_iter().flatten().collect();
    // Surface a walker panic instead of discarding it.
    handle.join().expect("walker thread panicked");

    assert!(all.iter().any(|p| p.ends_with("keep.rs")));
    assert!(all.iter().all(|p| !p.ends_with("skip.rs")));
}
/// Files larger than `max_file_size_mb` must not be reported.
#[test]
fn walker_respects_max_file_size() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    std::fs::write(root.join("small.rs"), "fn s(){}").unwrap();
    // 2 MiB, twice the 1 MiB limit configured below.
    let big_data = vec![b'x'; 2 * 1_048_576];
    std::fs::write(root.join("big.rs"), big_data).unwrap();

    let mut cfg = Config::default();
    cfg.scanner.max_file_size_mb = Some(1);
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(root, &cfg);
    // Drain before joining: the channel is bounded, so joining first would
    // deadlock as soon as the walker has more batches than the channel holds.
    let all: Vec<_> = rx.into_iter().flatten().collect();
    // Surface a walker panic instead of discarding it.
    handle.join().expect("walker thread panicked");

    assert!(all.iter().any(|p| p.ends_with("small.rs")));
    assert!(
        all.iter().all(|p| !p.ends_with("big.rs")),
        "file exceeding size limit should be excluded: {all:?}"
    );
}
/// Walking an empty directory must complete normally and report nothing.
#[test]
fn walker_returns_empty_on_empty_directory() {
    let tmp = tempfile::tempdir().unwrap();

    let mut cfg = Config::default();
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    let all: Vec<_> = rx.into_iter().flatten().collect();
    // A walker that panicked also yields no paths; without this check the
    // emptiness assertion below would pass for the wrong reason.
    handle.join().expect("walker thread panicked");

    assert!(all.is_empty(), "empty directory should yield no files");
}
/// With symlink following enabled, a link that points outside the root must
/// not be reported, while ordinary files inside the root still are.
#[cfg(unix)]
#[test]
fn walker_follow_symlinks_does_not_escape_root() {
    use std::os::unix::fs::symlink;

    let tmp = tempfile::tempdir().unwrap();
    let outside = tempfile::tempdir().unwrap();
    let outside_file = outside.path().join("secret.rs");
    std::fs::write(&outside_file, "fn leaked() {}").unwrap();
    let link = tmp.path().join("escape.rs");
    symlink(&outside_file, &link).unwrap();
    // Control file: proves the walker actually ran and reported something, so
    // the "not reported" assertion cannot pass on an empty result.
    std::fs::write(tmp.path().join("keep.rs"), "fn main(){}").unwrap();

    let mut cfg = Config::default();
    cfg.scanner.follow_symlinks = true;
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    // Drain before joining: the channel is bounded, so joining first would
    // deadlock as soon as the walker has more batches than the channel holds.
    let all: Vec<_> = rx.into_iter().flatten().collect();
    handle.join().expect("walker thread panicked");

    assert!(
        all.iter().any(|path| path.ends_with("keep.rs")),
        "files inside the root should still be scanned: {all:?}"
    );
    assert!(
        all.iter().all(|path| path != &link),
        "symlink escapes must not be scanned: {all:?}"
    );
}
/// With symlink following disabled, neither the link nor its target outside
/// the root may be reported, while ordinary files inside the root still are.
#[cfg(unix)]
#[test]
fn walker_no_follow_symlinks_still_rejects_outside_paths() {
    use std::os::unix::fs::symlink;

    let tmp = tempfile::tempdir().unwrap();
    let outside = tempfile::tempdir().unwrap();
    let outside_file = outside.path().join("secret.rs");
    std::fs::write(&outside_file, "fn leaked() {}").unwrap();
    let link = tmp.path().join("escape.rs");
    symlink(&outside_file, &link).unwrap();
    // Control file: proves the walker actually ran and reported something, so
    // the "not reported" assertion cannot pass on an empty result.
    std::fs::write(tmp.path().join("keep.rs"), "fn main(){}").unwrap();

    let mut cfg = Config::default();
    cfg.scanner.follow_symlinks = false;
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    // Drain before joining: the channel is bounded, so joining first would
    // deadlock as soon as the walker has more batches than the channel holds.
    let all: Vec<_> = rx.into_iter().flatten().collect();
    handle.join().expect("walker thread panicked");

    assert!(
        all.iter().any(|path| path.ends_with("keep.rs")),
        "files inside the root should still be scanned: {all:?}"
    );
    assert!(
        all.iter()
            .all(|path| !path.starts_with(outside.path()) && path != &link),
        "symlink target outside root must not be scanned: {all:?}"
    );
}