use ignore::WalkBuilder;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use thiserror::Error;
use tracing::instrument;
#[derive(Debug, Clone)]
pub struct WalkEntry {
pub path: PathBuf,
pub depth: usize,
pub is_dir: bool,
pub is_symlink: bool,
pub symlink_target: Option<PathBuf>,
}
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum TraversalError {
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
#[error("internal concurrency error: {0}")]
Internal(String),
#[error("git error: {0}")]
GitError(String),
}
#[instrument(skip_all, fields(path = %root.display(), max_depth))]
pub fn walk_directory(
root: &Path,
max_depth: Option<u32>,
) -> Result<Vec<WalkEntry>, TraversalError> {
let start = Instant::now();
let mut builder = WalkBuilder::new(root);
builder.hidden(true).standard_filters(true);
if let Some(depth) = max_depth
&& depth > 0
{
builder.max_depth(Some(depth as usize));
}
let entries = Arc::new(Mutex::new(Vec::new()));
let entries_clone = Arc::clone(&entries);
builder.build_parallel().run(move || {
let entries = Arc::clone(&entries_clone);
Box::new(move |result| match result {
Ok(entry) => {
let path = entry.path().to_path_buf();
let depth = entry.depth();
let is_dir = entry.file_type().is_some_and(|ft| ft.is_dir());
let is_symlink = entry.path_is_symlink();
let symlink_target = if is_symlink {
std::fs::read_link(&path).ok()
} else {
None
};
let walk_entry = WalkEntry {
path,
depth,
is_dir,
is_symlink,
symlink_target,
};
let Ok(mut guard) = entries.lock() else {
tracing::debug!("mutex poisoned in parallel walker, skipping entry");
return ignore::WalkState::Skip;
};
guard.push(walk_entry);
ignore::WalkState::Continue
}
Err(e) => {
tracing::warn!(error = %e, "skipping unreadable entry");
ignore::WalkState::Continue
}
})
});
let mut entries = Arc::try_unwrap(entries)
.map_err(|_| {
TraversalError::Internal("arc unwrap failed: strong references still live".to_string())
})?
.into_inner()
.map_err(|_| TraversalError::Internal("mutex poisoned".to_string()))?;
let dir_count = entries.iter().filter(|e| e.is_dir).count();
let file_count = entries.iter().filter(|e| !e.is_dir).count();
tracing::debug!(
entries = entries.len(),
dirs = dir_count,
files = file_count,
duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
"walk complete"
);
entries.sort_by(|a, b| a.path.cmp(&b.path));
Ok(entries)
}
pub fn changed_files_from_git_ref(
dir: &Path,
git_ref: &str,
) -> Result<HashSet<PathBuf>, TraversalError> {
let root_out = Command::new("git")
.arg("-C")
.arg(dir)
.arg("rev-parse")
.arg("--show-toplevel")
.output()
.map_err(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
TraversalError::GitError("git not found on PATH".to_string())
} else {
TraversalError::GitError(format!("failed to run git: {e}"))
}
})?;
if !root_out.status.success() {
let stderr = String::from_utf8_lossy(&root_out.stderr);
return Err(TraversalError::GitError(format!(
"not a git repository: {stderr}"
)));
}
let root_raw = PathBuf::from(String::from_utf8_lossy(&root_out.stdout).trim().to_string());
let root = std::fs::canonicalize(&root_raw).unwrap_or(root_raw);
let diff_out = Command::new("git")
.arg("-C")
.arg(dir)
.arg("diff")
.arg("--name-only")
.arg(git_ref)
.output()
.map_err(|e| TraversalError::GitError(format!("failed to run git diff: {e}")))?;
if !diff_out.status.success() {
let stderr = String::from_utf8_lossy(&diff_out.stderr);
return Err(TraversalError::GitError(format!(
"git diff failed: {stderr}"
)));
}
let changed: HashSet<PathBuf> = String::from_utf8_lossy(&diff_out.stdout)
.lines()
.filter(|l| !l.is_empty())
.map(|l| root.join(l))
.collect();
Ok(changed)
}
#[must_use]
pub fn filter_entries_by_git_ref(
entries: Vec<WalkEntry>,
changed: &HashSet<PathBuf>,
root: &Path,
) -> Vec<WalkEntry> {
let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
let canonical_changed: HashSet<PathBuf> = changed
.iter()
.map(|p| std::fs::canonicalize(p).unwrap_or_else(|_| p.clone()))
.collect();
let mut ancestor_dirs: HashSet<PathBuf> = HashSet::new();
ancestor_dirs.insert(canonical_root.clone());
for p in &canonical_changed {
let mut cur = p.as_path();
while let Some(parent) = cur.parent() {
if !ancestor_dirs.insert(parent.to_path_buf()) {
break;
}
if parent == canonical_root {
break;
}
cur = parent;
}
}
entries
.into_iter()
.filter(|e| {
let canonical_path = std::fs::canonicalize(&e.path).unwrap_or_else(|_| e.path.clone());
if e.is_dir {
ancestor_dirs.contains(&canonical_path)
} else {
canonical_changed.contains(&canonical_path)
}
})
.collect()
}
#[must_use]
pub fn subtree_counts_from_entries(root: &Path, entries: &[WalkEntry]) -> Vec<(PathBuf, usize)> {
let mut counts: Vec<(PathBuf, usize)> = Vec::new();
for entry in entries {
if entry.is_dir {
continue;
}
if entry.path.components().any(|c| {
let s = c.as_os_str().to_string_lossy();
crate::EXCLUDED_DIRS.contains(&s.as_ref())
}) {
continue;
}
let Ok(rel) = entry.path.strip_prefix(root) else {
continue;
};
if let Some(first) = rel.components().next() {
let key = root.join(first);
match counts.last_mut() {
Some(last) if last.0 == key => last.1 += 1,
_ => counts.push((key, 1)),
}
}
}
counts
}