code_analyze_core/
traversal.rs1use ignore::WalkBuilder;
9use std::collections::HashSet;
10use std::path::{Path, PathBuf};
11use std::process::Command;
12use std::sync::{Arc, Mutex};
13use std::time::Instant;
14use thiserror::Error;
15use tracing::instrument;
16
17#[derive(Debug, Clone)]
18pub struct WalkEntry {
19 pub path: PathBuf,
20 pub depth: usize,
22 pub is_dir: bool,
23 pub is_symlink: bool,
24 pub symlink_target: Option<PathBuf>,
25}
26
27#[derive(Debug, Error)]
28#[non_exhaustive]
29pub enum TraversalError {
30 #[error("IO error: {0}")]
31 Io(#[from] std::io::Error),
32 #[error("internal concurrency error: {0}")]
33 Internal(String),
34 #[error("git error: {0}")]
35 GitError(String),
36}
37
38#[instrument(skip_all, fields(path = %root.display(), max_depth))]
42pub fn walk_directory(
43 root: &Path,
44 max_depth: Option<u32>,
45) -> Result<Vec<WalkEntry>, TraversalError> {
46 let start = Instant::now();
47 let mut builder = WalkBuilder::new(root);
48 builder.hidden(true).standard_filters(true);
49
50 if let Some(depth) = max_depth
52 && depth > 0
53 {
54 builder.max_depth(Some(depth as usize));
55 }
56
57 let entries = Arc::new(Mutex::new(Vec::new()));
58 let entries_clone = Arc::clone(&entries);
59
60 builder.build_parallel().run(move || {
61 let entries = Arc::clone(&entries_clone);
62 Box::new(move |result| match result {
63 Ok(entry) => {
64 let path = entry.path().to_path_buf();
65 let depth = entry.depth();
66 let is_dir = entry.file_type().is_some_and(|ft| ft.is_dir());
67 let is_symlink = entry.path_is_symlink();
68
69 let symlink_target = if is_symlink {
70 std::fs::read_link(&path).ok()
71 } else {
72 None
73 };
74
75 let walk_entry = WalkEntry {
76 path,
77 depth,
78 is_dir,
79 is_symlink,
80 symlink_target,
81 };
82 let Ok(mut guard) = entries.lock() else {
83 tracing::debug!("mutex poisoned in parallel walker, skipping entry");
84 return ignore::WalkState::Skip;
85 };
86 guard.push(walk_entry);
87 ignore::WalkState::Continue
88 }
89 Err(e) => {
90 tracing::warn!(error = %e, "skipping unreadable entry");
91 ignore::WalkState::Continue
92 }
93 })
94 });
95
96 let mut entries = Arc::try_unwrap(entries)
97 .map_err(|_| {
98 TraversalError::Internal("arc unwrap failed: strong references still live".to_string())
99 })?
100 .into_inner()
101 .map_err(|_| TraversalError::Internal("mutex poisoned".to_string()))?;
102
103 let dir_count = entries.iter().filter(|e| e.is_dir).count();
104 let file_count = entries.iter().filter(|e| !e.is_dir).count();
105
106 tracing::debug!(
107 entries = entries.len(),
108 dirs = dir_count,
109 files = file_count,
110 duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
111 "walk complete"
112 );
113
114 entries.sort_by(|a, b| a.path.cmp(&b.path));
116 Ok(entries)
117}
118
119pub fn changed_files_from_git_ref(
127 dir: &Path,
128 git_ref: &str,
129) -> Result<HashSet<PathBuf>, TraversalError> {
130 let root_out = Command::new("git")
133 .arg("-C")
134 .arg(dir)
135 .arg("rev-parse")
136 .arg("--show-toplevel")
137 .output()
138 .map_err(|e| {
139 if e.kind() == std::io::ErrorKind::NotFound {
140 TraversalError::GitError("git not found on PATH".to_string())
141 } else {
142 TraversalError::GitError(format!("failed to run git: {e}"))
143 }
144 })?;
145
146 if !root_out.status.success() {
147 let stderr = String::from_utf8_lossy(&root_out.stderr);
148 return Err(TraversalError::GitError(format!(
149 "not a git repository: {stderr}"
150 )));
151 }
152
153 let root_raw = PathBuf::from(String::from_utf8_lossy(&root_out.stdout).trim().to_string());
154 let root = std::fs::canonicalize(&root_raw).unwrap_or(root_raw);
157
158 let diff_out = Command::new("git")
160 .arg("-C")
161 .arg(dir)
162 .arg("diff")
163 .arg("--name-only")
164 .arg(git_ref)
165 .output()
166 .map_err(|e| TraversalError::GitError(format!("failed to run git diff: {e}")))?;
167
168 if !diff_out.status.success() {
169 let stderr = String::from_utf8_lossy(&diff_out.stderr);
170 return Err(TraversalError::GitError(format!(
171 "git diff failed: {stderr}"
172 )));
173 }
174
175 let changed: HashSet<PathBuf> = String::from_utf8_lossy(&diff_out.stdout)
176 .lines()
177 .filter(|l| !l.is_empty())
178 .map(|l| root.join(l))
179 .collect();
180
181 Ok(changed)
182}
183
184#[must_use]
190pub fn filter_entries_by_git_ref(
191 entries: Vec<WalkEntry>,
192 changed: &HashSet<PathBuf>,
193 root: &Path,
194) -> Vec<WalkEntry> {
195 let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
196
197 let canonical_changed: HashSet<PathBuf> = changed
199 .iter()
200 .map(|p| std::fs::canonicalize(p).unwrap_or_else(|_| p.clone()))
201 .collect();
202
203 let mut ancestor_dirs: HashSet<PathBuf> = HashSet::new();
205 ancestor_dirs.insert(canonical_root.clone());
206 for p in &canonical_changed {
207 let mut cur = p.as_path();
208 while let Some(parent) = cur.parent() {
209 if !ancestor_dirs.insert(parent.to_path_buf()) {
210 break;
212 }
213 if parent == canonical_root {
214 break;
215 }
216 cur = parent;
217 }
218 }
219
220 entries
221 .into_iter()
222 .filter(|e| {
223 let canonical_path = std::fs::canonicalize(&e.path).unwrap_or_else(|_| e.path.clone());
224 if e.is_dir {
225 ancestor_dirs.contains(&canonical_path)
226 } else {
227 canonical_changed.contains(&canonical_path)
228 }
229 })
230 .collect()
231}
232
233#[must_use]
238pub fn subtree_counts_from_entries(root: &Path, entries: &[WalkEntry]) -> Vec<(PathBuf, usize)> {
239 let mut counts: Vec<(PathBuf, usize)> = Vec::new();
240 for entry in entries {
241 if entry.is_dir {
242 continue;
243 }
244 if entry.path.components().any(|c| {
246 let s = c.as_os_str().to_string_lossy();
247 crate::EXCLUDED_DIRS.contains(&s.as_ref())
248 }) {
249 continue;
250 }
251 let Ok(rel) = entry.path.strip_prefix(root) else {
252 continue;
253 };
254 if let Some(first) = rel.components().next() {
255 let key = root.join(first);
256 match counts.last_mut() {
257 Some(last) if last.0 == key => last.1 += 1,
258 _ => counts.push((key, 1)),
259 }
260 }
261 }
262 counts
263}