1use ignore::WalkBuilder;
9use std::collections::HashSet;
10use std::path::{Path, PathBuf};
11use std::process::Command;
12use std::sync::Arc;
13use std::sync::atomic::{AtomicUsize, Ordering};
14use std::time::{Instant, SystemTime};
15use thiserror::Error;
16use tracing::instrument;
17
18pub const MAX_WALK_ENTRIES: usize = 50_000;
19
20#[derive(Debug, Clone)]
21pub struct WalkEntry {
22 pub path: PathBuf,
23 pub depth: usize,
25 pub is_dir: bool,
26 pub is_symlink: bool,
27 pub symlink_target: Option<PathBuf>,
28 pub mtime: Option<SystemTime>,
29 pub canonical_path: PathBuf,
30}
31
32#[derive(Debug, Error)]
33#[non_exhaustive]
34pub enum TraversalError {
35 #[error("IO error: {0}")]
36 Io(#[from] std::io::Error),
37 #[error("internal concurrency error: {0}")]
38 Internal(String),
39 #[error("git error: {0}")]
40 GitError(String),
41}
42
43#[instrument(skip_all, fields(path = %root.display(), max_depth))]
47pub fn walk_directory(
48 root: &Path,
49 max_depth: Option<u32>,
50) -> Result<Vec<WalkEntry>, TraversalError> {
51 let start = Instant::now();
52
53 if max_depth == Some(1) {
55 return walk_directory_sync(root, start);
56 }
57
58 let mut builder = WalkBuilder::new(root);
59 builder.hidden(true).standard_filters(true);
60
61 let max_depth_usize = if let Some(depth) = max_depth
63 && depth > 0
64 {
65 builder.max_depth(Some(depth as usize));
66 Some(depth as usize)
67 } else {
68 None
69 };
70
71 let (sender, receiver) = std::sync::mpsc::channel::<WalkEntry>();
72 let entry_count = Arc::new(AtomicUsize::new(0));
73
74 builder.build_parallel().run(move || {
75 let sender = sender.clone();
76 let entry_count = entry_count.clone();
77 Box::new(move |result| match result {
78 Ok(entry) => {
79 let path = entry.path().to_path_buf();
80 let depth = entry.depth();
81 let is_dir = entry.file_type().is_some_and(|ft| ft.is_dir());
82 let is_symlink = entry.path_is_symlink();
83
84 let symlink_target = if is_symlink {
85 std::fs::read_link(&path).ok()
86 } else {
87 None
88 };
89
90 let mtime = entry.metadata().ok().and_then(|m| m.modified().ok());
91
92 let walk_entry = WalkEntry {
93 path: path.clone(),
94 depth,
95 is_dir,
96 is_symlink,
97 symlink_target,
98 mtime,
99 canonical_path: path.clone(),
100 };
101 sender.send(walk_entry).ok();
102 let count = entry_count.fetch_add(1, Ordering::Relaxed);
103 if count >= MAX_WALK_ENTRIES {
104 return ignore::WalkState::Quit;
105 }
106 if is_dir && max_depth_usize.is_some_and(|max_d| depth >= max_d) {
111 return ignore::WalkState::Skip;
112 }
113 ignore::WalkState::Continue
114 }
115 Err(e) => {
116 tracing::warn!(error = %e, "skipping unreadable entry");
117 ignore::WalkState::Continue
118 }
119 })
120 });
121
122 let mut entries: Vec<WalkEntry> = receiver.try_iter().collect();
123 entries.truncate(MAX_WALK_ENTRIES);
124 if entries.len() >= MAX_WALK_ENTRIES {
125 tracing::warn!(
126 "walk truncated at {} entries (MAX_WALK_ENTRIES={}); results are partial",
127 MAX_WALK_ENTRIES,
128 MAX_WALK_ENTRIES
129 );
130 }
131
132 let dir_count = entries.iter().filter(|e| e.is_dir).count();
133 let file_count = entries.iter().filter(|e| !e.is_dir).count();
134
135 tracing::debug!(
136 entries = entries.len(),
137 dirs = dir_count,
138 files = file_count,
139 duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
140 "walk complete"
141 );
142
143 entries.sort_by(|a, b| a.path.cmp(&b.path));
145 Ok(entries)
146}
147
148fn walk_directory_sync(root: &Path, start: Instant) -> Result<Vec<WalkEntry>, TraversalError> {
150 let mut builder = WalkBuilder::new(root);
151 builder.hidden(true).standard_filters(true);
152 builder.max_depth(Some(1));
153
154 let mut entries = Vec::new();
155 let entry_count = Arc::new(AtomicUsize::new(0));
156
157 for result in builder.build() {
158 match result {
159 Ok(entry) => {
160 let path = entry.path().to_path_buf();
161 let depth = entry.depth();
162 let is_dir = entry.file_type().is_some_and(|ft| ft.is_dir());
163 let is_symlink = entry.path_is_symlink();
164
165 let symlink_target = if is_symlink {
166 std::fs::read_link(&path).ok()
167 } else {
168 None
169 };
170
171 let mtime = entry.metadata().ok().and_then(|m| m.modified().ok());
172
173 let walk_entry = WalkEntry {
174 path: path.clone(),
175 depth,
176 is_dir,
177 is_symlink,
178 symlink_target,
179 mtime,
180 canonical_path: path.clone(),
181 };
182 entries.push(walk_entry);
183 let count = entry_count.fetch_add(1, Ordering::Relaxed);
184 if count >= MAX_WALK_ENTRIES {
185 break;
186 }
187 }
188 Err(e) => {
189 tracing::warn!(error = %e, "skipping unreadable entry");
190 }
191 }
192 }
193
194 entries.truncate(MAX_WALK_ENTRIES);
195 if entries.len() >= MAX_WALK_ENTRIES {
196 tracing::warn!(
197 "walk truncated at {} entries (MAX_WALK_ENTRIES={}); results are partial",
198 MAX_WALK_ENTRIES,
199 MAX_WALK_ENTRIES
200 );
201 }
202
203 let dir_count = entries.iter().filter(|e| e.is_dir).count();
204 let file_count = entries.iter().filter(|e| !e.is_dir).count();
205
206 tracing::debug!(
207 entries = entries.len(),
208 dirs = dir_count,
209 files = file_count,
210 duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
211 "walk complete (sync)"
212 );
213
214 entries.sort_by(|a, b| a.path.cmp(&b.path));
216 Ok(entries)
217}
218
219pub fn changed_files_from_git_ref(
227 dir: &Path,
228 git_ref: &str,
229) -> Result<HashSet<PathBuf>, TraversalError> {
230 if git_ref.is_empty() || git_ref.starts_with('-') {
234 return Err(TraversalError::GitError(
235 "invalid git_ref: must not be empty or start with '-'".to_string(),
236 ));
237 }
238 if git_ref.chars().any(|c| {
239 c.is_whitespace()
240 || matches!(
241 c,
242 '|' | '&'
243 | ';'
244 | '>'
245 | '<'
246 | '`'
247 | '$'
248 | '('
249 | ')'
250 | '{'
251 | '}'
252 | '['
253 | ']'
254 | '*'
255 | '?'
256 | '\\'
257 | '"'
258 | '\''
259 )
260 }) {
261 return Err(TraversalError::GitError(
262 "invalid git_ref: contains forbidden characters".to_string(),
263 ));
264 }
265
266 let root_out = Command::new("git")
269 .arg("-C")
270 .arg(dir)
271 .arg("rev-parse")
272 .arg("--show-toplevel")
273 .output()
274 .map_err(|e| {
275 if e.kind() == std::io::ErrorKind::NotFound {
276 TraversalError::GitError("git not found on PATH".to_string())
277 } else {
278 TraversalError::GitError(format!("failed to run git: {e}"))
279 }
280 })?;
281
282 if !root_out.status.success() {
283 let stderr = String::from_utf8_lossy(&root_out.stderr);
284 return Err(TraversalError::GitError(format!(
285 "not a git repository: {stderr}"
286 )));
287 }
288
289 let root_raw = PathBuf::from(String::from_utf8_lossy(&root_out.stdout).trim().to_string());
290 let root = std::fs::canonicalize(&root_raw).unwrap_or(root_raw);
293
294 let diff_out = Command::new("git")
296 .arg("-C")
297 .arg(dir)
298 .arg("diff")
299 .arg("--name-only")
300 .arg(git_ref)
301 .output()
302 .map_err(|e| TraversalError::GitError(format!("failed to run git diff: {e}")))?;
303
304 if !diff_out.status.success() {
305 let stderr = String::from_utf8_lossy(&diff_out.stderr);
306 return Err(TraversalError::GitError(format!(
307 "git diff failed: {stderr}"
308 )));
309 }
310
311 let changed: HashSet<PathBuf> = String::from_utf8_lossy(&diff_out.stdout)
312 .lines()
313 .filter(|l| !l.is_empty())
314 .map(|l| root.join(l))
315 .collect();
316
317 Ok(changed)
318}
319
320#[must_use]
326pub fn filter_entries_by_git_ref(
327 entries: Vec<WalkEntry>,
328 changed: &HashSet<PathBuf>,
329 root: &Path,
330) -> Vec<WalkEntry> {
331 let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
332
333 let canonical_changed: HashSet<PathBuf> = changed
335 .iter()
336 .map(|p| std::fs::canonicalize(p).unwrap_or_else(|_| p.clone()))
337 .collect();
338
339 let mut ancestor_dirs: HashSet<PathBuf> = HashSet::new();
341 ancestor_dirs.insert(canonical_root.clone());
342 for p in &canonical_changed {
343 let mut cur = p.as_path();
344 while let Some(parent) = cur.parent() {
345 if !ancestor_dirs.insert(parent.to_path_buf()) {
346 break;
348 }
349 if parent == canonical_root {
350 break;
351 }
352 cur = parent;
353 }
354 }
355
356 entries
357 .into_iter()
358 .filter(|e| {
359 let canonical = std::fs::canonicalize(&e.path).unwrap_or_else(|_| e.path.clone());
360 if e.is_dir {
361 ancestor_dirs.contains(&canonical)
362 } else {
363 canonical_changed.contains(&canonical)
364 }
365 })
366 .collect()
367}
368
369#[must_use]
374pub fn subtree_counts_from_entries(root: &Path, entries: &[WalkEntry]) -> Vec<(PathBuf, usize)> {
375 let mut counts: Vec<(PathBuf, usize)> = Vec::new();
376 for entry in entries {
377 if entry.is_dir {
378 continue;
379 }
380 if entry.path.components().any(|c| {
382 let s = c.as_os_str().to_string_lossy();
383 crate::EXCLUDED_DIRS.contains(&s.as_ref())
384 }) {
385 continue;
386 }
387 let Ok(rel) = entry.path.strip_prefix(root) else {
388 continue;
389 };
390 if let Some(first) = rel.components().next() {
391 let key = root.join(first);
392 match counts.last_mut() {
393 Some(last) if last.0 == key => last.1 += 1,
394 _ => counts.push((key, 1)),
395 }
396 }
397 }
398 counts
399}
400
401#[cfg(test)]
402mod tests {
403 use super::*;
404
405 #[test]
406 fn test_git_ref_injection_rejected() {
407 let tmp = tempfile::tempdir().unwrap();
408 let tmp_path = tmp.path();
409
410 let result = changed_files_from_git_ref(tmp_path, "--output=/tmp/evil");
412 assert!(result.is_err(), "should reject git_ref starting with '-'");
413
414 let result = changed_files_from_git_ref(tmp_path, "--");
416 assert!(result.is_err(), "should reject git_ref starting with '-'");
417
418 let result = changed_files_from_git_ref(tmp_path, "main branch");
420 assert!(result.is_err(), "should reject git_ref with spaces");
421
422 let result = changed_files_from_git_ref(tmp_path, "");
424 assert!(result.is_err(), "should reject empty git_ref");
425
426 let result = changed_files_from_git_ref(tmp_path, "HEAD~1");
429 if let Err(TraversalError::GitError(msg)) = result {
431 assert!(
432 !msg.contains("invalid git_ref"),
433 "HEAD~1 should pass validation"
434 );
435 }
436
437 let result = changed_files_from_git_ref(tmp_path, "main");
439 if let Err(TraversalError::GitError(msg)) = result {
440 assert!(
441 !msg.contains("invalid git_ref"),
442 "main should pass validation"
443 );
444 }
445
446 let result = changed_files_from_git_ref(tmp_path, "abc123");
448 if let Err(TraversalError::GitError(msg)) = result {
449 assert!(
450 !msg.contains("invalid git_ref"),
451 "abc123 should pass validation"
452 );
453 }
454 }
455
456 #[test]
457 fn test_walk_directory_depth1_sync_matches_depth2_filtered() {
458 let tmp = tempfile::tempdir().unwrap();
460 let root = tmp.path();
461
462 std::fs::write(root.join("file1.txt"), b"content1").unwrap();
464 std::fs::create_dir(root.join("dir1")).unwrap();
465 std::fs::write(root.join("dir1/file2.txt"), b"content2").unwrap();
466 std::fs::create_dir(root.join("dir1/dir2")).unwrap();
467 std::fs::write(root.join("dir1/dir2/file3.txt"), b"content3").unwrap();
468
469 let entries_depth1 = walk_directory(root, Some(1)).unwrap();
471
472 let entries_depth2 = walk_directory(root, Some(2)).unwrap();
474
475 let entries_depth2_filtered: Vec<_> =
477 entries_depth2.iter().filter(|e| e.depth <= 1).collect();
478
479 assert_eq!(
481 entries_depth1.len(),
482 entries_depth2_filtered.len(),
483 "depth=1 sync walker should match depth=2 filtered to depth<=1"
484 );
485
486 for entry1 in &entries_depth1 {
488 let found = entries_depth2_filtered.iter().any(|e2| {
489 e2.path == entry1.path && e2.depth == entry1.depth && e2.is_dir == entry1.is_dir
490 });
491 assert!(
492 found,
493 "entry {:?} from depth=1 not found in depth=2 filtered",
494 entry1.path
495 );
496 }
497 }
498}