Skip to main content

aptu_coder_core/
traversal.rs

1// SPDX-FileCopyrightText: 2026 aptu-coder contributors
2// SPDX-License-Identifier: Apache-2.0
3//! Directory traversal with .gitignore support.
4//!
5//! Provides recursive directory walking with automatic filtering based on `.gitignore` and `.ignore` files.
6//! Uses the `ignore` crate for cross-platform, efficient file system traversal.
7
8use ignore::WalkBuilder;
9use std::collections::HashSet;
10use std::path::{Path, PathBuf};
11use std::process::Command;
12use std::sync::Arc;
13use std::sync::atomic::{AtomicUsize, Ordering};
14use std::time::{Instant, SystemTime};
15use thiserror::Error;
16use tracing::instrument;
17
18pub const MAX_WALK_ENTRIES: usize = 50_000;
19
20#[derive(Debug, Clone)]
21pub struct WalkEntry {
22    pub path: PathBuf,
23    /// Depth in the directory tree (0 = root).
24    pub depth: usize,
25    pub is_dir: bool,
26    pub is_symlink: bool,
27    pub symlink_target: Option<PathBuf>,
28    pub mtime: Option<SystemTime>,
29    pub canonical_path: PathBuf,
30}
31
32#[derive(Debug, Error)]
33#[non_exhaustive]
34pub enum TraversalError {
35    #[error("IO error: {0}")]
36    Io(#[from] std::io::Error),
37    #[error("internal concurrency error: {0}")]
38    Internal(String),
39    #[error("git error: {0}")]
40    GitError(String),
41}
42
43/// Walk a directory with support for `.gitignore` and `.ignore`.
44/// `max_depth=0` maps to unlimited recursion (None), positive values limit depth.
45/// The returned entries are sorted lexicographically by path.
46#[instrument(skip_all, fields(path = %root.display(), max_depth))]
47pub fn walk_directory(
48    root: &Path,
49    max_depth: Option<u32>,
50) -> Result<Vec<WalkEntry>, TraversalError> {
51    let start = Instant::now();
52
53    // Optimization: use synchronous walker for max_depth == Some(1) to avoid thread spawn overhead.
54    if max_depth == Some(1) {
55        return walk_directory_sync(root, start);
56    }
57
58    let mut builder = WalkBuilder::new(root);
59    builder.hidden(true).standard_filters(true);
60
61    // Map max_depth: 0 = unlimited (None), positive = Some(n)
62    let max_depth_usize = if let Some(depth) = max_depth
63        && depth > 0
64    {
65        builder.max_depth(Some(depth as usize));
66        Some(depth as usize)
67    } else {
68        None
69    };
70
71    let (sender, receiver) = std::sync::mpsc::channel::<WalkEntry>();
72    let entry_count = Arc::new(AtomicUsize::new(0));
73
74    builder.build_parallel().run(move || {
75        let sender = sender.clone();
76        let entry_count = entry_count.clone();
77        Box::new(move |result| match result {
78            Ok(entry) => {
79                let path = entry.path().to_path_buf();
80                let depth = entry.depth();
81                let is_dir = entry.file_type().is_some_and(|ft| ft.is_dir());
82                let is_symlink = entry.path_is_symlink();
83
84                let symlink_target = if is_symlink {
85                    std::fs::read_link(&path).ok()
86                } else {
87                    None
88                };
89
90                let mtime = entry.metadata().ok().and_then(|m| m.modified().ok());
91
92                let walk_entry = WalkEntry {
93                    path: path.clone(),
94                    depth,
95                    is_dir,
96                    is_symlink,
97                    symlink_target,
98                    mtime,
99                    canonical_path: path.clone(),
100                };
101                sender.send(walk_entry).ok();
102                let count = entry_count.fetch_add(1, Ordering::Relaxed);
103                if count >= MAX_WALK_ENTRIES {
104                    return ignore::WalkState::Quit;
105                }
106                // Skip recursing into directories at the depth boundary. The entry
107                // itself has already been sent above; this only prevents descent into
108                // its children, avoiding gitignore matching on large trees (target/,
109                // node_modules/) whose contents would be discarded by the depth limit.
110                if is_dir && max_depth_usize.is_some_and(|max_d| depth >= max_d) {
111                    return ignore::WalkState::Skip;
112                }
113                ignore::WalkState::Continue
114            }
115            Err(e) => {
116                tracing::warn!(error = %e, "skipping unreadable entry");
117                ignore::WalkState::Continue
118            }
119        })
120    });
121
122    let mut entries: Vec<WalkEntry> = receiver.try_iter().collect();
123    entries.truncate(MAX_WALK_ENTRIES);
124    if entries.len() >= MAX_WALK_ENTRIES {
125        tracing::warn!(
126            "walk truncated at {} entries (MAX_WALK_ENTRIES={}); results are partial",
127            MAX_WALK_ENTRIES,
128            MAX_WALK_ENTRIES
129        );
130    }
131
132    let dir_count = entries.iter().filter(|e| e.is_dir).count();
133    let file_count = entries.iter().filter(|e| !e.is_dir).count();
134
135    tracing::debug!(
136        entries = entries.len(),
137        dirs = dir_count,
138        files = file_count,
139        duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
140        "walk complete"
141    );
142
143    // Restore sort contract: walk_parallel does not guarantee order.
144    entries.sort_by(|a, b| a.path.cmp(&b.path));
145    Ok(entries)
146}
147
148/// Synchronous walker for max_depth == Some(1) to eliminate thread spawn overhead.
149fn walk_directory_sync(root: &Path, start: Instant) -> Result<Vec<WalkEntry>, TraversalError> {
150    let mut builder = WalkBuilder::new(root);
151    builder.hidden(true).standard_filters(true);
152    builder.max_depth(Some(1));
153
154    let mut entries = Vec::new();
155    let entry_count = Arc::new(AtomicUsize::new(0));
156
157    for result in builder.build() {
158        match result {
159            Ok(entry) => {
160                let path = entry.path().to_path_buf();
161                let depth = entry.depth();
162                let is_dir = entry.file_type().is_some_and(|ft| ft.is_dir());
163                let is_symlink = entry.path_is_symlink();
164
165                let symlink_target = if is_symlink {
166                    std::fs::read_link(&path).ok()
167                } else {
168                    None
169                };
170
171                let mtime = entry.metadata().ok().and_then(|m| m.modified().ok());
172
173                let walk_entry = WalkEntry {
174                    path: path.clone(),
175                    depth,
176                    is_dir,
177                    is_symlink,
178                    symlink_target,
179                    mtime,
180                    canonical_path: path.clone(),
181                };
182                entries.push(walk_entry);
183                let count = entry_count.fetch_add(1, Ordering::Relaxed);
184                if count >= MAX_WALK_ENTRIES {
185                    break;
186                }
187            }
188            Err(e) => {
189                tracing::warn!(error = %e, "skipping unreadable entry");
190            }
191        }
192    }
193
194    entries.truncate(MAX_WALK_ENTRIES);
195    if entries.len() >= MAX_WALK_ENTRIES {
196        tracing::warn!(
197            "walk truncated at {} entries (MAX_WALK_ENTRIES={}); results are partial",
198            MAX_WALK_ENTRIES,
199            MAX_WALK_ENTRIES
200        );
201    }
202
203    let dir_count = entries.iter().filter(|e| e.is_dir).count();
204    let file_count = entries.iter().filter(|e| !e.is_dir).count();
205
206    tracing::debug!(
207        entries = entries.len(),
208        dirs = dir_count,
209        files = file_count,
210        duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
211        "walk complete (sync)"
212    );
213
214    // Sort entries lexicographically.
215    entries.sort_by(|a, b| a.path.cmp(&b.path));
216    Ok(entries)
217}
218
219/// Return the set of absolute paths changed relative to `git_ref` in the repository
220/// containing `dir`. Invokes git without shell interpolation.
221///
222/// # Errors
223/// Returns [`TraversalError::GitError`] when:
224/// - `git` is not on PATH (distinct message)
225/// - `dir` is not inside a git repository
226pub fn changed_files_from_git_ref(
227    dir: &Path,
228    git_ref: &str,
229) -> Result<HashSet<PathBuf>, TraversalError> {
230    // Validate git_ref to prevent argument injection attacks.
231    // Reject refs that start with '-' (would be interpreted as flags).
232    // Also reject refs containing whitespace or shell metacharacters.
233    if git_ref.is_empty() || git_ref.starts_with('-') {
234        return Err(TraversalError::GitError(
235            "invalid git_ref: must not be empty or start with '-'".to_string(),
236        ));
237    }
238    if git_ref.chars().any(|c| {
239        c.is_whitespace()
240            || matches!(
241                c,
242                '|' | '&'
243                    | ';'
244                    | '>'
245                    | '<'
246                    | '`'
247                    | '$'
248                    | '('
249                    | ')'
250                    | '{'
251                    | '}'
252                    | '['
253                    | ']'
254                    | '*'
255                    | '?'
256                    | '\\'
257                    | '"'
258                    | '\''
259            )
260    }) {
261        return Err(TraversalError::GitError(
262            "invalid git_ref: contains forbidden characters".to_string(),
263        ));
264    }
265
266    // Resolve the git repository root so that relative paths from `git diff` can
267    // be anchored to an absolute base.
268    let root_out = Command::new("git")
269        .arg("-C")
270        .arg(dir)
271        .arg("rev-parse")
272        .arg("--show-toplevel")
273        .output()
274        .map_err(|e| {
275            if e.kind() == std::io::ErrorKind::NotFound {
276                TraversalError::GitError("git not found on PATH".to_string())
277            } else {
278                TraversalError::GitError(format!("failed to run git: {e}"))
279            }
280        })?;
281
282    if !root_out.status.success() {
283        let stderr = String::from_utf8_lossy(&root_out.stderr);
284        return Err(TraversalError::GitError(format!(
285            "not a git repository: {stderr}"
286        )));
287    }
288
289    let root_raw = PathBuf::from(String::from_utf8_lossy(&root_out.stdout).trim().to_string());
290    // Canonicalize to resolve symlinks (e.g. macOS /tmp -> /private/tmp) so that
291    // paths from git and paths from walk_directory are comparable.
292    let root = std::fs::canonicalize(&root_raw).unwrap_or(root_raw);
293
294    // Run `git diff --name-only <git_ref>` to get changed files relative to root.
295    let diff_out = Command::new("git")
296        .arg("-C")
297        .arg(dir)
298        .arg("diff")
299        .arg("--name-only")
300        .arg(git_ref)
301        .output()
302        .map_err(|e| TraversalError::GitError(format!("failed to run git diff: {e}")))?;
303
304    if !diff_out.status.success() {
305        let stderr = String::from_utf8_lossy(&diff_out.stderr);
306        return Err(TraversalError::GitError(format!(
307            "git diff failed: {stderr}"
308        )));
309    }
310
311    let changed: HashSet<PathBuf> = String::from_utf8_lossy(&diff_out.stdout)
312        .lines()
313        .filter(|l| !l.is_empty())
314        .map(|l| root.join(l))
315        .collect();
316
317    Ok(changed)
318}
319
320/// Filter walk entries to only those that are either changed files or ancestor directories
321/// of changed files. This preserves the tree structure while limiting analysis scope.
322///
323/// Uses O(|changed| * depth + |entries|) time by precomputing a HashSet of ancestor
324/// directories for each changed file (up to and including `root`).
325#[must_use]
326pub fn filter_entries_by_git_ref(
327    entries: Vec<WalkEntry>,
328    changed: &HashSet<PathBuf>,
329    root: &Path,
330) -> Vec<WalkEntry> {
331    let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
332
333    // Precompute canonical changed set so comparison works across symlink differences.
334    let canonical_changed: HashSet<PathBuf> = changed
335        .iter()
336        .map(|p| std::fs::canonicalize(p).unwrap_or_else(|_| p.clone()))
337        .collect();
338
339    // Build HashSet of all ancestor directories of changed files (bounded by canonical_root).
340    let mut ancestor_dirs: HashSet<PathBuf> = HashSet::new();
341    ancestor_dirs.insert(canonical_root.clone());
342    for p in &canonical_changed {
343        let mut cur = p.as_path();
344        while let Some(parent) = cur.parent() {
345            if !ancestor_dirs.insert(parent.to_path_buf()) {
346                // Already inserted this ancestor and all its ancestors; stop early.
347                break;
348            }
349            if parent == canonical_root {
350                break;
351            }
352            cur = parent;
353        }
354    }
355
356    entries
357        .into_iter()
358        .filter(|e| {
359            let canonical = std::fs::canonicalize(&e.path).unwrap_or_else(|_| e.path.clone());
360            if e.is_dir {
361                ancestor_dirs.contains(&canonical)
362            } else {
363                canonical_changed.contains(&canonical)
364            }
365        })
366        .collect()
367}
368
369/// Compute files-per-depth-1-subdirectory counts from an already-collected entry list.
370/// Returns a Vec of (depth-1 path, file count) sorted by path.
371/// Only counts file entries (not directories); skips entries containing `EXCLUDED_DIRS` components.
372/// Output Vec is sorted by construction (entries are pre-sorted by path).
373#[must_use]
374pub fn subtree_counts_from_entries(root: &Path, entries: &[WalkEntry]) -> Vec<(PathBuf, usize)> {
375    let mut counts: Vec<(PathBuf, usize)> = Vec::new();
376    for entry in entries {
377        if entry.is_dir {
378            continue;
379        }
380        // Skip entries whose path components contain EXCLUDED_DIRS
381        if entry.path.components().any(|c| {
382            let s = c.as_os_str().to_string_lossy();
383            crate::EXCLUDED_DIRS.contains(&s.as_ref())
384        }) {
385            continue;
386        }
387        let Ok(rel) = entry.path.strip_prefix(root) else {
388            continue;
389        };
390        if let Some(first) = rel.components().next() {
391            let key = root.join(first);
392            match counts.last_mut() {
393                Some(last) if last.0 == key => last.1 += 1,
394                _ => counts.push((key, 1)),
395            }
396        }
397    }
398    counts
399}
400
401#[cfg(test)]
402mod tests {
403    use super::*;
404
405    #[test]
406    fn test_git_ref_injection_rejected() {
407        let tmp = tempfile::tempdir().unwrap();
408        let tmp_path = tmp.path();
409
410        // --output=/tmp/evil should be rejected (starts with -)
411        let result = changed_files_from_git_ref(tmp_path, "--output=/tmp/evil");
412        assert!(result.is_err(), "should reject git_ref starting with '-'");
413
414        // double-dash alone should be rejected (starts with -)
415        let result = changed_files_from_git_ref(tmp_path, "--");
416        assert!(result.is_err(), "should reject git_ref starting with '-'");
417
418        // git_ref with spaces should be rejected
419        let result = changed_files_from_git_ref(tmp_path, "main branch");
420        assert!(result.is_err(), "should reject git_ref with spaces");
421
422        // empty git_ref should be rejected
423        let result = changed_files_from_git_ref(tmp_path, "");
424        assert!(result.is_err(), "should reject empty git_ref");
425
426        // valid refs should pass validation (may fail on git not found, but not on validation)
427        // HEAD~1 is valid
428        let result = changed_files_from_git_ref(tmp_path, "HEAD~1");
429        // We expect a git error (not a git repo), not a validation error
430        if let Err(TraversalError::GitError(msg)) = result {
431            assert!(
432                !msg.contains("invalid git_ref"),
433                "HEAD~1 should pass validation"
434            );
435        }
436
437        // main is valid
438        let result = changed_files_from_git_ref(tmp_path, "main");
439        if let Err(TraversalError::GitError(msg)) = result {
440            assert!(
441                !msg.contains("invalid git_ref"),
442                "main should pass validation"
443            );
444        }
445
446        // abc123 is valid
447        let result = changed_files_from_git_ref(tmp_path, "abc123");
448        if let Err(TraversalError::GitError(msg)) = result {
449            assert!(
450                !msg.contains("invalid git_ref"),
451                "abc123 should pass validation"
452            );
453        }
454    }
455
456    #[test]
457    fn test_walk_directory_depth1_sync_matches_depth2_filtered() {
458        // Create a small test directory structure
459        let tmp = tempfile::tempdir().unwrap();
460        let root = tmp.path();
461
462        // Create: root/file1.txt, root/dir1/, root/dir1/file2.txt, root/dir1/dir2/file3.txt
463        std::fs::write(root.join("file1.txt"), b"content1").unwrap();
464        std::fs::create_dir(root.join("dir1")).unwrap();
465        std::fs::write(root.join("dir1/file2.txt"), b"content2").unwrap();
466        std::fs::create_dir(root.join("dir1/dir2")).unwrap();
467        std::fs::write(root.join("dir1/dir2/file3.txt"), b"content3").unwrap();
468
469        // Walk with max_depth=1 (sync walker)
470        let entries_depth1 = walk_directory(root, Some(1)).unwrap();
471
472        // Walk with max_depth=2 (parallel walker)
473        let entries_depth2 = walk_directory(root, Some(2)).unwrap();
474
475        // Filter depth2 entries to only those at depth <= 1
476        let entries_depth2_filtered: Vec<_> =
477            entries_depth2.iter().filter(|e| e.depth <= 1).collect();
478
479        // Both should have the same number of entries at depth <= 1
480        assert_eq!(
481            entries_depth1.len(),
482            entries_depth2_filtered.len(),
483            "depth=1 sync walker should match depth=2 filtered to depth<=1"
484        );
485
486        // Verify that all entries in depth1 are present in depth2_filtered
487        for entry1 in &entries_depth1 {
488            let found = entries_depth2_filtered.iter().any(|e2| {
489                e2.path == entry1.path && e2.depth == entry1.depth && e2.is_dir == entry1.is_dir
490            });
491            assert!(
492                found,
493                "entry {:?} from depth=1 not found in depth=2 filtered",
494                entry1.path
495            );
496        }
497    }
498}