// code_analyze_core/traversal.rs
// SPDX-FileCopyrightText: 2026 code-analyze-mcp contributors
// SPDX-License-Identifier: Apache-2.0
//! Directory traversal with .gitignore support.
//!
//! Provides recursive directory walking with automatic filtering based on `.gitignore` and `.ignore` files.
//! Uses the `ignore` crate for cross-platform, efficient file system traversal.
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::{Arc, Mutex};
use std::time::Instant;

use ignore::WalkBuilder;
use thiserror::Error;
use tracing::instrument;
/// A single file-system entry produced by [`walk_directory`].
#[derive(Debug, Clone)]
pub struct WalkEntry {
    /// Path of the entry as reported by the walker.
    pub path: PathBuf,
    /// Depth in the directory tree (0 = root).
    pub depth: usize,
    /// Whether the entry is a directory.
    pub is_dir: bool,
    /// Whether the entry itself is a symbolic link.
    pub is_symlink: bool,
    /// Resolved link target when `is_symlink` is true and the link is readable.
    pub symlink_target: Option<PathBuf>,
}
26
27#[derive(Debug, Error)]
28#[non_exhaustive]
29pub enum TraversalError {
30    #[error("IO error: {0}")]
31    Io(#[from] std::io::Error),
32    #[error("internal concurrency error: {0}")]
33    Internal(String),
34    #[error("git error: {0}")]
35    GitError(String),
36}
37
38/// Walk a directory with support for `.gitignore` and `.ignore`.
39/// `max_depth=0` maps to unlimited recursion (None), positive values limit depth.
40/// The returned entries are sorted lexicographically by path.
41#[instrument(skip_all, fields(path = %root.display(), max_depth))]
42pub fn walk_directory(
43    root: &Path,
44    max_depth: Option<u32>,
45) -> Result<Vec<WalkEntry>, TraversalError> {
46    let start = Instant::now();
47    let mut builder = WalkBuilder::new(root);
48    builder.hidden(true).standard_filters(true);
49
50    // Map max_depth: 0 = unlimited (None), positive = Some(n)
51    if let Some(depth) = max_depth
52        && depth > 0
53    {
54        builder.max_depth(Some(depth as usize));
55    }
56
57    let entries = Arc::new(Mutex::new(Vec::new()));
58    let entries_clone = Arc::clone(&entries);
59
60    builder.build_parallel().run(move || {
61        let entries = Arc::clone(&entries_clone);
62        Box::new(move |result| match result {
63            Ok(entry) => {
64                let path = entry.path().to_path_buf();
65                let depth = entry.depth();
66                let is_dir = entry.file_type().is_some_and(|ft| ft.is_dir());
67                let is_symlink = entry.path_is_symlink();
68
69                let symlink_target = if is_symlink {
70                    std::fs::read_link(&path).ok()
71                } else {
72                    None
73                };
74
75                let walk_entry = WalkEntry {
76                    path,
77                    depth,
78                    is_dir,
79                    is_symlink,
80                    symlink_target,
81                };
82                let Ok(mut guard) = entries.lock() else {
83                    tracing::debug!("mutex poisoned in parallel walker, skipping entry");
84                    return ignore::WalkState::Skip;
85                };
86                guard.push(walk_entry);
87                ignore::WalkState::Continue
88            }
89            Err(e) => {
90                tracing::warn!(error = %e, "skipping unreadable entry");
91                ignore::WalkState::Continue
92            }
93        })
94    });
95
96    let mut entries = Arc::try_unwrap(entries)
97        .map_err(|_| {
98            TraversalError::Internal("arc unwrap failed: strong references still live".to_string())
99        })?
100        .into_inner()
101        .map_err(|_| TraversalError::Internal("mutex poisoned".to_string()))?;
102
103    let dir_count = entries.iter().filter(|e| e.is_dir).count();
104    let file_count = entries.iter().filter(|e| !e.is_dir).count();
105
106    tracing::debug!(
107        entries = entries.len(),
108        dirs = dir_count,
109        files = file_count,
110        duration_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX),
111        "walk complete"
112    );
113
114    // Restore sort contract: walk_parallel does not guarantee order.
115    entries.sort_by(|a, b| a.path.cmp(&b.path));
116    Ok(entries)
117}
118
119/// Return the set of absolute paths changed relative to `git_ref` in the repository
120/// containing `dir`. Invokes git without shell interpolation.
121///
122/// # Errors
123/// Returns [`TraversalError::GitError`] when:
124/// - `git` is not on PATH (distinct message)
125/// - `dir` is not inside a git repository
126pub fn changed_files_from_git_ref(
127    dir: &Path,
128    git_ref: &str,
129) -> Result<HashSet<PathBuf>, TraversalError> {
130    // Resolve the git repository root so that relative paths from `git diff` can
131    // be anchored to an absolute base.
132    let root_out = Command::new("git")
133        .arg("-C")
134        .arg(dir)
135        .arg("rev-parse")
136        .arg("--show-toplevel")
137        .output()
138        .map_err(|e| {
139            if e.kind() == std::io::ErrorKind::NotFound {
140                TraversalError::GitError("git not found on PATH".to_string())
141            } else {
142                TraversalError::GitError(format!("failed to run git: {e}"))
143            }
144        })?;
145
146    if !root_out.status.success() {
147        let stderr = String::from_utf8_lossy(&root_out.stderr);
148        return Err(TraversalError::GitError(format!(
149            "not a git repository: {stderr}"
150        )));
151    }
152
153    let root_raw = PathBuf::from(String::from_utf8_lossy(&root_out.stdout).trim().to_string());
154    // Canonicalize to resolve symlinks (e.g. macOS /tmp -> /private/tmp) so that
155    // paths from git and paths from walk_directory are comparable.
156    let root = std::fs::canonicalize(&root_raw).unwrap_or(root_raw);
157
158    // Run `git diff --name-only <git_ref>` to get changed files relative to root.
159    let diff_out = Command::new("git")
160        .arg("-C")
161        .arg(dir)
162        .arg("diff")
163        .arg("--name-only")
164        .arg(git_ref)
165        .output()
166        .map_err(|e| TraversalError::GitError(format!("failed to run git diff: {e}")))?;
167
168    if !diff_out.status.success() {
169        let stderr = String::from_utf8_lossy(&diff_out.stderr);
170        return Err(TraversalError::GitError(format!(
171            "git diff failed: {stderr}"
172        )));
173    }
174
175    let changed: HashSet<PathBuf> = String::from_utf8_lossy(&diff_out.stdout)
176        .lines()
177        .filter(|l| !l.is_empty())
178        .map(|l| root.join(l))
179        .collect();
180
181    Ok(changed)
182}
183
184/// Filter walk entries to only those that are either changed files or ancestor directories
185/// of changed files. This preserves the tree structure while limiting analysis scope.
186///
187/// Uses O(|changed| * depth + |entries|) time by precomputing a HashSet of ancestor
188/// directories for each changed file (up to and including `root`).
189#[must_use]
190pub fn filter_entries_by_git_ref(
191    entries: Vec<WalkEntry>,
192    changed: &HashSet<PathBuf>,
193    root: &Path,
194) -> Vec<WalkEntry> {
195    let canonical_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
196
197    // Precompute canonical changed set so comparison works across symlink differences.
198    let canonical_changed: HashSet<PathBuf> = changed
199        .iter()
200        .map(|p| std::fs::canonicalize(p).unwrap_or_else(|_| p.clone()))
201        .collect();
202
203    // Build HashSet of all ancestor directories of changed files (bounded by canonical_root).
204    let mut ancestor_dirs: HashSet<PathBuf> = HashSet::new();
205    ancestor_dirs.insert(canonical_root.clone());
206    for p in &canonical_changed {
207        let mut cur = p.as_path();
208        while let Some(parent) = cur.parent() {
209            if !ancestor_dirs.insert(parent.to_path_buf()) {
210                // Already inserted this ancestor and all its ancestors; stop early.
211                break;
212            }
213            if parent == canonical_root {
214                break;
215            }
216            cur = parent;
217        }
218    }
219
220    entries
221        .into_iter()
222        .filter(|e| {
223            let canonical_path = std::fs::canonicalize(&e.path).unwrap_or_else(|_| e.path.clone());
224            if e.is_dir {
225                ancestor_dirs.contains(&canonical_path)
226            } else {
227                canonical_changed.contains(&canonical_path)
228            }
229        })
230        .collect()
231}
232
233/// Compute files-per-depth-1-subdirectory counts from an already-collected entry list.
234/// Returns a Vec of (depth-1 path, file count) sorted by path.
235/// Only counts file entries (not directories); skips entries containing `EXCLUDED_DIRS` components.
236/// Output Vec is sorted by construction (entries are pre-sorted by path).
237#[must_use]
238pub fn subtree_counts_from_entries(root: &Path, entries: &[WalkEntry]) -> Vec<(PathBuf, usize)> {
239    let mut counts: Vec<(PathBuf, usize)> = Vec::new();
240    for entry in entries {
241        if entry.is_dir {
242            continue;
243        }
244        // Skip entries whose path components contain EXCLUDED_DIRS
245        if entry.path.components().any(|c| {
246            let s = c.as_os_str().to_string_lossy();
247            crate::EXCLUDED_DIRS.contains(&s.as_ref())
248        }) {
249            continue;
250        }
251        let Ok(rel) = entry.path.strip_prefix(root) else {
252            continue;
253        };
254        if let Some(first) = rel.components().next() {
255            let key = root.join(first);
256            match counts.last_mut() {
257                Some(last) if last.0 == key => last.1 += 1,
258                _ => counts.push((key, 1)),
259            }
260        }
261    }
262    counts
263}