Skip to main content

cuenv_vcs/
walker.rs

1//! [`WalkHasher`]: a VCS-free [`VcsHasher`] implementation.
2//!
3//! `WalkHasher` resolves glob/directory/file patterns against a workspace
4//! root and computes a streaming SHA-256 over every matched file. It is the
5//! default fallback when a VCS-specific implementation isn't available.
6
7use crate::error::{Error, Result};
8use crate::hasher::{HashedInput, VcsHasher};
9use async_trait::async_trait;
10use globset::{Glob, GlobSet, GlobSetBuilder};
11use sha2::{Digest, Sha256};
12use std::collections::BTreeSet;
13use std::fs;
14use std::io::Read;
15use std::path::{Component, Path, PathBuf};
16use tracing::{debug, trace};
17use walkdir::WalkDir;
18
/// Workspace-rooted walker that streams SHA-256 over every matched file.
///
/// The only state is the workspace root; patterns are supplied per call and
/// are resolved by joining them onto this root.
#[derive(Debug, Clone)]
pub struct WalkHasher {
    /// Directory that input patterns are joined onto and that reported
    /// relative paths are computed against.
    workspace_root: PathBuf,
}
24
25impl WalkHasher {
26    /// Build a walker rooted at `workspace_root`.
27    #[must_use]
28    pub fn new(workspace_root: impl AsRef<Path>) -> Self {
29        Self {
30            workspace_root: workspace_root.as_ref().to_path_buf(),
31        }
32    }
33
34    /// Workspace root this walker is rooted at.
35    #[must_use]
36    pub fn workspace_root(&self) -> &Path {
37        &self.workspace_root
38    }
39
40    fn hash_file(path: &Path) -> Result<(String, u64)> {
41        let mut file = fs::File::open(path).map_err(|e| Error::io(e, path, "open"))?;
42        let mut hasher = Sha256::new();
43        let mut buf: Box<[u8]> = vec![0u8; 64 * 1024].into_boxed_slice();
44        let mut size: u64 = 0;
45        loop {
46            let n = file
47                .read(&mut buf)
48                .map_err(|e| Error::io(e, path, "read"))?;
49            if n == 0 {
50                break;
51            }
52            hasher.update(&buf[..n]);
53            size += n as u64;
54        }
55        Ok((hex::encode(hasher.finalize()), size))
56    }
57
58    fn resolve_sync(&self, patterns: &[String]) -> Result<Vec<HashedInput>> {
59        let mut explicit_files: Vec<String> = Vec::new();
60        let mut dirs_to_walk: Vec<(String, GlobSet)> = Vec::new();
61
62        for pat in patterns {
63            let trimmed = pat.trim();
64            if trimmed.is_empty() {
65                continue;
66            }
67            let looks_like_glob = trimmed.contains('*')
68                || trimmed.contains('{')
69                || trimmed.contains('?')
70                || trimmed.contains('[');
71            let abs = self.workspace_root.join(trimmed);
72
73            if looks_like_glob {
74                let base_dir = extract_glob_base(trimmed);
75                let glob = Glob::new(trimmed).map_err(|e| {
76                    Error::pattern(format!("invalid glob pattern `{trimmed}`: {e}"))
77                })?;
78                let set = GlobSetBuilder::new()
79                    .add(glob)
80                    .build()
81                    .map_err(|e| Error::pattern(format!("failed to build globset: {e}")))?;
82                dirs_to_walk.push((base_dir, set));
83            } else if abs.is_dir() {
84                let glob_pat = format!("{}/**/*", trimmed.trim_end_matches('/'));
85                let glob = Glob::new(&glob_pat).map_err(|e| {
86                    Error::pattern(format!("invalid glob pattern `{glob_pat}`: {e}"))
87                })?;
88                let set = GlobSetBuilder::new()
89                    .add(glob)
90                    .build()
91                    .map_err(|e| Error::pattern(format!("failed to build globset: {e}")))?;
92                dirs_to_walk.push((trimmed.to_string(), set));
93            } else {
94                explicit_files.push(trimmed.to_string());
95            }
96        }
97
98        let mut seen: BTreeSet<PathBuf> = BTreeSet::new();
99        let mut results: Vec<HashedInput> = Vec::new();
100
101        for raw in &explicit_files {
102            let abs = self.workspace_root.join(raw);
103            if abs.is_file() {
104                let rel = normalize_rel_path(Path::new(raw));
105                if seen.insert(rel.clone()) {
106                    let (hash, size) = Self::hash_file(&abs)?;
107                    results.push(HashedInput {
108                        relative_path: rel,
109                        absolute_path: canonical_or_abs(&abs),
110                        sha256: hash,
111                        size,
112                        is_executable: is_executable(&abs)?,
113                    });
114                }
115            } else {
116                return Err(Error::io(
117                    std::io::Error::new(
118                        std::io::ErrorKind::NotFound,
119                        format!("explicit input file '{raw}' not found"),
120                    ),
121                    &abs,
122                    "open",
123                ));
124            }
125        }
126
127        for (base_dir, globset) in &dirs_to_walk {
128            let walk_root = self.workspace_root.join(base_dir);
129            if !walk_root.exists() {
130                debug!(dir = %base_dir, "Directory does not exist, skipping");
131                continue;
132            }
133            for entry in WalkDir::new(&walk_root).follow_links(true) {
134                let entry = entry.map_err(|e| {
135                    let path = e.path().unwrap_or(walk_root.as_path());
136                    Error::io(
137                        std::io::Error::new(
138                            e.io_error()
139                                .map_or(std::io::ErrorKind::Other, std::io::Error::kind),
140                            format!("walkdir error under {}: {e}", walk_root.display()),
141                        ),
142                        path,
143                        "walkdir",
144                    )
145                })?;
146                let path = entry.path();
147                if path.is_dir() {
148                    continue;
149                }
150                let Ok(rel) = path.strip_prefix(&self.workspace_root) else {
151                    continue;
152                };
153                let rel_norm = normalize_rel_path(rel);
154                if globset.is_match(rel_norm.as_path()) && seen.insert(rel_norm.clone()) {
155                    let (hash, size) = Self::hash_file(path)?;
156                    results.push(HashedInput {
157                        relative_path: rel_norm,
158                        absolute_path: canonical_or_abs(path),
159                        sha256: hash,
160                        size,
161                        is_executable: is_executable(path)?,
162                    });
163                }
164            }
165        }
166
167        // Deterministic ordering — `seen` is a BTreeSet but `results` is a Vec,
168        // so we sort explicitly by relative path.
169        results.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));
170        trace!(count = results.len(), "WalkHasher resolved inputs");
171        Ok(results)
172    }
173}
174
175#[async_trait]
176impl VcsHasher for WalkHasher {
177    async fn resolve_and_hash(&self, patterns: &[String]) -> Result<Vec<HashedInput>> {
178        // The walker is blocking I/O; keep it on the current task since
179        // callers typically already wrap us in a spawn_blocking or a parallel
180        // task executor.
181        self.resolve_sync(patterns)
182    }
183
184    fn name(&self) -> &'static str {
185        "walk"
186    }
187}
188
/// Reduce a relative path to a clean workspace-relative identifier.
///
/// Normal components are kept in order, `..` removes the most recently kept
/// component (and is a no-op when nothing has been kept yet), and every other
/// component kind (`.`, root, prefix) is discarded.
fn normalize_rel_path(p: &Path) -> PathBuf {
    p.components().fold(PathBuf::new(), |mut cleaned, part| {
        match part {
            Component::Normal(name) => cleaned.push(name),
            Component::ParentDir => {
                cleaned.pop();
            }
            Component::CurDir | Component::RootDir | Component::Prefix(_) => {}
        }
        cleaned
    })
}
204
/// Best-effort absolute form of `p`.
///
/// Prefers the canonicalized path. When canonicalization fails, an already
/// absolute `p` is returned unchanged and a relative `p` is joined onto the
/// current working directory (or onto `.` if that cannot be determined).
fn canonical_or_abs(p: &Path) -> PathBuf {
    match fs::canonicalize(p) {
        Ok(resolved) => resolved,
        Err(_) if p.is_absolute() => p.to_path_buf(),
        Err(_) => {
            let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
            cwd.join(p)
        }
    }
}
217
218#[cfg(unix)]
219fn is_executable(path: &Path) -> Result<bool> {
220    use std::os::unix::fs::PermissionsExt;
221
222    let metadata = fs::metadata(path).map_err(|e| Error::io(e, path, "metadata"))?;
223    Ok(metadata.permissions().mode() & 0o111 != 0)
224}
225
226#[cfg(not(unix))]
227fn is_executable(_path: &Path) -> Result<bool> {
228    Ok(false)
229}
230
/// Return the literal directory prefix of a glob pattern, i.e. the
/// `/`-separated segments that precede the first segment containing a glob
/// metacharacter (`*`, `{`, `?`, `[`). Empty segments are dropped.
///
/// * `src/**/*.ts` → `src`
/// * `**/*.ts` → `` (workspace root)
/// * `foo/bar/*.rs` → `foo/bar`
fn extract_glob_base(pattern: &str) -> String {
    const GLOB_META: &[char] = &['*', '{', '?', '['];

    let literal_segments: Vec<&str> = pattern
        .split('/')
        .take_while(|segment| !segment.contains(GLOB_META))
        .filter(|segment| !segment.is_empty())
        .collect();
    literal_segments.join("/")
}
252
#[cfg(test)]
mod tests {
    //! Unit tests for [`WalkHasher`] and its path helpers. Every filesystem
    //! test works inside its own temporary directory.

    use super::*;
    use tempfile::TempDir;

    /// Collect the workspace-relative paths of `inputs` as strings.
    fn relative_names(inputs: &[HashedInput]) -> Vec<String> {
        inputs
            .iter()
            .map(|input| input.relative_path.to_string_lossy().into_owned())
            .collect()
    }

    /// A directory, an explicit file and a glob can be mixed in one call.
    #[test]
    fn resolves_explicit_files_dirs_and_globs() {
        let tmp = TempDir::new().unwrap();
        let root = tmp.path();
        fs::create_dir_all(root.join("src/sub")).unwrap();
        fs::write(root.join("src/a.ts"), "A").unwrap();
        fs::write(root.join("src/sub/b.ts"), "B").unwrap();
        fs::write(root.join("README.md"), "readme").unwrap();

        let hasher = WalkHasher::new(root);
        let inputs = hasher
            .resolve_sync(&["src".into(), "README.md".into(), "**/*.ts".into()])
            .unwrap();
        let rels = relative_names(&inputs);
        assert!(rels.contains(&"src/a.ts".to_string()));
        assert!(rels.contains(&"src/sub/b.ts".to_string()));
        assert!(rels.contains(&"README.md".to_string()));
    }

    /// A file matched by two patterns is reported once.
    #[test]
    fn deduplicates_overlapping_patterns() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join("a.txt"), "content").unwrap();
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher
            .resolve_sync(&["a.txt".into(), "*.txt".into()])
            .unwrap();
        assert_eq!(inputs.len(), 1);
    }

    /// Blank patterns neither match anything nor raise an error.
    #[test]
    fn empty_and_whitespace_patterns_are_ignored() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join("a.txt"), "content").unwrap();
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher.resolve_sync(&[String::new(), "  ".into()]).unwrap();
        assert!(inputs.is_empty());
    }

    /// An explicit file that does not exist is a `NotFound` I/O error.
    #[test]
    fn missing_file_errors() {
        let tmp = TempDir::new().unwrap();
        let hasher = WalkHasher::new(tmp.path());
        let err = hasher
            .resolve_sync(&["nonexistent.txt".into()])
            .unwrap_err();
        assert!(matches!(
            err,
            Error::Io { source, .. } if source.kind() == std::io::ErrorKind::NotFound
        ));
    }

    /// The digest depends on content only, not on the file name.
    #[test]
    fn same_content_yields_same_hash() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join("a.txt"), "payload").unwrap();
        fs::write(tmp.path().join("b.txt"), "payload").unwrap();
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher.resolve_sync(&["*.txt".into()]).unwrap();
        assert_eq!(inputs.len(), 2);
        assert_eq!(inputs[0].sha256, inputs[1].sha256);
    }

    /// Different content produces different digests.
    #[test]
    fn different_content_yields_different_hash() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join("a.txt"), "one").unwrap();
        fs::write(tmp.path().join("b.txt"), "two").unwrap();
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher.resolve_sync(&["*.txt".into()]).unwrap();
        assert_eq!(inputs.len(), 2);
        assert_ne!(inputs[0].sha256, inputs[1].sha256);
    }

    /// Output order is by relative path, independent of creation order.
    #[test]
    fn results_are_sorted_by_relative_path() {
        let tmp = TempDir::new().unwrap();
        for name in ["c.txt", "a.txt", "b.txt"] {
            fs::write(tmp.path().join(name), name).unwrap();
        }
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher.resolve_sync(&["*.txt".into()]).unwrap();
        assert_eq!(relative_names(&inputs), vec!["a.txt", "b.txt", "c.txt"]);
    }

    /// A directory pattern reaches files several levels down.
    #[test]
    fn nested_directory_walks_recursively() {
        let tmp = TempDir::new().unwrap();
        fs::create_dir_all(tmp.path().join("a/b/c")).unwrap();
        fs::write(tmp.path().join("a/b/c/deep.txt"), "deep").unwrap();
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher.resolve_sync(&["a".into()]).unwrap();
        assert_eq!(inputs.len(), 1);
        assert_eq!(inputs[0].relative_path, PathBuf::from("a/b/c/deep.txt"));
    }

    /// Character classes are treated as globs, not as literal file names.
    #[test]
    fn glob_brackets_work() {
        let tmp = TempDir::new().unwrap();
        for name in ["a1.txt", "a2.txt", "b1.txt"] {
            fs::write(tmp.path().join(name), name).unwrap();
        }
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher.resolve_sync(&["a[12].txt".into()]).unwrap();
        assert_eq!(inputs.len(), 2);
    }

    /// An unreadable directory must surface as an error rather than as an
    /// empty (and therefore wrong) input set.
    #[cfg(unix)]
    #[test]
    fn walkdir_errors_are_not_silently_dropped() {
        use std::os::unix::fs::PermissionsExt;

        let tmp = TempDir::new().unwrap();
        let unreadable = tmp.path().join("restricted");
        fs::create_dir_all(&unreadable).unwrap();
        fs::write(unreadable.join("secret.txt"), "secret").unwrap();

        let mut permissions = fs::metadata(&unreadable).unwrap().permissions();
        permissions.set_mode(0o000);
        fs::set_permissions(&unreadable, permissions).unwrap();

        // Privileged users (e.g. root) bypass mode bits; in that case the
        // walk legitimately succeeds and there is nothing to assert.
        let mode_bits_enforced = fs::read_dir(&unreadable).is_err();

        let hasher = WalkHasher::new(tmp.path());
        let outcome = hasher.resolve_sync(&["restricted".into()]);

        // Restore permissions before any assertion can panic, so the
        // temporary directory can always be removed.
        let mut cleanup_permissions = fs::metadata(&unreadable).unwrap().permissions();
        cleanup_permissions.set_mode(0o755);
        fs::set_permissions(&unreadable, cleanup_permissions).unwrap();

        if !mode_bits_enforced {
            return;
        }
        let err = outcome.unwrap_err();
        assert!(err.to_string().contains("walkdir"));
    }

    /// The trait-level name is the fixed identifier `walk`.
    #[test]
    fn walker_name_is_walk() {
        let tmp = TempDir::new().unwrap();
        let hasher = WalkHasher::new(tmp.path());
        assert_eq!(hasher.name(), "walk");
    }

    /// The async trait entry point returns the same kind of result as the
    /// synchronous resolver.
    #[tokio::test]
    async fn async_trait_method_works() {
        let tmp = TempDir::new().unwrap();
        fs::write(tmp.path().join("x.txt"), "x").unwrap();
        let hasher = WalkHasher::new(tmp.path());
        let inputs = hasher.resolve_and_hash(&["*.txt".into()]).await.unwrap();
        assert_eq!(inputs.len(), 1);
    }

    /// Literal prefixes are extracted up to the first glob segment.
    #[test]
    fn extract_glob_base_handles_common_shapes() {
        assert_eq!(extract_glob_base("src/**/*.ts"), "src");
        assert_eq!(extract_glob_base("**/*.ts"), "");
        assert_eq!(extract_glob_base("foo/bar/*.rs"), "foo/bar");
        assert_eq!(extract_glob_base("*.txt"), "");
    }

    /// `.` is dropped and `..` removes the preceding component.
    #[test]
    fn normalize_rel_path_strips_dots() {
        assert_eq!(normalize_rel_path(Path::new("./a/b")), PathBuf::from("a/b"));
        assert_eq!(normalize_rel_path(Path::new("a/../b")), PathBuf::from("b"));
    }
}