cuenv_core/tasks/
io.rs

1use crate::{Error, Result};
2use globset::{Glob, GlobSet, GlobSetBuilder};
3use sha2::{Digest, Sha256};
4use std::collections::{BTreeMap, BTreeSet};
5use std::fs;
6use std::io::Read;
7use std::path::{Component, Path, PathBuf};
8use tracing;
9use walkdir::WalkDir;
10
/// A single input file produced by input resolution.
///
/// Derives `PartialEq`/`Eq` so that resolved entries can be compared directly
/// (for example when checking whether two resolutions produced the same file).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ResolvedInputFile {
    /// Normalized path of the file relative to the project root.
    pub rel_path: PathBuf,
    /// Path the file content is read from (canonicalized when possible).
    pub source_path: PathBuf,
    /// Lowercase hex encoding of the SHA-256 digest of the file content.
    pub sha256: String,
    /// Number of bytes that were hashed.
    pub size: u64,
}
18
19#[derive(Debug, Clone)]
20pub struct ResolvedInputs {
21    pub files: Vec<ResolvedInputFile>,
22}
23
24impl ResolvedInputs {
25    pub fn to_summary_map(&self) -> BTreeMap<String, String> {
26        let mut map = BTreeMap::new();
27        for f in &self.files {
28            map.insert(
29                normalize_rel_path(&f.rel_path)
30                    .to_string_lossy()
31                    .to_string(),
32                f.sha256.clone(),
33            );
34        }
35        map
36    }
37}
38
/// Lexically normalizes `p` into a relative path.
///
/// `.` components are dropped, `..` removes the previously kept component
/// (and is a no-op when nothing has been kept yet), and root/prefix
/// components are discarded. The filesystem is never consulted.
fn normalize_rel_path(p: &Path) -> PathBuf {
    p.components().fold(PathBuf::new(), |mut acc, component| {
        match component {
            Component::Normal(segment) => acc.push(segment),
            Component::ParentDir => {
                acc.pop();
            }
            Component::CurDir | Component::RootDir | Component::Prefix(_) => {}
        }
        acc
    })
}
53
54pub fn sha256_file(path: &Path) -> Result<(String, u64)> {
55    let _span = tracing::trace_span!("sha256_file", path = %path.display()).entered();
56    let mut file = fs::File::open(path).map_err(|e| Error::Io {
57        source: e,
58        path: Some(path.into()),
59        operation: "open".into(),
60    })?;
61    let mut hasher = Sha256::new();
62    let mut buf = [0u8; 1024 * 64];
63    let mut total: u64 = 0;
64    loop {
65        let n = file.read(&mut buf).map_err(|e| Error::Io {
66            source: e,
67            path: Some(path.into()),
68            operation: "read".into(),
69        })?;
70        if n == 0 {
71            break;
72        }
73        hasher.update(&buf[..n]);
74        total += n as u64;
75    }
76    let digest = hasher.finalize();
77    tracing::trace!(path = %path.display(), size = total, "Hashed file");
78    Ok((hex::encode(digest), total))
79}
80
/// Resolves task input patterns (explicit files, directories and globs)
/// against a project root.
#[derive(Debug, Clone)]
pub struct InputResolver {
    /// Directory that every input pattern is joined onto.
    project_root: PathBuf,
}
84
85impl InputResolver {
86    pub fn new(project_root: impl AsRef<Path>) -> Self {
87        Self {
88            project_root: project_root.as_ref().to_path_buf(),
89        }
90    }
91
92    pub fn resolve(&self, patterns: &[String]) -> Result<ResolvedInputs> {
93        let resolve_span = tracing::info_span!(
94            "input_resolver.resolve",
95            root = %self.project_root.display(),
96            pattern_count = patterns.len()
97        );
98        let _resolve_guard = resolve_span.enter();
99
100        tracing::debug!(
101            patterns = ?patterns,
102            "Starting input resolution"
103        );
104
105        // Categorize patterns: explicit files, directories to walk, and globs
106        let mut explicit_files: Vec<String> = Vec::new();
107        let mut dirs_to_walk: Vec<(String, GlobSet)> = Vec::new(); // (dir_path, globset for matching)
108
109        let pattern_span = tracing::debug_span!("patterns.analyze");
110        {
111            let _g = pattern_span.enter();
112            for pat in patterns {
113                let p = pat.trim();
114                if p.is_empty() {
115                    continue;
116                }
117
118                let looks_like_glob =
119                    p.contains('*') || p.contains('{') || p.contains('?') || p.contains('[');
120                let abs = self.project_root.join(p);
121
122                if looks_like_glob {
123                    // Extract base directory from glob pattern
124                    let base_dir = extract_glob_base(p);
125                    let glob_pat = p.to_string();
126                    let glob = Glob::new(&glob_pat).map_err(|e| {
127                        Error::configuration(format!("Invalid glob pattern '{glob_pat}': {e}"))
128                    })?;
129                    let set = GlobSetBuilder::new().add(glob).build().map_err(|e| {
130                        Error::configuration(format!("Failed to build glob set: {e}"))
131                    })?;
132                    dirs_to_walk.push((base_dir, set));
133                } else if abs.is_dir() {
134                    // Directory - walk it with a recursive glob
135                    let glob_pat = format!("{}/**/*", p.trim_end_matches('/'));
136                    let glob = Glob::new(&glob_pat).map_err(|e| {
137                        Error::configuration(format!("Invalid glob pattern '{glob_pat}': {e}"))
138                    })?;
139                    let set = GlobSetBuilder::new().add(glob).build().map_err(|e| {
140                        Error::configuration(format!("Failed to build glob set: {e}"))
141                    })?;
142                    dirs_to_walk.push((p.to_string(), set));
143                } else {
144                    // Explicit file path
145                    explicit_files.push(p.to_string());
146                }
147            }
148
149            tracing::debug!(
150                explicit_file_count = explicit_files.len(),
151                dirs_to_walk_count = dirs_to_walk.len(),
152                "Categorized input patterns"
153            );
154        }
155
156        let mut seen: BTreeSet<PathBuf> = BTreeSet::new();
157        let mut files: Vec<ResolvedInputFile> = Vec::new();
158
159        // Resolve explicit file paths directly (no walking needed)
160        let explicit_span =
161            tracing::debug_span!("explicit_files.resolve", count = explicit_files.len());
162        {
163            let _g = explicit_span.enter();
164            for raw in &explicit_files {
165                let abs = self.project_root.join(raw);
166                if abs.is_file() {
167                    let rel = normalize_rel_path(Path::new(raw));
168                    if seen.insert(rel.clone()) {
169                        let (hash, size) = sha256_file(&abs)?;
170                        files.push(ResolvedInputFile {
171                            rel_path: rel,
172                            source_path: canonical_or_abs(&abs)?,
173                            sha256: hash,
174                            size,
175                        });
176                    }
177                } else {
178                    tracing::warn!(path = %raw, "Explicit input file not found");
179                }
180            }
181            tracing::debug!(
182                explicit_files_found = files.len(),
183                "Explicit files resolved"
184            );
185        }
186
187        // Walk only the specific directories that need walking
188        if !dirs_to_walk.is_empty() {
189            let walkdir_span =
190                tracing::info_span!("walkdir.traverse", dirs_count = dirs_to_walk.len());
191            let _g = walkdir_span.enter();
192
193            let mut total_entries_visited: u64 = 0;
194            let mut total_files_matched: u64 = 0;
195            let mut total_bytes_hashed: u64 = 0;
196
197            for (base_dir, globset) in &dirs_to_walk {
198                let walk_root = self.project_root.join(base_dir);
199                if !walk_root.exists() {
200                    tracing::debug!(dir = %base_dir, "Directory does not exist, skipping");
201                    continue;
202                }
203
204                tracing::debug!(dir = %base_dir, "Walking directory for glob matches");
205
206                for entry in WalkDir::new(&walk_root)
207                    .follow_links(true)
208                    .into_iter()
209                    .filter_map(|e| e.ok())
210                {
211                    total_entries_visited += 1;
212                    let path = entry.path();
213                    if path.is_dir() {
214                        continue;
215                    }
216
217                    // Relative to project root (not walk root)
218                    let rel = match path.strip_prefix(&self.project_root) {
219                        Ok(p) => p,
220                        Err(_) => continue,
221                    };
222                    let rel_norm = normalize_rel_path(rel);
223
224                    // Match against this specific globset
225                    if globset.is_match(rel_norm.as_path()) && seen.insert(rel_norm.clone()) {
226                        total_files_matched += 1;
227                        let src = canonical_or_abs(path)?;
228                        let (hash, size) = sha256_file(&src)?;
229                        total_bytes_hashed += size;
230                        files.push(ResolvedInputFile {
231                            rel_path: rel_norm,
232                            source_path: src,
233                            sha256: hash,
234                            size,
235                        });
236                    }
237                }
238            }
239
240            tracing::info!(
241                entries_visited = total_entries_visited,
242                files_matched = total_files_matched,
243                total_bytes_hashed,
244                "WalkDir traversal complete"
245            );
246        } else {
247            tracing::debug!("No directories to walk, skipping WalkDir");
248        }
249
250        // Deterministic ordering
251        files.sort_by(|a, b| a.rel_path.cmp(&b.rel_path));
252
253        tracing::info!(total_files = files.len(), "Input resolution complete");
254
255        Ok(ResolvedInputs { files })
256    }
257}
258
/// Returns the literal directory prefix of a glob pattern: every `/`-separated
/// segment that precedes the first segment containing a glob metacharacter
/// (`*`, `{`, `?` or `[`). Empty segments are dropped.
///
/// Examples:
/// - `src/**/*.ts` -> `src`
/// - `**/*.ts` -> `` (empty, meaning root)
/// - `foo/bar/*.rs` -> `foo/bar`
fn extract_glob_base(pattern: &str) -> String {
    pattern
        .split('/')
        .take_while(|segment| {
            !segment
                .chars()
                .any(|c| matches!(c, '*' | '{' | '?' | '['))
        })
        .filter(|segment| !segment.is_empty())
        .collect::<Vec<_>>()
        .join("/")
}
276
277fn canonical_or_abs(p: &Path) -> Result<PathBuf> {
278    // Resolve symlinks to target content; fall back to absolute if canonicalize fails
279    match fs::canonicalize(p) {
280        Ok(c) => Ok(c),
281        Err(_) => Ok(p.absolutize()),
282    }
283}
284
/// Converts a path into an absolute one without consulting the filesystem.
trait Absolutize {
    /// Returns the path unchanged when it is already absolute; otherwise joins
    /// it onto the current working directory (or onto `.` if the working
    /// directory cannot be determined).
    fn absolutize(&self) -> PathBuf;
}
impl Absolutize for &Path {
    fn absolutize(&self) -> PathBuf {
        if !self.is_absolute() {
            let cwd = match std::env::current_dir() {
                Ok(dir) => dir,
                Err(_) => PathBuf::from("."),
            };
            return cwd.join(self);
        }
        self.to_path_buf()
    }
}
299
300pub fn populate_hermetic_dir(resolved: &ResolvedInputs, hermetic_root: &Path) -> Result<()> {
301    // Create directories and populate files preserving relative structure
302    for f in &resolved.files {
303        let dest = hermetic_root.join(&f.rel_path);
304        if let Some(parent) = dest.parent() {
305            fs::create_dir_all(parent).map_err(|e| Error::Io {
306                source: e,
307                path: Some(parent.into()),
308                operation: "create_dir_all".into(),
309            })?;
310        }
311        // Try hardlink first
312        match fs::hard_link(&f.source_path, &dest) {
313            Ok(_) => {}
314            Err(_e) => {
315                // Fall back to copy on any error creating hardlink
316                fs::copy(&f.source_path, &dest).map_err(|e2| Error::Io {
317                    source: e2,
318                    path: Some(dest.into()),
319                    operation: "copy".into(),
320                })?;
321            }
322        }
323    }
324    Ok(())
325}
326
327pub fn collect_outputs(hermetic_root: &Path, patterns: &[String]) -> Result<Vec<PathBuf>> {
328    if patterns.is_empty() {
329        return Ok(vec![]);
330    }
331    let mut builder = GlobSetBuilder::new();
332    for p in patterns {
333        let looks_like_glob =
334            p.contains('*') || p.contains('{') || p.contains('?') || p.contains('[');
335        let mut pat = p.clone();
336        let abs = hermetic_root.join(&pat);
337        if abs.is_dir() && !looks_like_glob {
338            pat = format!("{}/**/*", pat.trim_end_matches('/'));
339        }
340        let glob = Glob::new(&pat)
341            .map_err(|e| Error::configuration(format!("Invalid output glob '{pat}': {e}")))?;
342        builder.add(glob);
343    }
344    let set = builder
345        .build()
346        .map_err(|e| Error::configuration(format!("Failed to build output globset: {e}")))?;
347
348    let mut results = Vec::new();
349    for entry in WalkDir::new(hermetic_root)
350        .into_iter()
351        .filter_map(|e| e.ok())
352    {
353        let path = entry.path();
354        if path.is_dir() {
355            continue;
356        }
357        let rel = match path.strip_prefix(hermetic_root) {
358            Ok(p) => p,
359            Err(_) => continue,
360        };
361        if set.is_match(rel) {
362            results.push(rel.to_path_buf());
363        }
364    }
365    results.sort();
366    Ok(results)
367}
368
369pub fn snapshot_workspace_tar_zst(src_root: &Path, dst_file: &Path) -> Result<()> {
370    let file = fs::File::create(dst_file).map_err(|e| Error::Io {
371        source: e,
372        path: Some(dst_file.into()),
373        operation: "create".into(),
374    })?;
375    let enc = zstd::Encoder::new(file, 3)
376        .map_err(|e| Error::configuration(format!("zstd encoder error: {e}")))?;
377    let mut builder = tar::Builder::new(enc);
378
379    match builder.append_dir_all(".", src_root) {
380        Ok(()) => {}
381        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
382            // Workspace contents can legitimately disappear during a task (e.g.
383            // package managers removing temp files). Skip snapshotting instead
384            // of failing the whole task cache write.
385            let _ = fs::remove_file(dst_file);
386            tracing::warn!(
387                root = %src_root.display(),
388                "Skipping workspace snapshot; files disappeared during archive: {e}"
389            );
390            return Ok(());
391        }
392        Err(e) => {
393            return Err(Error::configuration(format!("tar append failed: {e}")));
394        }
395    }
396
397    let enc = builder
398        .into_inner()
399        .map_err(|e| Error::configuration(format!("tar finalize failed: {e}")))?;
400    enc.finish()
401        .map_err(|e| Error::configuration(format!("zstd finish failed: {e}")))?;
402    Ok(())
403}
404
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A directory, an explicit file and a glob given together resolve to the
    /// expected set of relative paths.
    #[test]
    fn resolves_files_dirs_and_globs() {
        let workspace = TempDir::new().unwrap();
        let base = workspace.path();

        // Lay out: src/a.ts, src/sub/b.ts, README.md
        std::fs::create_dir_all(base.join("src/sub")).unwrap();
        for (name, content) in &[
            ("src/a.ts", "A"),
            ("src/sub/b.ts", "B"),
            ("README.md", "readme"),
        ] {
            std::fs::write(base.join(name), content).unwrap();
        }

        let patterns = vec![
            "src".to_string(),
            "README.md".to_string(),
            "**/*.ts".to_string(),
        ];
        let resolved = InputResolver::new(base).resolve(&patterns).unwrap();

        let mut rels: Vec<String> = Vec::new();
        for file in &resolved.files {
            rels.push(file.rel_path.to_string_lossy().into_owned());
        }

        for expected in &["src/a.ts", "src/sub/b.ts", "README.md"] {
            assert!(rels.iter().any(|r| r.as_str() == *expected));
        }
    }

    /// An explicit symlink input reports the link target as its source path.
    #[cfg(unix)]
    #[test]
    fn resolves_symlink_targets() {
        use std::os::unix::fs as unixfs;

        let workspace = TempDir::new().unwrap();
        let base = workspace.path();
        let data_dir = base.join("data");
        std::fs::create_dir_all(&data_dir).unwrap();
        std::fs::write(data_dir.join("real.txt"), "hello").unwrap();
        unixfs::symlink("real.txt", data_dir.join("link.txt")).unwrap();

        let patterns = vec!["data/link.txt".to_string()];
        let resolved = InputResolver::new(base).resolve(&patterns).unwrap();

        assert_eq!(resolved.files.len(), 1);
        let only = &resolved.files[0];
        assert!(only.source_path.ends_with("real.txt"));
    }

    /// Populating a hermetic directory recreates the relative layout of the inputs.
    #[test]
    fn populates_hermetic_dir() {
        let workspace = TempDir::new().unwrap();
        let base = workspace.path();
        std::fs::create_dir_all(base.join("dir")).unwrap();
        std::fs::write(base.join("dir/x.txt"), "x").unwrap();

        let patterns = vec!["dir".to_string()];
        let resolved = InputResolver::new(base).resolve(&patterns).unwrap();

        let hermetic = TempDir::new().unwrap();
        populate_hermetic_dir(&resolved, hermetic.path()).unwrap();

        let expected = hermetic.path().join("dir/x.txt");
        assert!(expected.exists());
    }
}
461}