Skip to main content

newt_coder/
workspace_scan.rs

1//! Scan the workspace for files relevant to the task.
2//!
3//! Two-pass strategy:
4//!
5//! 1. **Mentioned files.** Walk the workspace once for source files
6//!    (extensions in `SCAN_EXTENSIONS`); if the task prompt mentions
7//!    any of their relative paths, file names, or stems, return only
8//!    those (precision wins for targeted refactors like
9//!    "rename greet to hello in src/lib.rs").
10//!
11//! 2. **Fallback.** If no file matches, return every source file in
//!    the workspace (rg-style: skip `target/`, `node_modules/`, and
//!    hidden files and dirs). The prompt builder applies the
14//!    `DEFAULT_CONTEXT_CAP_CHARS` budget on top, so even a large
15//!    workspace gets bounded context.
16//!
17//! Paths returned are **relative to the workspace root** so the
18//! prompt and the apply step can share the same identifiers.
19
20use std::path::{Path, PathBuf};
21
22use crate::error::{CoderError, Result};
23
/// Extensions (without the leading dot) that mark a file as "source"
/// for the scan. Broad enough for the languages the bake-off targets
/// (Rust, its neighbouring ecosystems, and docs) while still leaving
/// vendored binaries out. Comparison against a file's extension is
/// exact, so matching is case-sensitive.
const SCAN_EXTENSIONS: &[&str] = &[
    // Rust and its manifests.
    "rs", "toml",
    // Scripting / web.
    "py", "js", "ts",
    // Other compiled languages.
    "go", "java", "c", "h", "cpp", "hpp",
    // Documentation.
    "md",
];
30
/// Names of directories the walk never descends into. An entry is
/// skipped only when its own name equals one of these exactly; there
/// is no prefix or glob matching.
const SKIP_DIRS: &[&str] = &[
    "target",
    "node_modules",
];
34
35/// Two-pass scan: prefer mentioned files; fall back to all source.
36pub fn scan_workspace_for_files(workspace: &Path, task: &str) -> Result<Vec<PathBuf>> {
37    let all = scan_all_source_files(workspace)?;
38    let mentioned = filter_mentioned(&all, task);
39    if !mentioned.is_empty() {
40        Ok(mentioned)
41    } else {
42        Ok(all)
43    }
44}
45
/// Return the subset of `all` that `task` mentions by relative path,
/// file name, or file stem.
///
/// A candidate counts as mentioned only when it appears in `task` as a
/// whole token: the characters immediately before and after the match
/// must not be alphanumeric or `_`. This keeps short stems such as
/// `lib`, `mod`, or `a` from matching inside unrelated words
/// ("library", "model", "data.rs"), which would select the wrong files
/// and hide the fact that nothing was really mentioned. Matching is
/// case-sensitive.
///
/// Empty paths, names, and stems never match, so a degenerate entry
/// cannot select every file. The order of `all` is preserved.
fn filter_mentioned(all: &[PathBuf], task: &str) -> Vec<PathBuf> {
    all.iter()
        .filter(|path| {
            let rel = path.display().to_string();
            let fname = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
            let stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or("");
            [rel.as_str(), fname, stem]
                .iter()
                .any(|needle| mentions_token(task, needle))
        })
        .cloned()
        .collect()
}

/// True when `c` can be part of an identifier-like word.
fn is_word_char(c: char) -> bool {
    c.is_alphanumeric() || c == '_'
}

/// True when `needle` occurs in `haystack` delimited on both sides by a
/// non-word character or by the start/end of the text. An empty
/// `needle` never matches.
fn mentions_token(haystack: &str, needle: &str) -> bool {
    if needle.is_empty() {
        return false;
    }
    let mut from = 0;
    while let Some(offset) = haystack[from..].find(needle) {
        let start = from + offset;
        let end = start + needle.len();
        let open_left = haystack[..start]
            .chars()
            .next_back()
            .map_or(true, |c| !is_word_char(c));
        let open_right = haystack[end..]
            .chars()
            .next()
            .map_or(true, |c| !is_word_char(c));
        if open_left && open_right {
            return true;
        }
        // Step a single character (not a whole match) forward so that
        // overlapping occurrences are still examined.
        from = start + haystack[start..].chars().next().map_or(1, char::len_utf8);
    }
    false
}
65
66/// Walk the workspace and collect every source file (extension in
67/// `SCAN_EXTENSIONS`, not under a skipped dir, not hidden).
68fn scan_all_source_files(workspace: &Path) -> Result<Vec<PathBuf>> {
69    let mut out = Vec::new();
70    walk(workspace, workspace, &mut out)?;
71    out.sort();
72    Ok(out)
73}
74
75fn walk(root: &Path, dir: &Path, out: &mut Vec<PathBuf>) -> Result<()> {
76    let entries = std::fs::read_dir(dir)
77        .map_err(|e| CoderError::Workspace(format!("read_dir {}: {e}", dir.display())))?;
78    for entry in entries {
79        let entry = entry?;
80        let path = entry.path();
81        let name = entry.file_name();
82        let name_str = name.to_string_lossy();
83
84        if name_str.starts_with('.') || SKIP_DIRS.contains(&name_str.as_ref()) {
85            continue;
86        }
87
88        if path.is_dir() {
89            walk(root, &path, out)?;
90        } else if path.is_file() {
91            if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
92                if SCAN_EXTENSIONS.contains(&ext) {
93                    let rel = path.strip_prefix(root).map_err(|e| {
94                        CoderError::Workspace(format!("strip_prefix {}: {e}", path.display()))
95                    })?;
96                    out.push(rel.to_path_buf());
97                }
98            }
99        }
100    }
101    Ok(())
102}
103
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Create `rel` under `dir` with `contents`, creating any missing
    /// parent directories first.
    fn write(dir: &Path, rel: &str, contents: &str) {
        let target = dir.join(rel);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(target, contents).unwrap();
    }

    /// Build a throwaway workspace from `(relative path, contents)` pairs.
    fn workspace_with(files: &[(&str, &str)]) -> TempDir {
        let tmp = TempDir::new().unwrap();
        for (rel, contents) in files {
            write(tmp.path(), rel, contents);
        }
        tmp
    }

    /// Render scan results as display strings for easy comparison.
    fn rendered(files: &[PathBuf]) -> Vec<String> {
        files.iter().map(|p| p.display().to_string()).collect()
    }

    /// True when `wanted` is one of the rendered `paths`.
    fn has(paths: &[String], wanted: &str) -> bool {
        paths.iter().any(|p| p == wanted)
    }

    #[test]
    fn finds_rust_sources_and_skips_target() {
        // target/ artifacts should NOT appear in the scan.
        let tmp = workspace_with(&[
            ("src/lib.rs", "pub fn x() {}\n"),
            ("src/main.rs", "fn main() {}\n"),
            ("target/debug/junk.rs", "fn junk() {}\n"),
        ]);

        let paths = rendered(&scan_all_source_files(tmp.path()).unwrap());
        assert!(has(&paths, "src/lib.rs"));
        assert!(has(&paths, "src/main.rs"));
        assert!(
            paths.iter().all(|p| !p.starts_with("target/")),
            "target/ leaked into the scan: {paths:?}"
        );
    }

    #[test]
    fn skips_hidden_dirs() {
        let tmp = workspace_with(&[
            ("src/lib.rs", "pub fn x() {}\n"),
            (".git/config", "[core]\n"),
            (".hidden/file.rs", "fn x() {}\n"),
        ]);

        let paths = rendered(&scan_all_source_files(tmp.path()).unwrap());
        assert!(has(&paths, "src/lib.rs"));
        assert!(
            paths
                .iter()
                .all(|p| !p.starts_with(".git/") && !p.starts_with(".hidden/")),
            "hidden dir leaked: {paths:?}"
        );
    }

    #[test]
    fn scan_prefers_mentioned_files() {
        let tmp = workspace_with(&[
            ("src/lib.rs", "pub fn greet() {}\n"),
            ("src/other.rs", "pub fn other() {}\n"),
            ("Cargo.toml", "[package]\n"),
        ]);

        let hits = scan_workspace_for_files(tmp.path(), "Rename greet in src/lib.rs").unwrap();
        let paths = rendered(&hits);
        assert!(has(&paths, "src/lib.rs"));
        // other.rs is not mentioned, so it must not be included.
        assert!(
            !has(&paths, "src/other.rs"),
            "unrelated file leaked into mentioned-only scan: {paths:?}"
        );
    }

    #[test]
    fn scan_falls_back_to_all_when_nothing_mentioned() {
        let tmp = workspace_with(&[
            ("src/lib.rs", "pub fn a() {}\n"),
            ("src/other.rs", "pub fn b() {}\n"),
        ]);

        // A task that doesn't mention any path/name/stem in the workspace.
        let hits = scan_workspace_for_files(tmp.path(), "Add a license header everywhere").unwrap();
        let paths = rendered(&hits);
        assert!(has(&paths, "src/lib.rs"));
        assert!(has(&paths, "src/other.rs"));
    }

    #[test]
    fn ignores_unknown_extensions() {
        let tmp = workspace_with(&[
            ("src/lib.rs", "pub fn x() {}\n"),
            ("binary.bin", "junk"),
            ("image.png", "fake png bytes"),
        ]);

        let paths = rendered(&scan_all_source_files(tmp.path()).unwrap());
        assert!(has(&paths, "src/lib.rs"));
        assert!(paths
            .iter()
            .all(|p| !p.ends_with(".bin") && !p.ends_with(".png")));
    }

    #[test]
    fn file_stem_match_triggers_mention() {
        let tmp = workspace_with(&[("src/parser.rs", "pub fn parse() {}\n")]);

        // "parser" is the file stem.
        let hits =
            scan_workspace_for_files(tmp.path(), "Update the parser to handle commas").unwrap();
        assert!(has(&rendered(&hits), "src/parser.rs"));
    }
}