Skip to main content

winx_code_agent/utils/
repo.rs

1//! Repository context, ported for parity with wcgw's `repo_context.py`.
2//!
3//! The flow mirrors wcgw exactly:
4//! 1. Walk the workspace (gitignore-aware) collecting candidate files.
5//! 2. For git repos, pull recently-changed files from history (topological).
6//! 3. Rank every file with the embedded path-probability model.
7//! 4. Build the shown set: active files first, then recent git files, then the
8//!    statistically-ranked remainder, up to a size that scales with the repo.
9//! 5. Render it as a partially-expanded directory tree.
10
11use crate::errors::Result;
12use crate::utils::display_tree::DirectoryTree;
13use ignore::WalkBuilder;
14use std::collections::HashSet;
15use std::path::{Path, PathBuf};
16use std::process::Command;
17
/// Scan budget for the workspace walk: `get_all_files_max_depth` stops
/// collecting once this limit is reached (value kept for wcgw parity).
const MAX_ENTRIES_CHECK: usize = 100_000;
/// Roughly "10 directory levels deep" — walk depth counts files too, so +1.
/// Handed to `WalkBuilder::max_depth` when the workspace is walked.
const MAX_WALK_DEPTH: usize = 11;
/// How far back through git history to look for recently-touched files:
/// this is the `-n` commit cap passed to `git log`.
const MAX_COMMITS_WALK: usize = 500;
24
25/// Build the workspace context string and the list of shown files.
26///
27/// The returned string is a partially-expanded directory tree, byte-for-byte in
28/// the same spirit as wcgw's `DirectoryTree.display()`.
29pub fn get_repo_context(path: &Path) -> Result<(String, Vec<String>)> {
30    let context_dir = context_dir(path);
31    let is_git_repo = find_git_root(&context_dir).is_some();
32
33    let mut all_files = get_all_files_max_depth(&context_dir, is_git_repo);
34    all_files.sort(); // deterministic order so score ties resolve stably
35
36    let dynamic_max_files =
37        if is_git_repo { calculate_dynamic_file_limit(all_files.len()) } else { 50 };
38
39    let existing: HashSet<&str> = all_files.iter().map(String::as_str).collect();
40
41    let recent_git_files = if is_git_repo {
42        let count = std::cmp::max(10, (dynamic_max_files as f64 * 0.2) as usize);
43        get_recent_git_files(&context_dir, count, &existing)
44    } else {
45        Vec::new()
46    };
47
48    let ranked = rank_files(&all_files);
49    let active = crate::utils::workspace_stats::active_files_for_context(&context_dir);
50
51    // Compose the shown set: active → recent → ranked remainder (no dups).
52    let mut top_files: Vec<String> = Vec::new();
53    let mut seen: HashSet<String> = HashSet::new();
54    let mut push = |file: String, top: &mut Vec<String>, seen: &mut HashSet<String>| {
55        if existing.contains(file.as_str()) && seen.insert(file.clone()) {
56            top.push(file);
57        }
58    };
59
60    for file in active {
61        push(file, &mut top_files, &mut seen);
62    }
63    for file in recent_git_files {
64        push(file, &mut top_files, &mut seen);
65    }
66    if top_files.len() < dynamic_max_files {
67        for file in ranked {
68            if top_files.len() >= dynamic_max_files {
69                break;
70            }
71            if seen.insert(file.clone()) {
72                top_files.push(file);
73            }
74        }
75    }
76
77    let mut tree = DirectoryTree::new(&context_dir);
78    for file in top_files.iter().take(dynamic_max_files) {
79        tree.expand(file);
80    }
81
82    Ok((tree.display(), top_files))
83}
84
85/// The directory wcgw would treat as the context root: the git toplevel if any,
86/// otherwise the path itself (or its parent when a file is passed).
87fn context_dir(path: &Path) -> PathBuf {
88    if let Some(git_root) = find_git_root(path) {
89        return git_root;
90    }
91    if path.is_file() {
92        path.parent().unwrap_or(path).to_path_buf()
93    } else {
94        path.to_path_buf()
95    }
96}
97
/// Locate the root of the git repository that contains `path`.
///
/// The search starts at `path` itself — or at its parent directory when `path`
/// is a file — and climbs one ancestor at a time. The first directory holding
/// a `.git` entry is returned; only existence is checked, so a `.git` file
/// counts as well as a `.git` directory.
///
/// Returns `None` when no ancestor has a `.git` entry, or when `path` is a
/// file without a parent.
fn find_git_root(path: &Path) -> Option<PathBuf> {
    let origin = if path.is_file() { path.parent()? } else { path };
    origin
        .ancestors()
        .find(|candidate| candidate.join(".git").exists())
        .map(Path::to_path_buf)
}
110
111/// Collect candidate files relative to `root`.
112///
113/// gitignore filtering is only applied inside a git repo (`require_git`), matching
114/// wcgw which passes `repo=None` — and thus never ignores anything — for plain
115/// folders. Hidden files are kept (wcgw shows dotfiles unless gitignored); only
116/// the `.git` directory itself is always pruned.
117fn get_all_files_max_depth(root: &Path, is_git_repo: bool) -> Vec<String> {
118    let walker = WalkBuilder::new(root)
119        .max_depth(Some(MAX_WALK_DEPTH))
120        .hidden(false)
121        .parents(true)
122        .ignore(false)
123        .git_ignore(is_git_repo)
124        .git_global(is_git_repo)
125        .git_exclude(is_git_repo)
126        .require_git(true)
127        .filter_entry(|entry| entry.file_name() != ".git")
128        .build();
129
130    let mut files = Vec::new();
131    for entry in walker.flatten() {
132        if files.len() >= MAX_ENTRIES_CHECK {
133            break;
134        }
135        if entry.file_type().is_some_and(|file_type| file_type.is_file()) {
136            if let Ok(relative) = entry.path().strip_prefix(root) {
137                files.push(relative.to_string_lossy().to_string());
138            }
139        }
140    }
141    files
142}
143
144/// Recently-changed files from git history, newest first, topological order,
145/// merges skipped — the CLI mirror of wcgw's pygit2 revwalk.
146fn get_recent_git_files(root: &Path, count: usize, existing: &HashSet<&str>) -> Vec<String> {
147    let output = Command::new("git")
148        .arg("-C")
149        .arg(root)
150        .args([
151            "log",
152            "--name-only",
153            "--no-merges",
154            "--topo-order",
155            "--format=",
156            "-n",
157            &MAX_COMMITS_WALK.to_string(),
158        ])
159        .output();
160
161    let Ok(output) = output else {
162        return Vec::new();
163    };
164    if !output.status.success() {
165        return Vec::new();
166    }
167
168    let mut recent = Vec::new();
169    let mut seen = HashSet::new();
170    for line in String::from_utf8_lossy(&output.stdout).lines().map(str::trim) {
171        if line.is_empty() || !existing.contains(line) {
172            continue;
173        }
174        if seen.insert(line.to_string()) {
175            recent.push(line.to_string());
176            if recent.len() >= count {
177                break;
178            }
179        }
180    }
181    recent
182}
183
/// Pick how many files to show for a repository of `total_files` files.
///
/// Mirrors wcgw: repositories with up to 50 files get the floor of 50; above
/// that the limit grows linearly (with the fractional part truncated) along a
/// slope that would reach 400 at 30,000 files, and is clamped to 400.
fn calculate_dynamic_file_limit(total_files: usize) -> usize {
    const FLOOR: usize = 50;
    const CEILING: usize = 400;
    const SATURATION_POINT: f64 = 30_000.0;

    let surplus = total_files.saturating_sub(FLOOR);
    if surplus == 0 {
        return FLOOR;
    }

    // Extra files shown per extra file in the repository.
    let slope = (CEILING - FLOOR) as f64 / (SATURATION_POINT - FLOOR as f64);
    // Truncation toward zero is intentional (matches Python's `int(...)`).
    let growth = (surplus as f64 * slope) as usize;
    std::cmp::min(FLOOR + growth, CEILING)
}
195
196/// Order files best-first. Uses the embedded path-probability model; if it
197/// can't be loaded, falls back to a simple importance/depth heuristic.
198fn rank_files(all_files: &[String]) -> Vec<String> {
199    if let Some(scores) = crate::utils::path_prob::score_paths(all_files) {
200        let mut indexed: Vec<(usize, f64)> = scores.into_iter().enumerate().collect();
201        // Higher log-prob first; stable sort keeps the alphabetical order on ties.
202        indexed.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
203        return indexed.into_iter().map(|(index, _)| all_files[index].clone()).collect();
204    }
205
206    let mut files = all_files.to_vec();
207    files.sort_by_key(|path| (heuristic_score(path), path.clone()));
208    files
209}
210
/// Project-defining files that the fallback ranking surfaces first.
///
/// Entries are compared against the whole relative path, so only files at the
/// workspace root can match.
const IMPORTANT_NAMES: &[&str] = &[
    "Cargo.toml",
    "README.md",
    "AGENTS.md",
    "package.json",
    "pnpm-workspace.yaml",
    "pyproject.toml",
    "go.mod",
    "Dockerfile",
    "docker-compose.yml",
];

/// Fallback ranking when the ML model is unavailable. Lower is better.
///
/// The score is the sum of three penalties:
/// * 10 unless `path` is exactly one of `IMPORTANT_NAMES`;
/// * 1 per `/` in the path (nesting depth);
/// * 1 if the path contains `test` or `spec` anywhere (plain substring match).
fn heuristic_score(path: &str) -> usize {
    let mut score = path.chars().filter(|&ch| ch == '/').count();

    let is_important = IMPORTANT_NAMES.iter().any(|&name| name == path);
    if !is_important {
        score += 10;
    }

    let looks_like_test = ["test", "spec"].iter().any(|needle| path.contains(needle));
    if looks_like_test {
        score += 1;
    }

    score
}
230
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Smoke test: a two-file workspace shows up in both the rendered tree and
    /// the returned file list.
    #[test]
    fn builds_repo_context_from_files() -> Result<()> {
        let workspace = TempDir::new()?;
        let root = workspace.path();
        std::fs::create_dir(root.join("src"))?;
        std::fs::write(root.join("Cargo.toml"), "[package]\nname='x'\n")?;
        std::fs::write(root.join("src/lib.rs"), "pub fn x() {}\n")?;

        let (tree, shown) = get_repo_context(root)?;
        assert!(tree.contains("Cargo.toml"));
        assert!(shown.iter().any(|file| file == "src/lib.rs"));
        Ok(())
    }

    /// The limit is pinned at both ends and grows in between.
    #[test]
    fn dynamic_limit_scales_between_bounds() {
        for (total, expected) in [(10, 50), (50, 50), (1_000_000, 400)] {
            assert_eq!(calculate_dynamic_file_limit(total), expected);
        }
        assert!(calculate_dynamic_file_limit(1000) > 50);
    }

    /// Inside a directory marked as a git repo, gitignored files are not listed.
    #[test]
    fn respects_gitignore_in_git_repo() -> Result<()> {
        let workspace = TempDir::new()?;
        let root = workspace.path();
        std::fs::create_dir(root.join(".git"))?; // mark as git repo
        for (name, contents) in [
            (".gitignore", "ignored.txt\n"),
            ("ignored.txt", "secret\n"),
            ("kept.rs", "fn x() {}\n"),
        ] {
            std::fs::write(root.join(name), contents)?;
        }

        let listed = get_all_files_max_depth(root, true);
        let has = |wanted: &str| listed.iter().any(|file| file == wanted);
        assert!(has("kept.rs"));
        assert!(!has("ignored.txt"), "gitignore must hide ignored.txt");
        Ok(())
    }
}
271}