use std::path::{Path, PathBuf};
use walkdir::{DirEntry, WalkDir};
use super::Corpus;
/// A file found by `walk_workspace`, tagged with the corpus it was classified
/// into and the top-level directory it lives under.
pub struct Candidate {
/// Path of the file as reported by the directory walk.
pub path: PathBuf,
/// Corpus assigned to the file from its name or extension.
pub corpus: Corpus,
/// First path component of `path` relative to the walk root; empty when
/// that component cannot be determined.
pub repo: String,
}
/// Entry names that are always skipped by the walk, regardless of depth.
///
/// `is_excluded` compares these against an entry's file name using exact,
/// case-sensitive equality; a directory that matches is not descended into.
const HARD_EXCLUDES: &[&str] = &[
"target", ".git", "node_modules",
".venv", "venv", "site-packages", "__pycache__",
".mypy_cache", ".pytest_cache", ".ruff_cache", ".tox",
".idea", ".vscode", ".svn", ".hg", ".gradle",
];
/// Recursively walks `root` and returns every regular file that `classify`
/// recognizes, in the order the walk yields them.
///
/// * `root` - directory to walk; symbolic links are not followed.
/// * `repo_scope` - forwarded to `is_excluded`; when non-empty it restricts
///   which directories directly under `root` are descended into.
///
/// Entries the walker fails to read are skipped silently. Each returned
/// [`Candidate`] carries the file path, its corpus, and the first path
/// component below `root` (an empty string if that cannot be determined).
pub fn walk_workspace(root: &Path, repo_scope: &[String]) -> Vec<Candidate> {
    WalkDir::new(root)
        .follow_links(false)
        .into_iter()
        // Prune excluded directories before descending into them.
        .filter_entry(|entry| !is_excluded(entry, repo_scope))
        // Best-effort walk: unreadable entries are dropped, not reported.
        .filter_map(Result::ok)
        .filter(|entry| entry.file_type().is_file())
        .filter_map(|entry| {
            let file_path = entry.path();
            let corpus = classify(file_path)?;
            let repo = first_segment_under(root, file_path).unwrap_or_default();
            Some(Candidate {
                path: file_path.to_path_buf(),
                corpus,
                repo,
            })
        })
        .collect()
}
/// Decides whether the walker should skip `entry` (and, for a directory,
/// everything beneath it).
///
/// An entry below the walk root is excluded when any of these holds:
/// * its file name is listed in `HARD_EXCLUDES`;
/// * it is a directory named `warehouse` or `cache` whose parent directory
///   is named `.nornir`;
/// * `repo_scope` is non-empty and the entry is a directory directly under
///   the walk root whose name is not listed in `repo_scope`.
///
/// The walk root itself (depth 0) is never excluded.
fn is_excluded(entry: &DirEntry, repo_scope: &[String]) -> bool {
    // `filter_entry` offers the root to this predicate as well. The caller
    // asked for this directory explicitly, so a root that happens to be called
    // e.g. `target` or `venv` must not prune the entire walk.
    if entry.depth() == 0 {
        return false;
    }

    // Compare as `OsStr` rather than going through `to_str()`, so that names
    // which are not valid UTF-8 are still subject to the scope filter below.
    let name = entry.file_name();

    if HARD_EXCLUDES.iter().any(|h| name == *h) {
        return true;
    }

    let is_dir = entry.file_type().is_dir();

    if is_dir && (name == "warehouse" || name == "cache") {
        let parent_is_nornir = entry
            .path()
            .parent()
            .and_then(|p| p.file_name())
            .map_or(false, |parent| parent == ".nornir");
        if parent_is_nornir {
            return true;
        }
    }

    // Scope filter: only directories directly under the root are checked.
    is_dir
        && entry.depth() == 1
        && !repo_scope.is_empty()
        && !repo_scope.iter().any(|r| name == r.as_str())
}
/// Returns the first path component of `p` relative to `root`.
///
/// Yields `None` when `p` does not start with `root`, or when `p` equals
/// `root` and there is no component left. Non-UTF-8 bytes in the component
/// are replaced lossily.
fn first_segment_under(root: &Path, p: &Path) -> Option<String> {
    let Ok(relative) = p.strip_prefix(root) else {
        return None;
    };
    let head = relative.components().next()?;
    Some(head.as_os_str().to_string_lossy().into_owned())
}
/// Maps a file path to its corpus, or `None` when neither the file name nor
/// the extension is recognized.
///
/// Matching is case-insensitive. A few well-known file names are checked
/// first and take precedence over the extension-based lookup.
fn classify(p: &Path) -> Option<Corpus> {
    const DOC_EXTS: &[&str] = &["md", "markdown", "txt", "rst", "adoc"];
    const CODE_EXTS: &[&str] = &[
        "rs", "py", "java", "kt", "scala", "go", "ts", "tsx", "js", "sh", "rb", "c", "cc", "cpp",
        "h", "hpp",
    ];
    const CONFIG_EXTS: &[&str] = &["toml", "yaml", "yml", "json"];

    let file_name = p.file_name()?.to_string_lossy().to_lowercase();

    // Special-cased names win over whatever the extension would say.
    match file_name.as_str() {
        "bench_history.jsonl" => return Some(Corpus::BenchHistory),
        "history.md" => return Some(Corpus::Changelog),
        "cargo.toml" | "nornir.toml" | "workspace.md" => return Some(Corpus::Config),
        other if other.starts_with("changelog") => return Some(Corpus::Changelog),
        _ => {}
    }

    // A missing extension becomes the empty string, which matches no table.
    let extension = match p.extension() {
        Some(raw) => raw.to_string_lossy().to_lowercase(),
        None => String::new(),
    };
    let extension = extension.as_str();

    if DOC_EXTS.contains(&extension) {
        Some(Corpus::Docs)
    } else if CODE_EXTS.contains(&extension) {
        Some(Corpus::Code)
    } else if CONFIG_EXTS.contains(&extension) {
        Some(Corpus::Config)
    } else {
        None
    }
}