use super::*;
/// Estimates how many chunks `total_source_bytes` of source will produce.
///
/// The estimate is one chunk per 500 bytes, rounded down, so anything under 500 bytes
/// yields `0`.
pub(crate) fn estimated_chunks(total_source_bytes: u64) -> u64 {
    const BYTES_PER_CHUNK: u64 = 500;
    total_source_bytes / BYTES_PER_CHUNK
}
/// Picks an embedding backend from an estimated chunk count.
///
/// Up to and including 5,000 chunks the result is `FastEmbed`; anything larger gets
/// `Model2Vec`.
pub(crate) fn recommend_backend(estimated_chunks: u64) -> EmbeddingBackend {
    const FASTEMBED_MAX_CHUNKS: u64 = 5_000;
    match estimated_chunks {
        0..=FASTEMBED_MAX_CHUNKS => EmbeddingBackend::FastEmbed,
        _ => EmbeddingBackend::Model2Vec,
    }
}
/// Returns the fixed one-line description associated with `backend`.
pub(crate) fn backend_label(backend: EmbeddingBackend) -> &'static str {
    let label: &'static str = match backend {
        EmbeddingBackend::FastEmbed => {
            "minilm — MiniLM transformer; best quality, CPU backfill ~10-100 chunks/sec"
        }
        EmbeddingBackend::Model2Vec => {
            "model2vec — static embeddings; ~100-500x faster on CPU, some quality cost"
        }
        EmbeddingBackend::None => "none — BM25 + structure only, no dense vectors",
    };
    label
}
/// Scans the tree rooted at `root` and returns the collected [`RepoScan`].
///
/// The walk uses an [`IgnoreMatcher`] compiled for `root` with an empty extra list, and
/// once it finishes the deferred `.h` headers are attributed via [`assign_headers`].
///
/// # Errors
///
/// Propagates any error from [`scan_dir`] or [`assign_headers`].
pub(crate) fn scan_repo(root: &Path) -> anyhow::Result<RepoScan> {
    let matcher = IgnoreMatcher::compile(root, &[]);
    let mut found = RepoScan::default();
    scan_dir(root, root, &matcher, &mut found)?;
    assign_headers(root, &mut found)?;
    Ok(found)
}
/// Attributes every deferred `.h` header to a single language and records it in the counts.
///
/// Headers go to C++ when the scan has counted at least one C++ file, and to C otherwise.
/// `scan.deferred_headers` is left empty afterwards.
///
/// # Errors
///
/// Propagates any error from [`add_file_to_dir_counts`].
pub(crate) fn assign_headers(root: &Path, scan: &mut RepoScan) -> anyhow::Result<()> {
    // Decide once, before any header is counted, so the choice only reflects real sources.
    let saw_cpp = scan.language_counts.get(&Language::Cpp).is_some_and(|&files| files > 0);
    let header_lang = if saw_cpp { Language::Cpp } else { Language::C };

    let headers = std::mem::take(&mut scan.deferred_headers);
    for header in &headers {
        *scan.language_counts.entry(header_lang).or_default() += 1;
        add_file_to_dir_counts(root, header, header_lang, scan)?;
    }
    Ok(())
}
/// Recursively walks `dir`, recording every recognised source file in `scan`.
///
/// Entries are visited in file-name order so the result does not depend on the order the
/// OS lists them in.
///
/// Directories:
/// - ignored directories are skipped without further inspection;
/// - a directory that [`is_virtualenv_dir`] accepts sets `scan.has_python_virtualenv` and is
///   not descended into;
/// - directories rejected by [`should_skip_dir`] are skipped;
/// - everything else is scanned recursively.
///
/// Files: a non-ignored regular file with a language known to [`Language::from_path`] adds
/// its size to `scan.total_source_bytes`. `.h` files classified as C are queued in
/// `scan.deferred_headers`; all other files are counted immediately.
///
/// # Errors
///
/// Fails if the directory cannot be listed, an entry's file type cannot be read, or a nested
/// scan fails.
pub(crate) fn scan_dir(
    root: &Path,
    dir: &Path,
    ignore: &IgnoreMatcher,
    scan: &mut RepoScan,
) -> anyhow::Result<()> {
    let mut entries = fs::read_dir(dir)?.collect::<Result<Vec<_>, io::Error>>()?;
    // `file_name()` allocates, so build each key once instead of once per comparison.
    entries.sort_by_cached_key(|entry| entry.file_name());

    for entry in entries {
        let path = entry.path();
        // `DirEntry::file_type` does not follow symlinks, so a symlink is neither a directory
        // nor a regular file here and is passed over.
        let file_type = entry.file_type()?;

        if file_type.is_dir() {
            // One ignore check decides both the virtualenv probe and the descent.
            if ignore.is_ignored(&path, true) {
                continue;
            }
            if is_virtualenv_dir(&path) {
                scan.has_python_virtualenv = true;
                continue;
            }
            if should_skip_dir(&entry.file_name().to_string_lossy()) {
                continue;
            }
            scan_dir(root, &path, ignore, scan)?;
        } else if file_type.is_file()
            && !ignore.is_ignored(&path, false)
            && let Some(language) = Language::from_path(&path)
        {
            // Best effort: a file whose metadata cannot be read contributes zero bytes.
            scan.total_source_bytes += entry.metadata().map(|metadata| metadata.len()).unwrap_or(0);
            if language == Language::C && path.extension().is_some_and(|ext| ext == "h") {
                // Which language owns `.h` files is decided later, in `assign_headers`.
                scan.deferred_headers.push(path);
            } else {
                *scan.language_counts.entry(language).or_default() += 1;
                add_file_to_dir_counts(root, &path, language, scan)?;
            }
        }
    }
    Ok(())
}
/// Records one `language` file located at `path` in the per-directory counters of `scan`.
///
/// The file's parent directory is taken relative to `root` (`"."` when the file sits directly
/// in `root`; the unmodified parent when it is not under `root`). Two maps are updated:
/// - `direct_dir_counts`: incremented for that directory only;
/// - `dir_counts`: incremented for `"."` and for every prefix of that directory's path.
///
/// This function always returns `Ok(())`.
pub(crate) fn add_file_to_dir_counts(
    root: &Path,
    path: &Path,
    language: Language,
    scan: &mut RepoScan,
) -> anyhow::Result<()> {
    let containing_dir = path.parent().unwrap_or(root);
    let rel_dir = match containing_dir.strip_prefix(root).unwrap_or(containing_dir) {
        empty if empty.as_os_str().is_empty() => Path::new("."),
        nested => nested,
    };

    let direct = scan.direct_dir_counts.entry(language).or_default();
    *direct.entry(rel_dir.to_path_buf()).or_default() += 1;

    let cumulative = scan.dir_counts.entry(language).or_default();
    // The root is counted once up front, so a `.` component is skipped below.
    *cumulative.entry(PathBuf::from(".")).or_default() += 1;

    let mut prefix = PathBuf::new();
    let segments =
        rel_dir.components().map(|component| component.as_os_str()).filter(|name| *name != ".");
    for segment in segments {
        prefix.push(segment);
        *cumulative.entry(prefix.clone()).or_default() += 1;
    }
    Ok(())
}
/// Returns `true` when `name` exactly matches one of the entries in `SKIPPED_DIRS`.
pub(crate) fn should_skip_dir(name: &str) -> bool {
    SKIPPED_DIRS.iter().any(|&skipped| skipped == name)
}
/// Lists the directories that can be offered as targets for `language`, sorted by path.
///
/// Steps:
/// 1. Every directory in `scan.dir_counts[language]` at depth 4 or less becomes a candidate,
///    flagged as default when [`default_dir`] accepts it.
/// 2. If nothing is flagged, the candidate with the most files is promoted to default.
///    Directories rejected by `fallback_excluded` are not eligible, and for Python `"."` is
///    eligible only when `python_root_has_direct_source` holds.
/// 3. The list is cut to the best 32 (defaults first, then higher counts, then path) and
///    returned in path order.
///
/// Returns an empty list when the scan has no counts for `language`.
pub(crate) fn candidate_dirs(scan: &RepoScan, language: Language) -> Vec<DirCandidate> {
    const MAX_DEPTH: usize = 4;
    const MAX_CANDIDATES: usize = 32;

    let Some(counts) = scan.dir_counts.get(&language) else {
        return Vec::new();
    };
    let mut candidates = counts
        .iter()
        .filter(|(path, _)| path_depth(path) <= MAX_DEPTH)
        .map(|(path, count)| DirCandidate {
            path: path.clone(),
            count: *count,
            default: default_dir(scan, language, path),
        })
        .collect::<Vec<_>>();

    if !candidates.iter().any(|candidate| candidate.default)
        && let Some(best) = candidates
            .iter_mut()
            .filter(|candidate| !fallback_excluded(language, &candidate.path))
            .filter(|candidate| {
                language != Language::Python
                    || candidate.path != Path::new(".")
                    || python_root_has_direct_source(scan, &candidate.path)
            })
            // Break count ties on the path so the winner never depends on the order the map
            // yields its entries in. The greatest path wins, which is what "last maximum"
            // selects when the entries arrive in path order.
            .max_by(|a, b| a.count.cmp(&b.count).then_with(|| a.path.cmp(&b.path)))
    {
        best.default = true;
    }

    // Rank before truncating so the cut keeps defaults and the busiest directories.
    candidates.sort_by(|a, b| {
        b.default
            .cmp(&a.default)
            .then_with(|| b.count.cmp(&a.count))
            .then_with(|| a.path.cmp(&b.path))
    });
    candidates.truncate(MAX_CANDIDATES);
    candidates.sort_by(|a, b| a.path.cmp(&b.path));
    candidates
}
/// Decides whether `path` should be pre-selected as a target directory for `language`.
///
/// The decision is made on the `display_rel` form of `path`. A directory that is `src` or
/// ends in `/src` qualifies for every language except Markdown; the remaining rules are
/// language specific:
/// - TypeScript: also paths ending in `/app`.
/// - Kotlin: also paths ending in `/src/main/java` or `/src/main/kotlin`.
/// - C / C++: also `include`, paths ending in `/include`, and any non-root directory that
///   directly holds a file of that language.
/// - Python: never a dependency directory; otherwise also any non-root directory that directly
///   holds Python files, and the root when `python_root_has_direct_source` holds.
/// - Markdown: only `docs` and `.`.
pub(crate) fn default_dir(scan: &RepoScan, language: Language, path: &Path) -> bool {
    let text = display_rel(path);
    let is_src = text == "src" || text.ends_with("/src");

    match language {
        Language::Rust => is_src,
        Language::TypeScript => is_src || text.ends_with("/app"),
        Language::Kotlin => {
            is_src
                || ["/src/main/java", "/src/main/kotlin"]
                    .iter()
                    .any(|suffix| text.ends_with(*suffix))
        }
        Language::C | Language::Cpp => {
            is_src
                || text == "include"
                || text.ends_with("/include")
                || directly_contains_source(scan, language, path)
        }
        Language::Python => {
            if is_python_dependency_dir(&text) {
                false
            } else {
                is_src
                    || directly_contains_source(scan, language, path)
                    || python_root_has_direct_source(scan, path)
            }
        }
        Language::Markdown => matches!(text.as_str(), "docs" | "."),
    }
}
/// Returns `true` when any `/`-separated component of `text` is one of a fixed set of
/// directory names: virtualenv-style names, `site-packages`, `__pycache__`, `.tox`, `.nox`
/// and `node_modules`.
///
/// Components must match exactly; `environment` does not match `env`.
fn is_python_dependency_dir(text: &str) -> bool {
    const DEPENDENCY_DIR_NAMES: [&str; 10] = [
        ".venv",
        "venv",
        "env",
        ".env",
        "virtualenv",
        "site-packages",
        "__pycache__",
        ".tox",
        ".nox",
        "node_modules",
    ];
    text.split('/').any(|segment| DEPENDENCY_DIR_NAMES.contains(&segment))
}
/// Returns `true` when `path` must not be promoted by the fallback in `candidate_dirs`.
///
/// Only Python has exclusions: a path whose `display_rel` form contains a dependency
/// directory component.
fn fallback_excluded(language: Language, path: &Path) -> bool {
    if language != Language::Python {
        return false;
    }
    let shown = display_rel(path);
    is_python_dependency_dir(&shown)
}
/// Returns `true` only when `path` is `"."`, the scan found no Python virtualenv, and at
/// least one Python file sits directly in the repository root.
fn python_root_has_direct_source(scan: &RepoScan, path: &Path) -> bool {
    if path != Path::new(".") || scan.has_python_virtualenv {
        return false;
    }
    scan.direct_dir_counts
        .get(&Language::Python)
        .and_then(|per_dir| per_dir.get(Path::new(".")))
        .is_some_and(|&files| files > 0)
}
/// Returns `true` when `path` is not `"."` and at least one `language` file was recorded
/// directly inside it (files in subdirectories do not count).
pub(crate) fn directly_contains_source(scan: &RepoScan, language: Language, path: &Path) -> bool {
    if path == Path::new(".") {
        return false;
    }
    scan.direct_dir_counts
        .get(&language)
        .and_then(|per_dir| per_dir.get(path))
        .is_some_and(|&files| files > 0)
}
/// Returns how many components `path` has, treating `"."` as depth 0.
pub(crate) fn path_depth(path: &Path) -> usize {
    match path {
        root if root == Path::new(".") => 0,
        nested => nested.components().count(),
    }
}
/// Prints one line to stdout per supported language that has at least one counted file.
///
/// Languages are listed in the order `supported_languages()` yields them; languages with
/// no files are omitted.
pub(crate) fn print_language_summary(scan: &RepoScan) {
    for language in supported_languages() {
        match scan.language_counts.get(&language) {
            Some(&count) if count > 0 => println!(" {}: {count} files", language.as_str()),
            _ => {}
        }
    }
}
#[cfg(test)]
mod header_assignment_tests {
    use std::sync::atomic::{AtomicU64, Ordering};

    use super::*;
    use crate::init::run::default_plan;

    /// Scratch directory that deletes itself when dropped.
    ///
    /// Cleanup in `Drop` also runs while a failed assertion unwinds, so a failing test does
    /// not leave its fixture behind in the system temp dir.
    struct TempRoot(PathBuf);

    impl TempRoot {
        fn path(&self) -> &Path {
            &self.0
        }
    }

    impl Drop for TempRoot {
        fn drop(&mut self) {
            // Best effort: a cleanup failure must not hide the test's own outcome.
            fs::remove_dir_all(&self.0).ok();
        }
    }

    /// Reserves a scratch path that is unique per tag, process and call.
    ///
    /// Any stale directory at that path is removed; the directory itself is not created.
    fn temp_root(tag: &str) -> TempRoot {
        static N: AtomicU64 = AtomicU64::new(0);
        let id = N.fetch_add(1, Ordering::Relaxed);
        let root =
            std::env::temp_dir().join(format!("ragrat-hdr-{tag}-{}-{id}", std::process::id()));
        fs::remove_dir_all(&root).ok();
        TempRoot(root)
    }

    /// A `.h` header in a repo that also has `.cpp` sources is counted and bound as C++.
    #[test]
    fn cpp_project_binds_h_headers_as_cpp() {
        let guard = temp_root("cpp");
        let root = guard.path();
        fs::create_dir_all(root.join("include/lib")).unwrap();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(root.join("include/lib/api.h"), "class Api { void run(); };\n").unwrap();
        fs::write(root.join("src/api.cpp"), "#include \"lib/api.h\"\nvoid Api::run() {}\n")
            .unwrap();

        let scan = scan_repo(root).unwrap();

        assert_eq!(scan.language_counts.get(&Language::C).copied().unwrap_or(0), 0, "no C files");
        assert_eq!(
            scan.language_counts.get(&Language::Cpp).copied().unwrap_or(0),
            2,
            "header + src"
        );
        assert!(scan.dir_counts.get(&Language::Cpp).unwrap().contains_key(Path::new("include")));

        let plan = default_plan(".".to_string(), &scan);
        let cpp = &plan.bindings[&Language::Cpp];
        assert!(cpp.contains(&PathBuf::from("include")), "cpp must bind the header dir: {cpp:?}");
        assert!(!plan.bindings.contains_key(&Language::C), "no C binding for a C++-only repo");
    }

    /// Without any C++ source, `.h` headers stay attributed to C.
    #[test]
    fn pure_c_project_keeps_h_headers_as_c() {
        let guard = temp_root("c");
        let root = guard.path();
        fs::create_dir_all(root.join("src")).unwrap();
        fs::write(root.join("src/lib.h"), "int f(void);\n").unwrap();
        fs::write(root.join("src/lib.c"), "int f(void){return 0;}\n").unwrap();

        let scan = scan_repo(root).unwrap();

        assert_eq!(scan.language_counts.get(&Language::C).copied().unwrap_or(0), 2, ".c + .h");
        assert_eq!(scan.language_counts.get(&Language::Cpp).copied().unwrap_or(0), 0, "no C++");
    }
}
#[cfg(test)]
mod python_dir_tests {
    use super::*;

    /// Scratch directory that deletes itself when dropped.
    ///
    /// Cleanup in `Drop` also runs while a failed assertion unwinds, so a failing test does
    /// not leave its fixture behind in the system temp dir.
    struct ScratchDir(PathBuf);

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best effort: a cleanup failure must not hide the test's own outcome.
            fs::remove_dir_all(&self.0).ok();
        }
    }

    /// Dependency detection matches whole path components only.
    #[test]
    fn python_dependency_dir_detection() {
        assert!(is_python_dependency_dir(".venv"));
        assert!(is_python_dependency_dir("env"));
        assert!(is_python_dependency_dir("project/.venv/lib/site-packages"));
        assert!(!is_python_dependency_dir("src"));
        assert!(!is_python_dependency_dir("app"));
    }

    /// A directory counts as a virtualenv because of `pyvenv.cfg`, not because of its name.
    #[test]
    fn virtualenv_detected_by_content_not_name() {
        let scratch = ScratchDir(
            std::env::temp_dir().join(format!("rag-rat-venv-detect-{}", std::process::id())),
        );
        let tmp = &scratch.0;
        // Clear leftovers from an earlier run that may have reused this process id.
        fs::remove_dir_all(tmp).ok();

        fs::create_dir_all(tmp.join("env")).unwrap();
        fs::write(tmp.join("env/pyvenv.cfg"), "home = /usr\n").unwrap();
        assert!(is_virtualenv_dir(&tmp.join("env")), "a pyvenv.cfg dir is a venv");

        fs::create_dir_all(tmp.join("src/virtualenv")).unwrap();
        fs::write(tmp.join("src/virtualenv/__init__.py"), "").unwrap();
        assert!(
            !is_virtualenv_dir(&tmp.join("src/virtualenv")),
            "the virtualenv package dir has no pyvenv.cfg"
        );
    }

    /// A repo whose only Python directory is named like a virtualenv gets no default.
    #[test]
    fn fallback_does_not_promote_a_venv_only_python_repo() {
        let mut scan = RepoScan::default();
        let dir = PathBuf::from("env");
        scan.dir_counts.entry(Language::Python).or_default().insert(dir.clone(), 9);
        scan.direct_dir_counts.entry(Language::Python).or_default().insert(dir, 9);

        let candidates = candidate_dirs(&scan, Language::Python);
        assert!(
            candidates.iter().all(|candidate| !candidate.default),
            "a virtualenv dir must never be selected as the default Python target: {candidates:?}"
        );
    }

    /// With Python only below a dependency tree, not even `.` may be promoted.
    #[test]
    fn fallback_does_not_promote_dot_when_python_lives_only_under_a_dependency_tree() {
        let root = Path::new("/repo");
        let mut scan = RepoScan::default();
        for name in ["a.py", "b.py"] {
            add_file_to_dir_counts(
                root,
                &root.join("env/lib/site-packages/pkg").join(name),
                Language::Python,
                &mut scan,
            )
            .unwrap();
        }

        let candidates = candidate_dirs(&scan, Language::Python);
        assert!(
            candidates.iter().all(|candidate| !candidate.default),
            "no binding for an env-only repo (not even `.`): {candidates:?}"
        );
    }

    /// A root-level script and a package directory are both selected by default.
    #[test]
    fn root_entrypoints_default_alongside_a_package_dir() {
        let root = Path::new("/repo");
        let mut scan = RepoScan::default();
        add_file_to_dir_counts(root, &root.join("manage.py"), Language::Python, &mut scan).unwrap();
        for name in ["__init__.py", "views.py"] {
            add_file_to_dir_counts(
                root,
                &root.join("myapp").join(name),
                Language::Python,
                &mut scan,
            )
            .unwrap();
        }

        let candidates = candidate_dirs(&scan, Language::Python);
        let default_paths: Vec<String> =
            candidates.iter().filter(|c| c.default).map(|c| display_rel(&c.path)).collect();
        assert!(
            default_paths.contains(&".".to_string()),
            "root entrypoints make `.` a default: {candidates:?}"
        );
        assert!(
            default_paths.contains(&"myapp".to_string()),
            "the package dir is still a default: {candidates:?}"
        );
    }
}