//! repo_map — Aider-style ranked symbol outline of the workspace.
//!
//! Small coding brains localize by *guessing* grep patterns; when they don't
//! know the exact symbol they spiral (observed on qwen3.6-35b q3: a "where is
//! X configured" query burned the whole iteration budget guessing regexes).
//! `repo_map` hands the brain a map instead. For a natural-language query it:
//! 1. walks the workspace gitignore-aware (ripgrep's `ignore` crate, same as
//! grep_search) so build/dep/VCS dirs and *.log noise are skipped,
//! 2. extracts top-level definitions per file via language-aware patterns
//! (rust / python / js-ts / go) with line numbers + signature snippets,
//! 3. ranks files by how well their symbol names + path match the query
//! tokens, and returns the top files with their best symbols.
//!
//! One call replaces N grep guesses, and the signature snippet frequently
//! carries the answer itself (`const DEFAULT_MAX_FIX_ROUNDS: u32 = 3;`) so the
//! brain can answer a "what's the default" question without a follow-up read.
//!
//! `mode="refs"` is the exhaustive counterpart: a literal substring scan over
//! every readable file (not just the 4 definition languages) that returns a
//! deduped `distinct_names` enumeration plus source-first / docs-last hit
//! lists — for "list everywhere X is used" and "what's the real value past
//! the stale docs". See [`run_repo_refs`].
use std::path::Path;
use regex::Regex;
use serde_json::{json, Value};
use super::{validate_read_path, MAX_FILE_BYTES};
/// Upper bound on files counted during one walk. Once the counter reaches
/// this value the walk stops and the response is flagged `truncated`. Used
/// by both map mode and refs mode.
const MAX_FILES_SCANNED: usize = 5000;
/// Map mode: maximum number of ranked files returned in `files`.
const MAX_RESULT_FILES: usize = 15;
/// Map mode: maximum number of symbols kept for each returned file.
const MAX_SYMBOLS_PER_FILE: usize = 40;
/// Maximum length, in characters, of a `sig` snippet (the trimmed source
/// line a symbol or hit was found on).
const MAX_SIG_CHARS: usize = 160;
/// `mode="refs"`: cap on individual hit lines returned, applied separately
/// to the source partition and to the doc partition (so a response can carry
/// up to twice this many hits). Higher than grep's 100 because the per-hit
/// payload is just file/line/snippet. `distinct_names` is collected from
/// every matching line regardless of this cap; it is bounded only by
/// `MAX_DISTINCT_NAMES` and the file-scan cap.
const MAX_REF_HITS: usize = 120;
/// `mode="refs"`: cap on the deduped identifier list (the enumeration
/// answer). 200 distinct matches of one needle is already pathological.
const MAX_DISTINCT_NAMES: usize = 200;
/// File extensions that refs mode treats as authoritative source code.
///
/// A hit in a file with one of these extensions lands in the `source`
/// partition; a hit in any other file (`.md`, `.txt`, `.toml`, `.json`,
/// `.yaml`, …) lands in the `doc` partition, which is reported after the
/// source hits — so the real `const X = 3;` is seen before a stale doc
/// claiming `2`.
const SOURCE_EXTENSIONS: &[&str] = &[
    "rs", "py", "js", "mjs", "cjs", "jsx", "ts", "tsx", "go", "java", "c", "cc", "cpp", "cxx", "h",
    "hpp", "rb", "php", "sh", "bash", "sql", "kt", "swift", "scala", "cs",
];

/// Returns `true` when `path` has a UTF-8 extension that matches one of
/// [`SOURCE_EXTENSIONS`], ignoring ASCII case. Paths with no extension (or a
/// non-UTF-8 one) are not source files.
fn is_source_file(path: &Path) -> bool {
    let Some(ext) = path.extension().and_then(|raw| raw.to_str()) else {
        return false;
    };
    SOURCE_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
/// Tool schemas exported by this module: a single function-style entry
/// describing `repo_map` and its `query` / `path` / `mode` / `name`
/// parameters (`query` is the only required one).
pub(super) fn schemas() -> Vec<Value> {
    let repo_map_tool = json!({
        "type": "function",
        "function": {
            "name": "repo_map",
            "description": "Localize code by concept. Returns a ranked outline of the workspace: the files whose top-level definitions (functions, types, constants) best match `query`, each with line numbers + signature snippets. Use this FIRST to find where something lives instead of guessing grep patterns — the snippet often shows the value/signature directly. Then read_file the cited line if you need more. (Languages: Rust, Python, JS/TS, Go.) For an exhaustive list of everywhere a name is used (or to pin the real source value past stale docs), call with mode='refs' and name='<exact text>'.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": { "type": "string", "description": "What you're looking for, in words or symbol fragments (e.g. 'forge fix loop max rounds default')" },
                    "path": { "type": "string", "description": "Directory to map (default: the workspace/project root)" },
                    "mode": { "type": "string", "enum": ["map", "refs"], "description": "map (default): ranked outline of code DEFINITIONS matching a concept. refs: exhaustive deduped list of every place a name appears (definitions, calls, env-var string literals, comments) — source files first, docs last. Use refs to enumerate ALL occurrences of something, or to find the authoritative source value when docs might be stale." },
                    "name": { "type": "string", "description": "mode=refs only: the exact text to find everywhere, e.g. 'CLAUDETTE_FORGE_' (prefix match finds all CLAUDETTE_FORGE_* vars) or 'DEFAULT_MAX_FIX_ROUNDS'. Case-sensitive literal substring." }
                },
                "required": ["query"]
            }
        }
    });
    vec![repo_map_tool]
}
/// Routes a tool call to this module.
///
/// Returns `None` when `name` is not `"repo_map"` (the call is not handled
/// here); otherwise returns `Some` with the outcome of running the tool on
/// the raw JSON `input`.
pub(super) fn dispatch(name: &str, input: &str) -> Option<Result<String, String>> {
    if name != "repo_map" {
        return None;
    }
    Some(run_repo_map(input))
}
/// One definition found by map mode in a scanned file.
struct Symbol {
/// 1-based line number of the definition within its file.
line: usize,
/// Kind label taken from the matching pattern (e.g. `"fn"`, `"struct"`, `"class"`).
kind: &'static str,
/// Symbol name: capture group 1 of the matching pattern.
name: String,
/// The definition's source line, trimmed and cut to `MAX_SIG_CHARS` characters.
sig: String,
}
fn run_repo_map(input: &str) -> Result<String, String> {
let v: Value = serde_json::from_str(input)
.map_err(|e| format!("repo_map: invalid JSON ({e}): {input}"))?;
let query = v
.get("query")
.and_then(Value::as_str)
.ok_or("repo_map: missing 'query'")?;
if query.trim().is_empty() {
return Err("repo_map: query is empty".to_string());
}
// Default root: active mission tree → workspace cwd → first workspace root
// → $HOME (mirrors grep_search's resolution).
let default_path: String;
let path_str = match v.get("path").and_then(Value::as_str) {
Some(s) => s,
None => {
default_path = if let Some(m) = crate::missions::active_mission() {
m.path.display().to_string()
} else if let Some(root) = crate::tools::default_workspace_root() {
root.display().to_string()
} else {
"~".to_string()
};
default_path.as_str()
}
};
let root = validate_read_path(path_str)?;
if !root.is_dir() {
return Err(format!("repo_map: {} is not a directory", root.display()));
}
// mode="refs": exhaustive occurrence scan. Needle = `name` if given,
// else fall back to `query` (forgiving — a brain that forgets `name`
// still works). Default mode is "map" (everything below), so existing
// behaviour is byte-for-byte unchanged.
let mode = v.get("mode").and_then(Value::as_str).unwrap_or("map");
if mode == "refs" {
let needle = v
.get("name")
.and_then(Value::as_str)
.filter(|s| !s.trim().is_empty())
.unwrap_or(query);
return run_repo_refs(&root, needle);
}
let query_tokens = tokenize(query);
// (file, score, symbols) accumulated across the walk.
let mut scored: Vec<(String, usize, Vec<Symbol>)> = Vec::new();
let mut files_scanned = 0usize;
let mut truncated = false;
let walker = ignore::WalkBuilder::new(&root)
.hidden(true)
.git_ignore(true)
.git_global(true)
.git_exclude(true)
.parents(true)
.follow_links(false)
.filter_entry(|entry| {
if entry.file_type().is_some_and(|ft| ft.is_dir()) {
let name = entry.file_name().to_string_lossy();
if super::SEARCH_SKIP_DIRS.contains(&name.as_ref()) {
return false;
}
}
true
})
.build();
for result in walker {
let Ok(entry) = result else { continue };
if !entry.file_type().is_some_and(|ft| ft.is_file()) {
continue;
}
if files_scanned >= MAX_FILES_SCANNED {
truncated = true;
break;
}
let p = entry.path();
let Some(patterns) = patterns_for(p) else {
continue;
};
files_scanned += 1;
let Ok(meta) = entry.metadata() else { continue };
if meta.len() > MAX_FILE_BYTES as u64 {
continue;
}
let Ok(content) = std::fs::read_to_string(p) else {
continue;
};
let path_tokens = tokenize(&p.to_string_lossy());
let mut symbols: Vec<(usize, Symbol)> = Vec::new(); // (sym_score, symbol)
let mut file_score = 0usize;
for (lineno, line) in content.lines().enumerate() {
for (kind, re) in &patterns {
if let Some(caps) = re.captures(line) {
if let Some(name) = caps.get(1).map(|m| m.as_str()) {
let name_tokens = tokenize(name);
let sym_score = query_tokens
.iter()
.filter(|qt| name_tokens.iter().any(|nt| nt == *qt))
.count();
file_score += sym_score * 2;
symbols.push((
sym_score,
Symbol {
line: lineno + 1,
kind,
name: name.to_string(),
sig: line.trim().chars().take(MAX_SIG_CHARS).collect(),
},
));
break; // one kind per line
}
}
}
}
// Path-token overlap (a query mentioning "search" should surface
// search.rs even if no symbol name matches).
file_score += query_tokens
.iter()
.filter(|qt| path_tokens.iter().any(|pt| pt == *qt))
.count();
if file_score == 0 || symbols.is_empty() {
continue;
}
// Best-scoring symbols first, then source order; cap per file.
symbols.sort_by(|a, b| b.0.cmp(&a.0).then(a.1.line.cmp(&b.1.line)));
let kept: Vec<Symbol> = symbols
.into_iter()
.take(MAX_SYMBOLS_PER_FILE)
.map(|(_, s)| s)
.collect();
scored.push((p.display().to_string(), file_score, kept));
}
scored.sort_by_key(|f| std::cmp::Reverse(f.1));
let result_files = scored.len().min(MAX_RESULT_FILES);
let files_json: Vec<Value> = scored
.into_iter()
.take(MAX_RESULT_FILES)
.map(|(file, score, syms)| {
json!({
"file": file,
"score": score,
"symbols": syms.iter().map(|s| json!({
"line": s.line,
"kind": s.kind,
"name": s.name,
"sig": s.sig,
})).collect::<Vec<_>>(),
})
})
.collect();
Ok(json!({
"query": query,
"root": root.display().to_string(),
"files_scanned": files_scanned,
"result_files": result_files,
"truncated": truncated,
"files": files_json,
})
.to_string())
}
/// One line that contains the refs-mode needle.
struct RefHit {
/// Display form of the path of the file the line was found in.
file: String,
/// 1-based line number within that file.
line: usize,
/// The matching line, trimmed and cut to `MAX_SIG_CHARS` characters.
sig: String,
}
/// `mode="refs"` — exhaustive, deduped, source-first occurrence scan.
///
/// Unlike map mode (which extracts only top-level *definitions* in 4
/// languages), refs scans the literal text of EVERY readable file
/// (gitignore-aware, build dirs skipped) for `needle` as a case-sensitive
/// substring. This is the only mechanism that surfaces things that are not
/// definitions — env-var string literals (`std::env::var("CLAUDETTE_FORGE_…")`),
/// call sites, doc/comment mentions — which is exactly the enumerate /
/// deep-locate gap map mode can't close.
///
/// Two payloads do the heavy lifting for a weak brain:
/// - `distinct_names`: the deduped, sorted set of ASCII identifiers
///   (`[A-Za-z_][A-Za-z0-9_]*`) containing the needle, collected from every
///   matching line independently of the hit caps — the brain copies it
///   verbatim as the enumeration answer, no per-line counting. A needle that
///   contains non-identifier characters still produces hits but no names.
/// - `source_hits` vs `doc_hits`: source files first, docs quarantined —
///   so the authoritative `const X = 3;` outranks a stale doc saying `2`.
///
/// The response also carries `mode`, `needle`, `root`, `files_scanned`,
/// `distinct_count`, `total_hits` (matching lines, uncapped) and `truncated`.
/// `truncated` is set only when something was actually left out: the file
/// cap stopped the walk, a hit did not fit in its partition, or a NEW
/// identifier did not fit in `distinct_names`.
///
/// # Errors
/// Returns `Err` when `needle` is blank.
fn run_repo_refs(root: &Path, needle: &str) -> Result<String, String> {
    if needle.trim().is_empty() {
        return Err("repo_map(refs): empty name/query".to_string());
    }
    // Identifier tokens containing the needle → the deduped enumeration
    // answer. BTreeSet keeps it sorted + unique for free.
    let ident_re = Regex::new(r"[A-Za-z_][A-Za-z0-9_]*").expect("static ident regex");
    let mut distinct: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
    let mut source_hits: Vec<RefHit> = Vec::new();
    let mut doc_hits: Vec<RefHit> = Vec::new();
    let mut total_hits = 0usize;
    let mut files_scanned = 0usize;
    let mut truncated = false;

    let walker = ignore::WalkBuilder::new(root)
        .hidden(true)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .parents(true)
        .follow_links(false)
        .filter_entry(|entry| {
            if entry.file_type().is_some_and(|ft| ft.is_dir()) {
                let name = entry.file_name().to_string_lossy();
                if super::SEARCH_SKIP_DIRS.contains(&name.as_ref()) {
                    return false;
                }
            }
            true
        })
        .build();

    for result in walker {
        let Ok(entry) = result else { continue };
        if !entry.file_type().is_some_and(|ft| ft.is_file()) {
            continue;
        }
        if files_scanned >= MAX_FILES_SCANNED {
            truncated = true;
            break;
        }
        let p = entry.path();
        let Ok(meta) = entry.metadata() else { continue };
        if meta.len() > MAX_FILE_BYTES as u64 {
            continue;
        }
        // Language-agnostic: any file that reads as UTF-8 text is scanned
        // (binaries fail read_to_string and are skipped). Docs/config are
        // in scope precisely because they carry the conflicting values.
        let Ok(content) = std::fs::read_to_string(p) else {
            continue;
        };
        files_scanned += 1;
        let source = is_source_file(p);

        for (lineno, line) in content.lines().enumerate() {
            if !line.contains(needle) {
                continue;
            }
            // Enumeration answer: every full identifier on this line that
            // contains the needle, independent of the per-partition hit caps.
            for m in ident_re.find_iter(line) {
                let ident = m.as_str();
                // Names already recorded cost nothing and lose nothing, so
                // they must not trip the truncation flag once the set is full.
                if !ident.contains(needle) || distinct.contains(ident) {
                    continue;
                }
                if distinct.len() < MAX_DISTINCT_NAMES {
                    distinct.insert(ident.to_string());
                } else {
                    // A genuinely new name was dropped: the enumeration is
                    // now partial — say so, don't let the brain treat it as
                    // exhaustive.
                    truncated = true;
                }
            }
            total_hits += 1;
            // PER-PARTITION caps, NOT a joint budget: the walk order has
            // nothing to do with source-vs-doc priority (docs/ can easily be
            // visited before src/), so a joint cap would let 120+ doc lines
            // exhaust the budget before the first source file is read —
            // starving source_hits and inverting the whole source-first (I3)
            // guarantee. Each partition gets its own MAX_REF_HITS so the
            // authoritative source value always survives.
            let hits = if source {
                &mut source_hits
            } else {
                &mut doc_hits
            };
            if hits.len() < MAX_REF_HITS {
                hits.push(RefHit {
                    file: p.display().to_string(),
                    line: lineno + 1,
                    sig: line.trim().chars().take(MAX_SIG_CHARS).collect(),
                });
            } else {
                truncated = true;
            }
        }
    }

    // Stable, useful order: by file then line within each partition.
    let by_file_line = |a: &RefHit, b: &RefHit| a.file.cmp(&b.file).then(a.line.cmp(&b.line));
    source_hits.sort_by(by_file_line);
    doc_hits.sort_by(by_file_line);
    let to_json = |hits: &[RefHit]| -> Vec<Value> {
        hits.iter()
            .map(|h| json!({ "file": h.file, "line": h.line, "sig": h.sig }))
            .collect()
    };
    let distinct_names: Vec<String> = distinct.into_iter().collect();
    Ok(json!({
        "mode": "refs",
        "needle": needle,
        "root": root.display().to_string(),
        "files_scanned": files_scanned,
        "distinct_names": distinct_names,
        "distinct_count": distinct_names.len(),
        "source_hits": to_json(&source_hits),
        "doc_hits": to_json(&doc_hits),
        "total_hits": total_hits,
        "truncated": truncated,
    })
    .to_string())
}
/// Break an identifier, path or natural-language query into lowercase word
/// tokens.
///
/// The input is first cut at every non-alphanumeric character (so
/// `snake_case`, `kebab-case` and path separators all split), then each
/// piece is cut again wherever an uppercase letter directly follows a
/// lowercase letter or ASCII digit (`maxFixRounds` → max, fix, rounds). Runs
/// of uppercase letters stay together. Every piece then goes through
/// [`push_token`], which drops 1-character tokens and a small stoplist of
/// filler words so "where is the X" reduces to X.
fn tokenize(s: &str) -> Vec<String> {
    const STOP: &[&str] = &[
        "the", "is", "are", "of", "in", "to", "where", "what", "how", "does", "do", "it", "this",
        "that", "for", "and", "or", "a", "an", "be", "on", "at",
    ];
    let mut tokens = Vec::new();
    let words = s
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| !word.is_empty());
    for word in words {
        // Byte offset where the piece currently being read begins.
        let mut start = 0usize;
        let mut after_lower_or_digit = false;
        for (idx, ch) in word.char_indices() {
            if after_lower_or_digit && ch.is_uppercase() {
                push_token(&mut tokens, &word[start..idx], STOP);
                start = idx;
            }
            after_lower_or_digit = ch.is_lowercase() || ch.is_ascii_digit();
        }
        push_token(&mut tokens, &word[start..], STOP);
    }
    tokens
}

/// Append the lowercase form of `tok` to `out`, unless `tok` is shorter than
/// two characters or its lowercase form appears in `stop`.
fn push_token(out: &mut Vec<String>, tok: &str, stop: &[&str]) {
    let long_enough = tok.chars().nth(1).is_some();
    if long_enough {
        let lowered = tok.to_lowercase();
        let is_filler = stop.iter().any(|word| *word == lowered.as_str());
        if !is_filler {
            out.push(lowered);
        }
    }
}
/// Language-aware definition patterns for a file, or `None` if the path has
/// no UTF-8 extension or the extension isn't a supported source language
/// (Rust, Python, JS/TS, Go). The extension is matched case-insensitively.
///
/// Each entry pairs a kind label with a `Regex` that is anchored at the
/// start of a line (leading whitespace allowed) and captures the symbol NAME
/// in group 1. Entries are ordered: callers that stop at the first match get
/// the most specific kind (e.g. `const fn` is reported as `fn`, not `const`).
///
/// Every regex is compiled on each call. A caller that invokes this for
/// many files should keep the result per extension rather than calling it
/// again for every file.
fn patterns_for(path: &Path) -> Option<Vec<(&'static str, Regex)>> {
    let ext = path.extension()?.to_str()?.to_lowercase();
    let pats: Vec<(&'static str, &str)> = match ext.as_str() {
        "rs" => vec![
            (
                "fn",
                // Qualifiers are accepted in any order and number. Rust
                // writes them `const async unsafe extern "ABI"`, so a fixed
                // `async unsafe const` sequence misses `const unsafe fn` and
                // lets the `const` pattern below report a constant named
                // `unsafe`; it also misses every `extern "C" fn`.
                r#"^\s*(?:pub(?:\([^)]*\))?\s+)?(?:(?:const|async|unsafe|extern(?:\s*"[^"]*")?)\s+)*fn\s+([A-Za-z_]\w*)"#,
            ),
            (
                "struct",
                r"^\s*(?:pub(?:\([^)]*\))?\s+)?struct\s+([A-Za-z_]\w*)",
            ),
            (
                "enum",
                r"^\s*(?:pub(?:\([^)]*\))?\s+)?enum\s+([A-Za-z_]\w*)",
            ),
            (
                "trait",
                r"^\s*(?:pub(?:\([^)]*\))?\s+)?trait\s+([A-Za-z_]\w*)",
            ),
            (
                "type",
                r"^\s*(?:pub(?:\([^)]*\))?\s+)?type\s+([A-Za-z_]\w*)",
            ),
            (
                "const",
                r"^\s*(?:pub(?:\([^)]*\))?\s+)?const\s+([A-Za-z_]\w*)",
            ),
            (
                "static",
                r"^\s*(?:pub(?:\([^)]*\))?\s+)?static\s+(?:mut\s+)?([A-Za-z_]\w*)",
            ),
            ("mod", r"^\s*(?:pub(?:\([^)]*\))?\s+)?mod\s+([A-Za-z_]\w*)"),
            ("macro", r"^\s*macro_rules!\s+([A-Za-z_]\w*)"),
        ],
        "py" => vec![
            ("def", r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)"),
            ("class", r"^\s*class\s+([A-Za-z_]\w*)"),
        ],
        "js" | "mjs" | "cjs" | "jsx" | "ts" | "tsx" => vec![
            (
                "function",
                // The generator star may sit on either side of the space
                // (`function* gen`, `function *gen`, `function*gen`).
                r"^\s*(?:export\s+)?(?:default\s+)?(?:async\s+)?function(?:\s*\*\s*|\s+)([A-Za-z_$][\w$]*)",
            ),
            (
                "class",
                r"^\s*(?:export\s+)?(?:default\s+)?(?:abstract\s+)?class\s+([A-Za-z_$][\w$]*)",
            ),
            (
                "type",
                r"^\s*(?:export\s+)?(?:interface|type|enum)\s+([A-Za-z_$][\w$]*)",
            ),
            (
                "const",
                r"^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=",
            ),
        ],
        "go" => vec![
            ("func", r"^\s*func\s+(?:\([^)]*\)\s*)?([A-Za-z_]\w*)"),
            ("type", r"^\s*type\s+([A-Za-z_]\w*)"),
            ("const", r"^\s*(?:const|var)\s+([A-Za-z_]\w*)"),
        ],
        _ => return None,
    };
    // A pattern that fails to compile is skipped rather than aborting the
    // whole table; the remaining kinds stay usable.
    Some(
        pats.into_iter()
            .filter_map(|(kind, p)| Regex::new(p).ok().map(|re| (kind, re)))
            .collect(),
    )
}
// Unit tests for the tokenizer and the Rust pattern table, plus end-to-end
// tests that build a small fixture tree on disk and run `run_repo_map` in
// map mode and refs mode against it.
#[cfg(test)]
mod tests {
use super::*;
// Tokenizer contract: snake_case and camelCase split into lowercase words,
// and natural-language filler words are dropped.
#[test]
fn tokenize_splits_snake_and_camel_and_drops_filler() {
assert_eq!(tokenize("max_fix_rounds"), ["max", "fix", "rounds"]);
assert_eq!(tokenize("maxFixRounds"), ["max", "fix", "rounds"]);
assert_eq!(
tokenize("DEFAULT_MAX_FIX_ROUNDS"),
["default", "max", "fix", "rounds"]
);
// NL query: filler dropped, content kept.
assert_eq!(
tokenize("where is the fix loop max rounds default"),
["fix", "loop", "max", "rounds", "default"]
);
}
// The Rust pattern table must label a plain `fn` as "fn" and a `const` item
// as "const", capturing the symbol name in group 1.
#[test]
fn rust_patterns_capture_fn_and_const_with_value() {
let pats = patterns_for(Path::new("x.rs")).unwrap();
let line_fn = "fn max_fix_rounds() -> u32 {";
let line_const = "const DEFAULT_MAX_FIX_ROUNDS: u32 = 3;";
let hit_fn = pats
.iter()
.find_map(|(k, re)| re.captures(line_fn).map(|c| (*k, c[1].to_string())));
let hit_const = pats
.iter()
.find_map(|(k, re)| re.captures(line_const).map(|c| (*k, c[1].to_string())));
assert_eq!(hit_fn, Some(("fn", "max_fix_rounds".to_string())));
assert_eq!(
hit_const,
Some(("const", "DEFAULT_MAX_FIX_ROUNDS".to_string()))
);
}
// Map mode end-to-end: of two fixture files, the one whose symbol names
// match the query ranks first, and its `sig` snippets expose the constant's
// value.
#[test]
fn repo_map_ranks_the_matching_file_first_with_value_in_sig() {
let _eg = crate::test_env_lock(); // home-resolving: serialize vs temp-home swaps
let base = super::super::user_home()
.join(".claudette")
.join("files")
.join("claudette-repomap-test-k3");
let _ = std::fs::remove_dir_all(&base);
std::fs::create_dir_all(base.join("src")).unwrap();
std::fs::write(
base.join("src").join("run.rs"),
"const DEFAULT_MAX_FIX_ROUNDS: u32 = 3;\nfn max_fix_rounds() -> u32 { 3 }\n",
)
.unwrap();
std::fs::write(
base.join("src").join("notes.rs"),
"fn note_create() {}\nstruct Note {}\n",
)
.unwrap();
let input = json!({
"query": "forge fix loop max rounds default",
"path": base.to_str().unwrap()
})
.to_string();
let out = run_repo_map(&input).unwrap();
let v: Value = serde_json::from_str(&out).unwrap();
let first = &v["files"][0];
assert!(
first["file"]
.as_str()
.unwrap()
.replace('\\', "/")
.contains("/src/run.rs"),
"run.rs should rank first: {out}"
);
// The const's sig snippet carries the answer (= 3) directly.
let sigs: String = first["symbols"]
.as_array()
.unwrap()
.iter()
.map(|s| s["sig"].as_str().unwrap())
.collect::<Vec<_>>()
.join(" | ");
assert!(sigs.contains("= 3"), "expected the value in a sig: {sigs}");
// map mode response shape: has `files`, no refs-only keys.
assert!(v.get("files").is_some(), "map mode must keep `files`");
assert!(
v.get("distinct_names").is_none(),
"map mode must NOT carry refs-only keys"
);
let _ = std::fs::remove_dir_all(&base);
}
/// Build a throwaway fixture tree under a unique dir, run a closure with
/// its path, then clean up. Holds the env lock (home-resolving).
///
/// `files` is a list of (relative path, content) pairs; parent directories
/// are created as needed. Any leftover tree from an earlier run with the
/// same `tag` is removed first. The final cleanup is skipped if `f` panics.
fn with_refs_fixture<F: FnOnce(&str)>(tag: &str, files: &[(&str, &str)], f: F) {
let _eg = crate::test_env_lock();
let base = super::super::user_home()
.join(".claudette")
.join("files")
.join(format!("claudette-refs-test-{tag}"));
let _ = std::fs::remove_dir_all(&base);
for (rel, content) in files {
let p = base.join(rel);
std::fs::create_dir_all(p.parent().unwrap()).unwrap();
std::fs::write(&p, content).unwrap();
}
f(base.to_str().unwrap());
let _ = std::fs::remove_dir_all(&base);
}
#[test]
fn repo_refs_enumerates_all_distinct_prefix_matches() {
// I1 proxy: the 6 CLAUDETTE_FORGE_* vars exist ONLY as env-read
// string literals / comments / error strings — ZERO definitions —
// so map mode finds none. refs must enumerate all 6, deduped across
// duplicate lines and across files.
let run_rs = r#"
// CLAUDETTE_FORGE_ABORT_WINDOW_SECS controls the abort window
let _ = std::env::var("CLAUDETTE_FORGE_ABORT_WINDOW_SECS");
if env_flag_enabled("CLAUDETTE_FORGE_ALLOW_DIRTY") {}
if env_flag_enabled("CLAUDETTE_FORGE_ALLOW_DIRTY") {} // duplicate line
let _ = std::env::var("CLAUDETTE_FORGE_AUTO_APPROVE");
let _ = std::env::var("CLAUDETTE_FORGE_SUBMIT_ON_FAIL");
"#;
let sec_rs = r#"
if env_flag_enabled("CLAUDETTE_FORGE_SECURITY_OVERRIDE") {}
// also CLAUDETTE_FORGE_SECURITY_REVIEW in this comment
"#;
with_refs_fixture(
"i1",
&[("src/run.rs", run_rs), ("src/security_review.rs", sec_rs)],
|path| {
let input = json!({
"query": "forge env vars",
"mode": "refs",
"name": "CLAUDETTE_FORGE_",
"path": path
})
.to_string();
let out = run_repo_map(&input).unwrap();
let v: Value = serde_json::from_str(&out).unwrap();
assert_eq!(v["mode"], "refs");
let names: Vec<&str> = v["distinct_names"]
.as_array()
.unwrap()
.iter()
.map(|n| n.as_str().unwrap())
.collect();
let expected = [
"CLAUDETTE_FORGE_ABORT_WINDOW_SECS",
"CLAUDETTE_FORGE_ALLOW_DIRTY",
"CLAUDETTE_FORGE_AUTO_APPROVE",
"CLAUDETTE_FORGE_SECURITY_OVERRIDE",
"CLAUDETTE_FORGE_SECURITY_REVIEW",
"CLAUDETTE_FORGE_SUBMIT_ON_FAIL",
];
for e in expected {
assert!(names.contains(&e), "missing {e} in {names:?}");
}
assert_eq!(v["distinct_count"], 6, "deduped to exactly 6: {names:?}");
},
);
}
#[test]
fn repo_refs_puts_source_before_docs_with_value_in_sig() {
// I3 proxy: the real const is in run.rs (= 3); stale docs say 2. The
// source hit must carry the value, and every doc_hit must be a .md.
with_refs_fixture(
"i3",
&[
("src/run.rs", "const DEFAULT_MAX_FIX_ROUNDS: u32 = 3;\n"),
("docs/forge.md", "The DEFAULT_MAX_FIX_ROUNDS is 2 rounds.\n"),
(
"docs/configuration.md",
"Set DEFAULT_MAX_FIX_ROUNDS (default 2).\n",
),
],
|path| {
let input = json!({
"query": "max fix rounds",
"mode": "refs",
"name": "DEFAULT_MAX_FIX_ROUNDS",
"path": path
})
.to_string();
let out = run_repo_map(&input).unwrap();
let v: Value = serde_json::from_str(&out).unwrap();
let src = v["source_hits"].as_array().unwrap();
assert!(!src.is_empty(), "source hit expected");
assert!(
src[0]["file"]
.as_str()
.unwrap()
.replace('\\', "/")
.ends_with("/src/run.rs"),
"source-first: {out}"
);
assert!(
src[0]["sig"].as_str().unwrap().contains("= 3"),
"source hit must carry the real value: {out}"
);
for d in v["doc_hits"].as_array().unwrap() {
// Fixture filenames are literal lowercase `.md`, so a
// case-sensitive check is exactly what we want here.
#[allow(clippy::case_sensitive_file_extension_comparisons)]
let is_md = d["file"].as_str().unwrap().ends_with(".md");
assert!(is_md, "doc partition must hold only docs: {out}");
}
},
);
}
#[test]
fn repo_refs_forgiving_args_falls_back_to_query() {
// Weak-brain forgiving args: mode=refs with NO `name` uses `query`.
with_refs_fixture(
"fallback",
&[(
"src/run.rs",
"std::env::var(\"CLAUDETTE_FORGE_AUTO_APPROVE\");\n",
)],
|path| {
let input = json!({
"query": "CLAUDETTE_FORGE_",
"mode": "refs",
"path": path
})
.to_string();
let out = run_repo_map(&input).unwrap();
let v: Value = serde_json::from_str(&out).unwrap();
assert_eq!(
v["distinct_count"], 1,
"fell back to query as needle: {out}"
);
},
);
}
#[test]
fn repo_refs_source_hits_survive_a_flood_of_earlier_doc_hits() {
// I3 guarantee under load: the walker can reach docs/ before src/
// (e.g. when directories come back in alphabetical order). With a
// JOINT hit cap, 130 doc matches would then exhaust the budget and the
// lone authoritative source hit would be dropped — silently inverting
// source-first. Per-partition caps must keep the source hit.
// (MAX_REF_HITS=120; 130 doc lines > that.)
let many_doc_lines = "MARKER_NEEDLE mentioned in the docs\n".repeat(130);
with_refs_fixture(
"flood",
&[
("docs/big.md", &many_doc_lines),
("src/run.rs", "const MARKER_NEEDLE: u32 = 3;\n"),
],
|path| {
let input = json!({
"query": "x", "mode": "refs", "name": "MARKER_NEEDLE", "path": path
})
.to_string();
let out = run_repo_map(&input).unwrap();
let v: Value = serde_json::from_str(&out).unwrap();
let src = v["source_hits"].as_array().unwrap();
assert!(
!src.is_empty(),
"source hit must NOT be starved by 130 earlier doc hits: {}",
// truncated should be flagged; doc_hits capped at 120.
serde_json::to_string(&json!({
"source_len": src.len(),
"doc_len": v["doc_hits"].as_array().unwrap().len(),
"truncated": v["truncated"].clone(),
}))
.unwrap()
);
assert!(
src[0]["sig"].as_str().unwrap().contains("= 3"),
"the surviving source hit must be the real value"
);
assert_eq!(
v["truncated"], true,
"doc flood past the cap must flag truncated"
);
},
);
}
#[test]
fn repo_refs_finds_string_literals_not_only_definitions() {
// Guard against regressing into definition-only scanning (the
// rejected Design-1 failure mode): a needle with ZERO definitions
// but many string-literal occurrences must still produce source hits.
with_refs_fixture(
"literals",
&[(
"src/run.rs",
"let a = \"CLAUDETTE_FORGE_X\";\nlet b = \"CLAUDETTE_FORGE_Y\";\n",
)],
|path| {
let input = json!({
"query": "x", "mode": "refs", "name": "CLAUDETTE_FORGE_", "path": path
})
.to_string();
let out = run_repo_map(&input).unwrap();
let v: Value = serde_json::from_str(&out).unwrap();
assert!(
!v["source_hits"].as_array().unwrap().is_empty(),
"string-literal occurrences must be found: {out}"
);
},
);
}
}