Skip to main content

lean_ctx/tools/
ctx_compose.rs

1//! `ctx_compose` — task composer (Phase 2 of the efficiency epic).
2//!
3//! The biggest agent win is a single "rich per call" tool that returns ranked
4//! files *with* inline bodies, replacing the typical search → read → outline →
5//! read chain (3-5 calls) with one.
6//!
7//! lean-ctx already has the building blocks as separate tools; this composes
8//! them into one response for a natural-language task:
9//!   1. extracted keywords,
10//!   2. semantically ranked files (BM25 / hybrid),
11//!   3. exact match locations (index-backed `ctx_search`),
12//!   4. the body of the most relevant symbol, inline.
13
14use std::collections::HashMap;
15use std::sync::mpsc;
16use std::time::Duration;
17
18use crate::core::graph_provider;
19use crate::core::tokens::count_tokens;
20use crate::tools::CrpMode;
21
22/// Wall-time budget for the semantic-ranking stage. The exact-match and symbol
23/// stages are index-backed and cheap; only semantic ranking can hit a cold
24/// `O(corpus)` BM25 build. We never let that block the agent loop: past the
25/// budget we return what we have and let the detached worker finish warming the
26/// resident cache for the next call. Override via `LEAN_CTX_COMPOSE_BUDGET_MS`.
27const DEFAULT_SEMANTIC_BUDGET_MS: u64 = 2500;
28
29fn semantic_budget() -> Duration {
30    let ms = std::env::var("LEAN_CTX_COMPOSE_BUDGET_MS")
31        .ok()
32        .and_then(|v| v.parse::<u64>().ok())
33        .filter(|&v| v > 0)
34        .unwrap_or(DEFAULT_SEMANTIC_BUDGET_MS);
35    Duration::from_millis(ms)
36}
37
38/// Token budget for the inlined symbol bodies. Submodular selection fills it
39/// with the most coverage-effective, non-redundant set of symbols.
40/// Override via `LEAN_CTX_COMPOSE_SYMBOL_TOKENS`.
41const DEFAULT_SYMBOL_BUDGET_TOKENS: usize = 600;
42
43fn symbol_budget_tokens() -> usize {
44    std::env::var("LEAN_CTX_COMPOSE_SYMBOL_TOKENS")
45        .ok()
46        .and_then(|v| v.parse::<usize>().ok())
47        .filter(|&v| v > 0)
48        .unwrap_or(DEFAULT_SYMBOL_BUDGET_TOKENS)
49}
50
51/// Wall-time budget for the associative (graph spreading-activation) stage.
52/// Opening/building the graph index is `O(corpus)` on a cold repo, so — like
53/// semantic ranking — we bound it and skip the (purely additive) section on
54/// overrun while the detached worker warms the index. `LEAN_CTX_COMPOSE_GRAPH_BUDGET_MS`.
55const DEFAULT_GRAPH_BUDGET_MS: u64 = 1500;
56
57fn graph_budget() -> Duration {
58    let ms = std::env::var("LEAN_CTX_COMPOSE_GRAPH_BUDGET_MS")
59        .ok()
60        .and_then(|v| v.parse::<u64>().ok())
61        .filter(|&v| v > 0)
62        .unwrap_or(DEFAULT_GRAPH_BUDGET_MS);
63    Duration::from_millis(ms)
64}
65
66/// Per-hop activation decay and hop count for spreading activation. Small decay
67/// keeps activation local (structurally near the seeds); 3 hops covers
68/// import→callee→sibling chains without diffusing across the whole graph.
69const SPREAD_DECAY: f64 = 0.6;
70const SPREAD_HOPS: usize = 3;
71/// How many associative neighbours to surface.
72const SPREAD_TOP_K: usize = 8;
73
74/// Build the associative-relevance block: spreading activation seeded at the
75/// files the task keywords resolve to, propagated over the union of the static
76/// import/call graph and the *learned* Hebbian co-access graph. Returns an empty
77/// string when no graph/seeds are available. Runs entirely in the worker thread
78/// so [`associative_block_budgeted`] can bound it.
79fn build_associative_block(project_root: &str, keywords: &[String]) -> String {
80    let Some(open) = graph_provider::open_or_build(project_root) else {
81        return String::new();
82    };
83    let gp = &open.provider;
84
85    // Seeds: distinct files the keywords resolve to via symbol lookup.
86    let mut seed_files: Vec<String> = Vec::new();
87    for kw in keywords {
88        for sym in gp.find_symbols(kw, None, None) {
89            if !seed_files.contains(&sym.file) {
90                seed_files.push(sym.file);
91            }
92        }
93    }
94    if seed_files.is_empty() {
95        return String::new();
96    }
97
98    // Hebbian update: files relevant to the same task "fire together", so record
99    // their co-access (strengthens future associative recall). Persisted.
100    crate::core::cooccurrence::record_access(project_root, &seed_files);
101
102    // Adjacency = static structural edges ∪ learned co-access edges. Edges are
103    // made bidirectional so activation spreads both up and down the graph.
104    let mut adjacency: HashMap<String, Vec<(String, f64)>> = HashMap::new();
105    let mut add_edge = |a: &str, b: &str, w: f64| {
106        adjacency
107            .entry(a.to_string())
108            .or_default()
109            .push((b.to_string(), w));
110        adjacency
111            .entry(b.to_string())
112            .or_default()
113            .push((a.to_string(), w));
114    };
115    for e in gp.edges() {
116        add_edge(&e.from, &e.to, if e.weight > 0.0 { e.weight } else { 1.0 });
117    }
118    let coaccess = crate::core::cooccurrence::load(project_root);
119    for sf in &seed_files {
120        for (nbr, w) in coaccess.related(sf, 16) {
121            add_edge(sf, &nbr, w);
122        }
123    }
124
125    let seeds: HashMap<String, f64> = seed_files.iter().map(|f| (f.clone(), 1.0)).collect();
126    let ranked = crate::core::spreading_activation::related_ranked(
127        &seeds,
128        &adjacency,
129        SPREAD_DECAY,
130        SPREAD_HOPS,
131        SPREAD_TOP_K,
132    );
133    if ranked.is_empty() {
134        return String::new();
135    }
136
137    let mut s = String::from("\n## Related (associative: import/call graph + learned co-access)\n");
138    for (file, activation) in ranked {
139        s.push_str(&format!("- {file} (activation {activation:.2})\n"));
140    }
141    s
142}
143
144/// Run [`build_associative_block`] under [`graph_budget`]. The Hebbian record is
145/// a side effect of the worker, so it persists even when we time out and drop
146/// the (optional) section.
147fn associative_block_budgeted(project_root: &str, keywords: &[String]) -> String {
148    if keywords.is_empty() {
149        return String::new();
150    }
151    let (tx, rx) = mpsc::channel::<String>();
152    let root = project_root.to_string();
153    let kws = keywords.to_vec();
154    std::thread::spawn(move || {
155        let block = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
156            build_associative_block(&root, &kws)
157        }))
158        .unwrap_or_else(|_| {
159            tracing::warn!("[ctx_compose: associative block panicked; omitting section]");
160            String::new()
161        });
162        let _ = tx.send(block);
163    });
164    rx.recv_timeout(graph_budget()).unwrap_or_default()
165}
166
167/// Words that carry no retrieval signal — dropped from keyword extraction.
168const STOPWORDS: &[&str] = &[
169    "the",
170    "and",
171    "for",
172    "with",
173    "that",
174    "this",
175    "from",
176    "into",
177    "how",
178    "where",
179    "what",
180    "does",
181    "are",
182    "was",
183    "use",
184    "used",
185    "uses",
186    "add",
187    "all",
188    "any",
189    "can",
190    "get",
191    "set",
192    "via",
193    "out",
194    "its",
195    "his",
196    "her",
197    "you",
198    "your",
199    "our",
200    "find",
201    "show",
202    "list",
203    "make",
204    "when",
205    "then",
206    "has",
207    "have",
208    "had",
209    "not",
210    "but",
211    "see",
212    "function",
213    "method",
214    "class",
215    "code",
216    "file",
217    "files",
218    "implement",
219    "implementation",
220];
221
222/// Extract up to `max` distinct identifier-ish keywords from a task, preserving
223/// original case (symbol lookups are case-sensitive) and first-seen order.
224fn extract_keywords(task: &str, max: usize) -> Vec<String> {
225    let mut seen = std::collections::HashSet::new();
226    let mut out = Vec::new();
227    for raw in task.split(|c: char| !(c.is_alphanumeric() || c == '_')) {
228        if raw.len() < 3 {
229            continue;
230        }
231        if STOPWORDS.contains(&raw.to_ascii_lowercase().as_str()) {
232            continue;
233        }
234        if seen.insert(raw.to_string()) {
235            out.push(raw.to_string());
236            if out.len() >= max {
237                break;
238            }
239        }
240    }
241    out
242}
243
244/// Run the semantic ranking stage under a wall-time budget. Returns the ranked
245/// block on time, or a short "deferred" note if the (cold) build overruns —
246/// in which case the detached worker keeps running to warm the resident cache.
247fn ranked_files_budgeted(task: &str, project_root: &str, crp_mode: CrpMode) -> String {
248    let shared_cache = crate::tools::ctx_semantic_search::get_thread_cache();
249    let (tx, rx) = mpsc::channel::<String>();
250    let task_owned = task.to_string();
251    let root_owned = project_root.to_string();
252
253    std::thread::spawn(move || {
254        if let Some(cache) = shared_cache {
255            crate::tools::ctx_semantic_search::set_thread_cache(cache);
256        }
257        let ranked = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
258            crate::tools::ctx_semantic_search::handle(
259                &task_owned,
260                &root_owned,
261                8,
262                crp_mode,
263                None,
264                None,
265                None,
266                Some(false),
267                Some(false),
268            )
269        }))
270        .unwrap_or_else(|_| {
271            tracing::warn!("[ctx_compose: semantic ranking panicked; omitting section]");
272            String::new()
273        });
274        // Receiver may be gone (we timed out); dropping the result is fine —
275        // the cache warming already happened as a side effect of the build.
276        let _ = tx.send(ranked);
277    });
278
279    match rx.recv_timeout(semantic_budget()) {
280        Ok(ranked) => ranked.trim().to_string(),
281        Err(_) => "(deferred — semantic index is warming; the exact matches below are \
282                   authoritative for this call, and ranking will be instant on the next call)"
283            .to_string(),
284    }
285}
286
287/// Compose a single rich response for `task`.
288pub fn handle(task: &str, project_root: &str, crp_mode: CrpMode) -> (String, usize) {
289    let task = task.trim();
290    if task.is_empty() {
291        return ("ERROR: task is required".to_string(), 0);
292    }
293
294    let keywords = extract_keywords(task, 6);
295    let allow_secret = crate::core::roles::active_role().io.allow_secret_paths;
296
297    let mut out = String::new();
298    out.push_str(&format!("TASK: {task}\n"));
299    if keywords.is_empty() {
300        out.push_str("KEYWORDS: (none extracted — using full task for ranking)\n");
301    } else {
302        out.push_str(&format!("KEYWORDS: {}\n", keywords.join(", ")));
303    }
304
305    // 1. Semantically ranked files for the whole task — budgeted so a cold
306    //    BM25 build can never stall the agent loop (hardening H1). The worker
307    //    inherits the resident cache, so a build that overruns the budget still
308    //    warms the cache for the next call rather than being wasted.
309    out.push_str("\n## Ranked files (semantic)\n");
310    out.push_str(&ranked_files_budgeted(task, project_root, crp_mode));
311    out.push('\n');
312
313    // 2. Exact match locations for the primary keyword (index-backed search).
314    if let Some(primary) = keywords.first() {
315        let (grep, _g) = crate::tools::ctx_search::handle(
316            primary,
317            project_root,
318            None,
319            10,
320            crp_mode,
321            true,
322            allow_secret,
323        );
324        out.push_str(&format!("\n## Exact matches: '{primary}'\n"));
325        out.push_str(grep.trim());
326        out.push('\n');
327    }
328
329    // 3. Inline the symbol bodies that best cover the task keywords. Rather
330    //    than just the first match, select the non-redundant *set* of symbols
331    //    with maximal keyword coverage under a token budget via submodular
332    //    greedy (1−1/e optimal). Two keywords resolving to the same symbol, or
333    //    a symbol whose body adds no new keyword, are naturally pruned.
334    use crate::core::context_packing::{greedy_max_coverage, CoverageItem};
335    let mut snippets: Vec<String> = Vec::new();
336    let mut items: Vec<CoverageItem> = Vec::new();
337    for kw in &keywords {
338        if let Some((rendered, toks)) =
339            crate::tools::ctx_symbol::best_symbol_snippet(kw, project_root)
340        {
341            // The snippet always covers its triggering keyword, plus any other
342            // task keyword its body textually surfaces (a more central symbol).
343            let mut terms: std::collections::HashSet<String> =
344                std::collections::HashSet::from([kw.clone()]);
345            for other in &keywords {
346                if other != kw && rendered.contains(other.as_str()) {
347                    terms.insert(other.clone());
348                }
349            }
350            items.push(CoverageItem {
351                terms,
352                cost: toks.max(1),
353            });
354            snippets.push(rendered);
355        }
356    }
357    if !items.is_empty() {
358        let chosen = greedy_max_coverage(&items, symbol_budget_tokens(), |_| 1.0);
359        let mut seen = std::collections::HashSet::new();
360        let mut header_written = false;
361        for idx in chosen {
362            let rendered = snippets[idx].trim();
363            if rendered.is_empty() || !seen.insert(rendered.to_string()) {
364                continue;
365            }
366            if !header_written {
367                out.push_str("\n## Top symbols (bodies)\n");
368                header_written = true;
369            }
370            out.push_str(rendered);
371            out.push('\n');
372        }
373    }
374
375    // 4. Associative neighbours via spreading activation over the import/call
376    //    graph unified with the learned Hebbian co-access graph (budgeted,
377    //    additive — surfaces structurally-close files lexical search misses).
378    out.push_str(&associative_block_budgeted(project_root, &keywords));
379
380    let sent = count_tokens(&out);
381    (out, sent)
382}
383
384#[cfg(test)]
385mod tests {
386    use super::*;
387
388    #[test]
389    fn extract_keywords_drops_stopwords_and_short_tokens() {
390        let kw = extract_keywords("How does the BM25Index cache work for ctx_search?", 6);
391        assert!(kw.contains(&"BM25Index".to_string()));
392        assert!(kw.contains(&"cache".to_string()));
393        assert!(kw.contains(&"ctx_search".to_string()));
394        assert!(!kw.iter().any(|k| k == "the" || k == "How" || k == "for"));
395    }
396
397    #[test]
398    fn extract_keywords_dedups_and_caps() {
399        let kw = extract_keywords("alpha alpha beta gamma delta epsilon zeta eta", 3);
400        assert_eq!(kw.len(), 3);
401        assert_eq!(kw[0], "alpha");
402    }
403
404    #[test]
405    fn empty_task_is_rejected() {
406        let (out, tok) = handle("   ", "/tmp", CrpMode::Off);
407        assert!(out.starts_with("ERROR"));
408        assert_eq!(tok, 0);
409    }
410}