//! `lean_ctx/core/task_relevance.rs` — task-relevance scoring and
//! information-bottleneck filtering for project context selection.
use std::collections::{HashMap, HashSet, VecDeque};

use super::attention_model::positional_attention;
use super::graph_index::ProjectIndex;

/// Relevance verdict for one file: its path, a score in `[0.0, 1.0]`,
/// and the context-rendering mode recommended for that score
/// (one of "full", "signatures", "map", "reference" — see `recommend_mode`).
#[derive(Debug, Clone)]
pub struct RelevanceScore {
    // Project-relative file path as it appears in the index.
    pub path: String,
    // Relevance score; task files seed at 1.0 and decay with graph distance.
    pub score: f64,
    // Recommended rendering mode for this file's content.
    pub recommended_mode: &'static str,
}

14pub fn compute_relevance(
15    index: &ProjectIndex,
16    task_files: &[String],
17    task_keywords: &[String],
18) -> Vec<RelevanceScore> {
19    let mut scores: HashMap<String, f64> = HashMap::new();
20
21    // Seed: task files get score 1.0
22    for f in task_files {
23        scores.insert(f.clone(), 1.0);
24    }
25
26    // BFS from task files through import graph, decaying by distance
27    let adj = build_adjacency(index);
28    for seed in task_files {
29        let mut visited: HashSet<String> = HashSet::new();
30        let mut queue: VecDeque<(String, usize)> = VecDeque::new();
31        queue.push_back((seed.clone(), 0));
32        visited.insert(seed.clone());
33
34        while let Some((node, depth)) = queue.pop_front() {
35            if depth > 4 {
36                continue;
37            }
38            let decay = 1.0 / (1.0 + depth as f64).powi(2); // quadratic decay
39            let entry = scores.entry(node.clone()).or_insert(0.0);
40            *entry = entry.max(decay);
41
42            if let Some(neighbors) = adj.get(&node) {
43                for neighbor in neighbors {
44                    if !visited.contains(neighbor) {
45                        visited.insert(neighbor.clone());
46                        queue.push_back((neighbor.clone(), depth + 1));
47                    }
48                }
49            }
50        }
51    }
52
53    // Keyword boost: files containing task keywords get a relevance boost
54    if !task_keywords.is_empty() {
55        let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
56        for (file_path, file_entry) in &index.files {
57            let path_lower = file_path.to_lowercase();
58            let mut keyword_hits = 0;
59            for kw in &kw_lower {
60                if path_lower.contains(kw) {
61                    keyword_hits += 1;
62                }
63                for export in &file_entry.exports {
64                    if export.to_lowercase().contains(kw) {
65                        keyword_hits += 1;
66                    }
67                }
68            }
69            if keyword_hits > 0 {
70                let boost = (keyword_hits as f64 * 0.15).min(0.6);
71                let entry = scores.entry(file_path.clone()).or_insert(0.0);
72                *entry = (*entry + boost).min(1.0);
73            }
74        }
75    }
76
77    let mut result: Vec<RelevanceScore> = scores
78        .into_iter()
79        .map(|(path, score)| {
80            let mode = recommend_mode(score);
81            RelevanceScore {
82                path,
83                score,
84                recommended_mode: mode,
85            }
86        })
87        .collect();
88
89    result.sort_by(|a, b| {
90        b.score
91            .partial_cmp(&a.score)
92            .unwrap_or(std::cmp::Ordering::Equal)
93    });
94    result
95}
96
/// Map a relevance score onto a context rendering mode.
/// Thresholds: >= 0.8 full, >= 0.5 signatures, >= 0.2 map, else reference.
fn recommend_mode(score: f64) -> &'static str {
    match score {
        s if s >= 0.8 => "full",
        s if s >= 0.5 => "signatures",
        s if s >= 0.2 => "map",
        _ => "reference",
    }
}

109fn build_adjacency(index: &ProjectIndex) -> HashMap<String, Vec<String>> {
110    let mut adj: HashMap<String, Vec<String>> = HashMap::new();
111    for edge in &index.edges {
112        adj.entry(edge.from.clone())
113            .or_default()
114            .push(edge.to.clone());
115        adj.entry(edge.to.clone())
116            .or_default()
117            .push(edge.from.clone());
118    }
119    adj
120}
121
/// Extract likely task-relevant file paths and keywords from a task description.
///
/// Words that look like paths (contain a separator, or end in a known
/// source-file extension) go into the first vector; other words of length
/// >= 3 that are not stop words become keywords. Returns `(files, keywords)`.
pub fn parse_task_hints(task_description: &str) -> (Vec<String>, Vec<String>) {
    const SOURCE_EXTS: &[&str] = &[".rs", ".ts", ".py", ".go", ".js"];

    let mut files = Vec::new();
    let mut keywords = Vec::new();

    for word in task_description.split_whitespace() {
        let clean = word.trim_matches(|c: char| {
            !c.is_alphanumeric() && c != '.' && c != '/' && c != '_' && c != '-'
        });
        // '.' is deliberately kept by the trim above (it is meaningful inside
        // paths), which means a sentence-final period would survive too:
        // "src/auth.rs." would never match an indexed path. Strip it here —
        // a real extension never ends in an extra dot.
        let clean = clean.trim_end_matches('.');
        let looks_like_path = clean.contains('.')
            && (clean.contains('/') || SOURCE_EXTS.iter().any(|ext| clean.ends_with(ext)));
        if looks_like_path {
            files.push(clean.to_string());
        } else if clean.len() >= 3 && !STOP_WORDS.contains(&clean.to_lowercase().as_str()) {
            keywords.push(clean.to_string());
        }
    }

    (files, keywords)
}

/// Common English (and a few German) words ignored when harvesting task
/// keywords; includes generic verbs ("fix", "add", "use") that carry no
/// file-discriminating signal.
const STOP_WORDS: &[&str] = &[
    "the", "and", "for", "that", "this", "with", "from", "have", "has", "was", "are", "been",
    "not", "but", "all", "can", "had", "her", "one", "our", "out", "you", "its", "will", "each",
    "make", "like", "fix", "add", "use", "get", "set", "run", "new", "old", "should", "would",
    "could", "into", "also", "than", "them", "then", "when", "just", "only", "very", "some",
    "more", "other", "nach", "und", "die", "der", "das", "ist", "ein", "eine", "nicht", "auf",
    "mit",
];

/// Information Bottleneck filter based on Tishby et al. (2000) and QUITO-X (EMNLP 2025).
///
/// IB principle: maximize I(T;Y) (task relevance) while minimizing I(T;X) (input redundancy).
/// Each line is scored by: relevance_to_task * information_density * attention_weight.
///
/// - I(T;Y) is approximated via keyword matching + structural importance
/// - I(T;X) is approximated via token IDF (inverse document frequency within the file)
///   and token diversity ratio — lines with rare tokens carry more mutual information
/// - Attention weighting uses the LITM U-curve (Liu et al. 2023, ICLR 2026 Memory Demands)
///
/// `budget_ratio` is the approximate fraction of lines to keep (0.0–1.0).
/// Returns the surviving lines joined with `\n`, in their original order.
/// Note: every line scoring at least the cutoff is kept, so ties (and the
/// cutoff line itself) can push the output past `ceil(n * budget_ratio)` lines.
pub fn information_bottleneck_filter(
    content: &str,
    task_keywords: &[String],
    budget_ratio: f64,
) -> String {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return String::new();
    }

    let n = lines.len();
    let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();

    // File-local token frequencies; used below as the denominator of the
    // per-token IDF approximation (the "document" is this one file).
    let mut global_token_freq: HashMap<&str, usize> = HashMap::new();
    for line in &lines {
        for token in line.split_whitespace() {
            *global_token_freq.entry(token).or_insert(0) += 1;
        }
    }
    let total_unique = global_token_freq.len().max(1) as f64;

    // Score each line as (original_index, text, score); order is preserved
    // so the final output keeps source order.
    let mut scored_lines: Vec<(usize, &str, f64)> = lines
        .iter()
        .enumerate()
        .map(|(i, line)| {
            let trimmed = line.trim();
            if trimmed.is_empty() {
                // Blank lines get a small fixed score: they survive only
                // under generous budgets.
                return (i, *line, 0.05);
            }

            // I(T;Y): Task relevance — keyword hits + structural importance
            let line_lower = trimmed.to_lowercase();
            let keyword_hits: f64 = kw_lower
                .iter()
                .filter(|kw| line_lower.contains(kw.as_str()))
                .count() as f64;
            // Definitions outrank control flow, which outranks plain lines;
            // lone closing braces rank lowest.
            let structural = if is_definition_line(trimmed) {
                1.0
            } else if is_control_flow(trimmed) {
                0.5
            } else if is_closing_brace(trimmed) {
                0.15
            } else {
                0.3
            };
            let relevance = keyword_hits * 0.5 + structural;

            // I(T;X): Information density via token uniqueness and IDF
            let line_tokens: Vec<&str> = trimmed.split_whitespace().collect();
            let unique_in_line = line_tokens.iter().collect::<HashSet<_>>().len() as f64;
            let line_token_count = line_tokens.len().max(1) as f64;
            let token_diversity = unique_in_line / line_token_count;

            // Mean IDF over the line's tokens. `.max(0.0)` clips the negative
            // log that arises when a token is more frequent than the file's
            // unique-token count; the `.min(3.0)` below caps very rare tokens.
            let avg_idf: f64 = if line_tokens.is_empty() {
                0.0
            } else {
                line_tokens
                    .iter()
                    .map(|t| {
                        let freq = *global_token_freq.get(t).unwrap_or(&1) as f64;
                        (total_unique / freq).ln().max(0.0)
                    })
                    .sum::<f64>()
                    / line_token_count
            };
            let information = (token_diversity * 0.4 + (avg_idf.min(3.0) / 3.0) * 0.6).min(1.0);

            // LITM U-curve attention weighting (begin α=0.9, middle β=0.3, end γ=0.85)
            let pos = i as f64 / n.max(1) as f64;
            let attention = positional_attention(pos, 0.9, 0.3, 0.85);

            // IB composite: maximize relevance × information_density × attention_weight
            // Weights: relevance dominates (×0.6), information density secondary (×0.25),
            // attention tertiary (×0.15). Floor of 0.05 prevents zero-multiplication.
            let score =
                (relevance * 0.6 + 0.05) * (information * 0.25 + 0.05) * (attention * 0.15 + 0.05);

            (i, *line, score)
        })
        .collect();

    let budget = ((n as f64) * budget_ratio).ceil() as usize;

    // Rank a copy by descending score to locate the cutoff; `scored_lines`
    // itself stays in source order.
    let mut by_score = scored_lines.clone();
    by_score.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));

    // Cutoff is the score of the first line *past* the budget (0-indexed),
    // so retaining `>= cutoff` keeps at least `budget + 1` lines plus ties.
    // A budget covering every line means no cutoff at all (0.0).
    let cutoff_score = if budget < by_score.len() {
        by_score[budget].2
    } else {
        0.0
    };

    scored_lines.retain(|(_, _, score)| *score >= cutoff_score);

    scored_lines
        .iter()
        .map(|(_, line, _)| *line)
        .collect::<Vec<&str>>()
        .join("\n")
}

/// Compute an adaptive IB budget ratio based on content characteristics.
/// Highly repetitive content → more aggressive filtering (lower ratio).
/// High-entropy diverse content → more conservative (higher ratio).
/// Short content (< 10 lines) is never filtered (ratio 1.0).
pub fn adaptive_ib_budget(content: &str, base_ratio: f64) -> f64 {
    let line_list: Vec<&str> = content.lines().collect();
    if line_list.len() < 10 {
        // Too little content to judge repetitiveness — keep everything.
        return 1.0;
    }

    // Tally whitespace-separated tokens across all lines.
    let mut freq: HashMap<&str, usize> = HashMap::new();
    let mut token_total = 0usize;
    for token in line_list.iter().flat_map(|l| l.split_whitespace()) {
        *freq.entry(token).or_insert(0) += 1;
        token_total += 1;
    }

    if token_total == 0 {
        return base_ratio;
    }

    // Low unique-token ratio ⇒ repetitive ⇒ shrink the budget, at most 30%,
    // and never below the 0.2 floor.
    let unique_ratio = freq.len() as f64 / token_total as f64;
    let repetition_factor = 1.0 - unique_ratio;

    (base_ratio * (1.0 - repetition_factor * 0.3)).clamp(0.2, 1.0)
}

/// Heuristic: does this line begin a named definition (fn/struct/enum/...)?
/// Covers Rust, TypeScript/JavaScript, Python, and Go prefixes; leading
/// whitespace is ignored.
fn is_definition_line(line: &str) -> bool {
    const PREFIXES: &[&str] = &[
        "fn ",
        "pub fn ",
        "async fn ",
        "pub async fn ",
        "struct ",
        "pub struct ",
        "enum ",
        "pub enum ",
        "trait ",
        "pub trait ",
        "impl ",
        "type ",
        "pub type ",
        "const ",
        "pub const ",
        "static ",
        "pub static ",
        "class ",
        "export class ",
        "interface ",
        "export interface ",
        "function ",
        "export function ",
        "async function ",
        "def ",
        "async def ",
        "func ",
    ];
    // Trim once, outside the closure: the old `line.starts_with(p) ||
    // line.trim_start().starts_with(p)` re-trimmed per prefix, and its first
    // check was strictly implied by the second (trimming an untrimmed line
    // is a no-op), so behavior is unchanged.
    let trimmed = line.trim_start();
    PREFIXES.iter().any(|p| trimmed.starts_with(p))
}

/// Heuristic: is this line a control-flow statement (branch, loop, return,
/// break/continue/yield, or await)?
fn is_control_flow(line: &str) -> bool {
    let stmt = line.trim();
    // Keywords that must be followed by a space, plus bare-prefix keywords
    // ("break", "continue", "yield" may end the statement immediately).
    let spaced = ["if ", "else ", "match ", "for ", "while ", "return ", "await "];
    let bare = ["break", "continue", "yield"];
    spaced.iter().chain(bare.iter()).any(|kw| stmt.starts_with(kw))
}

/// Heuristic: is this line nothing but a closing delimiter?
fn is_closing_brace(line: &str) -> bool {
    matches!(line.trim(), "}" | "};" | "})" | "});")
}

#[cfg(test)]
mod tests {
    use super::*;

    // Path-looking words land in `files`; meaningful words in `keywords`.
    #[test]
    fn parse_task_finds_files_and_keywords() {
        let task = "Fix the authentication bug in src/auth.rs and update tests";
        let (found_files, found_keywords) = parse_task_hints(task);
        assert!(found_files.iter().any(|f| f.contains("auth.rs")));
        let has_auth_keyword = found_keywords
            .iter()
            .any(|k| k.to_lowercase().contains("authentication"));
        assert!(has_auth_keyword);
    }

    // One representative score per mode bucket.
    #[test]
    fn recommend_mode_by_score() {
        let cases = [
            (1.0, "full"),
            (0.6, "signatures"),
            (0.3, "map"),
            (0.1, "reference"),
        ];
        for (score, expected) in cases {
            assert_eq!(recommend_mode(score), expected);
        }
    }

    // A definition line must survive filtering at a moderate budget.
    #[test]
    fn info_bottleneck_preserves_definitions() {
        let source = "fn main() {\n    let x = 42;\n    // boring comment\n    println!(x);\n}\n";
        let filtered = information_bottleneck_filter(source, &["main".to_string()], 0.6);
        assert!(filtered.contains("fn main"));
    }

    // Repetitive content should be budgeted more aggressively than diverse.
    #[test]
    fn adaptive_budget_reduces_for_repetitive() {
        let repetitive_content = "let x = 1;\n".repeat(50);
        let diverse_content = (0..50)
            .map(|i| format!("let var_{i} = func_{i}(arg_{i});"))
            .collect::<Vec<_>>()
            .join("\n");
        let repetitive_budget = adaptive_ib_budget(&repetitive_content, 0.7);
        let diverse_budget = adaptive_ib_budget(&diverse_content, 0.7);
        assert!(
            repetitive_budget < diverse_budget,
            "repetitive content should get lower budget"
        );
    }
}