//! lean_ctx/core/task_relevance.rs
//!
//! Task-relevance scoring over the project import graph, plus an
//! information-bottleneck line filter for trimming file content.

use std::collections::{HashMap, HashSet, VecDeque};

use super::graph_index::ProjectIndex;

use super::attention_model::positional_attention;
/// Relevance assessment for one project file.
#[derive(Debug, Clone)]
pub struct RelevanceScore {
    /// Project-relative file path.
    pub path: String,
    /// Relevance in `[0, 1]`; higher means more task-relevant.
    pub score: f64,
    /// Suggested inclusion mode: "full", "signatures", "map", or "reference".
    pub recommended_mode: &'static str,
}

14pub fn compute_relevance(
15    index: &ProjectIndex,
16    task_files: &[String],
17    task_keywords: &[String],
18) -> Vec<RelevanceScore> {
19    let mut scores: HashMap<String, f64> = HashMap::new();
20
21    // Seed: task files get score 1.0
22    for f in task_files {
23        scores.insert(f.clone(), 1.0);
24    }
25
26    // BFS from task files through import graph, decaying by distance
27    let adj = build_adjacency(index);
28    for seed in task_files {
29        let mut visited: HashSet<String> = HashSet::new();
30        let mut queue: VecDeque<(String, usize)> = VecDeque::new();
31        queue.push_back((seed.clone(), 0));
32        visited.insert(seed.clone());
33
34        while let Some((node, depth)) = queue.pop_front() {
35            if depth > 4 {
36                continue;
37            }
38            let decay = 1.0 / (1.0 + depth as f64).powi(2); // quadratic decay
39            let entry = scores.entry(node.clone()).or_insert(0.0);
40            *entry = entry.max(decay);
41
42            if let Some(neighbors) = adj.get(&node) {
43                for neighbor in neighbors {
44                    if !visited.contains(neighbor) {
45                        visited.insert(neighbor.clone());
46                        queue.push_back((neighbor.clone(), depth + 1));
47                    }
48                }
49            }
50        }
51    }
52
53    // Keyword boost: files containing task keywords get a relevance boost
54    if !task_keywords.is_empty() {
55        let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
56        for (file_path, file_entry) in &index.files {
57            let path_lower = file_path.to_lowercase();
58            let mut keyword_hits = 0;
59            for kw in &kw_lower {
60                if path_lower.contains(kw) {
61                    keyword_hits += 1;
62                }
63                for export in &file_entry.exports {
64                    if export.to_lowercase().contains(kw) {
65                        keyword_hits += 1;
66                    }
67                }
68            }
69            if keyword_hits > 0 {
70                let boost = (keyword_hits as f64 * 0.15).min(0.6);
71                let entry = scores.entry(file_path.clone()).or_insert(0.0);
72                *entry = (*entry + boost).min(1.0);
73            }
74        }
75    }
76
77    let mut result: Vec<RelevanceScore> = scores
78        .into_iter()
79        .map(|(path, score)| {
80            let mode = recommend_mode(score);
81            RelevanceScore {
82                path,
83                score,
84                recommended_mode: mode,
85            }
86        })
87        .collect();
88
89    result.sort_by(|a, b| {
90        b.score
91            .partial_cmp(&a.score)
92            .unwrap_or(std::cmp::Ordering::Equal)
93    });
94    result
95}
96
/// Map a relevance score in `[0, 1]` to a recommended inclusion mode.
fn recommend_mode(score: f64) -> &'static str {
    match score {
        s if s >= 0.8 => "full",
        s if s >= 0.5 => "signatures",
        s if s >= 0.2 => "map",
        _ => "reference",
    }
}

109fn build_adjacency(index: &ProjectIndex) -> HashMap<String, Vec<String>> {
110    let mut adj: HashMap<String, Vec<String>> = HashMap::new();
111    for edge in &index.edges {
112        adj.entry(edge.from.clone())
113            .or_default()
114            .push(edge.to.clone());
115        adj.entry(edge.to.clone())
116            .or_default()
117            .push(edge.from.clone());
118    }
119    adj
120}
121
/// Extract likely task-relevant file paths and keywords from a task description.
///
/// A token counts as a file path when it contains a dot and either a `/` or a
/// known source-file extension; otherwise tokens of length >= 3 that are not
/// stop words become keywords. Duplicates are removed (first occurrence wins)
/// so a file or keyword mentioned twice cannot double-count downstream — in
/// particular, duplicate keywords would otherwise inflate the keyword boost in
/// `compute_relevance`.
pub fn parse_task_hints(task_description: &str) -> (Vec<String>, Vec<String>) {
    let mut files = Vec::new();
    let mut keywords = Vec::new();
    let mut seen_files: HashSet<String> = HashSet::new();
    let mut seen_keywords: HashSet<String> = HashSet::new();

    for word in task_description.split_whitespace() {
        // Strip surrounding punctuation but keep path-ish characters.
        let clean = word.trim_matches(|c: char| {
            !c.is_alphanumeric() && c != '.' && c != '/' && c != '_' && c != '-'
        });
        let looks_like_file = clean.contains('.')
            && (clean.contains('/')
                || clean.ends_with(".rs")
                || clean.ends_with(".ts")
                || clean.ends_with(".py")
                || clean.ends_with(".go")
                || clean.ends_with(".js"));
        if looks_like_file {
            if seen_files.insert(clean.to_string()) {
                files.push(clean.to_string());
            }
        } else if clean.len() >= 3 && !STOP_WORDS.contains(&clean.to_lowercase().as_str()) {
            // Exact-case dedup: "Auth" and "auth" are still both kept,
            // preserving the original casing behavior.
            if seen_keywords.insert(clean.to_string()) {
                keywords.push(clean.to_string());
            }
        }
    }

    (files, keywords)
}

/// Common words (English plus some German) excluded from keyword hints.
const STOP_WORDS: &[&str] = &[
    "the", "and", "for", "that", "this", "with", "from", "have", "has", "was", "are", "been",
    "not", "but", "all", "can", "had", "her", "one", "our", "out", "you", "its", "will", "each",
    "make", "like", "fix", "add", "use", "get", "set", "run", "new", "old", "should", "would",
    "could", "into", "also", "than", "them", "then", "when", "just", "only", "very", "some",
    "more", "other", "nach", "und", "die", "der", "das", "ist", "ein", "eine", "nicht", "auf",
    "mit",
];

157/// Information Bottleneck filter based on Tishby et al. (2000) and QUITO-X (EMNLP 2025).
158///
159/// IB principle: maximize I(T;Y) (task relevance) while minimizing I(T;X) (input redundancy).
160/// Each line is scored by: relevance_to_task * information_density * attention_weight.
161///
162/// - I(T;Y) is approximated via keyword matching + structural importance
163/// - I(T;X) is approximated via token IDF (inverse document frequency within the file)
164///   and token diversity ratio — lines with rare tokens carry more mutual information
165/// - Attention weighting uses the LITM U-curve (Liu et al. 2023, ICLR 2026 Memory Demands)
166pub fn information_bottleneck_filter(
167    content: &str,
168    task_keywords: &[String],
169    budget_ratio: f64,
170) -> String {
171    let lines: Vec<&str> = content.lines().collect();
172    if lines.is_empty() {
173        return String::new();
174    }
175
176    let n = lines.len();
177    let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
178
179    let mut global_token_freq: HashMap<&str, usize> = HashMap::new();
180    for line in &lines {
181        for token in line.split_whitespace() {
182            *global_token_freq.entry(token).or_insert(0) += 1;
183        }
184    }
185    let total_unique = global_token_freq.len().max(1) as f64;
186
187    let mut scored_lines: Vec<(usize, &str, f64)> = lines
188        .iter()
189        .enumerate()
190        .map(|(i, line)| {
191            let trimmed = line.trim();
192            if trimmed.is_empty() {
193                return (i, *line, 0.05);
194            }
195
196            // I(T;Y): Task relevance — keyword hits + structural importance
197            let line_lower = trimmed.to_lowercase();
198            let keyword_hits: f64 = kw_lower
199                .iter()
200                .filter(|kw| line_lower.contains(kw.as_str()))
201                .count() as f64;
202            let structural = if is_definition_line(trimmed) {
203                1.0
204            } else if is_control_flow(trimmed) {
205                0.5
206            } else if is_closing_brace(trimmed) {
207                0.15
208            } else {
209                0.3
210            };
211            let relevance = keyword_hits * 0.5 + structural;
212
213            // I(T;X): Information density via token uniqueness and IDF
214            let line_tokens: Vec<&str> = trimmed.split_whitespace().collect();
215            let unique_in_line = line_tokens.iter().collect::<HashSet<_>>().len() as f64;
216            let line_token_count = line_tokens.len().max(1) as f64;
217            let token_diversity = unique_in_line / line_token_count;
218
219            let avg_idf: f64 = if line_tokens.is_empty() {
220                0.0
221            } else {
222                line_tokens
223                    .iter()
224                    .map(|t| {
225                        let freq = *global_token_freq.get(t).unwrap_or(&1) as f64;
226                        (total_unique / freq).ln().max(0.0)
227                    })
228                    .sum::<f64>()
229                    / line_token_count
230            };
231            let information = (token_diversity * 0.4 + (avg_idf.min(3.0) / 3.0) * 0.6).min(1.0);
232
233            // LITM U-curve attention weighting (begin α=0.9, middle β=0.3, end γ=0.85)
234            let pos = i as f64 / n.max(1) as f64;
235            let attention = positional_attention(pos, 0.9, 0.3, 0.85);
236
237            // IB composite: maximize relevance * information_density * attention_weight
238            let score = (relevance + 0.15) * (information * 0.3 + 0.7) * (attention * 0.3 + 0.7);
239
240            (i, *line, score)
241        })
242        .collect();
243
244    let budget = ((n as f64) * budget_ratio).ceil() as usize;
245
246    let mut by_score = scored_lines.clone();
247    by_score.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
248
249    let cutoff_score = if budget < by_score.len() {
250        by_score[budget].2
251    } else {
252        0.0
253    };
254
255    scored_lines.retain(|(_, _, score)| *score >= cutoff_score);
256
257    scored_lines
258        .iter()
259        .map(|(_, line, _)| *line)
260        .collect::<Vec<&str>>()
261        .join("\n")
262}
263
/// Compute an adaptive IB budget ratio based on content characteristics.
///
/// Highly repetitive content → more aggressive filtering (lower ratio).
/// High-entropy diverse content → more conservative (higher ratio).
pub fn adaptive_ib_budget(content: &str, base_ratio: f64) -> f64 {
    let lines: Vec<&str> = content.lines().collect();
    // Too little content to estimate repetition reliably: keep everything.
    if lines.len() < 10 {
        return 1.0;
    }

    let tokens: Vec<&str> = lines
        .iter()
        .flat_map(|line| line.split_whitespace())
        .collect();
    if tokens.is_empty() {
        return base_ratio;
    }

    // Fraction of distinct tokens; 1 - that is how repetitive the text is.
    let distinct: HashSet<&str> = tokens.iter().copied().collect();
    let unique_ratio = distinct.len() as f64 / tokens.len() as f64;
    let repetition_factor = 1.0 - unique_ratio;

    // Shrink the budget by up to 30% for fully repetitive content.
    (base_ratio * (1.0 - repetition_factor * 0.3)).clamp(0.2, 1.0)
}

/// Heuristic: does this line begin a named definition?
///
/// Covers Rust, TypeScript/JavaScript, Python, and Go definition keywords.
/// Checking only the trim_start'ed line is sufficient: none of the prefixes
/// begin with whitespace, so an untrimmed match implies a trimmed match.
fn is_definition_line(line: &str) -> bool {
    const PREFIXES: &[&str] = &[
        "fn ",
        "pub fn ",
        "async fn ",
        "pub async fn ",
        "struct ",
        "pub struct ",
        "enum ",
        "pub enum ",
        "trait ",
        "pub trait ",
        "impl ",
        "type ",
        "pub type ",
        "const ",
        "pub const ",
        "static ",
        "pub static ",
        "class ",
        "export class ",
        "interface ",
        "export interface ",
        "function ",
        "export function ",
        "async function ",
        "def ",
        "async def ",
        "func ",
    ];
    let lead = line.trim_start();
    PREFIXES.iter().any(|p| lead.starts_with(p))
}

/// Heuristic: does this line (after trimming) begin with a control-flow keyword?
///
/// A keyword only counts when it is followed by end-of-line or a
/// non-identifier character. The previous bare `starts_with("break")`-style
/// checks matched identifiers such as `breakpoint()` or `yield_now()`, while
/// the space-suffixed checks missed legitimate forms like `return;`, `else{`,
/// and `if(`.
fn is_control_flow(line: &str) -> bool {
    const KEYWORDS: &[&str] = &[
        "if", "else", "match", "for", "while", "return", "break", "continue", "yield", "await",
    ];
    let trimmed = line.trim();
    KEYWORDS.iter().any(|kw| starts_with_keyword(trimmed, kw))
}

/// True when `s` starts with `kw` followed by a word boundary (end of string
/// or a character that cannot continue an identifier).
fn starts_with_keyword(s: &str, kw: &str) -> bool {
    s.strip_prefix(kw).map_or(false, |rest| {
        rest.chars()
            .next()
            .map_or(true, |c| !c.is_alphanumeric() && c != '_')
    })
}

/// Heuristic: is this line nothing but a scope or call closer?
fn is_closing_brace(line: &str) -> bool {
    matches!(line.trim(), "}" | "};" | "})" | "});")
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_task_finds_files_and_keywords() {
        let task = "Fix the authentication bug in src/auth.rs and update tests";
        let (files, keywords) = parse_task_hints(task);
        assert!(files.iter().any(|f| f.contains("auth.rs")));
        assert!(keywords
            .iter()
            .any(|k| k.to_lowercase().contains("authentication")));
    }

    #[test]
    fn recommend_mode_by_score() {
        // One representative score per mode band.
        let cases = [
            (1.0, "full"),
            (0.6, "signatures"),
            (0.3, "map"),
            (0.1, "reference"),
        ];
        for (score, expected) in cases {
            assert_eq!(recommend_mode(score), expected);
        }
    }

    #[test]
    fn info_bottleneck_preserves_definitions() {
        let content = "fn main() {\n    let x = 42;\n    // boring comment\n    println!(x);\n}\n";
        let filtered = information_bottleneck_filter(content, &["main".to_string()], 0.6);
        assert!(filtered.contains("fn main"));
    }

    #[test]
    fn adaptive_budget_reduces_for_repetitive() {
        let repetitive = "let x = 1;\n".repeat(50);
        let diverse = (0..50)
            .map(|i| format!("let var_{i} = func_{i}(arg_{i});"))
            .collect::<Vec<_>>()
            .join("\n");
        let budget_rep = adaptive_ib_budget(&repetitive, 0.7);
        let budget_div = adaptive_ib_budget(&diverse, 0.7);
        assert!(
            budget_rep < budget_div,
            "repetitive content should get lower budget"
        );
    }
}