//! lean_ctx/core/task_relevance.rs — task-relevance scoring and
//! information-bottleneck filtering.
1use std::collections::{HashMap, HashSet, VecDeque};
2
3use super::graph_index::ProjectIndex;
4
5use super::neural::attention_learned::LearnedAttention;
6
/// A file's task-relevance score plus the context mode recommended for it.
#[derive(Debug, Clone)]
pub struct RelevanceScore {
    // Project-relative file path, as keyed in the index.
    pub path: String,
    // Relevance in [0.0, 1.0]; seed task files score 1.0.
    pub score: f64,
    // One of "full" / "signatures" / "map" / "reference" (see `recommend_mode`).
    pub recommended_mode: &'static str,
}
13
14pub fn compute_relevance(
15    index: &ProjectIndex,
16    task_files: &[String],
17    task_keywords: &[String],
18) -> Vec<RelevanceScore> {
19    let mut scores: HashMap<String, f64> = HashMap::new();
20
21    // Seed: task files get score 1.0
22    for f in task_files {
23        scores.insert(f.clone(), 1.0);
24    }
25
26    // BFS from task files through import graph, decaying by distance
27    let adj = build_adjacency(index);
28    for seed in task_files {
29        let mut visited: HashSet<String> = HashSet::new();
30        let mut queue: VecDeque<(String, usize)> = VecDeque::new();
31        queue.push_back((seed.clone(), 0));
32        visited.insert(seed.clone());
33
34        while let Some((node, depth)) = queue.pop_front() {
35            if depth > 4 {
36                continue;
37            }
38            let decay = 1.0 / (1.0 + depth as f64).powi(2); // quadratic decay
39            let entry = scores.entry(node.clone()).or_insert(0.0);
40            *entry = entry.max(decay);
41
42            if let Some(neighbors) = adj.get(&node) {
43                for neighbor in neighbors {
44                    if !visited.contains(neighbor) {
45                        visited.insert(neighbor.clone());
46                        queue.push_back((neighbor.clone(), depth + 1));
47                    }
48                }
49            }
50        }
51    }
52
53    // Keyword boost: files containing task keywords get a relevance boost
54    if !task_keywords.is_empty() {
55        let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
56        for (file_path, file_entry) in &index.files {
57            let path_lower = file_path.to_lowercase();
58            let mut keyword_hits = 0;
59            for kw in &kw_lower {
60                if path_lower.contains(kw) {
61                    keyword_hits += 1;
62                }
63                for export in &file_entry.exports {
64                    if export.to_lowercase().contains(kw) {
65                        keyword_hits += 1;
66                    }
67                }
68            }
69            if keyword_hits > 0 {
70                let boost = (keyword_hits as f64 * 0.15).min(0.6);
71                let entry = scores.entry(file_path.clone()).or_insert(0.0);
72                *entry = (*entry + boost).min(1.0);
73            }
74        }
75    }
76
77    let mut result: Vec<RelevanceScore> = scores
78        .into_iter()
79        .map(|(path, score)| {
80            let mode = recommend_mode(score);
81            RelevanceScore {
82                path,
83                score,
84                recommended_mode: mode,
85            }
86        })
87        .collect();
88
89    result.sort_by(|a, b| {
90        b.score
91            .partial_cmp(&a.score)
92            .unwrap_or(std::cmp::Ordering::Equal)
93    });
94    result
95}
96
/// Map a relevance score in [0.0, 1.0] to a context-inclusion mode.
fn recommend_mode(score: f64) -> &'static str {
    match score {
        s if s >= 0.8 => "full",
        s if s >= 0.5 => "signatures",
        s if s >= 0.2 => "map",
        _ => "reference",
    }
}
108
109fn build_adjacency(index: &ProjectIndex) -> HashMap<String, Vec<String>> {
110    let mut adj: HashMap<String, Vec<String>> = HashMap::new();
111    for edge in &index.edges {
112        adj.entry(edge.from.clone())
113            .or_default()
114            .push(edge.to.clone());
115        adj.entry(edge.to.clone())
116            .or_default()
117            .push(edge.from.clone());
118    }
119    adj
120}
121
122/// Extract likely task-relevant file paths and keywords from a task description.
123pub fn parse_task_hints(task_description: &str) -> (Vec<String>, Vec<String>) {
124    let mut files = Vec::new();
125    let mut keywords = Vec::new();
126
127    for word in task_description.split_whitespace() {
128        let clean = word.trim_matches(|c: char| {
129            !c.is_alphanumeric() && c != '.' && c != '/' && c != '_' && c != '-'
130        });
131        if clean.contains('.')
132            && (clean.contains('/')
133                || clean.ends_with(".rs")
134                || clean.ends_with(".ts")
135                || clean.ends_with(".py")
136                || clean.ends_with(".go")
137                || clean.ends_with(".js"))
138        {
139            files.push(clean.to_string());
140        } else if clean.len() >= 3 && !STOP_WORDS.contains(&clean.to_lowercase().as_str()) {
141            keywords.push(clean.to_string());
142        }
143    }
144
145    (files, keywords)
146}
147
/// Common English and German function words excluded from task keywords
/// (also filters generic task verbs like "fix", "add", "use").
const STOP_WORDS: &[&str] = &[
    "the", "and", "for", "that", "this", "with", "from", "have", "has", "was", "are", "been",
    "not", "but", "all", "can", "had", "her", "one", "our", "out", "you", "its", "will", "each",
    "make", "like", "fix", "add", "use", "get", "set", "run", "new", "old", "should", "would",
    "could", "into", "also", "than", "them", "then", "when", "just", "only", "very", "some",
    "more", "other", "nach", "und", "die", "der", "das", "ist", "ein", "eine", "nicht", "auf",
    "mit",
];
156
157/// Information Bottleneck filter v2 — L-Curve aware, score-sorted output.
158///
159/// IB principle: maximize I(T;Y) (task relevance) while minimizing I(T;X) (input redundancy).
160/// Each line is scored by: relevance_to_task * information_density * attention_weight.
161///
162/// v2 changes (based on Lab Experiments A-C):
163///   - Uses empirical L-curve attention from attention_learned.rs instead of heuristic U-curve
164///   - Output is sorted by score DESC (most important first), not by line number
165///   - Error-handling lines get a priority boost (fragile under compression)
166///   - Emits a one-line task summary as the first line when keywords are present
167pub fn information_bottleneck_filter(
168    content: &str,
169    task_keywords: &[String],
170    budget_ratio: f64,
171) -> String {
172    let lines: Vec<&str> = content.lines().collect();
173    if lines.is_empty() {
174        return String::new();
175    }
176
177    let n = lines.len();
178    let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
179    let attention = LearnedAttention::with_defaults();
180
181    let mut global_token_freq: HashMap<&str, usize> = HashMap::new();
182    for line in &lines {
183        for token in line.split_whitespace() {
184            *global_token_freq.entry(token).or_insert(0) += 1;
185        }
186    }
187    let total_unique = global_token_freq.len().max(1) as f64;
188
189    let mut scored_lines: Vec<(usize, &str, f64)> = lines
190        .iter()
191        .enumerate()
192        .map(|(i, line)| {
193            let trimmed = line.trim();
194            if trimmed.is_empty() {
195                return (i, *line, 0.05);
196            }
197
198            let line_lower = trimmed.to_lowercase();
199            let keyword_hits: f64 = kw_lower
200                .iter()
201                .filter(|kw| line_lower.contains(kw.as_str()))
202                .count() as f64;
203
204            let structural = if is_error_handling(trimmed) {
205                1.5
206            } else if is_definition_line(trimmed) {
207                1.0
208            } else if is_control_flow(trimmed) {
209                0.5
210            } else if is_closing_brace(trimmed) {
211                0.15
212            } else {
213                0.3
214            };
215            let relevance = keyword_hits * 0.5 + structural;
216
217            let line_tokens: Vec<&str> = trimmed.split_whitespace().collect();
218            let unique_in_line = line_tokens.iter().collect::<HashSet<_>>().len() as f64;
219            let line_token_count = line_tokens.len().max(1) as f64;
220            let token_diversity = unique_in_line / line_token_count;
221
222            let avg_idf: f64 = if line_tokens.is_empty() {
223                0.0
224            } else {
225                line_tokens
226                    .iter()
227                    .map(|t| {
228                        let freq = *global_token_freq.get(t).unwrap_or(&1) as f64;
229                        (total_unique / freq).ln().max(0.0)
230                    })
231                    .sum::<f64>()
232                    / line_token_count
233            };
234            let information = (token_diversity * 0.4 + (avg_idf.min(3.0) / 3.0) * 0.6).min(1.0);
235
236            let pos = i as f64 / n.max(1) as f64;
237            let attn_weight = attention.weight(pos);
238
239            let score = (relevance * 0.6 + 0.05)
240                * (information * 0.25 + 0.05)
241                * (attn_weight * 0.15 + 0.05);
242
243            (i, *line, score)
244        })
245        .collect();
246
247    let budget = ((n as f64) * budget_ratio).ceil() as usize;
248
249    scored_lines.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
250
251    scored_lines.truncate(budget);
252
253    let mut output_lines: Vec<&str> = Vec::with_capacity(budget + 1);
254
255    if !kw_lower.is_empty() {
256        output_lines.push(""); // placeholder for summary
257    }
258
259    for (_, line, _) in &scored_lines {
260        output_lines.push(line);
261    }
262
263    if !kw_lower.is_empty() {
264        let summary = format!("[task: {}]", task_keywords.join(", "));
265        let mut result = summary;
266        result.push('\n');
267        result.push_str(
268            &output_lines[1..].to_vec().join("\n"),
269        );
270        return result;
271    }
272
273    output_lines.join("\n")
274}
275
/// Heuristic: does this (pre-trimmed) line look like error handling?
/// Covers Rust (`Err`, `panic!`, `?;`, `.map_err`…), Python (`raise`/`except`),
/// and JS/TS/Java-style (`throw`/`catch`/`try`) constructs.
fn is_error_handling(line: &str) -> bool {
    const PREFIXES: [&str; 10] = [
        "return Err(",
        "Err(",
        "bail!(",
        "anyhow::bail!",
        "raise ",
        "throw ",
        "catch ",
        "except ",
        "try ",
        "panic!(",
    ];
    const FRAGMENTS: [&str; 5] = [".map_err(", "unwrap()", "expect(\"", "Error::", "error!"];

    PREFIXES.iter().any(|p| line.starts_with(p))
        || FRAGMENTS.iter().any(|f| line.contains(f))
        // `?;` counts unless the line is a comment.
        || (line.contains("?;") && !line.starts_with("//"))
}
294
/// Compute an adaptive IB budget ratio based on content characteristics.
/// Highly repetitive content → more aggressive filtering (lower ratio).
/// High-entropy diverse content → more conservative (higher ratio).
pub fn adaptive_ib_budget(content: &str, base_ratio: f64) -> f64 {
    let lines: Vec<&str> = content.lines().collect();
    // Tiny inputs are never filtered.
    if lines.len() < 10 {
        return 1.0;
    }

    // Only the unique-token count matters, so a set suffices.
    let mut seen: HashSet<&str> = HashSet::new();
    let mut total_tokens = 0usize;
    for token in lines.iter().flat_map(|l| l.split_whitespace()) {
        seen.insert(token);
        total_tokens += 1;
    }

    if total_tokens == 0 {
        return base_ratio;
    }

    // Fraction of tokens that are repeats; shrinks the budget by up to 30%.
    let unique_ratio = seen.len() as f64 / total_tokens as f64;
    let repetition_factor = 1.0 - unique_ratio;

    (base_ratio * (1.0 - repetition_factor * 0.3)).clamp(0.2, 1.0)
}
322
/// Heuristic: does this line start a named definition (fn/struct/class/def/…)
/// in one of the supported languages (Rust, TS/JS, Python, Go)?
fn is_definition_line(line: &str) -> bool {
    const PREFIXES: &[&str] = &[
        "fn ",
        "pub fn ",
        "async fn ",
        "pub async fn ",
        "struct ",
        "pub struct ",
        "enum ",
        "pub enum ",
        "trait ",
        "pub trait ",
        "impl ",
        "type ",
        "pub type ",
        "const ",
        "pub const ",
        "static ",
        "pub static ",
        "class ",
        "export class ",
        "interface ",
        "export interface ",
        "function ",
        "export function ",
        "async function ",
        "def ",
        "async def ",
        "func ",
    ];
    // The old `line.starts_with(p) || line.trim_start().starts_with(p)` was
    // redundant: no prefix begins with whitespace, so the trimmed check
    // subsumes the untrimmed one. Trim once, outside the loop.
    let trimmed = line.trim_start();
    PREFIXES.iter().any(|p| trimmed.starts_with(p))
}
357
/// Heuristic: is this line a control-flow statement (branch, loop, return…)?
fn is_control_flow(line: &str) -> bool {
    const KEYWORDS: [&str; 10] = [
        "if ", "else ", "match ", "for ", "while ", "return ", "break", "continue", "yield",
        "await ",
    ];
    let trimmed = line.trim();
    KEYWORDS.iter().any(|kw| trimmed.starts_with(kw))
}
371
/// Is the line nothing but a closing brace (optionally followed by `;` or `)`)?
fn is_closing_brace(line: &str) -> bool {
    matches!(line.trim(), "}" | "};" | "})" | "});")
}
376
#[cfg(test)]
mod tests {
    use super::*;

    // parse_task_hints should classify path-like tokens ("src/auth.rs") as
    // files and significant non-stop words as keywords.
    #[test]
    fn parse_task_finds_files_and_keywords() {
        let (files, keywords) =
            parse_task_hints("Fix the authentication bug in src/auth.rs and update tests");
        assert!(files.iter().any(|f| f.contains("auth.rs")));
        assert!(keywords
            .iter()
            .any(|k| k.to_lowercase().contains("authentication")));
    }

    // Thresholds: >=0.8 full, >=0.5 signatures, >=0.2 map, else reference.
    #[test]
    fn recommend_mode_by_score() {
        assert_eq!(recommend_mode(1.0), "full");
        assert_eq!(recommend_mode(0.6), "signatures");
        assert_eq!(recommend_mode(0.3), "map");
        assert_eq!(recommend_mode(0.1), "reference");
    }

    // Definition lines score highest structurally, so they must survive a
    // 0.6 budget, and keywords should trigger the "[task: …]" summary line.
    #[test]
    fn info_bottleneck_preserves_definitions() {
        let content = "fn main() {\n    let x = 42;\n    // boring comment\n    println!(x);\n}\n";
        let result = information_bottleneck_filter(content, &["main".to_string()], 0.6);
        assert!(result.contains("fn main"), "definitions must be preserved");
        assert!(result.contains("[task: main]"), "should have task summary");
    }

    // Error-handling lines get the largest structural boost (1.5), so they
    // should outrank plain statements at a 0.5 budget.
    #[test]
    fn info_bottleneck_error_handling_priority() {
        let content = "fn validate() {\n    let data = parse()?;\n    return Err(\"invalid\");\n    let x = 1;\n    let y = 2;\n}\n";
        let result = information_bottleneck_filter(content, &["validate".to_string()], 0.5);
        assert!(
            result.contains("return Err"),
            "error handling should survive filtering"
        );
    }

    // Output is ordered by score DESC, so a definition (structural 1.0)
    // precedes a lone closing brace (structural 0.15) when both survive.
    #[test]
    fn info_bottleneck_score_sorted() {
        let content = "fn important() {\n    let x = 1;\n    let y = 2;\n    let z = 3;\n}\n}\n";
        let result = information_bottleneck_filter(content, &[], 0.6);
        let lines: Vec<&str> = result.lines().collect();
        let def_pos = lines.iter().position(|l| l.contains("fn important"));
        let brace_pos = lines.iter().position(|l| l.trim() == "}");
        if let (Some(d), Some(b)) = (def_pos, brace_pos) {
            assert!(
                d < b,
                "definitions should appear before closing braces in score-sorted output"
            );
        }
    }

    // Token repetition lowers the budget ratio; fully diverse content keeps it.
    #[test]
    fn adaptive_budget_reduces_for_repetitive() {
        let repetitive = "let x = 1;\n".repeat(50);
        let diverse = (0..50)
            .map(|i| format!("let var_{i} = func_{i}(arg_{i});"))
            .collect::<Vec<_>>()
            .join("\n");
        let budget_rep = super::adaptive_ib_budget(&repetitive, 0.7);
        let budget_div = super::adaptive_ib_budget(&diverse, 0.7);
        assert!(
            budget_rep < budget_div,
            "repetitive content should get lower budget"
        );
    }
}