lean_ctx/core/
saliency.rs

1//! Saliency Map — information-theoretic chunk ranking and deduplication.
2//!
3//! Implements two key algorithms from recent information theory:
4//!
5//!   **ECS (Entropic Context Shaping, arXiv 2025):**
6//!   Scores each chunk by its pragmatic utility — how much it shifts the
7//!   LLM's output distribution toward the correct answer. Operationalized as
8//!   a composite of task relevance, graph centrality, and information density.
9//!
10//!   **MIG (Marginal Information Gain, COMI 2025):**
11//!   Removes redundant chunks by measuring each chunk's marginal contribution
12//!   given the already-selected set: MIG(c_i) = Relevance(c_i) - Redundancy(c_i, selected).
13//!
14//! Scientific basis: Saliency maps (Itti & Koch 2001) + lateral inhibition
15//! (V1 cortex) for competitive chunk selection.
16
17use crate::core::content_chunk::ContentChunk;
18
19/// Saliency score for a single chunk.
20#[derive(Debug, Clone)]
21pub struct SaliencyScore {
22    pub chunk_idx: usize,
23    pub ecs_score: f64,
24    pub task_relevance: f64,
25    pub graph_centrality: f64,
26    pub info_density: f64,
27    pub final_score: f64,
28}
29
30/// Weights for the ECS composite score.
31#[derive(Debug, Clone)]
32pub struct EcsWeights {
33    pub w_task: f64,
34    pub w_graph: f64,
35    pub w_density: f64,
36}
37
38impl Default for EcsWeights {
39    fn default() -> Self {
40        Self {
41            w_task: 0.5,
42            w_graph: 0.3,
43            w_density: 0.2,
44        }
45    }
46}
47
48/// Compute ECS saliency scores for a set of chunks.
49///
50/// For each chunk, the score is:
51///   ECS = w_task * task_relevance + w_graph * graph_centrality + w_density * info_density
52///
53/// `task_keywords`: words from the active task description.
54/// `graph_edges_per_chunk`: number of graph edges touching each chunk's file.
55pub fn compute_ecs_scores(
56    chunks: &[ContentChunk],
57    task_keywords: &[String],
58    graph_edge_counts: &[usize],
59    weights: &EcsWeights,
60) -> Vec<SaliencyScore> {
61    let max_edges = graph_edge_counts.iter().max().copied().unwrap_or(1).max(1) as f64;
62
63    chunks
64        .iter()
65        .enumerate()
66        .map(|(i, chunk)| {
67            let task_relevance = compute_task_relevance(chunk, task_keywords);
68            let graph_centrality =
69                graph_edge_counts.get(i).copied().unwrap_or(0) as f64 / max_edges;
70            let info_density = compute_info_density(chunk);
71
72            let ecs_score = weights.w_task * task_relevance
73                + weights.w_graph * graph_centrality
74                + weights.w_density * info_density;
75
76            SaliencyScore {
77                chunk_idx: i,
78                ecs_score,
79                task_relevance,
80                graph_centrality,
81                info_density,
82                final_score: ecs_score,
83            }
84        })
85        .collect()
86}
87
88/// Task relevance: fraction of task keywords present in the chunk.
89fn compute_task_relevance(chunk: &ContentChunk, task_keywords: &[String]) -> f64 {
90    if task_keywords.is_empty() {
91        return 0.5;
92    }
93    let content_lower = chunk.content.to_lowercase();
94    let title_lower = chunk.symbol_name.to_lowercase();
95    let combined = format!("{content_lower} {title_lower}");
96
97    let matches = task_keywords
98        .iter()
99        .filter(|kw| combined.contains(&kw.to_lowercase()))
100        .count();
101
102    matches as f64 / task_keywords.len() as f64
103}
104
105/// Information density: unique tokens / total tokens (normalized).
106/// Higher density = more diverse information, less repetition.
107fn compute_info_density(chunk: &ContentChunk) -> f64 {
108    if chunk.token_count == 0 {
109        return 0.0;
110    }
111    let unique: std::collections::HashSet<&str> = chunk.content.split_whitespace().collect();
112    let total = chunk.content.split_whitespace().count().max(1);
113    (unique.len() as f64 / total as f64).min(1.0)
114}
115
116// ---------------------------------------------------------------------------
117// MIG: Marginal Information Gain — redundancy-free chunk selection
118// ---------------------------------------------------------------------------
119
120/// Select top-k chunks using MIG: Marginal Information Gain.
121///
122/// Greedily selects chunks that maximize relevance while minimizing
123/// redundancy with already-selected chunks.
124///
125/// `lambda`: trade-off between relevance and diversity (0.0 = pure relevance,
126/// 1.0 = pure diversity). Default: 0.6.
127pub fn mig_select(
128    scores: &[SaliencyScore],
129    chunks: &[ContentChunk],
130    top_k: usize,
131    lambda: f64,
132) -> Vec<usize> {
133    if scores.is_empty() || top_k == 0 {
134        return Vec::new();
135    }
136
137    let mut selected: Vec<usize> = Vec::with_capacity(top_k);
138    let mut available: Vec<usize> = (0..scores.len()).collect();
139
140    // First: pick highest ECS score.
141    available.sort_by(|a, b| {
142        scores[*b]
143            .ecs_score
144            .partial_cmp(&scores[*a].ecs_score)
145            .unwrap_or(std::cmp::Ordering::Equal)
146    });
147
148    if let Some(&first) = available.first() {
149        selected.push(first);
150        available.retain(|&i| i != first);
151    }
152
153    // Greedy MIG selection.
154    while selected.len() < top_k && !available.is_empty() {
155        let mut best_idx = available[0];
156        let mut best_mig = f64::NEG_INFINITY;
157
158        for &candidate in &available {
159            let relevance = scores[candidate].ecs_score;
160            let redundancy = max_similarity_to_selected(candidate, &selected, chunks);
161            let mig = (1.0 - lambda) * relevance - lambda * redundancy;
162
163            if mig > best_mig {
164                best_mig = mig;
165                best_idx = candidate;
166            }
167        }
168
169        selected.push(best_idx);
170        available.retain(|&i| i != best_idx);
171    }
172
173    selected
174}
175
176/// Jaccard similarity between two chunks' token sets (fast approximation).
177fn chunk_similarity(a: &ContentChunk, b: &ContentChunk) -> f64 {
178    let tokens_a: std::collections::HashSet<&str> = a.content.split_whitespace().collect();
179    let tokens_b: std::collections::HashSet<&str> = b.content.split_whitespace().collect();
180
181    if tokens_a.is_empty() && tokens_b.is_empty() {
182        return 1.0;
183    }
184
185    let intersection = tokens_a.intersection(&tokens_b).count();
186    let union = tokens_a.union(&tokens_b).count().max(1);
187
188    intersection as f64 / union as f64
189}
190
191/// Max similarity between a candidate and all already-selected chunks.
192fn max_similarity_to_selected(
193    candidate: usize,
194    selected: &[usize],
195    chunks: &[ContentChunk],
196) -> f64 {
197    selected
198        .iter()
199        .map(|&s| chunk_similarity(&chunks[candidate], &chunks[s]))
200        .fold(0.0, f64::max)
201}
202
203#[cfg(test)]
204mod tests {
205    use super::*;
206    use crate::core::bm25_index::ChunkKind;
207
208    fn make_chunk(title: &str, content: &str) -> ContentChunk {
209        ContentChunk::from_provider(
210            "test",
211            "issues",
212            title,
213            title,
214            ChunkKind::Issue,
215            content.into(),
216            vec![],
217            None,
218        )
219    }
220
221    #[test]
222    fn ecs_score_higher_for_relevant_chunk() {
223        let chunks = vec![
224            make_chunk("auth-bug", "authentication token expiry broken"),
225            make_chunk("css-issue", "sidebar layout broken on mobile"),
226        ];
227        let keywords = vec!["authentication".into(), "token".into()];
228        let edge_counts = vec![0, 0];
229
230        let scores = compute_ecs_scores(&chunks, &keywords, &edge_counts, &EcsWeights::default());
231        assert!(scores[0].ecs_score > scores[1].ecs_score);
232        assert!(scores[0].task_relevance > scores[1].task_relevance);
233    }
234
235    #[test]
236    fn ecs_score_boosts_high_graph_centrality() {
237        let chunks = vec![
238            make_chunk("hub-file", "important module"),
239            make_chunk("leaf-file", "minor utility"),
240        ];
241        let keywords: Vec<String> = vec![];
242        let edge_counts = vec![10, 1];
243
244        let scores = compute_ecs_scores(&chunks, &keywords, &edge_counts, &EcsWeights::default());
245        assert!(scores[0].graph_centrality > scores[1].graph_centrality);
246    }
247
248    #[test]
249    fn info_density_higher_for_diverse_content() {
250        let diverse = make_chunk(
251            "diverse",
252            "authentication token validation expiry check refresh",
253        );
254        let repetitive = make_chunk("repetitive", "token token token token token token token");
255
256        let d_density = compute_info_density(&diverse);
257        let r_density = compute_info_density(&repetitive);
258        assert!(d_density > r_density);
259    }
260
261    #[test]
262    fn mig_select_picks_diverse_chunks() {
263        let chunks = vec![
264            make_chunk("auth-1", "authentication token expiry validation"),
265            make_chunk("auth-2", "authentication token expiry check"),
266            make_chunk("db-issue", "database connection pool exhausted timeout"),
267        ];
268        let keywords = vec!["authentication".into(), "database".into()];
269        let edge_counts = vec![0, 0, 0];
270
271        let scores = compute_ecs_scores(&chunks, &keywords, &edge_counts, &EcsWeights::default());
272        let selected = mig_select(&scores, &chunks, 2, 0.6);
273
274        assert_eq!(selected.len(), 2);
275        // Should select auth-1 (highest relevance) and db-issue (diverse),
276        // NOT auth-1 + auth-2 (redundant).
277        assert!(selected.contains(&0));
278        assert!(selected.contains(&2));
279    }
280
281    #[test]
282    fn mig_select_respects_top_k() {
283        let chunks = vec![
284            make_chunk("a", "content a"),
285            make_chunk("b", "content b"),
286            make_chunk("c", "content c"),
287        ];
288        let scores = compute_ecs_scores(&chunks, &[], &[0, 0, 0], &EcsWeights::default());
289
290        let selected = mig_select(&scores, &chunks, 1, 0.6);
291        assert_eq!(selected.len(), 1);
292
293        let selected = mig_select(&scores, &chunks, 10, 0.6);
294        assert_eq!(selected.len(), 3);
295    }
296
297    #[test]
298    fn mig_select_empty_input() {
299        let selected = mig_select(&[], &[], 5, 0.6);
300        assert!(selected.is_empty());
301    }
302
303    #[test]
304    fn chunk_similarity_identical() {
305        let a = make_chunk("a", "same content here");
306        assert!((chunk_similarity(&a, &a) - 1.0).abs() < f64::EPSILON);
307    }
308
309    #[test]
310    fn chunk_similarity_disjoint() {
311        let a = make_chunk("a", "authentication token validation");
312        let b = make_chunk("b", "database connection pool exhausted");
313        let sim = chunk_similarity(&a, &b);
314        assert!(sim < 0.2);
315    }
316
317    #[test]
318    fn default_weights_sum_to_one() {
319        let w = EcsWeights::default();
320        assert!((w.w_task + w.w_graph + w.w_density - 1.0).abs() < f64::EPSILON);
321    }
322
323    #[test]
324    fn no_task_keywords_gives_neutral_relevance() {
325        let chunk = make_chunk("test", "some content");
326        let relevance = compute_task_relevance(&chunk, &[]);
327        assert!((relevance - 0.5).abs() < f64::EPSILON);
328    }
329}
lean_ctx/core/saliency.rs

lean_ctx/core/
saliency.rs