1use crate::core::content_chunk::ContentChunk;
18
/// Per-chunk scoring breakdown produced by [`compute_ecs_scores`].
///
/// `PartialEq` is derived so scores can be compared directly (for example with
/// `assert_eq!` in tests). Comparison is exact floating-point equality.
#[derive(Debug, Clone, PartialEq)]
pub struct SaliencyScore {
    /// Position of the scored chunk in the slice that was scored.
    pub chunk_idx: usize,
    /// Weighted sum of `task_relevance`, `graph_centrality` and `info_density`.
    pub ecs_score: f64,
    /// Fraction of task keywords found in the chunk (0.5 when no keywords were given).
    pub task_relevance: f64,
    /// The chunk's edge count divided by the largest edge count (floored at 1).
    pub graph_centrality: f64,
    /// Ratio of distinct to total whitespace-separated tokens in the chunk content.
    pub info_density: f64,
    /// Score used for final ranking; `compute_ecs_scores` initialises it to `ecs_score`.
    pub final_score: f64,
}
29
/// Weights applied to the three score components when they are combined into
/// a single `ecs_score`.
///
/// `PartialEq` is derived so weight sets can be compared directly.
#[derive(Debug, Clone, PartialEq)]
pub struct EcsWeights {
    /// Multiplier for the task-relevance component.
    pub w_task: f64,
    /// Multiplier for the graph-centrality component.
    pub w_graph: f64,
    /// Multiplier for the information-density component.
    pub w_density: f64,
}
37
38impl Default for EcsWeights {
39 fn default() -> Self {
40 Self {
41 w_task: 0.5,
42 w_graph: 0.3,
43 w_density: 0.2,
44 }
45 }
46}
47
48pub fn compute_ecs_scores(
56 chunks: &[ContentChunk],
57 task_keywords: &[String],
58 graph_edge_counts: &[usize],
59 weights: &EcsWeights,
60) -> Vec<SaliencyScore> {
61 let max_edges = graph_edge_counts.iter().max().copied().unwrap_or(1).max(1) as f64;
62
63 chunks
64 .iter()
65 .enumerate()
66 .map(|(i, chunk)| {
67 let task_relevance = compute_task_relevance(chunk, task_keywords);
68 let graph_centrality =
69 graph_edge_counts.get(i).copied().unwrap_or(0) as f64 / max_edges;
70 let info_density = compute_info_density(chunk);
71
72 let ecs_score = weights.w_task * task_relevance
73 + weights.w_graph * graph_centrality
74 + weights.w_density * info_density;
75
76 SaliencyScore {
77 chunk_idx: i,
78 ecs_score,
79 task_relevance,
80 graph_centrality,
81 info_density,
82 final_score: ecs_score,
83 }
84 })
85 .collect()
86}
87
88fn compute_task_relevance(chunk: &ContentChunk, task_keywords: &[String]) -> f64 {
90 if task_keywords.is_empty() {
91 return 0.5;
92 }
93 let content_lower = chunk.content.to_lowercase();
94 let title_lower = chunk.symbol_name.to_lowercase();
95 let combined = format!("{content_lower} {title_lower}");
96
97 let matches = task_keywords
98 .iter()
99 .filter(|kw| combined.contains(&kw.to_lowercase()))
100 .count();
101
102 matches as f64 / task_keywords.len() as f64
103}
104
105fn compute_info_density(chunk: &ContentChunk) -> f64 {
108 if chunk.token_count == 0 {
109 return 0.0;
110 }
111 let unique: std::collections::HashSet<&str> = chunk.content.split_whitespace().collect();
112 let total = chunk.content.split_whitespace().count().max(1);
113 (unique.len() as f64 / total as f64).min(1.0)
114}
115
116pub fn mig_select(
128 scores: &[SaliencyScore],
129 chunks: &[ContentChunk],
130 top_k: usize,
131 lambda: f64,
132) -> Vec<usize> {
133 if scores.is_empty() || top_k == 0 {
134 return Vec::new();
135 }
136
137 let mut selected: Vec<usize> = Vec::with_capacity(top_k);
138 let mut available: Vec<usize> = (0..scores.len()).collect();
139
140 available.sort_by(|a, b| {
142 scores[*b]
143 .ecs_score
144 .partial_cmp(&scores[*a].ecs_score)
145 .unwrap_or(std::cmp::Ordering::Equal)
146 });
147
148 if let Some(&first) = available.first() {
149 selected.push(first);
150 available.retain(|&i| i != first);
151 }
152
153 while selected.len() < top_k && !available.is_empty() {
155 let mut best_idx = available[0];
156 let mut best_mig = f64::NEG_INFINITY;
157
158 for &candidate in &available {
159 let relevance = scores[candidate].ecs_score;
160 let redundancy = max_similarity_to_selected(candidate, &selected, chunks);
161 let mig = (1.0 - lambda) * relevance - lambda * redundancy;
162
163 if mig > best_mig {
164 best_mig = mig;
165 best_idx = candidate;
166 }
167 }
168
169 selected.push(best_idx);
170 available.retain(|&i| i != best_idx);
171 }
172
173 selected
174}
175
176fn chunk_similarity(a: &ContentChunk, b: &ContentChunk) -> f64 {
178 let tokens_a: std::collections::HashSet<&str> = a.content.split_whitespace().collect();
179 let tokens_b: std::collections::HashSet<&str> = b.content.split_whitespace().collect();
180
181 if tokens_a.is_empty() && tokens_b.is_empty() {
182 return 1.0;
183 }
184
185 let intersection = tokens_a.intersection(&tokens_b).count();
186 let union = tokens_a.union(&tokens_b).count().max(1);
187
188 intersection as f64 / union as f64
189}
190
191fn max_similarity_to_selected(
193 candidate: usize,
194 selected: &[usize],
195 chunks: &[ContentChunk],
196) -> f64 {
197 selected
198 .iter()
199 .map(|&s| chunk_similarity(&chunks[candidate], &chunks[s]))
200 .fold(0.0, f64::max)
201}
202
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::bm25_index::ChunkKind;

    /// Builds a `ChunkKind::Issue` chunk from a title and body text.
    fn make_chunk(title: &str, content: &str) -> ContentChunk {
        ContentChunk::from_provider(
            "test",
            "issues",
            title,
            title,
            ChunkKind::Issue,
            content.into(),
            vec![],
            None,
        )
    }

    /// Scores `chunks` using `EcsWeights::default()`.
    fn score_with_defaults(
        chunks: &[ContentChunk],
        keywords: &[String],
        edge_counts: &[usize],
    ) -> Vec<SaliencyScore> {
        compute_ecs_scores(chunks, keywords, edge_counts, &EcsWeights::default())
    }

    #[test]
    fn ecs_score_higher_for_relevant_chunk() {
        let chunks = [
            make_chunk("auth-bug", "authentication token expiry broken"),
            make_chunk("css-issue", "sidebar layout broken on mobile"),
        ];
        let keywords = [String::from("authentication"), String::from("token")];

        let scores = score_with_defaults(&chunks, &keywords, &[0, 0]);
        let (auth, css) = (&scores[0], &scores[1]);

        assert!(auth.ecs_score > css.ecs_score);
        assert!(auth.task_relevance > css.task_relevance);
    }

    #[test]
    fn ecs_score_boosts_high_graph_centrality() {
        let chunks = [
            make_chunk("hub-file", "important module"),
            make_chunk("leaf-file", "minor utility"),
        ];

        let scores = score_with_defaults(&chunks, &[], &[10, 1]);
        let (hub, leaf) = (&scores[0], &scores[1]);

        assert!(hub.graph_centrality > leaf.graph_centrality);
    }

    #[test]
    fn info_density_higher_for_diverse_content() {
        let varied = compute_info_density(&make_chunk(
            "diverse",
            "authentication token validation expiry check refresh",
        ));
        let repeated = compute_info_density(&make_chunk(
            "repetitive",
            "token token token token token token token",
        ));

        assert!(varied > repeated);
    }

    #[test]
    fn mig_select_picks_diverse_chunks() {
        let chunks = [
            make_chunk("auth-1", "authentication token expiry validation"),
            make_chunk("auth-2", "authentication token expiry check"),
            make_chunk("db-issue", "database connection pool exhausted timeout"),
        ];
        let keywords = [String::from("authentication"), String::from("database")];

        let scores = score_with_defaults(&chunks, &keywords, &[0, 0, 0]);
        let picked = mig_select(&scores, &chunks, 2, 0.6);

        assert_eq!(picked.len(), 2);
        // One auth chunk plus the unrelated database chunk, not both auth chunks.
        assert!(picked.contains(&0));
        assert!(picked.contains(&2));
    }

    #[test]
    fn mig_select_respects_top_k() {
        let chunks = [
            make_chunk("a", "content a"),
            make_chunk("b", "content b"),
            make_chunk("c", "content c"),
        ];
        let scores = score_with_defaults(&chunks, &[], &[0, 0, 0]);

        let only_one = mig_select(&scores, &chunks, 1, 0.6);
        assert_eq!(only_one.len(), 1);

        // Asking for more than exist yields every chunk.
        let everything = mig_select(&scores, &chunks, 10, 0.6);
        assert_eq!(everything.len(), 3);
    }

    #[test]
    fn mig_select_empty_input() {
        let picked = mig_select(&[], &[], 5, 0.6);
        assert!(picked.is_empty());
    }

    #[test]
    fn chunk_similarity_identical() {
        let chunk = make_chunk("a", "same content here");
        let similarity = chunk_similarity(&chunk, &chunk);
        assert!((similarity - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn chunk_similarity_disjoint() {
        let auth = make_chunk("a", "authentication token validation");
        let db = make_chunk("b", "database connection pool exhausted");
        assert!(chunk_similarity(&auth, &db) < 0.2);
    }

    #[test]
    fn default_weights_sum_to_one() {
        let EcsWeights {
            w_task,
            w_graph,
            w_density,
        } = EcsWeights::default();
        assert!((w_task + w_graph + w_density - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn no_task_keywords_gives_neutral_relevance() {
        let chunk = make_chunk("test", "some content");
        let neutral = compute_task_relevance(&chunk, &[]);
        assert!((neutral - 0.5).abs() < f64::EPSILON);
    }
}