ripvec_core/ranking.rs
1//! Composable ranking layers for search results.
2//!
3//! ## Why this module exists
4//!
5//! Before this refactor, ranking logic was scattered across four call
6//! sites with bespoke layer combinations:
7//!
8//! | Call site | PageRank | Path penalty | Threshold | TopK |
9//! |---|---|---|---|---|
10//! | CLI `run_oneshot` (indexed) | ❌ | ❌ | inside `search()` | inside `search()` |
11//! | CLI `run_oneshot` (stateless) | ❌ | ❌ | inside `search()` | inside `search()` |
12//! | MCP `search_code` | ✅ | ❌ | inside `search()` | inside `search()` |
13//! | LSP nav/symbols | ✅ (α=0.3 hardcoded) | ❌ | inside `search()` | inside `search()` |
14//! | `RipvecIndex::search` | ✅ (optional) | ✅ | inside reranker | inside reranker |
15//!
16//! Three concrete bugs landed today because of this scatter: (1)
17//! PageRank silently absent from the CLI; (2) PageRank lookups hit
18//! zero entries due to path-rooting mismatch — the same bug present
19//! in every call site that used `boost_with_pagerank` before today's
20//! fix; (3) path penalty regex matched the corpus-root prefix when
21//! invoked from CWD-rooted chunk paths.
22//!
23//! The fix: a single [`RankingLayer`] trait that each call site
24//! composes into a pipeline. Layers are independently testable, the
25//! pipeline shape at each call site is explicit, and adding a new
26//! ranking signal (e.g., recency, file-saturation diversification)
27//! is a single new `impl RankingLayer`.
28//!
29//! ## Convention
30//!
31//! Layers operate on `Vec<(chunk_idx, score)>` with a parallel
32//! `&[CodeChunk]` for metadata lookup. Layers MAY:
33//!
34//! - Mutate scores in place (boost / penalty layers).
35//! - Reorder the vec (sort layers — most boost layers re-sort
36//! internally so downstream layers see descending order).
37//! - Drop entries (threshold / topK layers).
38//!
39//! When a layer reorders, it MUST leave the vec sorted descending by
40//! score so downstream layers (especially threshold + topK) operate
41//! on a meaningful ordering.
42
43use std::collections::HashMap;
44use std::path::PathBuf;
45
46use crate::chunk::CodeChunk;
47
48/// A composable layer in the ranking pipeline.
49///
50/// Implementations operate on the full `(idx, score)` list plus the
51/// canonical chunks slice. See the module-level docs for ordering
52/// conventions.
53pub trait RankingLayer: Send + Sync {
54 /// Apply this layer's transformation.
55 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]);
56}
57
58/// Apply a sequence of ranking layers in order.
59///
60/// Each layer's effect is visible to subsequent layers. Returns the
61/// final `items` after all layers have run.
62pub fn apply_chain(
63 items: &mut Vec<(usize, f32)>,
64 chunks: &[CodeChunk],
65 layers: &[Box<dyn RankingLayer>],
66) {
67 for layer in layers {
68 layer.apply(items, chunks);
69 }
70}
71
72// ---------------------------------------------------------------------------
73// PageRankBoost
74// ---------------------------------------------------------------------------
75
76/// Multiplicative PageRank boost using the sigmoid-on-percentile curve
77/// from [`crate::hybrid::pagerank_boost_factor`].
78///
79/// `pagerank_by_file` maps relative file paths to percentile values
80/// in the corpus distribution (build it via
81/// [`crate::hybrid::pagerank_lookup`]). `alpha` controls the maximum
82/// boost; ceiling is `1 + alpha`.
83pub struct PageRankBoost {
84 pagerank: HashMap<String, f32>,
85 alpha: f32,
86}
87
88impl PageRankBoost {
89 /// Construct from a pre-built percentile lookup.
90 #[must_use]
91 pub fn new(pagerank: HashMap<String, f32>, alpha: f32) -> Self {
92 Self { pagerank, alpha }
93 }
94}
95
96impl RankingLayer for PageRankBoost {
97 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
98 for (idx, score) in items.iter_mut() {
99 if let Some(chunk) = chunks.get(*idx) {
100 let rank = crate::hybrid::lookup_rank_for_chunk(
101 &self.pagerank,
102 &chunk.file_path,
103 &chunk.name,
104 );
105 *score *= crate::hybrid::pagerank_boost_factor(rank, self.alpha);
106 }
107 }
108 items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
109 }
110}
111
112// ---------------------------------------------------------------------------
113// PathPenalty
114// ---------------------------------------------------------------------------
115
116/// Multiplicative path-shape penalty for test files, examples, etc.
117///
118/// Wraps [`crate::encoder::ripvec::penalties::file_path_penalty`].
119/// Strips `corpus_root` from each chunk path before regex matching so
120/// the test/examples/d.ts regexes operate on repo-relative paths
121/// (otherwise `tests/corpus/code/X` itself triggers `test_dir_re` for
122/// every chunk).
123///
124/// **Not used in the default BERT pipeline.** Path-name heuristics are
125/// brittle and hide intent from the user; PageRank now carries the
126/// "structural importance" signal through percentile-based boost.
127/// Kept here for the semble pipeline's reference-impl parity with the
128/// Python upstream.
129pub struct PathPenalty {
130 corpus_root: PathBuf,
131}
132
133impl PathPenalty {
134 #[must_use]
135 pub fn new(corpus_root: PathBuf) -> Self {
136 Self { corpus_root }
137 }
138}
139
140impl RankingLayer for PathPenalty {
141 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
142 let prefix = self.corpus_root.to_string_lossy().into_owned();
143 let trimmed_root = prefix.trim_end_matches('/');
144 for (idx, score) in items.iter_mut() {
145 if let Some(chunk) = chunks.get(*idx) {
146 let rel = chunk
147 .file_path
148 .strip_prefix(trimmed_root)
149 .map(|s| s.trim_start_matches('/'))
150 .unwrap_or(&chunk.file_path);
151 let penalty = crate::encoder::ripvec::penalties::file_path_penalty(rel);
152 *score *= penalty;
153 }
154 }
155 items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
156 }
157}
158
159// ---------------------------------------------------------------------------
160// Threshold + TopK
161// ---------------------------------------------------------------------------
162
163/// Drop items with score below `min_score`. Preserves ordering.
164pub struct Threshold {
165 pub min_score: f32,
166}
167
168impl RankingLayer for Threshold {
169 fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
170 items.retain(|(_, score)| *score >= self.min_score);
171 }
172}
173
174/// Truncate to the top `k` items. Caller is responsible for ensuring
175/// the list is sorted descending by score before this layer runs;
176/// most boost layers re-sort internally so the typical pipeline
177/// order is `boosts...` → `Threshold` → `TopK`.
178pub struct TopK {
179 pub k: usize,
180}
181
182impl RankingLayer for TopK {
183 fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
184 if self.k > 0 {
185 items.truncate(self.k);
186 }
187 }
188}
189
190/// Cross-encoder rerank layer.
191///
192/// Replaces every result's `(chunk_idx, score)` similarity with a
193/// fresh score from the cross-encoder, then re-sorts. The reranker
194/// joins query and document text and runs a BERT-with-classifier
195/// forward pass per pair, which is structurally higher quality than
196/// the bi-encoder's dual-tower similarity but O(candidates) cost.
197///
198/// Construct via [`Self::new`]. Holds an [`Arc`] to a
199/// [`crate::rerank::Reranker`] so the layer chain can be cheaply
200/// cloned and the model isn't reloaded per call.
201///
202/// ## Auto-detect via `query`
203///
204/// The layer's score-rewrite is unconditional: if it's in the
205/// pipeline, it runs. Auto-detect (skip rerank for symbol-shaped
206/// queries) belongs at the call site that builds the layer chain,
207/// not here. The call site already knows the query; it can decide
208/// whether to push this layer into the chain at all.
209pub struct CrossEncoderRerank {
210 reranker: std::sync::Arc<crate::rerank::Reranker>,
211 query: String,
212 /// Cap on candidates the reranker sees. Cost is linear in this.
213 candidates: usize,
214 /// Blend factor between bi-encoder and cross-encoder scores.
215 /// `blend = 1.0`: pure cross-encoder (replace). `blend = 0.0`:
216 /// pure bi-encoder (rerank is a no-op). The default `0.7` puts
217 /// most weight on the cross-encoder while preserving the bi-
218 /// encoder's coarse ordering as a tiebreaker — important when
219 /// the cross-encoder's sigmoid scores are compressed near 0.5
220 /// (no candidate clearly relevant), the original bi-encoder
221 /// ordering shouldn't get blown up by 1% rerank-score noise.
222 blend: f32,
223}
224
225impl CrossEncoderRerank {
226 /// Build a rerank layer over `reranker` for `query`. Limits the
227 /// pool to `candidates` (typical: 100). Uses the default blend
228 /// factor of 0.7 (heavy cross-encoder, tiebroken by bi-encoder).
229 #[must_use]
230 pub fn new(
231 reranker: std::sync::Arc<crate::rerank::Reranker>,
232 query: String,
233 candidates: usize,
234 ) -> Self {
235 Self {
236 reranker,
237 query,
238 candidates,
239 blend: 0.7,
240 }
241 }
242
243 /// Override the bi/cross-encoder blend factor. `0.0` = pure
244 /// bi-encoder (rerank is a no-op), `1.0` = pure cross-encoder
245 /// (replace).
246 #[must_use]
247 pub fn with_blend(mut self, blend: f32) -> Self {
248 self.blend = blend.clamp(0.0, 1.0);
249 self
250 }
251}
252
253impl RankingLayer for CrossEncoderRerank {
254 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
255 // Cap to top-`candidates` before invoking the reranker. Cost
256 // is linear in the cap, so trimming is meaningful — even
257 // 100 vs 200 is a doubling.
258 if items.len() > self.candidates {
259 items.truncate(self.candidates);
260 }
261 if items.is_empty() {
262 return;
263 }
264 // Build `(query, doc_text)` pairs aligned with `items`.
265 // Out-of-range indices are dropped (shouldn't happen, but
266 // defensive against malformed input).
267 let pairs: Vec<(&str, &str)> = items
268 .iter()
269 .filter_map(|&(idx, _)| {
270 chunks
271 .get(idx)
272 .map(|c| (self.query.as_str(), c.content.as_str()))
273 })
274 .collect();
275 let Ok(scores) = self.reranker.score_pairs(&pairs) else {
276 // Rerank failed — leave the existing scores untouched.
277 // Logging happens at the call site; the layer is silent
278 // about errors since it has no logging context.
279 return;
280 };
281 // Blend bi-encoder and cross-encoder scores:
282 // new = blend * cross + (1 - blend) * bi
283 // The blend approach handles a known failure mode: when the
284 // cross-encoder is uncertain about all candidates (sigmoid
285 // outputs compressed near 0.5), pure replacement throws away
286 // the bi-encoder's confident top-1 in favor of 1% rerank-score
287 // noise. Blending lets the cross-encoder reorder confidently-
288 // different candidates while preserving the bi-encoder's
289 // ordering when it can't.
290 for (item, &cross_score) in items.iter_mut().zip(scores.iter()) {
291 let bi_score = item.1;
292 item.1 = self.blend * cross_score + (1.0 - self.blend) * bi_score;
293 }
294 items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
295 }
296}
297
298#[cfg(test)]
299mod tests {
300 use super::*;
301
302 fn dummy_chunk(file: &str, name: &str) -> CodeChunk {
303 CodeChunk {
304 file_path: file.into(),
305 name: name.into(),
306 kind: "function".into(),
307 start_line: 1,
308 end_line: 10,
309 content: String::new(),
310 enriched_content: String::new(),
311 }
312 }
313
314 #[test]
315 fn threshold_drops_below_min() {
316 let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
317 let mut items = vec![(0, 0.9), (1, 0.3)];
318 Threshold { min_score: 0.5 }.apply(&mut items, &chunks);
319 assert_eq!(items, vec![(0, 0.9)]);
320 }
321
322 #[test]
323 fn topk_truncates() {
324 let chunks = vec![
325 dummy_chunk("a.rs", "f"),
326 dummy_chunk("b.rs", "g"),
327 dummy_chunk("c.rs", "h"),
328 ];
329 let mut items = vec![(0, 0.9), (1, 0.8), (2, 0.7)];
330 TopK { k: 2 }.apply(&mut items, &chunks);
331 assert_eq!(items, vec![(0, 0.9), (1, 0.8)]);
332 }
333
334 #[test]
335 fn topk_zero_keeps_all() {
336 let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
337 let mut items = vec![(0, 0.9), (1, 0.8)];
338 TopK { k: 0 }.apply(&mut items, &chunks);
339 assert_eq!(items.len(), 2);
340 }
341
342 #[test]
343 fn chain_runs_layers_in_order() {
344 // Three items: scores 1.0, 0.6, 0.3. Threshold at 0.5, then top 1.
345 let chunks = vec![
346 dummy_chunk("a.rs", "f"),
347 dummy_chunk("b.rs", "g"),
348 dummy_chunk("c.rs", "h"),
349 ];
350 let mut items = vec![(0, 1.0), (1, 0.6), (2, 0.3)];
351 let layers: Vec<Box<dyn RankingLayer>> = vec![
352 Box::new(Threshold { min_score: 0.5 }),
353 Box::new(TopK { k: 1 }),
354 ];
355 apply_chain(&mut items, &chunks, &layers);
356 assert_eq!(items, vec![(0, 1.0)]);
357 }
358
359 #[test]
360 fn pagerank_boost_layer_reorders() {
361 let chunks = vec![
362 dummy_chunk("important.rs", "a"),
363 dummy_chunk("obscure.rs", "b"),
364 ];
365 let mut items = vec![(0, 0.8), (1, 0.8)];
366 let mut pr = HashMap::new();
367 pr.insert("important.rs".to_string(), 1.0); // top percentile
368 pr.insert("obscure.rs".to_string(), 0.1); // bottom decile
369 PageRankBoost::new(pr, 0.3).apply(&mut items, &chunks);
370 // important.rs should rank first after boost.
371 assert_eq!(items[0].0, 0);
372 assert!(items[0].1 > items[1].1);
373 }
374}