ripvec_core/ranking.rs
1//! Composable ranking layers for search results.
2//!
3//! ## Why this module exists
4//!
5//! Before this refactor, ranking logic was scattered across four call
6//! sites with bespoke layer combinations:
7//!
8//! | Call site | PageRank | Path penalty | Threshold | TopK |
9//! |---|---|---|---|---|
10//! | CLI `run_oneshot` (indexed) | ❌ | ❌ | inside `search()` | inside `search()` |
11//! | CLI `run_oneshot` (stateless) | ❌ | ❌ | inside `search()` | inside `search()` |
12//! | MCP `search_code` | ✅ | ❌ | inside `search()` | inside `search()` |
13//! | LSP nav/symbols | ✅ (α=0.3 hardcoded) | ❌ | inside `search()` | inside `search()` |
14//! | `RipvecIndex::search` | ✅ (optional) | ✅ | inside reranker | inside reranker |
15//!
16//! Three concrete bugs landed today because of this scatter: (1)
17//! PageRank silently absent from the CLI; (2) PageRank lookups hit
18//! zero entries due to path-rooting mismatch — the same bug present
19//! in every call site that used `boost_with_pagerank` before today's
20//! fix; (3) path penalty regex matched the corpus-root prefix when
21//! invoked from CWD-rooted chunk paths.
22//!
23//! The fix: a single [`RankingLayer`] trait that each call site
24//! composes into a pipeline. Layers are independently testable, the
25//! pipeline shape at each call site is explicit, and adding a new
26//! ranking signal (e.g., recency, file-saturation diversification)
27//! is a single new `impl RankingLayer`.
28//!
29//! ## Convention
30//!
31//! Layers operate on `Vec<(chunk_idx, score)>` with a parallel
32//! `&[CodeChunk]` for metadata lookup. Layers MAY:
33//!
34//! - Mutate scores in place (boost / penalty layers).
35//! - Reorder the vec (sort layers — most boost layers re-sort
36//! internally so downstream layers see descending order).
37//! - Drop entries (threshold / topK layers).
38//!
39//! When a layer reorders, it MUST leave the vec sorted descending by
40//! score so downstream layers (especially threshold + topK) operate
41//! on a meaningful ordering.
42
43use std::collections::HashMap;
44use std::path::PathBuf;
45
46use crate::chunk::CodeChunk;
47
48/// A composable layer in the ranking pipeline.
49///
50/// Implementations operate on the full `(idx, score)` list plus the
51/// canonical chunks slice. See the module-level docs for ordering
52/// conventions.
53pub trait RankingLayer: Send + Sync {
54 /// Apply this layer's transformation.
55 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]);
56}
57
58/// Apply a sequence of ranking layers in order.
59///
60/// Each layer's effect is visible to subsequent layers. Returns the
61/// final `items` after all layers have run.
62pub fn apply_chain(
63 items: &mut Vec<(usize, f32)>,
64 chunks: &[CodeChunk],
65 layers: &[Box<dyn RankingLayer>],
66) {
67 for layer in layers {
68 layer.apply(items, chunks);
69 }
70}
71
72// ---------------------------------------------------------------------------
73// PageRankBoost
74// ---------------------------------------------------------------------------
75
76/// Multiplicative PageRank boost using the sigmoid-on-percentile curve
77/// from [`crate::hybrid::pagerank_boost_factor`].
78///
79/// `pagerank_by_file` maps relative file paths to percentile values
80/// in the corpus distribution (build it via
81/// [`crate::hybrid::pagerank_lookup`]). `alpha` controls the maximum
82/// boost; ceiling is `1 + alpha`.
83pub struct PageRankBoost {
84 /// Shared via `Arc` so per-query construction is a pointer-bump
85 /// instead of a full HashMap clone. The earlier shape took
86 /// `HashMap<String, f32>` by value and `RipvecIndex::apply_pagerank_layer`
87 /// called `lookup.clone()` on every query; on a 1M-chunk corpus that
88 /// was ~5.9 s of `String::clone` per 20-query batch (profile, samply,
89 /// 2026-05-21). Cloning an Arc is a relaxed atomic increment.
90 pagerank: std::sync::Arc<HashMap<String, f32>>,
91 alpha: f32,
92}
93
94impl PageRankBoost {
95 /// Construct from a pre-built percentile lookup.
96 ///
97 /// Take `Arc<HashMap<..>>` so the caller can build the lookup once
98 /// at index time and hand out cheap clones per query.
99 #[must_use]
100 pub fn new(pagerank: std::sync::Arc<HashMap<String, f32>>, alpha: f32) -> Self {
101 Self { pagerank, alpha }
102 }
103}
104
105impl RankingLayer for PageRankBoost {
106 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
107 for (idx, score) in items.iter_mut() {
108 if let Some(chunk) = chunks.get(*idx) {
109 let rank = crate::hybrid::lookup_rank_for_chunk(
110 &self.pagerank,
111 &chunk.file_path,
112 &chunk.name,
113 );
114 *score *= crate::hybrid::pagerank_boost_factor(rank, self.alpha);
115 }
116 }
117 items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
118 }
119}
120
121// ---------------------------------------------------------------------------
122// PathPenalty
123// ---------------------------------------------------------------------------
124
125/// Multiplicative path-shape penalty for test files, examples, etc.
126///
127/// Wraps [`crate::encoder::ripvec::penalties::file_path_penalty`].
128/// Strips `corpus_root` from each chunk path before regex matching so
129/// the test/examples/d.ts regexes operate on repo-relative paths
130/// (otherwise `tests/corpus/code/X` itself triggers `test_dir_re` for
131/// every chunk).
132///
133/// **Not used in the default BERT pipeline.** Path-name heuristics are
134/// brittle and hide intent from the user; PageRank now carries the
135/// "structural importance" signal through percentile-based boost.
136/// Kept here for the semble pipeline's reference-impl parity with the
137/// Python upstream.
138pub struct PathPenalty {
139 corpus_root: PathBuf,
140}
141
142impl PathPenalty {
143 #[must_use]
144 pub fn new(corpus_root: PathBuf) -> Self {
145 Self { corpus_root }
146 }
147}
148
149impl RankingLayer for PathPenalty {
150 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
151 let prefix = self.corpus_root.to_string_lossy().into_owned();
152 let trimmed_root = prefix.trim_end_matches('/');
153 for (idx, score) in items.iter_mut() {
154 if let Some(chunk) = chunks.get(*idx) {
155 let rel = chunk
156 .file_path
157 .strip_prefix(trimmed_root)
158 .map(|s| s.trim_start_matches('/'))
159 .unwrap_or(&chunk.file_path);
160 let penalty = crate::encoder::ripvec::penalties::file_path_penalty(rel);
161 *score *= penalty;
162 }
163 }
164 items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
165 }
166}
167
168// ---------------------------------------------------------------------------
169// Threshold + TopK
170// ---------------------------------------------------------------------------
171
172/// Drop items with score below `min_score`. Preserves ordering.
173pub struct Threshold {
174 pub min_score: f32,
175}
176
177impl RankingLayer for Threshold {
178 fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
179 items.retain(|(_, score)| *score >= self.min_score);
180 }
181}
182
183/// Truncate to the top `k` items. Caller is responsible for ensuring
184/// the list is sorted descending by score before this layer runs;
185/// most boost layers re-sort internally so the typical pipeline
186/// order is `boosts...` → `Threshold` → `TopK`.
187pub struct TopK {
188 pub k: usize,
189}
190
191impl RankingLayer for TopK {
192 fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
193 if self.k > 0 {
194 items.truncate(self.k);
195 }
196 }
197}
198
199/// Cross-encoder rerank layer.
200///
201/// Replaces every result's `(chunk_idx, score)` similarity with a
202/// fresh score from the cross-encoder, then re-sorts. The reranker
203/// joins query and document text and runs a BERT-with-classifier
204/// forward pass per pair, which is structurally higher quality than
205/// the bi-encoder's dual-tower similarity but O(candidates) cost.
206///
207/// Construct via [`Self::new`]. Holds an [`Arc`] to a
208/// [`crate::rerank::Reranker`] so the layer chain can be cheaply
209/// cloned and the model isn't reloaded per call.
210///
211/// ## Auto-detect via `query`
212///
213/// The layer's score-rewrite is unconditional: if it's in the
214/// pipeline, it runs. Auto-detect (skip rerank for symbol-shaped
215/// queries) belongs at the call site that builds the layer chain,
216/// not here. The call site already knows the query; it can decide
217/// whether to push this layer into the chain at all.
218pub struct CrossEncoderRerank {
219 reranker: std::sync::Arc<crate::rerank::Reranker>,
220 query: String,
221 /// Cap on candidates the reranker sees. Cost is linear in this.
222 candidates: usize,
223 /// Blend factor between bi-encoder and cross-encoder scores.
224 /// `blend = 1.0`: pure cross-encoder (replace). `blend = 0.0`:
225 /// pure bi-encoder (rerank is a no-op). The default `0.7` puts
226 /// most weight on the cross-encoder while preserving the bi-
227 /// encoder's coarse ordering as a tiebreaker — important when
228 /// the cross-encoder's sigmoid scores are compressed near 0.5
229 /// (no candidate clearly relevant), the original bi-encoder
230 /// ordering shouldn't get blown up by 1% rerank-score noise.
231 blend: f32,
232}
233
234impl CrossEncoderRerank {
235 /// Build a rerank layer over `reranker` for `query`. Limits the
236 /// pool to `candidates` (typical: 100). Uses the default blend
237 /// factor of 0.7 (heavy cross-encoder, tiebroken by bi-encoder).
238 #[must_use]
239 pub fn new(
240 reranker: std::sync::Arc<crate::rerank::Reranker>,
241 query: String,
242 candidates: usize,
243 ) -> Self {
244 Self {
245 reranker,
246 query,
247 candidates,
248 blend: 0.7,
249 }
250 }
251
252 /// Override the bi/cross-encoder blend factor. `0.0` = pure
253 /// bi-encoder (rerank is a no-op), `1.0` = pure cross-encoder
254 /// (replace).
255 #[must_use]
256 pub fn with_blend(mut self, blend: f32) -> Self {
257 self.blend = blend.clamp(0.0, 1.0);
258 self
259 }
260}
261
262impl RankingLayer for CrossEncoderRerank {
263 fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
264 // Cap to top-`candidates` before invoking the reranker. Cost
265 // is linear in the cap, so trimming is meaningful — even
266 // 100 vs 200 is a doubling.
267 if items.len() > self.candidates {
268 items.truncate(self.candidates);
269 }
270 if items.is_empty() {
271 return;
272 }
273 // Build `(query, doc_text)` pairs aligned with `items`.
274 // Out-of-range indices are dropped (shouldn't happen, but
275 // defensive against malformed input).
276 let pairs: Vec<(&str, &str)> = items
277 .iter()
278 .filter_map(|&(idx, _)| {
279 chunks
280 .get(idx)
281 .map(|c| (self.query.as_str(), c.content.as_str()))
282 })
283 .collect();
284 let Ok(scores) = self.reranker.score_pairs(&pairs) else {
285 // Rerank failed — leave the existing scores untouched.
286 // Logging happens at the call site; the layer is silent
287 // about errors since it has no logging context.
288 return;
289 };
290 // Min-max normalize both score arrays to [0, 1] within this
291 // candidate set, then blend. The reranker returns raw logits
292 // (sentence-transformers ms-marco config declares Identity
293 // activation), which span ~[-11, +5]; bi-encoder scores arrive
294 // here in the RRF-normalized [0, ~1] range. Without per-set
295 // normalization the magnitude-dominant signal wins by default
296 // — the `blend = 0.7` weighting is only meaningful when both
297 // sides live on the same scale. Min-max preserves ordering
298 // within each signal while making the blend a true convex
299 // combination.
300 let bi: Vec<f32> = items.iter().map(|&(_, s)| s).collect();
301 let cross_norm = min_max_normalize(&scores);
302 let bi_norm = min_max_normalize(&bi);
303 for ((item, &cross), &bi_n) in items.iter_mut().zip(cross_norm.iter()).zip(bi_norm.iter()) {
304 item.1 = self.blend * cross + (1.0 - self.blend) * bi_n;
305 }
306 items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
307 }
308}
309
310/// Linear-rescale a score array into `[0, 1]` based on its own
311/// min/max. Used by [`CrossEncoderRerank`] to put raw cross-encoder
312/// logits and RRF bi-encoder scores on the same scale before
313/// convex-combining them. Degenerate inputs (empty or all-equal)
314/// return all-`0.5` so the layer still contributes a neutral signal
315/// rather than NaN/`0.0` collapse.
316fn min_max_normalize(xs: &[f32]) -> Vec<f32> {
317 if xs.is_empty() {
318 return Vec::new();
319 }
320 let mut lo = f32::INFINITY;
321 let mut hi = f32::NEG_INFINITY;
322 for &x in xs {
323 if x < lo {
324 lo = x;
325 }
326 if x > hi {
327 hi = x;
328 }
329 }
330 let span = hi - lo;
331 if span.abs() < f32::EPSILON {
332 return vec![0.5; xs.len()];
333 }
334 xs.iter().map(|&x| (x - lo) / span).collect()
335}
336
337#[cfg(test)]
338mod tests {
339 use super::*;
340
341 fn dummy_chunk(file: &str, name: &str) -> CodeChunk {
342 CodeChunk {
343 file_path: file.into(),
344 name: name.into(),
345 kind: "function".into(),
346 start_line: 1,
347 end_line: 10,
348 content: String::new(),
349 enriched_content: String::new(),
350 }
351 }
352
353 #[test]
354 fn threshold_drops_below_min() {
355 let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
356 let mut items = vec![(0, 0.9), (1, 0.3)];
357 Threshold { min_score: 0.5 }.apply(&mut items, &chunks);
358 assert_eq!(items, vec![(0, 0.9)]);
359 }
360
361 #[test]
362 fn topk_truncates() {
363 let chunks = vec![
364 dummy_chunk("a.rs", "f"),
365 dummy_chunk("b.rs", "g"),
366 dummy_chunk("c.rs", "h"),
367 ];
368 let mut items = vec![(0, 0.9), (1, 0.8), (2, 0.7)];
369 TopK { k: 2 }.apply(&mut items, &chunks);
370 assert_eq!(items, vec![(0, 0.9), (1, 0.8)]);
371 }
372
373 #[test]
374 fn topk_zero_keeps_all() {
375 let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
376 let mut items = vec![(0, 0.9), (1, 0.8)];
377 TopK { k: 0 }.apply(&mut items, &chunks);
378 assert_eq!(items.len(), 2);
379 }
380
381 #[test]
382 fn chain_runs_layers_in_order() {
383 // Three items: scores 1.0, 0.6, 0.3. Threshold at 0.5, then top 1.
384 let chunks = vec![
385 dummy_chunk("a.rs", "f"),
386 dummy_chunk("b.rs", "g"),
387 dummy_chunk("c.rs", "h"),
388 ];
389 let mut items = vec![(0, 1.0), (1, 0.6), (2, 0.3)];
390 let layers: Vec<Box<dyn RankingLayer>> = vec![
391 Box::new(Threshold { min_score: 0.5 }),
392 Box::new(TopK { k: 1 }),
393 ];
394 apply_chain(&mut items, &chunks, &layers);
395 assert_eq!(items, vec![(0, 1.0)]);
396 }
397
398 #[test]
399 fn pagerank_boost_layer_reorders() {
400 let chunks = vec![
401 dummy_chunk("important.rs", "a"),
402 dummy_chunk("obscure.rs", "b"),
403 ];
404 let mut items = vec![(0, 0.8), (1, 0.8)];
405 let mut pr = HashMap::new();
406 pr.insert("important.rs".to_string(), 1.0); // top percentile
407 pr.insert("obscure.rs".to_string(), 0.1); // bottom decile
408 PageRankBoost::new(std::sync::Arc::new(pr), 0.3).apply(&mut items, &chunks);
409 // important.rs should rank first after boost.
410 assert_eq!(items[0].0, 0);
411 assert!(items[0].1 > items[1].1);
412 }
413}