Skip to main content

ripvec_core/
ranking.rs

1//! Composable ranking layers for search results.
2//!
3//! ## Why this module exists
4//!
5//! Before this refactor, ranking logic was scattered across four call
6//! sites with bespoke layer combinations:
7//!
8//! | Call site | PageRank | Path penalty | Threshold | TopK |
9//! |---|---|---|---|---|
10//! | CLI `run_oneshot` (indexed) | ❌ | ❌ | inside `search()` | inside `search()` |
11//! | CLI `run_oneshot` (stateless) | ❌ | ❌ | inside `search()` | inside `search()` |
12//! | MCP `search_code` | ✅ | ❌ | inside `search()` | inside `search()` |
13//! | LSP nav/symbols | ✅ (α=0.3 hardcoded) | ❌ | inside `search()` | inside `search()` |
14//! | `RipvecIndex::search` | ✅ (optional) | ✅ | inside reranker | inside reranker |
15//!
16//! Three concrete bugs landed today because of this scatter: (1)
17//! PageRank silently absent from the CLI; (2) PageRank lookups hit
18//! zero entries due to path-rooting mismatch — the same bug present
19//! in every call site that used `boost_with_pagerank` before today's
20//! fix; (3) path penalty regex matched the corpus-root prefix when
21//! invoked from CWD-rooted chunk paths.
22//!
23//! The fix: a single [`RankingLayer`] trait that each call site
24//! composes into a pipeline. Layers are independently testable, the
25//! pipeline shape at each call site is explicit, and adding a new
26//! ranking signal (e.g., recency, file-saturation diversification)
27//! is a single new `impl RankingLayer`.
28//!
29//! ## Convention
30//!
31//! Layers operate on `Vec<(chunk_idx, score)>` with a parallel
32//! `&[CodeChunk]` for metadata lookup. Layers MAY:
33//!
34//! - Mutate scores in place (boost / penalty layers).
35//! - Reorder the vec (sort layers — most boost layers re-sort
36//!   internally so downstream layers see descending order).
37//! - Drop entries (threshold / topK layers).
38//!
39//! When a layer reorders, it MUST leave the vec sorted descending by
40//! score so downstream layers (especially threshold + topK) operate
41//! on a meaningful ordering.
42
43use std::collections::HashMap;
44use std::path::PathBuf;
45
46use crate::chunk::CodeChunk;
47
48/// A composable layer in the ranking pipeline.
49///
50/// Implementations operate on the full `(idx, score)` list plus the
51/// canonical chunks slice. See the module-level docs for ordering
52/// conventions.
53pub trait RankingLayer: Send + Sync {
54    /// Apply this layer's transformation.
55    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]);
56}
57
58/// Apply a sequence of ranking layers in order.
59///
60/// Each layer's effect is visible to subsequent layers. Returns the
61/// final `items` after all layers have run.
62pub fn apply_chain(
63    items: &mut Vec<(usize, f32)>,
64    chunks: &[CodeChunk],
65    layers: &[Box<dyn RankingLayer>],
66) {
67    for layer in layers {
68        layer.apply(items, chunks);
69    }
70}
71
72// ---------------------------------------------------------------------------
73// PageRankBoost
74// ---------------------------------------------------------------------------
75
76/// Multiplicative PageRank boost using the sigmoid-on-percentile curve
77/// from [`crate::hybrid::pagerank_boost_factor`].
78///
79/// `pagerank_by_file` maps relative file paths to percentile values
80/// in the corpus distribution (build it via
81/// [`crate::hybrid::pagerank_lookup`]). `alpha` controls the maximum
82/// boost; ceiling is `1 + alpha`.
83pub struct PageRankBoost {
84    pagerank: HashMap<String, f32>,
85    alpha: f32,
86}
87
88impl PageRankBoost {
89    /// Construct from a pre-built percentile lookup.
90    #[must_use]
91    pub fn new(pagerank: HashMap<String, f32>, alpha: f32) -> Self {
92        Self { pagerank, alpha }
93    }
94}
95
96impl RankingLayer for PageRankBoost {
97    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
98        for (idx, score) in items.iter_mut() {
99            if let Some(chunk) = chunks.get(*idx) {
100                let rank = crate::hybrid::lookup_rank_for_chunk(
101                    &self.pagerank,
102                    &chunk.file_path,
103                    &chunk.name,
104                );
105                *score *= crate::hybrid::pagerank_boost_factor(rank, self.alpha);
106            }
107        }
108        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
109    }
110}
111
112// ---------------------------------------------------------------------------
113// PathPenalty
114// ---------------------------------------------------------------------------
115
116/// Multiplicative path-shape penalty for test files, examples, etc.
117///
118/// Wraps [`crate::encoder::ripvec::penalties::file_path_penalty`].
119/// Strips `corpus_root` from each chunk path before regex matching so
120/// the test/examples/d.ts regexes operate on repo-relative paths
121/// (otherwise `tests/corpus/code/X` itself triggers `test_dir_re` for
122/// every chunk).
123///
124/// **Not used in the default BERT pipeline.** Path-name heuristics are
125/// brittle and hide intent from the user; PageRank now carries the
126/// "structural importance" signal through percentile-based boost.
127/// Kept here for the semble pipeline's reference-impl parity with the
128/// Python upstream.
129pub struct PathPenalty {
130    corpus_root: PathBuf,
131}
132
133impl PathPenalty {
134    #[must_use]
135    pub fn new(corpus_root: PathBuf) -> Self {
136        Self { corpus_root }
137    }
138}
139
140impl RankingLayer for PathPenalty {
141    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
142        let prefix = self.corpus_root.to_string_lossy().into_owned();
143        let trimmed_root = prefix.trim_end_matches('/');
144        for (idx, score) in items.iter_mut() {
145            if let Some(chunk) = chunks.get(*idx) {
146                let rel = chunk
147                    .file_path
148                    .strip_prefix(trimmed_root)
149                    .map(|s| s.trim_start_matches('/'))
150                    .unwrap_or(&chunk.file_path);
151                let penalty = crate::encoder::ripvec::penalties::file_path_penalty(rel);
152                *score *= penalty;
153            }
154        }
155        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
156    }
157}
158
159// ---------------------------------------------------------------------------
160// Threshold + TopK
161// ---------------------------------------------------------------------------
162
163/// Drop items with score below `min_score`. Preserves ordering.
164pub struct Threshold {
165    pub min_score: f32,
166}
167
168impl RankingLayer for Threshold {
169    fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
170        items.retain(|(_, score)| *score >= self.min_score);
171    }
172}
173
174/// Truncate to the top `k` items. Caller is responsible for ensuring
175/// the list is sorted descending by score before this layer runs;
176/// most boost layers re-sort internally so the typical pipeline
177/// order is `boosts...` → `Threshold` → `TopK`.
178pub struct TopK {
179    pub k: usize,
180}
181
182impl RankingLayer for TopK {
183    fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
184        if self.k > 0 {
185            items.truncate(self.k);
186        }
187    }
188}
189
190/// Cross-encoder rerank layer.
191///
192/// Replaces every result's `(chunk_idx, score)` similarity with a
193/// fresh score from the cross-encoder, then re-sorts. The reranker
194/// joins query and document text and runs a BERT-with-classifier
195/// forward pass per pair, which is structurally higher quality than
196/// the bi-encoder's dual-tower similarity but O(candidates) cost.
197///
198/// Construct via [`Self::new`]. Holds an [`Arc`] to a
199/// [`crate::rerank::Reranker`] so the layer chain can be cheaply
200/// cloned and the model isn't reloaded per call.
201///
202/// ## Auto-detect via `query`
203///
204/// The layer's score-rewrite is unconditional: if it's in the
205/// pipeline, it runs. Auto-detect (skip rerank for symbol-shaped
206/// queries) belongs at the call site that builds the layer chain,
207/// not here. The call site already knows the query; it can decide
208/// whether to push this layer into the chain at all.
209pub struct CrossEncoderRerank {
210    reranker: std::sync::Arc<crate::rerank::Reranker>,
211    query: String,
212    /// Cap on candidates the reranker sees. Cost is linear in this.
213    candidates: usize,
214    /// Blend factor between bi-encoder and cross-encoder scores.
215    /// `blend = 1.0`: pure cross-encoder (replace). `blend = 0.0`:
216    /// pure bi-encoder (rerank is a no-op). The default `0.7` puts
217    /// most weight on the cross-encoder while preserving the bi-
218    /// encoder's coarse ordering as a tiebreaker — important when
219    /// the cross-encoder's sigmoid scores are compressed near 0.5
220    /// (no candidate clearly relevant), the original bi-encoder
221    /// ordering shouldn't get blown up by 1% rerank-score noise.
222    blend: f32,
223}
224
225impl CrossEncoderRerank {
226    /// Build a rerank layer over `reranker` for `query`. Limits the
227    /// pool to `candidates` (typical: 100). Uses the default blend
228    /// factor of 0.7 (heavy cross-encoder, tiebroken by bi-encoder).
229    #[must_use]
230    pub fn new(
231        reranker: std::sync::Arc<crate::rerank::Reranker>,
232        query: String,
233        candidates: usize,
234    ) -> Self {
235        Self {
236            reranker,
237            query,
238            candidates,
239            blend: 0.7,
240        }
241    }
242
243    /// Override the bi/cross-encoder blend factor. `0.0` = pure
244    /// bi-encoder (rerank is a no-op), `1.0` = pure cross-encoder
245    /// (replace).
246    #[must_use]
247    pub fn with_blend(mut self, blend: f32) -> Self {
248        self.blend = blend.clamp(0.0, 1.0);
249        self
250    }
251}
252
253impl RankingLayer for CrossEncoderRerank {
254    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
255        // Cap to top-`candidates` before invoking the reranker. Cost
256        // is linear in the cap, so trimming is meaningful — even
257        // 100 vs 200 is a doubling.
258        if items.len() > self.candidates {
259            items.truncate(self.candidates);
260        }
261        if items.is_empty() {
262            return;
263        }
264        // Build `(query, doc_text)` pairs aligned with `items`.
265        // Out-of-range indices are dropped (shouldn't happen, but
266        // defensive against malformed input).
267        let pairs: Vec<(&str, &str)> = items
268            .iter()
269            .filter_map(|&(idx, _)| {
270                chunks
271                    .get(idx)
272                    .map(|c| (self.query.as_str(), c.content.as_str()))
273            })
274            .collect();
275        let Ok(scores) = self.reranker.score_pairs(&pairs) else {
276            // Rerank failed — leave the existing scores untouched.
277            // Logging happens at the call site; the layer is silent
278            // about errors since it has no logging context.
279            return;
280        };
281        // Blend bi-encoder and cross-encoder scores:
282        //   new = blend * cross + (1 - blend) * bi
283        // The blend approach handles a known failure mode: when the
284        // cross-encoder is uncertain about all candidates (sigmoid
285        // outputs compressed near 0.5), pure replacement throws away
286        // the bi-encoder's confident top-1 in favor of 1% rerank-score
287        // noise. Blending lets the cross-encoder reorder confidently-
288        // different candidates while preserving the bi-encoder's
289        // ordering when it can't.
290        for (item, &cross_score) in items.iter_mut().zip(scores.iter()) {
291            let bi_score = item.1;
292            item.1 = self.blend * cross_score + (1.0 - self.blend) * bi_score;
293        }
294        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
295    }
296}
297
298#[cfg(test)]
299mod tests {
300    use super::*;
301
302    fn dummy_chunk(file: &str, name: &str) -> CodeChunk {
303        CodeChunk {
304            file_path: file.into(),
305            name: name.into(),
306            kind: "function".into(),
307            start_line: 1,
308            end_line: 10,
309            content: String::new(),
310            enriched_content: String::new(),
311        }
312    }
313
314    #[test]
315    fn threshold_drops_below_min() {
316        let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
317        let mut items = vec![(0, 0.9), (1, 0.3)];
318        Threshold { min_score: 0.5 }.apply(&mut items, &chunks);
319        assert_eq!(items, vec![(0, 0.9)]);
320    }
321
322    #[test]
323    fn topk_truncates() {
324        let chunks = vec![
325            dummy_chunk("a.rs", "f"),
326            dummy_chunk("b.rs", "g"),
327            dummy_chunk("c.rs", "h"),
328        ];
329        let mut items = vec![(0, 0.9), (1, 0.8), (2, 0.7)];
330        TopK { k: 2 }.apply(&mut items, &chunks);
331        assert_eq!(items, vec![(0, 0.9), (1, 0.8)]);
332    }
333
334    #[test]
335    fn topk_zero_keeps_all() {
336        let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
337        let mut items = vec![(0, 0.9), (1, 0.8)];
338        TopK { k: 0 }.apply(&mut items, &chunks);
339        assert_eq!(items.len(), 2);
340    }
341
342    #[test]
343    fn chain_runs_layers_in_order() {
344        // Three items: scores 1.0, 0.6, 0.3. Threshold at 0.5, then top 1.
345        let chunks = vec![
346            dummy_chunk("a.rs", "f"),
347            dummy_chunk("b.rs", "g"),
348            dummy_chunk("c.rs", "h"),
349        ];
350        let mut items = vec![(0, 1.0), (1, 0.6), (2, 0.3)];
351        let layers: Vec<Box<dyn RankingLayer>> = vec![
352            Box::new(Threshold { min_score: 0.5 }),
353            Box::new(TopK { k: 1 }),
354        ];
355        apply_chain(&mut items, &chunks, &layers);
356        assert_eq!(items, vec![(0, 1.0)]);
357    }
358
359    #[test]
360    fn pagerank_boost_layer_reorders() {
361        let chunks = vec![
362            dummy_chunk("important.rs", "a"),
363            dummy_chunk("obscure.rs", "b"),
364        ];
365        let mut items = vec![(0, 0.8), (1, 0.8)];
366        let mut pr = HashMap::new();
367        pr.insert("important.rs".to_string(), 1.0); // top percentile
368        pr.insert("obscure.rs".to_string(), 0.1); // bottom decile
369        PageRankBoost::new(pr, 0.3).apply(&mut items, &chunks);
370        // important.rs should rank first after boost.
371        assert_eq!(items[0].0, 0);
372        assert!(items[0].1 > items[1].1);
373    }
374}