Skip to main content

ripvec_core/
ranking.rs

1//! Composable ranking layers for search results.
2//!
3//! ## Why this module exists
4//!
5//! Before this refactor, ranking logic was scattered across four call
6//! sites with bespoke layer combinations:
7//!
8//! | Call site | PageRank | Path penalty | Threshold | TopK |
9//! |---|---|---|---|---|
10//! | CLI `run_oneshot` (indexed) | ❌ | ❌ | inside `search()` | inside `search()` |
11//! | CLI `run_oneshot` (stateless) | ❌ | ❌ | inside `search()` | inside `search()` |
12//! | MCP `search_code` | ✅ | ❌ | inside `search()` | inside `search()` |
13//! | LSP nav/symbols | ✅ (α=0.3 hardcoded) | ❌ | inside `search()` | inside `search()` |
14//! | `RipvecIndex::search` | ✅ (optional) | ✅ | inside reranker | inside reranker |
15//!
16//! Three concrete bugs landed today because of this scatter: (1)
17//! PageRank silently absent from the CLI; (2) PageRank lookups hit
18//! zero entries due to path-rooting mismatch — the same bug present
19//! in every call site that used `boost_with_pagerank` before today's
20//! fix; (3) path penalty regex matched the corpus-root prefix when
21//! invoked from CWD-rooted chunk paths.
22//!
23//! The fix: a single [`RankingLayer`] trait that each call site
24//! composes into a pipeline. Layers are independently testable, the
25//! pipeline shape at each call site is explicit, and adding a new
26//! ranking signal (e.g., recency, file-saturation diversification)
27//! is a single new `impl RankingLayer`.
28//!
29//! ## Convention
30//!
31//! Layers operate on `Vec<(chunk_idx, score)>` with a parallel
32//! `&[CodeChunk]` for metadata lookup. Layers MAY:
33//!
34//! - Mutate scores in place (boost / penalty layers).
35//! - Reorder the vec (sort layers — most boost layers re-sort
36//!   internally so downstream layers see descending order).
37//! - Drop entries (threshold / topK layers).
38//!
39//! When a layer reorders, it MUST leave the vec sorted descending by
40//! score so downstream layers (especially threshold + topK) operate
41//! on a meaningful ordering.
42
43use std::collections::HashMap;
44use std::path::PathBuf;
45
46use crate::chunk::CodeChunk;
47
48/// A composable layer in the ranking pipeline.
49///
50/// Implementations operate on the full `(idx, score)` list plus the
51/// canonical chunks slice. See the module-level docs for ordering
52/// conventions.
53pub trait RankingLayer: Send + Sync {
54    /// Apply this layer's transformation.
55    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]);
56}
57
58/// Apply a sequence of ranking layers in order.
59///
60/// Each layer's effect is visible to subsequent layers. Returns the
61/// final `items` after all layers have run.
62pub fn apply_chain(
63    items: &mut Vec<(usize, f32)>,
64    chunks: &[CodeChunk],
65    layers: &[Box<dyn RankingLayer>],
66) {
67    for layer in layers {
68        layer.apply(items, chunks);
69    }
70}
71
72// ---------------------------------------------------------------------------
73// PageRankBoost
74// ---------------------------------------------------------------------------
75
76/// Multiplicative PageRank boost using the sigmoid-on-percentile curve
77/// from [`crate::hybrid::pagerank_boost_factor`].
78///
79/// `pagerank_by_file` maps relative file paths to percentile values
80/// in the corpus distribution (build it via
81/// [`crate::hybrid::pagerank_lookup`]). `alpha` controls the maximum
82/// boost; ceiling is `1 + alpha`.
83pub struct PageRankBoost {
84    /// Shared via `Arc` so per-query construction is a pointer-bump
85    /// instead of a full HashMap clone. The earlier shape took
86    /// `HashMap<String, f32>` by value and `RipvecIndex::apply_pagerank_layer`
87    /// called `lookup.clone()` on every query; on a 1M-chunk corpus that
88    /// was ~5.9 s of `String::clone` per 20-query batch (profile, samply,
89    /// 2026-05-21). Cloning an Arc is a relaxed atomic increment.
90    pagerank: std::sync::Arc<HashMap<String, f32>>,
91    alpha: f32,
92}
93
94impl PageRankBoost {
95    /// Construct from a pre-built percentile lookup.
96    ///
97    /// Take `Arc<HashMap<..>>` so the caller can build the lookup once
98    /// at index time and hand out cheap clones per query.
99    #[must_use]
100    pub fn new(pagerank: std::sync::Arc<HashMap<String, f32>>, alpha: f32) -> Self {
101        Self { pagerank, alpha }
102    }
103}
104
105impl RankingLayer for PageRankBoost {
106    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
107        for (idx, score) in items.iter_mut() {
108            if let Some(chunk) = chunks.get(*idx) {
109                let rank = crate::hybrid::lookup_rank_for_chunk(
110                    &self.pagerank,
111                    &chunk.file_path,
112                    &chunk.name,
113                );
114                *score *= crate::hybrid::pagerank_boost_factor(rank, self.alpha);
115            }
116        }
117        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
118    }
119}
120
121// ---------------------------------------------------------------------------
122// PathPenalty
123// ---------------------------------------------------------------------------
124
125/// Multiplicative path-shape penalty for test files, examples, etc.
126///
127/// Wraps [`crate::encoder::ripvec::penalties::file_path_penalty`].
128/// Strips `corpus_root` from each chunk path before regex matching so
129/// the test/examples/d.ts regexes operate on repo-relative paths
130/// (otherwise `tests/corpus/code/X` itself triggers `test_dir_re` for
131/// every chunk).
132///
133/// **Not used in the default BERT pipeline.** Path-name heuristics are
134/// brittle and hide intent from the user; PageRank now carries the
135/// "structural importance" signal through percentile-based boost.
136/// Kept here for the semble pipeline's reference-impl parity with the
137/// Python upstream.
138pub struct PathPenalty {
139    corpus_root: PathBuf,
140}
141
142impl PathPenalty {
143    #[must_use]
144    pub fn new(corpus_root: PathBuf) -> Self {
145        Self { corpus_root }
146    }
147}
148
149impl RankingLayer for PathPenalty {
150    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
151        let prefix = self.corpus_root.to_string_lossy().into_owned();
152        let trimmed_root = prefix.trim_end_matches('/');
153        for (idx, score) in items.iter_mut() {
154            if let Some(chunk) = chunks.get(*idx) {
155                let rel = chunk
156                    .file_path
157                    .strip_prefix(trimmed_root)
158                    .map(|s| s.trim_start_matches('/'))
159                    .unwrap_or(&chunk.file_path);
160                let penalty = crate::encoder::ripvec::penalties::file_path_penalty(rel);
161                *score *= penalty;
162            }
163        }
164        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
165    }
166}
167
168// ---------------------------------------------------------------------------
169// Threshold + TopK
170// ---------------------------------------------------------------------------
171
172/// Drop items with score below `min_score`. Preserves ordering.
173pub struct Threshold {
174    pub min_score: f32,
175}
176
177impl RankingLayer for Threshold {
178    fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
179        items.retain(|(_, score)| *score >= self.min_score);
180    }
181}
182
183/// Truncate to the top `k` items. Caller is responsible for ensuring
184/// the list is sorted descending by score before this layer runs;
185/// most boost layers re-sort internally so the typical pipeline
186/// order is `boosts...` → `Threshold` → `TopK`.
187pub struct TopK {
188    pub k: usize,
189}
190
191impl RankingLayer for TopK {
192    fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
193        if self.k > 0 {
194            items.truncate(self.k);
195        }
196    }
197}
198
199/// Cross-encoder rerank layer.
200///
201/// Replaces every result's `(chunk_idx, score)` similarity with a
202/// fresh score from the cross-encoder, then re-sorts. The reranker
203/// joins query and document text and runs a BERT-with-classifier
204/// forward pass per pair, which is structurally higher quality than
205/// the bi-encoder's dual-tower similarity but O(candidates) cost.
206///
207/// Construct via [`Self::new`]. Holds an [`Arc`] to a
208/// [`crate::rerank::Reranker`] so the layer chain can be cheaply
209/// cloned and the model isn't reloaded per call.
210///
211/// ## Auto-detect via `query`
212///
213/// The layer's score-rewrite is unconditional: if it's in the
214/// pipeline, it runs. Auto-detect (skip rerank for symbol-shaped
215/// queries) belongs at the call site that builds the layer chain,
216/// not here. The call site already knows the query; it can decide
217/// whether to push this layer into the chain at all.
218pub struct CrossEncoderRerank {
219    reranker: std::sync::Arc<crate::rerank::Reranker>,
220    query: String,
221    /// Cap on candidates the reranker sees. Cost is linear in this.
222    candidates: usize,
223    /// Blend factor between bi-encoder and cross-encoder scores.
224    /// `blend = 1.0`: pure cross-encoder (replace). `blend = 0.0`:
225    /// pure bi-encoder (rerank is a no-op). The default `0.7` puts
226    /// most weight on the cross-encoder while preserving the bi-
227    /// encoder's coarse ordering as a tiebreaker — important when
228    /// the cross-encoder's sigmoid scores are compressed near 0.5
229    /// (no candidate clearly relevant), the original bi-encoder
230    /// ordering shouldn't get blown up by 1% rerank-score noise.
231    blend: f32,
232}
233
234impl CrossEncoderRerank {
235    /// Build a rerank layer over `reranker` for `query`. Limits the
236    /// pool to `candidates` (typical: 100). Uses the default blend
237    /// factor of 0.7 (heavy cross-encoder, tiebroken by bi-encoder).
238    #[must_use]
239    pub fn new(
240        reranker: std::sync::Arc<crate::rerank::Reranker>,
241        query: String,
242        candidates: usize,
243    ) -> Self {
244        Self {
245            reranker,
246            query,
247            candidates,
248            blend: 0.7,
249        }
250    }
251
252    /// Override the bi/cross-encoder blend factor. `0.0` = pure
253    /// bi-encoder (rerank is a no-op), `1.0` = pure cross-encoder
254    /// (replace).
255    #[must_use]
256    pub fn with_blend(mut self, blend: f32) -> Self {
257        self.blend = blend.clamp(0.0, 1.0);
258        self
259    }
260}
261
262impl RankingLayer for CrossEncoderRerank {
263    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
264        // Cap to top-`candidates` before invoking the reranker. Cost
265        // is linear in the cap, so trimming is meaningful — even
266        // 100 vs 200 is a doubling.
267        if items.len() > self.candidates {
268            items.truncate(self.candidates);
269        }
270        if items.is_empty() {
271            return;
272        }
273        // Build `(query, doc_text)` pairs aligned with `items`.
274        // Out-of-range indices are dropped (shouldn't happen, but
275        // defensive against malformed input).
276        let pairs: Vec<(&str, &str)> = items
277            .iter()
278            .filter_map(|&(idx, _)| {
279                chunks
280                    .get(idx)
281                    .map(|c| (self.query.as_str(), c.content.as_str()))
282            })
283            .collect();
284        let Ok(scores) = self.reranker.score_pairs(&pairs) else {
285            // Rerank failed — leave the existing scores untouched.
286            // Logging happens at the call site; the layer is silent
287            // about errors since it has no logging context.
288            return;
289        };
290        // Min-max normalize both score arrays to [0, 1] within this
291        // candidate set, then blend. The reranker returns raw logits
292        // (sentence-transformers ms-marco config declares Identity
293        // activation), which span ~[-11, +5]; bi-encoder scores arrive
294        // here in the RRF-normalized [0, ~1] range. Without per-set
295        // normalization the magnitude-dominant signal wins by default
296        // — the `blend = 0.7` weighting is only meaningful when both
297        // sides live on the same scale. Min-max preserves ordering
298        // within each signal while making the blend a true convex
299        // combination.
300        let bi: Vec<f32> = items.iter().map(|&(_, s)| s).collect();
301        let cross_norm = min_max_normalize(&scores);
302        let bi_norm = min_max_normalize(&bi);
303        for ((item, &cross), &bi_n) in items.iter_mut().zip(cross_norm.iter()).zip(bi_norm.iter()) {
304            item.1 = self.blend * cross + (1.0 - self.blend) * bi_n;
305        }
306        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
307    }
308}
309
310/// Linear-rescale a score array into `[0, 1]` based on its own
311/// min/max. Used by [`CrossEncoderRerank`] to put raw cross-encoder
312/// logits and RRF bi-encoder scores on the same scale before
313/// convex-combining them. Degenerate inputs (empty or all-equal)
314/// return all-`0.5` so the layer still contributes a neutral signal
315/// rather than NaN/`0.0` collapse.
316fn min_max_normalize(xs: &[f32]) -> Vec<f32> {
317    if xs.is_empty() {
318        return Vec::new();
319    }
320    let mut lo = f32::INFINITY;
321    let mut hi = f32::NEG_INFINITY;
322    for &x in xs {
323        if x < lo {
324            lo = x;
325        }
326        if x > hi {
327            hi = x;
328        }
329    }
330    let span = hi - lo;
331    if span.abs() < f32::EPSILON {
332        return vec![0.5; xs.len()];
333    }
334    xs.iter().map(|&x| (x - lo) / span).collect()
335}
336
337#[cfg(test)]
338mod tests {
339    use super::*;
340
341    fn dummy_chunk(file: &str, name: &str) -> CodeChunk {
342        CodeChunk {
343            file_path: file.into(),
344            name: name.into(),
345            kind: "function".into(),
346            start_line: 1,
347            end_line: 10,
348            content: String::new(),
349            enriched_content: String::new(),
350        }
351    }
352
353    #[test]
354    fn threshold_drops_below_min() {
355        let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
356        let mut items = vec![(0, 0.9), (1, 0.3)];
357        Threshold { min_score: 0.5 }.apply(&mut items, &chunks);
358        assert_eq!(items, vec![(0, 0.9)]);
359    }
360
361    #[test]
362    fn topk_truncates() {
363        let chunks = vec![
364            dummy_chunk("a.rs", "f"),
365            dummy_chunk("b.rs", "g"),
366            dummy_chunk("c.rs", "h"),
367        ];
368        let mut items = vec![(0, 0.9), (1, 0.8), (2, 0.7)];
369        TopK { k: 2 }.apply(&mut items, &chunks);
370        assert_eq!(items, vec![(0, 0.9), (1, 0.8)]);
371    }
372
373    #[test]
374    fn topk_zero_keeps_all() {
375        let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
376        let mut items = vec![(0, 0.9), (1, 0.8)];
377        TopK { k: 0 }.apply(&mut items, &chunks);
378        assert_eq!(items.len(), 2);
379    }
380
381    #[test]
382    fn chain_runs_layers_in_order() {
383        // Three items: scores 1.0, 0.6, 0.3. Threshold at 0.5, then top 1.
384        let chunks = vec![
385            dummy_chunk("a.rs", "f"),
386            dummy_chunk("b.rs", "g"),
387            dummy_chunk("c.rs", "h"),
388        ];
389        let mut items = vec![(0, 1.0), (1, 0.6), (2, 0.3)];
390        let layers: Vec<Box<dyn RankingLayer>> = vec![
391            Box::new(Threshold { min_score: 0.5 }),
392            Box::new(TopK { k: 1 }),
393        ];
394        apply_chain(&mut items, &chunks, &layers);
395        assert_eq!(items, vec![(0, 1.0)]);
396    }
397
398    #[test]
399    fn pagerank_boost_layer_reorders() {
400        let chunks = vec![
401            dummy_chunk("important.rs", "a"),
402            dummy_chunk("obscure.rs", "b"),
403        ];
404        let mut items = vec![(0, 0.8), (1, 0.8)];
405        let mut pr = HashMap::new();
406        pr.insert("important.rs".to_string(), 1.0); // top percentile
407        pr.insert("obscure.rs".to_string(), 0.1); // bottom decile
408        PageRankBoost::new(std::sync::Arc::new(pr), 0.3).apply(&mut items, &chunks);
409        // important.rs should rank first after boost.
410        assert_eq!(items[0].0, 0);
411        assert!(items[0].1 > items[1].1);
412    }
413}