Skip to main content

ski/
rerank.rs

1//! Stage-2 cross-encoder reranking, gated on stage-1 ambiguity.
2//!
3//! The bi-encoder (stage 1, [`crate::rank`]) embeds query and skill description
4//! independently; its cosine scores pile into a muddy ~0.60 band where genuine
5//! matches and noise overlap, and it is confidently wrong on confusable pairs
6//! (canvas-design vs algorithmic-art, docx vs pdf). A cross-encoder reads the
7//! (prompt, skill) pair *jointly* and separates them: real matches score high,
8//! noise crashes well negative.
9//!
10//! It is far costlier than the bi-encoder (a second ONNX model load + inference
11//! on the hot path), so [`is_ambiguous`] gates it: a confident lone winner, or a
12//! prompt with nothing relevant, skips stage 2 entirely and pays nothing. Only
13//! the murky middle reaches the reranker.
14//!
15//! Feature-gated: without `fastembed`, [`rerank`] returns `None` and the caller
16//! keeps the stage-1 result — identical behaviour to before this stage existed.
17//!
18//! **Rejected experiment — mean-centering the bi-encoder space.** The classic
19//! anisotropy fix (subtract the corpus-mean embedding from the query and every
20//! skill vector before cosine, then renormalize) was implemented and measured
21//! against `examples/eval` across all three fixtures. It *did* sharpen stage 1 —
22//! stage-1 top-1 rose (e.g. 75% -> 84% on the anthropic set) and recall@`rerank_top_k`
23//! went 98% -> 100% (it recovered the one true retrieval miss) — but the final,
24//! post-rerank recall *regressed* ~3 points (93/106 -> 91/106) at equal false-inject,
25//! across a min_similarity sweep. The reason is the finding `examples/eval`'s
26//! recall@k instrumentation made explicit: retrieval is not the bottleneck (gold is
27//! almost always already in the top-k), so a sharper bi-encoder is largely redundant
28//! with this reranker, while the shifted cosine distribution disrupts the gate it
29//! feeds. Not worth the added complexity, the new persisted `mean`, and the forced
30//! reindex. Revisit only if the reranker is removed or the live distribution proves
31//! materially different from the eval corpus.
32
33use crate::config::Config;
34use crate::index::Index;
35use crate::rank::{cmp_score_desc, Hit};
36
/// Cosine "credit" that a cross-encoder confirmation is worth: the distance
/// below stage-1's solo-injection floor (`min_similarity`) at which a reranked
/// candidate is still allowed to inject.
///
/// Tuning notes (realistic corpus plus a live 56-skill library; see
/// `examples/eval`):
/// - a borderline real match the bi-encoder places around 0.63 ("clean up this
///   messy CSV" -> xlsx, cosine 0.634) gets in;
/// - the false-inject skills cluster lower, roughly 0.57-0.59, and stay out;
/// - sweep at this value: recall holds 95% with 2% false injects. A tighter
///   slack (floor 0.64) loses a positive; a looser one (floor 0.58) readmits a
///   false positive.
const AGREEMENT_SLACK: f32 = 0.03;
46
47/// Whether stage-1 results warrant the cross-encoder. Skip (return `false`) when:
48/// - nothing clears the recall floor (the prompt has no relevant skill), or
49/// - the top match is a confident lone winner: high absolute score *and* a clear
50///   gap to the runner-up.
51///
52/// Everything else — clustered peers, or a match stuck in the muddy band — is
53/// ambiguous and reranked. The gate is deliberately conservative (errs toward
54/// reranking) because the bi-encoder is confidently wrong on exactly the
55/// clustered cases, so only an unmistakable single winner is allowed to skip.
56pub fn is_ambiguous(hits: &[Hit], cfg: &Config) -> bool {
57    let Some(top) = hits.first() else {
58        return false;
59    };
60    if top.score < cfg.recall_floor {
61        return false; // nothing relevant; stage-1 floor rejects it anyway.
62    }
63    !confident_winner(hits, cfg)
64}
65
66/// Whether stage-1's top match is a *confident lone dense winner*: high absolute
67/// cosine *and* a clear gap to the runner-up. This is the one case the bi-encoder
68/// is trusted outright — it skips both the reranker and the lexical fast-path, so
69/// neither can override a strong dense match.
70///
71/// Confidence is measured on *cosine*, not the keyword-inflated `score`: a keyword
72/// boost (e.g. "commit" matching pre-commit-setup) can fake a high score and a
73/// clear gap, but that is precisely the noisy signal stage 2 exists to arbitrate,
74/// so it must never grant a skip.
75pub fn confident_winner(hits: &[Hit], cfg: &Config) -> bool {
76    if hits.is_empty() {
77        return false;
78    }
79    let mut cos: Vec<f32> = hits.iter().map(|h| h.cosine).collect();
80    cos.sort_by(|a, b| cmp_score_desc(*a, *b));
81    let c1 = cos[0];
82    let c2 = cos.get(1).copied().unwrap_or(0.0);
83    c1 >= cfg.high_conf && (c1 - c2) >= cfg.clear_gap
84}
85
86/// Rerank the top-`cfg.rerank_top_k` stage-1 candidates with the cross-encoder,
87/// returning them rescored on the reranker's (logit) scale and sorted descending.
88/// `Some` only with the `fastembed` feature and a usable model; `None` otherwise,
89/// so the caller falls back to the stage-1 ordering.
90///
91/// `cosine`/`keyword` on each returned [`Hit`] are preserved for display; `score`
92/// is replaced by the reranker logit. Callers must gate the result with the
93/// reranker thresholds ([`Config::rerank_min`] / [`Config::rerank_margin`]), not
94/// the bi-encoder ones — the scales differ.
95pub fn rerank(hits: &[Hit], idx: &Index, prompt: &str, cfg: &Config) -> Option<Vec<Hit>> {
96    #[cfg(feature = "fastembed")]
97    {
98        fast::rerank(hits, idx, prompt, cfg)
99    }
100    #[cfg(not(feature = "fastembed"))]
101    {
102        let _ = (hits, idx, prompt, cfg);
103        None
104    }
105}
106
107/// Apply the reranker-scale guardrails to a reranked candidate list: keep hits at
108/// or above `rerank_min` and within `rerank_margin` of the best reranked score.
109/// Returns hits sorted by descending reranked score (input order is preserved as
110/// it already is). The caller still applies deny/session/cap.
111///
112/// **Stage-1 agreement.** Before the reranker thresholds, a candidate must have a
113/// bi-encoder score (the preserved stage-1 [`Hit::stage1_score`]; [`rerank`] only
114/// overwrites `score` with the logit) within [`AGREEMENT_SLACK`] of stage-1's own
115/// injection floor (`min_similarity`). Every channel counts: the phrase term is
116/// included on purpose — a confident multi-token trigger match is exactly the
117/// "stage-1 judged relevant" signal this gate looks for, so it may carry an
118/// otherwise sub-floor cosine through; the context term rides along for the same
119/// reason. (The project term is in the sum too and — since it fires from
120/// [`crate::rank::PROJECT_GATE_SLACK`] below the floor — can likewise carry a
121/// near-floor ecosystem skill through, which is deliberate: the workspace's own
122/// ecosystem skill is exactly the recall ski should lean toward, and the reranker
123/// logit still gates it.) The cross-encoder's job is to reorder and confirm the *retrieved*
124/// relevant set, not to resurrect a skill stage-1 judged irrelevant. Without this
125/// gate a prompt with no real match — "implement the builder pattern in Java", "RSA
126/// key generation from scratch" — lets the reranker pull a sub-floor skill to the
127/// top and inject noise; the logits there interleave with genuine weak matches (so
128/// no `rerank_min` value separates them), but their stage-1 scores sit lower
129/// (~0.57-0.59 vs ~0.63 for borderline real matches).
130pub fn passes(reranked: &[Hit], cfg: &Config) -> Vec<Hit> {
131    let floor = cfg.min_similarity - AGREEMENT_SLACK;
132    // Keep only candidates stage-1 also rated relevant; the best *eligible* logit
133    // then anchors the relative margin (a sub-floor leader can't drag peers in).
134    let eligible: Vec<&Hit> = reranked
135        .iter()
136        .filter(|h| h.stage1_score() >= floor)
137        .collect();
138    let best = eligible
139        .first()
140        .map(|h| h.score)
141        .unwrap_or(f32::NEG_INFINITY);
142    eligible
143        .into_iter()
144        .filter(|h| h.score >= cfg.rerank_min && h.score >= best - cfg.rerank_margin)
145        .cloned()
146        .collect()
147}
148
/// Cross-encoder backend, compiled only with the `fastembed` feature.
#[cfg(feature = "fastembed")]
mod fast {
    use super::*;
    use fastembed::{RerankInitOptions, RerankerModel, TextRerank};
    use std::sync::OnceLock;

    /// Process-wide reranker instance, built on first use.
    ///
    /// The reranker is expensive to construct, so it is built once per process.
    /// The hook is a short-lived process (one prompt), so this is effectively
    /// per-prompt, but `why`/tests that rerank many prompts pay the load only
    /// once. A failed construction is cached as `None` as well, so it is not
    /// retried within the same process.
    fn model() -> Option<&'static TextRerank> {
        static MODEL: OnceLock<Option<TextRerank>> = OnceLock::new();
        MODEL
            .get_or_init(|| {
                // JINA turbo: on a realistic ~48-skill index it ties the 7x-larger
                // bge-reranker-base and jina-v2-base on top-1 accuracy and false-
                // injection rate, at a fraction of the load/latency cost. The gate
                // (`rerank_min`), not reranker size, is what controls noise here.
                TextRerank::try_new(
                    RerankInitOptions::new(RerankerModel::JINARerankerV1TurboEn)
                        .with_cache_dir(crate::paths::model_cache_dir())
                        .with_show_download_progress(false),
                )
                .ok()
            })
            .as_ref()
    }

    /// Rescore the first `cfg.rerank_top_k` entries of `hits` against `prompt`.
    ///
    /// Returns `None` — so the caller keeps the stage-1 ordering — when the
    /// model is unavailable, there are no candidates, inference fails, or the
    /// reranker reports an index outside the candidate list. Otherwise each
    /// returned [`Hit`] is a clone of its stage-1 candidate with `score`
    /// replaced by the reranker score, in the order the reranker returned them.
    pub fn rerank(hits: &[Hit], idx: &Index, prompt: &str, cfg: &Config) -> Option<Vec<Hit>> {
        let reranker = model()?;
        let cands: Vec<&Hit> = hits.iter().take(cfg.rerank_top_k).collect();
        if cands.is_empty() {
            return None;
        }
        // Document text comes from the index (description + cached body head), so
        // reranking touches no files. A candidate missing from the index is
        // scored against empty text rather than dropped, which keeps `docs`
        // aligned one-to-one with `cands`.
        let docs: Vec<String> = cands
            .iter()
            .map(|h| {
                idx.get(&h.id)
                    .map(crate::index::Entry::doc_text)
                    .unwrap_or_default()
            })
            .collect();
        let results = reranker
            .rerank(prompt.to_string(), docs, false, None)
            .ok()?;
        // Results are expected sorted desc by score; map each back to its
        // candidate Hit. The lookup is checked: an out-of-range index falls
        // back to stage 1 (`None`) instead of panicking on the prompt path.
        results
            .into_iter()
            .map(|r| {
                let src = cands.get(r.index).copied()?;
                Some(Hit {
                    score: r.score,
                    ..src.clone()
                })
            })
            .collect()
    }
}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// Config with explicit gate/reranker thresholds; everything else default.
    fn cfg() -> Config {
        Config {
            recall_floor: 0.45,
            high_conf: 0.75,
            clear_gap: 0.12,
            rerank_min: -2.5,
            rerank_margin: 2.0,
            rerank_top_k: 12,
            ..Default::default()
        }
    }

    /// A reranked hit: `score` carries the reranker logit while `cosine` is set
    /// independently, with every other stage-1 channel zeroed. `passes` gates on
    /// the logit and on `stage1_score()` — presumably equal to `cosine` here,
    /// since the remaining channels are all 0.0.
    fn rhit(id: &str, logit: f32, cosine: f32) -> Hit {
        let id = id.to_string();
        Hit {
            name: id.clone(),
            id,
            cosine,
            context: 0.0,
            file: 0.0,
            project: 0.0,
            keyword: 0.0,
            phrase: 0.0,
            score: logit,
        }
    }

    /// A stage-1 hit with no boosts: `score` and `cosine` are the same value, so
    /// the recall-floor check (on `score`) and the confidence check (on
    /// `cosine`) in `is_ambiguous` see one number.
    fn hit(id: &str, score: f32) -> Hit {
        rhit(id, score, score)
    }

    /// Ids that survive `passes` under the test config, in output order.
    fn passed_ids(reranked: &[Hit]) -> Vec<String> {
        passes(reranked, &cfg())
            .into_iter()
            .map(|h| h.id)
            .collect()
    }

    #[test]
    fn nothing_relevant_is_not_ambiguous() {
        // Best below the recall floor -> skip the reranker.
        let hits = [hit("a", 0.40), hit("b", 0.38)];
        assert!(!is_ambiguous(&hits, &cfg()));
    }

    #[test]
    fn confident_lone_winner_is_not_ambiguous() {
        // High top, clear gap -> skip.
        let hits = [hit("a", 0.82), hit("b", 0.60)];
        assert!(!is_ambiguous(&hits, &cfg()));
    }

    #[test]
    fn clustered_peers_are_ambiguous() {
        // High but close together -> rerank (the confusable case).
        let hits = [hit("a", 0.80), hit("b", 0.78)];
        assert!(is_ambiguous(&hits, &cfg()));
    }

    #[test]
    fn muddy_band_is_ambiguous() {
        // Above recall floor but below high-confidence -> rerank.
        let hits = [hit("a", 0.62), hit("b", 0.55)];
        assert!(is_ambiguous(&hits, &cfg()));
    }

    #[test]
    fn empty_is_not_ambiguous() {
        assert!(!is_ambiguous(&[], &cfg()));
    }

    #[test]
    fn passes_keeps_top_and_rejects_negatives() {
        // Reranker scale: a strong match, a co-relevant peer, and noise. All
        // three are meant to clear the stage-1 agreement floor, so only the
        // logit gates decide.
        let reranked = vec![
            rhit("a", 1.10, 0.80),
            rhit("b", -0.30, 0.70),
            rhit("c", -3.90, 0.65),
        ];
        // c is below rerank_min, and outside the margin.
        assert_eq!(passed_ids(&reranked), ["a", "b"]);
    }

    #[test]
    fn passes_drops_all_when_best_is_noise() {
        // Negative prompt: even the best logit is under rerank_min -> nothing.
        let reranked = vec![rhit("a", -2.83, 0.70), rhit("b", -3.94, 0.66)];
        assert!(passes(&reranked, &cfg()).is_empty());
    }

    #[test]
    fn passes_rejects_subfloor_stage1_resurrection() {
        // The reranker pulled a skill to the top (high logit) whose stage-1
        // score (cosine 0.20) sits well below the agreement floor
        // (`min_similarity` minus the slack). It must not be injected even
        // though its logit clears `rerank_min` — this is the over-injection the
        // builder-pattern / RSA negatives produced.
        let reranked = vec![rhit("ghost", 1.50, 0.20), rhit("real", 0.40, 0.72)];
        // ghost dropped on stage-1 disagreement.
        assert_eq!(passed_ids(&reranked), ["real"]);
    }

    #[test]
    fn passes_subfloor_leader_does_not_drag_in_peers() {
        // A sub-floor leader is dropped, and the relative margin (2.0 here) is
        // then anchored on the best *eligible* skill — a trailing real skill
        // outside the leader's margin is still judged on its own.
        let reranked = vec![rhit("ghost", 2.00, 0.20), rhit("real", -0.40, 0.72)];
        // Kept: -0.40 >= rerank_min, and it anchors its own margin.
        assert_eq!(passed_ids(&reranked), ["real"]);
    }
}