ski/rerank.rs
1//! Stage-2 cross-encoder reranking, gated on stage-1 ambiguity.
2//!
3//! The bi-encoder (stage 1, [`crate::rank`]) embeds query and skill description
4//! independently; its cosine scores pile into a muddy ~0.60 band where genuine
5//! matches and noise overlap, and it is confidently wrong on confusable pairs
6//! (canvas-design vs algorithmic-art, docx vs pdf). A cross-encoder reads the
7//! (prompt, skill) pair *jointly* and separates them: real matches score high,
8//! noise crashes well negative.
9//!
10//! It is far costlier than the bi-encoder (a second ONNX model load + inference
11//! on the hot path), so [`is_ambiguous`] gates it: a confident lone winner, or a
12//! prompt with nothing relevant, skips stage 2 entirely and pays nothing. Only
13//! the murky middle reaches the reranker.
14//!
15//! Feature-gated: without `fastembed`, [`rerank`] returns `None` and the caller
16//! keeps the stage-1 result — identical behaviour to before this stage existed.
17//!
18//! **Rejected experiment — mean-centering the bi-encoder space.** The classic
19//! anisotropy fix (subtract the corpus-mean embedding from the query and every
20//! skill vector before cosine, then renormalize) was implemented and measured
21//! against `examples/eval` across all three fixtures. It *did* sharpen stage 1 —
22//! stage-1 top-1 rose (e.g. 75% -> 84% on the anthropic set) and recall@`rerank_top_k`
23//! went 98% -> 100% (it recovered the one true retrieval miss) — but the final,
24//! post-rerank recall *regressed* ~3 points (93/106 -> 91/106) at equal false-inject,
25//! across a min_similarity sweep. The reason is the finding `examples/eval`'s
26//! recall@k instrumentation made explicit: retrieval is not the bottleneck (gold is
27//! almost always already in the top-k), so a sharper bi-encoder is largely redundant
28//! with this reranker, while the shifted cosine distribution disrupts the gate it
29//! feeds. Not worth the added complexity, the new persisted `mean`, and the forced
30//! reindex. Revisit only if the reranker is removed or the live distribution proves
31//! materially different from the eval corpus.
32
33use crate::config::Config;
34use crate::index::Index;
35use crate::rank::{cmp_score_desc, Hit};
36
/// Cosine "credit" the cross-encoder's confirmation is worth: a reranked
/// candidate may sit this far under stage-1's solo-injection floor
/// (`min_similarity`) and still inject.
///
/// Tuned on the realistic corpus *and* a live 56-skill library (see
/// `examples/eval`):
/// - a borderline real match the bi-encoder ranks at ~0.63 ("clean up this messy
///   CSV" -> xlsx, cosine 0.634) injects;
/// - the false-inject skills cluster lower (~0.57-0.59) and stay out.
///
/// Sweep: at this slack recall holds 95% with 2% false injects. A smaller slack
/// (floor 0.64) drops a positive; a larger one (floor 0.58) readmits a false
/// positive.
const AGREEMENT_SLACK: f32 = 0.03;
46
47/// Whether stage-1 results warrant the cross-encoder. Skip (return `false`) when:
48/// - nothing clears the recall floor (the prompt has no relevant skill), or
49/// - the top match is a confident lone winner: high absolute score *and* a clear
50/// gap to the runner-up.
51///
52/// Everything else — clustered peers, or a match stuck in the muddy band — is
53/// ambiguous and reranked. The gate is deliberately conservative (errs toward
54/// reranking) because the bi-encoder is confidently wrong on exactly the
55/// clustered cases, so only an unmistakable single winner is allowed to skip.
56pub fn is_ambiguous(hits: &[Hit], cfg: &Config) -> bool {
57 let Some(top) = hits.first() else {
58 return false;
59 };
60 if top.score < cfg.recall_floor {
61 return false; // nothing relevant; stage-1 floor rejects it anyway.
62 }
63 !confident_winner(hits, cfg)
64}
65
66/// Whether stage-1's top match is a *confident lone dense winner*: high absolute
67/// cosine *and* a clear gap to the runner-up. This is the one case the bi-encoder
68/// is trusted outright — it skips both the reranker and the lexical fast-path, so
69/// neither can override a strong dense match.
70///
71/// Confidence is measured on *cosine*, not the keyword-inflated `score`: a keyword
72/// boost (e.g. "commit" matching pre-commit-setup) can fake a high score and a
73/// clear gap, but that is precisely the noisy signal stage 2 exists to arbitrate,
74/// so it must never grant a skip.
75pub fn confident_winner(hits: &[Hit], cfg: &Config) -> bool {
76 if hits.is_empty() {
77 return false;
78 }
79 let mut cos: Vec<f32> = hits.iter().map(|h| h.cosine).collect();
80 cos.sort_by(|a, b| cmp_score_desc(*a, *b));
81 let c1 = cos[0];
82 let c2 = cos.get(1).copied().unwrap_or(0.0);
83 c1 >= cfg.high_conf && (c1 - c2) >= cfg.clear_gap
84}
85
86/// Rerank the top-`cfg.rerank_top_k` stage-1 candidates with the cross-encoder,
87/// returning them rescored on the reranker's (logit) scale and sorted descending.
88/// `Some` only with the `fastembed` feature and a usable model; `None` otherwise,
89/// so the caller falls back to the stage-1 ordering.
90///
91/// `cosine`/`keyword` on each returned [`Hit`] are preserved for display; `score`
92/// is replaced by the reranker logit. Callers must gate the result with the
93/// reranker thresholds ([`Config::rerank_min`] / [`Config::rerank_margin`]), not
94/// the bi-encoder ones — the scales differ.
95pub fn rerank(hits: &[Hit], idx: &Index, prompt: &str, cfg: &Config) -> Option<Vec<Hit>> {
96 #[cfg(feature = "fastembed")]
97 {
98 fast::rerank(hits, idx, prompt, cfg)
99 }
100 #[cfg(not(feature = "fastembed"))]
101 {
102 let _ = (hits, idx, prompt, cfg);
103 None
104 }
105}
106
107/// Apply the reranker-scale guardrails to a reranked candidate list: keep hits at
108/// or above `rerank_min` and within `rerank_margin` of the best reranked score.
109/// Returns hits sorted by descending reranked score (input order is preserved as
110/// it already is). The caller still applies deny/session/cap.
111///
112/// **Stage-1 agreement.** Before the reranker thresholds, a candidate must have a
113/// bi-encoder score (the preserved stage-1 [`Hit::stage1_score`]; [`rerank`] only
114/// overwrites `score` with the logit) within [`AGREEMENT_SLACK`] of stage-1's own
115/// injection floor (`min_similarity`). Every channel counts: the phrase term is
116/// included on purpose — a confident multi-token trigger match is exactly the
117/// "stage-1 judged relevant" signal this gate looks for, so it may carry an
118/// otherwise sub-floor cosine through; the context term rides along for the same
119/// reason. (The project term is in the sum too and — since it fires from
120/// [`crate::rank::PROJECT_GATE_SLACK`] below the floor — can likewise carry a
121/// near-floor ecosystem skill through, which is deliberate: the workspace's own
122/// ecosystem skill is exactly the recall ski should lean toward, and the reranker
123/// logit still gates it.) The cross-encoder's job is to reorder and confirm the *retrieved*
124/// relevant set, not to resurrect a skill stage-1 judged irrelevant. Without this
125/// gate a prompt with no real match — "implement the builder pattern in Java", "RSA
126/// key generation from scratch" — lets the reranker pull a sub-floor skill to the
127/// top and inject noise; the logits there interleave with genuine weak matches (so
128/// no `rerank_min` value separates them), but their stage-1 scores sit lower
129/// (~0.57-0.59 vs ~0.63 for borderline real matches).
130pub fn passes(reranked: &[Hit], cfg: &Config) -> Vec<Hit> {
131 let floor = cfg.min_similarity - AGREEMENT_SLACK;
132 // Keep only candidates stage-1 also rated relevant; the best *eligible* logit
133 // then anchors the relative margin (a sub-floor leader can't drag peers in).
134 let eligible: Vec<&Hit> = reranked
135 .iter()
136 .filter(|h| h.stage1_score() >= floor)
137 .collect();
138 let best = eligible
139 .first()
140 .map(|h| h.score)
141 .unwrap_or(f32::NEG_INFINITY);
142 eligible
143 .into_iter()
144 .filter(|h| h.score >= cfg.rerank_min && h.score >= best - cfg.rerank_margin)
145 .cloned()
146 .collect()
147}
148
/// `fastembed`-backed implementation of the stage-2 reranker. Compiled only with
/// the `fastembed` feature.
#[cfg(feature = "fastembed")]
mod fast {
    use super::*;
    use fastembed::{RerankInitOptions, RerankerModel, TextRerank};
    use std::sync::OnceLock;

    /// Process-wide reranker handle, constructed on first use.
    ///
    /// The reranker is expensive to construct, so it is built once per process.
    /// The hook is a short-lived process (one prompt), so this is effectively
    /// per-prompt, but `why`/tests that rerank many prompts pay the load only
    /// once. A failed construction is cached as `None`, so it is not retried
    /// within the process.
    fn model() -> Option<&'static TextRerank> {
        static MODEL: OnceLock<Option<TextRerank>> = OnceLock::new();
        MODEL
            .get_or_init(|| {
                // JINA turbo: on a realistic ~48-skill index it ties the 7x-larger
                // bge-reranker-base and jina-v2-base on top-1 accuracy and false-
                // injection rate, at a fraction of the load/latency cost. The gate
                // (`rerank_min`), not reranker size, is what controls noise here.
                TextRerank::try_new(
                    RerankInitOptions::new(RerankerModel::JINARerankerV1TurboEn)
                        .with_cache_dir(crate::paths::model_cache_dir())
                        .with_show_download_progress(false),
                )
                .ok()
            })
            .as_ref()
    }

    /// Score the first `cfg.rerank_top_k` entries of `hits` against `prompt` and
    /// return them with `score` replaced by the reranker's score, in the order
    /// the reranker reports them.
    ///
    /// Returns `None` — so the caller keeps the stage-1 ordering — when the model
    /// is unavailable, there are no candidates, inference fails, or the reranker
    /// hands back a result index that does not map to a candidate.
    pub fn rerank(hits: &[Hit], idx: &Index, prompt: &str, cfg: &Config) -> Option<Vec<Hit>> {
        let reranker = model()?;
        let cands: Vec<&Hit> = hits.iter().take(cfg.rerank_top_k).collect();
        if cands.is_empty() {
            return None;
        }
        // Document text comes from the index (description + cached body head), so
        // reranking touches no files — previously each candidate's SKILL.md was
        // re-read and re-parsed here on every reranked prompt. A candidate that
        // is missing from the index contributes an empty document.
        let docs: Vec<String> = cands
            .iter()
            .map(|h| {
                idx.get(&h.id)
                    .map(crate::index::Entry::doc_text)
                    .unwrap_or_default()
            })
            .collect();
        let results = reranker
            .rerank(prompt.to_string(), docs, false, None)
            .ok()?;
        // Results are sorted desc by score; map each back to its candidate Hit.
        // The lookup is checked rather than indexed: this runs on the prompt hot
        // path, so a malformed index must degrade to the stage-1 fallback (`None`)
        // instead of panicking.
        results
            .into_iter()
            .map(|r| {
                cands.get(r.index).copied().map(|src| Hit {
                    score: r.score,
                    ..src.clone()
                })
            })
            .collect()
    }
}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// Thresholds shared by every test. `min_similarity` is pinned explicitly
    /// instead of being inherited from `Config::default()`: the `passes` tests
    /// reason about a 0.30 floor, and they should not silently change meaning if
    /// the default moves.
    fn cfg() -> Config {
        Config {
            recall_floor: 0.45,
            high_conf: 0.75,
            clear_gap: 0.12,
            min_similarity: 0.30,
            rerank_min: -2.5,
            rerank_margin: 2.0,
            rerank_top_k: 12,
            ..Default::default()
        }
    }

    /// For `passes` tests, which gate on the reranker *logit* (`score`) while the
    /// stage-1 agreement filter reads `Hit::stage1_score()`: keep the two
    /// independent. Every channel other than `cosine` is zeroed, so `cosine` is
    /// the only stage-1 signal these hits carry.
    fn rhit(id: &str, logit: f32, cosine: f32) -> Hit {
        Hit {
            id: id.to_string(),
            name: id.to_string(),
            cosine,
            context: 0.0,
            file: 0.0,
            project: 0.0,
            keyword: 0.0,
            phrase: 0.0,
            score: logit,
        }
    }

    /// For `is_ambiguous` tests, which read `cosine`: model a hit whose cosine is
    /// its score (no keyword boost).
    fn hit(id: &str, score: f32) -> Hit {
        rhit(id, score, score)
    }

    /// Run `passes` and reduce the survivors to their ids, in order.
    fn passed_ids(reranked: &[Hit], cfg: &Config) -> Vec<String> {
        passes(reranked, cfg).into_iter().map(|h| h.id).collect()
    }

    #[test]
    fn nothing_relevant_is_not_ambiguous() {
        // Best below the recall floor -> skip the reranker.
        assert!(!is_ambiguous(&[hit("a", 0.40), hit("b", 0.38)], &cfg()));
    }

    #[test]
    fn confident_lone_winner_is_not_ambiguous() {
        // High top, clear gap -> skip.
        assert!(!is_ambiguous(&[hit("a", 0.82), hit("b", 0.60)], &cfg()));
    }

    #[test]
    fn clustered_peers_are_ambiguous() {
        // High but close together -> rerank (the confusable case).
        assert!(is_ambiguous(&[hit("a", 0.80), hit("b", 0.78)], &cfg()));
    }

    #[test]
    fn muddy_band_is_ambiguous() {
        // Above recall floor but below high-confidence -> rerank.
        assert!(is_ambiguous(&[hit("a", 0.62), hit("b", 0.55)], &cfg()));
    }

    #[test]
    fn empty_is_not_ambiguous() {
        assert!(!is_ambiguous(&[], &cfg()));
    }

    #[test]
    fn passes_keeps_top_and_rejects_negatives() {
        // Reranker scale: a strong match, a co-relevant peer, and noise. All three
        // cleared stage-1 (cosine above the pinned 0.30 floor), so only the logit
        // gates apply.
        let reranked = vec![
            rhit("a", 1.10, 0.80),
            rhit("b", -0.30, 0.70),
            rhit("c", -3.90, 0.65),
        ];
        // c is below rerank_min, and outside the margin.
        assert_eq!(passed_ids(&reranked, &cfg()), ["a", "b"]);
    }

    #[test]
    fn passes_drops_all_when_best_is_noise() {
        let reranked = vec![rhit("a", -2.83, 0.70), rhit("b", -3.94, 0.66)];
        assert!(passes(&reranked, &cfg()).is_empty()); // negative prompt -> nothing
    }

    #[test]
    fn passes_rejects_subfloor_stage1_resurrection() {
        // The reranker pulled a skill to the top (high logit) whose stage-1 score
        // (cosine 0.20) sits well below the agreement floor (min_similarity 0.30
        // minus the slack): it must not be injected, even though its logit clears
        // `rerank_min`. This is the over-injection the builder-pattern / RSA
        // negatives produced.
        let reranked = vec![rhit("ghost", 1.50, 0.20), rhit("real", 0.40, 0.72)];
        // ghost dropped on stage-1 disagreement.
        assert_eq!(passed_ids(&reranked, &cfg()), ["real"]);
    }

    #[test]
    fn passes_subfloor_leader_does_not_drag_in_peers() {
        // A sub-floor leader is dropped, and the relative margin is then anchored on
        // the best *eligible* skill — a trailing real skill outside the leader's
        // margin is still judged on its own.
        let cfg = cfg(); // rerank_margin 2.0
        let reranked = vec![rhit("ghost", 2.00, 0.20), rhit("real", -0.40, 0.72)];
        // kept: -0.40 >= rerank_min, anchors its own margin.
        assert_eq!(passed_ids(&reranked, &cfg), ["real"]);
    }
}