ski/lexical.rs
1//! Stage-1.5 lexical channel: BM25 over full skill descriptions.
2//!
3//! The dense bi-encoder embeds the (short, curated) description and the tiny
4//! cross-encoder reranker both miss lexically-obvious *indirect* matches — a
5//! prompt like "turn this sales spreadsheet into a chart" whose gold skill's
6//! description literally contains "spreadsheet", "chart", "formulas" but whose
7//! cosine sits in the muddy ~0.6 band and whose reranker logit falls below the
8//! abstention floor (so ski says nothing; the host's own chooser misses it too).
9//! BM25 over the description *prose* ranks those #1 reliably where the embeddings
10//! do not, because the discriminating vocabulary is literally in the prose.
11//!
12//! This is a high-precision **fast-path**, not another additive score term: a
13//! *dominant* BM25 winner (clears [`Config::lexical_min`] in absolute score AND
14//! beats the runner-up by [`Config::lexical_margin`]) is injected directly,
15//! skipping the reranker — mirroring the existing "confident lone winner skips
16//! rerank" gate ([`crate::rerank::confident_winner`]) but keyed on lexical
17//! certainty, which is reliable exactly where the bi-encoder cosine is not. A
18//! plain stage-1 score boost would not work: the reranker overwrites stage-1
19//! `score` with its logit and the agreement gate in [`crate::rerank::passes`]
20//! then rejects the still-sub-floor cosine.
21//!
22//! The fast-path only fires when stage-1 has *no* confident lone dense winner
23//! (the caller gates on [`crate::rerank::confident_winner`]); a strong dense
24//! match is never overridden. Off by default ([`Config::lexical_min`] `<= 0`).
25
26use crate::config::Config;
27use crate::index::Index;
28use crate::rank::cmp_score_desc;
29use crate::text::match_tokens;
30use std::collections::HashMap;
31
32/// BM25 term-frequency saturation. Standard Lucene/Elasticsearch default.
33const K1: f32 = 1.2;
34/// BM25 length-normalization. Standard default.
35const B: f32 = 0.75;
36/// Minimum distinct query content-tokens that must appear in the winning
37/// description for a lexical dominance to count. BM25 rewards a lone high-IDF term
38/// heavily, so a single rare word shared with an otherwise-unrelated description
39/// ("anthropic" -> brand-guidelines, "keyboard" -> accessibility) spikes a dominant
40/// score that is pure noise. A genuine indirect match overlaps the description on
41/// *several* terms ("chart" + "spreadsheet" + "formula"); requiring two independent
42/// hits is the precision gate that separates the rescues from the false injections.
43const MIN_TERM_OVERLAP: usize = 2;
44
45/// A skill's BM25 score against the prompt.
46#[derive(Clone, Debug)]
47pub struct Lex {
48 pub id: String,
49 pub score: f32,
50}
51
52/// BM25(prompt, description) for every skill, sorted by descending score.
53///
54/// Corpus statistics (document frequency, average length) are computed inline
55/// from the in-memory index every call — the descriptions are already stored, so
56/// no index-schema change is needed, and the corpus (tens to low-hundreds of
57/// short descriptions) is cheap to tokenize. Both prompt and descriptions are
58/// tokenized with [`match_tokens`] (stopword-stripped, singular-normalized), so
59/// glue words neither inflate a score nor pad a document's length, and trivial
60/// inflection ("charts" ↔ "chart") no longer defeats a verbatim-only match.
61pub fn scores(prompt: &str, idx: &Index) -> Vec<Lex> {
62 // Query terms as a set: a short prompt's repeats carry no extra signal, and
63 // de-duping keeps a doubled word from double-counting one description's hit.
64 let mut q: Vec<String> = match_tokens(prompt);
65 q.sort();
66 q.dedup();
67 if q.is_empty() || idx.skills.is_empty() {
68 return Vec::new();
69 }
70
71 // Per-document term frequencies and lengths (content tokens only).
72 let docs: Vec<HashMap<String, u32>> = idx
73 .skills
74 .iter()
75 .map(|e| {
76 let mut tf: HashMap<String, u32> = HashMap::new();
77 for t in match_tokens(&e.description) {
78 *tf.entry(t).or_insert(0) += 1;
79 }
80 tf
81 })
82 .collect();
83 let lens: Vec<f32> = docs
84 .iter()
85 .map(|d| d.values().sum::<u32>() as f32)
86 .collect();
87 let n = docs.len() as f32;
88 let avgdl = (lens.iter().sum::<f32>() / n).max(1.0);
89
90 // Document frequency, only for the query terms (all the IDF we need).
91 let idf: HashMap<&str, f32> = q
92 .iter()
93 .map(|t| {
94 let df = docs.iter().filter(|d| d.contains_key(t)).count() as f32;
95 // Lucene BM25 IDF: ln(1 + (N - df + 0.5)/(df + 0.5)) — always >= 0, so a
96 // term in most descriptions never drags a score negative.
97 let idf = (1.0 + (n - df + 0.5) / (df + 0.5)).ln();
98 (t.as_str(), idf)
99 })
100 .collect();
101
102 let mut out: Vec<Lex> = idx
103 .skills
104 .iter()
105 .enumerate()
106 .map(|(i, e)| {
107 let dl = lens[i];
108 let mut score = 0.0f32;
109 for t in &q {
110 let f = *docs[i].get(t).unwrap_or(&0) as f32;
111 if f == 0.0 {
112 continue;
113 }
114 let denom = f + K1 * (1.0 - B + B * dl / avgdl);
115 score += idf[t.as_str()] * (f * (K1 + 1.0)) / denom;
116 }
117 Lex {
118 id: e.id.clone(),
119 score,
120 }
121 })
122 .collect();
123 out.sort_by(|a, b| cmp_score_desc(a.score, b.score));
124 out
125}
126
127/// The dominant lexical winner, if one exists: the top-scoring skill, provided it
128/// clears `cfg.lexical_min` in absolute BM25 *and* beats the runner-up by at least
129/// `cfg.lexical_margin`. `None` when the channel is off (`lexical_min <= 0`), the
130/// prompt has no content tokens, or no skill stands clearly apart — the margin is
131/// what makes this high-precision, so a cluster of near-equal descriptions abstains
132/// and defers to the reranker.
133pub fn dominant(prompt: &str, idx: &Index, cfg: &Config) -> Option<Lex> {
134 if cfg.lexical_min <= 0.0 {
135 return None;
136 }
137 let ranked = scores(prompt, idx);
138 let top = ranked.first()?;
139 if top.score < cfg.lexical_min {
140 return None;
141 }
142 // Dominance is *over peers*: with no runner-up (a single-skill library) the
143 // margin gate would pass vacuously and the fast-path would fire on the sole
144 // skill for any two-term overlap. No peers -> no dominance -> defer.
145 let second = ranked.get(1).map(|l| l.score)?;
146 if top.score - second < cfg.lexical_margin {
147 return None;
148 }
149 // Precision guard: reject a dominance carried by a single high-IDF term. Count
150 // the distinct query content-tokens that actually appear in the winner's
151 // description; a real indirect match overlaps on at least `MIN_TERM_OVERLAP`.
152 let mut q: Vec<String> = match_tokens(prompt);
153 q.sort();
154 q.dedup();
155 let win_terms: std::collections::HashSet<String> = idx
156 .skills
157 .iter()
158 .find(|e| e.id == top.id)
159 .map(|e| match_tokens(&e.description).into_iter().collect())
160 .unwrap_or_default();
161 let overlap = q.iter().filter(|t| win_terms.contains(*t)).count();
162 if overlap < MIN_TERM_OVERLAP {
163 return None;
164 }
165 Some(top.clone())
166}
167
168#[cfg(test)]
169mod tests {
170 use super::*;
171 use crate::index::Entry;
172
173 fn entry(id: &str, description: &str) -> Entry {
174 Entry {
175 id: id.to_string(),
176 name: id.to_string(),
177 description: description.to_string(),
178 path: String::new(),
179 keywords: Vec::new(),
180 trigger_phrases: Vec::new(),
181 body_head: String::new(),
182 hash: String::new(),
183 embedding: Vec::new(),
184 }
185 }
186
187 fn index_of(entries: Vec<Entry>) -> Index {
188 Index {
189 model: "test".into(),
190 dim: 0,
191 skills: entries,
192 }
193 }
194
195 fn cfg(min: f32, margin: f32) -> Config {
196 Config {
197 lexical_min: min,
198 lexical_margin: margin,
199 ..Default::default()
200 }
201 }
202
203 #[test]
204 fn ranks_description_vocabulary_match_first() {
205 // The xlsx-style case: the indirect prompt's vocabulary lives in the gold
206 // description's prose, not in the others'.
207 let idx = index_of(vec![
208 entry(
209 "xlsx",
210 "create and edit spreadsheets, compute formulas, build charts",
211 ),
212 entry("pdf", "merge split and extract text from pdf documents"),
213 entry(
214 "docx",
215 "create and edit word documents with headings and tables",
216 ),
217 ]);
218 let ranked = scores(
219 "turn this sales spreadsheet into a chart with formulas",
220 &idx,
221 );
222 assert_eq!(ranked[0].id, "xlsx");
223 assert!(ranked[0].score > ranked[1].score);
224 }
225
226 #[test]
227 fn matches_across_plural_inflection() {
228 // Both sides tokenize through `match_tokens`, so a plural prompt still
229 // hits a singular description (and vice versa).
230 let idx = index_of(vec![
231 entry("xlsx", "edit a spreadsheet, charts and formulas"),
232 entry("pdf", "merge split and extract text from pdf documents"),
233 ]);
234 let ranked = scores("compute formulas across my spreadsheets", &idx);
235 assert_eq!(ranked[0].id, "xlsx");
236 assert!(ranked[0].score > 0.0);
237 }
238
239 #[test]
240 fn dominant_requires_absolute_floor() {
241 let idx = index_of(vec![
242 entry("xlsx", "edit a spreadsheet, charts and formulas"),
243 entry("pdf", "pdf documents"),
244 ]);
245 // A real match exists and stands apart, but an absurd floor rejects it.
246 assert!(dominant("edit my spreadsheet", &idx, &cfg(100.0, 0.5)).is_none());
247 // A reachable floor accepts it.
248 let win = dominant("edit my spreadsheet", &idx, &cfg(0.5, 0.5)).unwrap();
249 assert_eq!(win.id, "xlsx");
250 }
251
252 #[test]
253 fn dominant_requires_margin_over_runner_up() {
254 // Two descriptions share the query term equally -> no clear winner -> abstain.
255 let idx = index_of(vec![
256 entry("a", "process the report data"),
257 entry("b", "process the report data"),
258 ]);
259 assert!(dominant("process the report", &idx, &cfg(0.1, 0.5)).is_none());
260 }
261
262 #[test]
263 fn dominant_rejects_single_term_match() {
264 // The false-injection pattern: one rare query word ("anthropic") hits one
265 // otherwise-unrelated description and BM25 spikes it into a dominant winner.
266 // A single-term overlap must abstain even though the floor and margin pass.
267 let idx = index_of(vec![
268 entry(
269 "brand-guidelines",
270 "apply anthropic brand colors to artifacts",
271 ),
272 entry("pdf", "merge and split pdf documents"),
273 ]);
274 assert!(dominant(
275 "who founded anthropic and in what year",
276 &idx,
277 &cfg(0.1, 0.1)
278 )
279 .is_none());
280 // Two real overlapping terms clear the guard.
281 let idx2 = index_of(vec![
282 entry("xlsx", "edit a spreadsheet, charts and formulas"),
283 entry("pdf", "pdf documents"),
284 ]);
285 let win = dominant("edit the spreadsheet formulas", &idx2, &cfg(0.1, 0.1)).unwrap();
286 assert_eq!(win.id, "xlsx");
287 }
288
289 #[test]
290 fn single_skill_library_is_never_dominant() {
291 // With no runner-up the margin gate would pass vacuously; the sole skill
292 // must not ride the fast-path on any two-term overlap.
293 let idx = index_of(vec![entry(
294 "xlsx",
295 "edit a spreadsheet, charts and formulas",
296 )]);
297 assert!(dominant("edit the spreadsheet formulas", &idx, &cfg(0.1, 0.1)).is_none());
298 }
299
300 #[test]
301 fn dominant_off_when_min_non_positive() {
302 let idx = index_of(vec![entry("xlsx", "spreadsheets charts")]);
303 assert!(dominant("spreadsheet", &idx, &cfg(0.0, 0.5)).is_none());
304 }
305
306 #[test]
307 fn empty_prompt_or_index_is_none() {
308 let idx = index_of(vec![entry("xlsx", "spreadsheets")]);
309 assert!(scores("", &idx).is_empty());
310 assert!(scores("the an of to", &idx).is_empty()); // all stopwords
311 assert!(scores("spreadsheet", &index_of(vec![])).is_empty());
312 }
313}