Skip to main content

ski/
lexical.rs

1//! Stage-1.5 lexical channel: BM25 over full skill descriptions.
2//!
3//! The dense bi-encoder embeds the (short, curated) description and the tiny
4//! cross-encoder reranker both miss lexically-obvious *indirect* matches — a
5//! prompt like "turn this sales spreadsheet into a chart" whose gold skill's
6//! description literally contains "spreadsheet", "chart", "formulas" but whose
7//! cosine sits in the muddy ~0.6 band and whose reranker logit falls below the
8//! abstention floor (so ski says nothing; the host's own chooser misses it too).
9//! BM25 over the description *prose* ranks those #1 reliably where the embeddings
10//! do not, because the discriminating vocabulary is literally in the prose.
11//!
12//! This is a high-precision **fast-path**, not another additive score term: a
13//! *dominant* BM25 winner (clears [`Config::lexical_min`] in absolute score AND
14//! beats the runner-up by [`Config::lexical_margin`]) is injected directly,
15//! skipping the reranker — mirroring the existing "confident lone winner skips
16//! rerank" gate ([`crate::rerank::confident_winner`]) but keyed on lexical
17//! certainty, which is reliable exactly where the bi-encoder cosine is not. A
18//! plain stage-1 score boost would not work: the reranker overwrites stage-1
19//! `score` with its logit and the agreement gate in [`crate::rerank::passes`]
20//! then rejects the still-sub-floor cosine.
21//!
22//! The fast-path only fires when stage-1 has *no* confident lone dense winner
23//! (the caller gates on [`crate::rerank::confident_winner`]); a strong dense
24//! match is never overridden. Off by default ([`Config::lexical_min`] `<= 0`).
25
26use crate::config::Config;
27use crate::index::Index;
28use crate::rank::cmp_score_desc;
29use crate::text::match_tokens;
30use std::collections::HashMap;
31
32/// BM25 term-frequency saturation. Standard Lucene/Elasticsearch default.
33const K1: f32 = 1.2;
34/// BM25 length-normalization. Standard default.
35const B: f32 = 0.75;
36/// Minimum distinct query content-tokens that must appear in the winning
37/// description for a lexical dominance to count. BM25 rewards a lone high-IDF term
38/// heavily, so a single rare word shared with an otherwise-unrelated description
39/// ("anthropic" -> brand-guidelines, "keyboard" -> accessibility) spikes a dominant
40/// score that is pure noise. A genuine indirect match overlaps the description on
41/// *several* terms ("chart" + "spreadsheet" + "formula"); requiring two independent
42/// hits is the precision gate that separates the rescues from the false injections.
43const MIN_TERM_OVERLAP: usize = 2;
44
45/// A skill's BM25 score against the prompt.
46#[derive(Clone, Debug)]
47pub struct Lex {
48    pub id: String,
49    pub score: f32,
50}
51
52/// BM25(prompt, description) for every skill, sorted by descending score.
53///
54/// Corpus statistics (document frequency, average length) are computed inline
55/// from the in-memory index every call — the descriptions are already stored, so
56/// no index-schema change is needed, and the corpus (tens to low-hundreds of
57/// short descriptions) is cheap to tokenize. Both prompt and descriptions are
58/// tokenized with [`match_tokens`] (stopword-stripped, singular-normalized), so
59/// glue words neither inflate a score nor pad a document's length, and trivial
60/// inflection ("charts" ↔ "chart") no longer defeats a verbatim-only match.
61pub fn scores(prompt: &str, idx: &Index) -> Vec<Lex> {
62    // Query terms as a set: a short prompt's repeats carry no extra signal, and
63    // de-duping keeps a doubled word from double-counting one description's hit.
64    let mut q: Vec<String> = match_tokens(prompt);
65    q.sort();
66    q.dedup();
67    if q.is_empty() || idx.skills.is_empty() {
68        return Vec::new();
69    }
70
71    // Per-document term frequencies and lengths (content tokens only).
72    let docs: Vec<HashMap<String, u32>> = idx
73        .skills
74        .iter()
75        .map(|e| {
76            let mut tf: HashMap<String, u32> = HashMap::new();
77            for t in match_tokens(&e.description) {
78                *tf.entry(t).or_insert(0) += 1;
79            }
80            tf
81        })
82        .collect();
83    let lens: Vec<f32> = docs
84        .iter()
85        .map(|d| d.values().sum::<u32>() as f32)
86        .collect();
87    let n = docs.len() as f32;
88    let avgdl = (lens.iter().sum::<f32>() / n).max(1.0);
89
90    // Document frequency, only for the query terms (all the IDF we need).
91    let idf: HashMap<&str, f32> = q
92        .iter()
93        .map(|t| {
94            let df = docs.iter().filter(|d| d.contains_key(t)).count() as f32;
95            // Lucene BM25 IDF: ln(1 + (N - df + 0.5)/(df + 0.5)) — always >= 0, so a
96            // term in most descriptions never drags a score negative.
97            let idf = (1.0 + (n - df + 0.5) / (df + 0.5)).ln();
98            (t.as_str(), idf)
99        })
100        .collect();
101
102    let mut out: Vec<Lex> = idx
103        .skills
104        .iter()
105        .enumerate()
106        .map(|(i, e)| {
107            let dl = lens[i];
108            let mut score = 0.0f32;
109            for t in &q {
110                let f = *docs[i].get(t).unwrap_or(&0) as f32;
111                if f == 0.0 {
112                    continue;
113                }
114                let denom = f + K1 * (1.0 - B + B * dl / avgdl);
115                score += idf[t.as_str()] * (f * (K1 + 1.0)) / denom;
116            }
117            Lex {
118                id: e.id.clone(),
119                score,
120            }
121        })
122        .collect();
123    out.sort_by(|a, b| cmp_score_desc(a.score, b.score));
124    out
125}
126
127/// The dominant lexical winner, if one exists: the top-scoring skill, provided it
128/// clears `cfg.lexical_min` in absolute BM25 *and* beats the runner-up by at least
129/// `cfg.lexical_margin`. `None` when the channel is off (`lexical_min <= 0`), the
130/// prompt has no content tokens, or no skill stands clearly apart — the margin is
131/// what makes this high-precision, so a cluster of near-equal descriptions abstains
132/// and defers to the reranker.
133pub fn dominant(prompt: &str, idx: &Index, cfg: &Config) -> Option<Lex> {
134    if cfg.lexical_min <= 0.0 {
135        return None;
136    }
137    let ranked = scores(prompt, idx);
138    let top = ranked.first()?;
139    if top.score < cfg.lexical_min {
140        return None;
141    }
142    // Dominance is *over peers*: with no runner-up (a single-skill library) the
143    // margin gate would pass vacuously and the fast-path would fire on the sole
144    // skill for any two-term overlap. No peers -> no dominance -> defer.
145    let second = ranked.get(1).map(|l| l.score)?;
146    if top.score - second < cfg.lexical_margin {
147        return None;
148    }
149    // Precision guard: reject a dominance carried by a single high-IDF term. Count
150    // the distinct query content-tokens that actually appear in the winner's
151    // description; a real indirect match overlaps on at least `MIN_TERM_OVERLAP`.
152    let mut q: Vec<String> = match_tokens(prompt);
153    q.sort();
154    q.dedup();
155    let win_terms: std::collections::HashSet<String> = idx
156        .skills
157        .iter()
158        .find(|e| e.id == top.id)
159        .map(|e| match_tokens(&e.description).into_iter().collect())
160        .unwrap_or_default();
161    let overlap = q.iter().filter(|t| win_terms.contains(*t)).count();
162    if overlap < MIN_TERM_OVERLAP {
163        return None;
164    }
165    Some(top.clone())
166}
167
168#[cfg(test)]
169mod tests {
170    use super::*;
171    use crate::index::Entry;
172
173    fn entry(id: &str, description: &str) -> Entry {
174        Entry {
175            id: id.to_string(),
176            name: id.to_string(),
177            description: description.to_string(),
178            path: String::new(),
179            keywords: Vec::new(),
180            trigger_phrases: Vec::new(),
181            body_head: String::new(),
182            hash: String::new(),
183            embedding: Vec::new(),
184        }
185    }
186
187    fn index_of(entries: Vec<Entry>) -> Index {
188        Index {
189            model: "test".into(),
190            dim: 0,
191            skills: entries,
192        }
193    }
194
195    fn cfg(min: f32, margin: f32) -> Config {
196        Config {
197            lexical_min: min,
198            lexical_margin: margin,
199            ..Default::default()
200        }
201    }
202
203    #[test]
204    fn ranks_description_vocabulary_match_first() {
205        // The xlsx-style case: the indirect prompt's vocabulary lives in the gold
206        // description's prose, not in the others'.
207        let idx = index_of(vec![
208            entry(
209                "xlsx",
210                "create and edit spreadsheets, compute formulas, build charts",
211            ),
212            entry("pdf", "merge split and extract text from pdf documents"),
213            entry(
214                "docx",
215                "create and edit word documents with headings and tables",
216            ),
217        ]);
218        let ranked = scores(
219            "turn this sales spreadsheet into a chart with formulas",
220            &idx,
221        );
222        assert_eq!(ranked[0].id, "xlsx");
223        assert!(ranked[0].score > ranked[1].score);
224    }
225
226    #[test]
227    fn matches_across_plural_inflection() {
228        // Both sides tokenize through `match_tokens`, so a plural prompt still
229        // hits a singular description (and vice versa).
230        let idx = index_of(vec![
231            entry("xlsx", "edit a spreadsheet, charts and formulas"),
232            entry("pdf", "merge split and extract text from pdf documents"),
233        ]);
234        let ranked = scores("compute formulas across my spreadsheets", &idx);
235        assert_eq!(ranked[0].id, "xlsx");
236        assert!(ranked[0].score > 0.0);
237    }
238
239    #[test]
240    fn dominant_requires_absolute_floor() {
241        let idx = index_of(vec![
242            entry("xlsx", "edit a spreadsheet, charts and formulas"),
243            entry("pdf", "pdf documents"),
244        ]);
245        // A real match exists and stands apart, but an absurd floor rejects it.
246        assert!(dominant("edit my spreadsheet", &idx, &cfg(100.0, 0.5)).is_none());
247        // A reachable floor accepts it.
248        let win = dominant("edit my spreadsheet", &idx, &cfg(0.5, 0.5)).unwrap();
249        assert_eq!(win.id, "xlsx");
250    }
251
252    #[test]
253    fn dominant_requires_margin_over_runner_up() {
254        // Two descriptions share the query term equally -> no clear winner -> abstain.
255        let idx = index_of(vec![
256            entry("a", "process the report data"),
257            entry("b", "process the report data"),
258        ]);
259        assert!(dominant("process the report", &idx, &cfg(0.1, 0.5)).is_none());
260    }
261
262    #[test]
263    fn dominant_rejects_single_term_match() {
264        // The false-injection pattern: one rare query word ("anthropic") hits one
265        // otherwise-unrelated description and BM25 spikes it into a dominant winner.
266        // A single-term overlap must abstain even though the floor and margin pass.
267        let idx = index_of(vec![
268            entry(
269                "brand-guidelines",
270                "apply anthropic brand colors to artifacts",
271            ),
272            entry("pdf", "merge and split pdf documents"),
273        ]);
274        assert!(dominant(
275            "who founded anthropic and in what year",
276            &idx,
277            &cfg(0.1, 0.1)
278        )
279        .is_none());
280        // Two real overlapping terms clear the guard.
281        let idx2 = index_of(vec![
282            entry("xlsx", "edit a spreadsheet, charts and formulas"),
283            entry("pdf", "pdf documents"),
284        ]);
285        let win = dominant("edit the spreadsheet formulas", &idx2, &cfg(0.1, 0.1)).unwrap();
286        assert_eq!(win.id, "xlsx");
287    }
288
289    #[test]
290    fn single_skill_library_is_never_dominant() {
291        // With no runner-up the margin gate would pass vacuously; the sole skill
292        // must not ride the fast-path on any two-term overlap.
293        let idx = index_of(vec![entry(
294            "xlsx",
295            "edit a spreadsheet, charts and formulas",
296        )]);
297        assert!(dominant("edit the spreadsheet formulas", &idx, &cfg(0.1, 0.1)).is_none());
298    }
299
300    #[test]
301    fn dominant_off_when_min_non_positive() {
302        let idx = index_of(vec![entry("xlsx", "spreadsheets charts")]);
303        assert!(dominant("spreadsheet", &idx, &cfg(0.0, 0.5)).is_none());
304    }
305
306    #[test]
307    fn empty_prompt_or_index_is_none() {
308        let idx = index_of(vec![entry("xlsx", "spreadsheets")]);
309        assert!(scores("", &idx).is_empty());
310        assert!(scores("the an of to", &idx).is_empty()); // all stopwords
311        assert!(scores("spreadsheet", &index_of(vec![])).is_empty());
312    }
313}