Skip to main content

eval/
eval.rs

1//! In-process precision/recall harness for the injection decision.
2//!
3//! Builds the index once (one model load, unlike the per-prompt subprocess in
4//! `tests/data/run-anthropic-prompts.sh`), runs the *real* two-stage decision
5//! (stage-1 cosine, or stage-2 rerank when ambiguous) for every labelled prompt,
6//! and reports a confusion matrix: recall on positives, false-positive rate on
7//! negatives. It also reports the stage-1 retrieval ceiling — recall@`rerank_top_k`
8//! and top-1 over positives, before any rerank/threshold gating — so you can tell
9//! whether a miss is a retrieval failure (gold never reached the reranker) or a
10//! ranking failure (gold was retrieved but the gate dropped it). Per-prompt score
11//! dumps (`-v`) expose the distributions the gate is tuned against.
12//!
13//! Usage (point SKI_ROOTS at the eval index — colon-separated union is fine):
14//!   SKI_ROOTS="/var/tmp/ski-eval/.claude/skills:$HOME/.claude/plugins/marketplaces/anthropic-agent-skills" \
15//!     cargo run --example eval -- tests/data/popular_skills_prompts.tsv -v
16//!
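//! Labels: `<expected-skill-id>\t<kind>\t<prompt>`, `(none)` expects no injection.
//! Two optional columns may follow: a 4th with `|`-separated prior-turn context
//! (oldest first) and a 5th with the working directory for the project-type
//! channel. Blank lines and lines starting with `#` are skipped.
//! `borderline` rows show up in the `-v` per-prompt dump but are excluded from
//! the headline FP/recall (they are observe-only by design).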
20//!
21//! **Headline metric — `host-value`, not raw FP count.** ski's job on a strong
22//! host is to recover skills the host would otherwise *not* invoke (it hand-rolls
23//! instead). A controlled probe (`[[ski-host-recall-gap]]`) showed the host
24//! ignores false injects even when phrased firmly (3/3), so a false inject costs
25//! almost nothing, while a recall miss costs the user a worse hand-rolled
26//! artifact. Tuning to minimise FP *count* (the old objective) therefore traded
27//! away the recall that is the entire point. The `host-value` line scores
28//! `recall_rate - FP_HARM * fp_rate` with `FP_HARM` small ([`FP_HARM`]): optimise
29//! THIS, not FP count. Raw recall and FP rate are still printed for diagnosis.
30
31use ski::confidence::Stage;
32use ski::config::Config;
33use ski::embed::{self, EmbedKind};
34use ski::hook::Host;
35use ski::rank::Hit;
36use ski::{context, index, pipeline, rank, skill};
37
/// Weight of one false injection in the `host-value` headline, on a scale where
/// a recall miss costs 1.0.
///
/// The value is deliberately small: the probe showed a strong host ignoring
/// false injects even when they were phrased firmly (3/3), so their real harm
/// is close to zero. It stays above zero to keep mild pressure against flooding
/// the context with noise, and to cover weak-host setups where a false
/// directive can actually mislead. A recall miss, by contrast, is charged a full
/// unit because the user ends up with a worse hand-rolled artifact.
const FP_HARM: f32 = 0.15;
45
/// One labelled row of the eval corpus, as produced by `parse_cases`.
///
/// All text fields are stored trimmed. The derives make a parsed row printable
/// (`{:?}`), clonable and comparable, which is what ad-hoc debugging and
/// assertions over parsed corpora need.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Case {
    /// Expected skill id; the literal `"(none)"` marks a negative (no injection
    /// expected).
    want: String,
    /// Free-form row category from the 2nd column; `"borderline"` rows are
    /// observe-only and excluded from the headline metrics.
    kind: String,
    /// The user prompt being scored (3rd column). Never empty.
    prompt: String,
    /// Optional prior-turn context (oldest-first), from a 4th `|`-separated TSV
    /// column. Empty for the single-prompt corpora.
    context: Vec<String>,
    /// Optional working directory, from a 5th TSV column, exercising the ambient
    /// project-type channel (`SKI_PROJECT_BOOST`). Empty for the prompt-only corpora.
    cwd: String,
}
57
58fn parse_cases(raw: &str) -> Vec<Case> {
59    raw.lines()
60        .filter(|l| !l.trim().is_empty() && !l.trim_start().starts_with('#'))
61        .filter_map(|l| {
62            let mut it = l.splitn(5, '\t');
63            let want = it.next()?.trim().to_string();
64            let kind = it.next()?.trim().to_string();
65            let prompt = it.next()?.trim().to_string();
66            let context = it
67                .next()
68                .map(|c| {
69                    c.split('|')
70                        .map(|p| p.trim().to_string())
71                        .filter(|p| !p.is_empty())
72                        .collect()
73                })
74                .unwrap_or_default();
75            let cwd = it.next().map(|c| c.trim().to_string()).unwrap_or_default();
76            if prompt.is_empty() {
77                return None;
78            }
79            Some(Case {
80                want,
81                kind,
82                prompt,
83                context,
84                cwd,
85            })
86        })
87        .collect()
88}
89
90fn main() -> anyhow::Result<()> {
91    let args: Vec<String> = std::env::args().skip(1).collect();
92    let verbose = args.iter().any(|a| a == "-v" || a == "--verbose");
93    let path = args
94        .iter()
95        .find(|a| !a.starts_with('-'))
96        .cloned()
97        .unwrap_or_else(|| "tests/data/popular_skills_prompts.tsv".to_string());
98
99    let raw = std::fs::read_to_string(&path)?;
100    let cases = parse_cases(&raw);
101
102    let (mut cfg, file) = Config::load(Host::Claude);
103    // A/B affordance: override the phrase-channel boost (0.0 disables it) so the
104    // same corpus can be scored with and without the channel in one rebuild.
105    if let Ok(v) = std::env::var("SKI_PHRASE_BOOST") {
106        cfg.phrase_boost = v.parse().expect("SKI_PHRASE_BOOST must be a float");
107    }
108    // Context enrichment (Goal 3) is off by default; these env knobs activate and
109    // tune it for one run, mirroring SKI_PHRASE_BOOST, so the same corpus can be
110    // scored with and without conversational context.
111    if let Ok(v) = std::env::var("SKI_CONTEXT_DEPTH") {
112        cfg.context_depth = v.parse().expect("SKI_CONTEXT_DEPTH must be a usize");
113    }
114    if let Ok(v) = std::env::var("SKI_CONTEXT_WEIGHT") {
115        cfg.context_weight = v.parse().expect("SKI_CONTEXT_WEIGHT must be a float");
116    }
117    if let Ok(v) = std::env::var("SKI_VAGUE_LO") {
118        cfg.vague_lo = v.parse().expect("SKI_VAGUE_LO must be a float");
119    }
120    if let Ok(v) = std::env::var("SKI_VAGUE_HI") {
121        cfg.vague_hi = v.parse().expect("SKI_VAGUE_HI must be a float");
122    }
123    if let Ok(v) = std::env::var("SKI_FILE_BOOST") {
124        cfg.file_boost = v.parse().expect("SKI_FILE_BOOST must be a float");
125    }
126    if let Ok(v) = std::env::var("SKI_PROJECT_BOOST") {
127        cfg.project_boost = v.parse().expect("SKI_PROJECT_BOOST must be a float");
128    }
129    // Reranker-gate sweep knobs: tune the stage-2 abstention floor/margin for one
130    // run without editing config.toml (these are on the logit scale, untouched by
131    // `calibrate_to`).
132    if let Ok(v) = std::env::var("SKI_RERANK_MIN") {
133        cfg.rerank_min = v.parse().expect("SKI_RERANK_MIN must be a float");
134    }
135    if let Ok(v) = std::env::var("SKI_RERANK_MARGIN") {
136        cfg.rerank_margin = v.parse().expect("SKI_RERANK_MARGIN must be a float");
137    }
138    // Lexical fast-path (BM25 over description) sweep knobs: `lexical_min <= 0`
139    // disables it, so the same corpus can be scored with and without the channel.
140    if let Ok(v) = std::env::var("SKI_LEXICAL_MIN") {
141        cfg.lexical_min = v.parse().expect("SKI_LEXICAL_MIN must be a float");
142    }
143    if let Ok(v) = std::env::var("SKI_LEXICAL_MARGIN") {
144        cfg.lexical_margin = v.parse().expect("SKI_LEXICAL_MARGIN must be a float");
145    }
146    let skills = skill::discover(&cfg.roots)?;
147    let embedder = embed::build(&cfg.model)?;
148    cfg.calibrate_to(embedder.as_ref());
149    file.apply_cosine(&mut cfg);
150    let idx = index::build(&skills, embedder.as_ref(), None)?;
151    eprintln!(
152        "index: {} skills via {} | rerank_min {:.2} margin {:.2} | min_sim {:.2} | lexical_min {:.2} margin {:.2}",
153        idx.skills.len(),
154        idx.model,
155        cfg.rerank_min,
156        cfg.rerank_margin,
157        cfg.min_similarity,
158        cfg.lexical_min,
159        cfg.lexical_margin,
160    );
161
162    // Confusion counters. `borderline` rows are tallied separately (observe-only).
163    let (mut tp, mut fn_, mut fp, mut tn) = (0u32, 0u32, 0u32, 0u32);
164    let (mut n_pos, mut n_neg) = (0u32, 0u32);
165    let mut fp_rows: Vec<String> = Vec::new();
166    let mut fn_rows: Vec<String> = Vec::new();
167    // Stage-1 retrieval ceiling (pre-rerank), over positives only: recall@k is the
168    // fraction whose gold skill survives into the top-`rerank_top_k` candidates the
169    // reranker is fed (`rerank::rerank` takes exactly that many); top-1 is the
170    // fraction already ranked first by hybrid score. recall@k ~100% means retrieval
171    // is not the bottleneck and the problem is ranking within the retrieved set.
172    let (mut recall_at_k, mut stage1_top1) = (0u32, 0u32);
173    let mut recall_miss_rows: Vec<String> = Vec::new();
174
175    for c in &cases {
176        let query = embedder
177            .embed(std::slice::from_ref(&c.prompt), EmbedKind::Query)?
178            .remove(0);
179        let cvec = context::vector(embedder.as_ref(), &c.context, &cfg)?;
180        // File-type channel: scan this turn's prompt AND its prior context for named
181        // files (a `.xlsx` etc.), mapping each to its skill.
182        let file_text = format!("{} {}", c.context.join(" "), c.prompt);
183        let file_ids = context::file_ids(&file_text);
184        // Ambient project-type channel: the case's cwd (5th column) yields
185        // ecosystem terms (plus any code file named in the conversation), resolved
186        // against the installed index. Empty when the channel is off.
187        let project_ids = if cfg.project_boost > 0.0 {
188            let mut terms = context::project_terms(&c.cwd);
189            terms.extend(context::code_terms(&file_text));
190            context::skills_for_terms(&terms, &idx)
191                .into_keys()
192                .collect()
193        } else {
194            std::collections::BTreeSet::new()
195        };
196        let hits = rank::rank_all_ctx(
197            &query,
198            cvec.as_deref(),
199            &file_ids,
200            &project_ids,
201            &c.prompt,
202            &idx,
203            &cfg,
204        );
205        // The reranker reads text: enrich its query with the recent window when the
206        // prompt is vague (same gate that lets the context vector contribute).
207        let prompt_top = hits.iter().map(|h| h.cosine).fold(0.0_f32, f32::max);
208        let rerank_query = context::rerank_query(
209            &c.prompt,
210            prompt_top,
211            &c.context,
212            !file_ids.is_empty(),
213            &cfg,
214        );
215        let plan = pipeline::decide(&hits, &idx, &c.prompt, &rerank_query, &cfg);
216        let stage = match plan.stage {
217            Stage::Lexical => "lexical",
218            Stage::Rerank => "rerank",
219            Stage::Cosine => "stage1",
220        };
221        // Caller-side guardrails: the hook's `finalize` minus session dedup (the eval
222        // has no session) — drop denied skills, cap at `max_skills`.
223        let injected: Vec<Hit> = plan
224            .passed
225            .into_iter()
226            .filter(|h| !cfg.deny.contains(&h.id))
227            .take(cfg.max_skills)
228            .collect();
229        let ids: Vec<String> = injected.iter().map(|h| h.id.clone()).collect();
230        let is_neg = c.want == "(none)";
231        let observe_only = c.kind == "borderline";
232
233        if verbose {
234            let top: Vec<String> = hits
235                .iter()
236                .take(4)
237                .map(|h| format!("{}={:.3}", h.id, h.score))
238                .collect();
239            let inj: Vec<String> = injected
240                .iter()
241                .map(|h| {
242                    format!(
243                        "{}=L{:.2}/cos{:.3}+ctx{:.2}+file{:.2}+proj{:.2}+kw{:.2}+ph{:.2}",
244                        h.id, h.score, h.cosine, h.context, h.file, h.project, h.keyword, h.phrase
245                    )
246                })
247                .collect();
248            eprintln!(
249                "[{:<10}] {:<7} inject=[{}]  top: {}  :: {}",
250                c.kind,
251                stage,
252                inj.join(", "),
253                top.join(", "),
254                c.prompt,
255            );
256        }
257
258        if observe_only {
259            continue;
260        }
261        if is_neg {
262            n_neg += 1;
263            if injected.is_empty() {
264                tn += 1;
265            } else {
266                fp += 1;
267                fp_rows.push(format!(
268                    "  FP [{:<10}] inject=[{}] :: {}",
269                    c.kind,
270                    ids.join(", "),
271                    c.prompt
272                ));
273            }
274        } else {
275            n_pos += 1;
276            // Stage-1 ceiling: where does the gold skill land in the full hybrid
277            // ranking, before any rerank/threshold gating?
278            let rank = hits.iter().position(|h| h.id == c.want);
279            if rank == Some(0) {
280                stage1_top1 += 1;
281            }
282            if rank.is_some_and(|r| r < cfg.rerank_top_k) {
283                recall_at_k += 1;
284            } else {
285                recall_miss_rows.push(format!(
286                    "  R@k MISS [{:<10}] want={} stage-1 rank={} :: {}",
287                    c.kind,
288                    c.want,
289                    rank.map_or_else(|| "absent".to_string(), |r| r.to_string()),
290                    c.prompt
291                ));
292            }
293            if ids.iter().any(|id| id == &c.want) {
294                tp += 1;
295            } else {
296                fn_ += 1;
297                fn_rows.push(format!(
298                    "  FN [{:<10}] want={} got=[{}] :: {}",
299                    c.kind,
300                    c.want,
301                    ids.join(", "),
302                    c.prompt
303                ));
304            }
305        }
306    }
307
308    println!("\n=== eval: {} ===", path);
309    println!(
310        "positives {n_pos}: recall {tp}/{n_pos} ({:.0}%)   misses {fn_}",
311        pct(tp, n_pos)
312    );
313    println!(
314        "negatives {n_neg}: false-inject {fp}/{n_neg} ({:.0}%)   clean {tn}",
315        pct(fp, n_neg)
316    );
317    // Headline: recall recovered, net of discounted FP harm. Optimise this — not
318    // FP count — because a strong host filters false injects (see module docs).
319    let recall_rate = if n_pos == 0 {
320        0.0
321    } else {
322        tp as f32 / n_pos as f32
323    };
324    let fp_rate = if n_neg == 0 {
325        0.0
326    } else {
327        fp as f32 / n_neg as f32
328    };
329    println!(
330        "host-value {:.0}%  (= recall {:.0}% - {FP_HARM} * fp {:.0}%; FP discounted: a strong host ignores false injects)",
331        100.0 * (recall_rate - FP_HARM * fp_rate),
332        100.0 * recall_rate,
333        100.0 * fp_rate,
334    );
335    println!(
336        "stage-1 (pre-rerank, k={}): recall@k {recall_at_k}/{n_pos} ({:.0}%)   top-1 {stage1_top1}/{n_pos} ({:.0}%)",
337        cfg.rerank_top_k,
338        pct(recall_at_k, n_pos),
339        pct(stage1_top1, n_pos),
340    );
341    if !recall_miss_rows.is_empty() {
342        println!(
343            "--- stage-1 recall@k misses (gold below top-{}) ---",
344            cfg.rerank_top_k
345        );
346        recall_miss_rows.iter().for_each(|r| println!("{r}"));
347    }
348    if !fn_rows.is_empty() {
349        println!("--- recall misses ---");
350        fn_rows.iter().for_each(|r| println!("{r}"));
351    }
352    if !fp_rows.is_empty() {
353        println!("--- false injections ---");
354        fp_rows.iter().for_each(|r| println!("{r}"));
355    }
356    Ok(())
357}
358
/// Expresses `n` out of `d` as a percentage in `0.0..=100.0` terms.
///
/// A zero denominator yields `0.0` rather than dividing, so an empty class
/// prints as 0% instead of NaN.
fn pct(n: u32, d: u32) -> f32 {
    match d {
        0 => 0.0,
        _ => 100.0 * n as f32 / d as f32,
    }
}