pub fn project_terms(cwd: &str) -> Vec<String>
Ecosystem terms implied by the project manifest(s) found in `cwd` or any
ancestor directory (up to [`PROJECT_WALK_LEVELS`]). Performs only cheap
`exists()` stats. The result is order-preserving and de-duplicated (most
specific term first), and is empty when `cwd` is empty or no known manifest
is found. Resolve the returned terms against the installed library with
`skills_for_terms`.
Examples found in repository?
examples/eval.rs (line 188)
90fn main() -> anyhow::Result<()> {
91 let args: Vec<String> = std::env::args().skip(1).collect();
92 let verbose = args.iter().any(|a| a == "-v" || a == "--verbose");
93 let path = args
94 .iter()
95 .find(|a| !a.starts_with('-'))
96 .cloned()
97 .unwrap_or_else(|| "tests/data/popular_skills_prompts.tsv".to_string());
98
99 let raw = std::fs::read_to_string(&path)?;
100 let cases = parse_cases(&raw);
101
102 let (mut cfg, file) = Config::load(Host::Claude);
103 // A/B affordance: override the phrase-channel boost (0.0 disables it) so the
104 // same corpus can be scored with and without the channel in one rebuild.
105 if let Ok(v) = std::env::var("SKI_PHRASE_BOOST") {
106 cfg.phrase_boost = v.parse().expect("SKI_PHRASE_BOOST must be a float");
107 }
108 // Context enrichment (Goal 3) is off by default; these env knobs activate and
109 // tune it for one run, mirroring SKI_PHRASE_BOOST, so the same corpus can be
110 // scored with and without conversational context.
111 if let Ok(v) = std::env::var("SKI_CONTEXT_DEPTH") {
112 cfg.context_depth = v.parse().expect("SKI_CONTEXT_DEPTH must be a usize");
113 }
114 if let Ok(v) = std::env::var("SKI_CONTEXT_WEIGHT") {
115 cfg.context_weight = v.parse().expect("SKI_CONTEXT_WEIGHT must be a float");
116 }
117 if let Ok(v) = std::env::var("SKI_VAGUE_LO") {
118 cfg.vague_lo = v.parse().expect("SKI_VAGUE_LO must be a float");
119 }
120 if let Ok(v) = std::env::var("SKI_VAGUE_HI") {
121 cfg.vague_hi = v.parse().expect("SKI_VAGUE_HI must be a float");
122 }
123 if let Ok(v) = std::env::var("SKI_FILE_BOOST") {
124 cfg.file_boost = v.parse().expect("SKI_FILE_BOOST must be a float");
125 }
126 if let Ok(v) = std::env::var("SKI_PROJECT_BOOST") {
127 cfg.project_boost = v.parse().expect("SKI_PROJECT_BOOST must be a float");
128 }
129 // Reranker-gate sweep knobs: tune the stage-2 abstention floor/margin for one
130 // run without editing config.toml (these are on the logit scale, untouched by
131 // `calibrate_to`).
132 if let Ok(v) = std::env::var("SKI_RERANK_MIN") {
133 cfg.rerank_min = v.parse().expect("SKI_RERANK_MIN must be a float");
134 }
135 if let Ok(v) = std::env::var("SKI_RERANK_MARGIN") {
136 cfg.rerank_margin = v.parse().expect("SKI_RERANK_MARGIN must be a float");
137 }
138 // Lexical fast-path (BM25 over description) sweep knobs: `lexical_min <= 0`
139 // disables it, so the same corpus can be scored with and without the channel.
140 if let Ok(v) = std::env::var("SKI_LEXICAL_MIN") {
141 cfg.lexical_min = v.parse().expect("SKI_LEXICAL_MIN must be a float");
142 }
143 if let Ok(v) = std::env::var("SKI_LEXICAL_MARGIN") {
144 cfg.lexical_margin = v.parse().expect("SKI_LEXICAL_MARGIN must be a float");
145 }
146 let skills = skill::discover(&cfg.roots)?;
147 let embedder = embed::build(&cfg.model)?;
148 cfg.calibrate_to(embedder.as_ref());
149 file.apply_cosine(&mut cfg);
150 let idx = index::build(&skills, embedder.as_ref(), None)?;
151 eprintln!(
152 "index: {} skills via {} | rerank_min {:.2} margin {:.2} | min_sim {:.2} | lexical_min {:.2} margin {:.2}",
153 idx.skills.len(),
154 idx.model,
155 cfg.rerank_min,
156 cfg.rerank_margin,
157 cfg.min_similarity,
158 cfg.lexical_min,
159 cfg.lexical_margin,
160 );
161
162 // Confusion counters. `borderline` rows are tallied separately (observe-only).
163 let (mut tp, mut fn_, mut fp, mut tn) = (0u32, 0u32, 0u32, 0u32);
164 let (mut n_pos, mut n_neg) = (0u32, 0u32);
165 let mut fp_rows: Vec<String> = Vec::new();
166 let mut fn_rows: Vec<String> = Vec::new();
167 // Stage-1 retrieval ceiling (pre-rerank), over positives only: recall@k is the
168 // fraction whose gold skill survives into the top-`rerank_top_k` candidates the
169 // reranker is fed (`rerank::rerank` takes exactly that many); top-1 is the
170 // fraction already ranked first by hybrid score. recall@k ~100% means retrieval
171 // is not the bottleneck and the problem is ranking within the retrieved set.
172 let (mut recall_at_k, mut stage1_top1) = (0u32, 0u32);
173 let mut recall_miss_rows: Vec<String> = Vec::new();
174
175 for c in &cases {
176 let query = embedder
177 .embed(std::slice::from_ref(&c.prompt), EmbedKind::Query)?
178 .remove(0);
179 let cvec = context::vector(embedder.as_ref(), &c.context, &cfg)?;
180 // File-type channel: scan this turn's prompt AND its prior context for named
181 // files (a `.xlsx` etc.), mapping each to its skill.
182 let file_text = format!("{} {}", c.context.join(" "), c.prompt);
183 let file_ids = context::file_ids(&file_text);
184 // Ambient project-type channel: the case's cwd (5th column) yields
185 // ecosystem terms (plus any code file named in the conversation), resolved
186 // against the installed index. Empty when the channel is off.
187 let project_ids = if cfg.project_boost > 0.0 {
188 let mut terms = context::project_terms(&c.cwd);
189 terms.extend(context::code_terms(&file_text));
190 context::skills_for_terms(&terms, &idx)
191 .into_keys()
192 .collect()
193 } else {
194 std::collections::BTreeSet::new()
195 };
196 let hits = rank::rank_all_ctx(
197 &query,
198 cvec.as_deref(),
199 &file_ids,
200 &project_ids,
201 &c.prompt,
202 &idx,
203 &cfg,
204 );
205 // The reranker reads text: enrich its query with the recent window when the
206 // prompt is vague (same gate that lets the context vector contribute).
207 let prompt_top = hits.iter().map(|h| h.cosine).fold(0.0_f32, f32::max);
208 let rerank_query = context::rerank_query(
209 &c.prompt,
210 prompt_top,
211 &c.context,
212 !file_ids.is_empty(),
213 &cfg,
214 );
215 let plan = pipeline::decide(&hits, &idx, &c.prompt, &rerank_query, &cfg);
216 let stage = match plan.stage {
217 Stage::Lexical => "lexical",
218 Stage::Rerank => "rerank",
219 Stage::Cosine => "stage1",
220 };
221 // Caller-side guardrails: the hook's `finalize` minus session dedup (the eval
222 // has no session) — drop denied skills, cap at `max_skills`.
223 let injected: Vec<Hit> = plan
224 .passed
225 .into_iter()
226 .filter(|h| !cfg.deny.contains(&h.id))
227 .take(cfg.max_skills)
228 .collect();
229 let ids: Vec<String> = injected.iter().map(|h| h.id.clone()).collect();
230 let is_neg = c.want == "(none)";
231 let observe_only = c.kind == "borderline";
232
233 if verbose {
234 let top: Vec<String> = hits
235 .iter()
236 .take(4)
237 .map(|h| format!("{}={:.3}", h.id, h.score))
238 .collect();
239 let inj: Vec<String> = injected
240 .iter()
241 .map(|h| {
242 format!(
243 "{}=L{:.2}/cos{:.3}+ctx{:.2}+file{:.2}+proj{:.2}+kw{:.2}+ph{:.2}",
244 h.id, h.score, h.cosine, h.context, h.file, h.project, h.keyword, h.phrase
245 )
246 })
247 .collect();
248 eprintln!(
249 "[{:<10}] {:<7} inject=[{}] top: {} :: {}",
250 c.kind,
251 stage,
252 inj.join(", "),
253 top.join(", "),
254 c.prompt,
255 );
256 }
257
258 if observe_only {
259 continue;
260 }
261 if is_neg {
262 n_neg += 1;
263 if injected.is_empty() {
264 tn += 1;
265 } else {
266 fp += 1;
267 fp_rows.push(format!(
268 " FP [{:<10}] inject=[{}] :: {}",
269 c.kind,
270 ids.join(", "),
271 c.prompt
272 ));
273 }
274 } else {
275 n_pos += 1;
276 // Stage-1 ceiling: where does the gold skill land in the full hybrid
277 // ranking, before any rerank/threshold gating?
278 let rank = hits.iter().position(|h| h.id == c.want);
279 if rank == Some(0) {
280 stage1_top1 += 1;
281 }
282 if rank.is_some_and(|r| r < cfg.rerank_top_k) {
283 recall_at_k += 1;
284 } else {
285 recall_miss_rows.push(format!(
286 " R@k MISS [{:<10}] want={} stage-1 rank={} :: {}",
287 c.kind,
288 c.want,
289 rank.map_or_else(|| "absent".to_string(), |r| r.to_string()),
290 c.prompt
291 ));
292 }
293 if ids.iter().any(|id| id == &c.want) {
294 tp += 1;
295 } else {
296 fn_ += 1;
297 fn_rows.push(format!(
298 " FN [{:<10}] want={} got=[{}] :: {}",
299 c.kind,
300 c.want,
301 ids.join(", "),
302 c.prompt
303 ));
304 }
305 }
306 }
307
308 println!("\n=== eval: {} ===", path);
309 println!(
310 "positives {n_pos}: recall {tp}/{n_pos} ({:.0}%) misses {fn_}",
311 pct(tp, n_pos)
312 );
313 println!(
314 "negatives {n_neg}: false-inject {fp}/{n_neg} ({:.0}%) clean {tn}",
315 pct(fp, n_neg)
316 );
317 // Headline: recall recovered, net of discounted FP harm. Optimise this — not
318 // FP count — because a strong host filters false injects (see module docs).
319 let recall_rate = if n_pos == 0 {
320 0.0
321 } else {
322 tp as f32 / n_pos as f32
323 };
324 let fp_rate = if n_neg == 0 {
325 0.0
326 } else {
327 fp as f32 / n_neg as f32
328 };
329 println!(
330 "host-value {:.0}% (= recall {:.0}% - {FP_HARM} * fp {:.0}%; FP discounted: a strong host ignores false injects)",
331 100.0 * (recall_rate - FP_HARM * fp_rate),
332 100.0 * recall_rate,
333 100.0 * fp_rate,
334 );
335 println!(
336 "stage-1 (pre-rerank, k={}): recall@k {recall_at_k}/{n_pos} ({:.0}%) top-1 {stage1_top1}/{n_pos} ({:.0}%)",
337 cfg.rerank_top_k,
338 pct(recall_at_k, n_pos),
339 pct(stage1_top1, n_pos),
340 );
341 if !recall_miss_rows.is_empty() {
342 println!(
343 "--- stage-1 recall@k misses (gold below top-{}) ---",
344 cfg.rerank_top_k
345 );
346 recall_miss_rows.iter().for_each(|r| println!("{r}"));
347 }
348 if !fn_rows.is_empty() {
349 println!("--- recall misses ---");
350 fn_rows.iter().for_each(|r| println!("{r}"));
351 }
352 if !fp_rows.is_empty() {
353 println!("--- false injections ---");
354 fp_rows.iter().for_each(|r| println!("{r}"));
355 }
356 Ok(())
357}