ski/context.rs
1//! Conversational-context query enrichment (the query side of retrieval).
2//!
3//! A vague follow-up prompt ("now do the other one", "fix that") carries little
4//! signal on its own, so the bi-encoder retrieves poorly and the cross-encoder
5//! has nothing to disambiguate against. The turns *before* it usually do carry
6//! the intent. This module turns a session's recent-prompt window into two
7//! enrichment signals, both gated on how vague the current prompt is
8//! ([`crate::rank::context_weight`]):
9//!
10//! - a **context vector** ([`vector`]) blended into stage-1 scoring
11//! ([`crate::rank::rank_all_ctx`]), and
12//! - an enriched **reranker query** ([`rerank_query`]) — the cross-encoder reads
13//! text, not vectors, so the recent window is prepended to the prompt.
14//!
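//! Both are inert while `context_depth == 0`, so the default path pays nothing.
//! The context vector additionally requires `context_weight > 0.0`; the reranker
//! query is enriched either for a vague prompt or, independently of
//! `context_weight`, for a referenced file while `file_boost > 0.0`.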
17
18use crate::config::Config;
19use crate::embed::{EmbedKind, Embedder};
20use crate::index::Index;
21use crate::text::{match_tokens, norm_token, tokenize};
22use std::collections::{BTreeMap, BTreeSet};
23use std::path::Path;
24
/// Map a file extension to the id of the single skill it identifies, for the
/// file-type context channel.
///
/// Only extensions whose document *kind* corresponds 1:1 to a skill are listed: a
/// `.xlsx`/`.ods`/`.numbers` is a spreadsheet task, a `.key`/`.odp` is a deck. That
/// keeps the boost high-precision and routes by intent even for formats a skill's
/// own tooling converts rather than opens natively.
///
/// Deliberately absent:
/// - generic code extensions (`.rs`, `.py`, ...), which map to no single skill;
/// - images (`.png`/`.jpg`/`.gif`) and notebooks (`.ipynb`), because no installed
///   skill is their unambiguous identity (image skills are intent-specific and
///   there is no notebook skill), so mapping them would buy recall with
///   false-injects.
///
/// The comparison is exact, so `ext` must already be lowercase ([`file_tokens`]
/// lowercases what it yields). Returns `None` for every unlisted extension.
fn ext_skill(ext: &str) -> Option<&'static str> {
    let skill = match ext {
        "pdf" => "pdf",
        // Spreadsheets: what the xlsx skill reads directly (csv/tsv) plus the
        // OpenDocument / iWork / legacy equivalents.
        "xlsx" | "xls" | "xlsm" | "csv" | "tsv" | "ods" | "numbers" => "xlsx",
        // Word-processor documents.
        "docx" | "doc" | "rtf" | "odt" | "pages" => "docx",
        // Presentations.
        "pptx" | "ppt" | "odp" | "key" => "pptx",
        _ => return None,
    };
    Some(skill)
}
48
/// Iterate the filename-shaped tokens of `text`, yielding `(stem, extension)` for
/// each token carrying a `.<ext>` suffix. Shared by the file-type channel
/// ([`file_ids`]) and the code-file ecosystem scan ([`code_terms`]).
///
/// Tokens are split on whitespace and on the delimiters that commonly wrap a
/// filename in prose: straight quotes, parentheses, backticks and commas. Each
/// token then has trailing sentence punctuation *and* trailing closers (`]`, `}`,
/// `>`, markdown `*`, typographic closing quotes) trimmed, so `[report.pdf]`,
/// `<data.csv>.` and `**main.rs**` are recognised just like a bare `report.pdf`.
/// None of the trimmed characters can end a real extension, so trimming only
/// ever turns a previously unmatched token into a match.
///
/// The extension is everything after the token's last `.`, ASCII-lowercased. The
/// stem is everything before it, returned as-is (it may still carry a leading
/// opener such as `[`). Tokens with no `.` or an empty stem are skipped.
fn file_tokens(text: &str) -> impl Iterator<Item = (&str, String)> {
    text.split(|c: char| c.is_whitespace() || matches!(c, '"' | '\'' | '(' | ')' | '`' | ','))
        .filter_map(|tok| {
            // Strip trailing punctuation and closing brackets/emphasis/curly quotes
            // that hug a filename in prose or markdown.
            let tok = tok.trim_end_matches([
                '.', ':', ';', '!', '?', ']', '}', '>', '*', '\u{201D}', '\u{2019}',
            ]);
            let (stem, ext) = tok.rsplit_once('.')?;
            if stem.is_empty() {
                return None; // a bare ".pdf" with no name is not a real reference.
            }
            Some((stem, ext.to_ascii_lowercase()))
        })
}
64
65/// Skill ids implied by file references in `text` (a prompt and/or recent-window
66/// turns): scans whitespace-separated tokens for a trailing `.<ext>` and maps each
67/// known extension through [`ext_skill`]. This is the *directly attributable*
68/// context signal — a file attached or named **now** is unambiguous in a way a
69/// vague prompt is not, so unlike the dense context vector it is not gated on
70/// prompt vagueness. De-duplicated.
71pub fn file_ids(text: &str) -> BTreeSet<String> {
72 let mut out = BTreeSet::new();
73 for (_, ext) in file_tokens(text) {
74 if let Some(id) = ext_skill(&ext) {
75 out.insert(id.to_string());
76 }
77 }
78 out
79}
80
/// Known project-manifest filenames and the ecosystem terms each implies, for the
/// project-type context channel. Unlike the file channel's 1:1 skill-id map, these
/// are *terms* matched dynamically against whatever library the user actually has
/// installed ([`skills_for_terms`]) — a `uv.lock` surfaces *their* uv skill
/// whatever it is named, and an unmatched term simply maps to nothing. That is why
/// multi-skill ecosystems (python, the JS frameworks) can be listed here where the
/// old hardcoded-id map had to leave them out: every plausibly-matching skill gets
/// the (cosine-gated, deliberately recall-leaning) boost and the model arbitrates.
///
/// Within each entry the terms are ordered most-specific-first; [`push_term`]
/// preserves that order for first-match-wins evidence attribution.
///
/// All four Compose filenames are listed (`compose.yaml`, `compose.yml`,
/// `docker-compose.yaml`, `docker-compose.yml`) so a project is recognised
/// whichever spelling it uses.
const MANIFEST_TERMS: &[(&str, &[&str])] = &[
    ("Cargo.toml", &["rust", "cargo"]),
    ("go.mod", &["go", "golang"]),
    ("uv.lock", &["uv", "python"]),
    ("pyproject.toml", &["python"]),
    ("requirements.txt", &["python", "pip"]),
    ("setup.py", &["python"]),
    ("Pipfile", &["python"]),
    ("package.json", &["javascript", "node", "npm"]),
    ("tsconfig.json", &["typescript"]),
    ("Gemfile", &["ruby"]),
    ("pom.xml", &["java", "maven"]),
    ("build.gradle", &["java", "gradle"]),
    ("build.gradle.kts", &["kotlin", "gradle"]),
    ("Dockerfile", &["docker"]),
    ("docker-compose.yml", &["docker"]),
    ("docker-compose.yaml", &["docker"]),
    ("compose.yaml", &["docker"]),
    ("compose.yml", &["docker"]),
    ("flake.nix", &["nix"]),
    ("CMakeLists.txt", &["cmake"]),
];
109
/// Look up the ecosystem terms implied by a *code* file's extension.
///
/// These feed the same ambient project channel as the manifests, covering a
/// session that works outside the project root (so the cwd walk finds nothing)
/// but names `scripts/etl.py` in the prompt, as well as attached code files. Only
/// unambiguous language identities are listed; document formats stay with the
/// higher-precision, ungated [`ext_skill`] channel.
///
/// The comparison is exact, so `ext` must already be lowercase. Terms are ordered
/// most-specific-first. Returns `None` for every unlisted extension.
fn ext_terms(ext: &str) -> Option<&'static [&'static str]> {
    let terms: &'static [&'static str] = match ext {
        "py" => &["python"],
        "ipynb" => &["python", "jupyter", "notebook"],
        "rs" => &["rust"],
        "go" => &["go", "golang"],
        "ts" | "tsx" => &["typescript"],
        "js" | "jsx" | "mjs" => &["javascript", "node"],
        "rb" => &["ruby"],
        "java" => &["java"],
        "kt" => &["kotlin"],
        "tf" => &["terraform"],
        "sql" => &["sql"],
        "sh" | "bash" => &["shell", "bash"],
        _ => return None,
    };
    Some(terms)
}
132
/// Maximum number of directories [`project_terms`] inspects, counting `cwd`
/// itself and then its ancestors. A session's cwd is often a subdirectory of the
/// project root where the manifest lives, so the walk ascends a few levels; the
/// cap keeps a deeply-nested cwd from stat-ing all the way to the filesystem root.
const PROJECT_WALK_LEVELS: usize = 6;
138
/// Order-preserving de-dup insert: append `term` to `out` unless an equal entry is
/// already there.
///
/// Keeping first-seen order means a term list retains the most-specific-first
/// ordering that [`MANIFEST_TERMS`] / [`ext_terms`] list terms in, which
/// [`skills_for_terms`] relies on for first-match-wins evidence attribution: a
/// uv.lock reports "a uv project", not "a python project".
fn push_term(out: &mut Vec<String>, term: &str) {
    let already_listed = out.iter().any(|seen| seen.as_str() == term);
    if already_listed {
        return;
    }
    out.push(term.to_owned());
}
148
149/// Ecosystem terms implied by the project manifest(s) found in `cwd` or any
150/// ancestor directory (up to [`PROJECT_WALK_LEVELS`]). Performs cheap `exists()`
151/// stats only; order-preserving and de-duplicated (most specific term first);
152/// empty when `cwd` is empty or no known manifest is found. Resolve against the
153/// installed library with [`skills_for_terms`].
154pub fn project_terms(cwd: &str) -> Vec<String> {
155 let mut out = Vec::new();
156 if cwd.is_empty() {
157 return out;
158 }
159 let mut dir = Some(Path::new(cwd));
160 for _ in 0..PROJECT_WALK_LEVELS {
161 let Some(d) = dir else { break };
162 for (manifest, terms) in MANIFEST_TERMS {
163 if d.join(manifest).exists() {
164 for t in terms.iter() {
165 push_term(&mut out, t);
166 }
167 }
168 }
169 dir = d.parent();
170 }
171 out
172}
173
174/// Ecosystem terms implied by code files referenced in `text` (a prompt and/or
175/// recent-window turns), via [`ext_terms`]. Order-preserving, de-duplicated.
176pub fn code_terms(text: &str) -> Vec<String> {
177 let mut out = Vec::new();
178 for (_, ext) in file_tokens(text) {
179 if let Some(terms) = ext_terms(&ext) {
180 for t in terms.iter() {
181 push_term(&mut out, t);
182 }
183 }
184 }
185 out
186}
187
188/// Resolve ecosystem `terms` against the installed library: every index entry
189/// whose keywords (which include its name tokens) or description mention a term
190/// maps to that term. Returns skill id → the matched term (for evidence display;
191/// the first matching term in `terms` order wins, so callers should pass the
192/// most specific term first). Matching is token-exact after [`norm_token`]
193/// normalization — "uv" matches a `uv` keyword or "uv" in the description prose,
194/// never a substring — and deliberately generous beyond that: this feeds the
195/// *ambient* channel, which stays cosine-gated in [`crate::rank::rank_all_ctx`],
196/// so a spurious term match costs nothing unless the skill was already
197/// near-plausible for the prompt.
198pub fn skills_for_terms(terms: &[String], idx: &Index) -> BTreeMap<String, String> {
199 let mut out = BTreeMap::new();
200 if terms.is_empty() {
201 return out;
202 }
203 let terms: Vec<String> = terms.iter().map(|t| norm_token(t)).collect();
204 for e in &idx.skills {
205 let mut toks: BTreeSet<String> = e
206 .keywords
207 .iter()
208 .flat_map(|k| tokenize(k))
209 .map(|t| norm_token(&t))
210 .collect();
211 toks.extend(match_tokens(&e.description));
212 if let Some(term) = terms.iter().find(|t| toks.contains(*t)) {
213 out.insert(e.id.clone(), term.clone());
214 }
215 }
216 out
217}
218
219/// The most-recent `depth` prompts of `recent` (which is oldest-first), or all of
220/// them when fewer. Empty when the window is disabled (`context_depth == 0`) or
221/// there is nothing to use. Gated on depth alone — the window is shared by the
222/// dense-vector and file channels, so it must stay available even when the dense
223/// blend (`context_weight`) is off.
224fn window<'a>(recent: &'a [String], cfg: &Config) -> &'a [String] {
225 if cfg.context_depth == 0 || recent.is_empty() {
226 return &[];
227 }
228 let take = recent.len().min(cfg.context_depth);
229 &recent[recent.len() - take..]
230}
231
232/// Build a single context vector from the recent-prompt window: a recency-weighted
233/// mean of the per-prompt embeddings (geometric decay, most-recent weight 1.0).
234/// `None` when the feature is off or the window is empty. Embeds the whole window
235/// in one batch. The result need not be normalized — [`crate::rank::cosine`]
236/// normalizes both operands.
237pub fn vector(
238 embedder: &dyn Embedder,
239 recent: &[String],
240 cfg: &Config,
241) -> anyhow::Result<Option<Vec<f32>>> {
242 if cfg.context_weight <= 0.0 {
243 return Ok(None); // dense blend off (the window may still serve other channels)
244 }
245 let win = window(recent, cfg);
246 if win.is_empty() {
247 return Ok(None);
248 }
249 let embs = embedder.embed(win, EmbedKind::Query)?;
250 let Some(dim) = embs.first().map(|e| e.len()) else {
251 return Ok(None);
252 };
253 let n = embs.len();
254 let mut acc = vec![0.0f32; dim];
255 let mut wsum = 0.0f32;
256 for (i, e) in embs.iter().enumerate() {
257 // `recent`/`win` are oldest-first, so the last entry is the most recent and
258 // earns weight 1.0; each older turn is halved.
259 let w = 0.5f32.powi((n - 1 - i) as i32);
260 wsum += w;
261 for (a, x) in acc.iter_mut().zip(e) {
262 *a += w * x;
263 }
264 }
265 if wsum > 0.0 {
266 for a in acc.iter_mut() {
267 *a /= wsum;
268 }
269 }
270 Ok(Some(acc))
271}
272
273/// The reranker query for a prompt whose best stage-1 self-cosine is `prompt_top`.
274/// The recent-window text is prepended (so the cross-encoder reads the
275/// conversation, including any named file) when context applies this turn — either
276/// the prompt is vague enough that [`crate::rank::context_weight`] is positive, or
277/// a file was referenced (`file_present`) and the file channel is on. Otherwise the
278/// bare prompt is returned unchanged, so a confident, file-free prompt is never
279/// muddied by stale context.
280pub fn rerank_query(
281 prompt: &str,
282 prompt_top: f32,
283 recent: &[String],
284 file_present: bool,
285 cfg: &Config,
286) -> String {
287 let win = window(recent, cfg);
288 if win.is_empty() {
289 return prompt.to_string();
290 }
291 let by_vagueness = crate::rank::context_weight(prompt_top, cfg) > 0.0;
292 let by_file = file_present && cfg.file_boost > 0.0;
293 if !(by_vagueness || by_file) {
294 return prompt.to_string();
295 }
296 format!("{}\n{}", win.join("\n"), prompt)
297}
298
#[cfg(test)]
mod tests {
    //! Unit tests for the context channels. The filesystem tests keep every path
    //! they probe at least [`PROJECT_WALK_LEVELS`] directories away from anything
    //! they do not own, so the upward manifest walk can never observe a stray
    //! manifest in `/tmp` or `/` (e.g. a `/Dockerfile` inside a container).

    use super::*;
    use crate::embed::bow::BowEmbedder;

    /// A config with the context feature switched on (depth 2, weight 0.3).
    fn on() -> Config {
        Config {
            context_depth: 2,
            context_weight: 0.3,
            vague_lo: 0.55,
            vague_hi: 0.65,
            ..Default::default()
        }
    }

    #[test]
    fn vector_none_when_disabled_or_empty() {
        let e = BowEmbedder::new();
        // Feature off.
        let off = Config::default();
        assert!(vector(&e, &["a".into()], &off).unwrap().is_none());
        // On, but no prompts.
        assert!(vector(&e, &[], &on()).unwrap().is_none());
    }

    #[test]
    fn vector_built_when_enabled() {
        let e = BowEmbedder::new();
        let v = vector(
            &e,
            &["set up pytest".into(), "now the other one".into()],
            &on(),
        )
        .unwrap()
        .expect("a vector");
        assert!(!v.is_empty());
    }

    #[test]
    fn vector_respects_depth() {
        // Depth 1 must embed only the most recent prompt: the result equals the
        // single-prompt embedding of "b", not a mix with "a".
        let e = BowEmbedder::new();
        let cfg = Config {
            context_depth: 1,
            ..on()
        };
        let got = vector(&e, &["a a a".into(), "b b b".into()], &cfg)
            .unwrap()
            .unwrap();
        let want = e
            .embed(&["b b b".into()], EmbedKind::Query)
            .unwrap()
            .remove(0);
        assert_eq!(got.len(), want.len());
        for (g, w) in got.iter().zip(&want) {
            assert!((g - w).abs() < 1e-6);
        }
    }

    #[test]
    fn rerank_query_enriches_only_when_vague() {
        let recent = vec!["set up pytest".to_string()];
        let cfg = on();
        // Vague prompt (low self-cosine), no file -> prepend context.
        let vague = rerank_query("now the other one", 0.50, &recent, false, &cfg);
        assert_eq!(vague, "set up pytest\nnow the other one");
        // Confident prompt (high self-cosine), no file -> bare prompt.
        let confident = rerank_query("now the other one", 0.90, &recent, false, &cfg);
        assert_eq!(confident, "now the other one");
    }

    #[test]
    fn rerank_query_enriches_for_file_even_when_confident() {
        // A named file justifies enrichment regardless of prompt vagueness, as long
        // as the file channel is on and the window exists.
        let recent = vec!["attached sales.xlsx".to_string()];
        let cfg = Config {
            file_boost: 0.2,
            ..on()
        };
        let got = rerank_query("clean it up", 0.90, &recent, true, &cfg);
        assert_eq!(got, "attached sales.xlsx\nclean it up");
        // File channel off -> no enrichment from the file signal.
        let off_file = Config {
            file_boost: 0.0,
            ..on()
        };
        assert_eq!(
            rerank_query("clean it up", 0.90, &recent, true, &off_file),
            "clean it up"
        );
    }

    #[test]
    fn rerank_query_bare_when_window_off() {
        // context_depth 0 -> empty window -> always the bare prompt.
        let recent = vec!["set up pytest".to_string()];
        let off = Config {
            context_depth: 0,
            ..Config::default()
        };
        assert_eq!(rerank_query("x", 0.10, &recent, true, &off), "x");
    }

    #[test]
    fn file_ids_maps_known_extensions() {
        let got = file_ids("please clean up sales_q3.xlsx and merge report.pdf");
        assert!(got.contains("xlsx"));
        assert!(got.contains("pdf"));
        // Spreadsheet-family extensions all map to xlsx.
        assert!(file_ids("here is data.csv").contains("xlsx"));
        assert!(file_ids("the deck.pptx").contains("pptx"));
        assert!(file_ids("cover_letter.docx").contains("docx"));
        // OpenDocument / iWork / legacy office formats route by document kind.
        assert!(file_ids("budget.ods").contains("xlsx"));
        assert!(file_ids("notes.pages").contains("docx"));
        assert!(file_ids("keynote talk.key").contains("pptx"));
        assert!(file_ids("memo.rtf").contains("docx"));
    }

    #[test]
    fn file_ids_ignores_image_and_notebook_extensions() {
        // No 1:1 skill identity -> deliberately unmapped (see `ext_skill` docs).
        assert!(file_ids("see chart.png and demo.gif").is_empty());
        assert!(file_ids("open analysis.ipynb").is_empty());
    }

    #[test]
    fn project_terms_maps_manifest_in_cwd_and_ancestors() {
        // Hermetic temp tree. The project root is buried PROJECT_WALK_LEVELS
        // directories below a private base dir, so the upward walk from the root
        // (or from anything beneath it) never leaves directories this test owns.
        let base = std::env::temp_dir().join(format!(
            "ski-proj-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_nanos()
        ));
        let mut root = base.clone();
        for level in 0..PROJECT_WALK_LEVELS {
            root.push(format!("d{level}"));
        }
        let nested = root.join("src").join("inner");
        std::fs::create_dir_all(&nested).unwrap();
        std::fs::write(root.join("uv.lock"), b"version = 1\n").unwrap();

        let at_root = project_terms(root.to_str().unwrap());
        let from_nested = project_terms(nested.to_str().unwrap());
        // Clean up before asserting so a failed assertion does not leak the tree.
        std::fs::remove_dir_all(&base).ok();

        // Manifest in cwd itself: uv.lock implies uv (most specific, listed
        // first) then python — order matters for evidence attribution.
        assert_eq!(at_root, ["uv", "python"], "{at_root:?}");
        // Manifest found by walking up from a nested cwd.
        assert!(from_nested.iter().any(|t| t == "uv"), "{from_nested:?}");
    }

    #[test]
    fn project_terms_empty_when_no_manifest_or_blank_cwd() {
        assert!(project_terms("").is_empty());
        // A nonexistent path stats cleanly to empty. It is deeper than
        // PROJECT_WALK_LEVELS so the walk stops inside the nonexistent subtree and
        // never stats the real filesystem root.
        let missing = format!("/no/such/ski/path{}", "/x".repeat(PROJECT_WALK_LEVELS));
        assert!(project_terms(&missing).is_empty());
    }

    #[test]
    fn code_terms_maps_referenced_code_files() {
        let got = code_terms("please fix scripts/etl.py and look at handler.rs");
        assert!(
            got.iter().any(|t| t == "python") && got.iter().any(|t| t == "rust"),
            "{got:?}"
        );
        // Document formats belong to the file channel, not this one; prose with no
        // filenames maps nothing.
        assert!(code_terms("clean up report.xlsx").is_empty());
        assert!(code_terms("set up a project").is_empty());
    }

    #[test]
    fn skills_for_terms_matches_installed_library_dynamically() {
        let entry = |id: &str, description: &str, keywords: &[&str]| crate::index::Entry {
            id: id.to_string(),
            name: id.to_string(),
            description: description.to_string(),
            path: String::new(),
            keywords: keywords.iter().map(|k| k.to_string()).collect(),
            trigger_phrases: Vec::new(),
            body_head: String::new(),
            hash: String::new(),
            embedding: Vec::new(),
        };
        let idx = crate::index::Index {
            model: "test".into(),
            dim: 0,
            skills: vec![
                // The user's-own-library case: matched via its `uv` keyword.
                entry(
                    "uv-development",
                    "Bootstrap and manage projects.",
                    &["uv", "python"],
                ),
                // Matched via description prose only (no keywords).
                entry(
                    "rusty-style",
                    "Idiomatic Rust patterns and error handling.",
                    &[],
                ),
                // No ecosystem mention anywhere -> unmatched.
                entry("git-attribution", "Credit AI assistance in commits.", &[]),
            ],
        };
        let terms = vec!["uv".to_string(), "python".to_string(), "rust".to_string()];
        let got = skills_for_terms(&terms, &idx);
        // First matching term wins: uv-development matches both `uv` and `python`
        // keywords, and reports the more specific, earlier-listed `uv`.
        assert_eq!(got.get("uv-development").map(String::as_str), Some("uv"));
        assert_eq!(got.get("rusty-style").map(String::as_str), Some("rust"));
        assert!(!got.contains_key("git-attribution"));
        // Empty terms resolve to nothing.
        assert!(skills_for_terms(&[], &idx).is_empty());
    }

    #[test]
    fn file_ids_ignores_unmapped_and_bare_extensions() {
        // Code files map to no skill; a bare ".pdf" with no stem is not a reference.
        assert!(file_ids("edit main.rs and lib.py").is_empty());
        assert!(file_ids("the .pdf format is great").is_empty());
        // Trailing prose punctuation does not defeat the match.
        assert!(file_ids("look at budget.xlsx, then stop.").contains("xlsx"));
    }
}