use crate::config::Config;
use crate::embed::{EmbedKind, Embedder};
use crate::index::Index;
use crate::text::{match_tokens, norm_token, tokenize};
use std::collections::{BTreeMap, BTreeSet};
use std::path::Path;
/// Maps a file extension (no leading dot) to the id of the document family
/// that handles it, or `None` when the extension is not a known document type.
///
/// Matching is exact and case-sensitive, so callers are expected to pass an
/// already lower-cased extension.
fn ext_skill(ext: &str) -> Option<&'static str> {
    // Each row is (id, extensions that resolve to that id).
    const FAMILIES: &[(&str, &[&str])] = &[
        ("pdf", &["pdf"]),
        (
            "xlsx",
            &["xlsx", "xls", "xlsm", "csv", "tsv", "ods", "numbers"],
        ),
        ("docx", &["docx", "doc", "rtf", "odt", "pages"]),
        ("pptx", &["pptx", "ppt", "odp", "key"]),
    ];
    FAMILIES
        .iter()
        .find(|(_, exts)| exts.contains(&ext))
        .map(|&(id, _)| id)
}
/// Scans free-form text for tokens that look like `stem.ext` file references.
///
/// The text is split on whitespace and on `"`, `'`, `(`, `)`, `` ` `` and `,`.
/// Each piece then has trailing sentence punctuation (`.`, `:`, `;`, `!`, `?`)
/// and trailing closing wrappers (`]`, `}`, `>`, `*`) removed, so references
/// written as `[report.pdf]`, `<data.csv>` or `**deck.pptx**` are still
/// recognised. Only the tail is trimmed: glob-like tokens such as `*.pdf` or
/// `report[0-9].pdf` are left untouched.
///
/// Yields `(stem, extension)` pairs where the extension is everything after
/// the last `.`, lower-cased (ASCII). Tokens without a `.` or with an empty
/// stem (for example a bare `.pdf`) are skipped. The stem is returned as
/// found, so it may still carry a path prefix or a leading wrapper character.
fn file_tokens(text: &str) -> impl Iterator<Item = (&str, String)> {
    text.split(|c: char| c.is_whitespace() || matches!(c, '"' | '\'' | '(' | ')' | '`' | ','))
        .filter_map(|tok| {
            // Strip sentence punctuation and closing brackets/emphasis markers
            // that commonly trail a file name in prose or markdown.
            let tok = tok.trim_end_matches(['.', ':', ';', '!', '?', ']', '}', '>', '*']);
            let (stem, ext) = tok.rsplit_once('.')?;
            if stem.is_empty() {
                // A bare ".pdf" talks about the format, not about a file.
                return None;
            }
            Some((stem, ext.to_ascii_lowercase()))
        })
}
/// Returns the ids of every document family referenced by a file name in
/// `text`.
///
/// Each `stem.ext` token found by `file_tokens` is looked up with `ext_skill`;
/// unknown extensions are ignored. The result is a set, so each id appears at
/// most once regardless of how many matching files are mentioned.
pub fn file_ids(text: &str) -> BTreeSet<String> {
    file_tokens(text)
        .filter_map(|(_, ext)| ext_skill(&ext))
        .map(String::from)
        .collect()
}
/// Project manifest file names and the search terms each one implies.
///
/// `project_terms` checks for these names in a directory and its ancestors and
/// emits the associated terms in table order, so the order of rows (and of the
/// terms within a row) determines the order of its output.
///
/// All four Compose file names accepted by Docker Compose are listed
/// (`compose.yaml`, `compose.yml`, `docker-compose.yaml`, `docker-compose.yml`).
const MANIFEST_TERMS: &[(&str, &[&str])] = &[
    ("Cargo.toml", &["rust", "cargo"]),
    ("go.mod", &["go", "golang"]),
    ("uv.lock", &["uv", "python"]),
    ("pyproject.toml", &["python"]),
    ("requirements.txt", &["python", "pip"]),
    ("setup.py", &["python"]),
    ("Pipfile", &["python"]),
    ("package.json", &["javascript", "node", "npm"]),
    ("tsconfig.json", &["typescript"]),
    ("Gemfile", &["ruby"]),
    ("pom.xml", &["java", "maven"]),
    ("build.gradle", &["java", "gradle"]),
    ("build.gradle.kts", &["kotlin", "gradle"]),
    ("Dockerfile", &["docker"]),
    ("docker-compose.yml", &["docker"]),
    ("docker-compose.yaml", &["docker"]),
    ("compose.yaml", &["docker"]),
    ("compose.yml", &["docker"]),
    ("flake.nix", &["nix"]),
    ("CMakeLists.txt", &["cmake"]),
];
/// Maps a source-code file extension (no leading dot) to the search terms it
/// implies, or `None` when the extension is not a recognised code type.
///
/// Matching is exact and case-sensitive, so callers are expected to pass an
/// already lower-cased extension.
fn ext_terms(ext: &str) -> Option<&'static [&'static str]> {
    // Each row is (extensions, terms implied by any of those extensions).
    const LANGUAGES: &[(&[&str], &[&str])] = &[
        (&["py"], &["python"]),
        (&["ipynb"], &["python", "jupyter", "notebook"]),
        (&["rs"], &["rust"]),
        (&["go"], &["go", "golang"]),
        (&["ts", "tsx"], &["typescript"]),
        (&["js", "jsx", "mjs"], &["javascript", "node"]),
        (&["rb"], &["ruby"]),
        (&["java"], &["java"]),
        (&["kt"], &["kotlin"]),
        (&["tf"], &["terraform"]),
        (&["sql"], &["sql"]),
        (&["sh", "bash"], &["shell", "bash"]),
    ];
    LANGUAGES
        .iter()
        .find(|(exts, _)| exts.contains(&ext))
        .map(|&(_, terms)| terms)
}
/// Maximum number of directories `project_terms` inspects for manifests: the
/// starting directory plus up to five of its ancestors.
const PROJECT_WALK_LEVELS: usize = 6;
/// Appends `term` to `out` unless an equal string is already present, so `out`
/// keeps first-seen order without duplicates.
fn push_term(out: &mut Vec<String>, term: &str) {
    let already_listed = out.iter().any(|existing| existing == term);
    if already_listed {
        return;
    }
    out.push(term.to_owned());
}
/// Derives search terms from the project manifests found at or above `cwd`.
///
/// Starting at `cwd` and moving up through its ancestors (at most
/// `PROJECT_WALK_LEVELS` directories in total), every file name listed in
/// `MANIFEST_TERMS` that exists in the directory contributes its terms. Terms
/// are returned in discovery order without duplicates.
///
/// An empty `cwd` yields an empty list; a path that does not exist simply
/// finds no manifests.
pub fn project_terms(cwd: &str) -> Vec<String> {
    let mut found = Vec::new();
    if cwd.is_empty() {
        return found;
    }
    // `ancestors` yields the path itself first, then each successive parent.
    for dir in Path::new(cwd).ancestors().take(PROJECT_WALK_LEVELS) {
        for (manifest, terms) in MANIFEST_TERMS.iter() {
            if !dir.join(manifest).exists() {
                continue;
            }
            terms.iter().for_each(|term| push_term(&mut found, term));
        }
    }
    found
}
/// Derives search terms from source-code files mentioned in `text`.
///
/// Every `stem.ext` token found by `file_tokens` whose extension is known to
/// `ext_terms` contributes that extension's terms. Terms are returned in
/// first-seen order without duplicates; text that mentions no recognised code
/// file yields an empty list.
pub fn code_terms(text: &str) -> Vec<String> {
    let mut collected = Vec::new();
    file_tokens(text)
        .filter_map(|(_, ext)| ext_terms(&ext))
        .flatten()
        .for_each(|term| push_term(&mut collected, term));
    collected
}
/// Finds the indexed skills whose keywords or description mention any of
/// `terms`.
///
/// Each term is normalised with `norm_token`. For every skill in `idx`, a
/// token set is built from its tokenised, normalised keywords plus the tokens
/// `match_tokens` extracts from its description. The first normalised term (in
/// the order given) that appears in that set is recorded for the skill.
///
/// Returns a map from skill id to the normalised term that matched it. Skills
/// with no matching term are absent; an empty `terms` slice yields an empty
/// map.
pub fn skills_for_terms(terms: &[String], idx: &Index) -> BTreeMap<String, String> {
    let mut matched = BTreeMap::new();
    if terms.is_empty() {
        return matched;
    }
    let wanted: Vec<String> = terms.iter().map(|t| norm_token(t)).collect();
    for skill in &idx.skills {
        let mut vocab: BTreeSet<String> = BTreeSet::new();
        for keyword in skill.keywords.iter() {
            vocab.extend(tokenize(keyword).into_iter().map(|tok| norm_token(&tok)));
        }
        vocab.extend(match_tokens(&skill.description));
        let hit = wanted.iter().find(|term| vocab.contains(*term));
        if let Some(term) = hit {
            matched.insert(skill.id.clone(), term.clone());
        }
    }
    matched
}
/// Returns the trailing slice of `recent` that counts as conversational
/// context: at most `cfg.context_depth` of the last entries.
///
/// The slice is empty when `cfg.context_depth` is zero or `recent` is empty.
fn window<'a>(recent: &'a [String], cfg: &Config) -> &'a [String] {
    let depth = cfg.context_depth;
    if depth == 0 {
        return &[];
    }
    // Saturates to 0 when fewer than `depth` entries exist, keeping them all.
    let start = recent.len().saturating_sub(depth);
    &recent[start..]
}
/// Builds a single context embedding from the most recent entries of `recent`.
///
/// The entries selected by `window` are embedded as queries and combined into
/// a weighted mean: the last entry has weight 1 and each earlier entry has
/// half the weight of the one after it.
///
/// Returns `Ok(None)` when `cfg.context_weight` is not positive, when the
/// window is empty, or when the embedder returns no embeddings.
///
/// # Errors
///
/// Propagates any error returned by `embedder.embed`.
pub fn vector(
    embedder: &dyn Embedder,
    recent: &[String],
    cfg: &Config,
) -> anyhow::Result<Option<Vec<f32>>> {
    if cfg.context_weight <= 0.0 {
        return Ok(None);
    }
    let turns = window(recent, cfg);
    if turns.is_empty() {
        return Ok(None);
    }
    let embeddings = embedder.embed(turns, EmbedKind::Query)?;
    let dim = match embeddings.first() {
        Some(first) => first.len(),
        None => return Ok(None),
    };
    // Index of the newest entry; its distance from each position is the
    // number of halvings applied to that position's weight.
    let newest = embeddings.len() - 1;
    let mut blended = vec![0.0f32; dim];
    let mut total_weight = 0.0f32;
    for (pos, embedding) in embeddings.iter().enumerate() {
        let weight = 0.5f32.powi((newest - pos) as i32);
        total_weight += weight;
        blended
            .iter_mut()
            .zip(embedding)
            .for_each(|(slot, value)| *slot += weight * value);
    }
    if total_weight > 0.0 {
        blended.iter_mut().for_each(|slot| *slot /= total_weight);
    }
    Ok(Some(blended))
}
/// Builds the query text handed to the reranker, optionally prefixed with
/// recent conversational context.
///
/// The entries selected by `window` are prepended to `prompt`, one per line,
/// when at least one of these holds:
/// - `crate::rank::context_weight(prompt_top, cfg)` is positive, or
/// - `file_present` is true and `cfg.file_boost` is positive.
///
/// Otherwise, or when the window is empty, the bare `prompt` is returned.
pub fn rerank_query(
    prompt: &str,
    prompt_top: f32,
    recent: &[String],
    file_present: bool,
    cfg: &Config,
) -> String {
    let turns = window(recent, cfg);
    if turns.is_empty() {
        return prompt.to_owned();
    }
    let vague_prompt = crate::rank::context_weight(prompt_top, cfg) > 0.0;
    let file_signal = file_present && cfg.file_boost > 0.0;
    if vague_prompt || file_signal {
        let mut enriched = turns.join("\n");
        enriched.push('\n');
        enriched.push_str(prompt);
        enriched
    } else {
        prompt.to_owned()
    }
}
// Unit tests for the context helpers in this module: context vector and
// rerank-query construction, file/extension detection, manifest discovery and
// term-to-skill matching.
#[cfg(test)]
mod tests {
use super::*;
use crate::embed::bow::BowEmbedder;
// Config used by tests that need context enabled: context_depth = 2,
// context_weight = 0.3, vague_lo = 0.55, vague_hi = 0.65, everything else
// left at its default.
fn on() -> Config {
Config {
context_depth: 2,
context_weight: 0.3,
vague_lo: 0.55,
vague_hi: 0.65,
..Default::default()
}
}
// `vector` yields None both with the default config (which this test expects
// to leave context disabled) and with an empty history.
#[test]
fn vector_none_when_disabled_or_empty() {
let e = BowEmbedder::new();
let off = Config::default();
assert!(vector(&e, &["a".into()], &off).unwrap().is_none());
assert!(vector(&e, &[], &on()).unwrap().is_none());
}
// With context enabled and a non-empty history, a non-empty vector comes back.
#[test]
fn vector_built_when_enabled() {
let e = BowEmbedder::new();
let v = vector(
&e,
&["set up pytest".into(), "now the other one".into()],
&on(),
)
.unwrap()
.expect("a vector");
assert!(!v.is_empty());
}
// With context_depth = 1 only the last entry is used, so the result must
// equal that entry's own embedding (within float tolerance).
#[test]
fn vector_respects_depth() {
let e = BowEmbedder::new();
let cfg = Config {
context_depth: 1,
..on()
};
let got = vector(&e, &["a a a".into(), "b b b".into()], &cfg)
.unwrap()
.unwrap();
let want = e
.embed(&["b b b".into()], EmbedKind::Query)
.unwrap()
.remove(0);
assert_eq!(got.len(), want.len());
for (g, w) in got.iter().zip(&want) {
assert!((g - w).abs() < 1e-6);
}
}
// A prompt top score of 0.50 gets the history prepended; a score of 0.90
// leaves the prompt untouched (no file signal in either case).
#[test]
fn rerank_query_enriches_only_when_vague() {
let recent = vec!["set up pytest".to_string()];
let cfg = on();
let vague = rerank_query("now the other one", 0.50, &recent, false, &cfg);
assert_eq!(vague, "set up pytest\nnow the other one");
let confident = rerank_query("now the other one", 0.90, &recent, false, &cfg);
assert_eq!(confident, "now the other one");
}
// A present file with a positive file_boost forces enrichment even at a
// 0.90 top score; with file_boost = 0.0 the prompt stays bare.
#[test]
fn rerank_query_enriches_for_file_even_when_confident() {
let recent = vec!["attached sales.xlsx".to_string()];
let cfg = Config {
file_boost: 0.2,
..on()
};
let got = rerank_query("clean it up", 0.90, &recent, true, &cfg);
assert_eq!(got, "attached sales.xlsx\nclean it up");
let off_file = Config {
file_boost: 0.0,
..on()
};
assert_eq!(
rerank_query("clean it up", 0.90, &recent, true, &off_file),
"clean it up"
);
}
// context_depth = 0 empties the window, so nothing is prepended regardless
// of the score or the file flag.
#[test]
fn rerank_query_bare_when_window_off() {
let recent = vec!["set up pytest".to_string()];
let off = Config {
context_depth: 0,
..Config::default()
};
assert_eq!(rerank_query("x", 0.10, &recent, true, &off), "x");
}
// Spreadsheet, PDF, word-processing and presentation extensions each map to
// their family id.
#[test]
fn file_ids_maps_known_extensions() {
let got = file_ids("please clean up sales_q3.xlsx and merge report.pdf");
assert!(got.contains("xlsx"));
assert!(got.contains("pdf"));
assert!(file_ids("here is data.csv").contains("xlsx"));
assert!(file_ids("the deck.pptx").contains("pptx"));
assert!(file_ids("cover_letter.docx").contains("docx"));
assert!(file_ids("budget.ods").contains("xlsx"));
assert!(file_ids("notes.pages").contains("docx"));
assert!(file_ids("keynote talk.key").contains("pptx"));
assert!(file_ids("memo.rtf").contains("docx"));
}
// Image and notebook files are not document families, so no id is produced.
#[test]
fn file_ids_ignores_image_and_notebook_extensions() {
assert!(file_ids("see chart.png and demo.gif").is_empty());
assert!(file_ids("open analysis.ipynb").is_empty());
}
// Creates a throwaway directory tree with a uv.lock at its root and checks
// the manifest is found both from the root and from two levels below it.
// The exact-equality assertion assumes no ancestor of the temp directory
// (within the walk limit) contains a manifest of its own.
#[test]
fn project_terms_maps_manifest_in_cwd_and_ancestors() {
// Process id plus a nanosecond timestamp keeps the directory name unique.
let root = std::env::temp_dir().join(format!(
"ski-proj-{}-{}",
std::process::id(),
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_nanos()
));
let nested = root.join("src").join("inner");
std::fs::create_dir_all(&nested).unwrap();
std::fs::write(root.join("uv.lock"), b"version = 1\n").unwrap();
let terms = project_terms(root.to_str().unwrap());
assert_eq!(terms, ["uv", "python"], "{terms:?}");
assert!(project_terms(nested.to_str().unwrap())
.iter()
.any(|t| t == "uv"));
// Best-effort cleanup; a failure here must not fail the test.
std::fs::remove_dir_all(&root).ok();
}
// An empty cwd and a non-existent path both produce no terms.
#[test]
fn project_terms_empty_when_no_manifest_or_blank_cwd() {
assert!(project_terms("").is_empty());
assert!(project_terms("/no/such/ski/path/here").is_empty());
}
// Code file references yield language terms; document files and plain prose
// yield none.
#[test]
fn code_terms_maps_referenced_code_files() {
let got = code_terms("please fix scripts/etl.py and look at handler.rs");
assert!(
got.iter().any(|t| t == "python") && got.iter().any(|t| t == "rust"),
"{got:?}"
);
assert!(code_terms("clean up report.xlsx").is_empty());
assert!(code_terms("set up a project").is_empty());
}
// Builds a three-skill index in memory: one matched through keywords, one
// through its description, and one that matches nothing.
#[test]
fn skills_for_terms_matches_installed_library_dynamically() {
// Helper that fills in an index entry with only the fields relevant here.
let entry = |id: &str, description: &str, keywords: &[&str]| crate::index::Entry {
id: id.to_string(),
name: id.to_string(),
description: description.to_string(),
path: String::new(),
keywords: keywords.iter().map(|k| k.to_string()).collect(),
trigger_phrases: Vec::new(),
body_head: String::new(),
hash: String::new(),
embedding: Vec::new(),
};
let idx = crate::index::Index {
model: "test".into(),
dim: 0,
skills: vec![
entry(
"uv-development",
"Bootstrap and manage projects.",
&["uv", "python"],
),
entry(
"rusty-style",
"Idiomatic Rust patterns and error handling.",
&[],
),
entry("git-attribution", "Credit AI assistance in commits.", &[]),
],
};
let terms = vec!["uv".to_string(), "python".to_string(), "rust".to_string()];
let got = skills_for_terms(&terms, &idx);
// "uv" precedes "python" in `terms`, so it is the term reported.
assert_eq!(got.get("uv-development").map(String::as_str), Some("uv"));
assert_eq!(got.get("rusty-style").map(String::as_str), Some("rust"));
assert!(!got.contains_key("git-attribution"));
assert!(skills_for_terms(&[], &idx).is_empty());
}
// Code extensions are not document families, a bare ".pdf" has no stem, and
// trailing punctuation after a file name does not prevent detection.
#[test]
fn file_ids_ignores_unmapped_and_bare_extensions() {
assert!(file_ids("edit main.rs and lib.py").is_empty());
assert!(file_ids("the .pdf format is great").is_empty());
assert!(file_ids("look at budget.xlsx, then stop.").contains("xlsx"));
}
}