Skip to main content

ski/
index.rs

//! The skill index: skill metadata plus the description embedding, persisted to
//! disk and refreshed incrementally (an entry is re-embedded only when its content
//! hash changed; a different embedding model re-embeds every entry).
4
5use crate::embed::{EmbedKind, Embedder};
6use crate::skill::Skill;
7use serde::{Deserialize, Serialize};
8use std::fs;
9use std::path::Path;
10
11#[derive(Clone, Debug, Serialize, Deserialize)]
12pub struct Entry {
13    pub id: String,
14    pub name: String,
15    pub description: String,
16    pub path: String,
17    pub keywords: Vec<String>,
18    /// Trigger phrases for the phrase channel (see [`crate::skill::extract_phrases`]).
19    /// `#[serde(default)]` so indexes written before this field still load.
20    #[serde(default)]
21    pub trigger_phrases: Vec<String>,
22    /// First prose lines of the skill body (see [`crate::skill::Skill::body_head`]),
23    /// cached so the stage-2 reranker can assemble its document text from the index
24    /// instead of re-reading and re-parsing each candidate's `SKILL.md` from disk on
25    /// the hot path. `#[serde(default)]` so indexes written before this field still
26    /// load (they parse the file until the next reindex refreshes the field).
27    #[serde(default)]
28    pub body_head: String,
29    pub hash: String,
30    pub embedding: Vec<f32>,
31}
32
33impl Entry {
34    /// Document text for the stage-2 cross-encoder — the curated description plus the
35    /// cached body head — mirroring [`crate::skill::Skill::doc_text`], but sourced
36    /// from the index so reranking touches no files.
37    pub fn doc_text(&self) -> String {
38        if self.body_head.is_empty() {
39            self.description.clone()
40        } else {
41            format!("{}\n{}", self.description, self.body_head)
42        }
43    }
44}
45
46#[derive(Clone, Debug, Default, Serialize, Deserialize)]
47pub struct Index {
48    pub model: String,
49    pub dim: usize,
50    pub skills: Vec<Entry>,
51}
52
53impl Index {
54    pub fn get(&self, id: &str) -> Option<&Entry> {
55        self.skills.iter().find(|e| e.id == id)
56    }
57
58    /// Find the skill whose `SKILL.md` lives at `path`. Used by `ski observe` to
59    /// map a file the model just read back to a skill id. Matches on the raw
60    /// stored string first (cheap, and the common case), then falls back to
61    /// canonicalized comparison so `./x` and `/abs/x` resolve to the same entry.
62    pub fn by_path(&self, path: &Path) -> Option<&Entry> {
63        let raw = path.to_string_lossy();
64        if let Some(e) = self.skills.iter().find(|e| e.path == raw) {
65            return Some(e);
66        }
67        let want = fs::canonicalize(path).ok()?;
68        self.skills
69            .iter()
70            .find(|e| fs::canonicalize(&e.path).ok().as_deref() == Some(want.as_path()))
71    }
72
73    pub fn load(path: &Path) -> anyhow::Result<Option<Index>> {
74        if !path.exists() {
75            return Ok(None);
76        }
77        let data = fs::read_to_string(path)?;
78        Ok(Some(serde_json::from_str(&data)?))
79    }
80
81    /// Persist the index. Writes a per-process temp file then atomically renames
82    /// it over the target, so a concurrent reader (a hook firing while
83    /// `session-start`/`why` refreshes the index) never observes a half-written
84    /// file — a torn read costs that hook a full re-embed of the library.
85    /// Mirrors [`crate::session::Session::save`].
86    pub fn save(&self, path: &Path) -> anyhow::Result<()> {
87        if let Some(parent) = path.parent() {
88            fs::create_dir_all(parent)?;
89        }
90        let json = serde_json::to_string_pretty(self)?;
91        let tmp = path.with_extension(format!("tmp.{}", std::process::id()));
92        fs::write(&tmp, json)?;
93        if let Err(e) = fs::rename(&tmp, path) {
94            let _ = fs::remove_file(&tmp);
95            return Err(e.into());
96        }
97        Ok(())
98    }
99}
100
101/// Build (or incrementally refresh) the index for `skills` using `embedder`.
102/// Entries in `prev` with a matching id+hash and the same model are reused; the
103/// rest are embedded in one batch.
104pub fn build(
105    skills: &[Skill],
106    embedder: &dyn Embedder,
107    prev: Option<&Index>,
108) -> anyhow::Result<Index> {
109    let model = embedder.id();
110    let mut entries: Vec<Option<Entry>> = vec![None; skills.len()];
111    let mut to_embed: Vec<usize> = Vec::new();
112
113    for (i, s) in skills.iter().enumerate() {
114        let reuse = prev
115            .filter(|p| p.model == model)
116            .and_then(|p| p.get(&s.id))
117            .filter(|e| e.hash == s.hash)
118            .cloned();
119        match reuse {
120            // Reuse the cached embedding, but refresh the cheap content-derived
121            // metadata (keywords, trigger phrases, body head): an index written
122            // before these were extracted has a matching hash, so without this the
123            // phrase channel would stay dark, and the reranker would keep re-reading
124            // files, until each skill's content next changed.
125            Some(mut e) => {
126                e.keywords = s.keywords.clone();
127                e.trigger_phrases = s.trigger_phrases.clone();
128                e.body_head = s.body_head.clone();
129                entries[i] = Some(e);
130            }
131            None => to_embed.push(i),
132        }
133    }
134
135    if !to_embed.is_empty() {
136        let texts: Vec<String> = to_embed
137            .iter()
138            .map(|&i| skills[i].description.clone())
139            .collect();
140        let embs = embedder.embed(&texts, EmbedKind::Document)?;
141        for (k, &i) in to_embed.iter().enumerate() {
142            let s = &skills[i];
143            entries[i] = Some(Entry {
144                id: s.id.clone(),
145                name: s.name.clone(),
146                description: s.description.clone(),
147                path: s.path.display().to_string(),
148                keywords: s.keywords.clone(),
149                trigger_phrases: s.trigger_phrases.clone(),
150                body_head: s.body_head.clone(),
151                hash: s.hash.clone(),
152                embedding: embs[k].clone(),
153            });
154        }
155    }
156
157    let skills: Vec<Entry> = entries.into_iter().flatten().collect();
158    let dim = skills.first().map(|e| e.embedding.len()).unwrap_or(0);
159    Ok(Index { model, dim, skills })
160}
161
#[cfg(test)]
mod tests {
    use super::*;
    use crate::skill::Skill;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Embedder that counts how many texts it was asked to embed, to prove the
    /// incremental path reuses cached vectors instead of re-embedding.
    struct CountingEmbedder(AtomicUsize);
    impl Embedder for CountingEmbedder {
        fn id(&self) -> String {
            "counting".into()
        }
        fn embed(&self, texts: &[String], _: EmbedKind) -> anyhow::Result<Vec<Vec<f32>>> {
            self.0.fetch_add(texts.len(), Ordering::SeqCst);
            Ok(texts.iter().map(|_| vec![1.0, 0.0]).collect())
        }
    }

    /// Minimal parsed skill with the given id and content hash.
    fn skill(id: &str, hash: &str) -> Skill {
        Skill {
            id: id.to_string(),
            name: id.to_string(),
            description: format!("does {id}"),
            body_head: String::new(),
            keywords: Vec::new(),
            trigger_phrases: Vec::new(),
            path: std::path::PathBuf::from(format!("/s/{id}/SKILL.md")),
            hash: hash.to_string(),
        }
    }

    /// Minimal index entry with the given id and stored path; everything else empty.
    fn entry(id: &str, path: &str) -> Entry {
        Entry {
            id: id.to_string(),
            name: id.to_string(),
            description: String::new(),
            path: path.to_string(),
            keywords: Vec::new(),
            trigger_phrases: Vec::new(),
            body_head: String::new(),
            hash: String::new(),
            embedding: Vec::new(),
        }
    }

    /// Per-process scratch directory under the system temp dir (not created here).
    fn scratch_dir(tag: &str) -> std::path::PathBuf {
        std::env::temp_dir().join(format!("ski-index-{tag}-{}", std::process::id()))
    }

    #[test]
    fn entry_doc_text_appends_body_head_when_present() {
        let mut e = entry("a", "/s/a/SKILL.md");
        e.description = "Edit Word documents.".into();
        // No body head: doc text is the description alone.
        assert_eq!(e.doc_text(), "Edit Word documents.");
        // With a body head: description then body head, newline-joined — mirroring
        // `Skill::doc_text`, but sourced from the index so reranking reads no files.
        e.body_head = "Insert tables and a table of contents.".into();
        assert_eq!(
            e.doc_text(),
            "Edit Word documents.\nInsert tables and a table of contents."
        );
    }

    #[test]
    fn build_persists_and_refreshes_body_head() {
        // A skill carries a body head; the freshly built entry must store it so the
        // reranker can read it back from the index instead of the file.
        let mut s = skill("a", "h1");
        s.body_head = "first body line".into();
        let e = CountingEmbedder(AtomicUsize::new(0));
        let idx = build(std::slice::from_ref(&s), &e, None).unwrap();
        assert_eq!(idx.get("a").unwrap().body_head, "first body line");

        // Simulate an index written before body_head existed: same id+hash, but the
        // stored entry's body_head is empty. A rebuild reuses the cached embedding
        // (no re-embed) yet must backfill body_head from the freshly parsed skill —
        // the same guarantee keywords/trigger_phrases already have.
        let mut stale = idx.clone();
        stale.skills[0].body_head.clear();
        let refreshed = build(std::slice::from_ref(&s), &e, Some(&stale)).unwrap();
        assert_eq!(e.0.load(Ordering::SeqCst), 1, "reuse must not re-embed");
        assert_eq!(refreshed.get("a").unwrap().body_head, "first body line");
    }

    #[test]
    fn body_head_absent_index_still_deserializes() {
        // An index.json written before the body_head field must load (serde default),
        // yielding an empty body_head that the next reindex backfills.
        let json = r#"{"model":"m","dim":2,"skills":[{"id":"a","name":"a",
            "description":"d","path":"/s/a/SKILL.md","keywords":[],"hash":"h",
            "embedding":[1.0,0.0]}]}"#;
        let idx: Index = serde_json::from_str(json).unwrap();
        assert_eq!(idx.get("a").unwrap().body_head, "");
        assert_eq!(idx.get("a").unwrap().doc_text(), "d");
    }

    #[test]
    fn rebuild_with_prev_reuses_unchanged_embeddings() {
        let skills = vec![skill("a", "h1"), skill("b", "h2")];
        let e = CountingEmbedder(AtomicUsize::new(0));
        let first = build(&skills, &e, None).unwrap();
        assert_eq!(e.0.load(Ordering::SeqCst), 2); // both embedded

        // Same skills, prev supplied: nothing re-embeds (the `ski why` /
        // session-start hot path).
        let again = build(&skills, &e, Some(&first)).unwrap();
        assert_eq!(
            e.0.load(Ordering::SeqCst),
            2,
            "unchanged skills re-embedded"
        );
        assert_eq!(again.skills.len(), 2);

        // One skill's content changes: only that one re-embeds.
        let changed = vec![skill("a", "h1-new"), skill("b", "h2")];
        let _ = build(&changed, &e, Some(&first)).unwrap();
        assert_eq!(
            e.0.load(Ordering::SeqCst),
            3,
            "expected exactly one re-embed"
        );
    }

    #[test]
    fn save_is_atomic_and_leaves_no_temp() {
        let dir = scratch_dir("save");
        let path = dir.join("index.json");
        let idx = Index {
            model: "m".into(),
            dim: 2,
            skills: vec![entry("a", "/s/a/SKILL.md")],
        };
        idx.save(&path).unwrap();
        let back = Index::load(&path).unwrap().unwrap();
        let leftovers: Vec<_> = fs::read_dir(&dir)
            .unwrap()
            .filter_map(|e| e.ok())
            .map(|e| e.file_name())
            .filter(|n| n != "index.json")
            .collect();
        // Clean up before asserting so a failing run does not leave the scratch
        // directory behind.
        let _ = fs::remove_dir_all(&dir);
        assert_eq!(back.skills[0].id, "a");
        assert!(leftovers.is_empty(), "temp file left behind: {leftovers:?}");
    }

    #[test]
    fn load_returns_none_for_missing_file() {
        // Neither the file nor its parent directory exists.
        let path = scratch_dir("missing").join("index.json");
        assert!(Index::load(&path).unwrap().is_none());
    }

    #[test]
    fn by_path_matches_stored_string() {
        let idx = Index {
            model: "m".into(),
            dim: 0,
            skills: vec![
                entry("pdf", "/skills/pdf/SKILL.md"),
                entry("xlsx", "/skills/xlsx/SKILL.md"),
            ],
        };
        assert_eq!(
            idx.by_path(Path::new("/skills/xlsx/SKILL.md")).unwrap().id,
            "xlsx"
        );
        assert!(idx.by_path(Path::new("/skills/none/SKILL.md")).is_none());
    }

    #[test]
    fn by_path_falls_back_to_canonicalized_comparison() {
        let dir = scratch_dir("by-path");
        fs::create_dir_all(&dir).unwrap();
        let file = dir.join("SKILL.md");
        fs::write(&file, "body").unwrap();
        let stored = fs::canonicalize(&file).unwrap();
        let idx = Index {
            model: "m".into(),
            dim: 0,
            skills: vec![entry("a", &stored.to_string_lossy())],
        };

        // Same file, different spelling: the raw string comparison misses, so only
        // the canonicalized fallback can resolve it.
        let alias = dir.join(".").join("SKILL.md");
        assert_ne!(alias.to_string_lossy(), stored.to_string_lossy());
        let hit = idx.by_path(&alias).map(|e| e.id.clone());
        let _ = fs::remove_dir_all(&dir);
        assert_eq!(hit.as_deref(), Some("a"));
    }
}