ski/
history.rs

1//! `ski history` — read the opt-in telemetry log. Two views over the same JSONL:
2//! the default **aggregate** calibration readout (recommendations vs. actual
3//! use), and a `--tail N` **per-event** listing that shows each recommendation's
4//! prompt, stage, every candidate's confidence, which ids were injected, and
5//! whether each injected skill was then used in that session — the view you want
6//! when iterating on matcher quality. `ski clear` wipes per-session dedup state
7//! (and optionally the log). All read-only and tolerant of malformed lines
8//! (skips them) so a partially-written log still reports.
9
10use crate::paths;
11use std::collections::{BTreeMap, BTreeSet};
12use std::fs;
13
14/// Parsed counts for the `history` readout.
15#[derive(Debug, Default, PartialEq)]
16pub struct Stats {
17    pub recommend_events: u64,
18    pub use_events: u64,
19    pub sessions: usize,
20    /// Distinct (session, skill) pairs that were recommended.
21    pub recommended: u64,
22    /// Recommended pairs the model then used in the same session.
23    pub used_after_rec: u64,
24    /// Recommended pairs never used (false positives).
25    pub false_positives: u64,
26    /// Used pairs never recommended in that session (recall misses).
27    pub recall_misses: u64,
28    /// skill id -> how many sessions recommended-but-unused.
29    pub fp_by_skill: BTreeMap<String, u64>,
30    /// skill id -> how many sessions used-but-unrecommended.
31    pub miss_by_skill: BTreeMap<String, u64>,
32}
33
34/// One candidate skill with its shown confidence, for the per-event listing.
35#[derive(Debug, PartialEq)]
36pub struct Cand {
37    pub id: String,
38    pub confidence: f32,
39}
40
41/// One entry of a `recommend` event's pre-gate `considered` ranking: a skill and
42/// the raw stage score it was ranked at (cosine-blend, reranker logit, or BM25).
43#[derive(Debug, PartialEq)]
44pub struct Ranked {
45    pub id: String,
46    pub score: f32,
47}
48
49/// A single `recommend` event, fully parsed for the `--tail` listing.
50#[derive(Debug, PartialEq)]
51pub struct RecEvent {
52    pub ts: u128,
53    pub session: String,
54    pub stage: String,
55    pub prompt: String,
56    /// The pre-gate ranking ski considered (top-K, id + raw stage score). Present
57    /// on every prompt, including abstentions — the field the `--compare` view
58    /// joins a native pick against. Empty for events logged before this field.
59    pub considered: Vec<Ranked>,
60    /// Every candidate that cleared the gate.
61    pub candidates: Vec<Cand>,
62    /// The subset that fit the char budget and was actually injected.
63    pub injected: Vec<Cand>,
64    /// Why nothing was injected (`below_gate`, `empty_text`), or `None` on a
65    /// successful inject.
66    pub abstained: Option<String>,
67}
68
69/// A single `use` event (the model loaded a skill itself), for the `--tail`
70/// listing. `prompt` is the call that was active when it loaded, if telemetry
71/// captured it (empty otherwise).
72#[derive(Debug, PartialEq)]
73pub struct UseEvent {
74    pub ts: u128,
75    pub session: String,
76    pub skill: String,
77    pub via: String,
78    pub prompt: String,
79}
80
81/// Parse every `recommend` event in log order (oldest first), keeping full
82/// per-event detail. Pure; the listing view joins these against [`used_by_session`].
83pub fn recommend_events(log: &str) -> Vec<RecEvent> {
84    let mut out = Vec::new();
85    for line in log.lines() {
86        let line = line.trim();
87        if line.is_empty() {
88            continue;
89        }
90        let Ok(v) = serde_json::from_str::<serde_json::Value>(line) else {
91            continue;
92        };
93        if v.get("kind").and_then(|k| k.as_str()) != Some("recommend") {
94            continue;
95        }
96        out.push(RecEvent {
97            ts: v.get("ts").and_then(|t| t.as_u64()).unwrap_or(0) as u128,
98            session: str_field(&v, "session"),
99            stage: str_field(&v, "stage"),
100            prompt: str_field(&v, "prompt"),
101            considered: parse_ranked(v.get("considered")),
102            candidates: parse_cands(v.get("candidates")),
103            injected: parse_cands(v.get("injected")),
104            abstained: v
105                .get("abstained")
106                .and_then(|a| a.as_str())
107                .map(str::to_string),
108        });
109    }
110    out
111}
112
113/// Per-session set of skill ids the model loaded itself (from `use` events), so
114/// the listing can mark each injected candidate used vs. unused. Pure.
115pub fn used_by_session(log: &str) -> BTreeMap<String, BTreeSet<String>> {
116    let mut used: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
117    for line in log.lines() {
118        let Ok(v) = serde_json::from_str::<serde_json::Value>(line.trim()) else {
119            continue;
120        };
121        if v.get("kind").and_then(|k| k.as_str()) != Some("use") {
122            continue;
123        }
124        if let Some(skill) = v.get("skill").and_then(|s| s.as_str()) {
125            used.entry(str_field(&v, "session"))
126                .or_default()
127                .insert(skill.to_string());
128        }
129    }
130    used
131}
132
133/// Parse every `use` event in log order (oldest first), keeping the prompt for
134/// the listing. Pure.
135pub fn use_events(log: &str) -> Vec<UseEvent> {
136    let mut out = Vec::new();
137    for line in log.lines() {
138        let Ok(v) = serde_json::from_str::<serde_json::Value>(line.trim()) else {
139            continue;
140        };
141        if v.get("kind").and_then(|k| k.as_str()) != Some("use") {
142            continue;
143        }
144        let Some(skill) = v.get("skill").and_then(|s| s.as_str()) else {
145            continue;
146        };
147        out.push(UseEvent {
148            ts: v.get("ts").and_then(|t| t.as_u64()).unwrap_or(0) as u128,
149            session: str_field(&v, "session"),
150            skill: skill.to_string(),
151            via: str_field(&v, "via"),
152            prompt: str_field(&v, "prompt"),
153        });
154    }
155    out
156}
157
158/// Per-session set of skill ids that were injected (from `recommend` events), so
159/// the listing can label a `use` as acting on a recommendation vs. a recall miss.
160/// Pure.
161pub fn recommended_by_session(log: &str) -> BTreeMap<String, BTreeSet<String>> {
162    let mut rec: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
163    for e in recommend_events(log) {
164        let entry = rec.entry(e.session).or_default();
165        for c in e.injected {
166            entry.insert(c.id);
167        }
168    }
169    rec
170}
171
172fn str_field(v: &serde_json::Value, key: &str) -> String {
173    v.get(key)
174        .and_then(|s| s.as_str())
175        .unwrap_or("")
176        .to_string()
177}
178
179fn parse_cands(v: Option<&serde_json::Value>) -> Vec<Cand> {
180    v.and_then(|a| a.as_array())
181        .map(|arr| {
182            arr.iter()
183                .filter_map(|item| {
184                    let id = item.get("id")?.as_str()?.to_string();
185                    let confidence = item
186                        .get("confidence")
187                        .and_then(|c| c.as_f64())
188                        .unwrap_or(0.0) as f32;
189                    Some(Cand { id, confidence })
190                })
191                .collect()
192        })
193        .unwrap_or_default()
194}
195
196fn parse_ranked(v: Option<&serde_json::Value>) -> Vec<Ranked> {
197    v.and_then(|a| a.as_array())
198        .map(|arr| {
199            arr.iter()
200                .filter_map(|item| {
201                    let id = item.get("id")?.as_str()?.to_string();
202                    let score = item.get("score").and_then(|s| s.as_f64()).unwrap_or(0.0) as f32;
203                    Some(Ranked { id, score })
204                })
205                .collect()
206        })
207        .unwrap_or_default()
208}
209
210/// Aggregate raw JSONL log text. Pure, so it is unit-testable without IO.
211pub fn aggregate(log: &str) -> Stats {
212    // Per session: the set of recommended skill ids and the set of used skill ids.
213    let mut rec: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
214    let mut used: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
215    let mut stats = Stats::default();
216
217    for line in log.lines() {
218        let line = line.trim();
219        if line.is_empty() {
220            continue;
221        }
222        let Ok(v) = serde_json::from_str::<serde_json::Value>(line) else {
223            continue;
224        };
225        let session = v.get("session").and_then(|s| s.as_str()).unwrap_or("");
226        match v.get("kind").and_then(|k| k.as_str()) {
227            Some("recommend") => {
228                stats.recommend_events += 1;
229                let entry = rec.entry(session.to_string()).or_default();
230                if let Some(arr) = v.get("injected").and_then(|i| i.as_array()) {
231                    for item in arr {
232                        if let Some(id) = item.get("id").and_then(|i| i.as_str()) {
233                            entry.insert(id.to_string());
234                        }
235                    }
236                }
237            }
238            Some("use") => {
239                stats.use_events += 1;
240                if let Some(skill) = v.get("skill").and_then(|s| s.as_str()) {
241                    used.entry(session.to_string())
242                        .or_default()
243                        .insert(skill.to_string());
244                }
245            }
246            _ => {}
247        }
248    }
249
250    let sessions: BTreeSet<&String> = rec.keys().chain(used.keys()).collect();
251    stats.sessions = sessions.len();
252
253    for session in sessions {
254        let recommended = rec.get(session).cloned().unwrap_or_default();
255        let consumed = used.get(session).cloned().unwrap_or_default();
256        for id in &recommended {
257            stats.recommended += 1;
258            if consumed.contains(id) {
259                stats.used_after_rec += 1;
260            } else {
261                stats.false_positives += 1;
262                *stats.fp_by_skill.entry(id.clone()).or_default() += 1;
263            }
264        }
265        for id in &consumed {
266            if !recommended.contains(id) {
267                stats.recall_misses += 1;
268                *stats.miss_by_skill.entry(id.clone()).or_default() += 1;
269            }
270        }
271    }
272    stats
273}
274
275/// Rank at/under which an abstained-on native pick counts as a near-miss (a
276/// tunable edge) rather than buried. ski's top few are the realistically
277/// reachable band for a looser gate or a stronger retrieval channel.
278const NEAR_MISS_RANK: usize = 3;
279
280/// How ski's ranking lines up with the native chooser's pick on one prompt.
281#[derive(Debug, PartialEq)]
282pub enum Verdict {
283    /// ski injected the same skill the model then used. Redundant — and because
284    /// the model saw ski's nudge first, this does *not* prove ski caused the pick.
285    Agreed,
286    /// ski abstained but ranked the native pick in its top [`NEAR_MISS_RANK`] —
287    /// the tunable edge: a looser gate or stronger channel could have surfaced it.
288    NearMiss { rank: usize, score: f32 },
289    /// ski abstained and ranked the native pick deeper in its considered top-K.
290    Buried { rank: usize, score: f32 },
291    /// ski never surfaced the native pick in its considered top-K — the retrieval
292    /// ceiling; narrowing the gate cannot win this one.
293    Absent,
294    /// No considered ranking was logged for the prompt (telemetry was off then, or
295    /// a pre-feature event), so the pick can't be placed.
296    NoRanking,
297}
298
299/// One native-chooser pick (a `use` event) joined to ski's ranking on the same
300/// prompt (the matching `recommend` event).
301#[derive(Debug, PartialEq)]
302pub struct CompareRow {
303    pub session: String,
304    pub prompt: String,
305    pub stage: String,
306    /// The skill the native chooser picked.
307    pub native: String,
308    pub via: String,
309    pub verdict: Verdict,
310}
311
312/// Join every native-chooser pick to where ski ranked it on the same prompt.
313/// Pure, so the classification is unit-testable without IO. A `use` is matched to
314/// the latest `recommend` event with the same session + prompt at/just before it.
315pub fn compare(log: &str) -> Vec<CompareRow> {
316    let recs = recommend_events(log);
317    use_events(log)
318        .into_iter()
319        .map(|u| {
320            let rec = match_recommend(&recs, &u);
321            let (stage, verdict) = match rec {
322                None => (String::new(), Verdict::NoRanking),
323                Some(r) => (r.stage.clone(), classify(r, &u.skill)),
324            };
325            CompareRow {
326                session: u.session,
327                prompt: u.prompt,
328                stage,
329                native: u.skill,
330                via: u.via,
331                verdict,
332            }
333        })
334        .collect()
335}
336
337/// The `recommend` event a `use` should be scored against: same session + prompt,
338/// preferring the latest one at or before the use (the call that was active), and
339/// falling back to the earliest match when none precede it (clock skew / ordering).
340fn match_recommend<'a>(recs: &'a [RecEvent], u: &UseEvent) -> Option<&'a RecEvent> {
341    if u.prompt.is_empty() {
342        return None;
343    }
344    let mut matches: Vec<&RecEvent> = recs
345        .iter()
346        .filter(|r| r.session == u.session && r.prompt == u.prompt)
347        .collect();
348    if matches.is_empty() {
349        return None;
350    }
351    matches.sort_by_key(|r| r.ts);
352    matches
353        .iter()
354        .rev()
355        .find(|r| r.ts <= u.ts)
356        .or_else(|| matches.first())
357        .copied()
358}
359
360/// Place `native` (the picked skill) within one recommend event's outcome.
361fn classify(r: &RecEvent, native: &str) -> Verdict {
362    if r.injected.iter().any(|c| c.id == native) {
363        return Verdict::Agreed;
364    }
365    match r.considered.iter().position(|c| c.id == native) {
366        Some(i) => {
367            let rank = i + 1;
368            let score = r.considered[i].score;
369            if rank <= NEAR_MISS_RANK {
370                Verdict::NearMiss { rank, score }
371            } else {
372                Verdict::Buried { rank, score }
373            }
374        }
375        None => Verdict::Absent,
376    }
377}
378
379/// `ski history`: read the telemetry log. With `tail`, list that many recent
380/// recommendation events individually; with `compare`, show ski's pick vs the
381/// native chooser's pick per prompt; otherwise print the aggregate readout.
382/// `session` filters the listing to sessions whose id contains the substring.
383pub fn run(tail: Option<usize>, session: Option<&str>, compare_view: bool) -> anyhow::Result<()> {
384    let path = paths::telemetry_path();
385    let Ok(log) = fs::read_to_string(&path) else {
386        println!(
387            "no telemetry log at {} (enable with SKI_TELEMETRY=1)",
388            path.display()
389        );
390        return Ok(());
391    };
392    if compare_view {
393        print_compare(&log, session);
394    } else {
395        match tail {
396            Some(n) => print_events(&log, n, session),
397            None => print_aggregate(&log),
398        }
399    }
400    Ok(())
401}
402
403/// One entry in the merged per-call timeline: a recommendation we made, or a
404/// skill the model loaded itself.
405enum Ev<'a> {
406    Rec(&'a RecEvent),
407    Use(&'a UseEvent),
408}
409
410impl Ev<'_> {
411    fn ts(&self) -> u128 {
412        match self {
413            Ev::Rec(e) => e.ts,
414            Ev::Use(u) => u.ts,
415        }
416    }
417}
418
419/// Render the last `n` events (recommendations and self-loads, interleaved by
420/// time) with full per-call detail.
421fn print_events(log: &str, n: usize, session_filter: Option<&str>) {
422    let used = used_by_session(log);
423    let recd = recommended_by_session(log);
424    let recs = recommend_events(log);
425    let uses = use_events(log);
426    let keep = |s: &str| session_filter.is_none_or(|sf| s.contains(sf));
427
428    let mut timeline: Vec<Ev> = recs
429        .iter()
430        .filter(|e| keep(&e.session))
431        .map(Ev::Rec)
432        .chain(uses.iter().filter(|u| keep(&u.session)).map(Ev::Use))
433        .collect();
434    // Stable sort keeps a recommend before a same-millisecond use of it.
435    timeline.sort_by_key(Ev::ts);
436
437    if timeline.is_empty() {
438        println!("no events");
439        return;
440    }
441    let total = timeline.len();
442    let start = total.saturating_sub(n);
443    println!("showing {} of {total} events", total - start);
444    let now = now_ms();
445    let empty = BTreeSet::new();
446    for ev in &timeline[start..] {
447        match ev {
448            Ev::Rec(e) => {
449                let used_here = used.get(&e.session).unwrap_or(&empty);
450                let injected_ids: BTreeSet<&str> =
451                    e.injected.iter().map(|c| c.id.as_str()).collect();
452                println!(
453                    "\n{}  session {}  rec  stage {}",
454                    ago(e.ts, now),
455                    short(&e.session),
456                    if e.stage.is_empty() { "?" } else { &e.stage },
457                );
458                println!("  prompt: {}", truncate(&e.prompt, 120));
459                for c in &e.injected {
460                    let mark = if used_here.contains(&c.id) {
461                        "used"
462                    } else {
463                        "unused"
464                    };
465                    println!("  -> {:<26} {:.2}  {mark}", c.id, c.confidence);
466                }
467                // Candidates that cleared the gate but lost to the char budget.
468                for c in &e.candidates {
469                    if !injected_ids.contains(c.id.as_str()) {
470                        println!("     {:<26} {:.2}  (over budget)", c.id, c.confidence);
471                    }
472                }
473            }
474            Ev::Use(u) => {
475                let acted = recd.get(&u.session).is_some_and(|s| s.contains(&u.skill));
476                let tag = if acted { "acted on rec" } else { "RECALL MISS" };
477                println!(
478                    "\n{}  session {}  use  {} via {} ({tag})",
479                    ago(u.ts, now),
480                    short(&u.session),
481                    u.skill,
482                    u.via,
483                );
484                if !u.prompt.is_empty() {
485                    println!("  prompt: {}", truncate(&u.prompt, 120));
486                }
487            }
488        }
489    }
490}
491
492/// The aggregate calibration readout (the default `ski history` view).
493fn print_aggregate(log: &str) {
494    let s = aggregate(log);
495    println!(
496        "events: {} recommend, {} use across {} sessions",
497        s.recommend_events, s.use_events, s.sessions
498    );
499    println!(
500        "recommended: {}   used-after-rec: {} ({})   false positives: {} ({})",
501        s.recommended,
502        s.used_after_rec,
503        pct(s.used_after_rec, s.recommended),
504        s.false_positives,
505        pct(s.false_positives, s.recommended),
506    );
507    println!(
508        "recall misses (used, never recommended): {}",
509        s.recall_misses
510    );
511    print_top("top false positives", &s.fp_by_skill);
512    print_top("top recall misses", &s.miss_by_skill);
513}
514
515/// The `--compare` view: every native-chooser pick joined to where ski ranked it.
516/// The whole point is the NEAR-MISS bucket — prompts the model found a skill on
517/// that ski ranked but gated out — versus ABSENT, where ski never surfaced it.
518fn print_compare(log: &str, session_filter: Option<&str>) {
519    let rows: Vec<CompareRow> = compare(log)
520        .into_iter()
521        .filter(|r| session_filter.is_none_or(|sf| r.session.contains(sf)))
522        .collect();
523    if rows.is_empty() {
524        println!("no native-chooser picks logged (need `use` events; enable SKI_TELEMETRY=1)");
525        return;
526    }
527    let sessions: BTreeSet<&str> = rows.iter().map(|r| r.session.as_str()).collect();
528    let (mut agreed, mut near, mut buried, mut absent, mut no_rank) = (0, 0, 0, 0, 0);
529    for r in &rows {
530        match r.verdict {
531            Verdict::Agreed => agreed += 1,
532            Verdict::NearMiss { .. } => near += 1,
533            Verdict::Buried { .. } => buried += 1,
534            Verdict::Absent => absent += 1,
535            Verdict::NoRanking => no_rank += 1,
536        }
537    }
538    println!(
539        "ski vs native chooser — {} picks across {} sessions",
540        rows.len(),
541        sessions.len()
542    );
543    println!("  agreed (ski injected it too):                {agreed}");
544    println!("  NEAR-MISS (ski ranked it ≤{NEAR_MISS_RANK}, abstained): {near}   <- tunable edge");
545    println!("  buried (ranked deeper, abstained):           {buried}");
546    println!("  absent (ski never surfaced it):              {absent}   <- retrieval ceiling");
547    if no_rank > 0 {
548        println!("  no ranking logged for the prompt:            {no_rank}");
549    }
550    println!(
551        "\nnote: native picks are observed *after* ski injects, so \"agreed\" doesn't prove ski\n\
552         caused it. The clean edge signal is NEAR-MISS/buried — the model found a skill itself\n\
553         that ski ranked but gated out."
554    );
555
556    // Edge candidates: native picked it, ski ranked but abstained. Best rank first.
557    let mut edges: Vec<(usize, f32, &CompareRow)> = rows
558        .iter()
559        .filter_map(|r| match r.verdict {
560            Verdict::NearMiss { rank, score } | Verdict::Buried { rank, score } => {
561                Some((rank, score, r))
562            }
563            _ => None,
564        })
565        .collect();
566    edges.sort_by(|a, b| a.0.cmp(&b.0).then(a.2.prompt.cmp(&b.2.prompt)));
567    if !edges.is_empty() {
568        println!("\nedge candidates (native picked it, ski abstained):");
569        for (rank, score, r) in &edges {
570            println!(
571                "  {:<60} -> {} via {}  ski #{rank} score {score:.3}  [{}]",
572                truncate(&r.prompt, 60),
573                r.native,
574                r.via,
575                if r.stage.is_empty() { "?" } else { &r.stage },
576            );
577        }
578    }
579
580    let absent_rows: Vec<&CompareRow> = rows
581        .iter()
582        .filter(|r| r.verdict == Verdict::Absent)
583        .collect();
584    if !absent_rows.is_empty() {
585        println!("\nabsent (ski never surfaced — retrieval miss):");
586        for r in &absent_rows {
587            println!(
588                "  {:<60} -> {} via {}",
589                truncate(&r.prompt, 60),
590                r.native,
591                r.via
592            );
593        }
594    }
595}
596
597/// `ski clear`: wipe per-session dedup state; with `telemetry`, also the log.
598pub fn clear(telemetry: bool) -> anyhow::Result<()> {
599    let sessions = paths::sessions_dir();
600    let removed = match fs::remove_dir_all(&sessions) {
601        Ok(()) => true,
602        Err(e) if e.kind() == std::io::ErrorKind::NotFound => false,
603        Err(e) => return Err(e.into()),
604    };
605    println!(
606        "{} session state at {}",
607        if removed { "cleared" } else { "no" },
608        sessions.display()
609    );
610    if telemetry {
611        let log = paths::telemetry_path();
612        match fs::remove_file(&log) {
613            Ok(()) => println!("cleared telemetry log at {}", log.display()),
614            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
615            Err(e) => return Err(e.into()),
616        }
617    }
618    Ok(())
619}
620
621fn now_ms() -> u128 {
622    std::time::SystemTime::now()
623        .duration_since(std::time::UNIX_EPOCH)
624        .map(|d| d.as_millis())
625        .unwrap_or(0)
626}
627
628/// Human-readable age of an event timestamp. `?` for missing/future stamps.
629fn ago(ts_ms: u128, now_ms: u128) -> String {
630    if ts_ms == 0 || ts_ms > now_ms {
631        return "?".to_string();
632    }
633    let secs = (now_ms - ts_ms) / 1000;
634    if secs < 60 {
635        format!("{secs}s ago")
636    } else if secs < 3600 {
637        format!("{}m ago", secs / 60)
638    } else if secs < 86_400 {
639        format!("{}h ago", secs / 3600)
640    } else {
641        format!("{}d ago", secs / 86_400)
642    }
643}
644
645/// First 8 chars of a session id — enough to eyeball-group events.
646fn short(session: &str) -> String {
647    session.chars().take(8).collect()
648}
649
650/// One-line, length-capped prompt for the listing (shared with `ski suggest`).
651pub(crate) fn truncate(s: &str, max: usize) -> String {
652    let s = s.replace('\n', " ");
653    if s.chars().count() <= max {
654        s
655    } else {
656        let cut: String = s.chars().take(max).collect();
657        format!("{cut}…")
658    }
659}
660
661fn pct(n: u64, d: u64) -> String {
662    if d == 0 {
663        "0%".to_string()
664    } else {
665        format!("{:.0}%", 100.0 * n as f64 / d as f64)
666    }
667}
668
669/// Print the highest-count skills, descending, capped at 8.
670fn print_top(label: &str, by_skill: &BTreeMap<String, u64>) {
671    if by_skill.is_empty() {
672        return;
673    }
674    let mut rows: Vec<(&String, &u64)> = by_skill.iter().collect();
675    rows.sort_by(|a, b| b.1.cmp(a.1).then(a.0.cmp(b.0)));
676    let shown: Vec<String> = rows
677        .iter()
678        .take(8)
679        .map(|(id, n)| format!("{id} ×{n}"))
680        .collect();
681    println!("{label}: {}", shown.join(", "));
682}
683
684#[cfg(test)]
685mod tests {
686    use super::*;
687
688    const LOG: &str = r#"
689{"kind":"recommend","session":"s1","injected":[{"id":"git-attribution","confidence":0.9},{"id":"pdf","confidence":0.6}]}
690{"kind":"use","session":"s1","skill":"git-attribution","via":"skill"}
691{"kind":"use","session":"s1","skill":"xlsx","via":"read"}
692{"kind":"recommend","session":"s2","injected":[{"id":"pdf","confidence":0.7}]}
693not json, should be skipped
694{"kind":"recommend","session":"s2","injected":[{"id":"pdf","confidence":0.8}]}
695"#;
696
697    #[test]
698    fn aggregate_counts_outcomes() {
699        let s = aggregate(LOG);
700        assert_eq!(s.recommend_events, 3);
701        assert_eq!(s.use_events, 2);
702        assert_eq!(s.sessions, 2);
703        // s1 recommended git-attribution+pdf; s2 recommended pdf (deduped across 2 events).
704        assert_eq!(s.recommended, 3);
705        // git-attribution used in s1.
706        assert_eq!(s.used_after_rec, 1);
707        // pdf in s1 (unused), pdf in s2 (unused).
708        assert_eq!(s.false_positives, 2);
709        assert_eq!(s.fp_by_skill.get("pdf"), Some(&2));
710        // xlsx used in s1 but never recommended -> recall miss.
711        assert_eq!(s.recall_misses, 1);
712        assert_eq!(s.miss_by_skill.get("xlsx"), Some(&1));
713    }
714
715    #[test]
716    fn empty_log_is_zero() {
717        assert_eq!(aggregate(""), Stats::default());
718    }
719
720    #[test]
721    fn pct_guards_zero() {
722        assert_eq!(pct(0, 0), "0%");
723        assert_eq!(pct(1, 2), "50%");
724    }
725
726    const DETAIL_LOG: &str = r#"
727{"ts":1000,"kind":"recommend","session":"sess-abcdef-1","stage":"cosine","prompt":"make a pdf","candidates":[{"id":"pdf","confidence":0.8},{"id":"docx","confidence":0.4}],"injected":[{"id":"pdf","confidence":0.8}]}
728{"ts":2000,"kind":"use","session":"sess-abcdef-1","skill":"pdf","via":"skill"}
729{"ts":3000,"kind":"recommend","session":"other-2","stage":"rerank","prompt":"line1\nline2","candidates":[{"id":"xlsx","confidence":0.5}],"injected":[{"id":"xlsx","confidence":0.5}]}
730garbage
731"#;
732
733    #[test]
734    fn recommend_events_parses_detail() {
735        let evs = recommend_events(DETAIL_LOG);
736        assert_eq!(evs.len(), 2);
737        let first = &evs[0];
738        assert_eq!(first.ts, 1000);
739        assert_eq!(first.session, "sess-abcdef-1");
740        assert_eq!(first.stage, "cosine");
741        assert_eq!(first.prompt, "make a pdf");
742        assert_eq!(first.candidates.len(), 2);
743        assert_eq!(
744            first.injected,
745            vec![Cand {
746                id: "pdf".into(),
747                confidence: 0.8
748            }]
749        );
750    }
751
752    #[test]
753    fn used_by_session_collects_use_events() {
754        let used = used_by_session(DETAIL_LOG);
755        assert!(used.get("sess-abcdef-1").unwrap().contains("pdf"));
756        assert!(!used.contains_key("other-2"));
757    }
758
759    #[test]
760    fn use_events_parse_prompt_when_present() {
761        let log = r#"
762{"ts":5000,"kind":"use","session":"s1","skill":"xlsx","via":"read","prompt":"clean this csv"}
763{"ts":6000,"kind":"use","session":"s1","skill":"pdf","via":"skill"}
764"#;
765        let evs = use_events(log);
766        assert_eq!(evs.len(), 2);
767        assert_eq!(evs[0].skill, "xlsx");
768        assert_eq!(evs[0].via, "read");
769        assert_eq!(evs[0].prompt, "clean this csv");
770        assert_eq!(evs[1].prompt, ""); // no prompt field -> empty
771    }
772
773    #[test]
774    fn recommended_by_session_uses_injected_ids() {
775        let recd = recommended_by_session(DETAIL_LOG);
776        assert!(recd.get("sess-abcdef-1").unwrap().contains("pdf"));
777        // docx cleared the gate but only pdf was injected in this fixture.
778        assert!(!recd.get("sess-abcdef-1").unwrap().contains("docx"));
779        assert!(recd.get("other-2").unwrap().contains("xlsx"));
780    }
781
782    #[test]
783    fn ago_buckets() {
784        assert_eq!(ago(0, 10_000), "?");
785        assert_eq!(ago(20_000, 10_000), "?"); // future
786        assert_eq!(ago(9_000, 10_000), "1s ago");
787        assert_eq!(ago(0, 120_000 + 1), "?");
788        assert_eq!(ago(1_000, 121_000), "2m ago");
789        assert_eq!(ago(1_000, 7_201_000), "2h ago");
790        assert_eq!(ago(1_000, 172_801_000), "2d ago");
791    }
792
793    #[test]
794    fn truncate_caps_and_flattens() {
795        assert_eq!(truncate("a\nb", 10), "a b");
796        assert_eq!(truncate("abcdef", 3), "abc…");
797        assert_eq!(short("sess-abcdef-1"), "sess-abc");
798    }
799
800    #[test]
801    fn recommend_events_parse_considered_and_abstained() {
802        let log = r#"
803{"ts":1,"kind":"recommend","session":"s","stage":"rerank","prompt":"p","considered":[{"id":"xlsx","score":-1.96},{"id":"pdf","score":-2.1}],"candidates":[],"injected":[],"abstained":"below_gate"}
804"#;
805        let evs = recommend_events(log);
806        assert_eq!(evs.len(), 1);
807        assert_eq!(
808            evs[0].considered,
809            vec![
810                Ranked {
811                    id: "xlsx".into(),
812                    score: -1.96
813                },
814                Ranked {
815                    id: "pdf".into(),
816                    score: -2.1
817                },
818            ]
819        );
820        assert_eq!(evs[0].abstained.as_deref(), Some("below_gate"));
821        // A legacy event without the new fields parses to empty/None, not an error.
822        let legacy =
823            r#"{"kind":"recommend","session":"s","injected":[{"id":"pdf","confidence":0.7}]}"#;
824        let ev = &recommend_events(legacy)[0];
825        assert!(ev.considered.is_empty());
826        assert_eq!(ev.abstained, None);
827    }
828
829    // One recommend + one use per session, exercising each verdict.
830    const COMPARE_LOG: &str = r#"
831{"ts":1000,"kind":"recommend","session":"s1","stage":"rerank","prompt":"make a chart","considered":[{"id":"xlsx","score":-1.9},{"id":"pdf","score":-2.1}],"candidates":[],"injected":[],"abstained":"below_gate"}
832{"ts":1100,"kind":"use","session":"s1","skill":"xlsx","via":"skill","prompt":"make a chart"}
833{"ts":2000,"kind":"recommend","session":"s2","stage":"cosine","prompt":"set up python","considered":[{"id":"uv-setup","score":0.7}],"candidates":[{"id":"uv-setup","confidence":0.7}],"injected":[{"id":"uv-setup","confidence":0.7}]}
834{"ts":2100,"kind":"use","session":"s2","skill":"uv-setup","via":"skill","prompt":"set up python"}
835{"ts":3000,"kind":"recommend","session":"s3","stage":"rerank","prompt":"deep","considered":[{"id":"a","score":0.1},{"id":"b","score":0.1},{"id":"c","score":0.1},{"id":"d","score":0.1},{"id":"gold","score":0.0}],"candidates":[],"injected":[],"abstained":"below_gate"}
836{"ts":3100,"kind":"use","session":"s3","skill":"gold","via":"read","prompt":"deep"}
837{"ts":4000,"kind":"recommend","session":"s4","stage":"cosine","prompt":"weird","considered":[{"id":"x","score":0.2}],"candidates":[],"injected":[],"abstained":"below_gate"}
838{"ts":4100,"kind":"use","session":"s4","skill":"notranked","via":"skill","prompt":"weird"}
839{"ts":5100,"kind":"use","session":"s5","skill":"orphan","via":"skill","prompt":"no rec here"}
840"#;
841
842    #[test]
843    fn compare_classifies_each_verdict() {
844        let rows = compare(COMPARE_LOG);
845        let by: std::collections::HashMap<&str, &Verdict> = rows
846            .iter()
847            .map(|r| (r.native.as_str(), &r.verdict))
848            .collect();
849        assert_eq!(
850            by["xlsx"],
851            &Verdict::NearMiss {
852                rank: 1,
853                score: -1.9
854            }
855        );
856        assert_eq!(by["uv-setup"], &Verdict::Agreed);
857        assert_eq!(
858            by["gold"],
859            &Verdict::Buried {
860                rank: 5,
861                score: 0.0
862            }
863        );
864        assert_eq!(by["notranked"], &Verdict::Absent);
865        // A use with no matching recommend (or no prompt) can't be placed.
866        assert_eq!(by["orphan"], &Verdict::NoRanking);
867    }
868
869    #[test]
870    fn classify_near_miss_rank_boundary() {
871        // rank == NEAR_MISS_RANK is still a near-miss; one deeper is buried.
872        let log = r#"
873{"ts":1,"kind":"recommend","session":"s","stage":"rerank","prompt":"p","considered":[{"id":"a","score":0.3},{"id":"b","score":0.2},{"id":"gold","score":0.1},{"id":"d","score":0.0}],"candidates":[],"injected":[],"abstained":"below_gate"}
874"#;
875        let ev = &recommend_events(log)[0];
876        assert_eq!(
877            classify(ev, "gold"),
878            Verdict::NearMiss {
879                rank: 3,
880                score: 0.1
881            }
882        );
883        assert_eq!(
884            classify(ev, "d"),
885            Verdict::Buried {
886                rank: 4,
887                score: 0.0
888            }
889        );
890    }
891
892    #[test]
893    fn match_recommend_prefers_latest_at_or_before_use() {
894        // Same prompt ranked twice; the use should bind to the one active at its time.
895        let log = r#"
896{"ts":1000,"kind":"recommend","session":"s","stage":"cosine","prompt":"p","considered":[{"id":"a","score":0.5}],"candidates":[],"injected":[],"abstained":"below_gate"}
897{"ts":3000,"kind":"recommend","session":"s","stage":"cosine","prompt":"p","considered":[{"id":"a","score":0.9}],"candidates":[],"injected":[],"abstained":"below_gate"}
898"#;
899        let recs = recommend_events(log);
900        let u = UseEvent {
901            ts: 2000,
902            session: "s".into(),
903            skill: "a".into(),
904            via: "skill".into(),
905            prompt: "p".into(),
906        };
907        // ts 2000 binds to the 1000 event (latest at/before), not the later 3000 one.
908        assert_eq!(match_recommend(&recs, &u).unwrap().ts, 1000);
909    }
910}
ski/history.rs

ski/
history.rs