Skip to main content

kaizen/experiment/
engine.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2//! Turn an `Experiment` + sessions into a report. Pure compute given inputs.
3
4use crate::core::event::{Event, SessionRecord};
5use crate::experiment::binding::{ManualTags, partition};
6use crate::experiment::metric::value_for;
7use crate::experiment::stats::{DEFAULT_RESAMPLES, Summary, summarize};
8use crate::experiment::types::{Classification, Criterion, Direction, Experiment};
9use serde::{Deserialize, Serialize};
10use std::path::Path;
11
/// Outcome of evaluating one experiment, as produced by [`run`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Report {
    /// Copy of the experiment definition the report was computed for.
    pub experiment: Experiment,
    /// Output of `summarize` over the control and treatment metric values.
    pub summary: Summary,
    /// Number of input sessions that contributed no metric value to either arm.
    pub excluded_count: usize,
    /// Whether the experiment's success criterion was satisfied; `None` when
    /// the summary value the criterion needs is absent.
    pub target_met: Option<bool>,
}
19
20/// Pure ranking step once `sessions` + per-session `events` gathered.
21pub fn run(
22    exp: &Experiment,
23    sessions: &[(SessionRecord, Vec<Event>)],
24    manual_tags: &ManualTags,
25    workspace: &Path,
26) -> Report {
27    let records: Vec<SessionRecord> = sessions.iter().map(|(s, _)| s.clone()).collect();
28    let (control_s, treatment_s, excluded_s) =
29        partition(&records, &exp.binding, manual_tags, workspace);
30    let control = metric_values(
31        exp,
32        sessions,
33        &control_s,
34        Classification::Control,
35        manual_tags,
36    );
37    let treatment = metric_values(
38        exp,
39        sessions,
40        &treatment_s,
41        Classification::Treatment,
42        manual_tags,
43    );
44    let _ = excluded_s;
45    let excluded = records.len() - control.len() - treatment.len();
46    let summary = summarize(
47        &control,
48        &treatment,
49        stable_seed(&exp.id),
50        DEFAULT_RESAMPLES,
51    );
52    let target_met = evaluate_criterion(&exp.success_criterion, &summary);
53    Report {
54        experiment: exp.clone(),
55        summary,
56        excluded_count: excluded,
57        target_met,
58    }
59}
60
61fn metric_values(
62    exp: &Experiment,
63    sessions: &[(SessionRecord, Vec<Event>)],
64    picked: &[&SessionRecord],
65    _which: Classification,
66    _tags: &ManualTags,
67) -> Vec<f64> {
68    let ids: std::collections::HashSet<&str> = picked.iter().map(|s| s.id.as_str()).collect();
69    sessions
70        .iter()
71        .filter(|(s, _)| ids.contains(s.id.as_str()))
72        .filter_map(|(s, evs)| value_for(exp.metric, s, evs))
73        .collect()
74}
75
76fn evaluate_criterion(c: &Criterion, s: &Summary) -> Option<bool> {
77    match c {
78        Criterion::Delta {
79            direction,
80            target_pct,
81        } => {
82            let pct = s.delta_pct?;
83            Some(match direction {
84                Direction::Decrease => pct <= *target_pct,
85                Direction::Increase => pct >= *target_pct,
86            })
87        }
88        Criterion::Absolute { metric_value } => {
89            let m = s.median_treatment?;
90            Some(m <= *metric_value)
91        }
92    }
93}
94
/// Derive a deterministic 64-bit seed from an experiment id.
///
/// Each byte of `id` is xor-ed into the accumulator, which is then multiplied
/// (wrapping) by the 64-bit FNV prime. The empty string yields the start value.
///
/// NOTE(review): the start value is not the canonical FNV-1a 64-bit offset
/// basis (14695981039346656037), so results will not match reference FNV-1a.
/// It is kept as-is because changing it would change every derived seed.
fn stable_seed(id: &str) -> u64 {
    const START: u64 = 1469598103934665603;
    const PRIME: u64 = 1099511628211;
    id.bytes()
        .fold(START, |acc, byte| (acc ^ u64::from(byte)).wrapping_mul(PRIME))
}
103
104/// Human-readable markdown per `docs/experiments.md`.
105pub fn to_markdown(report: &Report) -> String {
106    let e = &report.experiment;
107    let s = &report.summary;
108    let mut out = String::new();
109    out.push_str(&format!("# Experiment: {}\n\n", e.name));
110    out.push_str(&format!(
111        "State: {:?} · Duration: {}d\nHypothesis: {}\nChange: {}\n\n",
112        e.state, e.duration_days, e.hypothesis, e.change_description
113    ));
114    let (ctl_label, trt_label) = match &e.binding {
115        crate::experiment::types::Binding::GitCommit {
116            control_commit,
117            treatment_commit,
118        } => (short(control_commit), short(treatment_commit)),
119        crate::experiment::types::Binding::Branch {
120            control_branch,
121            treatment_branch,
122        } => (control_branch.clone(), treatment_branch.clone()),
123        crate::experiment::types::Binding::ManualTag { variant_field } => {
124            (format!("manual:{}", variant_field), "manual".into())
125        }
126    };
127    out.push_str(&format!(
128        "Binding: control {} · treatment {}\nMetric: {}\n\n",
129        ctl_label,
130        trt_label,
131        e.metric.as_str()
132    ));
133    out.push_str("|          | N  | median | mean |\n|---|---|---|---|\n");
134    out.push_str(&format!(
135        "| control  | {} | {} | {} |\n",
136        s.n_control,
137        fmt_opt(s.median_control),
138        fmt_opt(s.mean_control),
139    ));
140    out.push_str(&format!(
141        "| treatment| {} | {} | {} |\n\n",
142        s.n_treatment,
143        fmt_opt(s.median_treatment),
144        fmt_opt(s.mean_treatment),
145    ));
146    if let Some(d) = s.delta_median {
147        out.push_str(&format!(
148            "Delta (median): {:+.2}{}\n",
149            d,
150            s.delta_pct
151                .map(|p| format!(" ({:+.1}%)", p))
152                .unwrap_or_default(),
153        ));
154    }
155    if let (Some(lo), Some(hi)) = (s.ci95_lo, s.ci95_hi) {
156        out.push_str(&format!(
157            "95% bootstrap CI on delta: [{:+.2}, {:+.2}]\n",
158            lo, hi
159        ));
160    }
161    if let Some(met) = report.target_met {
162        out.push_str(&format!(
163            "Target: {}\n",
164            if met { "MET" } else { "not met" }
165        ));
166    }
167    out.push_str(&format!("\nExcluded: {} sessions\n", report.excluded_count));
168    if s.small_sample_warning {
169        out.push_str("Warning: N per arm < 30 — CI may be unreliable.\n");
170    }
171    out
172}
173
/// Format an optional value with two decimal places, or an em dash (`—`)
/// when there is no value.
fn fmt_opt(v: Option<f64>) -> String {
    match v {
        Some(value) => format!("{:.2}", value),
        None => String::from("—"),
    }
}
177
/// Abbreviate a commit identifier to its first seven characters.
///
/// Strings with seven or fewer characters are returned unchanged. The cut is
/// made on a character boundary, never inside a multi-byte character.
fn short(commit: &str) -> String {
    // Byte offset where the eighth character starts, if there is one.
    match commit.char_indices().nth(7) {
        Some((cut, _)) => commit[..cut].to_owned(),
        None => commit.to_owned(),
    }
}
181
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::event::SessionStatus;
    use crate::experiment::types::{Binding, Criterion, Direction, Metric, State};

    /// Running experiment bound to two git commits, measuring tokens per
    /// session with a "decrease by 10%" delta criterion.
    fn exp() -> Experiment {
        let binding = Binding::GitCommit {
            control_commit: "c".into(),
            treatment_commit: "t".into(),
        };
        let success_criterion = Criterion::Delta {
            direction: Direction::Decrease,
            target_pct: -10.0,
        };
        Experiment {
            id: "e".into(),
            name: "e".into(),
            hypothesis: "h".into(),
            change_description: "c".into(),
            metric: Metric::TokensPerSession,
            binding,
            duration_days: 14,
            success_criterion,
            state: State::Running,
            created_at_ms: 0,
            concluded_at_ms: None,
        }
    }

    /// Session `id` with no git metadata and a single tool-call event that
    /// carries `tokens` input tokens.
    fn session_with(id: &str, tokens: u32) -> (SessionRecord, Vec<Event>) {
        let record = SessionRecord {
            id: id.into(),
            agent: "cursor".into(),
            model: None,
            workspace: "/ws".into(),
            started_at_ms: 0,
            ended_at_ms: None,
            status: SessionStatus::Done,
            trace_path: String::new(),
            start_commit: None,
            end_commit: None,
            branch: None,
            dirty_start: None,
            dirty_end: None,
            repo_binding_source: None,
        };
        let event = Event {
            session_id: id.into(),
            seq: 0,
            ts_ms: 0,
            ts_exact: false,
            kind: crate::core::event::EventKind::ToolCall,
            source: crate::core::event::EventSource::Tail,
            tool: None,
            tool_call_id: None,
            tokens_in: Some(tokens),
            tokens_out: None,
            reasoning_tokens: None,
            cost_usd_e6: None,
            payload: serde_json::Value::Null,
        };
        (record, vec![event])
    }

    #[test]
    fn manual_tags_drive_partition_without_git() {
        let experiment = exp();
        let sessions = vec![
            session_with("a", 100),
            session_with("b", 80),
            session_with("c", 200),
            session_with("d", 70),
        ];

        // Two sessions per arm, assigned purely through manual tags.
        let mut tags = ManualTags::new();
        for (id, arm) in [
            ("a", Classification::Control),
            ("b", Classification::Control),
            ("c", Classification::Treatment),
            ("d", Classification::Treatment),
        ] {
            tags.insert(id.into(), arm);
        }

        let report = run(&experiment, &sessions, &tags, Path::new("/no"));
        assert_eq!(report.summary.n_control, 2);
        assert_eq!(report.summary.n_treatment, 2);
    }
}