Skip to main content

algocline_app/service/
eval.rs

1use super::eval_store::{
2    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3    save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
9impl AppService {
10    /// Run an evalframe evaluation suite.
11    ///
12    /// Accepts a scenario (bindings + cases) and a strategy name.
13    /// Automatically wires the strategy as the provider and executes
14    /// the evalframe suite, returning the report (summary, scores, failures).
15    ///
16    /// Injects a `std` global (mlua-batteries compatible shim) so evalframe's
17    /// `std.lua` can resolve json/fs/time from algocline's built-in primitives.
18    ///
19    /// # Security: `strategy` is not sanitized
20    ///
21    /// `strategy` is interpolated into a Lua string literal without escaping.
22    /// This is intentional — same rationale as [`make_require_code`]:
23    /// algocline runs Lua in the caller's own process with full ambient
24    /// authority, so Lua injection does not cross a trust boundary.
25    pub async fn eval(
26        &self,
27        scenario: Option<String>,
28        scenario_file: Option<String>,
29        scenario_name: Option<String>,
30        strategy: &str,
31        strategy_opts: Option<serde_json::Value>,
32    ) -> Result<String, String> {
33        // Auto-install bundled packages if evalframe is missing
34        if !is_package_installed("evalframe") {
35            self.auto_install_bundled_packages().await?;
36            if !is_package_installed("evalframe") {
37                return Err(
38                    "Package 'evalframe' not found after installing bundled collection. \
39                     Use alc_pkg_install to install it manually."
40                        .into(),
41                );
42            }
43        }
44
45        let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name)?;
46
47        // Build strategy opts Lua table literal
48        let opts_lua = match &strategy_opts {
49            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
50            _ => "{}".to_string(),
51        };
52
53        // Inject `std` global as a mlua-batteries compatible shim.
54        //
55        // evalframe.std expects the host to provide a `std` global with:
56        //   std.json.decode/encode  — JSON serialization
57        //   std.fs.read/is_file     — filesystem access
58        //   std.time.now            — wall-clock time (epoch seconds, f64)
59        //
60        // We bridge these from algocline's alc.* primitives and Lua's io stdlib.
61        let wrapped = format!(
62            r#"
63std = {{
64  json = {{
65    decode = alc.json_decode,
66    encode = alc.json_encode,
67  }},
68  fs = {{
69    read = function(path)
70      local f, err = io.open(path, "r")
71      if not f then error("std.fs.read: " .. (err or path), 2) end
72      local content = f:read("*a")
73      f:close()
74      return content
75    end,
76    is_file = function(path)
77      local f = io.open(path, "r")
78      if f then f:close(); return true end
79      return false
80    end,
81  }},
82  time = {{
83    now = alc.time,
84  }},
85}}
86
87local ef = require("evalframe")
88
89-- Load scenario (bindings + cases, no provider)
90local spec = (function()
91{scenario_code}
92end)()
93
94-- Inject strategy as provider
95spec.provider = ef.providers.algocline {{
96  strategy = "{strategy}",
97  opts = {opts_lua},
98}}
99
100-- Build and run suite
101local s = ef.suite "eval" (spec)
102local report = s:run()
103return report:to_table()
104"#
105        );
106
107        let ctx = serde_json::Value::Null;
108        let result = self
109            .start_and_tick(wrapped, ctx, Some(strategy), vec![])
110            .await?;
111
112        // Register this session for eval result saving on completion.
113        // start_and_tick returns the first pause (needs_response) or completed.
114        // If completed immediately, save now. Otherwise, save when continue_* finishes.
115        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
116            match parsed.get("status").and_then(|s| s.as_str()) {
117                Some("completed") => {
118                    save_eval_result(strategy, &result);
119                }
120                Some("needs_response") => {
121                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
122                        if let Ok(mut map) = self.eval_sessions.lock() {
123                            map.insert(sid.to_string(), strategy.to_string());
124                        }
125                    }
126                }
127                _ => {}
128            }
129        }
130
131        Ok(result)
132    }
133
134    /// List eval history, optionally filtered by strategy.
135    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
136        let dir = evals_dir()?;
137        list_eval_history(&dir, strategy, limit)
138    }
139
140    /// View a specific eval result by ID.
141    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
142        let evals_dir = evals_dir()?;
143        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
144            .map_err(|e| format!("Invalid eval_id: {e}"))?;
145        if !path.exists() {
146            return Err(format!("Eval result not found: {eval_id}"));
147        }
148        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
149    }
150
151    /// Compare two eval results with statistical significance testing.
152    ///
153    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
154    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
155    /// from each eval result — no re-computation of descriptive statistics.
156    ///
157    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
158    /// lookups of the same pair are file reads only.
159    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
160        // Check for cached comparison
161        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
162        if let Ok(dir) = evals_dir() {
163            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
164                if cached_path.exists() {
165                    return std::fs::read_to_string(&*cached_path)
166                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
167                }
168            }
169        }
170
171        // Auto-install bundled packages if evalframe is missing
172        if !is_package_installed("evalframe") {
173            self.auto_install_bundled_packages().await?;
174            if !is_package_installed("evalframe") {
175                return Err(
176                    "Package 'evalframe' not found after installing bundled collection. \
177                     Use alc_pkg_install to install it manually."
178                        .into(),
179                );
180            }
181        }
182
183        let result_a = self.eval_detail(eval_id_a)?;
184        let result_b = self.eval_detail(eval_id_b)?;
185
186        // Build Lua snippet that uses evalframe's stats module
187        // to compute welch_t from the persisted aggregated scores.
188        let lua_code = format!(
189            r#"
190std = {{
191  json = {{
192    decode = alc.json_decode,
193    encode = alc.json_encode,
194  }},
195  fs = {{ read = function() end, is_file = function() return false end }},
196  time = {{ now = alc.time }},
197}}
198
199local stats = require("evalframe.eval.stats")
200
201local result_a = alc.json_decode('{result_a_escaped}')
202local result_b = alc.json_decode('{result_b_escaped}')
203
204local agg_a = result_a.result and result_a.result.aggregated
205local agg_b = result_b.result and result_b.result.aggregated
206
207if not agg_a or not agg_a.scores then
208  error("No aggregated scores in {eval_id_a}")
209end
210if not agg_b or not agg_b.scores then
211  error("No aggregated scores in {eval_id_b}")
212end
213
214local welch = stats.welch_t(agg_a.scores, agg_b.scores)
215
216local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
217local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
218
219local delta = agg_a.scores.mean - agg_b.scores.mean
220local winner = "none"
221if welch.significant then
222  winner = delta > 0 and "a" or "b"
223end
224
225-- Build summary text
226local parts = {{}}
227if welch.significant then
228  local w, l, d = strategy_a, strategy_b, delta
229  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
230  parts[#parts + 1] = string.format(
231    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
232    w, l, d, math.abs(welch.t_stat), welch.df
233  )
234else
235  parts[#parts + 1] = string.format(
236    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
237    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
238  )
239end
240if agg_a.pass_rate and agg_b.pass_rate then
241  local dp = agg_a.pass_rate - agg_b.pass_rate
242  if math.abs(dp) > 1e-9 then
243    local h = dp > 0 and strategy_a or strategy_b
244    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
245  else
246    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
247  end
248end
249
250return {{
251  a = {{
252    eval_id = "{eval_id_a}",
253    strategy = strategy_a,
254    scores = agg_a.scores,
255    pass_rate = agg_a.pass_rate,
256    pass_at_1 = agg_a.pass_at_1,
257    ci_95 = agg_a.ci_95,
258  }},
259  b = {{
260    eval_id = "{eval_id_b}",
261    strategy = strategy_b,
262    scores = agg_b.scores,
263    pass_rate = agg_b.pass_rate,
264    pass_at_1 = agg_b.pass_at_1,
265    ci_95 = agg_b.ci_95,
266  }},
267  comparison = {{
268    delta_mean = delta,
269    welch_t = {{
270      t_stat = welch.t_stat,
271      df = welch.df,
272      significant = welch.significant,
273      direction = welch.direction,
274    }},
275    winner = winner,
276    summary = table.concat(parts, " "),
277  }},
278}}
279"#,
280            result_a_escaped = escape_for_lua_sq(&result_a),
281            result_b_escaped = escape_for_lua_sq(&result_b),
282            eval_id_a = eval_id_a,
283            eval_id_b = eval_id_b,
284            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
285            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
286        );
287
288        let ctx = serde_json::Value::Null;
289        let raw_result = self.start_and_tick(lua_code, ctx, None, vec![]).await?;
290
291        // Persist comparison result
292        save_compare_result(eval_id_a, eval_id_b, &raw_result);
293
294        Ok(raw_result)
295    }
296}