Skip to main content

algocline_app/service/
eval.rs

1use super::eval_store::{
2    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3    save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
9impl AppService {
10    /// Run an evalframe evaluation suite.
11    ///
12    /// Accepts a scenario (bindings + cases) and a strategy name.
13    /// Automatically wires the strategy as the provider and executes
14    /// the evalframe suite, returning the report (summary, scores, failures).
15    ///
16    /// Injects a `std` global (mlua-batteries compatible shim) so evalframe's
17    /// `std.lua` can resolve json/fs/time from algocline's built-in primitives.
18    ///
19    /// # Security: `strategy` is not sanitized
20    ///
21    /// `strategy` is interpolated into a Lua string literal without escaping.
22    /// This is intentional — same rationale as [`make_require_code`]:
23    /// algocline runs Lua in the caller's own process with full ambient
24    /// authority, so Lua injection does not cross a trust boundary.
25    pub async fn eval(
26        &self,
27        scenario: Option<String>,
28        scenario_file: Option<String>,
29        scenario_name: Option<String>,
30        strategy: &str,
31        strategy_opts: Option<serde_json::Value>,
32    ) -> Result<String, String> {
33        // Auto-install bundled packages if evalframe is missing
34        if !is_package_installed("evalframe") {
35            self.auto_install_bundled_packages().await?;
36            if !is_package_installed("evalframe") {
37                return Err(
38                    "Package 'evalframe' not found after installing bundled collection. \
39                     Use alc_pkg_install to install it manually."
40                        .into(),
41                );
42            }
43        }
44
45        let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name)?;
46
47        // Build strategy opts Lua table literal
48        let opts_lua = match &strategy_opts {
49            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
50            _ => "{}".to_string(),
51        };
52
53        // Inject `std` global as a mlua-batteries compatible shim.
54        //
55        // evalframe.std expects the host to provide a `std` global with:
56        //   std.json.decode/encode  — JSON serialization
57        //   std.fs.read/is_file     — filesystem access
58        //   std.time.now            — wall-clock time (epoch seconds, f64)
59        //
60        // We bridge these from algocline's alc.* primitives and Lua's io stdlib.
61        let wrapped = format!(
62            r#"
63std = {{
64  json = {{
65    decode = alc.json_decode,
66    encode = alc.json_encode,
67  }},
68  fs = {{
69    read = function(path)
70      local f, err = io.open(path, "r")
71      if not f then error("std.fs.read: " .. (err or path), 2) end
72      local content = f:read("*a")
73      f:close()
74      return content
75    end,
76    is_file = function(path)
77      local f = io.open(path, "r")
78      if f then f:close(); return true end
79      return false
80    end,
81  }},
82  time = {{
83    now = alc.time,
84  }},
85}}
86
87local ef = require("evalframe")
88
89-- Load scenario (bindings + cases, no provider)
90local spec = (function()
91{scenario_code}
92end)()
93
94-- Inject strategy as provider
95spec.provider = ef.providers.algocline {{
96  strategy = "{strategy}",
97  opts = {opts_lua},
98}}
99
100-- Build and run suite
101local s = ef.suite "eval" (spec)
102local report = s:run()
103return report:to_table()
104"#
105        );
106
107        let ctx = serde_json::Value::Null;
108        let result = self.start_and_tick(wrapped, ctx, Some(strategy)).await?;
109
110        // Register this session for eval result saving on completion.
111        // start_and_tick returns the first pause (needs_response) or completed.
112        // If completed immediately, save now. Otherwise, save when continue_* finishes.
113        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
114            match parsed.get("status").and_then(|s| s.as_str()) {
115                Some("completed") => {
116                    save_eval_result(strategy, &result);
117                }
118                Some("needs_response") => {
119                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
120                        if let Ok(mut map) = self.eval_sessions.lock() {
121                            map.insert(sid.to_string(), strategy.to_string());
122                        }
123                    }
124                }
125                _ => {}
126            }
127        }
128
129        Ok(result)
130    }
131
132    /// List eval history, optionally filtered by strategy.
133    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
134        let dir = evals_dir()?;
135        list_eval_history(&dir, strategy, limit)
136    }
137
138    /// View a specific eval result by ID.
139    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
140        let evals_dir = evals_dir()?;
141        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
142            .map_err(|e| format!("Invalid eval_id: {e}"))?;
143        if !path.exists() {
144            return Err(format!("Eval result not found: {eval_id}"));
145        }
146        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
147    }
148
149    /// Compare two eval results with statistical significance testing.
150    ///
151    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
152    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
153    /// from each eval result — no re-computation of descriptive statistics.
154    ///
155    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
156    /// lookups of the same pair are file reads only.
157    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
158        // Check for cached comparison
159        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
160        if let Ok(dir) = evals_dir() {
161            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
162                if cached_path.exists() {
163                    return std::fs::read_to_string(&*cached_path)
164                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
165                }
166            }
167        }
168
169        // Auto-install bundled packages if evalframe is missing
170        if !is_package_installed("evalframe") {
171            self.auto_install_bundled_packages().await?;
172            if !is_package_installed("evalframe") {
173                return Err(
174                    "Package 'evalframe' not found after installing bundled collection. \
175                     Use alc_pkg_install to install it manually."
176                        .into(),
177                );
178            }
179        }
180
181        let result_a = self.eval_detail(eval_id_a)?;
182        let result_b = self.eval_detail(eval_id_b)?;
183
184        // Build Lua snippet that uses evalframe's stats module
185        // to compute welch_t from the persisted aggregated scores.
186        let lua_code = format!(
187            r#"
188std = {{
189  json = {{
190    decode = alc.json_decode,
191    encode = alc.json_encode,
192  }},
193  fs = {{ read = function() end, is_file = function() return false end }},
194  time = {{ now = alc.time }},
195}}
196
197local stats = require("evalframe.eval.stats")
198
199local result_a = alc.json_decode('{result_a_escaped}')
200local result_b = alc.json_decode('{result_b_escaped}')
201
202local agg_a = result_a.result and result_a.result.aggregated
203local agg_b = result_b.result and result_b.result.aggregated
204
205if not agg_a or not agg_a.scores then
206  error("No aggregated scores in {eval_id_a}")
207end
208if not agg_b or not agg_b.scores then
209  error("No aggregated scores in {eval_id_b}")
210end
211
212local welch = stats.welch_t(agg_a.scores, agg_b.scores)
213
214local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
215local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
216
217local delta = agg_a.scores.mean - agg_b.scores.mean
218local winner = "none"
219if welch.significant then
220  winner = delta > 0 and "a" or "b"
221end
222
223-- Build summary text
224local parts = {{}}
225if welch.significant then
226  local w, l, d = strategy_a, strategy_b, delta
227  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
228  parts[#parts + 1] = string.format(
229    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
230    w, l, d, math.abs(welch.t_stat), welch.df
231  )
232else
233  parts[#parts + 1] = string.format(
234    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
235    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
236  )
237end
238if agg_a.pass_rate and agg_b.pass_rate then
239  local dp = agg_a.pass_rate - agg_b.pass_rate
240  if math.abs(dp) > 1e-9 then
241    local h = dp > 0 and strategy_a or strategy_b
242    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
243  else
244    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
245  end
246end
247
248return {{
249  a = {{
250    eval_id = "{eval_id_a}",
251    strategy = strategy_a,
252    scores = agg_a.scores,
253    pass_rate = agg_a.pass_rate,
254    pass_at_1 = agg_a.pass_at_1,
255    ci_95 = agg_a.ci_95,
256  }},
257  b = {{
258    eval_id = "{eval_id_b}",
259    strategy = strategy_b,
260    scores = agg_b.scores,
261    pass_rate = agg_b.pass_rate,
262    pass_at_1 = agg_b.pass_at_1,
263    ci_95 = agg_b.ci_95,
264  }},
265  comparison = {{
266    delta_mean = delta,
267    welch_t = {{
268      t_stat = welch.t_stat,
269      df = welch.df,
270      significant = welch.significant,
271      direction = welch.direction,
272    }},
273    winner = winner,
274    summary = table.concat(parts, " "),
275  }},
276}}
277"#,
278            result_a_escaped = escape_for_lua_sq(&result_a),
279            result_b_escaped = escape_for_lua_sq(&result_b),
280            eval_id_a = eval_id_a,
281            eval_id_b = eval_id_b,
282            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
283            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
284        );
285
286        let ctx = serde_json::Value::Null;
287        let raw_result = self.start_and_tick(lua_code, ctx, None).await?;
288
289        // Persist comparison result
290        save_compare_result(eval_id_a, eval_id_b, &raw_result);
291
292        Ok(raw_result)
293    }
294}