Skip to main content

algocline_app/service/
eval.rs

1use super::eval_store::{
2    escape_for_lua_sq, evals_dir, extract_strategy_from_id, save_compare_result, save_eval_result,
3};
4use super::path::ContainedPath;
5use super::resolve::{is_package_installed, resolve_scenario_code};
6use super::AppService;
7
8impl AppService {
9    /// Run an evalframe evaluation suite.
10    ///
11    /// Accepts a scenario (bindings + cases) and a strategy name.
12    /// Automatically wires the strategy as the provider and executes
13    /// the evalframe suite, returning the report (summary, scores, failures).
14    ///
15    /// Injects a `std` global (mlua-batteries compatible shim) so evalframe's
16    /// `std.lua` can resolve json/fs/time from algocline's built-in primitives.
17    ///
18    /// # Security: `strategy` is not sanitized
19    ///
20    /// `strategy` is interpolated into a Lua string literal without escaping.
21    /// This is intentional — same rationale as [`make_require_code`]:
22    /// algocline runs Lua in the caller's own process with full ambient
23    /// authority, so Lua injection does not cross a trust boundary.
24    pub async fn eval(
25        &self,
26        scenario: Option<String>,
27        scenario_file: Option<String>,
28        scenario_name: Option<String>,
29        strategy: &str,
30        strategy_opts: Option<serde_json::Value>,
31    ) -> Result<String, String> {
32        // Auto-install bundled packages if evalframe is missing
33        if !is_package_installed("evalframe") {
34            self.auto_install_bundled_packages().await?;
35            if !is_package_installed("evalframe") {
36                return Err(
37                    "Package 'evalframe' not found after installing bundled collection. \
38                     Use alc_pkg_install to install it manually."
39                        .into(),
40                );
41            }
42        }
43
44        let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name)?;
45
46        // Build strategy opts Lua table literal
47        let opts_lua = match &strategy_opts {
48            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
49            _ => "{}".to_string(),
50        };
51
52        // Inject `std` global as a mlua-batteries compatible shim.
53        //
54        // evalframe.std expects the host to provide a `std` global with:
55        //   std.json.decode/encode  — JSON serialization
56        //   std.fs.read/is_file     — filesystem access
57        //   std.time.now            — wall-clock time (epoch seconds, f64)
58        //
59        // We bridge these from algocline's alc.* primitives and Lua's io stdlib.
60        let wrapped = format!(
61            r#"
62std = {{
63  json = {{
64    decode = alc.json_decode,
65    encode = alc.json_encode,
66  }},
67  fs = {{
68    read = function(path)
69      local f, err = io.open(path, "r")
70      if not f then error("std.fs.read: " .. (err or path), 2) end
71      local content = f:read("*a")
72      f:close()
73      return content
74    end,
75    is_file = function(path)
76      local f = io.open(path, "r")
77      if f then f:close(); return true end
78      return false
79    end,
80  }},
81  time = {{
82    now = alc.time,
83  }},
84}}
85
86local ef = require("evalframe")
87
88-- Load scenario (bindings + cases, no provider)
89local spec = (function()
90{scenario_code}
91end)()
92
93-- Inject strategy as provider
94spec.provider = ef.providers.algocline {{
95  strategy = "{strategy}",
96  opts = {opts_lua},
97}}
98
99-- Build and run suite
100local s = ef.suite "eval" (spec)
101local report = s:run()
102return report:to_table()
103"#
104        );
105
106        let ctx = serde_json::Value::Null;
107        let result = self.start_and_tick(wrapped, ctx, Some(strategy)).await?;
108
109        // Register this session for eval result saving on completion.
110        // start_and_tick returns the first pause (needs_response) or completed.
111        // If completed immediately, save now. Otherwise, save when continue_* finishes.
112        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
113            match parsed.get("status").and_then(|s| s.as_str()) {
114                Some("completed") => {
115                    save_eval_result(strategy, &result);
116                }
117                Some("needs_response") => {
118                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
119                        if let Ok(mut map) = self.eval_sessions.lock() {
120                            map.insert(sid.to_string(), strategy.to_string());
121                        }
122                    }
123                }
124                _ => {}
125            }
126        }
127
128        Ok(result)
129    }
130
131    /// List eval history, optionally filtered by strategy.
132    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
133        let evals_dir = evals_dir()?;
134        if !evals_dir.exists() {
135            return Ok(serde_json::json!({ "evals": [] }).to_string());
136        }
137
138        let mut entries: Vec<serde_json::Value> = Vec::new();
139
140        let read_dir =
141            std::fs::read_dir(&evals_dir).map_err(|e| format!("Failed to read evals dir: {e}"))?;
142
143        for entry in read_dir.flatten() {
144            let path = entry.path();
145            if path.extension().and_then(|e| e.to_str()) != Some("json") {
146                continue;
147            }
148            // Skip meta files
149            if path
150                .file_name()
151                .and_then(|n| n.to_str())
152                .is_some_and(|n| n.contains(".meta."))
153            {
154                continue;
155            }
156
157            // Read meta file (lightweight) if it exists.
158            // Derive meta filename from the result filename to stay within evals_dir
159            // (ContainedPath ensures no traversal).
160            let stem = match path.file_stem().and_then(|s| s.to_str()) {
161                Some(s) => s,
162                None => continue,
163            };
164            let meta_path = match ContainedPath::child(&evals_dir, &format!("{stem}.meta.json")) {
165                Ok(p) => p,
166                Err(_) => continue,
167            };
168            let meta = if meta_path.exists() {
169                std::fs::read_to_string(&*meta_path)
170                    .ok()
171                    .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
172            } else {
173                None
174            };
175
176            if let Some(meta) = meta {
177                // Filter by strategy if specified
178                if let Some(filter) = strategy {
179                    if meta.get("strategy").and_then(|s| s.as_str()) != Some(filter) {
180                        continue;
181                    }
182                }
183                entries.push(meta);
184            }
185        }
186
187        // Sort by timestamp descending (newest first)
188        entries.sort_by(|a, b| {
189            let ts_a = a
190                .get("timestamp")
191                .and_then(serde_json::Value::as_u64)
192                .unwrap_or(0);
193            let ts_b = b
194                .get("timestamp")
195                .and_then(serde_json::Value::as_u64)
196                .unwrap_or(0);
197            ts_b.cmp(&ts_a)
198        });
199        entries.truncate(limit);
200
201        Ok(serde_json::json!({ "evals": entries }).to_string())
202    }
203
204    /// View a specific eval result by ID.
205    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
206        let evals_dir = evals_dir()?;
207        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
208            .map_err(|e| format!("Invalid eval_id: {e}"))?;
209        if !path.exists() {
210            return Err(format!("Eval result not found: {eval_id}"));
211        }
212        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
213    }
214
215    /// Compare two eval results with statistical significance testing.
216    ///
217    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
218    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
219    /// from each eval result — no re-computation of descriptive statistics.
220    ///
221    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
222    /// lookups of the same pair are file reads only.
223    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
224        // Check for cached comparison
225        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
226        if let Ok(dir) = evals_dir() {
227            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
228                if cached_path.exists() {
229                    return std::fs::read_to_string(&*cached_path)
230                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
231                }
232            }
233        }
234
235        // Auto-install bundled packages if evalframe is missing
236        if !is_package_installed("evalframe") {
237            self.auto_install_bundled_packages().await?;
238            if !is_package_installed("evalframe") {
239                return Err(
240                    "Package 'evalframe' not found after installing bundled collection. \
241                     Use alc_pkg_install to install it manually."
242                        .into(),
243                );
244            }
245        }
246
247        let result_a = self.eval_detail(eval_id_a)?;
248        let result_b = self.eval_detail(eval_id_b)?;
249
250        // Build Lua snippet that uses evalframe's stats module
251        // to compute welch_t from the persisted aggregated scores.
252        let lua_code = format!(
253            r#"
254std = {{
255  json = {{
256    decode = alc.json_decode,
257    encode = alc.json_encode,
258  }},
259  fs = {{ read = function() end, is_file = function() return false end }},
260  time = {{ now = alc.time }},
261}}
262
263local stats = require("evalframe.eval.stats")
264
265local result_a = alc.json_decode('{result_a_escaped}')
266local result_b = alc.json_decode('{result_b_escaped}')
267
268local agg_a = result_a.result and result_a.result.aggregated
269local agg_b = result_b.result and result_b.result.aggregated
270
271if not agg_a or not agg_a.scores then
272  error("No aggregated scores in {eval_id_a}")
273end
274if not agg_b or not agg_b.scores then
275  error("No aggregated scores in {eval_id_b}")
276end
277
278local welch = stats.welch_t(agg_a.scores, agg_b.scores)
279
280local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
281local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
282
283local delta = agg_a.scores.mean - agg_b.scores.mean
284local winner = "none"
285if welch.significant then
286  winner = delta > 0 and "a" or "b"
287end
288
289-- Build summary text
290local parts = {{}}
291if welch.significant then
292  local w, l, d = strategy_a, strategy_b, delta
293  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
294  parts[#parts + 1] = string.format(
295    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
296    w, l, d, math.abs(welch.t_stat), welch.df
297  )
298else
299  parts[#parts + 1] = string.format(
300    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
301    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
302  )
303end
304if agg_a.pass_rate and agg_b.pass_rate then
305  local dp = agg_a.pass_rate - agg_b.pass_rate
306  if math.abs(dp) > 1e-9 then
307    local h = dp > 0 and strategy_a or strategy_b
308    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
309  else
310    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
311  end
312end
313
314return {{
315  a = {{
316    eval_id = "{eval_id_a}",
317    strategy = strategy_a,
318    scores = agg_a.scores,
319    pass_rate = agg_a.pass_rate,
320    pass_at_1 = agg_a.pass_at_1,
321    ci_95 = agg_a.ci_95,
322  }},
323  b = {{
324    eval_id = "{eval_id_b}",
325    strategy = strategy_b,
326    scores = agg_b.scores,
327    pass_rate = agg_b.pass_rate,
328    pass_at_1 = agg_b.pass_at_1,
329    ci_95 = agg_b.ci_95,
330  }},
331  comparison = {{
332    delta_mean = delta,
333    welch_t = {{
334      t_stat = welch.t_stat,
335      df = welch.df,
336      significant = welch.significant,
337      direction = welch.direction,
338    }},
339    winner = winner,
340    summary = table.concat(parts, " "),
341  }},
342}}
343"#,
344            result_a_escaped = escape_for_lua_sq(&result_a),
345            result_b_escaped = escape_for_lua_sq(&result_b),
346            eval_id_a = eval_id_a,
347            eval_id_b = eval_id_b,
348            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
349            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
350        );
351
352        let ctx = serde_json::Value::Null;
353        let raw_result = self.start_and_tick(lua_code, ctx, None).await?;
354
355        // Persist comparison result
356        save_compare_result(eval_id_a, eval_id_b, &raw_result);
357
358        Ok(raw_result)
359    }
360}