algocline_app/service/
eval.rs

1use std::collections::HashMap;
2use std::sync::Arc;
3
4use super::eval_store::{
5    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
6    save_eval_result, splice_response_string,
7};
8use super::path::ContainedPath;
9use super::resolve::{is_package_installed, resolve_scenario_code};
10use super::run::normalize_stringified_json_object;
11use super::AppService;
12
/// Lua shim that bridges algocline's `alc.*` primitives to the `std` global
/// expected by evalframe.std. Injected once before any evalframe code runs.
///
/// The snippet assigns a global Lua table named `std` with these members:
///
/// - `std.json.decode` / `std.json.encode` — aliases of `alc.json_decode`
///   and `alc.json_encode`.
/// - `std.fs.read(path)` — opens `path` with `io.open(path, "r")`, returns
///   the whole content (`f:read("*a")`) and closes the handle. If the open
///   fails it raises a Lua error prefixed with `std.fs.read: ` (error level 2,
///   i.e. attributed to the caller of `read`).
/// - `std.fs.is_file(path)` — `true` when `io.open(path, "r")` succeeds,
///   `false` otherwise.
///   NOTE(review): whether `io.open` also succeeds for directories is
///   platform-dependent — confirm if callers rely on a strict file check.
/// - `std.time.now` — alias of `alc.time`.
///
/// This is a runtime string: it is spliced verbatim at the top of the Lua
/// chunks built in this file through the `{std_shim}` format placeholder, so
/// any edit to the text below changes program behavior.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
40
41impl AppService {
42    /// Run an evalframe evaluation suite via `alc.eval()`.
43    ///
44    /// Resolves the scenario from one of three input modes (inline/file/name),
45    /// injects the `std` global shim, and delegates to `alc.eval()` in prelude
46    /// which handles evalframe loading, provider wiring, and optional Card
47    /// emission.
48    ///
49    /// # Security: `strategy` is not sanitized
50    ///
51    /// `strategy` is interpolated into a Lua string literal without escaping.
52    /// This is intentional — algocline runs Lua in the caller's own process
53    /// with full ambient authority, so Lua injection does not cross a trust
54    /// boundary.
55    pub async fn eval(
56        &self,
57        scenario: Option<String>,
58        scenario_file: Option<String>,
59        scenario_name: Option<String>,
60        strategy: &str,
61        strategy_opts: Option<serde_json::Value>,
62        auto_card: bool,
63    ) -> Result<String, String> {
64        // Auto-install bundled packages if evalframe is missing
65        let app_dir = self.log_config.app_dir();
66        if !is_package_installed(&app_dir, "evalframe") {
67            self.auto_install_bundled_packages().await?;
68            if !is_package_installed(&app_dir, "evalframe") {
69                return Err(
70                    "Package 'evalframe' not found after installing bundled collection. \
71                     Use alc_pkg_install to install it manually."
72                        .into(),
73                );
74            }
75        }
76
77        let scenario_code =
78            resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
79
80        // Build strategy opts Lua table literal
81        let strategy_opts = strategy_opts.map(normalize_stringified_json_object);
82        let opts_lua = match &strategy_opts {
83            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
84            _ => "nil".to_string(),
85        };
86
87        let auto_card_lua = if auto_card { "true" } else { "false" };
88
89        // Delegate to alc.eval() in prelude.
90        // The shim injects `std` for evalframe, then the scenario code is
91        // evaluated into a table and passed to alc.eval() along with opts.
92        let wrapped = format!(
93            r#"{std_shim}
94
95local scenario = (function()
96{scenario_code}
97end)()
98
99return alc.eval(scenario, "{strategy}", {{
100  strategy_opts = {opts_lua},
101  auto_card = {auto_card_lua},
102}})
103"#,
104            std_shim = STD_SHIM,
105        );
106
107        let ctx = serde_json::Value::Null;
108        // eval path does not accept ctx.env; pass an empty map so alc.env is
109        // present but empty (no env vars visible to eval strategies).
110        let env_map = Arc::new(HashMap::new());
111        let result = self
112            .start_and_tick(env_map, wrapped, ctx, Some(strategy), vec![], vec![])
113            .await?;
114
115        // Persist eval result for history/comparison.
116        // Card emission is handled by alc.eval() Lua-side when auto_card=true.
117        let mut save_warning: Option<String> = None;
118        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
119            match parsed.get("status").and_then(|s| s.as_str()) {
120                Some("completed") => {
121                    if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
122                        save_warning = Some(e);
123                    }
124                }
125                Some("needs_response") => {
126                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
127                        if let Ok(mut map) = self.eval_sessions.lock() {
128                            map.insert(sid.to_string(), strategy.to_string());
129                        }
130                    }
131                }
132                _ => {}
133            }
134        }
135
136        match save_warning {
137            Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
138            None => Ok(result),
139        }
140    }
141
142    /// List eval history, optionally filtered by strategy.
143    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
144        let dir = evals_dir(&self.log_config.app_dir());
145        list_eval_history(&dir, strategy, limit)
146    }
147
148    /// View a specific eval result by ID.
149    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
150        let evals_dir = evals_dir(&self.log_config.app_dir());
151        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
152            .map_err(|e| format!("Invalid eval_id: {e}"))?;
153        if !path.exists() {
154            return Err(format!("Eval result not found: {eval_id}"));
155        }
156        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
157    }
158
159    /// Compare two eval results with statistical significance testing.
160    ///
161    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
162    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
163    /// from each eval result — no re-computation of descriptive statistics.
164    ///
165    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
166    /// lookups of the same pair are file reads only.
167    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
168        let app_dir = self.log_config.app_dir();
169        // Check for cached comparison
170        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
171        {
172            let dir = evals_dir(&app_dir);
173            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
174                if cached_path.exists() {
175                    return std::fs::read_to_string(&*cached_path)
176                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
177                }
178            }
179        }
180
181        // Auto-install bundled packages if evalframe is missing
182        if !is_package_installed(&app_dir, "evalframe") {
183            self.auto_install_bundled_packages().await?;
184            if !is_package_installed(&app_dir, "evalframe") {
185                return Err(
186                    "Package 'evalframe' not found after installing bundled collection. \
187                     Use alc_pkg_install to install it manually."
188                        .into(),
189                );
190            }
191        }
192
193        let result_a = self.eval_detail(eval_id_a)?;
194        let result_b = self.eval_detail(eval_id_b)?;
195
196        // Build Lua snippet that uses evalframe's stats module
197        // to compute welch_t from the persisted aggregated scores.
198        let lua_code = format!(
199            r#"{std_shim}
200
201local stats = require("evalframe.eval.stats")
202
203local result_a = alc.json_decode('{result_a_escaped}')
204local result_b = alc.json_decode('{result_b_escaped}')
205
206local agg_a = result_a.result and result_a.result.aggregated
207local agg_b = result_b.result and result_b.result.aggregated
208
209if not agg_a or not agg_a.scores then
210  error("No aggregated scores in {eval_id_a}")
211end
212if not agg_b or not agg_b.scores then
213  error("No aggregated scores in {eval_id_b}")
214end
215
216local welch = stats.welch_t(agg_a.scores, agg_b.scores)
217
218local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
219local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
220
221local delta = agg_a.scores.mean - agg_b.scores.mean
222local winner = "none"
223if welch.significant then
224  winner = delta > 0 and "a" or "b"
225end
226
227-- Build summary text
228local parts = {{}}
229if welch.significant then
230  local w, l, d = strategy_a, strategy_b, delta
231  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
232  parts[#parts + 1] = string.format(
233    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
234    w, l, d, math.abs(welch.t_stat), welch.df
235  )
236else
237  parts[#parts + 1] = string.format(
238    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
239    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
240  )
241end
242if agg_a.pass_rate and agg_b.pass_rate then
243  local dp = agg_a.pass_rate - agg_b.pass_rate
244  if math.abs(dp) > 1e-9 then
245    local h = dp > 0 and strategy_a or strategy_b
246    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
247  else
248    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
249  end
250end
251
252return {{
253  a = {{
254    eval_id = "{eval_id_a}",
255    strategy = strategy_a,
256    scores = agg_a.scores,
257    pass_rate = agg_a.pass_rate,
258    pass_at_1 = agg_a.pass_at_1,
259    ci_95 = agg_a.ci_95,
260  }},
261  b = {{
262    eval_id = "{eval_id_b}",
263    strategy = strategy_b,
264    scores = agg_b.scores,
265    pass_rate = agg_b.pass_rate,
266    pass_at_1 = agg_b.pass_at_1,
267    ci_95 = agg_b.ci_95,
268  }},
269  comparison = {{
270    delta_mean = delta,
271    welch_t = {{
272      t_stat = welch.t_stat,
273      df = welch.df,
274      significant = welch.significant,
275      direction = welch.direction,
276    }},
277    winner = winner,
278    summary = table.concat(parts, " "),
279  }},
280}}
281"#,
282            result_a_escaped = escape_for_lua_sq(&result_a),
283            result_b_escaped = escape_for_lua_sq(&result_b),
284            eval_id_a = eval_id_a,
285            eval_id_b = eval_id_b,
286            std_shim = STD_SHIM,
287            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
288            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
289        );
290
291        let ctx = serde_json::Value::Null;
292        // compare path does not accept ctx.env; pass an empty map so alc.env is
293        // present but empty (no env vars visible to compare strategies).
294        let env_map = Arc::new(HashMap::new());
295        let raw_result = self
296            .start_and_tick(env_map, lua_code, ctx, None, vec![], vec![])
297            .await?;
298
299        // Persist comparison result. Storage failure surfaces as an
300        // additive `save_warning` field on the response — the comparison
301        // itself ran to completion and remains valid in memory.
302        match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
303            Ok(()) => Ok(raw_result),
304            Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
305        }
306    }
307}
algocline_app/service/eval.rs

algocline_app/service/
eval.rs