algocline_app/service/
eval.rs

1use super::eval_store::{
2    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3    save_eval_result, splice_response_string,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
/// Lua shim that bridges algocline's `alc.*` primitives to the `std` global
/// expected by evalframe.std.
///
/// The shim is prepended to every Lua chunk this module generates, so it runs
/// before any evalframe code. It assigns a global table `std` with:
///
/// - `std.json.decode` / `std.json.encode` — aliases of `alc.json_decode` and
///   `alc.json_encode`.
/// - `std.fs.read(path)` — opens `path` with `io.open(path, "r")`, returns the
///   whole content (`f:read("*a")`), and raises a Lua error prefixed with
///   `std.fs.read: ` (reported at the caller's level) when the open fails.
/// - `std.fs.is_file(path)` — `true` when `io.open(path, "r")` succeeds,
///   `false` otherwise.
///   NOTE(review): this only tests that the path can be opened; whether a
///   directory can be opened this way is platform-dependent — confirm if
///   callers rely on a strict "regular file" check.
/// - `std.time.now` — alias of `alc.time`.
///
/// The text below is executed verbatim by the Lua runtime; treat every
/// character of it as code.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
36
37impl AppService {
38    /// Run an evalframe evaluation suite via `alc.eval()`.
39    ///
40    /// Resolves the scenario from one of three input modes (inline/file/name),
41    /// injects the `std` global shim, and delegates to `alc.eval()` in prelude
42    /// which handles evalframe loading, provider wiring, and optional Card
43    /// emission.
44    ///
45    /// # Security: `strategy` is not sanitized
46    ///
47    /// `strategy` is interpolated into a Lua string literal without escaping.
48    /// This is intentional — algocline runs Lua in the caller's own process
49    /// with full ambient authority, so Lua injection does not cross a trust
50    /// boundary.
51    pub async fn eval(
52        &self,
53        scenario: Option<String>,
54        scenario_file: Option<String>,
55        scenario_name: Option<String>,
56        strategy: &str,
57        strategy_opts: Option<serde_json::Value>,
58        auto_card: bool,
59    ) -> Result<String, String> {
60        // Auto-install bundled packages if evalframe is missing
61        let app_dir = self.log_config.app_dir();
62        if !is_package_installed(&app_dir, "evalframe") {
63            self.auto_install_bundled_packages().await?;
64            if !is_package_installed(&app_dir, "evalframe") {
65                return Err(
66                    "Package 'evalframe' not found after installing bundled collection. \
67                     Use alc_pkg_install to install it manually."
68                        .into(),
69                );
70            }
71        }
72
73        let scenario_code =
74            resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
75
76        // Build strategy opts Lua table literal
77        let opts_lua = match &strategy_opts {
78            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
79            _ => "nil".to_string(),
80        };
81
82        let auto_card_lua = if auto_card { "true" } else { "false" };
83
84        // Delegate to alc.eval() in prelude.
85        // The shim injects `std` for evalframe, then the scenario code is
86        // evaluated into a table and passed to alc.eval() along with opts.
87        let wrapped = format!(
88            r#"{std_shim}
89
90local scenario = (function()
91{scenario_code}
92end)()
93
94return alc.eval(scenario, "{strategy}", {{
95  strategy_opts = {opts_lua},
96  auto_card = {auto_card_lua},
97}})
98"#,
99            std_shim = STD_SHIM,
100        );
101
102        let ctx = serde_json::Value::Null;
103        let result = self
104            .start_and_tick(wrapped, ctx, Some(strategy), vec![], vec![])
105            .await?;
106
107        // Persist eval result for history/comparison.
108        // Card emission is handled by alc.eval() Lua-side when auto_card=true.
109        let mut save_warning: Option<String> = None;
110        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
111            match parsed.get("status").and_then(|s| s.as_str()) {
112                Some("completed") => {
113                    if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
114                        save_warning = Some(e);
115                    }
116                }
117                Some("needs_response") => {
118                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
119                        if let Ok(mut map) = self.eval_sessions.lock() {
120                            map.insert(sid.to_string(), strategy.to_string());
121                        }
122                    }
123                }
124                _ => {}
125            }
126        }
127
128        match save_warning {
129            Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
130            None => Ok(result),
131        }
132    }
133
134    /// List eval history, optionally filtered by strategy.
135    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
136        let dir = evals_dir(&self.log_config.app_dir());
137        list_eval_history(&dir, strategy, limit)
138    }
139
140    /// View a specific eval result by ID.
141    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
142        let evals_dir = evals_dir(&self.log_config.app_dir());
143        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
144            .map_err(|e| format!("Invalid eval_id: {e}"))?;
145        if !path.exists() {
146            return Err(format!("Eval result not found: {eval_id}"));
147        }
148        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
149    }
150
151    /// Compare two eval results with statistical significance testing.
152    ///
153    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
154    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
155    /// from each eval result — no re-computation of descriptive statistics.
156    ///
157    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
158    /// lookups of the same pair are file reads only.
159    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
160        let app_dir = self.log_config.app_dir();
161        // Check for cached comparison
162        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
163        {
164            let dir = evals_dir(&app_dir);
165            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
166                if cached_path.exists() {
167                    return std::fs::read_to_string(&*cached_path)
168                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
169                }
170            }
171        }
172
173        // Auto-install bundled packages if evalframe is missing
174        if !is_package_installed(&app_dir, "evalframe") {
175            self.auto_install_bundled_packages().await?;
176            if !is_package_installed(&app_dir, "evalframe") {
177                return Err(
178                    "Package 'evalframe' not found after installing bundled collection. \
179                     Use alc_pkg_install to install it manually."
180                        .into(),
181                );
182            }
183        }
184
185        let result_a = self.eval_detail(eval_id_a)?;
186        let result_b = self.eval_detail(eval_id_b)?;
187
188        // Build Lua snippet that uses evalframe's stats module
189        // to compute welch_t from the persisted aggregated scores.
190        let lua_code = format!(
191            r#"{std_shim}
192
193local stats = require("evalframe.eval.stats")
194
195local result_a = alc.json_decode('{result_a_escaped}')
196local result_b = alc.json_decode('{result_b_escaped}')
197
198local agg_a = result_a.result and result_a.result.aggregated
199local agg_b = result_b.result and result_b.result.aggregated
200
201if not agg_a or not agg_a.scores then
202  error("No aggregated scores in {eval_id_a}")
203end
204if not agg_b or not agg_b.scores then
205  error("No aggregated scores in {eval_id_b}")
206end
207
208local welch = stats.welch_t(agg_a.scores, agg_b.scores)
209
210local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
211local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
212
213local delta = agg_a.scores.mean - agg_b.scores.mean
214local winner = "none"
215if welch.significant then
216  winner = delta > 0 and "a" or "b"
217end
218
219-- Build summary text
220local parts = {{}}
221if welch.significant then
222  local w, l, d = strategy_a, strategy_b, delta
223  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
224  parts[#parts + 1] = string.format(
225    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
226    w, l, d, math.abs(welch.t_stat), welch.df
227  )
228else
229  parts[#parts + 1] = string.format(
230    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
231    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
232  )
233end
234if agg_a.pass_rate and agg_b.pass_rate then
235  local dp = agg_a.pass_rate - agg_b.pass_rate
236  if math.abs(dp) > 1e-9 then
237    local h = dp > 0 and strategy_a or strategy_b
238    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
239  else
240    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
241  end
242end
243
244return {{
245  a = {{
246    eval_id = "{eval_id_a}",
247    strategy = strategy_a,
248    scores = agg_a.scores,
249    pass_rate = agg_a.pass_rate,
250    pass_at_1 = agg_a.pass_at_1,
251    ci_95 = agg_a.ci_95,
252  }},
253  b = {{
254    eval_id = "{eval_id_b}",
255    strategy = strategy_b,
256    scores = agg_b.scores,
257    pass_rate = agg_b.pass_rate,
258    pass_at_1 = agg_b.pass_at_1,
259    ci_95 = agg_b.ci_95,
260  }},
261  comparison = {{
262    delta_mean = delta,
263    welch_t = {{
264      t_stat = welch.t_stat,
265      df = welch.df,
266      significant = welch.significant,
267      direction = welch.direction,
268    }},
269    winner = winner,
270    summary = table.concat(parts, " "),
271  }},
272}}
273"#,
274            result_a_escaped = escape_for_lua_sq(&result_a),
275            result_b_escaped = escape_for_lua_sq(&result_b),
276            eval_id_a = eval_id_a,
277            eval_id_b = eval_id_b,
278            std_shim = STD_SHIM,
279            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
280            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
281        );
282
283        let ctx = serde_json::Value::Null;
284        let raw_result = self
285            .start_and_tick(lua_code, ctx, None, vec![], vec![])
286            .await?;
287
288        // Persist comparison result. Storage failure surfaces as an
289        // additive `save_warning` field on the response — the comparison
290        // itself ran to completion and remains valid in memory.
291        match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
292            Ok(()) => Ok(raw_result),
293            Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
294        }
295    }
296}
algocline_app/service/eval.rs

algocline_app/service/
eval.rs