algocline_app/service/
eval.rs

1use super::eval_store::{
2    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3    save_eval_result, splice_response_string,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::run::normalize_stringified_json_object;
8use super::AppService;
9
/// Lua shim that bridges algocline's `alc.*` primitives to the `std` global
/// expected by evalframe.std.
///
/// The shim is plain Lua source that assigns a global table named `std`. The
/// Lua chunks built in this file place it at the very top, so it runs before
/// any evalframe code in the same chunk.
///
/// Contents of the `std` table:
/// - `std.json.decode` / `std.json.encode`: aliases of `alc.json_decode` and
///   `alc.json_encode`.
/// - `std.fs.read(path)`: opens `path` with `io.open(path, "r")`, returns the
///   whole content (`"*a"`), and raises a Lua error prefixed with
///   `std.fs.read: ` (error level 2) when the file cannot be opened.
/// - `std.fs.is_file(path)`: returns `true` when `io.open(path, "r")`
///   succeeds, `false` otherwise.
///   NOTE(review): this is an "can be opened for reading" probe, not a real
///   file-type check — on some platforms `io.open` may also succeed for
///   directories; confirm whether evalframe relies on the distinction.
/// - `std.time.now`: alias of `alc.time`.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
37
38impl AppService {
39    /// Run an evalframe evaluation suite via `alc.eval()`.
40    ///
41    /// Resolves the scenario from one of three input modes (inline/file/name),
42    /// injects the `std` global shim, and delegates to `alc.eval()` in prelude
43    /// which handles evalframe loading, provider wiring, and optional Card
44    /// emission.
45    ///
46    /// # Security: `strategy` is not sanitized
47    ///
48    /// `strategy` is interpolated into a Lua string literal without escaping.
49    /// This is intentional — algocline runs Lua in the caller's own process
50    /// with full ambient authority, so Lua injection does not cross a trust
51    /// boundary.
52    pub async fn eval(
53        &self,
54        scenario: Option<String>,
55        scenario_file: Option<String>,
56        scenario_name: Option<String>,
57        strategy: &str,
58        strategy_opts: Option<serde_json::Value>,
59        auto_card: bool,
60    ) -> Result<String, String> {
61        // Auto-install bundled packages if evalframe is missing
62        let app_dir = self.log_config.app_dir();
63        if !is_package_installed(&app_dir, "evalframe") {
64            self.auto_install_bundled_packages().await?;
65            if !is_package_installed(&app_dir, "evalframe") {
66                return Err(
67                    "Package 'evalframe' not found after installing bundled collection. \
68                     Use alc_pkg_install to install it manually."
69                        .into(),
70                );
71            }
72        }
73
74        let scenario_code =
75            resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
76
77        // Build strategy opts Lua table literal
78        let strategy_opts = strategy_opts.map(normalize_stringified_json_object);
79        let opts_lua = match &strategy_opts {
80            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
81            _ => "nil".to_string(),
82        };
83
84        let auto_card_lua = if auto_card { "true" } else { "false" };
85
86        // Delegate to alc.eval() in prelude.
87        // The shim injects `std` for evalframe, then the scenario code is
88        // evaluated into a table and passed to alc.eval() along with opts.
89        let wrapped = format!(
90            r#"{std_shim}
91
92local scenario = (function()
93{scenario_code}
94end)()
95
96return alc.eval(scenario, "{strategy}", {{
97  strategy_opts = {opts_lua},
98  auto_card = {auto_card_lua},
99}})
100"#,
101            std_shim = STD_SHIM,
102        );
103
104        let ctx = serde_json::Value::Null;
105        let result = self
106            .start_and_tick(wrapped, ctx, Some(strategy), vec![], vec![])
107            .await?;
108
109        // Persist eval result for history/comparison.
110        // Card emission is handled by alc.eval() Lua-side when auto_card=true.
111        let mut save_warning: Option<String> = None;
112        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
113            match parsed.get("status").and_then(|s| s.as_str()) {
114                Some("completed") => {
115                    if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
116                        save_warning = Some(e);
117                    }
118                }
119                Some("needs_response") => {
120                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
121                        if let Ok(mut map) = self.eval_sessions.lock() {
122                            map.insert(sid.to_string(), strategy.to_string());
123                        }
124                    }
125                }
126                _ => {}
127            }
128        }
129
130        match save_warning {
131            Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
132            None => Ok(result),
133        }
134    }
135
136    /// List eval history, optionally filtered by strategy.
137    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
138        let dir = evals_dir(&self.log_config.app_dir());
139        list_eval_history(&dir, strategy, limit)
140    }
141
142    /// View a specific eval result by ID.
143    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
144        let evals_dir = evals_dir(&self.log_config.app_dir());
145        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
146            .map_err(|e| format!("Invalid eval_id: {e}"))?;
147        if !path.exists() {
148            return Err(format!("Eval result not found: {eval_id}"));
149        }
150        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
151    }
152
153    /// Compare two eval results with statistical significance testing.
154    ///
155    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
156    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
157    /// from each eval result — no re-computation of descriptive statistics.
158    ///
159    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
160    /// lookups of the same pair are file reads only.
161    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
162        let app_dir = self.log_config.app_dir();
163        // Check for cached comparison
164        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
165        {
166            let dir = evals_dir(&app_dir);
167            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
168                if cached_path.exists() {
169                    return std::fs::read_to_string(&*cached_path)
170                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
171                }
172            }
173        }
174
175        // Auto-install bundled packages if evalframe is missing
176        if !is_package_installed(&app_dir, "evalframe") {
177            self.auto_install_bundled_packages().await?;
178            if !is_package_installed(&app_dir, "evalframe") {
179                return Err(
180                    "Package 'evalframe' not found after installing bundled collection. \
181                     Use alc_pkg_install to install it manually."
182                        .into(),
183                );
184            }
185        }
186
187        let result_a = self.eval_detail(eval_id_a)?;
188        let result_b = self.eval_detail(eval_id_b)?;
189
190        // Build Lua snippet that uses evalframe's stats module
191        // to compute welch_t from the persisted aggregated scores.
192        let lua_code = format!(
193            r#"{std_shim}
194
195local stats = require("evalframe.eval.stats")
196
197local result_a = alc.json_decode('{result_a_escaped}')
198local result_b = alc.json_decode('{result_b_escaped}')
199
200local agg_a = result_a.result and result_a.result.aggregated
201local agg_b = result_b.result and result_b.result.aggregated
202
203if not agg_a or not agg_a.scores then
204  error("No aggregated scores in {eval_id_a}")
205end
206if not agg_b or not agg_b.scores then
207  error("No aggregated scores in {eval_id_b}")
208end
209
210local welch = stats.welch_t(agg_a.scores, agg_b.scores)
211
212local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
213local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
214
215local delta = agg_a.scores.mean - agg_b.scores.mean
216local winner = "none"
217if welch.significant then
218  winner = delta > 0 and "a" or "b"
219end
220
221-- Build summary text
222local parts = {{}}
223if welch.significant then
224  local w, l, d = strategy_a, strategy_b, delta
225  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
226  parts[#parts + 1] = string.format(
227    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
228    w, l, d, math.abs(welch.t_stat), welch.df
229  )
230else
231  parts[#parts + 1] = string.format(
232    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
233    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
234  )
235end
236if agg_a.pass_rate and agg_b.pass_rate then
237  local dp = agg_a.pass_rate - agg_b.pass_rate
238  if math.abs(dp) > 1e-9 then
239    local h = dp > 0 and strategy_a or strategy_b
240    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
241  else
242    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
243  end
244end
245
246return {{
247  a = {{
248    eval_id = "{eval_id_a}",
249    strategy = strategy_a,
250    scores = agg_a.scores,
251    pass_rate = agg_a.pass_rate,
252    pass_at_1 = agg_a.pass_at_1,
253    ci_95 = agg_a.ci_95,
254  }},
255  b = {{
256    eval_id = "{eval_id_b}",
257    strategy = strategy_b,
258    scores = agg_b.scores,
259    pass_rate = agg_b.pass_rate,
260    pass_at_1 = agg_b.pass_at_1,
261    ci_95 = agg_b.ci_95,
262  }},
263  comparison = {{
264    delta_mean = delta,
265    welch_t = {{
266      t_stat = welch.t_stat,
267      df = welch.df,
268      significant = welch.significant,
269      direction = welch.direction,
270    }},
271    winner = winner,
272    summary = table.concat(parts, " "),
273  }},
274}}
275"#,
276            result_a_escaped = escape_for_lua_sq(&result_a),
277            result_b_escaped = escape_for_lua_sq(&result_b),
278            eval_id_a = eval_id_a,
279            eval_id_b = eval_id_b,
280            std_shim = STD_SHIM,
281            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
282            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
283        );
284
285        let ctx = serde_json::Value::Null;
286        let raw_result = self
287            .start_and_tick(lua_code, ctx, None, vec![], vec![])
288            .await?;
289
290        // Persist comparison result. Storage failure surfaces as an
291        // additive `save_warning` field on the response — the comparison
292        // itself ran to completion and remains valid in memory.
293        match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
294            Ok(()) => Ok(raw_result),
295            Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
296        }
297    }
298}
algocline_app/service/eval.rs

algocline_app/service/
eval.rs