Skip to main content

algocline_app/service/
eval.rs

1use super::eval_store::{
2    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history,
3    save_compare_result, save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
/// Lua shim that bridges algocline's `alc.*` primitives to the `std` global
/// expected by evalframe.std. Injected once before any evalframe code runs.
///
/// The shim assigns a global table `std` with the following members:
///
/// - `std.json.decode` / `std.json.encode`: aliases for `alc.json_decode`
///   and `alc.json_encode`.
/// - `std.fs.read(path)`: opens `path` with `io.open(path, "r")` and returns
///   the whole content (`"*a"`); raises a Lua error prefixed with
///   `std.fs.read: ` when the file cannot be opened.
/// - `std.fs.is_file(path)`: returns `true` when `io.open(path, "r")`
///   succeeds, `false` otherwise.
/// - `std.time.now`: alias for `alc.time`.
///
/// The text is spliced verbatim at the top of every Lua chunk generated in
/// this file, so it must stay valid as a standalone Lua statement.
//
// NOTE(review): `is_file` only checks that `io.open` succeeds; on some
// platforms `io.open` may also succeed for directories. Confirm whether
// evalframe relies on a strict "regular file" check.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
36
37impl AppService {
38    /// Run an evalframe evaluation suite via `alc.eval()`.
39    ///
40    /// Resolves the scenario from one of three input modes (inline/file/name),
41    /// injects the `std` global shim, and delegates to `alc.eval()` in prelude
42    /// which handles evalframe loading, provider wiring, and optional Card
43    /// emission.
44    ///
45    /// # Security: `strategy` is not sanitized
46    ///
47    /// `strategy` is interpolated into a Lua string literal without escaping.
48    /// This is intentional — algocline runs Lua in the caller's own process
49    /// with full ambient authority, so Lua injection does not cross a trust
50    /// boundary.
51    pub async fn eval(
52        &self,
53        scenario: Option<String>,
54        scenario_file: Option<String>,
55        scenario_name: Option<String>,
56        strategy: &str,
57        strategy_opts: Option<serde_json::Value>,
58        auto_card: bool,
59    ) -> Result<String, String> {
60        // Auto-install bundled packages if evalframe is missing
61        if !is_package_installed("evalframe") {
62            self.auto_install_bundled_packages().await?;
63            if !is_package_installed("evalframe") {
64                return Err(
65                    "Package 'evalframe' not found after installing bundled collection. \
66                     Use alc_pkg_install to install it manually."
67                        .into(),
68                );
69            }
70        }
71
72        let scenario_code =
73            resolve_scenario_code(scenario, scenario_file, scenario_name.clone())?;
74
75        // Build strategy opts Lua table literal
76        let opts_lua = match &strategy_opts {
77            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
78            _ => "nil".to_string(),
79        };
80
81        let auto_card_lua = if auto_card { "true" } else { "false" };
82
83        // Delegate to alc.eval() in prelude.
84        // The shim injects `std` for evalframe, then the scenario code is
85        // evaluated into a table and passed to alc.eval() along with opts.
86        let wrapped = format!(
87            r#"{std_shim}
88
89local scenario = (function()
90{scenario_code}
91end)()
92
93return alc.eval(scenario, "{strategy}", {{
94  strategy_opts = {opts_lua},
95  auto_card = {auto_card_lua},
96}})
97"#,
98            std_shim = STD_SHIM,
99        );
100
101        let ctx = serde_json::Value::Null;
102        let result = self
103            .start_and_tick(wrapped, ctx, Some(strategy), vec![])
104            .await?;
105
106        // Persist eval result for history/comparison.
107        // Card emission is handled by alc.eval() Lua-side when auto_card=true.
108        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
109            match parsed.get("status").and_then(|s| s.as_str()) {
110                Some("completed") => {
111                    save_eval_result(strategy, &result);
112                }
113                Some("needs_response") => {
114                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
115                        if let Ok(mut map) = self.eval_sessions.lock() {
116                            map.insert(sid.to_string(), strategy.to_string());
117                        }
118                    }
119                }
120                _ => {}
121            }
122        }
123
124        Ok(result)
125    }
126
127    /// List eval history, optionally filtered by strategy.
128    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
129        let dir = evals_dir()?;
130        list_eval_history(&dir, strategy, limit)
131    }
132
133    /// View a specific eval result by ID.
134    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
135        let evals_dir = evals_dir()?;
136        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
137            .map_err(|e| format!("Invalid eval_id: {e}"))?;
138        if !path.exists() {
139            return Err(format!("Eval result not found: {eval_id}"));
140        }
141        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
142    }
143
144    /// Compare two eval results with statistical significance testing.
145    ///
146    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
147    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
148    /// from each eval result — no re-computation of descriptive statistics.
149    ///
150    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
151    /// lookups of the same pair are file reads only.
152    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
153        // Check for cached comparison
154        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
155        if let Ok(dir) = evals_dir() {
156            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
157                if cached_path.exists() {
158                    return std::fs::read_to_string(&*cached_path)
159                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
160                }
161            }
162        }
163
164        // Auto-install bundled packages if evalframe is missing
165        if !is_package_installed("evalframe") {
166            self.auto_install_bundled_packages().await?;
167            if !is_package_installed("evalframe") {
168                return Err(
169                    "Package 'evalframe' not found after installing bundled collection. \
170                     Use alc_pkg_install to install it manually."
171                        .into(),
172                );
173            }
174        }
175
176        let result_a = self.eval_detail(eval_id_a)?;
177        let result_b = self.eval_detail(eval_id_b)?;
178
179        // Build Lua snippet that uses evalframe's stats module
180        // to compute welch_t from the persisted aggregated scores.
181        let lua_code = format!(
182            r#"{std_shim}
183
184local stats = require("evalframe.eval.stats")
185
186local result_a = alc.json_decode('{result_a_escaped}')
187local result_b = alc.json_decode('{result_b_escaped}')
188
189local agg_a = result_a.result and result_a.result.aggregated
190local agg_b = result_b.result and result_b.result.aggregated
191
192if not agg_a or not agg_a.scores then
193  error("No aggregated scores in {eval_id_a}")
194end
195if not agg_b or not agg_b.scores then
196  error("No aggregated scores in {eval_id_b}")
197end
198
199local welch = stats.welch_t(agg_a.scores, agg_b.scores)
200
201local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
202local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
203
204local delta = agg_a.scores.mean - agg_b.scores.mean
205local winner = "none"
206if welch.significant then
207  winner = delta > 0 and "a" or "b"
208end
209
210-- Build summary text
211local parts = {{}}
212if welch.significant then
213  local w, l, d = strategy_a, strategy_b, delta
214  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
215  parts[#parts + 1] = string.format(
216    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
217    w, l, d, math.abs(welch.t_stat), welch.df
218  )
219else
220  parts[#parts + 1] = string.format(
221    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
222    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
223  )
224end
225if agg_a.pass_rate and agg_b.pass_rate then
226  local dp = agg_a.pass_rate - agg_b.pass_rate
227  if math.abs(dp) > 1e-9 then
228    local h = dp > 0 and strategy_a or strategy_b
229    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
230  else
231    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
232  end
233end
234
235return {{
236  a = {{
237    eval_id = "{eval_id_a}",
238    strategy = strategy_a,
239    scores = agg_a.scores,
240    pass_rate = agg_a.pass_rate,
241    pass_at_1 = agg_a.pass_at_1,
242    ci_95 = agg_a.ci_95,
243  }},
244  b = {{
245    eval_id = "{eval_id_b}",
246    strategy = strategy_b,
247    scores = agg_b.scores,
248    pass_rate = agg_b.pass_rate,
249    pass_at_1 = agg_b.pass_at_1,
250    ci_95 = agg_b.ci_95,
251  }},
252  comparison = {{
253    delta_mean = delta,
254    welch_t = {{
255      t_stat = welch.t_stat,
256      df = welch.df,
257      significant = welch.significant,
258      direction = welch.direction,
259    }},
260    winner = winner,
261    summary = table.concat(parts, " "),
262  }},
263}}
264"#,
265            result_a_escaped = escape_for_lua_sq(&result_a),
266            result_b_escaped = escape_for_lua_sq(&result_b),
267            eval_id_a = eval_id_a,
268            eval_id_b = eval_id_b,
269            std_shim = STD_SHIM,
270            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
271            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
272        );
273
274        let ctx = serde_json::Value::Null;
275        let raw_result = self.start_and_tick(lua_code, ctx, None, vec![]).await?;
276
277        // Persist comparison result
278        save_compare_result(eval_id_a, eval_id_b, &raw_result);
279
280        Ok(raw_result)
281    }
282}