Skip to main content

algocline_app/service/
eval.rs

1use super::eval_store::{
2    escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3    save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
/// Lua shim that bridges algocline's `alc.*` primitives to the `std` global
/// expected by evalframe.std. Injected once before any evalframe code runs.
///
/// The shim assigns a global `std` table with the following members:
///
/// - `std.json.decode` / `std.json.encode` — aliases for `alc.json_decode`
///   and `alc.json_encode`.
/// - `std.fs.read(path)` — opens `path` with `io.open(path, "r")` and returns
///   the whole content (`"*a"`). When the open fails it raises a Lua error
///   prefixed with `std.fs.read: ` (error level 2, i.e. attributed to the
///   caller), using the `io.open` message or, if absent, the path itself.
/// - `std.fs.is_file(path)` — returns `true` when `io.open(path, "r")`
///   succeeds and `false` otherwise.
///   NOTE(review): this is only an "can it be opened for reading" probe; on
///   platforms where opening a directory succeeds it would presumably report
///   `true` for directories as well — confirm if that matters to evalframe.
/// - `std.time.now` — alias for `alc.time`.
///
/// The text is spliced verbatim at the top of the Lua chunks generated in
/// this file, so it must remain valid stand-alone Lua.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
36
37impl AppService {
38    /// Run an evalframe evaluation suite via `alc.eval()`.
39    ///
40    /// Resolves the scenario from one of three input modes (inline/file/name),
41    /// injects the `std` global shim, and delegates to `alc.eval()` in prelude
42    /// which handles evalframe loading, provider wiring, and optional Card
43    /// emission.
44    ///
45    /// # Security: `strategy` is not sanitized
46    ///
47    /// `strategy` is interpolated into a Lua string literal without escaping.
48    /// This is intentional — algocline runs Lua in the caller's own process
49    /// with full ambient authority, so Lua injection does not cross a trust
50    /// boundary.
51    pub async fn eval(
52        &self,
53        scenario: Option<String>,
54        scenario_file: Option<String>,
55        scenario_name: Option<String>,
56        strategy: &str,
57        strategy_opts: Option<serde_json::Value>,
58        auto_card: bool,
59    ) -> Result<String, String> {
60        // Auto-install bundled packages if evalframe is missing
61        if !is_package_installed("evalframe") {
62            self.auto_install_bundled_packages().await?;
63            if !is_package_installed("evalframe") {
64                return Err(
65                    "Package 'evalframe' not found after installing bundled collection. \
66                     Use alc_pkg_install to install it manually."
67                        .into(),
68                );
69            }
70        }
71
72        let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name.clone())?;
73
74        // Build strategy opts Lua table literal
75        let opts_lua = match &strategy_opts {
76            Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
77            _ => "nil".to_string(),
78        };
79
80        let auto_card_lua = if auto_card { "true" } else { "false" };
81
82        // Delegate to alc.eval() in prelude.
83        // The shim injects `std` for evalframe, then the scenario code is
84        // evaluated into a table and passed to alc.eval() along with opts.
85        let wrapped = format!(
86            r#"{std_shim}
87
88local scenario = (function()
89{scenario_code}
90end)()
91
92return alc.eval(scenario, "{strategy}", {{
93  strategy_opts = {opts_lua},
94  auto_card = {auto_card_lua},
95}})
96"#,
97            std_shim = STD_SHIM,
98        );
99
100        let ctx = serde_json::Value::Null;
101        let result = self
102            .start_and_tick(wrapped, ctx, Some(strategy), vec![], vec![])
103            .await?;
104
105        // Persist eval result for history/comparison.
106        // Card emission is handled by alc.eval() Lua-side when auto_card=true.
107        if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
108            match parsed.get("status").and_then(|s| s.as_str()) {
109                Some("completed") => {
110                    save_eval_result(strategy, &result);
111                }
112                Some("needs_response") => {
113                    if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
114                        if let Ok(mut map) = self.eval_sessions.lock() {
115                            map.insert(sid.to_string(), strategy.to_string());
116                        }
117                    }
118                }
119                _ => {}
120            }
121        }
122
123        Ok(result)
124    }
125
126    /// List eval history, optionally filtered by strategy.
127    pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
128        let dir = evals_dir()?;
129        list_eval_history(&dir, strategy, limit)
130    }
131
132    /// View a specific eval result by ID.
133    pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
134        let evals_dir = evals_dir()?;
135        let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
136            .map_err(|e| format!("Invalid eval_id: {e}"))?;
137        if !path.exists() {
138            return Err(format!("Eval result not found: {eval_id}"));
139        }
140        std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
141    }
142
143    /// Compare two eval results with statistical significance testing.
144    ///
145    /// Delegates to evalframe's `stats.welch_t` (single source of truth for
146    /// t-distribution table and test logic). Reads persisted `aggregated.scores`
147    /// from each eval result — no re-computation of descriptive statistics.
148    ///
149    /// The comparison result is persisted to `~/.algocline/evals/` so repeated
150    /// lookups of the same pair are file reads only.
151    pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
152        // Check for cached comparison
153        let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
154        if let Ok(dir) = evals_dir() {
155            if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
156                if cached_path.exists() {
157                    return std::fs::read_to_string(&*cached_path)
158                        .map_err(|e| format!("Failed to read cached comparison: {e}"));
159                }
160            }
161        }
162
163        // Auto-install bundled packages if evalframe is missing
164        if !is_package_installed("evalframe") {
165            self.auto_install_bundled_packages().await?;
166            if !is_package_installed("evalframe") {
167                return Err(
168                    "Package 'evalframe' not found after installing bundled collection. \
169                     Use alc_pkg_install to install it manually."
170                        .into(),
171                );
172            }
173        }
174
175        let result_a = self.eval_detail(eval_id_a)?;
176        let result_b = self.eval_detail(eval_id_b)?;
177
178        // Build Lua snippet that uses evalframe's stats module
179        // to compute welch_t from the persisted aggregated scores.
180        let lua_code = format!(
181            r#"{std_shim}
182
183local stats = require("evalframe.eval.stats")
184
185local result_a = alc.json_decode('{result_a_escaped}')
186local result_b = alc.json_decode('{result_b_escaped}')
187
188local agg_a = result_a.result and result_a.result.aggregated
189local agg_b = result_b.result and result_b.result.aggregated
190
191if not agg_a or not agg_a.scores then
192  error("No aggregated scores in {eval_id_a}")
193end
194if not agg_b or not agg_b.scores then
195  error("No aggregated scores in {eval_id_b}")
196end
197
198local welch = stats.welch_t(agg_a.scores, agg_b.scores)
199
200local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
201local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
202
203local delta = agg_a.scores.mean - agg_b.scores.mean
204local winner = "none"
205if welch.significant then
206  winner = delta > 0 and "a" or "b"
207end
208
209-- Build summary text
210local parts = {{}}
211if welch.significant then
212  local w, l, d = strategy_a, strategy_b, delta
213  if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
214  parts[#parts + 1] = string.format(
215    "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
216    w, l, d, math.abs(welch.t_stat), welch.df
217  )
218else
219  parts[#parts + 1] = string.format(
220    "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
221    strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
222  )
223end
224if agg_a.pass_rate and agg_b.pass_rate then
225  local dp = agg_a.pass_rate - agg_b.pass_rate
226  if math.abs(dp) > 1e-9 then
227    local h = dp > 0 and strategy_a or strategy_b
228    parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
229  else
230    parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
231  end
232end
233
234return {{
235  a = {{
236    eval_id = "{eval_id_a}",
237    strategy = strategy_a,
238    scores = agg_a.scores,
239    pass_rate = agg_a.pass_rate,
240    pass_at_1 = agg_a.pass_at_1,
241    ci_95 = agg_a.ci_95,
242  }},
243  b = {{
244    eval_id = "{eval_id_b}",
245    strategy = strategy_b,
246    scores = agg_b.scores,
247    pass_rate = agg_b.pass_rate,
248    pass_at_1 = agg_b.pass_at_1,
249    ci_95 = agg_b.ci_95,
250  }},
251  comparison = {{
252    delta_mean = delta,
253    welch_t = {{
254      t_stat = welch.t_stat,
255      df = welch.df,
256      significant = welch.significant,
257      direction = welch.direction,
258    }},
259    winner = winner,
260    summary = table.concat(parts, " "),
261  }},
262}}
263"#,
264            result_a_escaped = escape_for_lua_sq(&result_a),
265            result_b_escaped = escape_for_lua_sq(&result_b),
266            eval_id_a = eval_id_a,
267            eval_id_b = eval_id_b,
268            std_shim = STD_SHIM,
269            strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
270            strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
271        );
272
273        let ctx = serde_json::Value::Null;
274        let raw_result = self
275            .start_and_tick(lua_code, ctx, None, vec![], vec![])
276            .await?;
277
278        // Persist comparison result
279        save_compare_result(eval_id_a, eval_id_b, &raw_result);
280
281        Ok(raw_result)
282    }
283}