1use super::eval_store::{
2 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3 save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
9impl AppService {
10 pub async fn eval(
26 &self,
27 scenario: Option<String>,
28 scenario_file: Option<String>,
29 scenario_name: Option<String>,
30 strategy: &str,
31 strategy_opts: Option<serde_json::Value>,
32 ) -> Result<String, String> {
33 if !is_package_installed("evalframe") {
35 self.auto_install_bundled_packages().await?;
36 if !is_package_installed("evalframe") {
37 return Err(
38 "Package 'evalframe' not found after installing bundled collection. \
39 Use alc_pkg_install to install it manually."
40 .into(),
41 );
42 }
43 }
44
45 let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name)?;
46
47 let opts_lua = match &strategy_opts {
49 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
50 _ => "{}".to_string(),
51 };
52
53 let wrapped = format!(
62 r#"
63std = {{
64 json = {{
65 decode = alc.json_decode,
66 encode = alc.json_encode,
67 }},
68 fs = {{
69 read = function(path)
70 local f, err = io.open(path, "r")
71 if not f then error("std.fs.read: " .. (err or path), 2) end
72 local content = f:read("*a")
73 f:close()
74 return content
75 end,
76 is_file = function(path)
77 local f = io.open(path, "r")
78 if f then f:close(); return true end
79 return false
80 end,
81 }},
82 time = {{
83 now = alc.time,
84 }},
85}}
86
87local ef = require("evalframe")
88
89-- Load scenario (bindings + cases, no provider)
90local spec = (function()
91{scenario_code}
92end)()
93
94-- Inject strategy as provider
95spec.provider = ef.providers.algocline {{
96 strategy = "{strategy}",
97 opts = {opts_lua},
98}}
99
100-- Build and run suite
101local s = ef.suite "eval" (spec)
102local report = s:run()
103return report:to_table()
104"#
105 );
106
107 let ctx = serde_json::Value::Null;
108 let result = self
109 .start_and_tick(wrapped, ctx, Some(strategy), vec![])
110 .await?;
111
112 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
116 match parsed.get("status").and_then(|s| s.as_str()) {
117 Some("completed") => {
118 save_eval_result(strategy, &result);
119 }
120 Some("needs_response") => {
121 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
122 if let Ok(mut map) = self.eval_sessions.lock() {
123 map.insert(sid.to_string(), strategy.to_string());
124 }
125 }
126 }
127 _ => {}
128 }
129 }
130
131 Ok(result)
132 }
133
134 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
136 let dir = evals_dir()?;
137 list_eval_history(&dir, strategy, limit)
138 }
139
140 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
142 let evals_dir = evals_dir()?;
143 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
144 .map_err(|e| format!("Invalid eval_id: {e}"))?;
145 if !path.exists() {
146 return Err(format!("Eval result not found: {eval_id}"));
147 }
148 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
149 }
150
151 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
160 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
162 if let Ok(dir) = evals_dir() {
163 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
164 if cached_path.exists() {
165 return std::fs::read_to_string(&*cached_path)
166 .map_err(|e| format!("Failed to read cached comparison: {e}"));
167 }
168 }
169 }
170
171 if !is_package_installed("evalframe") {
173 self.auto_install_bundled_packages().await?;
174 if !is_package_installed("evalframe") {
175 return Err(
176 "Package 'evalframe' not found after installing bundled collection. \
177 Use alc_pkg_install to install it manually."
178 .into(),
179 );
180 }
181 }
182
183 let result_a = self.eval_detail(eval_id_a)?;
184 let result_b = self.eval_detail(eval_id_b)?;
185
186 let lua_code = format!(
189 r#"
190std = {{
191 json = {{
192 decode = alc.json_decode,
193 encode = alc.json_encode,
194 }},
195 fs = {{ read = function() end, is_file = function() return false end }},
196 time = {{ now = alc.time }},
197}}
198
199local stats = require("evalframe.eval.stats")
200
201local result_a = alc.json_decode('{result_a_escaped}')
202local result_b = alc.json_decode('{result_b_escaped}')
203
204local agg_a = result_a.result and result_a.result.aggregated
205local agg_b = result_b.result and result_b.result.aggregated
206
207if not agg_a or not agg_a.scores then
208 error("No aggregated scores in {eval_id_a}")
209end
210if not agg_b or not agg_b.scores then
211 error("No aggregated scores in {eval_id_b}")
212end
213
214local welch = stats.welch_t(agg_a.scores, agg_b.scores)
215
216local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
217local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
218
219local delta = agg_a.scores.mean - agg_b.scores.mean
220local winner = "none"
221if welch.significant then
222 winner = delta > 0 and "a" or "b"
223end
224
225-- Build summary text
226local parts = {{}}
227if welch.significant then
228 local w, l, d = strategy_a, strategy_b, delta
229 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
230 parts[#parts + 1] = string.format(
231 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
232 w, l, d, math.abs(welch.t_stat), welch.df
233 )
234else
235 parts[#parts + 1] = string.format(
236 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
237 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
238 )
239end
240if agg_a.pass_rate and agg_b.pass_rate then
241 local dp = agg_a.pass_rate - agg_b.pass_rate
242 if math.abs(dp) > 1e-9 then
243 local h = dp > 0 and strategy_a or strategy_b
244 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
245 else
246 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
247 end
248end
249
250return {{
251 a = {{
252 eval_id = "{eval_id_a}",
253 strategy = strategy_a,
254 scores = agg_a.scores,
255 pass_rate = agg_a.pass_rate,
256 pass_at_1 = agg_a.pass_at_1,
257 ci_95 = agg_a.ci_95,
258 }},
259 b = {{
260 eval_id = "{eval_id_b}",
261 strategy = strategy_b,
262 scores = agg_b.scores,
263 pass_rate = agg_b.pass_rate,
264 pass_at_1 = agg_b.pass_at_1,
265 ci_95 = agg_b.ci_95,
266 }},
267 comparison = {{
268 delta_mean = delta,
269 welch_t = {{
270 t_stat = welch.t_stat,
271 df = welch.df,
272 significant = welch.significant,
273 direction = welch.direction,
274 }},
275 winner = winner,
276 summary = table.concat(parts, " "),
277 }},
278}}
279"#,
280 result_a_escaped = escape_for_lua_sq(&result_a),
281 result_b_escaped = escape_for_lua_sq(&result_b),
282 eval_id_a = eval_id_a,
283 eval_id_b = eval_id_b,
284 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
285 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
286 );
287
288 let ctx = serde_json::Value::Null;
289 let raw_result = self.start_and_tick(lua_code, ctx, None, vec![]).await?;
290
291 save_compare_result(eval_id_a, eval_id_b, &raw_result);
293
294 Ok(raw_result)
295 }
296}