1use super::eval_store::{
2 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3 save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
9impl AppService {
10 pub async fn eval(
26 &self,
27 scenario: Option<String>,
28 scenario_file: Option<String>,
29 scenario_name: Option<String>,
30 strategy: &str,
31 strategy_opts: Option<serde_json::Value>,
32 ) -> Result<String, String> {
33 if !is_package_installed("evalframe") {
35 self.auto_install_bundled_packages().await?;
36 if !is_package_installed("evalframe") {
37 return Err(
38 "Package 'evalframe' not found after installing bundled collection. \
39 Use alc_pkg_install to install it manually."
40 .into(),
41 );
42 }
43 }
44
45 let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name)?;
46
47 let opts_lua = match &strategy_opts {
49 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
50 _ => "{}".to_string(),
51 };
52
53 let wrapped = format!(
62 r#"
63std = {{
64 json = {{
65 decode = alc.json_decode,
66 encode = alc.json_encode,
67 }},
68 fs = {{
69 read = function(path)
70 local f, err = io.open(path, "r")
71 if not f then error("std.fs.read: " .. (err or path), 2) end
72 local content = f:read("*a")
73 f:close()
74 return content
75 end,
76 is_file = function(path)
77 local f = io.open(path, "r")
78 if f then f:close(); return true end
79 return false
80 end,
81 }},
82 time = {{
83 now = alc.time,
84 }},
85}}
86
87local ef = require("evalframe")
88
89-- Load scenario (bindings + cases, no provider)
90local spec = (function()
91{scenario_code}
92end)()
93
94-- Inject strategy as provider
95spec.provider = ef.providers.algocline {{
96 strategy = "{strategy}",
97 opts = {opts_lua},
98}}
99
100-- Build and run suite
101local s = ef.suite "eval" (spec)
102local report = s:run()
103return report:to_table()
104"#
105 );
106
107 let ctx = serde_json::Value::Null;
108 let result = self.start_and_tick(wrapped, ctx, Some(strategy)).await?;
109
110 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
114 match parsed.get("status").and_then(|s| s.as_str()) {
115 Some("completed") => {
116 save_eval_result(strategy, &result);
117 }
118 Some("needs_response") => {
119 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
120 if let Ok(mut map) = self.eval_sessions.lock() {
121 map.insert(sid.to_string(), strategy.to_string());
122 }
123 }
124 }
125 _ => {}
126 }
127 }
128
129 Ok(result)
130 }
131
132 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
134 let dir = evals_dir()?;
135 list_eval_history(&dir, strategy, limit)
136 }
137
138 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
140 let evals_dir = evals_dir()?;
141 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
142 .map_err(|e| format!("Invalid eval_id: {e}"))?;
143 if !path.exists() {
144 return Err(format!("Eval result not found: {eval_id}"));
145 }
146 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
147 }
148
149 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
158 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
160 if let Ok(dir) = evals_dir() {
161 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
162 if cached_path.exists() {
163 return std::fs::read_to_string(&*cached_path)
164 .map_err(|e| format!("Failed to read cached comparison: {e}"));
165 }
166 }
167 }
168
169 if !is_package_installed("evalframe") {
171 self.auto_install_bundled_packages().await?;
172 if !is_package_installed("evalframe") {
173 return Err(
174 "Package 'evalframe' not found after installing bundled collection. \
175 Use alc_pkg_install to install it manually."
176 .into(),
177 );
178 }
179 }
180
181 let result_a = self.eval_detail(eval_id_a)?;
182 let result_b = self.eval_detail(eval_id_b)?;
183
184 let lua_code = format!(
187 r#"
188std = {{
189 json = {{
190 decode = alc.json_decode,
191 encode = alc.json_encode,
192 }},
193 fs = {{ read = function() end, is_file = function() return false end }},
194 time = {{ now = alc.time }},
195}}
196
197local stats = require("evalframe.eval.stats")
198
199local result_a = alc.json_decode('{result_a_escaped}')
200local result_b = alc.json_decode('{result_b_escaped}')
201
202local agg_a = result_a.result and result_a.result.aggregated
203local agg_b = result_b.result and result_b.result.aggregated
204
205if not agg_a or not agg_a.scores then
206 error("No aggregated scores in {eval_id_a}")
207end
208if not agg_b or not agg_b.scores then
209 error("No aggregated scores in {eval_id_b}")
210end
211
212local welch = stats.welch_t(agg_a.scores, agg_b.scores)
213
214local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
215local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
216
217local delta = agg_a.scores.mean - agg_b.scores.mean
218local winner = "none"
219if welch.significant then
220 winner = delta > 0 and "a" or "b"
221end
222
223-- Build summary text
224local parts = {{}}
225if welch.significant then
226 local w, l, d = strategy_a, strategy_b, delta
227 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
228 parts[#parts + 1] = string.format(
229 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
230 w, l, d, math.abs(welch.t_stat), welch.df
231 )
232else
233 parts[#parts + 1] = string.format(
234 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
235 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
236 )
237end
238if agg_a.pass_rate and agg_b.pass_rate then
239 local dp = agg_a.pass_rate - agg_b.pass_rate
240 if math.abs(dp) > 1e-9 then
241 local h = dp > 0 and strategy_a or strategy_b
242 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
243 else
244 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
245 end
246end
247
248return {{
249 a = {{
250 eval_id = "{eval_id_a}",
251 strategy = strategy_a,
252 scores = agg_a.scores,
253 pass_rate = agg_a.pass_rate,
254 pass_at_1 = agg_a.pass_at_1,
255 ci_95 = agg_a.ci_95,
256 }},
257 b = {{
258 eval_id = "{eval_id_b}",
259 strategy = strategy_b,
260 scores = agg_b.scores,
261 pass_rate = agg_b.pass_rate,
262 pass_at_1 = agg_b.pass_at_1,
263 ci_95 = agg_b.ci_95,
264 }},
265 comparison = {{
266 delta_mean = delta,
267 welch_t = {{
268 t_stat = welch.t_stat,
269 df = welch.df,
270 significant = welch.significant,
271 direction = welch.direction,
272 }},
273 winner = winner,
274 summary = table.concat(parts, " "),
275 }},
276}}
277"#,
278 result_a_escaped = escape_for_lua_sq(&result_a),
279 result_b_escaped = escape_for_lua_sq(&result_b),
280 eval_id_a = eval_id_a,
281 eval_id_b = eval_id_b,
282 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
283 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
284 );
285
286 let ctx = serde_json::Value::Null;
287 let raw_result = self.start_and_tick(lua_code, ctx, None).await?;
288
289 save_compare_result(eval_id_a, eval_id_b, &raw_result);
291
292 Ok(raw_result)
293 }
294}