1use super::eval_store::{
2 escape_for_lua_sq, evals_dir, extract_strategy_from_id, save_compare_result, save_eval_result,
3};
4use super::path::ContainedPath;
5use super::resolve::{is_package_installed, resolve_scenario_code};
6use super::AppService;
7
8impl AppService {
9 pub async fn eval(
25 &self,
26 scenario: Option<String>,
27 scenario_file: Option<String>,
28 scenario_name: Option<String>,
29 strategy: &str,
30 strategy_opts: Option<serde_json::Value>,
31 ) -> Result<String, String> {
32 if !is_package_installed("evalframe") {
34 self.auto_install_bundled_packages().await?;
35 if !is_package_installed("evalframe") {
36 return Err(
37 "Package 'evalframe' not found after installing bundled collection. \
38 Use alc_pkg_install to install it manually."
39 .into(),
40 );
41 }
42 }
43
44 let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name)?;
45
46 let opts_lua = match &strategy_opts {
48 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
49 _ => "{}".to_string(),
50 };
51
52 let wrapped = format!(
61 r#"
62std = {{
63 json = {{
64 decode = alc.json_decode,
65 encode = alc.json_encode,
66 }},
67 fs = {{
68 read = function(path)
69 local f, err = io.open(path, "r")
70 if not f then error("std.fs.read: " .. (err or path), 2) end
71 local content = f:read("*a")
72 f:close()
73 return content
74 end,
75 is_file = function(path)
76 local f = io.open(path, "r")
77 if f then f:close(); return true end
78 return false
79 end,
80 }},
81 time = {{
82 now = alc.time,
83 }},
84}}
85
86local ef = require("evalframe")
87
88-- Load scenario (bindings + cases, no provider)
89local spec = (function()
90{scenario_code}
91end)()
92
93-- Inject strategy as provider
94spec.provider = ef.providers.algocline {{
95 strategy = "{strategy}",
96 opts = {opts_lua},
97}}
98
99-- Build and run suite
100local s = ef.suite "eval" (spec)
101local report = s:run()
102return report:to_table()
103"#
104 );
105
106 let ctx = serde_json::Value::Null;
107 let result = self.start_and_tick(wrapped, ctx, Some(strategy)).await?;
108
109 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
113 match parsed.get("status").and_then(|s| s.as_str()) {
114 Some("completed") => {
115 save_eval_result(strategy, &result);
116 }
117 Some("needs_response") => {
118 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
119 if let Ok(mut map) = self.eval_sessions.lock() {
120 map.insert(sid.to_string(), strategy.to_string());
121 }
122 }
123 }
124 _ => {}
125 }
126 }
127
128 Ok(result)
129 }
130
131 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
133 let evals_dir = evals_dir()?;
134 if !evals_dir.exists() {
135 return Ok(serde_json::json!({ "evals": [] }).to_string());
136 }
137
138 let mut entries: Vec<serde_json::Value> = Vec::new();
139
140 let read_dir =
141 std::fs::read_dir(&evals_dir).map_err(|e| format!("Failed to read evals dir: {e}"))?;
142
143 for entry in read_dir.flatten() {
144 let path = entry.path();
145 if path.extension().and_then(|e| e.to_str()) != Some("json") {
146 continue;
147 }
148 if path
150 .file_name()
151 .and_then(|n| n.to_str())
152 .is_some_and(|n| n.contains(".meta."))
153 {
154 continue;
155 }
156
157 let stem = match path.file_stem().and_then(|s| s.to_str()) {
161 Some(s) => s,
162 None => continue,
163 };
164 let meta_path = match ContainedPath::child(&evals_dir, &format!("{stem}.meta.json")) {
165 Ok(p) => p,
166 Err(_) => continue,
167 };
168 let meta = if meta_path.exists() {
169 std::fs::read_to_string(&*meta_path)
170 .ok()
171 .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
172 } else {
173 None
174 };
175
176 if let Some(meta) = meta {
177 if let Some(filter) = strategy {
179 if meta.get("strategy").and_then(|s| s.as_str()) != Some(filter) {
180 continue;
181 }
182 }
183 entries.push(meta);
184 }
185 }
186
187 entries.sort_by(|a, b| {
189 let ts_a = a
190 .get("timestamp")
191 .and_then(serde_json::Value::as_u64)
192 .unwrap_or(0);
193 let ts_b = b
194 .get("timestamp")
195 .and_then(serde_json::Value::as_u64)
196 .unwrap_or(0);
197 ts_b.cmp(&ts_a)
198 });
199 entries.truncate(limit);
200
201 Ok(serde_json::json!({ "evals": entries }).to_string())
202 }
203
204 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
206 let evals_dir = evals_dir()?;
207 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
208 .map_err(|e| format!("Invalid eval_id: {e}"))?;
209 if !path.exists() {
210 return Err(format!("Eval result not found: {eval_id}"));
211 }
212 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
213 }
214
215 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
224 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
226 if let Ok(dir) = evals_dir() {
227 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
228 if cached_path.exists() {
229 return std::fs::read_to_string(&*cached_path)
230 .map_err(|e| format!("Failed to read cached comparison: {e}"));
231 }
232 }
233 }
234
235 if !is_package_installed("evalframe") {
237 self.auto_install_bundled_packages().await?;
238 if !is_package_installed("evalframe") {
239 return Err(
240 "Package 'evalframe' not found after installing bundled collection. \
241 Use alc_pkg_install to install it manually."
242 .into(),
243 );
244 }
245 }
246
247 let result_a = self.eval_detail(eval_id_a)?;
248 let result_b = self.eval_detail(eval_id_b)?;
249
250 let lua_code = format!(
253 r#"
254std = {{
255 json = {{
256 decode = alc.json_decode,
257 encode = alc.json_encode,
258 }},
259 fs = {{ read = function() end, is_file = function() return false end }},
260 time = {{ now = alc.time }},
261}}
262
263local stats = require("evalframe.eval.stats")
264
265local result_a = alc.json_decode('{result_a_escaped}')
266local result_b = alc.json_decode('{result_b_escaped}')
267
268local agg_a = result_a.result and result_a.result.aggregated
269local agg_b = result_b.result and result_b.result.aggregated
270
271if not agg_a or not agg_a.scores then
272 error("No aggregated scores in {eval_id_a}")
273end
274if not agg_b or not agg_b.scores then
275 error("No aggregated scores in {eval_id_b}")
276end
277
278local welch = stats.welch_t(agg_a.scores, agg_b.scores)
279
280local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
281local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
282
283local delta = agg_a.scores.mean - agg_b.scores.mean
284local winner = "none"
285if welch.significant then
286 winner = delta > 0 and "a" or "b"
287end
288
289-- Build summary text
290local parts = {{}}
291if welch.significant then
292 local w, l, d = strategy_a, strategy_b, delta
293 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
294 parts[#parts + 1] = string.format(
295 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
296 w, l, d, math.abs(welch.t_stat), welch.df
297 )
298else
299 parts[#parts + 1] = string.format(
300 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
301 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
302 )
303end
304if agg_a.pass_rate and agg_b.pass_rate then
305 local dp = agg_a.pass_rate - agg_b.pass_rate
306 if math.abs(dp) > 1e-9 then
307 local h = dp > 0 and strategy_a or strategy_b
308 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
309 else
310 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
311 end
312end
313
314return {{
315 a = {{
316 eval_id = "{eval_id_a}",
317 strategy = strategy_a,
318 scores = agg_a.scores,
319 pass_rate = agg_a.pass_rate,
320 pass_at_1 = agg_a.pass_at_1,
321 ci_95 = agg_a.ci_95,
322 }},
323 b = {{
324 eval_id = "{eval_id_b}",
325 strategy = strategy_b,
326 scores = agg_b.scores,
327 pass_rate = agg_b.pass_rate,
328 pass_at_1 = agg_b.pass_at_1,
329 ci_95 = agg_b.ci_95,
330 }},
331 comparison = {{
332 delta_mean = delta,
333 welch_t = {{
334 t_stat = welch.t_stat,
335 df = welch.df,
336 significant = welch.significant,
337 direction = welch.direction,
338 }},
339 winner = winner,
340 summary = table.concat(parts, " "),
341 }},
342}}
343"#,
344 result_a_escaped = escape_for_lua_sq(&result_a),
345 result_b_escaped = escape_for_lua_sq(&result_b),
346 eval_id_a = eval_id_a,
347 eval_id_b = eval_id_b,
348 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
349 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
350 );
351
352 let ctx = serde_json::Value::Null;
353 let raw_result = self.start_and_tick(lua_code, ctx, None).await?;
354
355 save_compare_result(eval_id_a, eval_id_b, &raw_result);
357
358 Ok(raw_result)
359 }
360}