1use super::eval_store::{
2 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3 save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
/// Lua prelude prepended to every chunk this module generates.
///
/// It defines a global `std` table with three sub-tables:
/// - `std.json.decode` / `std.json.encode` alias `alc.json_decode` / `alc.json_encode`.
/// - `std.fs.read(path)` returns the whole file content via `io.open`, raising a Lua
///   error prefixed with `std.fs.read: ` when the file cannot be opened;
///   `std.fs.is_file(path)` reports whether `io.open(path, "r")` succeeds.
/// - `std.time.now` aliases `alc.time`.
///
/// NOTE(review): presumably this exists because the `evalframe` package expects a
/// `std` global — confirm against the package.
const STD_SHIM: &str = r#"
std = {
    json = {
        decode = alc.json_decode,
        encode = alc.json_encode,
    },
    fs = {
        read = function(path)
            local f, err = io.open(path, "r")
            if not f then error("std.fs.read: " .. (err or path), 2) end
            local content = f:read("*a")
            f:close()
            return content
        end,
        is_file = function(path)
            local f = io.open(path, "r")
            if f then f:close(); return true end
            return false
        end,
    },
    time = {
        now = alc.time,
    },
}
"#;
36
37impl AppService {
38 pub async fn eval(
52 &self,
53 scenario: Option<String>,
54 scenario_file: Option<String>,
55 scenario_name: Option<String>,
56 strategy: &str,
57 strategy_opts: Option<serde_json::Value>,
58 auto_card: bool,
59 ) -> Result<String, String> {
60 if !is_package_installed("evalframe") {
62 self.auto_install_bundled_packages().await?;
63 if !is_package_installed("evalframe") {
64 return Err(
65 "Package 'evalframe' not found after installing bundled collection. \
66 Use alc_pkg_install to install it manually."
67 .into(),
68 );
69 }
70 }
71
72 let scenario_code = resolve_scenario_code(scenario, scenario_file, scenario_name.clone())?;
73
74 let opts_lua = match &strategy_opts {
76 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
77 _ => "nil".to_string(),
78 };
79
80 let auto_card_lua = if auto_card { "true" } else { "false" };
81
82 let wrapped = format!(
86 r#"{std_shim}
87
88local scenario = (function()
89{scenario_code}
90end)()
91
92return alc.eval(scenario, "{strategy}", {{
93 strategy_opts = {opts_lua},
94 auto_card = {auto_card_lua},
95}})
96"#,
97 std_shim = STD_SHIM,
98 );
99
100 let ctx = serde_json::Value::Null;
101 let result = self
102 .start_and_tick(wrapped, ctx, Some(strategy), vec![], vec![])
103 .await?;
104
105 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
108 match parsed.get("status").and_then(|s| s.as_str()) {
109 Some("completed") => {
110 save_eval_result(strategy, &result);
111 }
112 Some("needs_response") => {
113 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
114 if let Ok(mut map) = self.eval_sessions.lock() {
115 map.insert(sid.to_string(), strategy.to_string());
116 }
117 }
118 }
119 _ => {}
120 }
121 }
122
123 Ok(result)
124 }
125
126 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
128 let dir = evals_dir()?;
129 list_eval_history(&dir, strategy, limit)
130 }
131
132 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
134 let evals_dir = evals_dir()?;
135 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
136 .map_err(|e| format!("Invalid eval_id: {e}"))?;
137 if !path.exists() {
138 return Err(format!("Eval result not found: {eval_id}"));
139 }
140 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
141 }
142
143 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
152 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
154 if let Ok(dir) = evals_dir() {
155 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
156 if cached_path.exists() {
157 return std::fs::read_to_string(&*cached_path)
158 .map_err(|e| format!("Failed to read cached comparison: {e}"));
159 }
160 }
161 }
162
163 if !is_package_installed("evalframe") {
165 self.auto_install_bundled_packages().await?;
166 if !is_package_installed("evalframe") {
167 return Err(
168 "Package 'evalframe' not found after installing bundled collection. \
169 Use alc_pkg_install to install it manually."
170 .into(),
171 );
172 }
173 }
174
175 let result_a = self.eval_detail(eval_id_a)?;
176 let result_b = self.eval_detail(eval_id_b)?;
177
178 let lua_code = format!(
181 r#"{std_shim}
182
183local stats = require("evalframe.eval.stats")
184
185local result_a = alc.json_decode('{result_a_escaped}')
186local result_b = alc.json_decode('{result_b_escaped}')
187
188local agg_a = result_a.result and result_a.result.aggregated
189local agg_b = result_b.result and result_b.result.aggregated
190
191if not agg_a or not agg_a.scores then
192 error("No aggregated scores in {eval_id_a}")
193end
194if not agg_b or not agg_b.scores then
195 error("No aggregated scores in {eval_id_b}")
196end
197
198local welch = stats.welch_t(agg_a.scores, agg_b.scores)
199
200local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
201local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
202
203local delta = agg_a.scores.mean - agg_b.scores.mean
204local winner = "none"
205if welch.significant then
206 winner = delta > 0 and "a" or "b"
207end
208
209-- Build summary text
210local parts = {{}}
211if welch.significant then
212 local w, l, d = strategy_a, strategy_b, delta
213 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
214 parts[#parts + 1] = string.format(
215 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
216 w, l, d, math.abs(welch.t_stat), welch.df
217 )
218else
219 parts[#parts + 1] = string.format(
220 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
221 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
222 )
223end
224if agg_a.pass_rate and agg_b.pass_rate then
225 local dp = agg_a.pass_rate - agg_b.pass_rate
226 if math.abs(dp) > 1e-9 then
227 local h = dp > 0 and strategy_a or strategy_b
228 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
229 else
230 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
231 end
232end
233
234return {{
235 a = {{
236 eval_id = "{eval_id_a}",
237 strategy = strategy_a,
238 scores = agg_a.scores,
239 pass_rate = agg_a.pass_rate,
240 pass_at_1 = agg_a.pass_at_1,
241 ci_95 = agg_a.ci_95,
242 }},
243 b = {{
244 eval_id = "{eval_id_b}",
245 strategy = strategy_b,
246 scores = agg_b.scores,
247 pass_rate = agg_b.pass_rate,
248 pass_at_1 = agg_b.pass_at_1,
249 ci_95 = agg_b.ci_95,
250 }},
251 comparison = {{
252 delta_mean = delta,
253 welch_t = {{
254 t_stat = welch.t_stat,
255 df = welch.df,
256 significant = welch.significant,
257 direction = welch.direction,
258 }},
259 winner = winner,
260 summary = table.concat(parts, " "),
261 }},
262}}
263"#,
264 result_a_escaped = escape_for_lua_sq(&result_a),
265 result_b_escaped = escape_for_lua_sq(&result_b),
266 eval_id_a = eval_id_a,
267 eval_id_b = eval_id_b,
268 std_shim = STD_SHIM,
269 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
270 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
271 );
272
273 let ctx = serde_json::Value::Null;
274 let raw_result = self
275 .start_and_tick(lua_code, ctx, None, vec![], vec![])
276 .await?;
277
278 save_compare_result(eval_id_a, eval_id_b, &raw_result);
280
281 Ok(raw_result)
282 }
283}