1use super::eval_store::{
2 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history,
3 save_compare_result, save_eval_result,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
/// Lua prelude prepended to every chunk generated in this file.
///
/// It defines a global `std` table that maps a small standard-library-style
/// surface onto what the host runtime exposes:
///
/// * `std.json.decode` / `std.json.encode` alias `alc.json_decode` / `alc.json_encode`.
/// * `std.fs.read(path)` returns the whole file content via `io.open`, raising a
///   Lua error prefixed with `std.fs.read: ` when the file cannot be opened.
/// * `std.fs.is_file(path)` reports whether `io.open(path, "r")` succeeds.
/// * `std.time.now` aliases `alc.time`.
///
/// NOTE(review): presumably this exists because the `evalframe` package expects
/// a `std` global — confirm against the package sources.
const STD_SHIM: &str = r#"
std = {
    json = {
        decode = alc.json_decode,
        encode = alc.json_encode,
    },
    fs = {
        read = function(path)
            local f, err = io.open(path, "r")
            if not f then error("std.fs.read: " .. (err or path), 2) end
            local content = f:read("*a")
            f:close()
            return content
        end,
        is_file = function(path)
            local f = io.open(path, "r")
            if f then f:close(); return true end
            return false
        end,
    },
    time = {
        now = alc.time,
    },
}
"#;
36
37impl AppService {
38 pub async fn eval(
52 &self,
53 scenario: Option<String>,
54 scenario_file: Option<String>,
55 scenario_name: Option<String>,
56 strategy: &str,
57 strategy_opts: Option<serde_json::Value>,
58 auto_card: bool,
59 ) -> Result<String, String> {
60 if !is_package_installed("evalframe") {
62 self.auto_install_bundled_packages().await?;
63 if !is_package_installed("evalframe") {
64 return Err(
65 "Package 'evalframe' not found after installing bundled collection. \
66 Use alc_pkg_install to install it manually."
67 .into(),
68 );
69 }
70 }
71
72 let scenario_code =
73 resolve_scenario_code(scenario, scenario_file, scenario_name.clone())?;
74
75 let opts_lua = match &strategy_opts {
77 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
78 _ => "nil".to_string(),
79 };
80
81 let auto_card_lua = if auto_card { "true" } else { "false" };
82
83 let wrapped = format!(
87 r#"{std_shim}
88
89local scenario = (function()
90{scenario_code}
91end)()
92
93return alc.eval(scenario, "{strategy}", {{
94 strategy_opts = {opts_lua},
95 auto_card = {auto_card_lua},
96}})
97"#,
98 std_shim = STD_SHIM,
99 );
100
101 let ctx = serde_json::Value::Null;
102 let result = self
103 .start_and_tick(wrapped, ctx, Some(strategy), vec![])
104 .await?;
105
106 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
109 match parsed.get("status").and_then(|s| s.as_str()) {
110 Some("completed") => {
111 save_eval_result(strategy, &result);
112 }
113 Some("needs_response") => {
114 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
115 if let Ok(mut map) = self.eval_sessions.lock() {
116 map.insert(sid.to_string(), strategy.to_string());
117 }
118 }
119 }
120 _ => {}
121 }
122 }
123
124 Ok(result)
125 }
126
127 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
129 let dir = evals_dir()?;
130 list_eval_history(&dir, strategy, limit)
131 }
132
133 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
135 let evals_dir = evals_dir()?;
136 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
137 .map_err(|e| format!("Invalid eval_id: {e}"))?;
138 if !path.exists() {
139 return Err(format!("Eval result not found: {eval_id}"));
140 }
141 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
142 }
143
144 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
153 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
155 if let Ok(dir) = evals_dir() {
156 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
157 if cached_path.exists() {
158 return std::fs::read_to_string(&*cached_path)
159 .map_err(|e| format!("Failed to read cached comparison: {e}"));
160 }
161 }
162 }
163
164 if !is_package_installed("evalframe") {
166 self.auto_install_bundled_packages().await?;
167 if !is_package_installed("evalframe") {
168 return Err(
169 "Package 'evalframe' not found after installing bundled collection. \
170 Use alc_pkg_install to install it manually."
171 .into(),
172 );
173 }
174 }
175
176 let result_a = self.eval_detail(eval_id_a)?;
177 let result_b = self.eval_detail(eval_id_b)?;
178
179 let lua_code = format!(
182 r#"{std_shim}
183
184local stats = require("evalframe.eval.stats")
185
186local result_a = alc.json_decode('{result_a_escaped}')
187local result_b = alc.json_decode('{result_b_escaped}')
188
189local agg_a = result_a.result and result_a.result.aggregated
190local agg_b = result_b.result and result_b.result.aggregated
191
192if not agg_a or not agg_a.scores then
193 error("No aggregated scores in {eval_id_a}")
194end
195if not agg_b or not agg_b.scores then
196 error("No aggregated scores in {eval_id_b}")
197end
198
199local welch = stats.welch_t(agg_a.scores, agg_b.scores)
200
201local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
202local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
203
204local delta = agg_a.scores.mean - agg_b.scores.mean
205local winner = "none"
206if welch.significant then
207 winner = delta > 0 and "a" or "b"
208end
209
210-- Build summary text
211local parts = {{}}
212if welch.significant then
213 local w, l, d = strategy_a, strategy_b, delta
214 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
215 parts[#parts + 1] = string.format(
216 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
217 w, l, d, math.abs(welch.t_stat), welch.df
218 )
219else
220 parts[#parts + 1] = string.format(
221 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
222 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
223 )
224end
225if agg_a.pass_rate and agg_b.pass_rate then
226 local dp = agg_a.pass_rate - agg_b.pass_rate
227 if math.abs(dp) > 1e-9 then
228 local h = dp > 0 and strategy_a or strategy_b
229 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
230 else
231 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
232 end
233end
234
235return {{
236 a = {{
237 eval_id = "{eval_id_a}",
238 strategy = strategy_a,
239 scores = agg_a.scores,
240 pass_rate = agg_a.pass_rate,
241 pass_at_1 = agg_a.pass_at_1,
242 ci_95 = agg_a.ci_95,
243 }},
244 b = {{
245 eval_id = "{eval_id_b}",
246 strategy = strategy_b,
247 scores = agg_b.scores,
248 pass_rate = agg_b.pass_rate,
249 pass_at_1 = agg_b.pass_at_1,
250 ci_95 = agg_b.ci_95,
251 }},
252 comparison = {{
253 delta_mean = delta,
254 welch_t = {{
255 t_stat = welch.t_stat,
256 df = welch.df,
257 significant = welch.significant,
258 direction = welch.direction,
259 }},
260 winner = winner,
261 summary = table.concat(parts, " "),
262 }},
263}}
264"#,
265 result_a_escaped = escape_for_lua_sq(&result_a),
266 result_b_escaped = escape_for_lua_sq(&result_b),
267 eval_id_a = eval_id_a,
268 eval_id_b = eval_id_b,
269 std_shim = STD_SHIM,
270 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
271 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
272 );
273
274 let ctx = serde_json::Value::Null;
275 let raw_result = self.start_and_tick(lua_code, ctx, None, vec![]).await?;
276
277 save_compare_result(eval_id_a, eval_id_b, &raw_result);
279
280 Ok(raw_result)
281 }
282}