1use super::eval_store::{
2 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3 save_eval_result, splice_response_string,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::AppService;
8
/// Lua prelude prepended to every generated eval script.
///
/// Defines a global `std` table with `json`, `fs` and `time` sub-tables. The JSON and time
/// entries alias the host functions `alc.json_decode`, `alc.json_encode` and `alc.time`; the
/// `fs` helpers are built on Lua's `io` library.
///
/// `std.fs.is_file` probes with a zero-byte read because `io.open` succeeds on directories on
/// POSIX systems; only the subsequent read reports the error. `std.fs.read` raises instead of
/// returning `nil` when the read itself fails.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content, read_err = f:read("*a")
      f:close()
      if content == nil then
        error("std.fs.read: " .. (read_err or path), 2)
      end
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if not f then return false end
      -- A zero-byte read yields an error message for directories and none for
      -- regular files (including empty ones, which just report EOF).
      local _, read_err = f:read(0)
      f:close()
      return read_err == nil
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
36
37impl AppService {
38 pub async fn eval(
52 &self,
53 scenario: Option<String>,
54 scenario_file: Option<String>,
55 scenario_name: Option<String>,
56 strategy: &str,
57 strategy_opts: Option<serde_json::Value>,
58 auto_card: bool,
59 ) -> Result<String, String> {
60 let app_dir = self.log_config.app_dir();
62 if !is_package_installed(&app_dir, "evalframe") {
63 self.auto_install_bundled_packages().await?;
64 if !is_package_installed(&app_dir, "evalframe") {
65 return Err(
66 "Package 'evalframe' not found after installing bundled collection. \
67 Use alc_pkg_install to install it manually."
68 .into(),
69 );
70 }
71 }
72
73 let scenario_code =
74 resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
75
76 let opts_lua = match &strategy_opts {
78 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
79 _ => "nil".to_string(),
80 };
81
82 let auto_card_lua = if auto_card { "true" } else { "false" };
83
84 let wrapped = format!(
88 r#"{std_shim}
89
90local scenario = (function()
91{scenario_code}
92end)()
93
94return alc.eval(scenario, "{strategy}", {{
95 strategy_opts = {opts_lua},
96 auto_card = {auto_card_lua},
97}})
98"#,
99 std_shim = STD_SHIM,
100 );
101
102 let ctx = serde_json::Value::Null;
103 let result = self
104 .start_and_tick(wrapped, ctx, Some(strategy), vec![], vec![])
105 .await?;
106
107 let mut save_warning: Option<String> = None;
110 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
111 match parsed.get("status").and_then(|s| s.as_str()) {
112 Some("completed") => {
113 if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
114 save_warning = Some(e);
115 }
116 }
117 Some("needs_response") => {
118 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
119 if let Ok(mut map) = self.eval_sessions.lock() {
120 map.insert(sid.to_string(), strategy.to_string());
121 }
122 }
123 }
124 _ => {}
125 }
126 }
127
128 match save_warning {
129 Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
130 None => Ok(result),
131 }
132 }
133
134 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
136 let dir = evals_dir(&self.log_config.app_dir());
137 list_eval_history(&dir, strategy, limit)
138 }
139
140 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
142 let evals_dir = evals_dir(&self.log_config.app_dir());
143 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
144 .map_err(|e| format!("Invalid eval_id: {e}"))?;
145 if !path.exists() {
146 return Err(format!("Eval result not found: {eval_id}"));
147 }
148 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
149 }
150
151 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
160 let app_dir = self.log_config.app_dir();
161 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
163 {
164 let dir = evals_dir(&app_dir);
165 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
166 if cached_path.exists() {
167 return std::fs::read_to_string(&*cached_path)
168 .map_err(|e| format!("Failed to read cached comparison: {e}"));
169 }
170 }
171 }
172
173 if !is_package_installed(&app_dir, "evalframe") {
175 self.auto_install_bundled_packages().await?;
176 if !is_package_installed(&app_dir, "evalframe") {
177 return Err(
178 "Package 'evalframe' not found after installing bundled collection. \
179 Use alc_pkg_install to install it manually."
180 .into(),
181 );
182 }
183 }
184
185 let result_a = self.eval_detail(eval_id_a)?;
186 let result_b = self.eval_detail(eval_id_b)?;
187
188 let lua_code = format!(
191 r#"{std_shim}
192
193local stats = require("evalframe.eval.stats")
194
195local result_a = alc.json_decode('{result_a_escaped}')
196local result_b = alc.json_decode('{result_b_escaped}')
197
198local agg_a = result_a.result and result_a.result.aggregated
199local agg_b = result_b.result and result_b.result.aggregated
200
201if not agg_a or not agg_a.scores then
202 error("No aggregated scores in {eval_id_a}")
203end
204if not agg_b or not agg_b.scores then
205 error("No aggregated scores in {eval_id_b}")
206end
207
208local welch = stats.welch_t(agg_a.scores, agg_b.scores)
209
210local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
211local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
212
213local delta = agg_a.scores.mean - agg_b.scores.mean
214local winner = "none"
215if welch.significant then
216 winner = delta > 0 and "a" or "b"
217end
218
219-- Build summary text
220local parts = {{}}
221if welch.significant then
222 local w, l, d = strategy_a, strategy_b, delta
223 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
224 parts[#parts + 1] = string.format(
225 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
226 w, l, d, math.abs(welch.t_stat), welch.df
227 )
228else
229 parts[#parts + 1] = string.format(
230 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
231 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
232 )
233end
234if agg_a.pass_rate and agg_b.pass_rate then
235 local dp = agg_a.pass_rate - agg_b.pass_rate
236 if math.abs(dp) > 1e-9 then
237 local h = dp > 0 and strategy_a or strategy_b
238 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
239 else
240 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
241 end
242end
243
244return {{
245 a = {{
246 eval_id = "{eval_id_a}",
247 strategy = strategy_a,
248 scores = agg_a.scores,
249 pass_rate = agg_a.pass_rate,
250 pass_at_1 = agg_a.pass_at_1,
251 ci_95 = agg_a.ci_95,
252 }},
253 b = {{
254 eval_id = "{eval_id_b}",
255 strategy = strategy_b,
256 scores = agg_b.scores,
257 pass_rate = agg_b.pass_rate,
258 pass_at_1 = agg_b.pass_at_1,
259 ci_95 = agg_b.ci_95,
260 }},
261 comparison = {{
262 delta_mean = delta,
263 welch_t = {{
264 t_stat = welch.t_stat,
265 df = welch.df,
266 significant = welch.significant,
267 direction = welch.direction,
268 }},
269 winner = winner,
270 summary = table.concat(parts, " "),
271 }},
272}}
273"#,
274 result_a_escaped = escape_for_lua_sq(&result_a),
275 result_b_escaped = escape_for_lua_sq(&result_b),
276 eval_id_a = eval_id_a,
277 eval_id_b = eval_id_b,
278 std_shim = STD_SHIM,
279 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
280 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
281 );
282
283 let ctx = serde_json::Value::Null;
284 let raw_result = self
285 .start_and_tick(lua_code, ctx, None, vec![], vec![])
286 .await?;
287
288 match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
292 Ok(()) => Ok(raw_result),
293 Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
294 }
295 }
296}