1use super::eval_store::{
2 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
3 save_eval_result, splice_response_string,
4};
5use super::path::ContainedPath;
6use super::resolve::{is_package_installed, resolve_scenario_code};
7use super::run::normalize_stringified_json_object;
8use super::AppService;
9
/// Lua prelude prepended to every script this module generates.
///
/// It defines a global `std` table built on the host `alc` API and Lua's `io`
/// library:
///
/// * `std.json.decode` / `std.json.encode` alias `alc.json_decode` /
///   `alc.json_encode`.
/// * `std.fs.read(path)` returns the full contents of a file and raises a Lua
///   error (reported at the caller's level) when the file cannot be opened.
/// * `std.fs.is_file(path)` reports whether the path can be opened for reading.
/// * `std.time.now` aliases `alc.time`.
///
/// NOTE(review): presumably this exists because the `evalframe` package expects
/// a `std` global that the host runtime does not provide — confirm against the
/// package sources.
const STD_SHIM: &str = r#"
std = {
  json = {
    decode = alc.json_decode,
    encode = alc.json_encode,
  },
  fs = {
    read = function(path)
      local f, err = io.open(path, "r")
      if not f then error("std.fs.read: " .. (err or path), 2) end
      local content = f:read("*a")
      f:close()
      return content
    end,
    is_file = function(path)
      local f = io.open(path, "r")
      if f then f:close(); return true end
      return false
    end,
  },
  time = {
    now = alc.time,
  },
}
"#;
37
38impl AppService {
39 pub async fn eval(
53 &self,
54 scenario: Option<String>,
55 scenario_file: Option<String>,
56 scenario_name: Option<String>,
57 strategy: &str,
58 strategy_opts: Option<serde_json::Value>,
59 auto_card: bool,
60 ) -> Result<String, String> {
61 let app_dir = self.log_config.app_dir();
63 if !is_package_installed(&app_dir, "evalframe") {
64 self.auto_install_bundled_packages().await?;
65 if !is_package_installed(&app_dir, "evalframe") {
66 return Err(
67 "Package 'evalframe' not found after installing bundled collection. \
68 Use alc_pkg_install to install it manually."
69 .into(),
70 );
71 }
72 }
73
74 let scenario_code =
75 resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
76
77 let strategy_opts = strategy_opts.map(normalize_stringified_json_object);
79 let opts_lua = match &strategy_opts {
80 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
81 _ => "nil".to_string(),
82 };
83
84 let auto_card_lua = if auto_card { "true" } else { "false" };
85
86 let wrapped = format!(
90 r#"{std_shim}
91
92local scenario = (function()
93{scenario_code}
94end)()
95
96return alc.eval(scenario, "{strategy}", {{
97 strategy_opts = {opts_lua},
98 auto_card = {auto_card_lua},
99}})
100"#,
101 std_shim = STD_SHIM,
102 );
103
104 let ctx = serde_json::Value::Null;
105 let result = self
106 .start_and_tick(wrapped, ctx, Some(strategy), vec![], vec![])
107 .await?;
108
109 let mut save_warning: Option<String> = None;
112 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
113 match parsed.get("status").and_then(|s| s.as_str()) {
114 Some("completed") => {
115 if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
116 save_warning = Some(e);
117 }
118 }
119 Some("needs_response") => {
120 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
121 if let Ok(mut map) = self.eval_sessions.lock() {
122 map.insert(sid.to_string(), strategy.to_string());
123 }
124 }
125 }
126 _ => {}
127 }
128 }
129
130 match save_warning {
131 Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
132 None => Ok(result),
133 }
134 }
135
136 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
138 let dir = evals_dir(&self.log_config.app_dir());
139 list_eval_history(&dir, strategy, limit)
140 }
141
142 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
144 let evals_dir = evals_dir(&self.log_config.app_dir());
145 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
146 .map_err(|e| format!("Invalid eval_id: {e}"))?;
147 if !path.exists() {
148 return Err(format!("Eval result not found: {eval_id}"));
149 }
150 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
151 }
152
153 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
162 let app_dir = self.log_config.app_dir();
163 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
165 {
166 let dir = evals_dir(&app_dir);
167 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
168 if cached_path.exists() {
169 return std::fs::read_to_string(&*cached_path)
170 .map_err(|e| format!("Failed to read cached comparison: {e}"));
171 }
172 }
173 }
174
175 if !is_package_installed(&app_dir, "evalframe") {
177 self.auto_install_bundled_packages().await?;
178 if !is_package_installed(&app_dir, "evalframe") {
179 return Err(
180 "Package 'evalframe' not found after installing bundled collection. \
181 Use alc_pkg_install to install it manually."
182 .into(),
183 );
184 }
185 }
186
187 let result_a = self.eval_detail(eval_id_a)?;
188 let result_b = self.eval_detail(eval_id_b)?;
189
190 let lua_code = format!(
193 r#"{std_shim}
194
195local stats = require("evalframe.eval.stats")
196
197local result_a = alc.json_decode('{result_a_escaped}')
198local result_b = alc.json_decode('{result_b_escaped}')
199
200local agg_a = result_a.result and result_a.result.aggregated
201local agg_b = result_b.result and result_b.result.aggregated
202
203if not agg_a or not agg_a.scores then
204 error("No aggregated scores in {eval_id_a}")
205end
206if not agg_b or not agg_b.scores then
207 error("No aggregated scores in {eval_id_b}")
208end
209
210local welch = stats.welch_t(agg_a.scores, agg_b.scores)
211
212local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
213local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
214
215local delta = agg_a.scores.mean - agg_b.scores.mean
216local winner = "none"
217if welch.significant then
218 winner = delta > 0 and "a" or "b"
219end
220
221-- Build summary text
222local parts = {{}}
223if welch.significant then
224 local w, l, d = strategy_a, strategy_b, delta
225 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
226 parts[#parts + 1] = string.format(
227 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
228 w, l, d, math.abs(welch.t_stat), welch.df
229 )
230else
231 parts[#parts + 1] = string.format(
232 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
233 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
234 )
235end
236if agg_a.pass_rate and agg_b.pass_rate then
237 local dp = agg_a.pass_rate - agg_b.pass_rate
238 if math.abs(dp) > 1e-9 then
239 local h = dp > 0 and strategy_a or strategy_b
240 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
241 else
242 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
243 end
244end
245
246return {{
247 a = {{
248 eval_id = "{eval_id_a}",
249 strategy = strategy_a,
250 scores = agg_a.scores,
251 pass_rate = agg_a.pass_rate,
252 pass_at_1 = agg_a.pass_at_1,
253 ci_95 = agg_a.ci_95,
254 }},
255 b = {{
256 eval_id = "{eval_id_b}",
257 strategy = strategy_b,
258 scores = agg_b.scores,
259 pass_rate = agg_b.pass_rate,
260 pass_at_1 = agg_b.pass_at_1,
261 ci_95 = agg_b.ci_95,
262 }},
263 comparison = {{
264 delta_mean = delta,
265 welch_t = {{
266 t_stat = welch.t_stat,
267 df = welch.df,
268 significant = welch.significant,
269 direction = welch.direction,
270 }},
271 winner = winner,
272 summary = table.concat(parts, " "),
273 }},
274}}
275"#,
276 result_a_escaped = escape_for_lua_sq(&result_a),
277 result_b_escaped = escape_for_lua_sq(&result_b),
278 eval_id_a = eval_id_a,
279 eval_id_b = eval_id_b,
280 std_shim = STD_SHIM,
281 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
282 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
283 );
284
285 let ctx = serde_json::Value::Null;
286 let raw_result = self
287 .start_and_tick(lua_code, ctx, None, vec![], vec![])
288 .await?;
289
290 match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
294 Ok(()) => Ok(raw_result),
295 Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
296 }
297 }
298}