1use std::collections::HashMap;
2use std::sync::Arc;
3
4use super::eval_store::{
5 escape_for_lua_sq, evals_dir, extract_strategy_from_id, list_eval_history, save_compare_result,
6 save_eval_result, splice_response_string,
7};
8use super::path::ContainedPath;
9use super::resolve::{is_package_installed, resolve_scenario_code};
10use super::run::normalize_stringified_json_object;
11use super::AppService;
12
/// Lua prelude that defines a global `std` table for the scripts generated in this file.
///
/// The table exposes:
/// - `std.json.decode` / `std.json.encode`, aliased to `alc.json_decode` / `alc.json_encode`;
/// - `std.fs.read(path)`, which returns the whole file via `io.open` + `f:read("*a")` and raises
///   an error prefixed with `std.fs.read: ` when the file cannot be opened;
/// - `std.fs.is_file(path)`, which reports whether `io.open(path, "r")` succeeds;
/// - `std.time.now`, aliased to `alc.time`.
///
/// NOTE(review): presumably the `evalframe` package expects this `std` global — confirm.
/// NOTE(review): `is_file` only probes `io.open`; whether that also succeeds for directories
/// depends on the platform — confirm this is acceptable.
const STD_SHIM: &str = r#"
std = {
 json = {
 decode = alc.json_decode,
 encode = alc.json_encode,
 },
 fs = {
 read = function(path)
 local f, err = io.open(path, "r")
 if not f then error("std.fs.read: " .. (err or path), 2) end
 local content = f:read("*a")
 f:close()
 return content
 end,
 is_file = function(path)
 local f = io.open(path, "r")
 if f then f:close(); return true end
 return false
 end,
 },
 time = {
 now = alc.time,
 },
}
"#;
40
41impl AppService {
42 pub async fn eval(
56 &self,
57 scenario: Option<String>,
58 scenario_file: Option<String>,
59 scenario_name: Option<String>,
60 strategy: &str,
61 strategy_opts: Option<serde_json::Value>,
62 auto_card: bool,
63 ) -> Result<String, String> {
64 let app_dir = self.log_config.app_dir();
66 if !is_package_installed(&app_dir, "evalframe") {
67 self.auto_install_bundled_packages().await?;
68 if !is_package_installed(&app_dir, "evalframe") {
69 return Err(
70 "Package 'evalframe' not found after installing bundled collection. \
71 Use alc_pkg_install to install it manually."
72 .into(),
73 );
74 }
75 }
76
77 let scenario_code =
78 resolve_scenario_code(&app_dir, scenario, scenario_file, scenario_name.clone())?;
79
80 let strategy_opts = strategy_opts.map(normalize_stringified_json_object);
82 let opts_lua = match &strategy_opts {
83 Some(v) if !v.is_null() => format!("alc.json_decode('{}')", v),
84 _ => "nil".to_string(),
85 };
86
87 let auto_card_lua = if auto_card { "true" } else { "false" };
88
89 let wrapped = format!(
93 r#"{std_shim}
94
95local scenario = (function()
96{scenario_code}
97end)()
98
99return alc.eval(scenario, "{strategy}", {{
100 strategy_opts = {opts_lua},
101 auto_card = {auto_card_lua},
102}})
103"#,
104 std_shim = STD_SHIM,
105 );
106
107 let ctx = serde_json::Value::Null;
108 let env_map = Arc::new(HashMap::new());
111 let result = self
112 .start_and_tick(env_map, wrapped, ctx, Some(strategy), vec![], vec![])
113 .await?;
114
115 let mut save_warning: Option<String> = None;
118 if let Ok(parsed) = serde_json::from_str::<serde_json::Value>(&result) {
119 match parsed.get("status").and_then(|s| s.as_str()) {
120 Some("completed") => {
121 if let Err(e) = save_eval_result(&app_dir, strategy, &result) {
122 save_warning = Some(e);
123 }
124 }
125 Some("needs_response") => {
126 if let Some(sid) = parsed.get("session_id").and_then(|s| s.as_str()) {
127 if let Ok(mut map) = self.eval_sessions.lock() {
128 map.insert(sid.to_string(), strategy.to_string());
129 }
130 }
131 }
132 _ => {}
133 }
134 }
135
136 match save_warning {
137 Some(msg) => Ok(splice_response_string(&result, "save_warning", &msg)),
138 None => Ok(result),
139 }
140 }
141
142 pub fn eval_history(&self, strategy: Option<&str>, limit: usize) -> Result<String, String> {
144 let dir = evals_dir(&self.log_config.app_dir());
145 list_eval_history(&dir, strategy, limit)
146 }
147
148 pub fn eval_detail(&self, eval_id: &str) -> Result<String, String> {
150 let evals_dir = evals_dir(&self.log_config.app_dir());
151 let path = ContainedPath::child(&evals_dir, &format!("{eval_id}.json"))
152 .map_err(|e| format!("Invalid eval_id: {e}"))?;
153 if !path.exists() {
154 return Err(format!("Eval result not found: {eval_id}"));
155 }
156 std::fs::read_to_string(&*path).map_err(|e| format!("Failed to read eval: {e}"))
157 }
158
159 pub async fn eval_compare(&self, eval_id_a: &str, eval_id_b: &str) -> Result<String, String> {
168 let app_dir = self.log_config.app_dir();
169 let cache_filename = format!("compare_{eval_id_a}_vs_{eval_id_b}.json");
171 {
172 let dir = evals_dir(&app_dir);
173 if let Ok(cached_path) = ContainedPath::child(&dir, &cache_filename) {
174 if cached_path.exists() {
175 return std::fs::read_to_string(&*cached_path)
176 .map_err(|e| format!("Failed to read cached comparison: {e}"));
177 }
178 }
179 }
180
181 if !is_package_installed(&app_dir, "evalframe") {
183 self.auto_install_bundled_packages().await?;
184 if !is_package_installed(&app_dir, "evalframe") {
185 return Err(
186 "Package 'evalframe' not found after installing bundled collection. \
187 Use alc_pkg_install to install it manually."
188 .into(),
189 );
190 }
191 }
192
193 let result_a = self.eval_detail(eval_id_a)?;
194 let result_b = self.eval_detail(eval_id_b)?;
195
196 let lua_code = format!(
199 r#"{std_shim}
200
201local stats = require("evalframe.eval.stats")
202
203local result_a = alc.json_decode('{result_a_escaped}')
204local result_b = alc.json_decode('{result_b_escaped}')
205
206local agg_a = result_a.result and result_a.result.aggregated
207local agg_b = result_b.result and result_b.result.aggregated
208
209if not agg_a or not agg_a.scores then
210 error("No aggregated scores in {eval_id_a}")
211end
212if not agg_b or not agg_b.scores then
213 error("No aggregated scores in {eval_id_b}")
214end
215
216local welch = stats.welch_t(agg_a.scores, agg_b.scores)
217
218local strategy_a = (result_a.result and result_a.result.name) or "{strategy_a_fallback}"
219local strategy_b = (result_b.result and result_b.result.name) or "{strategy_b_fallback}"
220
221local delta = agg_a.scores.mean - agg_b.scores.mean
222local winner = "none"
223if welch.significant then
224 winner = delta > 0 and "a" or "b"
225end
226
227-- Build summary text
228local parts = {{}}
229if welch.significant then
230 local w, l, d = strategy_a, strategy_b, delta
231 if delta < 0 then w, l, d = strategy_b, strategy_a, -delta end
232 parts[#parts + 1] = string.format(
233 "%s outperforms %s by %.4f (mean score), statistically significant (t=%.3f, df=%.1f).",
234 w, l, d, math.abs(welch.t_stat), welch.df
235 )
236else
237 parts[#parts + 1] = string.format(
238 "No statistically significant difference between %s and %s (t=%.3f, df=%.1f).",
239 strategy_a, strategy_b, math.abs(welch.t_stat), welch.df
240 )
241end
242if agg_a.pass_rate and agg_b.pass_rate then
243 local dp = agg_a.pass_rate - agg_b.pass_rate
244 if math.abs(dp) > 1e-9 then
245 local h = dp > 0 and strategy_a or strategy_b
246 parts[#parts + 1] = string.format("Pass rate: %s +%.1fpp.", h, math.abs(dp) * 100)
247 else
248 parts[#parts + 1] = string.format("Pass rate: identical (%.1f%%).", agg_a.pass_rate * 100)
249 end
250end
251
252return {{
253 a = {{
254 eval_id = "{eval_id_a}",
255 strategy = strategy_a,
256 scores = agg_a.scores,
257 pass_rate = agg_a.pass_rate,
258 pass_at_1 = agg_a.pass_at_1,
259 ci_95 = agg_a.ci_95,
260 }},
261 b = {{
262 eval_id = "{eval_id_b}",
263 strategy = strategy_b,
264 scores = agg_b.scores,
265 pass_rate = agg_b.pass_rate,
266 pass_at_1 = agg_b.pass_at_1,
267 ci_95 = agg_b.ci_95,
268 }},
269 comparison = {{
270 delta_mean = delta,
271 welch_t = {{
272 t_stat = welch.t_stat,
273 df = welch.df,
274 significant = welch.significant,
275 direction = welch.direction,
276 }},
277 winner = winner,
278 summary = table.concat(parts, " "),
279 }},
280}}
281"#,
282 result_a_escaped = escape_for_lua_sq(&result_a),
283 result_b_escaped = escape_for_lua_sq(&result_b),
284 eval_id_a = eval_id_a,
285 eval_id_b = eval_id_b,
286 std_shim = STD_SHIM,
287 strategy_a_fallback = extract_strategy_from_id(eval_id_a).unwrap_or("A"),
288 strategy_b_fallback = extract_strategy_from_id(eval_id_b).unwrap_or("B"),
289 );
290
291 let ctx = serde_json::Value::Null;
292 let env_map = Arc::new(HashMap::new());
295 let raw_result = self
296 .start_and_tick(env_map, lua_code, ctx, None, vec![], vec![])
297 .await?;
298
299 match save_compare_result(&app_dir, eval_id_a, eval_id_b, &raw_result) {
303 Ok(()) => Ok(raw_result),
304 Err(e) => Ok(splice_response_string(&raw_result, "save_warning", &e)),
305 }
306 }
307}